From fa4c798a25f5c3ccf410c571498a1e67be594e45 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Thu, 2 Jul 2026 13:03:04 +0000 Subject: [PATCH] =?UTF-8?q?feat(host/linux):=20amdgpu=20session=20clock=20?= =?UTF-8?q?pin=20=E2=80=94=20gpuclocks=20grows=20the=20AMD=20arm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvclocks.rs -> gpuclocks.rs. PUNKTFUNK_PIN_CLOCKS=1 now also pins every amdgpu card's power_dpm_force_performance_level to high for the host lifetime (prior level restored on exit) — the measured AMD encode- latency lever: VCN per-frame time doubles when a 60fps paced trickle lets clocks sag (8 -> 4.4ms/frame at 1440p on the 780M with clocks hot). Root-gated by sysfs ownership; non-root degrades to a logged recipe (validated live on the AMD box). Opt-in stays deliberate: box-wide power-management override, wrong on battery/Deck. Co-Authored-By: Claude Fable 5 --- crates/punktfunk-host/src/linux/gpuclocks.rs | 157 ++++++++++++++++--- crates/punktfunk-host/src/main.rs | 8 +- 2 files changed, 135 insertions(+), 30 deletions(-) diff --git a/crates/punktfunk-host/src/linux/gpuclocks.rs b/crates/punktfunk-host/src/linux/gpuclocks.rs index ec09789..35f13eb 100644 --- a/crates/punktfunk-host/src/linux/gpuclocks.rs +++ b/crates/punktfunk-host/src/linux/gpuclocks.rs @@ -1,6 +1,15 @@ -//! NVIDIA clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency +//! GPU clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency //! plan Tier 1B): the driver's adaptive P-state ramps clocks down between bursty encode frames, -//! so every frame re-pays a spin-up. Two independent halves, both no-ops off NVIDIA: +//! so every frame re-pays a spin-up. This is NOT theoretical — measured on the 780M (VCN 4), +//! a 1440p HEVC encode takes ~4.4 ms/frame with the clocks hot (120 fps pacing) but ~8 ms/frame +//! at a 60 fps duty cycle: the sag doubles per-frame encode latency. +//! +//! **AMD** (`PUNKTFUNK_PIN_CLOCKS=1`, root-gated by sysfs ownership): write `high` into each +//! amdgpu card's `power_dpm_force_performance_level` for the host lifetime, restoring the prior +//! value on exit. Non-root gets EACCES → logged once with the privilege recipe. Deliberately +//! opt-in: it defeats power management box-wide and is wrong on battery (Steam Deck!). +//! +//! **NVIDIA** — two independent halves, both no-ops off NVIDIA: //! //! 1. **`CudaNoStablePerfLimit` application profile** (no root needed): a CUDA/NVENC context //! clamps GeForce into the P2 performance state — reduced *memory* clock — for the process @@ -111,52 +120,148 @@ fn flag_truthy(name: &str) -> bool { .unwrap_or(false) } -/// Host-lifetime guard: holds the NVML clock floor (when armed) and resets it on drop. -pub struct ClockGuard { +/// An NVML session holding locked-clock floors on one or more NVIDIA devices. +struct NvmlPin { nvml: Nvml, pinned: Vec, } +/// One amdgpu card whose `power_dpm_force_performance_level` we forced to `high`; `restore` is +/// the pre-pin value written back on drop. +struct AmdPin { + path: std::path::PathBuf, + restore: String, +} + +/// Host-lifetime guard: holds the armed clock pins (NVML floor and/or amdgpu perf level) and +/// undoes them on drop. +pub struct ClockGuard { + nvml: Option, + amd: Vec, +} + // SAFETY: `ClockGuard` holds opaque NVML device handles + resolved fn pointers from the loaded -// driver library. NVML is documented thread-safe, the handles are plain driver tokens with no -// thread affinity, and the guard is only ever *moved* (held in `main`, dropped once at exit) and -// used through `&mut`/ownership — never shared. Transfer across threads is therefore sound. +// driver library (plus plain sysfs paths/strings). NVML is documented thread-safe, the handles are +// plain driver tokens with no thread affinity, and the guard is only ever *moved* (held in `main`, +// dropped once at exit) and used through `&mut`/ownership — never shared. Transfer across threads +// is therefore sound. unsafe impl Send for ClockGuard {} impl Drop for ClockGuard { fn drop(&mut self) { - // SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this live - // NVML session (init'd in `pin_clocks`, shut down only here, after the resets). The calls - // take the handle by value and return an int status — no Rust memory is borrowed. - unsafe { - for &dev in &self.pinned { - let _ = (self.nvml.reset_locked_clocks)(dev); + if let Some(pin) = &self.nvml { + // SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this + // live NVML session (init'd in `pin_nvidia`, shut down only here, after the resets). + // The calls take the handle by value and return an int status — no Rust memory is + // borrowed. + unsafe { + for &dev in &pin.pinned { + let _ = (pin.nvml.reset_locked_clocks)(dev); + } + let _ = (pin.nvml.shutdown)(); + } + if !pin.pinned.is_empty() { + tracing::info!("NVIDIA clock floor released (locked clocks reset)"); } - let _ = (self.nvml.shutdown)(); } - if !self.pinned.is_empty() { - tracing::info!("GPU clock floor released (locked clocks reset)"); + for pin in &self.amd { + match std::fs::write(&pin.path, &pin.restore) { + Ok(()) => tracing::info!( + card = %pin.path.display(), + restored = %pin.restore, + "amdgpu performance level restored" + ), + Err(e) => tracing::warn!( + card = %pin.path.display(), + error = %e, + "could not restore amdgpu performance level" + ), + } } } } -/// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the P2-cap -/// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the NVML core-clock floor. -/// Returns the guard keeping the floor for the host lifetime. No-op (`None`) off NVIDIA. +/// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the NVIDIA P2-cap +/// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the vendor clock pin (NVML +/// core-clock floor / amdgpu `high` performance level). Returns the guard keeping the pins for +/// the host lifetime. `None` when nothing was armed. pub fn on_host_start() -> Option { - if !nvidia_present() { - return None; + if nvidia_present() { + ensure_cuda_perf_profile(); } - ensure_cuda_perf_profile(); if !flag_truthy("PUNKTFUNK_PIN_CLOCKS") { return None; } - pin_clocks() + let nvml = if nvidia_present() { pin_nvidia() } else { None }; + let amd = pin_amdgpu(); + if nvml.is_none() && amd.is_empty() { + return None; + } + Some(ClockGuard { nvml, amd }) +} + +/// Force every amdgpu card's DPM performance level to `high` for the session — the encode-latency +/// lever on AMD: the VCN's per-frame time tracks the clock sag (measured 8 → 4.4 ms/frame at +/// 1440p on the 780M once clocks stay hot). Remembers the prior level for restore-on-drop. +/// Root-gated by sysfs ownership; non-root warns once with the recipe, the host runs unpinned. +fn pin_amdgpu() -> Vec { + let mut pins = Vec::new(); + let mut denied = false; + let Ok(cards) = std::fs::read_dir("/sys/class/drm") else { + return pins; + }; + for entry in cards.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy().into_owned(); + // Cards only (card0, card1, …) — not connectors (card0-DP-1) or render nodes. + if !name.starts_with("card") || name.contains('-') { + continue; + } + let dev = entry.path().join("device"); + let is_amdgpu = std::fs::read_link(dev.join("driver")) + .map(|t| t.to_string_lossy().ends_with("amdgpu")) + .unwrap_or(false); + if !is_amdgpu { + continue; + } + let path = dev.join("power_dpm_force_performance_level"); + let Ok(prev) = std::fs::read_to_string(&path) else { + continue; + }; + let prev = prev.trim().to_string(); + if prev == "high" { + continue; // already pinned (externally) — nothing to do or restore + } + match std::fs::write(&path, "high") { + Ok(()) => { + tracing::info!( + card = %name, + was = %prev, + "amdgpu performance level pinned to high (encode clock sag removed) — \ + restored on host exit" + ); + pins.push(AmdPin { + path, + restore: prev, + }); + } + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => denied = true, + Err(e) => tracing::debug!(card = %name, error = %e, "amdgpu perf-level write failed"), + } + } + if denied { + tracing::warn!( + "PUNKTFUNK_PIN_CLOCKS: writing power_dpm_force_performance_level requires root — \ + grant it via a boot oneshot / udev rule chowning the attribute, or run the host as \ + a system service. The host keeps running unpinned" + ); + } + pins } /// Floor the core clock at TDP/base on every NVIDIA device (reset first, so a stale pin from a /// crashed previous run is replaced rather than compounded). -fn pin_clocks() -> Option { +fn pin_nvidia() -> Option { let nvml = match Nvml::load() { Some(n) => n, None => { @@ -222,9 +327,9 @@ fn pin_clocks() -> Option { } tracing::info!( devices = pinned.len(), - "GPU core-clock floor armed (min=TDP/base, max=boost) — released on host exit" + "NVIDIA core-clock floor armed (min=TDP/base, max=boost) — released on host exit" ); - Some(ClockGuard { nvml, pinned }) + Some(NvmlPin { nvml, pinned }) } } diff --git a/crates/punktfunk-host/src/main.rs b/crates/punktfunk-host/src/main.rs index 5f657c3..7116c23 100644 --- a/crates/punktfunk-host/src/main.rs +++ b/crates/punktfunk-host/src/main.rs @@ -33,6 +33,9 @@ mod drm_sync; mod encode; mod gamestream; mod gpu; +#[cfg(target_os = "linux")] +#[path = "linux/gpuclocks.rs"] +mod gpuclocks; mod hdr; mod inject; #[cfg(target_os = "windows")] @@ -45,9 +48,6 @@ mod library; mod mgmt; mod mgmt_token; mod native_pairing; -#[cfg(target_os = "linux")] -#[path = "linux/nvclocks.rs"] -mod nvclocks; mod pipeline; mod punktfunk1; mod pwinit; @@ -135,7 +135,7 @@ fn real_main() -> Result<()> { // exit via the guard's Drop). No-op off NVIDIA / on the tool subcommands. #[cfg(target_os = "linux")] let _nv_clocks = match args.first().map(String::as_str) { - Some("serve") | Some("punktfunk1-host") => nvclocks::on_host_start(), + Some("serve") | Some("punktfunk1-host") => gpuclocks::on_host_start(), _ => None, };