feat(host/linux): amdgpu session clock pin — gpuclocks grows the AMD arm

nvclocks.rs -> gpuclocks.rs. PUNKTFUNK_PIN_CLOCKS=1 now also pins every
amdgpu card's power_dpm_force_performance_level to high for the host
lifetime (prior level restored on exit) — the measured AMD encode-
latency lever: VCN per-frame time roughly doubles when a 60fps paced
trickle lets clocks sag (~8 ms/frame sagged vs ~4.4 ms/frame with
clocks hot, 1440p on the 780M). Root-gated by sysfs ownership;
non-root degrades to a logged
recipe (validated live on the AMD box). Opt-in stays deliberate:
box-wide power-management override, wrong on battery/Deck.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-02 13:03:04 +00:00
parent 68c92f6874
commit 6b666b7457
2 changed files with 135 additions and 30 deletions
+130 -25
View File
@@ -1,6 +1,15 @@
//! NVIDIA clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency //! GPU clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency
//! plan Tier 1B): the driver's adaptive P-state ramps clocks down between bursty encode frames, //! plan Tier 1B): the driver's adaptive P-state ramps clocks down between bursty encode frames,
//! so every frame re-pays a spin-up. Two independent halves, both no-ops off NVIDIA: //! so every frame re-pays a spin-up. This is NOT theoretical — measured on the 780M (VCN 4),
//! a 1440p HEVC encode takes ~4.4 ms/frame with the clocks hot (120 fps pacing) but ~8 ms/frame
//! at a 60 fps duty cycle: the sag doubles per-frame encode latency.
//!
//! **AMD** (`PUNKTFUNK_PIN_CLOCKS=1`, root-gated by sysfs ownership): write `high` into each
//! amdgpu card's `power_dpm_force_performance_level` for the host lifetime, restoring the prior
//! value on exit. Non-root gets EACCES → logged once with the privilege recipe. Deliberately
//! opt-in: it defeats power management box-wide and is wrong on battery (Steam Deck!).
//!
//! **NVIDIA** — two independent halves, both no-ops off NVIDIA:
//! //!
//! 1. **`CudaNoStablePerfLimit` application profile** (no root needed): a CUDA/NVENC context //! 1. **`CudaNoStablePerfLimit` application profile** (no root needed): a CUDA/NVENC context
//! clamps GeForce into the P2 performance state — reduced *memory* clock — for the process //! clamps GeForce into the P2 performance state — reduced *memory* clock — for the process
@@ -111,52 +120,148 @@ fn flag_truthy(name: &str) -> bool {
.unwrap_or(false) .unwrap_or(false)
} }
/// Host-lifetime guard: holds the NVML clock floor (when armed) and resets it on drop. /// An NVML session holding locked-clock floors on one or more NVIDIA devices.
pub struct ClockGuard { struct NvmlPin {
nvml: Nvml, nvml: Nvml,
pinned: Vec<NvmlDevice>, pinned: Vec<NvmlDevice>,
} }
/// One amdgpu card whose `power_dpm_force_performance_level` we forced to `high`; `restore` is
/// the pre-pin value written back on drop.
struct AmdPin {
/// Path of the card's `power_dpm_force_performance_level` sysfs attribute.
path: std::path::PathBuf,
/// Level read from the attribute before the pin (whitespace-trimmed); `ClockGuard`'s `Drop`
/// writes this string back to `path`.
restore: String,
}
/// Host-lifetime guard: holds the armed clock pins (NVML floor and/or amdgpu perf level) and
/// undoes them on drop.
///
/// `on_host_start` only builds a guard when at least one of the two pins was armed.
pub struct ClockGuard {
/// NVML session plus the devices it pinned; `None` when no NVIDIA pin was armed.
nvml: Option<NvmlPin>,
/// amdgpu cards forced to `high`, each with the level to write back; may be empty.
amd: Vec<AmdPin>,
}
// SAFETY: `ClockGuard` holds opaque NVML device handles + resolved fn pointers from the loaded // SAFETY: `ClockGuard` holds opaque NVML device handles + resolved fn pointers from the loaded
// driver library. NVML is documented thread-safe, the handles are plain driver tokens with no // driver library (plus plain sysfs paths/strings). NVML is documented thread-safe, the handles are
// thread affinity, and the guard is only ever *moved* (held in `main`, dropped once at exit) and // plain driver tokens with no thread affinity, and the guard is only ever *moved* (held in `main`,
// used through `&mut`/ownership — never shared. Transfer across threads is therefore sound. // dropped once at exit) and used through `&mut`/ownership — never shared. Transfer across threads
// is therefore sound.
unsafe impl Send for ClockGuard {} unsafe impl Send for ClockGuard {}
impl Drop for ClockGuard { impl Drop for ClockGuard {
fn drop(&mut self) { fn drop(&mut self) {
// SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this live if let Some(pin) = &self.nvml {
// NVML session (init'd in `pin_clocks`, shut down only here, after the resets). The calls // SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this
// take the handle by value and return an int status — no Rust memory is borrowed. // live NVML session (init'd in `pin_nvidia`, shut down only here, after the resets).
// The calls take the handle by value and return an int status — no Rust memory is
// borrowed.
unsafe { unsafe {
for &dev in &self.pinned { for &dev in &pin.pinned {
let _ = (self.nvml.reset_locked_clocks)(dev); let _ = (pin.nvml.reset_locked_clocks)(dev);
} }
let _ = (self.nvml.shutdown)(); let _ = (pin.nvml.shutdown)();
}
if !pin.pinned.is_empty() {
tracing::info!("NVIDIA clock floor released (locked clocks reset)");
}
}
for pin in &self.amd {
match std::fs::write(&pin.path, &pin.restore) {
Ok(()) => tracing::info!(
card = %pin.path.display(),
restored = %pin.restore,
"amdgpu performance level restored"
),
Err(e) => tracing::warn!(
card = %pin.path.display(),
error = %e,
"could not restore amdgpu performance level"
),
} }
if !self.pinned.is_empty() {
tracing::info!("GPU clock floor released (locked clocks reset)");
} }
} }
} }
/// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the P2-cap /// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the NVIDIA P2-cap
/// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the NVML core-clock floor. /// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the vendor clock pin (NVML
/// Returns the guard keeping the floor for the host lifetime. No-op (`None`) off NVIDIA. /// core-clock floor / amdgpu `high` performance level). Returns the guard keeping the pins for
/// the host lifetime. `None` when nothing was armed.
pub fn on_host_start() -> Option<ClockGuard> { pub fn on_host_start() -> Option<ClockGuard> {
if !nvidia_present() { if nvidia_present() {
return None;
}
ensure_cuda_perf_profile(); ensure_cuda_perf_profile();
}
if !flag_truthy("PUNKTFUNK_PIN_CLOCKS") { if !flag_truthy("PUNKTFUNK_PIN_CLOCKS") {
return None; return None;
} }
pin_clocks() let nvml = if nvidia_present() { pin_nvidia() } else { None };
let amd = pin_amdgpu();
if nvml.is_none() && amd.is_empty() {
return None;
}
Some(ClockGuard { nvml, amd })
}
/// Force every amdgpu card's DPM performance level to `high` for the session — the encode-latency
/// lever on AMD: the VCN's per-frame time tracks the clock sag (measured 8 → 4.4 ms/frame at
/// 1440p on the 780M once clocks stay hot). Remembers the prior level for restore-on-drop.
/// Root-gated by sysfs ownership; non-root warns once with the recipe, the host runs unpinned.
fn pin_amdgpu() -> Vec<AmdPin> {
let mut pins = Vec::new();
let mut denied = false;
let Ok(cards) = std::fs::read_dir("/sys/class/drm") else {
return pins;
};
for entry in cards.flatten() {
let name = entry.file_name();
let name = name.to_string_lossy().into_owned();
// Cards only (card0, card1, …) — not connectors (card0-DP-1) or render nodes.
if !name.starts_with("card") || name.contains('-') {
continue;
}
let dev = entry.path().join("device");
let is_amdgpu = std::fs::read_link(dev.join("driver"))
.map(|t| t.to_string_lossy().ends_with("amdgpu"))
.unwrap_or(false);
if !is_amdgpu {
continue;
}
let path = dev.join("power_dpm_force_performance_level");
let Ok(prev) = std::fs::read_to_string(&path) else {
continue;
};
let prev = prev.trim().to_string();
if prev == "high" {
continue; // already pinned (externally) — nothing to do or restore
}
match std::fs::write(&path, "high") {
Ok(()) => {
tracing::info!(
card = %name,
was = %prev,
"amdgpu performance level pinned to high (encode clock sag removed) — \
restored on host exit"
);
pins.push(AmdPin {
path,
restore: prev,
});
}
Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => denied = true,
Err(e) => tracing::debug!(card = %name, error = %e, "amdgpu perf-level write failed"),
}
}
if denied {
tracing::warn!(
"PUNKTFUNK_PIN_CLOCKS: writing power_dpm_force_performance_level requires root — \
grant it via a boot oneshot / udev rule chowning the attribute, or run the host as \
a system service. The host keeps running unpinned"
);
}
pins
} }
/// Floor the core clock at TDP/base on every NVIDIA device (reset first, so a stale pin from a /// Floor the core clock at TDP/base on every NVIDIA device (reset first, so a stale pin from a
/// crashed previous run is replaced rather than compounded). /// crashed previous run is replaced rather than compounded).
fn pin_clocks() -> Option<ClockGuard> { fn pin_nvidia() -> Option<NvmlPin> {
let nvml = match Nvml::load() { let nvml = match Nvml::load() {
Some(n) => n, Some(n) => n,
None => { None => {
@@ -222,9 +327,9 @@ fn pin_clocks() -> Option<ClockGuard> {
} }
tracing::info!( tracing::info!(
devices = pinned.len(), devices = pinned.len(),
"GPU core-clock floor armed (min=TDP/base, max=boost) — released on host exit" "NVIDIA core-clock floor armed (min=TDP/base, max=boost) — released on host exit"
); );
Some(ClockGuard { nvml, pinned }) Some(NvmlPin { nvml, pinned })
} }
} }
+4 -4
View File
@@ -33,6 +33,9 @@ mod drm_sync;
mod encode; mod encode;
mod gamestream; mod gamestream;
mod gpu; mod gpu;
#[cfg(target_os = "linux")]
#[path = "linux/gpuclocks.rs"]
mod gpuclocks;
mod hdr; mod hdr;
mod inject; mod inject;
#[cfg(target_os = "windows")] #[cfg(target_os = "windows")]
@@ -45,9 +48,6 @@ mod library;
mod mgmt; mod mgmt;
mod mgmt_token; mod mgmt_token;
mod native_pairing; mod native_pairing;
#[cfg(target_os = "linux")]
#[path = "linux/nvclocks.rs"]
mod nvclocks;
mod pipeline; mod pipeline;
mod punktfunk1; mod punktfunk1;
mod pwinit; mod pwinit;
@@ -135,7 +135,7 @@ fn real_main() -> Result<()> {
// exit via the guard's Drop). No-op off NVIDIA / on the tool subcommands. // exit via the guard's Drop). No-op off NVIDIA / on the tool subcommands.
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
let _nv_clocks = match args.first().map(String::as_str) { let _nv_clocks = match args.first().map(String::as_str) {
Some("serve") | Some("punktfunk1-host") => nvclocks::on_host_start(), Some("serve") | Some("punktfunk1-host") => gpuclocks::on_host_start(),
_ => None, _ => None,
}; };