feat(host/linux): NVIDIA clock hygiene — P2-cap driver profile + opt-in NVML clock floor

Two halves of the easy-scene p99 lever (host-latency plan Tier 1B): CudaNoStablePerfLimit application profile (no root; NVIDIA's supported opt-out of the CUDA/NVENC P2 memory-clock clamp, raw key 0x166c5e=0 per open-gpu-kernel-modules#333, shipped for obs/Discord in R595) installed into ~/.nv/nvidia-application-profiles-rc.d/ keyed on procname, opt-out PUNKTFUNK_NV_PROFILE=0; and PUNKTFUNK_PIN_CLOCKS=1 arming an NVML SetGpuLockedClocks(TDP, UNLIMITED) core-clock floor (base floor, boost headroom — never a max pin) held for the host lifetime, reset-on-start self-healing a crashed run's stale pin, NO_PERMISSION degrading to a logged sudoers/oneshot recipe. libnvidia-ml is dlopen'd like libcuda — no link-time dependency, clean no-op off NVIDIA. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 12:17:24 +00:00
parent 09c9554731
commit 0e768c219b
2 changed files with 282 additions and 0 deletions
@@ -0,0 +1,270 @@
+//! NVIDIA clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency
+//! plan Tier 1B): the driver's adaptive P-state ramps clocks down between bursty encode frames,
+//! so every frame re-pays a spin-up. Two independent halves, both no-ops off NVIDIA:
+//!
+//! 1. **`CudaNoStablePerfLimit` application profile** (no root needed): a CUDA/NVENC context
+//!    clamps GeForce into the P2 performance state — reduced *memory* clock — for the process
+//!    lifetime. NVIDIA's supported opt-out is an application profile keyed on the process name
+//!    (shipped by default for `obs`/`Discord` since R595; the raw key `0x166c5e = 0` "should work
+//!    with all supported driver versions" — NVIDIA engineer, open-gpu-kernel-modules#333). We drop
+//!    a rule for `punktfunk-host` into `~/.nv/nvidia-application-profiles-rc.d/`; the driver's
+//!    user-space component reads it at load, so it takes effect when libcuda/libGL next
+//!    initializes (usually this same run — we write before any GPU work — else the next host
+//!    start). Opt out with `PUNKTFUNK_NV_PROFILE=0`. (Do NOT set `CUDA_DISABLE_PERF_BOOST` for the
+//!    host — that's the other half of the driver knob: it stops the boost *to* P2; the profile
+//!    lifts the cap *at* P2 so the process can reach P0.)
+//!
+//! 2. **GPU core-clock floor** (`PUNKTFUNK_PIN_CLOCKS=1`, opt-in; root-gated by the driver):
+//!    `nvmlDeviceSetGpuLockedClocks(TDP, UNLIMITED)` floors the core clock at the TDP/base clock
+//!    while leaving boost headroom — NVIDIA's own latency guidance is "raise the floor, don't pin
+//!    the max" (locking above base just gets throttled; a max pin only burns idle watts). Non-root
+//!    callers get `NVML_ERROR_NO_PERMISSION` — logged once with the privilege recipe, then the
+//!    host runs unpinned. The pin is undone on drop (host exit); after a crash it persists until
+//!    driver reload/reboot, which the reset-before-pin on the next start self-heals. Deliberately
+//!    NOT default-on: it defeats idle downclocking for the whole box and is wrong on
+//!    battery-powered hosts.
+// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
+#![deny(clippy::undocumented_unsafe_blocks)]
+
+use std::os::raw::{c_char, c_int, c_uint, c_void};
+
+/// `nvmlDevice_t` — an opaque driver handle.
+type NvmlDevice = *mut c_void;
+
+const NVML_SUCCESS: c_int = 0; // the status `pin_clocks` treats as "call succeeded"
+const NVML_ERROR_NO_PERMISSION: c_int = 4; // answered in `pin_clocks` with the privilege recipe
+/// `nvmlClockLimitId_t`: symbolic "TDP/base clock" / "unlimited" sentinels for
+/// `nvmlDeviceSetGpuLockedClocks` (nvml.h; `(TDP, UNLIMITED)` = "lower bound is TDP but clock may
+/// boost above this" — the floor-without-capping combination).
+const NVML_CLOCK_LIMIT_ID_TDP: c_uint = 0xffff_ff01; // passed as the floor (min) argument
+const NVML_CLOCK_LIMIT_ID_UNLIMITED: c_uint = 0xffff_ff02; // passed as the ceiling (max) argument
+
+/// The NVML entry points we use, resolved from `libnvidia-ml.so.1` at runtime (same pattern as
+/// `zerocopy::cuda` — no link-time NVIDIA dependency, absent library = clean no-op).
+struct Nvml {
+    _lib: libloading::Library, // never read; owned so the fn pointers below stay valid
+    init: unsafe extern "C" fn() -> c_int, // `nvmlInit_v2`
+    shutdown: unsafe extern "C" fn() -> c_int, // `nvmlShutdown`
+    device_count: unsafe extern "C" fn(*mut c_uint) -> c_int, // `nvmlDeviceGetCount_v2`
+    device_by_index: unsafe extern "C" fn(c_uint, *mut NvmlDevice) -> c_int, // `nvmlDeviceGetHandleByIndex_v2`
+    set_locked_clocks: unsafe extern "C" fn(NvmlDevice, c_uint, c_uint) -> c_int, // `nvmlDeviceSetGpuLockedClocks`
+    reset_locked_clocks: unsafe extern "C" fn(NvmlDevice) -> c_int, // `nvmlDeviceResetGpuLockedClocks`
+    error_string: unsafe extern "C" fn(c_int) -> *const c_char, // `nvmlErrorString`
+}
+
+impl Nvml {
+    /// Open the NVML shared library and resolve every entry point this module calls.
+    ///
+    /// Tries `libnvidia-ml.so.1` first and the unversioned `libnvidia-ml.so` second. Returns
+    /// `None` when neither library can be opened or any one symbol is missing.
+    fn load() -> Option<Self> {
+        // SAFETY: `Library::new` runs the trusted NVIDIA driver library's initializers
+        // (`libnvidia-ml.so.1`), exactly as `zerocopy::cuda` does for `libcuda.so.1`. Each
+        // `get` resolves a documented NVML symbol to the matching `unsafe extern "C"` signature
+        // transcribed from nvml.h (all by-value ints/pointers, no callbacks). The `Library` is
+        // moved into the returned struct, so every resolved fn pointer outlives its uses.
+        unsafe {
+            let library = match libloading::Library::new("libnvidia-ml.so.1") {
+                Ok(versioned) => versioned,
+                Err(_) => libloading::Library::new("libnvidia-ml.so").ok()?,
+            };
+            let nvml_init = *library.get(b"nvmlInit_v2\0").ok()?;
+            let nvml_shutdown = *library.get(b"nvmlShutdown\0").ok()?;
+            let get_count = *library.get(b"nvmlDeviceGetCount_v2\0").ok()?;
+            let get_handle = *library.get(b"nvmlDeviceGetHandleByIndex_v2\0").ok()?;
+            let lock_clocks = *library.get(b"nvmlDeviceSetGpuLockedClocks\0").ok()?;
+            let unlock_clocks = *library.get(b"nvmlDeviceResetGpuLockedClocks\0").ok()?;
+            let describe_error = *library.get(b"nvmlErrorString\0").ok()?;
+            Some(Self {
+                init: nvml_init,
+                shutdown: nvml_shutdown,
+                device_count: get_count,
+                device_by_index: get_handle,
+                set_locked_clocks: lock_clocks,
+                reset_locked_clocks: unlock_clocks,
+                error_string: describe_error,
+                _lib: library,
+            })
+        }
+    }
+
+    /// Render the NVML status code `r` as text, falling back to `NVML error <code>` when the
+    /// library hands back a null pointer.
+    fn err_str(&self, r: c_int) -> String {
+        // SAFETY: `nvmlErrorString` returns a pointer into NVML's static error-string table for
+        // ANY input value (documented total function); it takes the code by value and borrows no
+        // Rust memory. The library is loaded for as long as `self` exists (`self` owns `_lib`).
+        let text = unsafe { (self.error_string)(r) };
+        if text.is_null() {
+            return format!("NVML error {r}");
+        }
+        // SAFETY: `text` is non-null (checked above) and points into that static table, valid for
+        // the process lifetime; it is only read here, while the library is still loaded.
+        let message = unsafe { std::ffi::CStr::from_ptr(text) };
+        message.to_string_lossy().into_owned()
+    }
+}
+
+/// Cheap NVIDIA probe: `true` when either driver device node exists (mirrors
+/// `encode::nvidia_present` — side-effect-free, deliberately no CUDA/NVML init on the probe).
+fn nvidia_present() -> bool {
+    ["/dev/nvidiactl", "/dev/nvidia0"]
+        .iter()
+        .any(|node| std::path::Path::new(node).exists())
+}
+
+/// Read the environment variable `name` as an opt-in switch.
+///
+/// Returns `true` only for `1`, `true`, `yes` or `on`. Surrounding whitespace is ignored, and so
+/// is ASCII case — `TRUE` or `On` arm the switch instead of being silently dropped. An unset
+/// variable, a non-UTF-8 value, or any other text yields `false`.
+fn flag_truthy(name: &str) -> bool {
+    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
+    let Ok(raw) = std::env::var(name) else {
+        return false;
+    };
+    let value = raw.trim();
+    TRUTHY
+        .iter()
+        .any(|accepted| value.eq_ignore_ascii_case(accepted))
+}
+
+/// Host-lifetime guard: holds the NVML clock floor (when armed) and resets it on drop.
+///
+/// Keep the value bound for as long as the floor should hold — dropping it resets the locked
+/// clocks of every device in `pinned` and shuts the NVML session down.
+#[must_use = "dropping the guard immediately releases the GPU clock floor"]
+pub struct ClockGuard {
+    /// The loaded NVML library and its resolved entry points.
+    nvml: Nvml,
+    /// Handles of the devices whose locked clocks were set when the guard was built.
+    pinned: Vec<NvmlDevice>,
+}
+
+// SAFETY: `ClockGuard` holds opaque NVML device handles + resolved fn pointers from the loaded
+// driver library. NVML is documented thread-safe, the handles are plain driver tokens with no
+// thread affinity, and the guard is only ever *moved* (held in `main`, dropped once at exit) and
+// used through `&mut`/ownership — never shared. Transfer across threads is therefore sound.
+unsafe impl Send for ClockGuard {}
+
+impl Drop for ClockGuard {
+    /// Reset the locked clocks of every pinned device, then shut the NVML session down.
+    ///
+    /// A reset that fails is reported at warn level — the floor then stays on that device until
+    /// driver reload/reboot or the next pinned host start — and the "released" message is only
+    /// logged when at least one reset actually succeeded.
+    fn drop(&mut self) {
+        let mut released = 0usize;
+        for &dev in &self.pinned {
+            // SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this
+            // live NVML session (init'd in `pin_clocks`, shut down only below, after the resets).
+            // The call takes the handle by value and returns an int status — no Rust memory is
+            // borrowed.
+            let status = unsafe { (self.nvml.reset_locked_clocks)(dev) };
+            if status == NVML_SUCCESS {
+                released += 1;
+            } else {
+                tracing::warn!(
+                    error = self.nvml.err_str(status),
+                    "GPU clock floor NOT released on a device (locked-clocks reset failed) — it \
+                     persists until driver reload/reboot or the next pinned host start"
+                );
+            }
+        }
+        // SAFETY: the session was opened by a successful `nvmlInit_v2` in `pin_clocks` and this
+        // is its only shutdown; no NVML device call is made after it. The status is ignored
+        // because nothing can be done about a failed shutdown at host exit.
+        let _ = unsafe { (self.nvml.shutdown)() };
+        if released > 0 {
+            tracing::info!("GPU clock floor released (locked clocks reset)");
+        }
+    }
+}
+
+/// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the P2-cap
+/// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the NVML core-clock floor.
+///
+/// Returns the guard keeping the floor for the host lifetime; the caller must keep it bound,
+/// because dropping it resets the clocks. Returns `None` off NVIDIA, when the floor was not
+/// requested, or when it could not be armed.
+#[must_use = "the returned guard holds the clock floor; dropping it releases the floor"]
+pub fn on_host_start() -> Option<ClockGuard> {
+    if !nvidia_present() {
+        return None;
+    }
+    // The profile half runs unconditionally on NVIDIA (it has its own opt-out); only the
+    // clock floor is gated behind the opt-in flag.
+    ensure_cuda_perf_profile();
+    if flag_truthy("PUNKTFUNK_PIN_CLOCKS") {
+        pin_clocks()
+    } else {
+        None
+    }
+}
+
+/// Floor the core clock at TDP/base on every NVIDIA device (reset first, so a stale pin from a
+/// crashed previous run is replaced rather than compounded).
+///
+/// Returns the guard owning the NVML session and the pinned device handles, or `None` when no
+/// device was pinned. The operator opted in explicitly, so every "nothing pinned" outcome is
+/// reported at warn level; per-device failures are detailed at debug level.
+fn pin_clocks() -> Option<ClockGuard> {
+    let Some(nvml) = Nvml::load() else {
+        tracing::warn!("PUNKTFUNK_PIN_CLOCKS: libnvidia-ml not loadable — clocks not pinned");
+        return None;
+    };
+    // SAFETY: all calls follow the documented NVML lifecycle on the successfully-loaded library:
+    // `nvmlInit_v2` first (status-checked; on failure we return without touching anything else),
+    // then count/handle queries writing through valid `&mut` out-pointers of the exact C types,
+    // then set/reset taking those returned handles by value. `shutdown` is called on every path
+    // that does not hand the session to a `ClockGuard` (whose Drop shuts it down).
+    unsafe {
+        let r = (nvml.init)();
+        if r != NVML_SUCCESS {
+            tracing::warn!(
+                error = nvml.err_str(r),
+                "PUNKTFUNK_PIN_CLOCKS: NVML init failed — clocks not pinned"
+            );
+            return None;
+        }
+        let mut count: c_uint = 0;
+        let r = (nvml.device_count)(&mut count);
+        if r != NVML_SUCCESS {
+            tracing::warn!(
+                error = nvml.err_str(r),
+                "PUNKTFUNK_PIN_CLOCKS: NVML device enumeration failed — clocks not pinned"
+            );
+            let _ = (nvml.shutdown)();
+            return None;
+        }
+        if count == 0 {
+            tracing::warn!("PUNKTFUNK_PIN_CLOCKS: NVML reports no devices — clocks not pinned");
+            let _ = (nvml.shutdown)();
+            return None;
+        }
+        let mut pinned = Vec::new();
+        let mut denied = false;
+        for i in 0..count {
+            let mut dev: NvmlDevice = std::ptr::null_mut();
+            let r = (nvml.device_by_index)(i, &mut dev);
+            if r != NVML_SUCCESS {
+                tracing::debug!(
+                    device = i,
+                    error = nvml.err_str(r),
+                    "GetHandleByIndex failed — device skipped"
+                );
+                continue;
+            }
+            // Best-effort: clears a stale pin left by a crashed run. The status is ignored
+            // because the set call below reports the outcome that matters.
+            let _ = (nvml.reset_locked_clocks)(dev);
+            let r = (nvml.set_locked_clocks)(
+                dev,
+                NVML_CLOCK_LIMIT_ID_TDP,
+                NVML_CLOCK_LIMIT_ID_UNLIMITED,
+            );
+            match r {
+                NVML_SUCCESS => pinned.push(dev),
+                NVML_ERROR_NO_PERMISSION => denied = true,
+                _ => tracing::debug!(
+                    device = i,
+                    error = nvml.err_str(r),
+                    "SetGpuLockedClocks failed"
+                ),
+            }
+        }
+        if denied {
+            // The driver gates locked clocks to root — no GeForce exception. Give the operator
+            // the two supported recipes instead of failing the host.
+            tracing::warn!(
+                "PUNKTFUNK_PIN_CLOCKS: the driver requires root for locked clocks \
+                 (NVML_ERROR_NO_PERMISSION). Grant it via a boot oneshot (`nvidia-smi -lgc \
+                 tdp,unlimited`) or sudoers (`<user> ALL=(ALL) NOPASSWD: /usr/bin/nvidia-smi`) — \
+                 the host keeps running unpinned"
+            );
+        }
+        if pinned.is_empty() {
+            if !denied {
+                // Without this the only trace of the failure would be the debug-level lines.
+                tracing::warn!(
+                    "PUNKTFUNK_PIN_CLOCKS: no device accepted the clock floor — clocks not \
+                     pinned (per-device errors are logged at debug level)"
+                );
+            }
+            let _ = (nvml.shutdown)();
+            return None;
+        }
+        tracing::info!(
+            devices = pinned.len(),
+            "GPU core-clock floor armed (min=TDP/base, max=boost) — released on host exit"
+        );
+        Some(ClockGuard { nvml, pinned })
+    }
+}
+
+/// Install the `CudaNoStablePerfLimit` application profile + a `punktfunk-host` procname rule in
+/// `~/.nv/nvidia-application-profiles-rc.d/` (created if missing, never overwritten — the file is
+/// the operator's once it exists). Lifts the driver's P2 memory-clock cap for the host process.
+///
+/// Does nothing when `PUNKTFUNK_NV_PROFILE` is exactly `0` or when `HOME` is unset or empty.
+/// The file is created exclusively (`create_new`), so an existing file — or a dangling symlink
+/// at that path — is never truncated or written through, even if it appears between check and
+/// write. Failures are logged at debug level and never fail the host.
+fn ensure_cuda_perf_profile() {
+    if std::env::var("PUNKTFUNK_NV_PROFILE").as_deref() == Ok("0") {
+        return;
+    }
+    // An empty HOME would make the path relative to the current working directory.
+    let Some(home) = std::env::var_os("HOME").filter(|h| !h.is_empty()) else {
+        return;
+    };
+    let dir = std::path::Path::new(&home)
+        .join(".nv")
+        .join("nvidia-application-profiles-rc.d");
+    let path = dir.join("50-punktfunk");
+    // The exact shape NVIDIA published (open-gpu-kernel-modules#333) and ships for obs/Discord in
+    // R595; the inline profile definition makes it work on pre-R595 drivers too.
+    let profile = r#"{
+    "profiles": [ { "name": "CudaNoStablePerfLimit", "settings": [ "0x166c5e", 0 ] } ],
+    "rules": [
+        { "pattern": { "feature": "procname", "matches": "punktfunk-host" }, "profile": "CudaNoStablePerfLimit" }
+    ]
+}
+"#;
+    // `Ok(true)` = freshly installed, `Ok(false)` = a file was already there (left untouched).
+    let install = || -> std::io::Result<bool> {
+        std::fs::create_dir_all(&dir)?;
+        let mut file = match std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .open(&path)
+        {
+            Ok(file) => file,
+            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => return Ok(false),
+            Err(e) => return Err(e),
+        };
+        if let Err(e) = std::io::Write::write_all(&mut file, profile.as_bytes()) {
+            // We created this file a moment ago; remove the partial copy so a later start can
+            // retry instead of treating a truncated profile as the operator's file.
+            drop(file);
+            let _ = std::fs::remove_file(&path);
+            return Err(e);
+        }
+        Ok(true)
+    };
+    match install() {
+        Ok(true) => tracing::info!(
+            path = %path.display(),
+            "installed the CudaNoStablePerfLimit driver profile (lifts the P2 memory-clock cap \
+             for NVENC/CUDA; read when the driver next initializes — PUNKTFUNK_NV_PROFILE=0 opts \
+             out)"
+        ),
+        Ok(false) => {}
+        Err(e) => tracing::debug!(error = %e, "could not install the NVIDIA application profile"),
+    }
+}
@@ -45,6 +45,9 @@ mod library;
 mod mgmt;
 mod mgmt_token;
 mod native_pairing;
+#[cfg(target_os = "linux")]
+#[path = "linux/nvclocks.rs"]
+mod nvclocks;
 mod pipeline;
 mod punktfunk1;
 mod pwinit;
@@ -127,6 +130,15 @@ fn real_main() -> Result<()> {
    #[cfg(target_os = "windows")]
    crate::capture::dxgi::install_gpu_pref_hook();

+    // NVIDIA clock hygiene (Linux, host subcommands only): install the P2-cap driver profile and,
+    // under PUNKTFUNK_PIN_CLOCKS, hold the NVML core-clock floor for the host lifetime (reset on
+    // exit via the guard's Drop). No-op off NVIDIA / on the tool subcommands.
+    #[cfg(target_os = "linux")]
+    let _nv_clocks = match args.first().map(String::as_str) {
+        Some("serve") | Some("punktfunk1-host") => nvclocks::on_host_start(),
+        _ => None,
+    };
+
    match args.first().map(String::as_str) {
        // The host: the native punktfunk/1 plane + management API by default (secure), and — with
        // --gamestream — the GameStream/Moonlight-compat planes too (opt-in; #5/#9 trusted-LAN caveat).