From fa4c798a25f5c3ccf410c571498a1e67be594e45 Mon Sep 17 00:00:00 2001
From: enricobuehler <buehler@unom.io>
Date: Thu, 2 Jul 2026 13:03:04 +0000
Subject: [PATCH] =?UTF-8?q?feat(host/linux):=20amdgpu=20session=20clock=20?=
 =?UTF-8?q?pin=20=E2=80=94=20gpuclocks=20grows=20the=20AMD=20arm?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

nvclocks.rs -> gpuclocks.rs. PUNKTFUNK_PIN_CLOCKS=1 now also pins every
amdgpu card's power_dpm_force_performance_level to high for the host
lifetime (prior level restored on exit) — the measured AMD encode-
latency lever: VCN per-frame time roughly doubles when a 60fps paced
trickle lets clocks sag (~8 ms/frame sagged vs ~4.4 ms/frame with
clocks hot, 1440p HEVC on the 780M). Root-gated by sysfs ownership;
non-root degrades to a logged recipe (validated live on the AMD box).
Opt-in stays deliberate: box-wide power-management override, wrong on
battery/Deck.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 crates/punktfunk-host/src/linux/gpuclocks.rs | 157 ++++++++++++++++---
 crates/punktfunk-host/src/main.rs            |   8 +-
 2 files changed, 135 insertions(+), 30 deletions(-)
diff --git a/crates/punktfunk-host/src/linux/gpuclocks.rs b/crates/punktfunk-host/src/linux/gpuclocks.rs
index ec09789..35f13eb 100644
--- a/crates/punktfunk-host/src/linux/gpuclocks.rs
+++ b/crates/punktfunk-host/src/linux/gpuclocks.rs
@@ -1,6 +1,15 @@
-//! NVIDIA clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency
+//! GPU clock / P-state hygiene for the Linux host — the "easy-scene p99" lever (host-latency
 //! plan Tier 1B): the driver's adaptive P-state ramps clocks down between bursty encode frames,
-//! so every frame re-pays a spin-up. Two independent halves, both no-ops off NVIDIA:
+//! so every frame re-pays a spin-up. This is NOT theoretical — measured on the 780M (VCN 4),
+//! a 1440p HEVC encode takes ~4.4 ms/frame with the clocks hot (120 fps pacing) but ~8 ms/frame
+//! at a 60 fps duty cycle: the sag doubles per-frame encode latency.
+//!
+//! **AMD** (`PUNKTFUNK_PIN_CLOCKS=1`, root-gated by sysfs ownership): write `high` into each
+//! amdgpu card's `power_dpm_force_performance_level` for the host lifetime, restoring the prior
+//! value on exit. Non-root gets EACCES → logged once with the privilege recipe. Deliberately
+//! opt-in: it defeats power management box-wide and is wrong on battery (Steam Deck!).
+//!
+//! **NVIDIA** — two independent halves, both no-ops off NVIDIA:
 //!
 //! 1. **`CudaNoStablePerfLimit` application profile** (no root needed): a CUDA/NVENC context
 //!    clamps GeForce into the P2 performance state — reduced *memory* clock — for the process
@@ -111,52 +120,148 @@ fn flag_truthy(name: &str) -> bool {
         .unwrap_or(false)
 }
 
-/// Host-lifetime guard: holds the NVML clock floor (when armed) and resets it on drop.
-pub struct ClockGuard {
+/// An NVML session holding locked-clock floors on one or more NVIDIA devices.
+struct NvmlPin {
     nvml: Nvml,
     pinned: Vec<NvmlDevice>,
 }
 
+/// One amdgpu card we forced to `high`: `path` is its `power_dpm_force_performance_level` sysfs
+/// attribute, `restore` the trimmed pre-pin value that `ClockGuard`'s drop writes back.
+struct AmdPin {
+    path: std::path::PathBuf,
+    restore: String,
+}
+
+/// Host-lifetime guard over whatever `on_host_start` armed: `nvml` is the NVIDIA locked-clock
+/// session (if any), `amd` the amdgpu cards forced to `high`; dropping the guard undoes both.
+pub struct ClockGuard {
+    nvml: Option<NvmlPin>,
+    amd: Vec<AmdPin>,
+}
+
 // SAFETY: `ClockGuard` holds opaque NVML device handles + resolved fn pointers from the loaded
-// driver library. NVML is documented thread-safe, the handles are plain driver tokens with no
-// thread affinity, and the guard is only ever *moved* (held in `main`, dropped once at exit) and
-// used through `&mut`/ownership — never shared. Transfer across threads is therefore sound.
+// driver library (plus plain sysfs paths/strings). NVML is documented thread-safe, the handles are
+// plain driver tokens with no thread affinity, and the guard is only ever *moved* (held in `main`,
+// dropped once at exit) and used through `&mut`/ownership — never shared. Transfer across threads
+// is therefore sound.
 unsafe impl Send for ClockGuard {}
 
 impl Drop for ClockGuard {
     fn drop(&mut self) {
-        // SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this live
-        // NVML session (init'd in `pin_clocks`, shut down only here, after the resets). The calls
-        // take the handle by value and return an int status — no Rust memory is borrowed.
-        unsafe {
-            for &dev in &self.pinned {
-                let _ = (self.nvml.reset_locked_clocks)(dev);
+        if let Some(pin) = &self.nvml {
+            // SAFETY: each handle in `pinned` came from `nvmlDeviceGetHandleByIndex_v2` on this
+            // live NVML session (init'd in `pin_nvidia`, shut down only here, after the resets).
+            // The calls take the handle by value and return an int status — no Rust memory is
+            // borrowed.
+            unsafe {
+                for &dev in &pin.pinned {
+                    let _ = (pin.nvml.reset_locked_clocks)(dev);
+                }
+                let _ = (pin.nvml.shutdown)();
+            }
+            if !pin.pinned.is_empty() {
+                tracing::info!("NVIDIA clock floor released (locked clocks reset)");
             }
-            let _ = (self.nvml.shutdown)();
         }
-        if !self.pinned.is_empty() {
-            tracing::info!("GPU clock floor released (locked clocks reset)");
+        for pin in &self.amd {
+            match std::fs::write(&pin.path, &pin.restore) {
+                Ok(()) => tracing::info!(
+                    card = %pin.path.display(),
+                    restored = %pin.restore,
+                    "amdgpu performance level restored"
+                ),
+                Err(e) => tracing::warn!(
+                    card = %pin.path.display(),
+                    error = %e,
+                    "could not restore amdgpu performance level"
+                ),
+            }
         }
     }
 }
 
-/// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the P2-cap
-/// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the NVML core-clock floor.
-/// Returns the guard keeping the floor for the host lifetime. No-op (`None`) off NVIDIA.
+/// Startup hook for the host subcommands (`serve` / `punktfunk1-host`): install the NVIDIA P2-cap
+/// application profile and, when `PUNKTFUNK_PIN_CLOCKS` is set, arm the vendor clock pin (NVML
+/// core-clock floor / amdgpu `high` performance level). Returns the guard keeping the pins for
+/// the host lifetime. `None` when nothing was armed.
 pub fn on_host_start() -> Option<ClockGuard> {
-    if !nvidia_present() {
-        return None;
+    if nvidia_present() {
+        ensure_cuda_perf_profile();
     }
-    ensure_cuda_perf_profile();
     if !flag_truthy("PUNKTFUNK_PIN_CLOCKS") {
         return None;
     }
-    pin_clocks()
+    let nvml = if nvidia_present() { pin_nvidia() } else { None };
+    let amd = pin_amdgpu();
+    if nvml.is_none() && amd.is_empty() {
+        return None;
+    }
+    Some(ClockGuard { nvml, amd })
+}
+
+/// Force every amdgpu card's DPM performance level to `high` for the session — the AMD encode-
+/// latency lever (measured 8 → 4.4 ms/frame at 1440p on the 780M once clocks stay hot). Returns
+/// one `AmdPin` per card it changed so the guard's drop can write the prior level back; cards
+/// already at `high` are skipped and never restored. Root-gated by sysfs ownership: EACCES warns
+/// once with the recipe, any other write error warns per card; either way the host runs unpinned.
+fn pin_amdgpu() -> Vec<AmdPin> {
+    let mut pins = Vec::new();
+    let mut denied = false;
+    let Ok(cards) = std::fs::read_dir("/sys/class/drm") else {
+        return pins;
+    };
+    for entry in cards.flatten() {
+        let name = entry.file_name().to_string_lossy().into_owned();
+        // Cards only (card0, card1, …) — not connectors (card0-DP-1) or render nodes.
+        if !name.starts_with("card") || name.contains('-') {
+            continue;
+        }
+        let dev = entry.path().join("device");
+        // `driver` symlinks to the bound driver's directory: match its final path *component*
+        // (`Path::ends_with`), not a string suffix, so only a driver named `amdgpu` qualifies.
+        let driver = std::fs::read_link(dev.join("driver"));
+        if !driver.map(|t| t.ends_with("amdgpu")).unwrap_or(false) {
+            continue;
+        }
+        let path = dev.join("power_dpm_force_performance_level");
+        let Ok(prev) = std::fs::read_to_string(&path) else {
+            continue;
+        };
+        let prev = prev.trim().to_string();
+        if prev == "high" {
+            continue; // already pinned (externally) — nothing to do or restore
+        }
+        match std::fs::write(&path, "high") {
+            Ok(()) => {
+                tracing::info!(
+                    card = %name,
+                    was = %prev,
+                    "amdgpu performance level pinned to high (encode clock sag removed) — \
+                     restored on host exit"
+                );
+                pins.push(AmdPin {
+                    path,
+                    restore: prev,
+                });
+            }
+            Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => denied = true,
+            Err(e) => tracing::warn!(card = %name, error = %e, "amdgpu perf-level write failed"),
+        }
+    }
+    if denied {
+        tracing::warn!(
+            "PUNKTFUNK_PIN_CLOCKS: writing power_dpm_force_performance_level requires root — \
+             grant it via a boot oneshot / udev rule chowning the attribute, or run the host as \
+             a system service. The host keeps running unpinned"
+        );
+    }
+    pins
 }
 
 /// Floor the core clock at TDP/base on every NVIDIA device (reset first, so a stale pin from a
 /// crashed previous run is replaced rather than compounded).
-fn pin_clocks() -> Option<ClockGuard> {
+fn pin_nvidia() -> Option<NvmlPin> {
     let nvml = match Nvml::load() {
         Some(n) => n,
         None => {
@@ -222,9 +327,9 @@ fn pin_clocks() -> Option<ClockGuard> {
         }
         tracing::info!(
             devices = pinned.len(),
-            "GPU core-clock floor armed (min=TDP/base, max=boost) — released on host exit"
+            "NVIDIA core-clock floor armed (min=TDP/base, max=boost) — released on host exit"
         );
-        Some(ClockGuard { nvml, pinned })
+        Some(NvmlPin { nvml, pinned })
     }
 }
 
diff --git a/crates/punktfunk-host/src/main.rs b/crates/punktfunk-host/src/main.rs
index 5f657c3..7116c23 100644
--- a/crates/punktfunk-host/src/main.rs
+++ b/crates/punktfunk-host/src/main.rs
@@ -33,6 +33,9 @@ mod drm_sync;
 mod encode;
 mod gamestream;
 mod gpu;
+#[cfg(target_os = "linux")]
+#[path = "linux/gpuclocks.rs"]
+mod gpuclocks;
 mod hdr;
 mod inject;
 #[cfg(target_os = "windows")]
@@ -45,9 +48,6 @@ mod library;
 mod mgmt;
 mod mgmt_token;
 mod native_pairing;
-#[cfg(target_os = "linux")]
-#[path = "linux/nvclocks.rs"]
-mod nvclocks;
 mod pipeline;
 mod punktfunk1;
 mod pwinit;
@@ -135,7 +135,7 @@ fn real_main() -> Result<()> {
-    // exit via the guard's Drop). No-op off NVIDIA / on the tool subcommands.
+    // exit via the guard's Drop). `None` when nothing was armed / on the tool subcommands.
     #[cfg(target_os = "linux")]
     let _nv_clocks = match args.first().map(String::as_str) {
-        Some("serve") | Some("punktfunk1-host") => nvclocks::on_host_start(),
+        Some("serve") | Some("punktfunk1-host") => gpuclocks::on_host_start(),
         _ => None,
     };