perf(host): latency hardening for the game-vs-encode GPU contention collapse
Verified, prioritized analysis in docs/host-latency-plan.md (multi-agent investigation + adversarial verification). Lands the two low-risk tiers: Tier 2B — Linux scheduling hygiene: - boost_thread_priority now nices the capture/encode (-10) and send (-5) threads on Linux (setpriority, best-effort; no-op without CAP_SYS_NICE), and the wrong "gamescope caps the game" doc-comment is corrected. - CUDA context created with CU_CTX_SCHED_BLOCKING_SYNC (frees a core on the shared box instead of busy-spinning on completion). - Copies moved off the default stream onto a per-thread highest-priority CUDA stream (cuStreamCreateWithPriority, graceful NULL-stream fallback) with a per-stream sync that no longer blocks on the other worker thread's in-flight copies. Stream priority is measure-then-keep (NVIDIA Linux may ignore it); never regresses. Tier 3A — Windows session tuning (new session_tuning.rs, raw C-ABI FFI, no-op off Windows): once-per-process 1ms timer + DwmEnableMMCSS + HIGH priority class; per-thread MMCSS "Games" + keep-display-awake. Wired into both the native (boost_thread_priority) and GameStream (stream.rs) paths. We had zero session tuning before (Apollo streaming_will_start parity). Tier 2A (Linux NV12 convert) is specified but intentionally not landed: it is colour-correctness-critical and needs A/B validation on a GPU box with a display (green-screen risk). Builds + clippy + fmt green on Linux. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -60,6 +60,8 @@ fn run(
|
||||
force_idr: &AtomicBool,
|
||||
video_cap: &std::sync::Mutex<Option<Box<dyn Capturer>>>,
|
||||
) -> Result<()> {
|
||||
// GameStream capture/encode thread: apply Windows session tuning (no-op off Windows).
|
||||
crate::session_tuning::on_hot_thread();
|
||||
// Reject an out-of-range client mode before allocating capture/encode buffers.
|
||||
encode::validate_dimensions(cfg.codec, cfg.width, cfg.height)
|
||||
.context("client-requested video mode")?;
|
||||
@@ -219,6 +221,8 @@ fn spawn_sender(
|
||||
std::thread::Builder::new()
|
||||
.name("punktfunk-send".into())
|
||||
.spawn(move || {
|
||||
// GameStream send thread: Windows session tuning + MMCSS (no-op off Windows).
|
||||
crate::session_tuning::on_hot_thread();
|
||||
// Chunk pacing: 16 packets per burst, bursts spread across the send budget.
|
||||
const PACE_CHUNK: usize = 16;
|
||||
let budget = frame_interval.mul_f32(0.75);
|
||||
|
||||
@@ -33,6 +33,7 @@ mod punktfunk1;
|
||||
mod pwinit;
|
||||
#[cfg(target_os = "windows")]
|
||||
mod service;
|
||||
mod session_tuning;
|
||||
mod spike;
|
||||
mod vdisplay;
|
||||
#[cfg(target_os = "windows")]
|
||||
|
||||
@@ -1831,10 +1831,15 @@ struct FrameMsg {
|
||||
/// capture/encode/send threads. This matters even though our GPU work is already HIGH priority: the
|
||||
/// GPU scheduler can only favour commands we've actually SUBMITTED, so if a normal-priority thread is
|
||||
/// descheduled by the game it submits the convert/encode late and the GPU priority never bites. Apollo
|
||||
/// does the same (capture thread CRITICAL, encoder ABOVE_NORMAL). Windows-only — the Linux host caps
|
||||
/// the game via gamescope, so its threads aren't starved. `critical` → highest non-realtime class
|
||||
/// does the same (capture thread CRITICAL, encoder ABOVE_NORMAL). The Linux host needs this too: an
|
||||
/// uncapped GPU-saturating title (e.g. CS2 direct on a virtual output, not capped by gamescope) is
|
||||
/// also a CPU hog and can deschedule our submit threads. `critical` → highest non-realtime class
|
||||
/// (the capture+encode loop); otherwise above-normal (the send/relay thread).
|
||||
pub(crate) fn boost_thread_priority(critical: bool) {
|
||||
// Windows host-process/thread session tuning (timer 1ms, DWM MMCSS, HIGH class once; MMCSS +
|
||||
// keep-display-awake per thread). No-op off Windows. Both stream threads call us, so this covers
|
||||
// capture/encode (critical) and send (non-critical).
|
||||
crate::session_tuning::on_hot_thread();
|
||||
#[cfg(target_os = "windows")]
|
||||
unsafe {
|
||||
use windows::Win32::System::Threading::{
|
||||
@@ -1853,7 +1858,27 @@ pub(crate) fn boost_thread_priority(critical: bool) {
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
// Best-effort nice of the CALLING thread. On Linux `setpriority(PRIO_PROCESS, 0, …)` acts on
|
||||
// the calling thread (the kernel resolves who==0 to the current task/tid), and both call
|
||||
// sites run inside their worker thread — so this nices exactly the capture/encode (critical)
|
||||
// and send (non-critical) threads, nothing else. Silently no-ops without CAP_SYS_NICE / a
|
||||
// raised RLIMIT_NICE, which is fine. We deliberately do NOT use SCHED_RR/FIFO by default: a
|
||||
// realtime CPU class can preempt the compositor AND the game's own render thread, adding the
|
||||
// very frame-time we refuse to add (opt-in only — see PUNKTFUNK_SCHED_RR).
|
||||
let nice = if critical { -10 } else { -5 };
|
||||
let rc = unsafe { libc::setpriority(libc::PRIO_PROCESS, 0, nice) };
|
||||
if rc == 0 {
|
||||
tracing::debug!(critical, nice, "thread nice raised");
|
||||
} else {
|
||||
tracing::debug!(
|
||||
critical,
|
||||
"setpriority(nice) no-op (needs CAP_SYS_NICE / RLIMIT_NICE)"
|
||||
);
|
||||
}
|
||||
}
|
||||
#[cfg(not(any(target_os = "windows", target_os = "linux")))]
|
||||
{
|
||||
let _ = critical;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
//! Windows host-process session tuning — parity with Apollo/Sunshine `streaming_will_start`.
|
||||
//!
|
||||
//! The default Windows process runs at NORMAL priority and ~15.6 ms timer granularity, and lets the
|
||||
//! GPU/display idle. Under a GPU-saturating game that starves our capture/encode/send threads (the
|
||||
//! "240→40 fps collapse"), and the coarse timer floors any precise frame pacing. This raises the
|
||||
//! process out of the default scheduling class, gives DWM and our hot threads MMCSS priority, drops
|
||||
//! the timer to 1 ms, and keeps the (virtual) display awake for the session.
|
||||
//!
|
||||
//! Raw C-ABI FFI (winmm/kernel32/dwmapi/avrt) rather than the `windows` crate so it builds without
|
||||
//! pulling new windows-rs features. No-op on non-Windows. Per-thread effects (MMCSS, execution
|
||||
//! state) auto-revert at thread exit (= session end); the process-wide bits revert at process exit.
|
||||
//! See `docs/host-latency-plan.md` Tier 3A.
|
||||
|
||||
#[cfg(target_os = "windows")]
|
||||
mod imp {
|
||||
#![allow(non_snake_case)]
|
||||
use std::ffi::c_void;
|
||||
use std::sync::OnceLock;
|
||||
|
||||
type Handle = *mut c_void;
|
||||
type Bool = i32;
|
||||
|
||||
#[link(name = "winmm")]
|
||||
extern "system" {
|
||||
fn timeBeginPeriod(uPeriod: u32) -> u32;
|
||||
}
|
||||
#[link(name = "kernel32")]
|
||||
extern "system" {
|
||||
fn GetCurrentProcess() -> Handle;
|
||||
fn SetPriorityClass(hProcess: Handle, dwPriorityClass: u32) -> Bool;
|
||||
fn SetThreadExecutionState(esFlags: u32) -> u32;
|
||||
}
|
||||
#[link(name = "dwmapi")]
|
||||
extern "system" {
|
||||
fn DwmEnableMMCSS(fEnableMMCSS: Bool) -> i32; // HRESULT
|
||||
}
|
||||
#[link(name = "avrt")]
|
||||
extern "system" {
|
||||
fn AvSetMmThreadCharacteristicsW(TaskName: *const u16, TaskIndex: *mut u32) -> Handle;
|
||||
}
|
||||
|
||||
const HIGH_PRIORITY_CLASS: u32 = 0x0000_0080;
|
||||
const ES_CONTINUOUS: u32 = 0x8000_0000;
|
||||
const ES_SYSTEM_REQUIRED: u32 = 0x0000_0001;
|
||||
const ES_DISPLAY_REQUIRED: u32 = 0x0000_0002;
|
||||
|
||||
static PROCESS_TUNED: OnceLock<()> = OnceLock::new();
|
||||
|
||||
/// Process-wide tuning, applied exactly once. Reverts at process exit. Best-effort: each call is
|
||||
/// independent and a failure is ignored (e.g. a non-elevated host may not get HIGH class).
|
||||
fn tune_process_once() {
|
||||
PROCESS_TUNED.get_or_init(|| unsafe {
|
||||
// 1 ms timer granularity (default ~15.6 ms) — the floor for precise frame pacing and the
|
||||
// encode|send split's sub-ms sleeps.
|
||||
timeBeginPeriod(1);
|
||||
// Run DWM's compositor work at MMCSS priority — helps the compose-rate ceiling hold up
|
||||
// under a saturating game (capture is bounded by how often DWM composes).
|
||||
DwmEnableMMCSS(1);
|
||||
// Lift the whole host above NORMAL so a CPU-saturating game can't deschedule our
|
||||
// control/capture/encode/send threads on the CPU (Apollo does the same).
|
||||
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
|
||||
tracing::info!("windows session tuning applied (timer 1ms, DWM MMCSS, HIGH priority)");
|
||||
});
|
||||
}
|
||||
|
||||
/// Call at the start of each capture/encode/send (hot stream) thread. Applies the process-wide
|
||||
/// tuning once, registers the calling thread with MMCSS ("Games"), and asserts the display/system
|
||||
/// must stay awake for as long as this thread lives. The MMCSS handle is intentionally leaked and
|
||||
/// the execution-state assertion is bound to this thread — both are reverted by the OS when the
|
||||
/// thread exits, so a session that ends tears them down without explicit bookkeeping.
|
||||
pub fn on_hot_thread() {
|
||||
tune_process_once();
|
||||
unsafe {
|
||||
SetThreadExecutionState(ES_CONTINUOUS | ES_DISPLAY_REQUIRED | ES_SYSTEM_REQUIRED);
|
||||
let task: Vec<u16> = "Games\0".encode_utf16().collect();
|
||||
let mut idx: u32 = 0;
|
||||
// Leak the handle: these are session/process-lifetime worker threads; the OS reverts the
|
||||
// MMCSS characteristics at thread exit.
|
||||
let _ = AvSetMmThreadCharacteristicsW(task.as_ptr(), &mut idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_os = "windows")]
|
||||
pub use imp::on_hot_thread;
|
||||
|
||||
/// No-op on non-Windows (Linux uses `setpriority` nice + CUDA stream priority instead — see
|
||||
/// `punktfunk1::boost_thread_priority` and `zerocopy::cuda`).
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
pub fn on_hot_thread() {}
|
||||
@@ -27,6 +27,15 @@ pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*
|
||||
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
|
||||
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;
|
||||
|
||||
/// `CUctx_flags` (cuda.h): block the CPU on an OS primitive while waiting for the GPU instead of
|
||||
/// busy-spinning. On this shared box (compositor + send thread on the same cores) spinning a core
|
||||
/// to detect copy completion steals CPU from the very threads we want scheduled; BLOCKING_SYNC
|
||||
/// frees it. Default (`CU_CTX_SCHED_AUTO=0`) heuristically picks SPIN vs YIELD by core count.
|
||||
const CU_CTX_SCHED_BLOCKING_SYNC: c_uint = 0x04;
|
||||
|
||||
/// `cuStreamCreateWithPriority` flag: don't implicitly synchronize with the legacy NULL stream.
|
||||
const CU_STREAM_NON_BLOCKING: c_uint = 0x01;
|
||||
|
||||
/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
|
||||
#[repr(C)]
|
||||
#[derive(Default)]
|
||||
@@ -91,8 +100,15 @@ extern "C" {
|
||||
element_size: c_uint,
|
||||
) -> CUresult;
|
||||
fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult;
|
||||
fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult;
|
||||
fn cuCtxSynchronize() -> CUresult;
|
||||
fn cuMemcpy2DAsync_v2(copy: *const CUDA_MEMCPY2D, stream: CUstream) -> CUresult;
|
||||
fn cuStreamSynchronize(stream: CUstream) -> CUresult;
|
||||
// Greatest/least stream priority the driver exposes (greatest = numerically lowest).
|
||||
fn cuCtxGetStreamPriorityRange(least: *mut c_int, greatest: *mut c_int) -> CUresult;
|
||||
fn cuStreamCreateWithPriority(
|
||||
stream: *mut CUstream,
|
||||
flags: c_uint,
|
||||
priority: c_int,
|
||||
) -> CUresult;
|
||||
|
||||
// GL interop (cudaGL.h) — these symbols have NO `_v2` suffix. `cuGraphicsEGLRegisterImage`
|
||||
// is Tegra-only on the desktop driver, so we go EGLImage → GL texture → register the texture.
|
||||
@@ -162,7 +178,10 @@ pub fn context() -> Result<CUcontext> {
|
||||
let mut dev: CUdevice = 0;
|
||||
ck(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?;
|
||||
let mut ctx: CUcontext = std::ptr::null_mut();
|
||||
ck(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate_v2")?;
|
||||
ck(
|
||||
cuCtxCreate_v2(&mut ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev),
|
||||
"cuCtxCreate_v2",
|
||||
)?;
|
||||
ctx
|
||||
};
|
||||
// Racy first-init is fine: the winner's context is used; a loser leaks one context (rare,
|
||||
@@ -176,6 +195,57 @@ pub fn make_current() -> Result<()> {
|
||||
unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") }
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
/// Per-thread copy stream. `None` until first use; `Some(null)` means "creation failed, use the
|
||||
/// default (NULL) stream". Per-thread (not shared) so each worker's `cuStreamSynchronize` waits
|
||||
/// only on ITS OWN copies — the old per-frame `cuCtxSynchronize` was context-wide and also
|
||||
/// blocked on the other worker thread's in-flight NULL-stream copies.
|
||||
static COPY_STREAM: std::cell::Cell<Option<CUstream>> = const { std::cell::Cell::new(None) };
|
||||
}
|
||||
|
||||
/// The calling thread's highest-priority copy stream (lazily created; context must be current).
|
||||
/// Carries the greatest stream priority the driver exposes — a scheduler hint that nudges our
|
||||
/// copies ahead of the game's queued compute. NOTE: stream priority is an intra-process hint and
|
||||
/// NVIDIA's Linux driver may ignore it / not preempt a saturating game's graphics context; this is
|
||||
/// "measure-then-keep", and it never regresses (falls back to the NULL stream). The greatest
|
||||
/// priority is the numerically-lowest value (`greatest` from `cuCtxGetStreamPriorityRange`).
|
||||
fn copy_stream() -> CUstream {
|
||||
COPY_STREAM.with(|cell| {
|
||||
if let Some(s) = cell.get() {
|
||||
return s;
|
||||
}
|
||||
let stream = unsafe {
|
||||
let (mut least, mut greatest) = (0i32, 0i32);
|
||||
if cuCtxGetStreamPriorityRange(&mut least, &mut greatest) != 0 {
|
||||
std::ptr::null_mut()
|
||||
} else {
|
||||
let mut s: CUstream = std::ptr::null_mut();
|
||||
if cuStreamCreateWithPriority(&mut s, CU_STREAM_NON_BLOCKING, greatest) != 0 {
|
||||
std::ptr::null_mut()
|
||||
} else {
|
||||
tracing::debug!(
|
||||
priority = greatest,
|
||||
"CUDA high-priority copy stream created"
|
||||
);
|
||||
s
|
||||
}
|
||||
}
|
||||
};
|
||||
cell.set(Some(stream));
|
||||
stream
|
||||
})
|
||||
}
|
||||
|
||||
/// Issue `copy` on this thread's priority stream and block until it completes. Replaces the
|
||||
/// per-frame `cuMemcpy2D_v2` + context-wide `cuCtxSynchronize` pair: same completion guarantee
|
||||
/// (the source dmabuf is safe to recycle once this returns), but the wait is scoped to our own
|
||||
/// stream and the copy carries the high priority hint.
|
||||
unsafe fn copy_blocking(copy: &CUDA_MEMCPY2D, what: &str) -> Result<()> {
|
||||
let stream = copy_stream();
|
||||
ck(cuMemcpy2DAsync_v2(copy, stream), what)?;
|
||||
ck(cuStreamSynchronize(stream), "cuStreamSynchronize")
|
||||
}
|
||||
|
||||
/// Allocate one pitched device buffer for `width`x`height` 4-byte pixels; returns `(ptr, pitch)`.
|
||||
fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
|
||||
let mut ptr: CUdeviceptr = 0;
|
||||
@@ -342,7 +412,8 @@ impl RegisteredTexture {
|
||||
}
|
||||
|
||||
/// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then
|
||||
/// unmap. The `cuCtxSynchronize` ensures `dst` is ready before the source dmabuf is recycled.
|
||||
/// unmap. The copy is synchronized (on our priority stream) before unmap so `dst` is ready
|
||||
/// before the source dmabuf is recycled. Always unmaps, even if the copy errors.
|
||||
pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> {
|
||||
unsafe {
|
||||
ck(
|
||||
@@ -364,13 +435,10 @@ impl RegisteredTexture {
|
||||
Height: dst.height as usize,
|
||||
..Default::default()
|
||||
};
|
||||
let r = cuMemcpy2D_v2(©);
|
||||
let s = cuCtxSynchronize();
|
||||
let res = copy_blocking(©, "cuMemcpy2DAsync_v2");
|
||||
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
||||
ck(r, "cuMemcpy2D_v2")?;
|
||||
ck(s, "cuCtxSynchronize")?;
|
||||
res
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -393,11 +461,7 @@ pub fn copy_device_to_device(
|
||||
Height: src.height as usize,
|
||||
..Default::default()
|
||||
};
|
||||
unsafe {
|
||||
ck(cuMemcpy2D_v2(©), "cuMemcpy2D_v2(dev->dev)")?;
|
||||
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
|
||||
}
|
||||
Ok(())
|
||||
unsafe { copy_blocking(©, "cuMemcpy2DAsync_v2(dev->dev)") }
|
||||
}
|
||||
|
||||
impl Drop for RegisteredTexture {
|
||||
@@ -500,10 +564,7 @@ pub fn copy_pitched_to_buffer(
|
||||
Height: dst.height as usize,
|
||||
..Default::default()
|
||||
};
|
||||
unsafe {
|
||||
ck(cuMemcpy2D_v2(©), "cuMemcpy2D_v2(ext->dev)")?;
|
||||
// The copy must finish before the dmabuf is requeued to the producer.
|
||||
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
|
||||
}
|
||||
Ok(())
|
||||
// copy_blocking syncs our priority stream before returning, so the copy is complete before the
|
||||
// dmabuf is requeued to the producer.
|
||||
unsafe { copy_blocking(©, "cuMemcpy2DAsync_v2(ext->dev)") }
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user