From 3b3e8b4ba9b80d9af997034ea468262d0410184a Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Wed, 17 Jun 2026 16:12:29 +0000 Subject: [PATCH] perf(host/windows): elevate capture/encode/send thread CPU priority (Apollo-parity) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apollo runs its capture thread at CRITICAL and its encoder thread at ABOVE_NORMAL; we set none. Our GPU work is already HIGH priority, but the GPU scheduler can only favour commands we've SUBMITTED — a normal-priority thread descheduled by a CPU-heavy game submits the convert/encode late, so the HIGH GPU priority never bites (consistent with the measured "NVENC engine idle yet the encode waits ~15 ms"). Raise the WGC helper's capture+encode loop and the single-process capture+encode loop to THREAD_PRIORITY_HIGHEST, and the transmit thread to ABOVE_NORMAL, via a cross-platform boost_thread_priority() (Windows-only effect — the Linux host caps the game via gamescope so its threads aren't starved). Not yet built/validated on the GPU box (it's down); the cross-platform side compiles (cargo check) and the Windows calls are cross-checked against the windows-0.62 API. Co-Authored-By: Claude Opus 4.8 --- crates/punktfunk-host/src/m3.rs | 36 +++++++++++++++++++++++++ crates/punktfunk-host/src/wgc_helper.rs | 6 +++++ 2 files changed, 42 insertions(+) diff --git a/crates/punktfunk-host/src/m3.rs b/crates/punktfunk-host/src/m3.rs index 9af3c5d..05e1e2b 100644 --- a/crates/punktfunk-host/src/m3.rs +++ b/crates/punktfunk-host/src/m3.rs @@ -1828,6 +1828,38 @@ struct FrameMsg { /// speed-test probe bursts (which also need the Session). Decoupling the paced send from encoding /// lets the encode of frame N+1 overlap the transmit of frame N instead of waiting behind its tail. /// Runs until the encode thread drops the frame channel (end of stream) or `stop` is set. 
+/// Raise the current thread's OS scheduling priority so a CPU-heavy game can't deschedule our +/// capture/encode/send threads. This matters even though our GPU work is already HIGH priority: the +/// GPU scheduler can only favour commands we've actually SUBMITTED, so if a normal-priority thread is +/// descheduled by the game it submits the convert/encode late and the GPU priority never bites. Apollo +/// does the same (capture thread CRITICAL, encoder ABOVE_NORMAL). Windows-only — the Linux host caps +/// the game via gamescope, so its threads aren't starved. `critical` → `THREAD_PRIORITY_HIGHEST` +/// (the capture+encode loop); otherwise above-normal (send/relay thread). Failure is only logged. +pub(crate) fn boost_thread_priority(critical: bool) { + #[cfg(target_os = "windows")] + unsafe { + use windows::Win32::System::Threading::{ + GetCurrentThread, SetThreadPriority, THREAD_PRIORITY_ABOVE_NORMAL, + THREAD_PRIORITY_HIGHEST, + }; + let prio = if critical { + THREAD_PRIORITY_HIGHEST + } else { + THREAD_PRIORITY_ABOVE_NORMAL + }; + match SetThreadPriority(GetCurrentThread(), prio) { + Ok(()) => tracing::debug!(critical, "thread priority raised"), + Err(e) => { + tracing::debug!(critical, error = %format!("{e:?}"), "SetThreadPriority failed") + } + } + } + #[cfg(not(target_os = "windows"))] + { + let _ = critical; + } +} + fn send_loop( mut session: Session, frame_rx: std::sync::mpsc::Receiver, @@ -1837,6 +1869,7 @@ fn send_loop( perf: bool, burst_cap: usize, ) { + boost_thread_priority(false); // transmit thread: above-normal (Apollo's encoder-thread level) let mut last_perf = std::time::Instant::now(); let mut last_bytes = 0u64; let mut last_send_dropped = 0u64; @@ -1995,6 +2028,9 @@ fn virtual_stream( probe_rx: std::sync::mpsc::Receiver, probe_result_tx: tokio::sync::mpsc::UnboundedSender, ) -> Result<()> { + // This thread runs the capture+encode loop (single-process: Linux / synthetic / NO_WGC DDA) — or + // tail-calls the relay below. 
Elevate it so a CPU-heavy game can't deschedule our GPU submission. + boost_thread_priority(true); // Windows two-process secure-desktop path: when the host runs as SYSTEM (required for the secure // desktop + SendInput), WGC can't activate in-process, so we capture the normal desktop via a // helper spawned in the user session and relay its AUs. (Single-process WGC/DDA is used as the diff --git a/crates/punktfunk-host/src/wgc_helper.rs b/crates/punktfunk-host/src/wgc_helper.rs index 864cd58..bc519bf 100644 --- a/crates/punktfunk-host/src/wgc_helper.rs +++ b/crates/punktfunk-host/src/wgc_helper.rs @@ -46,6 +46,12 @@ pub fn run(opts: HelperOptions) -> Result<()> { "WGC helper starting (user session)" ); + // This thread does WGC capture + video-processor convert + NVENC submit — the GPU-submitting hot + // path. Elevate its OS priority so a CPU-heavy game can't deschedule it and delay submission (which + // would leave our HIGH GPU priority with nothing queued to prioritise). Apollo likewise raises + // its capture thread (to CRITICAL; we use THREAD_PRIORITY_HIGHEST). + crate::m3::boost_thread_priority(true); + // Capture the EXISTING SudoVDA output by GDI name / target id — do NOT create one (the host owns // the virtual output + its isolate/restore; a second topology owner breaks DDA recovery). let target = WinCaptureTarget {