diff --git a/crates/punktfunk-host/src/capture.rs b/crates/punktfunk-host/src/capture.rs index 9be175a..2e70b79 100644 --- a/crates/punktfunk-host/src/capture.rs +++ b/crates/punktfunk-host/src/capture.rs @@ -59,7 +59,7 @@ pub struct OutputFormat { /// Produce GPU-resident D3D11 frames (zero-copy for a GPU encoder — NVENC/AMF/QSV) rather than CPU /// staging. `false` **only** for the GPU-less software encoder. pub gpu: bool, - /// HDR: the capturer converts to 10-bit (IDD-push FP16 → `Rgb10a2`; the DDA secure-desktop HDR hint). + /// HDR: the capturer converts to 10-bit (IDD-push FP16 → `P010`, or `Rgb10a2` for a 4:4:4 source). /// `false` = 8-bit SDR. pub hdr: bool, /// Full-chroma 4:4:4 session: the capturer must keep full chroma — deliver packed **RGB** @@ -380,23 +380,12 @@ pub fn capture_virtual_output( .map(|c| Box::new(c) as Box) } -/// `PUNKTFUNK_NO_WGC=1` forces the pure single-process DDA (Desktop Duplication) path everywhere: it -/// skips WGC in [`capture_virtual_output`] AND bypasses the two-process secure-desktop relay (so even a -/// SYSTEM host captures in-process via DDA, the way Apollo does — one capturer for the normal AND the -/// secure desktop). For bringing DDA up to parity / validating it on its own; all the WGC code stays -/// compiled and comes back the moment the flag is unset. -#[cfg(target_os = "windows")] -pub(crate) fn wgc_disabled() -> bool { - crate::config::config().no_wgc -} - #[cfg(target_os = "windows")] pub fn capture_virtual_output( vout: crate::vdisplay::VirtualOutput, want: OutputFormat, - capture: crate::session_plan::CaptureBackend, + _capture: crate::session_plan::CaptureBackend, ) -> Result> { - use crate::session_plan::CaptureBackend; let target = vout.win_capture.clone().ok_or_else(|| { anyhow::anyhow!( "SudoVDA target not yet an active display (needs a WDDM GPU to activate it)" @@ -404,97 +393,36 @@ pub fn capture_virtual_output( })?; let pref = vout.preferred_mode; let keep = vout.keepalive; - // Full-chroma 4:4:4 needs a full-chroma RGB source. The IDD-push and WGC paths emit subsampled - // NV12/P010 by default, which can't reconstruct 4:4:4; route a 4:4:4 session to DDA, which delivers - // RGB (Bgra) when its `chroma_444` flag is set. (IDD-push/WGC 4:4:4 capture is a follow-up.) - if want.chroma_444 && capture != CaptureBackend::Dda { - tracing::info!("4:4:4 session — using DDA capture (RGB source) instead of {capture:?}"); - return dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444) - .map(|c| Box::new(c) as Box); - } - // P2 direct frame push (kill DDA): consume frames straight from the pf-vdisplay driver's shared - // ring — no Desktop Duplication, no win32u reparenting hook. Resolved once in the `SessionPlan` - // (was re-derived from `config().idd_push` here); `IddPush` takes the keepalive (owns the virtual - // display) so there's no fall-through. - if capture == CaptureBackend::IddPush { - // Recreate the monitor + ring per session (fix-teardown): a FRESH monitor reliably gets a - // working IddCx swap-chain, whereas a REUSED monitor's swap-chain dies after ~2 sessions and - // the host can't revive it. The driver's recreate crash (target id resolved to 0) is fixed by - // stamping target_id onto the monitor context. The ring is always FP16 (the driver composes - // the IDD in FP16); `want_hdr` selects the per-frame conversion (FP16 → Rgb10a2 vs Bgra). - // If IDD-push can't open OR the driver doesn't attach to the ring within a few seconds (e.g. a - // hybrid-GPU render mismatch), fall back to DDA so the session is NEVER left black (audit §5.1). - // `open()` hands the keepalive back on failure so DDA can take ownership of the virtual display. - match idd_push::IddPushCapturer::open(target.clone(), pref, want.hdr, keep) { - Ok(c) => return Ok(Box::new(c) as Box), - Err((e, keep)) => { - tracing::warn!( - error = %format!("{e:#}"), - "IDD-push open/attach failed — falling back to DDA" - ); - return dxgi::DuplCapturer::open( - target, - pref, - keep, - want.gpu, - false, - want.chroma_444, - ) - .map(|c| Box::new(c) as Box); - } - } - } - // WGC (Windows.Graphics.Capture) is the default: it captures the COMPOSED desktop including the - // overlay/independent-flip planes DXGI Desktop Duplication misses (the frozen-HDR-animation bug), - // and has no ACCESS_LOST-on-overlay churn. DDA stays available via PUNKTFUNK_CAPTURE=dda and is - // the secure-desktop (lock/UAC) fallback (WGC can't capture those). `keep` is moved into the - // chosen backend (it owns the SudoVDA keepalive), so there's no open-time auto-fallback. The - // backend choice (`dda`/`dxgi`/`PUNKTFUNK_NO_WGC` → DDA, else WGC) is now resolved once in the plan. - if capture == CaptureBackend::Dda { - return dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444) - .map(|c| Box::new(c) as Box); - } - // WGC default, with a watchdog'd DDA fallback. WGC's Direct3D11CaptureFramePool::CreateFreeThreaded - // intermittently HANGS on the headless SudoVDA (IddCx) display — a blocking call we can't error out - // of in place. So run WGC open on a dedicated thread and bound it: if it doesn't finish in time - // (hang) or errors, fall back to the reliable DDA path so the session is NEVER left black. WGC, - // when it opens, captures the composed desktop (overlay/MPO-correct HDR — fixes frozen animations); - // DDA is the safety net (+ the secure-desktop path). The encode thread is set MTA so the WGC - // objects built on the watchdog thread (also MTA) are usable here; the keepalive is handed to WGC - // only on success, else to DDA. A hung watchdog thread is abandoned (holds no keepalive). - // SAFETY: `RoInitialize` is a combase FFI call that initializes the WinRT apartment for the calling - // thread. It takes the `RO_INIT_MULTITHREADED` enum by value and borrows no memory, so there is no - // pointer/lifetime/aliasing obligation; it is safe on any thread and idempotent — a second call on a - // thread already in a compatible apartment returns S_FALSE / RPC_E_CHANGED_MODE, which we discard. - // Runs on the encode thread that goes on to use the WGC (WinRT) objects built by the watchdog thread. - unsafe { - let _ = windows::Win32::System::WinRT::RoInitialize( - windows::Win32::System::WinRT::RO_INIT_MULTITHREADED, - ); - } - let (tx, rx) = std::sync::mpsc::channel(); - let t = target.clone(); - let _ = std::thread::Builder::new() - .name("wgc-open".into()) - .spawn(move || { - let _ = tx.send(wgc::WgcCapturer::open(t, pref)); - }); - match rx.recv_timeout(std::time::Duration::from_secs(5)) { - Ok(Ok(mut c)) => { - c.attach_keepalive(keep); - Ok(Box::new(c) as Box) - } - Ok(Err(e)) => { - tracing::warn!(error = %format!("{e:#}"), "WGC open failed — falling back to DDA"); - dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444) - .map(|c| Box::new(c) as Box) - } - Err(_) => { - tracing::warn!("WGC open timed out (CreateFreeThreaded hang on the virtual display) — falling back to DDA"); - dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444) - .map(|c| Box::new(c) as Box) - } - } + // IDD direct-push is the sole Windows capture path: consume frames straight from the pf-vdisplay + // driver's shared ring (in-process, Session 0 — it captures the secure desktop too; no Desktop + // Duplication, no WGC helper). A FRESH monitor + ring is created per session: a REUSED monitor's + // swap-chain dies after ~2 sessions and can't be revived. The ring is always FP16 when the display + // is HDR (the driver composes the IDD in FP16); `want.hdr` proactively enables advanced color and + // selects the per-frame conversion (FP16 → P010 vs BGRA → NV12). `IddPushCapturer` takes the + // keepalive (it owns the virtual display). There is NO fallback (DDA + the WGC relay were removed): + // if it can't open or the driver doesn't attach, the session fails cleanly and the client reconnects. + idd_push::IddPushCapturer::open(target, pref, want.hdr, keep) + .map(|c| Box::new(c) as Box) + .map_err(|(e, _keep)| e.context("IDD-push capture open (no fallback)")) +} + +/// Whether the active capturer can deliver a full-chroma (RGB) source for a 4:4:4 HEVC encode. The +/// negotiator gates 4:4:4 on this so the host honestly downgrades to 4:2:0 when the capturer can only +/// produce subsampled frames. Linux (the portal capturer feeding CPU RGB → `yuv444p`) can; the Windows +/// IDD-push path delivers subsampled NV12/P010 today, so full-chroma capture there is a follow-up. +#[cfg(target_os = "linux")] +pub(crate) fn capturer_supports_444() -> bool { + true +} +#[cfg(target_os = "windows")] +pub(crate) fn capturer_supports_444() -> bool { + // IDD-push 4:4:4 (full-chroma RGB from the FP16 ring) is the next step; until then the sole Windows + // capturer delivers subsampled NV12/P010 only, so the host honestly negotiates 4:2:0. + false +} +#[cfg(not(any(target_os = "linux", target_os = "windows")))] +pub(crate) fn capturer_supports_444() -> bool { + false } #[cfg(not(any(target_os = "linux", target_os = "windows")))] @@ -506,14 +434,9 @@ pub fn capture_virtual_output( anyhow::bail!("virtual-output capture requires Linux or Windows") } -// Goal-1 stage 6: the Windows backends live under `capture/windows/`, the Linux one under `capture/linux/` -// (`#[path]` keeps the module names flat, so every `crate::capture::*` path is unchanged). -#[cfg(target_os = "windows")] -#[path = "capture/windows/composed_flip.rs"] -pub mod composed_flip; -#[cfg(target_os = "windows")] -#[path = "capture/windows/desktop_watch.rs"] -pub mod desktop_watch; +// Goal-1 stage 6: the Windows backend lives under `capture/windows/`, the Linux one under `capture/linux/` +// (`#[path]` keeps the module names flat, so every `crate::capture::*` path is unchanged). Windows capture +// is IDD direct-push only — DXGI Desktop Duplication (DDA) and the WGC two-process relay were removed. #[cfg(target_os = "windows")] #[path = "capture/windows/dxgi.rs"] pub mod dxgi; @@ -522,9 +445,3 @@ pub mod dxgi; pub mod idd_push; #[cfg(target_os = "linux")] mod linux; -#[cfg(target_os = "windows")] -#[path = "capture/windows/wgc.rs"] -pub mod wgc; -#[cfg(target_os = "windows")] -#[path = "capture/windows/wgc_relay.rs"] -pub mod wgc_relay; diff --git a/crates/punktfunk-host/src/capture/windows/composed_flip.rs b/crates/punktfunk-host/src/capture/windows/composed_flip.rs deleted file mode 100644 index d7c7895..0000000 --- a/crates/punktfunk-host/src/capture/windows/composed_flip.rs +++ /dev/null @@ -1,217 +0,0 @@ -//! Force-composed-flip overlay (Windows) — make the secure (Winlogon: UAC / lock / login) desktop -//! capturable by Desktop Duplication. -//! -//! The secure desktop's dialog/wallpaper presents via **fullscreen independent-flip / MPO**: it scans -//! out directly, bypassing DWM composition, so `IDXGIOutputDuplication::AcquireNextFrame` returns -//! `DXGI_ERROR_ACCESS_LOST` (born-lost) — there is no composed frame to hand out (the client sees -//! black). Independent-flip requires the presenting app to own the ENTIRE output: putting ANY other -//! visible window on that output disqualifies it, forcing DWM to **composite**, which DDA can then -//! capture. So we keep a tiny, click-through, near-invisible **topmost layered window** alive on the -//! *current input desktop* (which is Winlogon while the secure desktop is up). On a desktop switch the -//! window is orphaned, so a dedicated thread tracks the input desktop and recreates it there. -//! -//! This is the non-input alternative to "tap a key to wake the lock screen": it needs no SendInput and -//! no system-wide registry change (it does NOT disable MPO globally — it only nudges OUR output to -//! composed while a session is live). Effectiveness can be build/driver-dependent; gated by -//! `PUNKTFUNK_FORCE_COMPOSED` (default ON; set =0 to disable). - -// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). -#![deny(clippy::undocumented_unsafe_blocks)] - -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; -use windows::core::w; -use windows::Win32::Foundation::{HWND, LPARAM, LRESULT, WPARAM}; -use windows::Win32::System::LibraryLoader::GetModuleHandleW; -use windows::Win32::System::StationsAndDesktops::{ - CloseDesktop, GetUserObjectInformationW, OpenInputDesktop, SetThreadDesktop, - DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS, UOI_NAME, -}; -use windows::Win32::UI::WindowsAndMessaging::{ - CreateWindowExW, DefWindowProcW, DestroyWindow, DispatchMessageW, PeekMessageW, RegisterClassW, - SetLayeredWindowAttributes, SetWindowPos, ShowWindow, TranslateMessage, HWND_TOPMOST, - LWA_ALPHA, MSG, PM_REMOVE, SWP_NOACTIVATE, SWP_NOMOVE, SWP_NOSIZE, SW_SHOWNOACTIVATE, - WNDCLASSW, WS_EX_LAYERED, WS_EX_NOACTIVATE, WS_EX_TOOLWINDOW, WS_EX_TOPMOST, WS_EX_TRANSPARENT, - WS_POPUP, -}; - -/// A running force-composed-flip overlay. Drop signals the thread to tear down its window + exit. -pub struct ForceComposedFlip { - stop: Arc, -} - -impl ForceComposedFlip { - /// Start the overlay (no-op + `None` if disabled via `PUNKTFUNK_FORCE_COMPOSED=0`). - pub fn start() -> Option { - if std::env::var("PUNKTFUNK_FORCE_COMPOSED").as_deref() == Ok("0") { - tracing::info!("force-composed-flip overlay disabled (PUNKTFUNK_FORCE_COMPOSED=0)"); - return None; - } - let stop = Arc::new(AtomicBool::new(false)); - let st = stop.clone(); - std::thread::Builder::new() - .name("composed-flip".into()) - // SAFETY: `run` is this module's `unsafe fn` (it owns a desktop+window lifecycle via Win32 - // FFI); it takes ownership of `st` (the stop `Arc`) and has no caller-side memory - // precondition. It is designed to own its thread for its whole duration — exactly the - // dedicated `composed-flip` thread spawned here. - .spawn(move || unsafe { run(st) }) - .ok()?; - tracing::info!("force-composed-flip overlay started (Winlogon-aware)"); - Some(ForceComposedFlip { stop }) - } -} - -impl Drop for ForceComposedFlip { - fn drop(&mut self) { - self.stop.store(true, Ordering::Relaxed); - } -} - -extern "system" fn wndproc(hwnd: HWND, msg: u32, wp: WPARAM, lp: LPARAM) -> LRESULT { - // SAFETY: this is the window procedure the OS invokes with the window's own `hwnd` and a real - // message `(msg, wp, lp)`. `DefWindowProcW` performs default processing for exactly those - // parameters (all passed straight through by value); it borrows no Rust memory and is synchronous. - unsafe { DefWindowProcW(hwnd, msg, wp, lp) } -} - -/// Read the current input-desktop name (e.g. "Default" / "Winlogon"); `None` if it can't be read. -unsafe fn input_desktop_name() -> Option { - let desk = OpenInputDesktop( - DESKTOP_CONTROL_FLAGS(0), - false, - DESKTOP_ACCESS_FLAGS(0x0001), - ) - .ok()?; - let mut buf = [0u16; 64]; - let mut needed = 0u32; - let ok = GetUserObjectInformationW( - windows::Win32::Foundation::HANDLE(desk.0), - UOI_NAME, - Some(buf.as_mut_ptr() as *mut _), - (buf.len() * 2) as u32, - Some(&mut needed), - ) - .is_ok(); - let _ = CloseDesktop(desk); - if !ok { - return None; - } - Some( - String::from_utf16_lossy(&buf) - .trim_end_matches('\u{0}') - .to_string(), - ) -} - -/// Create the tiny topmost layered click-through window on the CURRENT thread's desktop. Caller must -/// have `SetThreadDesktop`'d to the target input desktop first. -unsafe fn make_overlay() -> Option { - let hinst = GetModuleHandleW(None).ok()?; - let class = w!("PunktfunkComposedFlip"); - // RegisterClassW is idempotent-ish: a second register for the same name fails harmlessly; we - // ignore the result and rely on the class existing. (One process, so it registers once.) - let wc = WNDCLASSW { - lpfnWndProc: Some(wndproc), - hInstance: hinst.into(), - lpszClassName: class, - ..Default::default() - }; - let atom = RegisterClassW(&wc); - if atom == 0 { - let e = windows::Win32::Foundation::GetLastError(); - // 1410 = ERROR_CLASS_ALREADY_EXISTS is fine (re-register after a desktop switch). - if e.0 != 1410 { - tracing::warn!(err = e.0, "force-composed-flip: RegisterClassW failed"); - } - } - let hwnd = match CreateWindowExW( - WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOPMOST | WS_EX_NOACTIVATE | WS_EX_TOOLWINDOW, - class, - w!(""), - WS_POPUP, - 0, - 0, - 1, - 1, - None, - None, - Some(hinst.into()), - None, - ) { - Ok(h) => h, - Err(e) => { - let le = windows::Win32::Foundation::GetLastError(); - tracing::warn!(err = %format!("{e:?}"), last = le.0, - "force-composed-flip: CreateWindowExW failed"); - return None; - } - }; - // alpha=1: technically visible (so it disqualifies independent-flip) but imperceptible. - let _ = SetLayeredWindowAttributes(hwnd, windows::Win32::Foundation::COLORREF(0), 1, LWA_ALPHA); - let _ = ShowWindow(hwnd, SW_SHOWNOACTIVATE); - let _ = SetWindowPos( - hwnd, - Some(HWND_TOPMOST), - 0, - 0, - 0, - 0, - SWP_NOMOVE | SWP_NOSIZE | SWP_NOACTIVATE, - ); - Some(hwnd) -} - -unsafe fn run(stop: Arc) { - let mut cur_desktop: Option = None; - let mut hwnd: Option = None; - let mut ticks: u32 = 0; - while !stop.load(Ordering::Relaxed) { - // Follow the input desktop: if it changed (Default↔Winlogon), re-attach this thread and - // recreate the window there (a window is bound to the desktop it was created on). - let name = input_desktop_name(); - if name != cur_desktop { - if let Some(h) = hwnd.take() { - let _ = DestroyWindow(h); - } - if let Ok(desk) = OpenInputDesktop( - DESKTOP_CONTROL_FLAGS(0), - false, - DESKTOP_ACCESS_FLAGS(0x1000_0000), // GENERIC_ALL (incl. DESKTOP_CREATEWINDOW=0x0002) - ) { - if SetThreadDesktop(desk).is_ok() { - hwnd = make_overlay(); - tracing::info!(desktop = ?name, created = hwnd.is_some(), - "force-composed-flip: overlay (re)created on input desktop"); - } - // Leak `desk` while it's the thread desktop (closing the current thread desktop is UB). - } - cur_desktop = name; - } - // Re-assert topmost periodically (other windows on the secure desktop can push us down) and - // pump our message queue so the window stays responsive/composited. - if let Some(h) = hwnd { - let _ = SetWindowPos( - h, - Some(HWND_TOPMOST), - 0, - 0, - 0, - 0, - SWP_NOMOVE | SWP_NOSIZE | SWP_NOACTIVATE, - ); - let mut msg = MSG::default(); - while PeekMessageW(&mut msg, Some(h), 0, 0, PM_REMOVE).as_bool() { - let _ = TranslateMessage(&msg); - DispatchMessageW(&msg); - } - } - ticks = ticks.wrapping_add(1); - let _ = ticks; - std::thread::sleep(std::time::Duration::from_millis(200)); - } - if let Some(h) = hwnd.take() { - let _ = DestroyWindow(h); - } - tracing::info!("force-composed-flip overlay stopped"); -} diff --git a/crates/punktfunk-host/src/capture/windows/desktop_watch.rs b/crates/punktfunk-host/src/capture/windows/desktop_watch.rs deleted file mode 100644 index 945b943..0000000 --- a/crates/punktfunk-host/src/capture/windows/desktop_watch.rs +++ /dev/null @@ -1,144 +0,0 @@ -//! Input-desktop watcher (Windows) — the authoritative "normal vs secure desktop" signal for the -//! two-process secure-desktop design (design/archive/windows-secure-desktop.md). -//! -//! Windows switches the *input desktop* to "Winlogon" (the secure desktop) for UAC elevation, the -//! lock screen and the login screen, and back to "Default" for the normal session. WGC captures only -//! the normal desktop; DDA-as-SYSTEM captures the secure one. A dedicated thread polls the input -//! desktop's NAME (WTS session notifications miss UAC entirely, so the name is the reliable signal) -//! and publishes it as an atomic the capture mux + input path read. - -// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). -#![deny(clippy::undocumented_unsafe_blocks)] - -use std::sync::atomic::{AtomicBool, AtomicU8, Ordering}; -use std::sync::Arc; -use std::time::Duration; -use windows::Win32::Foundation::HANDLE; -use windows::Win32::System::StationsAndDesktops::{ - CloseDesktop, GetUserObjectInformationW, OpenInputDesktop, DESKTOP_ACCESS_FLAGS, - DESKTOP_CONTROL_FLAGS, UOI_NAME, -}; - -/// The normal interactive desktop ("Default") — WGC capture applies. -pub const DESKTOP_NORMAL: u8 = 0; -/// The secure desktop ("Winlogon": UAC / lock / login) — DDA-as-SYSTEM capture applies. -pub const DESKTOP_SECURE: u8 = 1; - -/// Polls the input-desktop name on its own thread and publishes [`DESKTOP_NORMAL`]/[`DESKTOP_SECURE`]. -pub struct DesktopWatcher { - state: Arc, - stop: Arc, -} - -impl DesktopWatcher { - pub fn start() -> Self { - // Compute the CURRENT desktop synchronously before returning, so the first reader (the capture - // mux) sees the real state immediately. Otherwise a session that begins already on the secure - // desktop (e.g. a reconnect to a locked box) would read DESKTOP_NORMAL for the first poll - // interval and relay one stale normal-desktop frame — the "flash of the login screen" bug. - // SAFETY: `is_secure_desktop` is this module's `unsafe fn` — unsafe only because it calls Win32 - // desktop FFI (`OpenInputDesktop`/`GetUserObjectInformationW`/`CloseDesktop`), with no caller - // precondition; it opens, names, and closes the input-desktop handle internally and is safe to - // call from any thread (here, on the thread running `DesktopWatcher::start`). - let initial = if unsafe { is_secure_desktop() } { - DESKTOP_SECURE - } else { - DESKTOP_NORMAL - }; - let state = Arc::new(AtomicU8::new(initial)); - let stop = Arc::new(AtomicBool::new(false)); - let s = state.clone(); - let st = stop.clone(); - let _ = std::thread::Builder::new() - .name("desktop-watch".into()) - .spawn(move || { - // Debounce: only publish a change after the raw reading has been stable for several - // polls. The input desktop flaps Default↔Winlogon transiently during a lock/UAC - // transition; publishing every flap makes the capture mux thrash (rebuild storms). - const STABLE_POLLS: u32 = 4; // ~80ms - let mut published = initial; - let mut candidate = initial; - let mut stable = 0u32; - while !st.load(Ordering::Relaxed) { - // SAFETY: same as in `start` — `is_secure_desktop` is self-contained Win32 desktop - // FFI with no caller precondition, called here on the dedicated `desktop-watch` - // polling thread. - let v = if unsafe { is_secure_desktop() } { - DESKTOP_SECURE - } else { - DESKTOP_NORMAL - }; - if v == candidate { - stable = stable.saturating_add(1); - } else { - candidate = v; - stable = 1; - } - if stable >= STABLE_POLLS && candidate != published { - s.store(candidate, Ordering::Release); - published = candidate; - tracing::info!( - desktop = if candidate == DESKTOP_SECURE { - "Winlogon(secure)" - } else { - "Default" - }, - "input desktop changed (debounced)" - ); - } - std::thread::sleep(Duration::from_millis(20)); - } - }); - DesktopWatcher { state, stop } - } - - /// The shared atomic ([`DESKTOP_NORMAL`]/[`DESKTOP_SECURE`]) for the capture mux to read. - pub fn state(&self) -> Arc { - self.state.clone() - } - - /// True when the secure (Winlogon) desktop is the input desktop right now. - pub fn is_secure(&self) -> bool { - self.state.load(Ordering::Acquire) == DESKTOP_SECURE - } -} - -impl Drop for DesktopWatcher { - fn drop(&mut self) { - self.stop.store(true, Ordering::Relaxed); - } -} - -/// True if the current input desktop is "Winlogon" (the secure desktop). Best-effort: if the desktop -/// can't be opened or named, report not-secure (the safe default — keep WGC/normal capture). -pub(crate) unsafe fn is_secure_desktop() -> bool { - let desk = match OpenInputDesktop( - DESKTOP_CONTROL_FLAGS(0), - false, - DESKTOP_ACCESS_FLAGS(DESKTOP_READOBJECTS), - ) { - Ok(d) => d, - Err(_) => return false, - }; - let mut buf = [0u16; 64]; - let mut needed = 0u32; - let ok = GetUserObjectInformationW( - HANDLE(desk.0), - UOI_NAME, - Some(buf.as_mut_ptr() as *mut _), - (buf.len() * 2) as u32, - Some(&mut needed), - ) - .is_ok(); - let _ = CloseDesktop(desk); - if !ok { - return false; - } - let name = String::from_utf16_lossy(&buf); - name.trim_end_matches('\u{0}') - .eq_ignore_ascii_case("Winlogon") -} - -/// `DESKTOP_READOBJECTS` access right (the windows crate exposes it as a typed flag; we need the raw -/// bit for `OpenInputDesktop`'s access mask). -const DESKTOP_READOBJECTS: u32 = 0x0001; diff --git a/crates/punktfunk-host/src/capture/windows/dxgi.rs b/crates/punktfunk-host/src/capture/windows/dxgi.rs index db47edf..b19d657 100644 --- a/crates/punktfunk-host/src/capture/windows/dxgi.rs +++ b/crates/punktfunk-host/src/capture/windows/dxgi.rs @@ -1,63 +1,39 @@ -//! DXGI Desktop Duplication capture (Windows) — the analogue of the PipeWire portal capturer. -//! Creates a D3D11 device on the SudoVDA adapter (by LUID), finds the matching output (by GDI -//! name), duplicates it, and on each `AcquireNextFrame` copies the desktop image into a CPU-readable -//! staging texture → tightly-packed BGRA (the GPU-less path that feeds the software encoder). A -//! future zero-copy path returns `FramePayload::D3d11` for NVENC. -//! -//! Validates only with a real GPU + an *activated* SudoVDA monitor (`DuplicateOutput` needs a live -//! WDDM output). Compiles on the GPU-less VM; the pure helpers are unit-tested there. +//! Shared Windows GPU primitives — D3D11 device creation, GPU scheduling priority hooks, +//! HLSL shader compilation, HDR FP16→P010 conversion ([`HdrP010Converter`]), video-engine +//! colour conversion ([`VideoConverter`]), and the IDD-push capture identity +//! ([`WinCaptureTarget`], [`pack_luid`]). Consumed by [`super::idd_push`]. +//! DXGI Desktop Duplication has been removed; this module contains no capturer. // Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). #![deny(clippy::undocumented_unsafe_blocks)] -use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; -use anyhow::{anyhow, bail, Context, Result}; +use anyhow::{bail, Context, Result}; use std::ffi::c_void; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; -use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +use std::sync::atomic::{AtomicU64, Ordering}; use windows::core::{s, Interface, PCSTR}; use windows::Win32::Foundation::{HMODULE, LUID}; use windows::Win32::Graphics::Direct3D::Fxc::D3DCompile; use windows::Win32::Graphics::Direct3D::{ ID3DBlob, D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0, D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST, - D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP, }; use windows::Win32::Graphics::Direct3D11::{ - D3D11CreateDevice, ID3D11BlendState, ID3D11Buffer, ID3D11Device, ID3D11DeviceContext, - ID3D11PixelShader, ID3D11RenderTargetView, ID3D11SamplerState, ID3D11ShaderResourceView, - ID3D11Texture2D, ID3D11VertexShader, D3D11_BIND_CONSTANT_BUFFER, D3D11_BIND_FLAG, - D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_BLEND_DESC, - D3D11_BLEND_INV_DEST_COLOR, D3D11_BLEND_INV_SRC_ALPHA, D3D11_BLEND_ONE, D3D11_BLEND_OP_ADD, - D3D11_BLEND_SRC_ALPHA, D3D11_BUFFER_DESC, D3D11_COLOR_WRITE_ENABLE_ALL, D3D11_COMPARISON_NEVER, - D3D11_CPU_ACCESS_READ, D3D11_CPU_ACCESS_WRITE, D3D11_CREATE_DEVICE_BGRA_SUPPORT, - D3D11_FILTER_MIN_MAG_MIP_POINT, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, - D3D11_MAP_WRITE_DISCARD, D3D11_RENDER_TARGET_BLEND_DESC, D3D11_RENDER_TARGET_VIEW_DESC, - D3D11_RENDER_TARGET_VIEW_DESC_0, D3D11_RTV_DIMENSION_TEXTURE2D, D3D11_SAMPLER_DESC, - D3D11_SDK_VERSION, D3D11_SUBRESOURCE_DATA, D3D11_TEX2D_RTV, D3D11_TEXTURE2D_DESC, - D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, D3D11_USAGE_DYNAMIC, D3D11_USAGE_STAGING, - D3D11_VIEWPORT, + D3D11CreateDevice, ID3D11Buffer, ID3D11Device, ID3D11DeviceContext, ID3D11PixelShader, + ID3D11RenderTargetView, ID3D11SamplerState, ID3D11ShaderResourceView, ID3D11Texture2D, + ID3D11VertexShader, D3D11_BIND_CONSTANT_BUFFER, D3D11_BIND_RENDER_TARGET, + D3D11_BIND_SHADER_RESOURCE, D3D11_BUFFER_DESC, D3D11_COMPARISON_NEVER, D3D11_CPU_ACCESS_READ, + D3D11_CPU_ACCESS_WRITE, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_FILTER_MIN_MAG_MIP_POINT, + D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_MAP_WRITE_DISCARD, + D3D11_RENDER_TARGET_VIEW_DESC, D3D11_RENDER_TARGET_VIEW_DESC_0, D3D11_RTV_DIMENSION_TEXTURE2D, + D3D11_SAMPLER_DESC, D3D11_SDK_VERSION, D3D11_SUBRESOURCE_DATA, D3D11_TEX2D_RTV, + D3D11_TEXTURE2D_DESC, D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, D3D11_USAGE_DYNAMIC, + D3D11_USAGE_STAGING, D3D11_VIEWPORT, }; use windows::Win32::Graphics::Dxgi::Common::{ - DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_P010, DXGI_FORMAT_R10G10B10A2_UNORM, - DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16_UNORM, DXGI_FORMAT_R16_UNORM, - DXGI_SAMPLE_DESC, + DXGI_FORMAT, DXGI_FORMAT_P010, DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16_UNORM, + DXGI_FORMAT_R16_UNORM, DXGI_SAMPLE_DESC, }; -use windows::Win32::Graphics::Dxgi::{ - CreateDXGIFactory1, IDXGIAdapter1, IDXGIDevice, IDXGIDevice1, IDXGIFactory1, IDXGIOutput1, - IDXGIOutput5, IDXGIOutput6, IDXGIOutputDuplication, IDXGIResource, DXGI_ERROR_ACCESS_LOST, - DXGI_ERROR_DEVICE_REMOVED, DXGI_ERROR_DEVICE_RESET, DXGI_ERROR_INVALID_CALL, - DXGI_ERROR_MODE_CHANGE_IN_PROGRESS, DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC, - DXGI_OUTDUPL_FRAME_INFO, DXGI_OUTDUPL_POINTER_SHAPE_INFO, - DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR, -}; -use windows::Win32::System::StationsAndDesktops::{ - CloseDesktop, OpenInputDesktop, SetThreadDesktop, DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS, -}; -use windows::Win32::UI::WindowsAndMessaging::SetCursorPos; +use windows::Win32::Graphics::Dxgi::{IDXGIAdapter1, IDXGIDevice, IDXGIDevice1}; -/// The Windows capture identity carried out of the SudoVDA backend in -/// [`crate::vdisplay::VirtualOutput`]: which adapter + which GDI output to duplicate. -#[derive(Clone, Debug)] pub struct WinCaptureTarget { /// Packed DXGI adapter LUID (`(HighPart << 32) | (LowPart & 0xffff_ffff)`). pub adapter_luid: i64, @@ -84,85 +60,6 @@ pub fn pack_luid(luid: LUID) -> i64 { ((luid.HighPart as i64) << 32) | (luid.LowPart as i64 & 0xffff_ffff) } -/// Does a fixed-size UTF-16 GDI device name (NUL-padded, e.g. `DXGI_OUTPUT_DESC::DeviceName`) -/// equal `target`? -fn gdi_name_matches(name16: &[u16], target: &str) -> bool { - let s = String::from_utf16_lossy(name16); - s.trim_end_matches('\u{0}') == target -} - -/// Copy a row-padded BGRA surface (`pitch` >= `w*4`) into a tightly-packed `w*4*h` buffer. -fn depad_bgra(src: &[u8], pitch: usize, w: usize, h: usize) -> Vec { - let row = w * 4; - let mut out = vec![0u8; row * h]; - for y in 0..h { - out[y * row..y * row + row].copy_from_slice(&src[y * pitch..y * pitch + row]); - } - out -} - -/// Re-find the live `IDXGIOutput1` for a GDI name across all adapters (the SudoVDA monitor is -/// enumerated under the rendering GPU). Used to recover after ACCESS_LOST, where the cached handle -/// may be stale. -pub(crate) unsafe fn find_output(gdi_name: &str) -> Result<(IDXGIAdapter1, IDXGIOutput1)> { - let factory: IDXGIFactory1 = CreateDXGIFactory1().context("CreateDXGIFactory1")?; - let mut i = 0u32; - while let Ok(a) = factory.EnumAdapters1(i) { - let mut j = 0u32; - while let Ok(o) = a.EnumOutputs(j) { - let od = o.GetDesc()?; - if gdi_name_matches(&od.DeviceName, gdi_name) { - // Diagnostic: which ADAPTER does this output sit under, and at what LUID? If this LUID - // BOUNCES across an ACCESS_LOST storm, the output is being reparented between adapters - // (the multi-GPU/IDD case Apollo's win32u hook + SET_RENDER_ADAPTER fix). If it's STABLE, - // the storm is something else (e.g. HDR independent-flip DDA can't capture). - if let Ok(ad) = a.GetDesc1() { - let name = String::from_utf16_lossy(&ad.Description); - tracing::info!( - output = gdi_name, - adapter = name.trim_end_matches('\u{0}'), - luid = format!( - "{:08x}:{:08x}", - ad.AdapterLuid.HighPart, ad.AdapterLuid.LowPart - ), - "find_output: output resolved under adapter" - ); - } - return Ok((a.clone(), o.cast::()?)); - } - j += 1; - } - i += 1; - } - bail!("no DXGI output named {gdi_name} (gone after ACCESS_LOST?)") -} - -/// Read the source display's static HDR mastering metadata via `IDXGIOutput6::GetDesc1` (the -/// monitor IS the "mastering display" for a desktop capture, exactly as Sunshine/Apollo treat it). -/// GetDesc1 exposes the colour primaries, white point, and min/max mastering luminance but NOT a -/// content light level, so MaxCLL/MaxFALL are left `0` (unknown — the display tone-maps from the -/// mastering luminance). `None` if the output can't be cast to `IDXGIOutput6` or the call fails. -unsafe fn read_output_hdr_meta(output: &IDXGIOutput1) -> Option { - let out6: IDXGIOutput6 = output.cast().ok()?; - let d = out6.GetDesc1().ok()?; - let m = crate::hdr::hdr_meta_from_display( - (d.RedPrimary[0], d.RedPrimary[1]), - (d.GreenPrimary[0], d.GreenPrimary[1]), - (d.BluePrimary[0], d.BluePrimary[1]), - (d.WhitePoint[0], d.WhitePoint[1]), - d.MaxLuminance, - d.MinLuminance, - 0, // MaxCLL: GetDesc1 has no content light level (Apollo zeroes it) - 0, // MaxFALL - ); - tracing::info!( - max_nits = d.MaxLuminance, - min_nits = d.MinLuminance, - max_full_frame_nits = d.MaxFullFrameLuminance, - "read source display HDR mastering metadata (GetDesc1)" - ); - Some(m) -} /// Create a fresh D3D11 device + context on a specific adapter (driver_type UNKNOWN with an explicit /// adapter). Used at open and on every ACCESS_LOST: a device created on one desktop cannot sustain a @@ -228,9 +125,8 @@ fn configured_gpu_priority_class() -> Option { /// Enable SE_INC_BASE_PRIORITY on the CURRENT process token (best-effort) — the kernel gates the /// HIGH/REALTIME GPU scheduling-priority bump on it. Held by SYSTEM/Administrators; a UAC-FILTERED -/// token (what `CreateProcessAsUserW` hands the WGC helper) does NOT have it, which is why the helper -/// can't elevate itself and the SYSTEM host stamps the class onto it cross-process instead (see -/// [`set_child_gpu_priority_class`]). +/// token does NOT have it, which is why `elevate_process_gpu_priority` may silently no-op in a +/// restricted service context. unsafe fn enable_inc_base_priority() { use windows::core::PCWSTR; use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID}; @@ -300,8 +196,8 @@ unsafe fn d3dkmt_set_scheduling_priority_class( /// alone, which we measured as no help) lets our brief encode preempt the game. Uses HIGH, NOT /// realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC (Apollo downgrades it for exactly this). /// Runs once per process; best-effort. `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime` -/// (default high). NOTE: in the SYSTEM-host + user-session-helper deployment this self-set NO-OPs in -/// the helper (filtered token), so the host also sets it on the helper via [`set_child_gpu_priority_class`]. +/// (default high). Best-effort: silently no-ops under a UAC-filtered token (the process will not +/// hold SE_INC_BASE_PRIORITY, so the D3DKMT call is a no-op). fn elevate_process_gpu_priority() { use std::sync::Once; static ONCE: Once = Once::new(); @@ -332,192 +228,12 @@ fn elevate_process_gpu_priority() { }); } -/// Set the GPU scheduling-priority class of ANOTHER process we created — the WGC capture+encode helper -/// in the interactive user session. The helper is spawned with the user's UAC-FILTERED token, which -/// lacks SE_INC_BASE_PRIORITY, so its own [`elevate_process_gpu_priority`] silently no-ops and NVENC -/// gets starved under a GPU-saturating game (the "240→40 fps in-game collapse"). The SYSTEM host DOES -/// hold the privilege, so it stamps the class onto the child's process handle right after spawn — the -/// process-level class applies to GPU contexts the child creates afterwards. Best-effort; logged. -/// `PUNKTFUNK_GPU_PRIORITY_CLASS=off` disables it (same knob as the self path). -/// -/// # Safety -/// `process` must be a valid handle to a process we own with at least PROCESS_SET_INFORMATION access -/// (the just-created helper, `PROCESS_INFORMATION::hProcess`). -pub(crate) unsafe fn set_child_gpu_priority_class(process: windows::Win32::Foundation::HANDLE) { - let Some(prio) = configured_gpu_priority_class() else { - return; - }; - enable_inc_base_priority(); // the SYSTEM host holds SE_INC_BASE_PRIORITY; the helper does not - match d3dkmt_set_scheduling_priority_class(process, prio) { - Some(0) => tracing::info!( - priority_class = prio, - "WGC helper GPU scheduling priority class set cross-process from the SYSTEM host \ - (2=normal 4=high 5=realtime)" - ), - Some(st) => tracing::warn!( - status = format!("0x{st:08X}"), - "cross-process D3DKMTSetProcessSchedulingPriorityClass on the WGC helper failed" - ), - None => tracing::warn!( - "D3DKMTSetProcessSchedulingPriorityClass export not found — WGC helper has no GPU priority" - ), - } -} - -/// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST -/// recovery to rebuild the whole capture on the current (possibly secure) input desktop. -unsafe fn reopen_duplication( - gdi_name: &str, - want_hdr: bool, -) -> Result<( - ID3D11Device, - ID3D11DeviceContext, - IDXGIOutput1, - IDXGIOutputDuplication, -)> { - let (adapter, out) = find_output(gdi_name)?; - let (dev, ctx) = make_device(&adapter)?; - let dupl = - duplicate_output(&out, &dev, want_hdr).context("re-DuplicateOutput after ACCESS_LOST")?; - Ok((dev, ctx, out, dupl)) -} - -/// Create the output duplication. Prefer `IDXGIOutput5::DuplicateOutput1` with an explicit -/// encoder-format list (FP16 first, then BGRA8) — Apollo's path. It hands us the desktop's real -/// scanout format (HDR FP16 or SDR BGRA8) and is far more robust to overlay/format changes than -/// legacy `DuplicateOutput` (which always tone-maps to 8-bit BGRA — the source of much of the -/// ACCESS_LOST churn). Requires the process be per-monitor-v2 DPI aware (set at startup in -/// [`install_gpu_pref_hook`]). Falls back to legacy `DuplicateOutput` if Output5 is unavailable or -/// `DuplicateOutput1` fails. -unsafe fn duplicate_output( - output: &IDXGIOutput1, - device: &ID3D11Device, - want_hdr: bool, -) -> Result { - if let Ok(output5) = output.cast::() { - // For an HDR session, request FP16 FIRST so DuplicateOutput1 hands back the desktop's real - // scRGB HDR surface → the `hdr_fp16` path converts it to BT.2020 PQ 10-bit for NVENC Main10. - // For SDR request BGRA8 only: listing FP16 first would make DXGI hand back FP16 even on an SDR - // desktop, wrongly tripping the HDR path. (HDR DDA is used for the secure desktop, where the - // SudoVDA may be in HDR and legacy DuplicateOutput — the SDR-era API — can't capture FP16.) - let formats: &[DXGI_FORMAT] = if want_hdr { - &[DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_B8G8R8A8_UNORM] - } else { - &[DXGI_FORMAT_B8G8R8A8_UNORM] - }; - // RETRY DuplicateOutput1. The caller releases the OLD duplication (self.dupl = None) immediately - // before calling us, and the kernel-side teardown of that duplication is ASYNC — the FIRST - // DuplicateOutput1 right after can race it and return E_ACCESSDENIED ("output still duplicated") - // even though we dropped our only reference. A few short retries let the teardown finish so the - // ROBUST DuplicateOutput1 dup succeeds, instead of falling through to legacy DuplicateOutput, - // which "succeeds" into a fragile dup that churns ACCESS_LOST/MODE_CHANGE every few ms on this - // cross-GPU IDD. (This is why DuplicateOutput1 failed but the legacy call a beat later - // succeeded — pure timing. Apollo retries DuplicateOutput1 2x/200ms for the same reason.) - // Apollo waits 200 ms between DuplicateOutput1 attempts — the kernel-side teardown of the - // just-released duplication takes that long, so short (ms) waits aren't enough. Env-tunable so - // we can dial it without a rebuild: PUNKTFUNK_DUP_RETRY_MS (per-wait, default 200) × - // PUNKTFUNK_DUP_RETRY_N (attempts, default 6) → ~1 s worst case before the legacy fallback. - let retry_ms: u64 = std::env::var("PUNKTFUNK_DUP_RETRY_MS") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(200); - // Default 1 (no retry → immediate legacy fallback). On the secure desktop DuplicateOutput1 - // ALWAYS refuses (only LOGON_UI may use it), so retrying there just blocks the capture thread; - // and on the normal desktop the release-before-reduplicate + gentle recovery already keep the - // legacy dup stable. Raise PUNKTFUNK_DUP_RETRY_N only on a box where DuplicateOutput1 can win - // the old-dup-teardown race (then PUNKTFUNK_DUP_RETRY_MS sets the per-wait, default 200). - // HDR DDA genuinely NEEDS DuplicateOutput1 (legacy DuplicateOutput can't capture an FP16/HDR - // desktop — it returns E_INVALIDARG), so give it several attempts even on the secure desktop - // rather than bailing after one try to the useless legacy fallback. SDR keeps the default 1. - let attempts: u64 = std::env::var("PUNKTFUNK_DUP_RETRY_N") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(if want_hdr { 5 } else { 1 }) - .max(1); - let mut last_err = None; - for attempt in 0..attempts { - match output5.DuplicateOutput1(device, 0, formats) { - Ok(d) => { - if attempt > 0 { - tracing::debug!( - attempt, - "DuplicateOutput1 succeeded on retry (rode out old-dup teardown race)" - ); - } - return Ok(d); - } - Err(e) => { - last_err = Some(e); - if attempt + 1 < attempts { - std::thread::sleep(Duration::from_millis(retry_ms)); - } - } - } - } - if let Some(e) = last_err { - // Expected on the secure (Winlogon) desktop (DuplicateOutput1 is LOGON_UI-only) and fires - // once per gentle recovery there — throttle so a lock dwell doesn't flood the log. The - // legacy fallback below handles it; gentle recovery keeps it from churning. - static FALLBACKS: AtomicU64 = AtomicU64::new(0); - if FALLBACKS.fetch_add(1, Ordering::Relaxed) % 64 == 0 { - tracing::debug!( - error = %format!("{e:?}"), - "DuplicateOutput1 unavailable — using legacy DuplicateOutput (expected on the secure desktop)" - ); - } - } - } - output.DuplicateOutput(device).context("DuplicateOutput") -} - -/// Park the cursor on a duplicated output. A blank virtual display emits NO Desktop Duplication -/// frames until something changes; a pointer move IS a DDA "change", so this kicks the very first -/// `AcquireNextFrame` loose — and lands the cursor on the display the client is viewing. Two moves -/// to distinct points guarantee an actual move even if the cursor already sat at the center. -/// Re-sync the calling (capture) thread to the CURRENT input desktop. MUST be called on EVERY recovery -/// — symmetrically for ENTERING and LEAVING the Winlogon (secure: lock/login/UAC) desktop. Gating it on -/// is_secure_desktop() (the old bug) re-attached only on the way IN, so on the way OUT the capture -/// thread stayed stuck on the gone Winlogon desktop and every rebuild failed → no frames → client -/// timeout → "display disconnected". Apollo calls its equivalent (syncThreadDesktop) before every -/// duplicate. Opening the secure desktop requires SYSTEM (the host relaunches itself as SYSTEM). -/// Matches Apollo by closing the handle right after SetThreadDesktop — the thread keeps the desktop via -/// an internal reference, so this does NOT leak even when called on every recovery. -unsafe fn attach_input_desktop() { - match OpenInputDesktop( - DESKTOP_CONTROL_FLAGS(0), - false, - DESKTOP_ACCESS_FLAGS(0x1000_0000), // GENERIC_ALL - ) { - Ok(desk) => { - if let Err(e) = SetThreadDesktop(desk) { - tracing::warn!(error = %format!("{e:?}"), "attach_input_desktop: SetThreadDesktop FAILED"); - } - let _ = CloseDesktop(desk); - } - Err(e) => { - tracing::warn!(error = %format!("{e:?}"), "attach_input_desktop: OpenInputDesktop FAILED") - } - } -} - -pub(crate) unsafe fn nudge_cursor_onto(output: &IDXGIOutput1) { - if let Ok(od) = output.GetDesc() { - let r = od.DesktopCoordinates; - let _ = SetCursorPos(r.left + 8, r.top + 8); - let _ = SetCursorPos((r.left + r.right) / 2, (r.top + r.bottom) / 2); - } -} - /// How many times DXGI has actually called our hooked `NtGdiDdDDIGetCachedHybridQueryValue`. If this /// stays 0 while DDA churns with ACCESS_LOST, the hook is NOT on DXGI's GPU-preference path on this /// build (so reparenting can't be the cause — look at composition/independent-flip instead). >0 with /// continuing churn means the hook fires but reparenting isn't the trigger here. static HYBRID_HOOK_HITS: AtomicU64 = AtomicU64::new(0); -pub(crate) fn hybrid_hook_hits() -> u64 { - HYBRID_HOOK_HITS.load(Ordering::Relaxed) -} - // kernel32 — declared directly so we don't pull the whole Win32_System_Diagnostics_Debug feature for // one call. FlushInstructionCache serializes the i-cache after the inline patch: the patch is written // on the main thread but DXGI runs the hooked export from the encode/worker thread (possibly a @@ -526,12 +242,7 @@ pub(crate) fn hybrid_hook_hits() -> u64 { extern "system" { fn FlushInstructionCache(h: *mut c_void, base: *const c_void, size: usize) -> i32; fn GetCurrentProcess() -> *mut c_void; - fn SetThreadExecutionState(es_flags: u32) -> u32; } -const ES_CONTINUOUS: u32 = 0x8000_0000; -const ES_SYSTEM_REQUIRED: u32 = 0x0000_0001; -const ES_DISPLAY_REQUIRED: u32 = 0x0000_0002; - /// Replacement for `win32u.dll!NtGdiDdDDIGetCachedHybridQueryValue`: always report /// `D3DKMT_GPU_PREFERENCE_STATE_UNSPECIFIED` (3). We fully replace the function (never call the /// original), so no trampoline is needed. (Independent reimplementation of the same technique Apollo @@ -638,43 +349,6 @@ pub(crate) fn install_gpu_pref_hook() { }); } -// DXGI Desktop Duplication deliberately EXCLUDES the hardware cursor from the captured surface (the -// OS composites it separately). We capture the cursor shape/position from the frame info and blend it -// back in — on the GPU for the zero-copy path (a CPU readback would stall the 240 fps pipeline). - -const CURSOR_VS: &str = r" -cbuffer Rect : register(b0) { float4 r; }; -struct VOut { float4 pos : SV_POSITION; float2 uv : TEXCOORD0; }; -VOut main(uint vid : SV_VertexID) { - float2 uv = float2((vid == 1 || vid == 3) ? 1.0 : 0.0, (vid >= 2) ? 1.0 : 0.0); - VOut o; - o.pos = float4(lerp(r.x, r.z, uv.x), lerp(r.y, r.w, uv.y), 0.0, 1.0); - o.uv = uv; - return o; -} -"; - -const CURSOR_PS: &str = r" -Texture2D tx : register(t0); -SamplerState sm : register(s0); -// b0 is shared with the VS: float4 rect, then the HDR cursor params. For SDR white_mul=1 / decode=0 -// so this is a no-op (returns the raw sampled BGRA, blended in the display's native sRGB space). For -// HDR the cursor is composited onto a LINEAR scRGB FP16 surface where 1.0 = 80 nits, so we sRGB→ -// linear decode (correct alpha blending + no dark edge fringe) and scale to HDR graphics white -// (~203 nits → white_mul = 203/80) so the cursor isn't ~2.5x too dim vs the HDR desktop. -cbuffer C : register(b0) { float4 rect; float white_mul; float decode; float2 pad; }; -float3 srgb_to_linear(float3 c) { - return c <= 0.04045 ? c / 12.92 : pow((c + 0.055) / 1.055, 2.4); -} -float4 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET { - float4 s = tx.Sample(sm, uv); - float3 rgb = s.rgb; - if (decode > 0.5) { rgb = srgb_to_linear(rgb); } - rgb *= white_mul; - return float4(rgb, s.a); -} -"; - unsafe fn compile_shader(src: &str, entry: PCSTR, target: PCSTR) -> Result> { let mut blob: Option = None; let mut errs: Option = None; @@ -707,240 +381,6 @@ unsafe fn compile_shader(src: &str, entry: PCSTR, target: PCSTR) -> Result>, - /// Layer composited with the inversion blend (white opaque → invert the screen underneath). - /// `None` if it has no pixels. - xor: Option>, -} - -/// GPU cursor overlay: a tiny shader pipeline that blends the cursor texture(s) onto the captured -/// frame. Tied to one D3D11 device; rebuilt when the capturer recreates its device on a desktop switch. -struct CursorCompositor { - vs: ID3D11VertexShader, - ps: ID3D11PixelShader, - cbuf: ID3D11Buffer, - blend: ID3D11BlendState, - /// Inversion blend for masked-color (XOR) cursors like the text I-beam: result = white*(1-dest), - /// i.e. it inverts the screen under the cursor so it's visible on any background. - blend_invert: ID3D11BlendState, - sampler: ID3D11SamplerState, - /// Alpha-blended layer (normal cursor pixels). srv + width + height. - tex_alpha: Option<(ID3D11ShaderResourceView, u32, u32)>, - /// Inversion-blended layer (screen-inverting pixels: masked-color I-beam bar, monochrome invert). - tex_xor: Option<(ID3D11ShaderResourceView, u32, u32)>, -} - -impl CursorCompositor { - unsafe fn new(device: &ID3D11Device) -> Result { - let vsb = compile_shader(CURSOR_VS, s!("main"), s!("vs_5_0"))?; - let psb = compile_shader(CURSOR_PS, s!("main"), s!("ps_5_0"))?; - let mut vs = None; - device.CreateVertexShader(&vsb, None, Some(&mut vs))?; - let mut ps = None; - device.CreatePixelShader(&psb, None, Some(&mut ps))?; - - let cbd = D3D11_BUFFER_DESC { - ByteWidth: 32, // float4 rect + (white_mul, decode, pad, pad) for the HDR cursor PS - Usage: D3D11_USAGE_DYNAMIC, - BindFlags: D3D11_BIND_CONSTANT_BUFFER.0 as u32, - CPUAccessFlags: D3D11_CPU_ACCESS_WRITE.0 as u32, - ..Default::default() - }; - let mut cbuf = None; - device.CreateBuffer(&cbd, None, Some(&mut cbuf))?; - - let mut bd = D3D11_BLEND_DESC::default(); - bd.RenderTarget[0] = D3D11_RENDER_TARGET_BLEND_DESC { - BlendEnable: true.into(), - SrcBlend: D3D11_BLEND_SRC_ALPHA, - DestBlend: D3D11_BLEND_INV_SRC_ALPHA, - BlendOp: D3D11_BLEND_OP_ADD, - SrcBlendAlpha: D3D11_BLEND_ONE, - DestBlendAlpha: D3D11_BLEND_INV_SRC_ALPHA, - BlendOpAlpha: D3D11_BLEND_OP_ADD, - RenderTargetWriteMask: D3D11_COLOR_WRITE_ENABLE_ALL.0 as u8, - }; - let mut blend = None; - device.CreateBlendState(&bd, Some(&mut blend))?; - - // Inversion blend: result.rgb = src*(1-dest) + dest*(1-src.a). A white opaque cursor pixel - // (src=1,a=1) -> 1-dest (inverted); a transparent pixel (src=0,a=0) -> dest (unchanged). - let mut bdi = D3D11_BLEND_DESC::default(); - bdi.RenderTarget[0] = D3D11_RENDER_TARGET_BLEND_DESC { - BlendEnable: true.into(), - SrcBlend: D3D11_BLEND_INV_DEST_COLOR, - DestBlend: D3D11_BLEND_INV_SRC_ALPHA, - BlendOp: D3D11_BLEND_OP_ADD, - SrcBlendAlpha: D3D11_BLEND_ONE, - DestBlendAlpha: D3D11_BLEND_INV_SRC_ALPHA, - BlendOpAlpha: D3D11_BLEND_OP_ADD, - RenderTargetWriteMask: D3D11_COLOR_WRITE_ENABLE_ALL.0 as u8, - }; - let mut blend_invert = None; - device.CreateBlendState(&bdi, Some(&mut blend_invert))?; - - let sd = D3D11_SAMPLER_DESC { - Filter: D3D11_FILTER_MIN_MAG_MIP_POINT, - AddressU: D3D11_TEXTURE_ADDRESS_CLAMP, - AddressV: D3D11_TEXTURE_ADDRESS_CLAMP, - AddressW: D3D11_TEXTURE_ADDRESS_CLAMP, - ComparisonFunc: D3D11_COMPARISON_NEVER, - MaxLOD: f32::MAX, - ..Default::default() - }; - let mut sampler = None; - device.CreateSamplerState(&sd, Some(&mut sampler))?; - - Ok(Self { - vs: vs.context("vs")?, - ps: ps.context("ps")?, - cbuf: cbuf.context("cbuf")?, - blend: blend.context("blend")?, - blend_invert: blend_invert.context("blend_invert")?, - sampler: sampler.context("sampler")?, - tex_alpha: None, - tex_xor: None, - }) - } - - /// Upload one BGRA layer as an immutable shader-resource texture and return its SRV. - unsafe fn upload_layer( - device: &ID3D11Device, - bgra: &[u8], - w: u32, - h: u32, - ) -> Result { - let desc = D3D11_TEXTURE2D_DESC { - Width: w, - Height: h, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_B8G8R8A8_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, - ..Default::default() - }; - let init = D3D11_SUBRESOURCE_DATA { - pSysMem: bgra.as_ptr() as *const c_void, - SysMemPitch: w * 4, - SysMemSlicePitch: 0, - }; - let mut tex: Option = None; - device.CreateTexture2D(&desc, Some(&init), Some(&mut tex))?; - let tex = tex.context("cursor tex")?; - let mut srv = None; - device.CreateShaderResourceView(&tex, None, Some(&mut srv))?; - srv.context("cursor srv") - } - - /// (Re)upload the decomposed cursor layers; either layer may be absent (→ that pass is skipped). - unsafe fn set_shapes(&mut self, device: &ID3D11Device, shape: &CursorShape) -> Result<()> { - self.tex_alpha = match &shape.alpha { - Some(b) => Some(( - Self::upload_layer(device, b, shape.w, shape.h)?, - shape.w, - shape.h, - )), - None => None, - }; - self.tex_xor = match &shape.xor { - Some(b) => Some(( - Self::upload_layer(device, b, shape.w, shape.h)?, - shape.w, - shape.h, - )), - None => None, - }; - Ok(()) - } - - /// Blend ONE cursor layer onto `rtv` (a render-target view of the captured frame) at frame pixel - /// (cx,cy). `invert` selects the inversion blend (screen-inverting pixels); otherwise normal - /// src-over alpha. A shape with both an alpha and an XOR layer is drawn by calling this twice. - #[allow(clippy::too_many_arguments)] - unsafe fn draw_layer( - &self, - ctx: &ID3D11DeviceContext, - rtv: &ID3D11RenderTargetView, - fw: u32, - fh: u32, - cx: i32, - cy: i32, - srv: &ID3D11ShaderResourceView, - cw: u32, - ch: u32, - invert: bool, - // HDR (decode=true): sRGB→linear decode + scale the cursor to `white_mul` × 80 nits, so a - // white cursor hits HDR graphics white (~203 nits) not 80. SDR passes white_mul=1.0, - // decode=false → the PS returns the raw sample (blended in the display's native sRGB space). - // The inversion (masked-color / I-beam) blend operates on the framebuffer reference, so the - // caller passes white_mul=1.0/decode=false for the XOR layer even in HDR. - white_mul: f32, - decode: bool, - ) { - let x0 = (cx as f32 / fw as f32) * 2.0 - 1.0; - let x1 = ((cx + cw as i32) as f32 / fw as f32) * 2.0 - 1.0; - let y0 = 1.0 - (cy as f32 / fh as f32) * 2.0; - let y1 = 1.0 - ((cy + ch as i32) as f32 / fh as f32) * 2.0; - let (mul, dec) = if invert { - (1.0_f32, 0.0_f32) - } else { - (white_mul, if decode { 1.0 } else { 0.0 }) - }; - // cbuf layout: [rect.x, rect.y, rect.z, rect.w, white_mul, decode, pad, pad] (32 bytes). - let cb = [x0, y0, x1, y1, mul, dec, 0.0, 0.0]; - let mut mapped = D3D11_MAPPED_SUBRESOURCE::default(); - if ctx - .Map(&self.cbuf, 0, D3D11_MAP_WRITE_DISCARD, 0, Some(&mut mapped)) - .is_ok() - { - std::ptr::copy_nonoverlapping(cb.as_ptr(), mapped.pData as *mut f32, cb.len()); - ctx.Unmap(&self.cbuf, 0); - } - let vp = D3D11_VIEWPORT { - TopLeftX: 0.0, - TopLeftY: 0.0, - Width: fw as f32, - Height: fh as f32, - MinDepth: 0.0, - MaxDepth: 1.0, - }; - ctx.RSSetViewports(Some(&[vp])); - ctx.OMSetRenderTargets(Some(&[Some(rtv.clone())]), None); - let blend = if invert { - &self.blend_invert - } else { - &self.blend - }; - ctx.OMSetBlendState(blend, Some(&[0.0; 4]), 0xffff_ffff); - ctx.VSSetShader(&self.vs, None); - ctx.PSSetShader(&self.ps, None); - ctx.VSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())])); - ctx.PSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())])); // white_mul/decode for the PS - ctx.PSSetShaderResources(0, Some(&[Some(srv.clone())])); - ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())])); - ctx.IASetInputLayout(None); - ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); - ctx.Draw(4, 0); - // Unbind the render target so the next frame's CopyResource into this texture is unobstructed. - ctx.OMSetRenderTargets(Some(&[None]), None); - } -} - /// Fullscreen-triangle vertex shader for the HDR conversion pass (3 verts, no input layout). const HDR_VS: &str = r" struct VOut { float4 pos : SV_POSITION; float2 uv : TEXCOORD0; }; @@ -953,120 +393,10 @@ VOut main(uint vid : SV_VertexID) { } "; -/// HDR conversion pixel shader: scRGB FP16 desktop (linear, Rec.709 primaries, 1.0 = 80 nits) → -/// BT.2020 primaries → SMPTE ST 2084 (PQ) → written to a 10-bit R10G10B10A2 target for NVENC -/// (HEVC Main10 / HDR10). This is the standard Windows-HDR capture conversion (matches OBS/Sunshine). -const HDR_PS: &str = r" -Texture2D tx : register(t0); -SamplerState sm : register(s0); -// Rec.709 → Rec.2020 primaries (linear). Column-major rows as written, used with mul(M, v). -static const float3x3 BT709_TO_BT2020 = { - 0.627403914, 0.329283038, 0.043313048, - 0.069097292, 0.919540405, 0.011362303, - 0.016391439, 0.088013308, 0.895595253 -}; -float3 pq_oetf(float3 L) { - // L normalized so 1.0 = 10000 nits. ST 2084. - const float m1 = 0.1593017578125; - const float m2 = 78.84375; - const float c1 = 0.8359375; - const float c2 = 18.8515625; - const float c3 = 18.6875; - float3 Lp = pow(saturate(L), m1); - return pow((c1 + c2 * Lp) / (1.0 + c3 * Lp), m2); -} -float4 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET { - float3 scrgb = max(tx.Sample(sm, uv).rgb, 0.0); // scRGB can be negative (wide gamut); clamp - float3 nits = scrgb * 80.0; // scRGB 1.0 = 80 nits → absolute luminance - float3 lin2020 = mul(BT709_TO_BT2020, nits); // primaries conversion (linear) - float3 pq = pq_oetf(lin2020 / 10000.0); // normalize to 10k nits, encode PQ - return float4(pq, 1.0); -} -"; - -/// scRGB FP16 → BT.2020 PQ 10-bit conversion pass. One per capture device (rebuilt on device -/// recreate, like [`CursorCompositor`]). A single fullscreen draw samples the FP16 source SRV and -/// writes PQ-encoded BT.2020 to the bound R10G10B10A2 render target. -pub(crate) struct HdrConverter { - vs: ID3D11VertexShader, - ps: ID3D11PixelShader, - sampler: ID3D11SamplerState, -} - -impl HdrConverter { - pub(crate) unsafe fn new(device: &ID3D11Device) -> Result { - let vsb = compile_shader(HDR_VS, s!("main"), s!("vs_5_0"))?; - let psb = compile_shader(HDR_PS, s!("main"), s!("ps_5_0"))?; - let mut vs = None; - device.CreateVertexShader(&vsb, None, Some(&mut vs))?; - let mut ps = None; - device.CreatePixelShader(&psb, None, Some(&mut ps))?; - let sd = D3D11_SAMPLER_DESC { - Filter: D3D11_FILTER_MIN_MAG_MIP_POINT, - AddressU: D3D11_TEXTURE_ADDRESS_CLAMP, - AddressV: D3D11_TEXTURE_ADDRESS_CLAMP, - AddressW: D3D11_TEXTURE_ADDRESS_CLAMP, - ComparisonFunc: D3D11_COMPARISON_NEVER, - MaxLOD: f32::MAX, - ..Default::default() - }; - let mut sampler = None; - device.CreateSamplerState(&sd, Some(&mut sampler))?; - Ok(Self { - vs: vs.context("hdr vs")?, - ps: ps.context("hdr ps")?, - sampler: sampler.context("hdr sampler")?, - }) - } - - /// Convert `src_srv` (FP16 scRGB) into `dst_rtv` (R10G10B10A2 PQ BT.2020). Opaque pass, no blend. - pub(crate) unsafe fn convert( - &self, - ctx: &ID3D11DeviceContext, - src_srv: &ID3D11ShaderResourceView, - dst_rtv: &ID3D11RenderTargetView, - w: u32, - h: u32, - ) { - let vp = D3D11_VIEWPORT { - TopLeftX: 0.0, - TopLeftY: 0.0, - Width: w as f32, - Height: h as f32, - MinDepth: 0.0, - MaxDepth: 1.0, - }; - ctx.RSSetViewports(Some(&[vp])); - ctx.OMSetRenderTargets(Some(&[Some(dst_rtv.clone())]), None); - ctx.OMSetBlendState(None, None, 0xffff_ffff); // opaque overwrite - ctx.VSSetShader(&self.vs, None); - ctx.PSSetShader(&self.ps, None); - ctx.PSSetShaderResources(0, Some(&[Some(src_srv.clone())])); - ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())])); - ctx.IASetInputLayout(None); - ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST); - ctx.Draw(3, 0); - // Unbind so the next frame can CopyResource into the source and re-RTV the destination. - ctx.OMSetRenderTargets(Some(&[None]), None); - ctx.PSSetShaderResources(0, Some(&[None])); - } -} - -/// Whether `PUNKTFUNK_HDR_SHADER_P010` is truthy (`1`/`true`/`yes`/`on`). When set, the WGC HDR path -/// emits P010 (BT.2020 PQ, 10-bit limited range) DIRECTLY from a shader pass ([`HdrP010Converter`]) -/// instead of tone-mapping to R10G10B10A2 and letting NVENC do the RGB→YUV CSC on the contended SM. -/// Default OFF → the current HDR path (R10→NVENC + the VideoProcessor attempt) is byte-for-byte -/// unchanged. -pub(crate) fn hdr_shader_p010_enabled() -> bool { - std::env::var("PUNKTFUNK_HDR_SHADER_P010") - .map(|v| matches!(v.trim(), "1" | "true" | "yes" | "on")) - .unwrap_or(false) -} - /// P010 **luma** pixel shader: scRGB FP16 desktop (linear, Rec.709 primaries, 1.0 = 80 nits) → /// BT.2020 PQ → BT.2020 non-constant-luminance limited-range Y′, written as a 10-bit code in the high /// 10 bits of an R16_UNORM render-target view of the P010 plane-0 (luma). The colour pipeline -/// (scRGB→nits→BT.2020-linear→PQ) is IDENTICAL to [`HDR_PS`]; only the final RGB→Y + studio-range +/// (scRGB→nits→BT.2020-linear→PQ) is IDENTICAL to the R10 HDR path; only the final RGB→Y + studio-range /// quantization differs. The shared HLSL is factored into [`HDR_P010_COMMON`]. const HDR_P010_COMMON: &str = r" Texture2D tx : register(t0); @@ -1157,7 +487,7 @@ float2 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET { /// passes: full-res luma + half-res chroma). NVIDIA's D3D11 VideoProcessor cannot do RGB→P010 (renders /// green), so we quantize to studio-range 10-bit YUV directly and feed NVENC native P010 — skipping /// NVENC's internal RGB→YUV CSC (which runs on the contended SM). One per capture device (rebuilt on -/// device recreate, like [`HdrConverter`]). +/// device recreate). /// /// Plane writes use per-plane render-target views of the single P010 texture: an `R16_UNORM` RTV /// selects plane 0 (luma, full WxH), an `R16G16_UNORM` RTV selects plane 1 (chroma, W/2 x H/2). This @@ -1800,1595 +1130,3 @@ impl VideoConverter { .context("VideoProcessorBlt") } } - -/// Convert a DXGI pointer shape (color / masked-color / monochrome) into top-down BGRA. -fn convert_pointer_shape(buf: &[u8], si: &DXGI_OUTDUPL_POINTER_SHAPE_INFO) -> Option { - let w = si.Width as usize; - let pitch = si.Pitch as usize; - if w == 0 || pitch == 0 { - return None; - } - // Type is a u32 (newtype constants compared via .0). - if si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR.0 as u32 { - // Straight 32bpp BGRA with a real alpha channel → one alpha-blended layer, no XOR layer. - let h = si.Height as usize; - if buf.len() < pitch * h { - return None; - } - let mut alpha = vec![0u8; w * h * 4]; - for y in 0..h { - for x in 0..w { - let s = y * pitch + x * 4; - let d = (y * w + x) * 4; - alpha[d] = buf[s]; - alpha[d + 1] = buf[s + 1]; - alpha[d + 2] = buf[s + 2]; - alpha[d + 3] = buf[s + 3]; - } - } - Some(CursorShape { - w: w as u32, - h: h as u32, - alpha: Some(alpha), - xor: None, - }) - } else if si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR.0 as u32 { - // 32bpp where the alpha byte is a MASK selector (0x00 or 0xFF), not an alpha. A single shape - // can mix opaque and screen-inverting pixels (the text I-beam: opaque hot-spot dot + an - // inverting bar), so we split it into BOTH layers: - // mask 0x00 -> opaque RGB → ALPHA layer - // mask 0xFF, RGB != 0 -> invert the screen (white) → XOR layer - // mask 0xFF, RGB == 0 -> XOR with black = no-op → transparent in both - let h = si.Height as usize; - if buf.len() < pitch * h { - return None; - } - let mut alpha = vec![0u8; w * h * 4]; - let mut xor = vec![0u8; w * h * 4]; - let (mut any_alpha, mut any_xor) = (false, false); - for y in 0..h { - for x in 0..w { - let s = y * pitch + x * 4; - let d = (y * w + x) * 4; - let (b, g, r, mask) = (buf[s], buf[s + 1], buf[s + 2], buf[s + 3]); - if mask == 0 { - alpha[d] = b; - alpha[d + 1] = g; - alpha[d + 2] = r; - alpha[d + 3] = 255; - any_alpha = true; - } else if b != 0 || g != 0 || r != 0 { - // inverting pixel → white opaque; the inversion blend turns this into 1-dest - xor[d] = 255; - xor[d + 1] = 255; - xor[d + 2] = 255; - xor[d + 3] = 255; - any_xor = true; - } - } - } - Some(CursorShape { - w: w as u32, - h: h as u32, - alpha: any_alpha.then_some(alpha), - xor: any_xor.then_some(xor), - }) - } else { - // Monochrome: top half = AND mask, bottom half = XOR mask, 1 bpp. Per-pixel (AND,XOR): - // (0,0) opaque black → ALPHA layer - // (0,1) opaque white → ALPHA layer - // (1,0) transparent → neither layer - // (1,1) invert the screen → XOR layer (white opaque) — was previously approximated as - // solid black, which is the bug this split fixes. - let h = (si.Height / 2) as usize; - if buf.len() < pitch * h * 2 { - return None; - } - let bit = |row: usize, x: usize| (buf[row * pitch + x / 8] >> (7 - (x % 8))) & 1; - let mut alpha = vec![0u8; w * h * 4]; - let mut xor = vec![0u8; w * h * 4]; - let (mut any_alpha, mut any_xor) = (false, false); - for y in 0..h { - for x in 0..w { - let and_bit = bit(y, x); - let xor_bit = bit(y + h, x); - let d = (y * w + x) * 4; - match (and_bit, xor_bit) { - (0, 0) => { - // opaque black: BGR already 0, just mark opaque - alpha[d + 3] = 255; - any_alpha = true; - } - (0, 1) => { - alpha[d] = 255; - alpha[d + 1] = 255; - alpha[d + 2] = 255; - alpha[d + 3] = 255; - any_alpha = true; - } - (1, 0) => {} // transparent - _ => { - // (1,1) invert screen → white opaque into the XOR layer - xor[d] = 255; - xor[d + 1] = 255; - xor[d + 2] = 255; - xor[d + 3] = 255; - any_xor = true; - } - } - } - } - Some(CursorShape { - w: w as u32, - h: h as u32, - alpha: any_alpha.then_some(alpha), - xor: any_xor.then_some(xor), - }) - } -} - -/// CPU src-over alpha blend of a BGRA cursor into a BGRA frame buffer (software-encode path). When -/// `invert` is set (masked-color / XOR cursor), a covered pixel inverts the frame instead (true XOR). -#[allow(clippy::too_many_arguments)] -fn blend_cursor_cpu( - frame: &mut [u8], - fw: u32, - fh: u32, - cur: &[u8], - cw: u32, - ch: u32, - cx: i32, - cy: i32, - invert: bool, -) { - let (fw, fh, cw, ch) = (fw as i32, fh as i32, cw as i32, ch as i32); - for y in 0..ch { - let fy = cy + y; - if fy < 0 || fy >= fh { - continue; - } - for x in 0..cw { - let fx = cx + x; - if fx < 0 || fx >= fw { - continue; - } - let s = ((y * cw + x) * 4) as usize; - let a = cur[s + 3] as u32; - if a == 0 { - continue; - } - let d = ((fy * fw + fx) * 4) as usize; - if invert { - for k in 0..3 { - frame[d + k] = 255 - frame[d + k]; - } - } else { - for k in 0..3 { - frame[d + k] = - ((cur[s + k] as u32 * a + frame[d + k] as u32 * (255 - a)) / 255) as u8; - } - } - } - } -} - -pub struct DuplCapturer { - device: ID3D11Device, - context: ID3D11DeviceContext, - output: IDXGIOutput1, - /// The output duplication. `Option` so recovery can RELEASE it (set `None`) BEFORE re-duplicating: - /// DXGI permits only ONE `IDXGIOutputDuplication` per output, and a stale one (incl. an ACCESS_LOST - /// one) keeps holding the output, so a re-`DuplicateOutput1` returns E_ACCESSDENIED and legacy - /// `DuplicateOutput` returns a BORN-LOST dup — the storm. Apollo releases before re-duplicating; so - /// do we now. `None` only transiently during recovery (acquire routes None → recovery). - dupl: Option, - /// The output's GDI name — re-resolved on ACCESS_LOST (a mode change can stale the cached handle). - gdi_name: String, - /// Stable SudoVDA target id, used to re-resolve `gdi_name` during recovery. - target_id: u32, - width: u32, - height: u32, - refresh_hz: u32, - staging: Option, - holding_frame: bool, - active: AtomicBool, - timeout_ms: u32, - /// The first AcquireNextFrame after a (re)DuplicateOutput gets a generous timeout — the initial - /// desktop snapshot of a large surface can take longer than the per-frame budget. - first_frame: bool, - dbg_timeouts: u32, - dbg_lost: u32, - dbg_black_seeds: u32, - last: Option>, - /// GPU-output mode (zero-copy → NVENC): produce `FramePayload::D3d11` instead of CPU BGRA. - /// Selected by `PUNKTFUNK_ENCODER=nvenc` so the capturer's output matches the encoder's input. - gpu_mode: bool, - /// Reused owned texture the duplication frame is copied into for the D3D11 path (the duplication - /// surface is transient and released each frame). - gpu_copy: Option, - /// The most recently produced presentable GPU texture + its pixel format, repeated by - /// `next_frame` when AcquireNextFrame reports no change (static desktop) or during a rebuild. - /// Format-tagged because the SDR path presents BGRA `gpu_copy` while the HDR path presents the - /// 10-bit `hdr10_out` — the encoder needs the right format on every frame. - last_present: Option<(ID3D11Texture2D, PixelFormat)>, - /// Whether this capturer should request an HDR (FP16) duplication — `DuplicateOutput1` with FP16 - /// first, retried (legacy DuplicateOutput can't capture HDR). Set for the secure-desktop DDA leg - /// when the SudoVDA is in HDR; threaded into every (re)duplication incl. ACCESS_LOST recovery. - want_hdr: bool, - /// Full-chroma 4:4:4 session: deliver packed RGB (`Bgra` SDR / `Rgb10a2` HDR) and SKIP the - /// video-engine RGB→YUV (NV12/P010) conversion — NVENC reconstructs 4:4:4 only from a full-chroma - /// source, so we hand it the RGB texture and it CSCs to YUV444 at encode (chroma_format_idc=3). - chroma_444: bool, - /// HDR (scRGB FP16) capture state. Set when the duplication surface is `R16G16B16A16_FLOAT` - /// (the desktop has HDR on). The frame can't be `CopyResource`d into a BGRA target, so the HDR - /// path copies it into an FP16 SRV texture, composites the cursor, then runs [`HdrConverter`] to - /// produce a BT.2020 PQ 10-bit (`R10G10B10A2`) frame for NVENC. Toggling HDR fires ACCESS_LOST → - /// `recreate_dupl` re-detects the format, so this tracks the *current* duplication. - hdr_fp16: bool, - /// The source display's static HDR mastering metadata (ST.2086 + content light level), read from - /// `IDXGIOutput6::GetDesc1` whenever the duplication is HDR (`hdr_fp16`). The stream loop forwards - /// it to the encoder (in-band SEI) and the client (0xCE). `None` when SDR or the read failed. - hdr_meta: Option, - /// FP16 copy of the duplication surface (RT|SRV): the cursor composites onto it and the converter - /// samples it. Reallocated on device/size change. - fp16_src: Option, - fp16_srv: Option, - /// 10-bit `R10G10B10A2` PQ output of the HDR conversion — the texture handed to NVENC. - hdr10_out: Option, - /// scRGB→PQ conversion pass; rebuilt on device recreate. - hdr_conv: Option, - /// Video-processor RGB→YUV converter (runs on the VIDEO engine, not the 3D engine) + its NV12 - /// (SDR) / P010 (HDR) output texture. This is the zero-3D path: the per-frame colour conversion and - /// NVENC's RGB→YUV both move off the 3D engine so capture+encode don't fight a GPU-saturating game. - /// Lazily built for the current size+HDR; rebuilt on change. `None`/error → falls back to the - /// legacy RGB path. Disabled with `PUNKTFUNK_NO_VIDEO_PROCESSOR=1`. - video_conv: Option, - yuv_out: Option, - /// HDR-ness the current `video_conv`/`yuv_out` were built for, so an HDR toggle rebuilds them. - yuv_is_hdr: bool, - /// Latched off after a VideoConverter failure so we don't retry it every frame (fall back to RGB). - vp_disabled: bool, - /// Last time a duplication rebuild was attempted, to throttle retries during an outage (e.g. a - /// secure-desktop dwell where the output is gone) so we don't block the encode loop or hammer - /// DuplicateOutput — between attempts the last good frame is repeated. `None` = never attempted. - last_rebuild: Option, - /// Throttle for ALL ACCESS_LOST recovery attempts (cheap re-duplicate + full rebuild). A - /// constantly-invalidated duplication (HDR overlay/MPO churn) would otherwise spin recovery and - /// starve the encode thread; cap attempts to ~one per 5 ms and repeat the last frame between them. - last_recover: Option, - /// True once at least one real frame has been produced. After that, a frame drought (e.g. a long - /// secure-desktop dwell with nothing rendering to the virtual output) must never fatally end the - /// session — `next_frame` keeps repeating the last/seeded frame instead of erroring on its - /// deadline. The deadline stays fatal only *before* the first frame (a genuine startup misconfig). - ever_got_frame: bool, - /// Consecutive rebuilds that produced a BORN-LOST duplication (created OK, but its first - /// AcquireNextFrame instantly returned ACCESS_LOST). On the NORMAL desktop this is the hybrid - /// reparent/flip storm — once it persists, `acquire` returns Err so the punktfunk1 loop cold-rebuilds the - /// whole pipeline (new device/output) instead of spinning on a dead dup forever (the bug where the - /// stream froze on the last frame). Reset to 0 by any real frame. NOT armed on the secure - /// (Winlogon) desktop, where a long static dwell is legitimate and must never end the session. - consecutive_born_lost: u32, - /// GPU cursor overlay (rebuilt on device recreate). `None` until the first composite. - cursor: Option, - /// Last cursor shape, decomposed into alpha + XOR layers (kept device-independent so it survives - /// a device recreate). - cursor_shape: Option, - cursor_pos: (i32, i32), - cursor_visible: bool, - /// Cursor shape changed → re-upload to the GPU texture(s) before the next composite. - cursor_dirty: bool, - dbg_cursor: u64, - _keepalive: Box, -} -// SAFETY: `DuplCapturer` holds D3D11 device/context/duplication COM pointers plus plain data. The -// device is created free-threaded (`make_device` sets no `D3D11_CREATE_DEVICE_SINGLETHREADED`) and -// COM reference counting is interlocked, so moving ownership of the whole capturer to another thread -// is sound. It is used by exactly one thread (the encode thread) at a time — moved to it once, never -// shared (no `Sync`) — so the single-threaded immediate context is never touched concurrently. -unsafe impl Send for DuplCapturer {} - -impl DuplCapturer { - pub fn open( - target: WinCaptureTarget, - preferred: Option<(u32, u32, u32)>, - keepalive: Box, - // Whether the (already-resolved) encode backend wants GPU-resident frames — passed IN (Goal-1 - // stage 5) so the capturer never re-derives the encode backend itself. - gpu: bool, - want_hdr: bool, - // 4:4:4 session → deliver RGB, skip the NV12/P010 video-engine conversion (see the field doc). - chroma_444: bool, - ) -> Result { - // SAFETY: runs on the capture thread that will own this `DuplCapturer`. `install_gpu_pref_hook()` - // and the DPI-context calls take by-value handles / no args and touch only thread/process state; - // `SetThreadExecutionState` takes a flags bitmask by value. `CreateDXGIFactory1` yields a live - // `IDXGIFactory1`, and every subsequent COM method (`EnumAdapters1`/`EnumOutputs`/`GetDesc1`/ - // `GetDesc`/`cast`) is called on that factory or on an adapter/output it returned — each obtained - // through a checked `while let Ok(..)`/`?` — all from this one thread. No raw pointers are - // dereferenced; the borrowed strings/locals outlive each synchronous call. - unsafe { - // Stop DXGI hybrid-GPU output reparenting BEFORE we create the factory / enumerate outputs - // (the cause of the 0x887A0026 ACCESS_LOST churn on this hybrid box: RTX 4090 + AMD iGPU). - install_gpu_pref_hook(); - // Force PER-MONITOR-AWARE-V2 on THIS (capture) thread. IDXGIOutput5::DuplicateOutput1 - // REQUIRES V2 — without it the call returns E_ACCESSDENIED forever (the 4370x failures - // measured live), forcing the legacy DuplicateOutput fallback which yields a BORN-LOST - // duplication on this box → the ACCESS_LOST storm. SetProcessDpiAwarenessContext failed at - // startup ("already set" — a manifest/runtime locked the process to a LOWER awareness, and - // GetAwarenessFromDpiAwarenessContext can't tell V1 from V2: it reports 2 for both). The - // per-THREAD override works regardless of the process default, so DuplicateOutput1 can - // succeed (the working dup Apollo gets). Must run on the capture thread before any DXGI use. - { - use windows::Win32::UI::HiDpi::{ - AreDpiAwarenessContextsEqual, GetThreadDpiAwarenessContext, - SetThreadDpiAwarenessContext, DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2, - }; - let prev = SetThreadDpiAwarenessContext(DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2); - let is_v2 = AreDpiAwarenessContextsEqual( - GetThreadDpiAwarenessContext(), - DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2, - ) - .as_bool(); - tracing::info!( - set_ok = !prev.0.is_null(), - thread_is_v2 = is_v2, - "capture thread DPI awareness -> PER_MONITOR_AWARE_V2 (required for DuplicateOutput1)" - ); - } - // Keep the IDD (SudoVDA) virtual display awake for the capture lifetime: an idle indirect - // display can be power-gated, which invalidates the duplication (a contributor to the - // "freezes randomly while streaming" loss). Restored to ES_CONTINUOUS on Drop. (Apollo does - // this too.) Must run on the capture thread (this one owns the capturer). - SetThreadExecutionState(ES_CONTINUOUS | ES_DISPLAY_REQUIRED | ES_SYSTEM_REQUIRED); - let factory: IDXGIFactory1 = CreateDXGIFactory1().context("CreateDXGIFactory1")?; - // 1) Find the output (monitor) whose GDI DeviceName matches, across ALL adapters. On a - // real-GPU box the SudoVDA virtual monitor's DXGI output is enumerated under the GPU that - // *renders* it (the discrete/integrated GPU), NOT under the SudoVDA "adapter" LUID that - // SudoVDA reports — so we can't restrict the search to `target.adapter_luid`. The output - // also appears a beat after the display is created, so settle-retry for up to ~2 s. - // `target.adapter_luid` is kept only as a tie-break preference (matched adapter first). - let _ = target.adapter_luid; - let deadline = Instant::now() + Duration::from_millis(2000); - let (adapter, output): (IDXGIAdapter1, IDXGIOutput1) = loop { - let mut hit = None; - let mut i = 0u32; - while let Ok(a) = factory.EnumAdapters1(i) { - let ad = a.GetDesc1()?; - let aname = String::from_utf16_lossy(&ad.Description); - let aname = aname.trim_end_matches('\u{0}'); - let mut j = 0u32; - while let Ok(o) = a.EnumOutputs(j) { - let od = o.GetDesc()?; - let oname = String::from_utf16_lossy(&od.DeviceName); - let oname = oname.trim_end_matches('\u{0}').to_string(); - tracing::debug!( - adapter = aname, - luid = format!("{:#x}", pack_luid(ad.AdapterLuid)), - output = oname, - want = target.gdi_name, - "DXGI output seen" - ); - if gdi_name_matches(&od.DeviceName, &target.gdi_name) { - tracing::info!( - adapter = aname, - luid = format!("{:#x}", pack_luid(ad.AdapterLuid)), - output = oname, - "capturing the SudoVDA output on this adapter" - ); - hit = Some((a.clone(), o.cast::()?)); - break; - } - j += 1; - } - if hit.is_some() { - break; - } - i += 1; - } - if let Some(h) = hit { - break h; - } - if Instant::now() >= deadline { - let mut topo = Vec::new(); - let mut i = 0u32; - while let Ok(a) = factory.EnumAdapters1(i) { - let ad = a.GetDesc1()?; - let an = String::from_utf16_lossy(&ad.Description); - let mut outs = Vec::new(); - let mut j = 0u32; - while let Ok(o) = a.EnumOutputs(j) { - let od = o.GetDesc()?; - outs.push( - String::from_utf16_lossy(&od.DeviceName) - .trim_end_matches('\u{0}') - .to_string(), - ); - j += 1; - } - topo.push(format!( - "{} [{:#x}]: {:?}", - an.trim_end_matches('\u{0}'), - pack_luid(ad.AdapterLuid), - outs - )); - i += 1; - } - bail!( - "no DXGI adapter exposes output {} (topology: {})", - target.gdi_name, - topo.join(" | ") - ); - } - std::thread::sleep(Duration::from_millis(100)); - }; - // 2) D3D11 device ON the adapter that exposes the output (driver_type MUST be UNKNOWN with - // an explicit adapter). NVENC binds to this same device for zero-copy encode. - let mut device: Option = None; - let mut context: Option = None; - D3D11CreateDevice( - &adapter, - D3D_DRIVER_TYPE_UNKNOWN, - HMODULE::default(), - D3D11_CREATE_DEVICE_BGRA_SUPPORT, - Some(&[D3D_FEATURE_LEVEL_11_0]), - D3D11_SDK_VERSION, - Some(&mut device), - None, - Some(&mut context), - ) - .context("D3D11CreateDevice")?; - let device = device.context("null D3D11 device")?; - let context = context.context("null D3D11 context")?; - // 3) duplicate the output. Attach to the current input desktop first (as SYSTEM this can - // be the Winlogon secure desktop) so a session that starts at the lock/login screen works. - // The virtual display is kept the sole desktop via the CCD isolation the pf-vdisplay backend - // applies at monitor creation (registry-persisted), so the secure desktop has nowhere to render - // but the output we capture — no per-open re-isolation needed. - attach_input_desktop(); - let dupl = duplicate_output(&output, &device, want_hdr) - .context("DuplicateOutput (already duplicated by another app?)")?; - // Did DXGI actually call our win32u GPU-pref hook during factory/device/dupl creation? hits==0 - // here means the hook is NOT on DXGI's reparenting path on this build → reparenting can't be - // the churn cause (look at independent-flip/composition instead). Diagnostic only. - tracing::debug!( - hook_hits = hybrid_hook_hits(), - "win32u GPU-pref hook call count after open" - ); - // Kick the first frame loose: a blank virtual display is otherwise change-less. - nudge_cursor_onto(&output); - let dd: DXGI_OUTDUPL_DESC = dupl.GetDesc(); - let (width, height) = (dd.ModeDesc.Width, dd.ModeDesc.Height); - let refresh_hz = preferred - .map(|(_, _, hz)| hz) - .filter(|&hz| hz > 0) - .unwrap_or_else(|| { - let r = dd.ModeDesc.RefreshRate; - r.Numerator - .checked_div(r.Denominator) - .map_or(60, |hz| hz.max(1)) - }); - let timeout_ms = std::env::var("PUNKTFUNK_CAPTURE_TIMEOUT_MS") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or((2000 / refresh_hz.max(1)).max(100)); - // Produce GPU-resident D3D11 frames (zero-copy NVENC, or the NV12/P010 the AMF/QSV backends - // read back / import) whenever the encode backend is a GPU one — so the capturer's output - // format matches the encoder's input. Only the software (GPU-less) path takes CPU staging. - // The decision is resolved ONCE per session and passed in (Goal-1 stage 5), instead of this - // capturer re-calling `encode::windows_resolved_backend()` — the back-reference that let - // capture and encode disagree (plan §2.3/§5). - let gpu_mode = gpu; - // Read the source display's HDR mastering metadata while we still hold `output` (it is - // moved into the struct below). Only meaningful for an HDR (FP16) duplication. - let is_hdr_init = dd.ModeDesc.Format == DXGI_FORMAT_R16G16B16A16_FLOAT; - let hdr_meta_init = if is_hdr_init { - read_output_hdr_meta(&output) - } else { - None - }; - tracing::info!( - "DXGI duplication: {}x{}@{} on {} ({}) dxgi_format={} (87=BGRA8 24=R10G10B10A2 10=R16G16B16A16_FLOAT)", - width, - height, - refresh_hz, - target.gdi_name, - if gpu_mode { - "D3D11 zero-copy" - } else { - "CPU staging" - }, - dd.ModeDesc.Format.0, - ); - Ok(Self { - device, - context, - output, - dupl: Some(dupl), - target_id: target.target_id, - gdi_name: target.gdi_name, - width, - height, - refresh_hz, - staging: None, - holding_frame: false, - active: AtomicBool::new(false), - timeout_ms, - first_frame: true, - dbg_timeouts: 0, - dbg_lost: 0, - dbg_black_seeds: 0, - last: None, - gpu_mode, - gpu_copy: None, - last_present: None, - want_hdr, - chroma_444, - hdr_fp16: is_hdr_init, - hdr_meta: hdr_meta_init, - fp16_src: None, - fp16_srv: None, - hdr10_out: None, - hdr_conv: None, - video_conv: None, - yuv_out: None, - yuv_is_hdr: false, - vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(), - last_rebuild: None, - last_recover: None, - ever_got_frame: false, - consecutive_born_lost: 0, - cursor: None, - cursor_shape: None, - cursor_pos: (0, 0), - cursor_visible: false, - cursor_dirty: false, - dbg_cursor: 0, - _keepalive: keepalive, - }) - } - } - - unsafe fn ensure_staging(&mut self) -> Result<()> { - if self.staging.is_some() { - return Ok(()); - } - let desc = D3D11_TEXTURE2D_DESC { - Width: self.width, - Height: self.height, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_B8G8R8A8_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_STAGING, - BindFlags: D3D11_BIND_FLAG(0).0 as u32, - CPUAccessFlags: D3D11_CPU_ACCESS_READ.0 as u32, - MiscFlags: 0, - }; - let mut t: Option = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(staging)")?; - self.staging = t; - Ok(()) - } - - unsafe fn ensure_gpu_copy(&mut self) -> Result<()> { - if self.gpu_copy.is_some() { - return Ok(()); - } - let desc = D3D11_TEXTURE2D_DESC { - Width: self.width, - Height: self.height, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_B8G8R8A8_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - let mut t: Option = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(gpu copy)")?; - self.gpu_copy = t; - Ok(()) - } - - /// Convert `input` (BGRA for SDR, scRGB FP16 for HDR) to NVENC's native YUV (NV12 / P010) via the - /// D3D11 **video processor** (video engine) — keeping the per-frame colour conversion AND NVENC's - /// RGB→YUV off the 3D engine so capture+encode don't fight a GPU-saturating game. Returns the YUV - /// texture, or `None` to fall back to the legacy RGB path (processor disabled/unavailable). Lazily - /// builds + caches the processor + output texture for the current size + HDR-ness. - unsafe fn convert_to_yuv( - &mut self, - input: &ID3D11Texture2D, - hdr: bool, - ) -> Option { - if self.vp_disabled { - return None; - } - if self.video_conv.is_none() || self.yuv_out.is_none() || self.yuv_is_hdr != hdr { - self.video_conv = None; - self.yuv_out = None; - let vc = match VideoConverter::new( - &self.device, - &self.context, - self.width, - self.height, - hdr, - ) { - Ok(vc) => vc, - Err(e) => { - tracing::warn!(error = %format!("{e:#}"), - "video processor unavailable — falling back to RGB encode path"); - self.vp_disabled = true; - return None; - } - }; - let fmt = if hdr { - windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010 - } else { - windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12 - }; - let desc = D3D11_TEXTURE2D_DESC { - Width: self.width, - Height: self.height, - MipLevels: 1, - ArraySize: 1, - Format: fmt, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - let mut t: Option = None; - if let Err(e) = self.device.CreateTexture2D(&desc, None, Some(&mut t)) { - tracing::warn!(error = %format!("{e:?}"), - "CreateTexture2D(YUV out) failed — falling back to RGB encode path"); - self.vp_disabled = true; - return None; - } - self.video_conv = Some(vc); - self.yuv_out = t; - self.yuv_is_hdr = hdr; - tracing::info!( - hdr, - "video-processor YUV path active ({} on the video engine, 0% 3D)", - if hdr { "P010" } else { "NV12" } - ); - } - let out = self.yuv_out.clone()?; - if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) { - tracing::warn!(error = %format!("{e:#}"), - "VideoProcessorBlt failed — falling back to RGB encode path"); - self.vp_disabled = true; - self.video_conv = None; - self.yuv_out = None; - return None; - } - Some(out) - } - - /// FP16 (`R16G16B16A16_FLOAT`) copy of the HDR duplication surface (RT for the cursor composite + - /// SRV for the converter). Reallocated when absent (device/size change drops it). - unsafe fn ensure_fp16_src(&mut self) -> Result<()> { - if self.fp16_src.is_some() { - return Ok(()); - } - let desc = D3D11_TEXTURE2D_DESC { - Width: self.width, - Height: self.height, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_R16G16B16A16_FLOAT, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - let mut t: Option = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(fp16 src)")?; - let t = t.context("fp16 src tex")?; - let mut srv = None; - self.device - .CreateShaderResourceView(&t, None, Some(&mut srv))?; - self.fp16_srv = Some(srv.context("fp16 srv")?); - self.fp16_src = Some(t); - Ok(()) - } - - /// 10-bit `R10G10B10A2_UNORM` PQ output of the HDR conversion — the texture NVENC encodes. - unsafe fn ensure_hdr10_out(&mut self) -> Result<()> { - if self.hdr10_out.is_some() { - return Ok(()); - } - let desc = D3D11_TEXTURE2D_DESC { - Width: self.width, - Height: self.height, - MipLevels: 1, - ArraySize: 1, - Format: DXGI_FORMAT_R10G10B10A2_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32, - CPUAccessFlags: 0, - MiscFlags: 0, - }; - let mut t: Option = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(hdr10 out)")?; - self.hdr10_out = t; - Ok(()) - } - - /// Allocate a presentable GPU texture on the *current* device, clear it to black, and record it - /// as `last_present`. Called after a desktop-switch recovery so `next_frame` always has a D3D11 - /// frame to repeat even while the (secure) desktop renders nothing to the virtual output — this - /// is what keeps the session alive across a lock/login/UAC transition instead of dropping it. In - /// HDR mode it seeds the 10-bit output (black = PQ 0); otherwise the BGRA copy. One-shot: the next - /// real frame overwrites the texture in place. - unsafe fn seed_black_gpu_frame(&mut self) -> Result<()> { - // Instrumentation: a BLACK seed means we have no real desktop frame to show — if the client - // streams black, this is why. On the secure (Winlogon) desktop this fires when the duplication - // came back born-lost / idle. Counted + logged (throttled) so a real-lock repro shows the mode. - self.dbg_black_seeds += 1; - if self.dbg_black_seeds % 32 == 1 { - tracing::warn!( - black_seeds = self.dbg_black_seeds, - "DDA: seeding BLACK frame — no real desktop frame available (secure desktop idle/born-lost?)" - ); - } - if self.hdr_fp16 { - self.ensure_hdr10_out()?; - let out = self.hdr10_out.clone().context("hdr10 out texture")?; - let mut rtv: Option = None; - self.device - .CreateRenderTargetView(&out, None, Some(&mut rtv))?; - self.context - .ClearRenderTargetView(&rtv.context("null RTV (hdr seed)")?, &[0.0, 0.0, 0.0, 1.0]); - self.last_present = Some((out, PixelFormat::Rgb10a2)); - } else { - self.ensure_gpu_copy()?; - let gpu = self.gpu_copy.clone().context("gpu copy texture")?; - let mut rtv: Option = None; - self.device - .CreateRenderTargetView(&gpu, None, Some(&mut rtv))?; - self.context - .ClearRenderTargetView(&rtv.context("null RTV (sdr seed)")?, &[0.0, 0.0, 0.0, 1.0]); - self.last_present = Some((gpu, PixelFormat::Bgra)); - } - Ok(()) - } - - /// Pull cursor position/visibility/shape out of the frame info (the HW cursor is NOT in the frame). - unsafe fn update_cursor(&mut self, info: &DXGI_OUTDUPL_FRAME_INFO) { - if info.LastMouseUpdateTime != 0 { - self.cursor_pos = ( - info.PointerPosition.Position.x, - info.PointerPosition.Position.y, - ); - self.cursor_visible = info.PointerPosition.Visible.as_bool(); - } - if info.PointerShapeBufferSize > 0 { - let mut buf = vec![0u8; info.PointerShapeBufferSize as usize]; - let mut required = 0u32; - let mut si = DXGI_OUTDUPL_POINTER_SHAPE_INFO::default(); - if self.dupl.as_ref().is_some_and(|d| { - d.GetFramePointerShape( - info.PointerShapeBufferSize, - buf.as_mut_ptr() as *mut c_void, - &mut required, - &mut si, - ) - .is_ok() - }) { - if let Some(shape) = convert_pointer_shape(&buf, &si) { - tracing::info!( - shape_type = si.Type, - size = format!("{}x{}", shape.w, shape.h), - alpha = shape.alpha.is_some(), - xor = shape.xor.is_some(), - "cursor shape captured" - ); - self.cursor_shape = Some(shape); - self.cursor_dirty = true; - } - } - } - } - - /// Composite the cursor onto the GPU frame texture (zero-copy path). `hdr` = the target is the - /// linear scRGB FP16 surface (HDR path) — the cursor is then sRGB→linear decoded and scaled to - /// HDR graphics white (PUNKTFUNK_HDR_CURSOR_NITS, default 203, per BT.2408) so it isn't ~2.5× - /// too dim; SDR composites the raw cursor in the display's native sRGB space. - unsafe fn composite_cursor_gpu(&mut self, gpu: &ID3D11Texture2D, hdr: bool) -> Result<()> { - self.dbg_cursor += 1; - if self.dbg_cursor % 240 == 1 { - tracing::debug!( - visible = self.cursor_visible, - pos = format!("{:?}", self.cursor_pos), - shape = self - .cursor_shape - .as_ref() - .map(|s| format!("{}x{}", s.w, s.h)), - "cursor state" - ); - } - if !self.cursor_visible || self.cursor_shape.is_none() { - return Ok(()); - } - if self.cursor.is_none() { - self.cursor = Some(CursorCompositor::new(&self.device)?); - self.cursor_dirty = true; // fresh device → must (re)upload the shape texture - } - if self.cursor_dirty { - if let Some(shape) = &self.cursor_shape { - self.cursor - .as_mut() - .unwrap() - .set_shapes(&self.device, shape)?; - } - self.cursor_dirty = false; - } - let mut rtv: Option = None; - self.device - .CreateRenderTargetView(gpu, None, Some(&mut rtv))?; - let rtv = rtv.context("cursor rtv")?; - let (cx, cy) = self.cursor_pos; - // HDR graphics-white target in nits → scRGB multiplier (scRGB 1.0 = 80 nits). Default 203 - // (BT.2408); PUNKTFUNK_HDR_CURSOR_NITS overrides without a rebuild. SDR → 1.0, no decode. - let white_mul = if hdr { - let nits = std::env::var("PUNKTFUNK_HDR_CURSOR_NITS") - .ok() - .and_then(|s| s.parse::().ok()) - .filter(|n| n.is_finite() && *n > 0.0) - .unwrap_or(203.0); - nits / 80.0 - } else { - 1.0 - }; - let (w, h) = (self.width, self.height); - let comp = self.cursor.as_ref().unwrap(); - // Alpha-blended layer (normal cursor pixels); HDR brightness scale applies here. - if let Some((srv, cw, ch)) = &comp.tex_alpha { - comp.draw_layer( - &self.context, - &rtv, - w, - h, - cx, - cy, - srv, - *cw, - *ch, - false, - white_mul, - hdr, // decode sRGB→linear only on the HDR (linear FP16) target - ); - } - // Inversion layer (masked-color I-beam bar / monochrome invert): operates on the framebuffer - // reference, so it is never HDR-scaled or sRGB-decoded. - if let Some((srv, cw, ch)) = &comp.tex_xor { - comp.draw_layer( - &self.context, - &rtv, - w, - h, - cx, - cy, - srv, - *cw, - *ch, - true, - 1.0, - false, - ); - } - Ok(()) - } - - /// CHEAP recovery for the ACCESS_LOST *churn*: re-`DuplicateOutput` on the EXISTING device + - /// output. No new device/factory, so the encoder is NOT re-initialized and no black is seeded — - /// the existing `gpu_copy`/HDR textures/`last_present` are kept and frames resume immediately. This - /// is the right recovery for the HDR overlay-flip churn (the duplication is invalidated but the - /// output is still live). Returns false when the output can't be re-duplicated (desktop switch / - /// output gone) so the caller falls back to the full [`recreate_dupl`]. Probes the new duplication - /// (like recreate_dupl) so a born-lost one is rejected rather than adopted. - unsafe fn try_reduplicate(&mut self) -> bool { - if self.holding_frame { - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - self.holding_frame = false; - } - // RELEASE the old duplication FIRST (drop it → frees the output) before re-duplicating. DXGI - // allows one duplication per output; leaving the stale one alive is exactly why DuplicateOutput1 - // returned E_ACCESSDENIED and the legacy fallback produced a born-lost dup. - self.dupl = None; - let dupl = match duplicate_output(&self.output, &self.device, self.want_hdr) { - Ok(d) => d, - Err(_) => return false, - }; - // Adopt first (SAME device → existing gpu_copy/HDR textures/last_present stay valid), then probe - // + CAPTURE the frame: a born-lost duplication returns ACCESS_LOST immediately; alive-but-idle - // waits the full 16ms. On a real frame we present it (so a static desktop keeps a real - // last_present instead of the discarded one); idle keeps the existing last_present. - self.dupl = Some(dupl); - let mut info = DXGI_OUTDUPL_FRAME_INFO::default(); - let mut res: Option = None; - match self - .dupl - .as_ref() - .unwrap() - .AcquireNextFrame(16, &mut info, &mut res) - { - Ok(()) => { - self.update_cursor(&info); - if let Some(r) = res { - let _ = self.present_acquired(r); - } - } - Err(e) if e.code() == DXGI_ERROR_WAIT_TIMEOUT => {} - Err(_) => return false, // born-lost on the same output → need the full rebuild - } - true - } - - /// ONE rebuild attempt — deliberately non-blocking. ACCESS_LOST fires on desktop switches - /// (normal ↔ Winlogon secure: lock/login/UAC) and on the mode change we issue at create. We - /// re-attach to the now-current input desktop and recreate the D3D11 device + duplication on it - /// (a device made on the previous desktop can't sustain a duplication on the new one). CRUCIAL: - /// no internal multi-second retry loop — during a secure-desktop dwell the SudoVDA output is - /// *gone* (`no DXGI output named …`), and a blocking retry here would starve the encode/send - /// loop of frames for seconds, so the client times out and disconnects (the bug this fixes). - /// Instead a single attempt returns immediately; the caller ([`acquire`]) repeats the last good - /// frame and retries on a throttle, so the session survives an arbitrarily long secure visit. - unsafe fn recreate_dupl(&mut self) -> Result<()> { - if self.holding_frame { - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - self.holding_frame = false; - } - // The SudoVDA output's GDI name can CHANGE across a secure-desktop topology rebuild — - // re-resolve from the STABLE target id so we find it under its current name. - if let Some(n) = crate::win_display::resolve_gdi_name(self.target_id) { - self.gdi_name = n; - } - // Re-sync the capture thread to the CURRENT input desktop on EVERY rebuild — symmetric for - // ENTERING and LEAVING the secure (Winlogon) desktop. This is the fix for "UAC/lock appears - // fine but breaks the instant you click out of it": leaving secure used to skip this (it was - // gated on is_secure_desktop()), stranding the thread on the gone Winlogon desktop. Cheap + - // leak-free (attach_input_desktop closes its handle). Apollo (syncThreadDesktop) does the same. - // We do NOT re-isolate the display on recovery: the CCD isolation from create_monitor is - // registry-persisted, and a CCD topology mutation here would itself invalidate the freshly-rebuilt - // duplication → a self-feeding ACCESS_LOST storm (200 rebuilds/session observed before this). - attach_input_desktop(); - // RELEASE the old duplication FIRST (frees the output). reopen_duplication creates a NEW device - // and re-DuplicateOutputs the output; if the stale duplication is still alive it holds the output - // and the new one is born-lost / E_ACCESSDENIED. (On reopen failure self.dupl stays None and - // acquire's None-guard re-drives recovery.) - self.dupl = None; - let (dev, ctx, out, dupl) = reopen_duplication(&self.gdi_name, self.want_hdr)?; // Err → caller repeats + retries - - // (The born-lost guard is now the capture-acquire at the end: we adopt, then grab the current - // frame; ACCESS_LOST there means born-lost, and we seed black + let the throttled caller retry.) - // A desktop switch can come back at a different size (e.g. the user session applies its own - // resolution on login). Adopt it: update dimensions and drop the staging/gpu copies so they - // reallocate. NVENC re-inits at the new size when it sees the frame. - let dd: DXGI_OUTDUPL_DESC = dupl.GetDesc(); - let (nw, nh) = (dd.ModeDesc.Width, dd.ModeDesc.Height); - tracing::info!( - dxgi_format = dd.ModeDesc.Format.0, - "DXGI duplication rebuilt (format: 87=BGRA8 24=R10G10B10A2 10=R16G16B16A16_FLOAT)" - ); - if nw != self.width || nh != self.height { - tracing::info!( - old = format!("{}x{}", self.width, self.height), - new = format!("{nw}x{nh}"), - "DXGI duplication size changed across switch" - ); - self.width = nw; - self.height = nh; - self.staging = None; - } - self.device = dev; - self.context = ctx; - self.output = out; - self.dupl = Some(dupl); - self.gpu_copy = None; // stale: belonged to the old device - self.cursor = None; // shaders/textures belonged to the old device; rebuilt on demand - self.last_present = None; // belonged to the old device; reseeded below - // Re-detect HDR and drop the HDR textures/converter (old device). Toggling HDR on or - // off is exactly this path: the duplication comes back as FP16 (HDR) or BGRA8. - self.hdr_fp16 = dd.ModeDesc.Format == DXGI_FORMAT_R16G16B16A16_FLOAT; - // Re-read the source mastering metadata for the (possibly new) HDR output, or clear it on SDR. - self.hdr_meta = if self.hdr_fp16 { - read_output_hdr_meta(&self.output) - } else { - None - }; - self.fp16_src = None; - self.fp16_srv = None; - self.hdr10_out = None; - self.hdr_conv = None; - // Video processor + its YUV output belonged to the old device / size / HDR-ness — rebuild lazily. - self.video_conv = None; - self.yuv_out = None; - self.first_frame = true; - // Capture the CURRENT desktop frame as `last_present` (instead of seeding black). The secure - // (lock/login/UAC) desktop is STATIC, so DDA only emits a frame on change — if we seeded black - // we'd stream black until the user pressed a key (the reported bug). A freshly-created - // duplication's first AcquireNextFrame returns the full current desktop; grab it and present it, - // so the client shows the real (frozen-until-it-changes) secure desktop. Born-lost (ACCESS_LOST - // here) or no-initial-frame (timeout) → seed black as a fallback and let the throttled caller - // retry — a brief black flash during the unsettled switch, then real content. - nudge_cursor_onto(&self.output); // kick a change so a static desktop yields its first frame - let mut info = DXGI_OUTDUPL_FRAME_INFO::default(); - let mut res: Option = None; - let captured = match self - .dupl - .as_ref() - .unwrap() - .AcquireNextFrame(120, &mut info, &mut res) - { - Ok(()) => { - self.update_cursor(&info); - match res { - Some(r) => match self.present_acquired(r) { - Ok(_) => { - self.first_frame = false; - tracing::info!("DXGI recovery: captured real secure-desktop frame"); - true - } - Err(e) => { - tracing::warn!(error = %format!("{e:#}"), "recovery: present_acquired failed"); - false - } - }, - None => false, - } - } - Err(e) => { - tracing::warn!( - code = format!("{:#x}", e.code().0), - "DXGI recovery: no initial frame (born-lost/idle) — seeding black, will retry" - ); - false - } - }; - if !captured && self.gpu_mode { - if let Err(e) = self.seed_black_gpu_frame() { - tracing::warn!(error = %format!("{e:#}"), "seed black frame after recovery failed"); - } - } - // Track the born-lost storm: a rebuild that grabbed a real frame clears it; one that came back - // born-lost (created OK, first AcquireNextFrame == ACCESS_LOST) advances it. `acquire` uses this - // to escape to a full pipeline cold-rebuild on the normal desktop instead of spinning forever. - if captured { - self.consecutive_born_lost = 0; - } else { - self.consecutive_born_lost = self.consecutive_born_lost.saturating_add(1); - } - Ok(()) - } - - /// Acquire one frame: `Some` on a fresh image, `None` on timeout (no change → caller reuses last). - unsafe fn acquire(&mut self) -> Result> { - if self.holding_frame { - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - self.holding_frame = false; - } - let mut info = DXGI_OUTDUPL_FRAME_INFO::default(); - let mut res: Option = None; - let timeout = if self.first_frame { - 2000 - } else { - self.timeout_ms - }; - // If a prior recovery released the old duplication but couldn't create a new one yet (output - // gone during a secure dwell, etc.), self.dupl is None — synthesize ACCESS_LOST so we flow into - // the recovery path below instead of panicking. - let acq = match self.dupl.as_ref() { - Some(d) => d.AcquireNextFrame(timeout, &mut info, &mut res), - None => Err(windows::core::Error::from_hresult(DXGI_ERROR_ACCESS_LOST)), - }; - match acq { - Ok(()) => { - if self.first_frame { - tracing::info!(w = self.width, h = self.height, "DXGI first frame acquired"); - self.first_frame = false; - } - self.consecutive_born_lost = 0; // a real frame breaks the born-lost storm - self.update_cursor(&info); - } - Err(e) if e.code() == DXGI_ERROR_WAIT_TIMEOUT => { - self.dbg_timeouts += 1; - if self.dbg_timeouts % 40 == 1 { - // A static desktop produces no DDA frames, so timeouts are NORMAL idle, not an error. - tracing::debug!( - timeouts = self.dbg_timeouts, - first_frame = self.first_frame, - "DXGI AcquireNextFrame timeout (no desktop change yet)" - ); - } - return Ok(None); - } - // MODE_CHANGE_IN_PROGRESS (0x887A0025) is TRANSIENT by design ("the call may succeed at a - // later attempt") — the display topology is mid-settle (e.g. just after the IDD's mode is - // applied). Do NOT recover/rebuild: a rebuild re-issues create()→set_active_mode, re-touching - // the topology and PERPETUATING the change (the storm we measured). Just repeat the last frame - // and wait it out, like a timeout. Throttled log so a genuinely stuck change stays visible. - Err(e) if e.code() == DXGI_ERROR_MODE_CHANGE_IN_PROGRESS => { - self.dbg_timeouts += 1; - if self.dbg_timeouts % 120 == 1 { - tracing::warn!( - "DXGI mode change in progress (0x887A0025) — waiting for topology to settle" - ); - } - return Ok(None); - } - // Recoverable losses, ALL handled by rebuilding the duplication (device + re-DuplicateOutput): - // ACCESS_LOST — desktop switch (normal <-> Winlogon secure: lock/login/UAC) or mode change - // INVALID_CALL — the secure->user-desktop switch (post-login) leaves the duplication in a - // state where AcquireNextFrame returns 0x887A0001; recreating recovers it. - // Previously fatal -> the stream dropped the instant the user logged in. - // DEVICE_REMOVED/RESET — GPU TDR / driver reset. - Err(e) - if e.code() == DXGI_ERROR_ACCESS_LOST - || e.code() == DXGI_ERROR_INVALID_CALL - || e.code() == DXGI_ERROR_DEVICE_REMOVED - || e.code() == DXGI_ERROR_DEVICE_RESET => - { - self.dbg_lost += 1; - // TIERED recovery. The HDR path produces a constant ACCESS_LOST *churn*: the - // duplication keeps getting invalidated (overlay/MPO flips that HDR makes aggressive) - // but the OUTPUT stays valid — a probe passes, the dup lives briefly, dies, repeats. - // For that, the cheap fix is a fresh DuplicateOutput on the SAME device+output: no new - // device/factory → NO encoder re-init, NO black seed → frames stay near-continuous - // (this is what makes HDR animations smooth). Only a genuine output loss (secure-desktop - // switch, where DISPLAY10 is gone) or a dead device needs the full rebuild — and THAT - // is throttled so a long secure dwell doesn't hammer DuplicateOutput / starve the - // client (between attempts we repeat the last frame). - let device_dead = - e.code() == DXGI_ERROR_DEVICE_REMOVED || e.code() == DXGI_ERROR_DEVICE_RESET; - if self.dbg_lost % 64 == 1 { - tracing::warn!( - lost = self.dbg_lost, - code = format!("{:#x}", e.code().0), - "DXGI capture lost — recovering (cheap re-duplicate, full rebuild if output gone)" - ); - } - // GENTLE recovery. On the secure (Winlogon) desktop the duplication dies on EVERY - // independent-flip; a tight re-duplicate loop tears the duplication down + brings it up - // hundreds of times/sec — that release/recreate cycle is the real kernel stress (and it - // stalls the send thread long enough that the client times out → "display disconnected"). - // So instead of fighting it: cap recovery HARD and just repeat the last frame in between - // (no busy-spin, no per-flip teardown). The session stays alive across a secure dwell; the - // lock/UAC screen is frozen/laggy, then capture resumes cleanly when the desktop returns. - // Tunable: PUNKTFUNK_RECOVER_MS (cheap re-duplicate cadence, default 250) and - // PUNKTFUNK_REBUILD_MS (heavy new-device rebuild cadence, default 1500). - let recover_ms = std::env::var("PUNKTFUNK_RECOVER_MS") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(250u64); - let now = Instant::now(); - if self - .last_recover - .is_some_and(|t| now.duration_since(t) < Duration::from_millis(recover_ms)) - { - return Ok(None); // repeat the last frame; do NOT tear down/recreate yet - } - self.last_recover = Some(now); - if !device_dead && self.try_reduplicate() { - // Cheap recovery succeeded (same device, no teardown of the device/monitor). - self.first_frame = true; - return Ok(None); - } - // Heavy full rebuild (new device) — the costliest teardown/recreate, so throttle it the - // hardest. Only when the cheap re-duplicate keeps failing (genuine output/device loss). - let rebuild_ms = std::env::var("PUNKTFUNK_REBUILD_MS") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(1500u64); - let now = Instant::now(); - let due = self - .last_rebuild - .is_none_or(|t| now.duration_since(t) >= Duration::from_millis(rebuild_ms)); - if due { - self.last_rebuild = Some(now); - if self.recreate_dupl().is_ok() { - self.first_frame = true; - } - } - // Born-lost rebuilds (created OK, instant ACCESS_LOST) used to escalate to a full pipeline - // cold-rebuild here — but that re-issued vd.create()→set_active_mode (an audible PnP - // add/remove chime + a fresh topology mode change), which never converged and amplified - // the storm. With the topology fix (set_active_mode no longer promotes the IDD to PRIMARY - // by default) the born-lost storm is gone at its source; if one ever recurs, just keep - // repeating the last frame in-process — never tear the IDD down mid-session (Apollo never - // does). Throttled visibility only. - if self.consecutive_born_lost > 0 && self.consecutive_born_lost % 40 == 1 { - tracing::warn!( - consecutive = self.consecutive_born_lost, - "DDA born-lost rebuilds — repeating last frame in-process (no teardown)" - ); - } - return Ok(None); - } - Err(e) => return Err(e).context("AcquireNextFrame"), - } - let res = res.context("AcquireNextFrame: null resource")?; - // Detect a mode/format change on the hot path. The desktop can flip HDR<->SDR (FP16<->BGRA — - // e.g. the SudoVDA output dropping out of HDR for the secure desktop) or change resolution - // WITHOUT raising ACCESS_LOST; `hdr_fp16`/`width`/`height` would then be stale and - // `present_acquired` would CopyResource into a mismatched-format/size target — corruption, or - // the secure-desktop "works once, then HDR breaks" bug. Re-read the acquired texture's desc - // every frame (Apollo does this) and rebuild on a real change instead of presenting a - // mismatched frame. Throttled like the ACCESS_LOST path so a flapping toggle can't hammer - // DuplicateOutput. - if let Ok(tex) = res.cast::() { - let mut d = D3D11_TEXTURE2D_DESC::default(); - tex.GetDesc(&mut d); - // Only a real SIZE change is reliably detectable here. Format/HDR is NOT: legacy - // DuplicateOutput always hands back an 8-bit BGRA surface regardless of the output's FP16 - // scanout mode, so comparing the acquired-texture format against `hdr_fp16` (derived from - // the OUTDUPL ModeDesc) self-fires every frame → a rebuild storm. A genuine resolution - // change is caught here; a real HDR↔SDR toggle arrives as ACCESS_LOST → recreate_dupl - // re-detects it. (Genuine FP16 capture is a separate change: DuplicateOutput1.) - if d.Width != self.width || d.Height != self.height { - tracing::info!( - old = format!("{}x{}", self.width, self.height), - new = format!("{}x{}", d.Width, d.Height), - "DXGI capture size changed mid-stream — rebuilding" - ); - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - let now = Instant::now(); - let due = self - .last_rebuild - .is_none_or(|t| now.duration_since(t) >= Duration::from_millis(250)); - if due { - self.last_rebuild = Some(now); - if self.recreate_dupl().is_ok() { - self.first_frame = true; - } - } - return Ok(None); - } - } - Ok(Some(self.present_acquired(res)?)) - } - - /// Turn a freshly-acquired duplication resource into a `CapturedFrame` and record it as - /// `last_present`. Factored out of [`acquire`] so the recovery path ([`recreate_dupl`]) can grab - /// the CURRENT desktop frame instead of seeding black: the secure (lock/login/UAC) desktop is - /// static, so DDA emits no change-frame to replace a black seed — the cause of the black-screen- - /// until-you-press-a-key bug. The caller has already `AcquireNextFrame`d; this releases it. - unsafe fn present_acquired(&mut self, res: IDXGIResource) -> Result { - self.holding_frame = true; - let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?; - if self.gpu_mode && self.hdr_fp16 { - // HDR zero-copy path: the duplication surface is scRGB FP16 (R16G16B16A16_FLOAT) — it can't - // be CopyResource'd into a BGRA target (that was the freeze + cursor-trail bug). Copy it into - // an FP16 SRV texture (same format → valid), composite the cursor onto it (the cursor lands - // at ~SDR-white brightness, then goes through the PQ curve correctly), then convert scRGB → - // BT.2020 PQ 10-bit into hdr10_out and hand THAT to NVENC (HEVC Main10 / HDR10). - self.ensure_fp16_src()?; - let src = self.fp16_src.clone().context("fp16 src texture")?; - self.context.CopyResource(&src, &tex); - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - self.holding_frame = false; - self.composite_cursor_gpu(&src, true)?; // onto the FP16 surface (HDR: decode + nits scale) - // Video-engine path: scRGB FP16 → BT.2020 PQ P010 on the VIDEO engine (no 3D shader, and - // NVENC encodes P010 natively). Fall back to the HdrConverter pixel shader (3D) only if the - // video processor is unavailable. - if let Some(p010) = (!self.chroma_444) - .then(|| self.convert_to_yuv(&src, true)) - .flatten() - { - self.last_present = Some((p010.clone(), PixelFormat::P010)); - return Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: PixelFormat::P010, - payload: FramePayload::D3d11(D3d11Frame { - texture: p010, - device: self.device.clone(), - }), - }); - } - self.ensure_hdr10_out()?; - let out = self.hdr10_out.clone().context("hdr10 out texture")?; - if self.hdr_conv.is_none() { - self.hdr_conv = Some(HdrConverter::new(&self.device)?); - } - let srv = self.fp16_srv.clone().context("fp16 srv")?; - let mut rtv: Option = None; - self.device - .CreateRenderTargetView(&out, None, Some(&mut rtv))?; - let rtv = rtv.context("hdr10 rtv")?; - self.hdr_conv.as_ref().unwrap().convert( - &self.context, - &srv, - &rtv, - self.width, - self.height, - ); - self.last_present = Some((out.clone(), PixelFormat::Rgb10a2)); - return Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: PixelFormat::Rgb10a2, - payload: FramePayload::D3d11(D3d11Frame { - texture: out, - device: self.device.clone(), - }), - }); - } - if self.gpu_mode { - // Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication - // surface into a reused owned texture, release the duplication frame, hand off the texture. - // NOTE: do NOT convert the duplication surface directly on the video processor to skip this - // copy — the VP colour-convert (3D/compute on NVIDIA) holds the DDA surface until it - // completes, blocking ReleaseFrame/AcquireNextFrame and SERIALIZING capture+convert (~60 fps, - // encode_us 15-20 ms measured). The fast same-format CopyResource decouples them: it releases - // the DDA frame immediately so the convert runs independently (40-200 fps). Worth ~5% 3D. - self.ensure_gpu_copy()?; - let gpu = self.gpu_copy.clone().context("gpu copy texture")?; - self.context.CopyResource(&gpu, &tex); - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - self.holding_frame = false; - self.composite_cursor_gpu(&gpu, false)?; - // Prefer the video-engine YUV path (BGRA → NV12 on the video engine) so the colour - // conversion AND NVENC's encode stay OFF the 3D engine — the only way to keep up when a - // game pins the 3D engine at ~100%. Fall back to handing NVENC the BGRA texture (it then - // does RGB→YUV internally on the 3D/compute engine). - if let Some(nv12) = (!self.chroma_444) - .then(|| self.convert_to_yuv(&gpu, false)) - .flatten() - { - self.last_present = Some((nv12.clone(), PixelFormat::Nv12)); - return Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: PixelFormat::Nv12, - payload: FramePayload::D3d11(D3d11Frame { - texture: nv12, - device: self.device.clone(), - }), - }); - } - self.last_present = Some((gpu.clone(), PixelFormat::Bgra)); - return Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: PixelFormat::Bgra, - payload: FramePayload::D3d11(D3d11Frame { - texture: gpu, - device: self.device.clone(), - }), - }); - } - self.ensure_staging()?; - let staging = self.staging.clone().context("staging texture")?; - self.context.CopyResource(&staging, &tex); - let mut map = D3D11_MAPPED_SUBRESOURCE::default(); - self.context - .Map(&staging, 0, D3D11_MAP_READ, 0, Some(&mut map)) - .context("Map staging")?; - let (w, h) = (self.width as usize, self.height as usize); - let pitch = map.RowPitch as usize; - let src = std::slice::from_raw_parts(map.pData as *const u8, pitch * h); - let mut tight = depad_bgra(src, pitch, w, h); - self.context.Unmap(&staging, 0); - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - self.holding_frame = false; - if self.cursor_visible { - if let Some(shape) = &self.cursor_shape { - let (cx, cy) = self.cursor_pos; - if let Some(bgra) = &shape.alpha { - blend_cursor_cpu( - &mut tight, - self.width, - self.height, - bgra, - shape.w, - shape.h, - cx, - cy, - false, - ); - } - if let Some(bgra) = &shape.xor { - blend_cursor_cpu( - &mut tight, - self.width, - self.height, - bgra, - shape.w, - shape.h, - cx, - cy, - true, - ); - } - } - } - self.last = Some(tight.clone()); - Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: PixelFormat::Bgra, - payload: FramePayload::Cpu(tight), - }) - } -} - -fn now_ns() -> u64 { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_nanos() as u64) - .unwrap_or(0) -} - -impl Capturer for DuplCapturer { - fn hdr_meta(&self) -> Option { - // Only when the duplication is actually HDR (FP16); cleared to None on an SDR rebuild. - if self.hdr_fp16 { - self.hdr_meta - } else { - None - } - } - - fn next_frame(&mut self) -> Result { - // Generous: a secure-desktop switch can take several seconds to settle (re-resolve + recreate - // the duplication up to 12 s). Better a few seconds of frozen-last-frame than dropping the stream. - let mut deadline = Instant::now() + Duration::from_secs(20); - loop { - // SAFETY: `acquire` is an `unsafe fn` because it drives the D3D11 immediate context + the - // output duplication, which must be touched only from the capturer's owning thread. - // `next_frame` runs on that one thread — `DuplCapturer` is `Send` but not `Sync`, so it is - // owned by a single (encode) thread for its whole life — and `&mut self` gives exclusive - // access for the call, satisfying that contract. - if let Some(f) = unsafe { self.acquire() }? { - self.ever_got_frame = true; - return Ok(f); - } - if self.gpu_mode { - if let Some((tex, fmt)) = &self.last_present { - // Repeat the last presented GPU frame (SDR BGRA or HDR 10-bit), keeping the encoder - // on a matching format through a static desktop or a mid-rebuild gap. - return Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: *fmt, - payload: FramePayload::D3d11(D3d11Frame { - texture: tex.clone(), - device: self.device.clone(), - }), - }); - } - } - if let Some(b) = &self.last { - return Ok(CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format: PixelFormat::Bgra, - payload: FramePayload::Cpu(b.clone()), - }); - } - if Instant::now() > deadline { - // After we've streamed at least once, never fatally drop on a frame drought: a long - // secure-desktop dwell (or a slow rebuild) just means no NEW frame yet. Reset the - // deadline and keep repeating the last/seeded frame so the session stays alive. The - // deadline stays fatal only before the first frame — a genuine "monitor never lit up". - if self.ever_got_frame { - deadline = Instant::now() + Duration::from_secs(20); - continue; - } - return Err(anyhow!( - "no DXGI frame within 20s (SudoVDA monitor not activated by a WDDM GPU?)" - )); - } - } - } - - fn try_latest(&mut self) -> Result> { - // SAFETY: as in `next_frame` — `acquire` must run on the capturer's single owning thread, and - // `try_latest` is called on it (`DuplCapturer` is `Send`, not `Sync`); `&mut self` is exclusive. - unsafe { self.acquire() } - } - - fn set_active(&self, active: bool) { - self.active.store(active, Ordering::Relaxed); - } -} - -impl Drop for DuplCapturer { - fn drop(&mut self) { - if self.holding_frame { - // SAFETY: `self.dupl` is the live `IDXGIOutputDuplication` this capturer created and owns; - // `ReleaseFrame` is a valid COM method on it, called only when `holding_frame` records that a - // frame was acquired and not yet released (so it is not an unbalanced release). Drop runs on - // whichever thread owns the capturer — its sole owner, since it is `!Sync` — and the `&` - // borrow of the duplication outlives this synchronous call. - unsafe { - let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); - } - } - // Release the display/system-required execution state we took at open(). - // SAFETY: `SetThreadExecutionState` is a Win32 FFI call taking an execution-state flag bitmask - // by value (`ES_CONTINUOUS` clears the display/system-required state taken at open); it borrows - // no Rust memory and is safe to call from any thread. - unsafe { - SetThreadExecutionState(ES_CONTINUOUS); - } - // _keepalive drops after, REMOVEing the SudoVDA monitor. - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn pack_luid_roundtrip() { - let l = LUID { - LowPart: 0x1234_5678, - HighPart: 0x0000_0009, - }; - assert_eq!(pack_luid(l), (0x9i64 << 32) | 0x1234_5678); - } - - #[test] - fn gdi_name_match() { - let mut buf = [0u16; 32]; - for (i, c) in r"\\.\DISPLAY3".encode_utf16().enumerate() { - buf[i] = c; - } - assert!(gdi_name_matches(&buf, r"\\.\DISPLAY3")); - assert!(!gdi_name_matches(&buf, r"\\.\DISPLAY1")); - } - - #[test] - fn depad_removes_row_padding() { - // 2x2 BGRA, pitch = 12 (row=8 + 4 pad bytes). - let pitch = 12; - let mut src = vec![0u8; pitch * 2]; - for y in 0..2 { - for x in 0..8 { - src[y * pitch + x] = (y * 8 + x) as u8; - } - } - let out = depad_bgra(&src, pitch, 2, 2); - assert_eq!(out.len(), 16); - assert_eq!(&out[0..8], &[0, 1, 2, 3, 4, 5, 6, 7]); - assert_eq!(&out[8..16], &[8, 9, 10, 11, 12, 13, 14, 15]); - } -} diff --git a/crates/punktfunk-host/src/capture/windows/wgc.rs b/crates/punktfunk-host/src/capture/windows/wgc.rs deleted file mode 100644 index 65c5d97..0000000 --- a/crates/punktfunk-host/src/capture/windows/wgc.rs +++ /dev/null @@ -1,816 +0,0 @@ -//! Windows.Graphics.Capture (WGC) capture backend — the HDR/animation-correct path. -//! -//! Why WGC over DXGI Desktop Duplication: DDA duplicates only the DWM-composed primary surface, so -//! HDR desktop animations the OS routes onto hardware overlay / independent-flip / MPO planes (Start -//! menu, Win11 Mica/acrylic, window resize) never enter the surface DDA reads — the stream shows a -//! frozen desktop ("broken HDR animations"). Engaging WGC capture pulls that content back through DWM -//! composition, so the surface WGC hands back contains the animations. WGC also has no -//! ACCESS_LOST-on-overlay-flip churn. -//! -//! It reuses the rest of the pipeline UNCHANGED: the frame's GPU texture (the OS already composited -//! the cursor into it — `IsCursorCaptureEnabled(true)`) goes through the same scRGB→BT.2020-PQ shader -//! ([`super::dxgi::HdrConverter`]) into a host-owned `R10G10B10A2` texture (HDR) or is copied into a -//! BGRA texture (SDR), which is handed to NVENC zero-copy (registered by pointer, encoded in place). -//! Shares the D3D11 device with NVENC via `FramePayload::D3d11`. -//! -//! Limitation: WGC cannot capture the secure desktop (lock / UAC / login) — the caller falls back to -//! the DDA backend ([`super::dxgi::DuplCapturer`]) for those (see capture.rs). - -// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). -#![deny(clippy::undocumented_unsafe_blocks)] - -use super::dxgi::{ - find_output, hdr_shader_p010_enabled, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, - HdrP010Converter, VideoConverter, WinCaptureTarget, -}; -use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; -use anyhow::{bail, Context, Result}; -use std::collections::VecDeque; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::{Arc, Condvar, Mutex}; -use std::time::{Duration, Instant}; -use windows::core::{IInspectable, Interface}; -use windows::Foundation::{TimeSpan, TypedEventHandler}; -use windows::Graphics::Capture::{ - Direct3D11CaptureFrame, Direct3D11CaptureFramePool, GraphicsCaptureItem, GraphicsCaptureSession, -}; -use windows::Graphics::DirectX::DirectXPixelFormat; -use windows::Win32::Foundation::{CloseHandle, HANDLE}; -use windows::Win32::Graphics::Direct3D11::{ - ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView, - ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_TEXTURE2D_DESC, - D3D11_USAGE_DEFAULT, -}; -use windows::Win32::Graphics::Dxgi::Common::{ - DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_R10G10B10A2_UNORM, - DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, -}; -use windows::Win32::Graphics::Dxgi::{IDXGIDevice, IDXGIOutput6}; -use windows::Win32::Security::{ImpersonateLoggedOnUser, RevertToSelf}; -use windows::Win32::System::RemoteDesktop::{WTSGetActiveConsoleSessionId, WTSQueryUserToken}; -use windows::Win32::System::WinRT::Direct3D11::{ - CreateDirect3D11DeviceFromDXGIDevice, IDirect3DDxgiInterfaceAccess, -}; -use windows::Win32::System::WinRT::Graphics::Capture::IGraphicsCaptureItemInterop; -use windows::Win32::System::WinRT::{RoInitialize, RO_INIT_MULTITHREADED}; - -/// Output texture ring depth. The encode loop pipelines one frame deep (NVENC encodes frame N while -/// the capturer produces N+1), so two live textures suffice; three gives headroom against a slow -/// `lock_bitstream` and matches the WGC frame-pool depth. -// Sized for the deep encode pipeline (`PUNKTFUNK_ENCODE_DEPTH`, default 4, clamped ≤ 6): up to DEPTH -// frames are in flight in NVENC at once, so the HDR convert ring and the SDR held-frame set must each -// keep DEPTH(+headroom) live textures, and the WGC pool needs spare buffers beyond what we hold. -const OUT_RING: usize = 8; - -/// SDR zero-copy: how many recent WGC frames to keep alive so NVENC can encode the pool texture in -/// place (no `CopyResource`). Each in-flight encode reads a distinct frame, so this must exceed the -/// pipeline depth; the oldest is released once `HELD_FRAMES` newer ones exist. -const HELD_FRAMES: usize = 8; -/// WGC frame-pool buffer count. Must exceed `HELD_FRAMES` so the compositor always has free buffers -/// to render into while we hold frames for in-place (zero-copy) SDR encode. -const WGC_POOL_BUFFERS: i32 = 10; - -/// The host runs as SYSTEM (so the DDA secure-desktop path works), but WGC will NOT activate under -/// the SYSTEM account (`CreateForMonitor` → 0x80070424). Impersonate the interactive console user -/// for the WGC activation. Returns the user token (the caller reverts + closes it after activation) -/// or `None` (no active user, or the host already runs AS the user — WTSQueryUserToken then fails and -/// WGC works without impersonation). SYSTEM-only; harmless under a user-token host. -unsafe fn impersonate_active_user() -> Option { - let session = WTSGetActiveConsoleSessionId(); - if session == 0xFFFF_FFFF { - return None; - } - let mut token = HANDLE::default(); - if WTSQueryUserToken(session, &mut token).is_ok() { - if ImpersonateLoggedOnUser(token).is_ok() { - return Some(token); - } - let _ = CloseHandle(token); - } - None -} - -/// RAII: reverts the WGC-activation impersonation when it drops (covers every `?` early-return). -struct Deimpersonate(Option); -impl Drop for Deimpersonate { - fn drop(&mut self) { - if let Some(tok) = self.0.take() { - // SAFETY: `RevertToSelf` takes no arguments and undoes the thread impersonation set during - // WGC activation; `tok` is the impersonation token `HANDLE` from `impersonate_active_user`, - // owned by this `Deimpersonate` and closed exactly once here (taken out of the `Option`, so - // no double-close). Both are FFI calls borrowing no Rust memory. - unsafe { - let _ = RevertToSelf(); - let _ = CloseHandle(tok); - } - } - } -} - -/// Signal from the free-threaded FrameArrived callback to the encode thread: a monotonically -/// increasing count of arrived frames + a condvar to wake `next_frame`. The encode thread tracks how -/// many it has consumed; `TryGetNextFrame` is called exactly `available - consumed` times so we never -/// hit the empty-pool ambiguity, and draining to the newest keeps latency at one frame. -struct WgcSignal { - available: AtomicU64, - mtx: Mutex<()>, - cv: Condvar, -} - -pub struct WgcCapturer { - device: ID3D11Device, - context: ID3D11DeviceContext, - // WGC objects — kept alive for the session's lifetime. - pool: Direct3D11CaptureFramePool, - session: GraphicsCaptureSession, - _item: GraphicsCaptureItem, - _frame_arrived_token: i64, - signal: Arc, - consumed: u64, - - width: u32, - height: u32, - timeout_ms: u64, - first_frame: bool, - - hdr: bool, - /// The source display's static HDR mastering metadata (ST.2086 + content light level), read from - /// `IDXGIOutput6::GetDesc1` at open when the output is HDR. Forwarded to the encoder (in-band SEI) - /// and the client (0xCE) by the stream loop. `None` when SDR. (The helper relay path also encodes, - /// so this is what gives the secure/normal-desktop HDR stream its mastering SEI.) - hdr_meta: Option, - hdr_conv: Option, - fp16_src: Option, - fp16_srv: Option, - /// `PUNKTFUNK_HDR_SHADER_P010` path: emit P010 (BT.2020 PQ 10-bit limited range) DIRECTLY from our - /// own shader (`HdrP010Converter`) so NVENC takes native P010 and skips its SM-side RGB→YUV CSC. - /// Gated by [`hdr_shader_p010_enabled`] AND `self.hdr`; `None`/empty when off → the existing R10 + - /// VideoProcessor paths run unchanged. `p010_disabled` latches a runtime failure (e.g. a driver - /// that rejects the planar plane RTV) so we fall back to the R10 path and stop retrying. - hdr_p010_conv: Option, - p010_out: Vec, - p010_idx: usize, - p010_disabled: bool, - /// Ring of host-owned output textures (BGRA for SDR, R10G10B10A2 for HDR), rotated per processed - /// frame. A ring — not one texture — is required because the encode loop is PIPELINED: NVENC - /// encodes frame N (in place, registered by pointer) while this capturer produces frame N+1, so - /// N+1 must land in a DIFFERENT texture or it clobbers the in-flight encode. (`fp16_src` stays - /// single: it's only touched within the D3D11 immediate context, whose op ordering already - /// serializes the convert's read against the next copy's write — NVENC's async engine read is the - /// only consumer that escapes that ordering, and it reads the ring output, never `fp16_src`.) - out_ring: Vec, - ring_idx: usize, - /// Video-processor RGB→YUV converter (off the 3D engine where possible) + its NV12/P010 output - /// ring. Preferred path: the OS-composited capture (cursor already in it) is converted DIRECTLY to - /// NVENC's native YUV — no `CopyResource`, no cursor draw, and NVENC skips its internal RGB→YUV. - /// `None`/error → falls back to the legacy SDR-zero-copy / HDR-shader paths. - video_conv: Option, - yuv_out: Vec, - yuv_idx: usize, - yuv_is_hdr: bool, - vp_disabled: bool, - /// SDR zero-copy: the recent WGC frames we hand to NVENC in place. Held so the pool doesn't - /// recycle the texture mid-encode; the oldest is released once `HELD_FRAMES` newer ones exist. - held: VecDeque, - /// Last presentable GPU texture + format, repeated when no new frame arrived (static desktop). - last_present: Option<(ID3D11Texture2D, PixelFormat)>, - - /// Owns the SudoVDA keepalive once attached (after WGC is confirmed open) — dropping the capturer - /// then REMOVEs the virtual output. `None` between open and attach so a WGC-open failure leaves - /// the keepalive with the caller for the DDA fallback. - _keepalive: Option>, -} - -// SAFETY: like `DuplCapturer`. `WgcCapturer` holds D3D11 (free-threaded device/context) plus WGC WinRT -// objects (`Direct3D11CaptureFramePool` etc., created free-threaded via `CreateFreeThreaded`). COM/WinRT -// reference counting is interlocked, and the capturer is owned + used by exactly one encode thread, -// moved to it once and never shared (no `Sync`), so transferring ownership across threads is sound. The -// free-threaded `FrameArrived` callback touches only the `Arc` (itself `Send + Sync`), not -// the capturer's COM fields. -unsafe impl Send for WgcCapturer {} - -impl WgcCapturer { - /// Open WGC capture. Does NOT take the keepalive — the caller attaches it via - /// [`attach_keepalive`](Self::attach_keepalive) only after open succeeds, so a failure leaves the - /// keepalive with the caller to hand to the DDA fallback. - pub fn open(target: WinCaptureTarget, preferred: Option<(u32, u32, u32)>) -> Result { - // SAFETY: runs on the thread opening the WGC session. `RoInitialize` inits this thread's WinRT - // apartment (idempotent; result ignored). `impersonate_active_user()` and `find_output()` are - // this module's `unsafe fn`s whose contracts (call on the activating thread; pass a GDI name) - // are met, and the impersonation is reverted by `_deimp`'s Drop on every return path. Every - // COM/WinRT call thereafter operates on an object obtained + `?`-checked earlier in this same - // block on this single thread — the `IDXGIOutput1` from `find_output`, the device/context from - // `make_device`, the factory/interop/item/pool/session — and the `TypedEventHandler` closure - // captures an `Arc` (Send+Sync) by move. No raw pointers are dereferenced; borrowed - // locals outlive their synchronous calls. - unsafe { - // WGC is WinRT — the calling thread needs a COM/WinRT apartment for the GraphicsCaptureItem - // activation factory (RoGetActivationFactory). Initialize MTA; ignore "already initialized" - // / "changed mode" (another component on this thread may have init'd a compatible apartment). - let ro = RoInitialize(RO_INIT_MULTITHREADED); - // Impersonate the interactive user for the duration of WGC activation (host runs as - // SYSTEM; WGC won't activate under SYSTEM). Reverted by the guard's Drop on return. The - // WGC objects, once created, are accessed from the (SYSTEM) encode thread thereafter. - let imp = impersonate_active_user(); - let _deimp = Deimpersonate(imp); - tracing::info!(ro_result = ?ro, impersonated = imp.is_some(), "WGC: RoInitialize(MTA)"); - // The SudoVDA output appears a beat after the display is created — settle-retry like DDA. - let deadline = Instant::now() + Duration::from_millis(2000); - let (adapter, output) = loop { - if let Some(n) = crate::win_display::resolve_gdi_name(target.target_id) { - if let Ok(found) = find_output(&n) { - break found; - } - } - if let Ok(found) = find_output(&target.gdi_name) { - break found; - } - if Instant::now() >= deadline { - bail!( - "WGC: no DXGI output for SudoVDA target {} yet", - target.target_id - ); - } - std::thread::sleep(Duration::from_millis(100)); - }; - - let (device, context) = make_device(&adapter)?; - let od = output.GetDesc().context("output GetDesc")?; - let hmonitor = od.Monitor; - - // HDR iff the output's colour space is BT.2020 PQ (G2084) — matches the DDA FP16 detection. - // From the same desc, read the source display's mastering metadata (ST.2086) when HDR. - let desc1 = output - .cast::() - .ok() - .and_then(|o6| o6.GetDesc1().ok()); - let hdr = desc1 - .as_ref() - .map(|d1| d1.ColorSpace == DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020) - .unwrap_or(false); - let hdr_meta = if hdr { - desc1.as_ref().map(|d| { - crate::hdr::hdr_meta_from_display( - (d.RedPrimary[0], d.RedPrimary[1]), - (d.GreenPrimary[0], d.GreenPrimary[1]), - (d.BluePrimary[0], d.BluePrimary[1]), - (d.WhitePoint[0], d.WhitePoint[1]), - d.MaxLuminance, - d.MinLuminance, - 0, // MaxCLL: GetDesc1 has no content light level (Apollo zeroes it) - 0, // MaxFALL - ) - }) - } else { - None - }; - - // Wrap our D3D11 device as a WinRT IDirect3DDevice so the frame pool allocates on it (the - // pool textures land on our device → CopyResource + NVENC are same-device, no readback). - let dxgi_device: IDXGIDevice = device.cast().context("ID3D11Device as IDXGIDevice")?; - let inspectable: IInspectable = CreateDirect3D11DeviceFromDXGIDevice(&dxgi_device) - .context("CreateDirect3D11DeviceFromDXGIDevice")?; - let d3d_device: windows::Graphics::DirectX::Direct3D11::IDirect3DDevice = inspectable - .cast() - .context("IInspectable as IDirect3DDevice")?; - - tracing::info!(hdr, "WGC: device ready, creating capture item"); - // GraphicsCaptureItem for the monitor (the SudoVDA output enumerates as a normal monitor). - let interop: IGraphicsCaptureItemInterop = - windows::core::factory::() - .context("GraphicsCaptureItem interop factory")?; - let item: GraphicsCaptureItem = interop - .CreateForMonitor(hmonitor) - .context("CreateForMonitor")?; - let size = item.Size().context("item Size")?; - let (width, height) = (size.Width.max(0) as u32, size.Height.max(0) as u32); - tracing::info!( - width, - height, - "WGC: capture item created, creating frame pool" - ); - - let pixel_format = if hdr { - DirectXPixelFormat::R16G16B16A16Float // scRGB FP16 — same surface DDA gives on HDR - } else { - DirectXPixelFormat::B8G8R8A8UIntNormalized - }; - // Extra buffers: SDR zero-copy holds the last `HELD_FRAMES` frames (encoded in place), so - // the pool needs headroom beyond that for the producer to keep rendering at 240 Hz. - let pool = Direct3D11CaptureFramePool::CreateFreeThreaded( - &d3d_device, - pixel_format, - WGC_POOL_BUFFERS, - size, - ) - .context("CreateFreeThreaded frame pool")?; - - let signal = Arc::new(WgcSignal { - available: AtomicU64::new(0), - mtx: Mutex::new(()), - cv: Condvar::new(), - }); - let sig = signal.clone(); - let handler = TypedEventHandler::::new( - move |_pool, _arg| { - sig.available.fetch_add(1, Ordering::Release); - sig.cv.notify_one(); - Ok(()) - }, - ); - let token = pool.FrameArrived(&handler).context("FrameArrived")?; - - tracing::info!("WGC: creating capture session"); - let session = pool - .CreateCaptureSession(&item) - .context("CreateCaptureSession")?; - // OS composites the cursor into the frame (HDR-correct, no manual composite pass). - let _ = session.SetIsCursorCaptureEnabled(true); - // Drop the yellow capture border (best-effort — older builds reject it). - let _ = session.SetIsBorderRequired(false); - // Lift the 60 Hz cap: allow up to the client's refresh (Win11 24H2+; below that this is a - // no-op and WGC caps ~60). 100 ns ticks per frame. - let refresh = preferred - .map(|(_, _, hz)| hz) - .filter(|&hz| hz > 0) - .unwrap_or(60); - let ticks = (10_000_000i64 / refresh.max(1) as i64).max(1); - let _ = session.SetMinUpdateInterval(TimeSpan { Duration: ticks }); - tracing::info!("WGC: StartCapture"); - session.StartCapture().context("StartCapture")?; - // WGC fires FrameArrived on CHANGE; a static desktop may never deliver the first frame - // (→ black, then the next_frame deadline ends the session). Nudge the cursor onto the - // output to force the first composition change, exactly like the DDA path does. - nudge_cursor_onto(&output); - - let timeout_ms = (2000 / refresh.max(1) as u64).max(8); - tracing::info!( - width, - height, - hdr, - refresh, - "WGC capture started ({})", - if hdr { - "HDR FP16→BT.2020 PQ" - } else { - "SDR BGRA" - } - ); - - Ok(Self { - device, - context, - pool, - session, - _item: item, - _frame_arrived_token: token, - signal, - consumed: 0, - width, - height, - timeout_ms, - first_frame: true, - hdr, - hdr_meta, - hdr_conv: None, - fp16_src: None, - fp16_srv: None, - hdr_p010_conv: None, - p010_out: Vec::new(), - p010_idx: 0, - p010_disabled: false, - out_ring: Vec::new(), - ring_idx: 0, - video_conv: None, - yuv_out: Vec::new(), - yuv_idx: 0, - yuv_is_hdr: false, - vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(), - held: VecDeque::new(), - last_present: None, - _keepalive: None, - }) - } - } - - /// Take ownership of the SudoVDA keepalive once the WGC session is confirmed open. - pub fn attach_keepalive(&mut self, keepalive: Box) { - self._keepalive = Some(keepalive); - } - - /// Block until a new frame arrives (cv), then drain `TryGetNextFrame` to the NEWEST queued frame - /// (skip stale → one-frame latency). Returns `None` on timeout (no new frame → caller repeats). - fn wait_and_drain(&mut self) -> Option { - let wait_ms = if self.first_frame { - 2000 - } else { - self.timeout_ms - }; - { - let mut g = self.signal.mtx.lock().unwrap(); - while self.signal.available.load(Ordering::Acquire) <= self.consumed { - let (ng, res) = self - .signal - .cv - .wait_timeout(g, Duration::from_millis(wait_ms)) - .unwrap(); - g = ng; - if res.timed_out() { - return None; - } - } - } - let target = self.signal.available.load(Ordering::Acquire); - let mut last = None; - while self.consumed < target { - if let Ok(f) = self.pool.TryGetNextFrame() { - last = Some(f); - } - self.consumed += 1; - } - last - } - - unsafe fn ensure_fp16_src(&mut self) -> Result<()> { - if self.fp16_src.is_some() { - return Ok(()); - } - let desc = tex_desc( - self.width, - self.height, - DXGI_FORMAT_R16G16B16A16_FLOAT, - (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32, - ); - let mut t = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(wgc fp16 src)")?; - let t = t.context("fp16 src")?; - let mut srv = None; - self.device - .CreateShaderResourceView(&t, None, Some(&mut srv))?; - self.fp16_srv = Some(srv.context("fp16 srv")?); - self.fp16_src = Some(t); - Ok(()) - } - - /// Lazily allocate the HDR output texture ring (R10G10B10A2, the convert pass's render target → - /// NVENC input), `RENDER_TARGET`-bindable. SDR is zero-copy (encodes the WGC pool texture in - /// place) and uses no ring. - unsafe fn ensure_out_ring( - &mut self, - format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT, - ) -> Result<()> { - if !self.out_ring.is_empty() { - return Ok(()); - } - let desc = tex_desc( - self.width, - self.height, - format, - D3D11_BIND_RENDER_TARGET.0 as u32, - ); - for _ in 0..OUT_RING { - let mut t = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(wgc out ring)")?; - self.out_ring.push(t.context("wgc out ring tex")?); - } - Ok(()) - } - - /// Convert `input` (the OS-composited WGC pool texture: BGRA or scRGB FP16) → NVENC's native YUV - /// (NV12 / P010) on the video processor. Returns the YUV texture (from a ring so consecutive - /// encodes don't collide), or `None` to fall back to the legacy RGB paths. - unsafe fn convert_to_yuv( - &mut self, - input: &ID3D11Texture2D, - hdr: bool, - ) -> Option { - if self.vp_disabled { - return None; - } - if self.video_conv.is_none() || self.yuv_out.is_empty() || self.yuv_is_hdr != hdr { - self.video_conv = None; - self.yuv_out.clear(); - self.yuv_idx = 0; - let vc = match VideoConverter::new( - &self.device, - &self.context, - self.width, - self.height, - hdr, - ) { - Ok(vc) => vc, - Err(e) => { - tracing::warn!(error = %format!("{e:#}"), - "WGC: video processor unavailable — falling back to RGB path"); - self.vp_disabled = true; - return None; - } - }; - let fmt = if hdr { - windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010 - } else { - windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12 - }; - let desc = tex_desc( - self.width, - self.height, - fmt, - D3D11_BIND_RENDER_TARGET.0 as u32, - ); - for _ in 0..OUT_RING { - let mut t = None; - if self - .device - .CreateTexture2D(&desc, None, Some(&mut t)) - .is_err() - { - tracing::warn!("WGC: CreateTexture2D(YUV) failed — falling back to RGB path"); - self.vp_disabled = true; - self.yuv_out.clear(); - return None; - } - let Some(tex) = t else { - self.vp_disabled = true; - self.yuv_out.clear(); - return None; - }; - self.yuv_out.push(tex); - } - self.video_conv = Some(vc); - self.yuv_is_hdr = hdr; - tracing::info!( - hdr, - "WGC: video-processor YUV path active ({})", - if hdr { "P010" } else { "NV12" } - ); - } - let slot = self.yuv_idx; - self.yuv_idx = (self.yuv_idx + 1) % self.yuv_out.len(); - let out = self.yuv_out[slot].clone(); - if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) { - tracing::warn!(error = %format!("{e:#}"), - "WGC: VideoProcessorBlt failed — falling back to RGB path"); - self.vp_disabled = true; - self.video_conv = None; - self.yuv_out.clear(); - return None; - } - Some(out) - } - - /// `PUNKTFUNK_HDR_SHADER_P010` path: convert the OS-composited FP16 scRGB capture DIRECTLY to a - /// host-owned P010 texture (BT.2020 PQ, 10-bit limited range) via [`HdrP010Converter`] — two - /// shader passes writing the P010 planes. NVENC then takes native P010 and skips its internal - /// RGB→YUV CSC. Returns the next ring slot's P010 texture, or `Err` if the converter / a planar - /// plane RTV fails (the caller latches `p010_disabled` and falls back to the R10 path). - unsafe fn hdr_to_p010(&mut self, src: &ID3D11Texture2D) -> Result { - let slot = self.p010_idx; - // Lazily allocate the FP16 source (shared with the R10 path) + the P010 output ring. - self.ensure_fp16_src()?; - let fp16 = self.fp16_src.clone().context("fp16 src")?; - self.context.CopyResource(&fp16, src); - if self.p010_out.is_empty() { - let desc = tex_desc( - self.width, - self.height, - windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010, - D3D11_BIND_RENDER_TARGET.0 as u32, - ); - for _ in 0..OUT_RING { - let mut t = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(wgc p010 ring)")?; - self.p010_out.push(t.context("wgc p010 ring tex")?); - } - } - self.p010_idx = (self.p010_idx + 1) % self.p010_out.len(); - let out = self.p010_out[slot].clone(); - if self.hdr_p010_conv.is_none() { - self.hdr_p010_conv = Some(HdrP010Converter::new(&self.device)?); - } - let srv = self.fp16_srv.clone().context("fp16 srv")?; - self.hdr_p010_conv.as_ref().unwrap().convert( - &self.device, - &self.context, - &srv, - &out, - self.width, - self.height, - )?; - Ok(out) - } - - fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result { - // SAFETY: runs on the capturer's single owning thread. `frame` is a live - // `Direct3D11CaptureFrame` from `self.pool`; `frame.Surface().cast::().GetInterface()` yields the frame's backing `ID3D11Texture2D`, which belongs to - // `self.device` (the pool was created on it via `CreateDirect3D11DeviceFromDXGIDevice`). Every - // helper called here — `hdr_to_p010`, `convert_to_yuv`, `ensure_fp16_src`, `ensure_out_ring`, - // `HdrConverter::convert`, `CopyResource`, `CreateRenderTargetView` — operates on - // `self.device`/`self.context` and that same-device texture, so all resources share one device. - // The frame is held in `self.held` until its async GPU read completes for the zero-copy paths. - // Single-threaded immediate-context use; borrowed textures/SRVs/RTVs outlive each synchronous call. - unsafe { - let surface = frame.Surface().context("frame Surface")?; - let access: IDirect3DDxgiInterfaceAccess = surface - .cast() - .context("surface as IDirect3DDxgiInterfaceAccess")?; - let src: ID3D11Texture2D = access - .GetInterface() - .context("GetInterface ID3D11Texture2D")?; - - // GATED P010-shader path (`PUNKTFUNK_HDR_SHADER_P010`): for HDR, emit P010 (BT.2020 PQ - // 10-bit limited range) DIRECTLY from our shader so NVENC takes native P010 and skips its - // SM-side RGB→YUV CSC. Runs BEFORE the R10 + VideoProcessor path. A converter/plane-RTV - // failure latches `p010_disabled` → we fall through to the unchanged R10 path for the rest - // of the session. Default OFF → none of this executes and behaviour is byte-for-byte as - // today. - if self.hdr && !self.p010_disabled && hdr_shader_p010_enabled() { - match self.hdr_to_p010(&src) { - Ok(p010) => { - // The P010 output is host-owned (the ring), and the FP16 CopyResource read - // `src` synchronously on the immediate context before the shader passes — so we - // do NOT need to hold `frame` past here (unlike the SDR/R10 in-place paths). - // Dropping it returns the pool buffer to WGC immediately. - drop(frame); - self.last_present = Some((p010.clone(), PixelFormat::P010)); - return Ok(self.d3d11_frame(p010, PixelFormat::P010)); - } - Err(e) => { - tracing::warn!(error = %format!("{e:#}"), - "WGC: HDR P010 shader path failed — disabling it, falling back to R10"); - self.p010_disabled = true; - self.hdr_p010_conv = None; - self.p010_out.clear(); - } - } - } - - // Preferred path: convert the OS-composited capture (cursor already in it) DIRECTLY to - // NVENC's native YUV on the video processor — no CopyResource, no cursor draw, and NVENC - // skips its internal RGB→YUV (the contended 3D step). WGC's multi-buffer pool + held set - // means reading the pool texture directly does NOT serialize (unlike DDA's single-frame - // model). The frame is held until the async Blt finishes. (HDR: the video processor can't - // ingest FP16 scRGB, so the Blt fails and we fall back to the R10 path below; the - // `PUNKTFUNK_HDR_SHADER_P010` branch above is the off-the-SM HDR path.) - if let Some(yuv) = self.convert_to_yuv(&src, self.hdr) { - let fmt = if self.hdr { - PixelFormat::P010 - } else { - PixelFormat::Nv12 - }; - self.last_present = Some((yuv.clone(), fmt)); - let out = self.d3d11_frame(yuv, fmt); - self.held.push_back(frame); - while self.held.len() > HELD_FRAMES { - self.held.pop_front(); - } - return Ok(out); - } - - // --- fallback (video processor unavailable) --- - if self.hdr { - // Next ring slot — the in-flight encode reads the slot we handed out last time, so - // this capture must land in a different one (see `out_ring`). - let slot = self.ring_idx; - self.ring_idx = (self.ring_idx + 1) % OUT_RING; - // FP16 (cursor already composited by the OS) → BT.2020 PQ 10-bit for NVENC. - self.ensure_fp16_src()?; - let fp16 = self.fp16_src.clone().context("fp16 src")?; - self.context.CopyResource(&fp16, &src); - self.ensure_out_ring(DXGI_FORMAT_R10G10B10A2_UNORM)?; - let out = self.out_ring[slot].clone(); - if self.hdr_conv.is_none() { - self.hdr_conv = Some(HdrConverter::new(&self.device)?); - } - let srv = self.fp16_srv.clone().context("fp16 srv")?; - let mut rtv: Option = None; - self.device - .CreateRenderTargetView(&out, None, Some(&mut rtv))?; - let rtv = rtv.context("hdr10 rtv")?; - self.hdr_conv.as_ref().unwrap().convert( - &self.context, - &srv, - &rtv, - self.width, - self.height, - ); - self.last_present = Some((out.clone(), PixelFormat::Rgb10a2)); - Ok(self.d3d11_frame(out, PixelFormat::Rgb10a2)) - } else { - // SDR ZERO-COPY: hand NVENC the WGC pool texture DIRECTLY — no `CopyResource`. The - // per-frame copy otherwise queues on the graphics engine behind a GPU-saturating game - // and stalls `lock_bitstream` ~20 ms (NVENC sits idle waiting for its input). Encoding - // the pool texture in place removes that graphics-queue dependency (Apollo's model). - // We must keep the frame alive until its async encode finishes, so retain the last - // `HELD_FRAMES`; the pool has spare buffers so the producer never starves. - self.last_present = Some((src.clone(), PixelFormat::Bgra)); - let out = self.d3d11_frame(src, PixelFormat::Bgra); - self.held.push_back(frame); - while self.held.len() > HELD_FRAMES { - self.held.pop_front(); - } - Ok(out) - } - } - } - - fn d3d11_frame(&self, texture: ID3D11Texture2D, format: PixelFormat) -> CapturedFrame { - CapturedFrame { - width: self.width, - height: self.height, - pts_ns: now_ns(), - format, - payload: FramePayload::D3d11(D3d11Frame { - texture, - device: self.device.clone(), - }), - } - } -} - -impl Capturer for WgcCapturer { - fn hdr_meta(&self) -> Option { - self.hdr_meta - } - - fn next_frame(&mut self) -> Result { - let overall = Instant::now() + Duration::from_secs(20); - loop { - if let Some(frame) = self.wait_and_drain() { - self.first_frame = false; - return self.process_frame(frame); - } - // No new frame within the wait — repeat the last presented frame (static desktop). - if let Some((tex, fmt)) = &self.last_present { - return Ok(self.d3d11_frame(tex.clone(), *fmt)); - } - if Instant::now() > overall { - bail!("no WGC frame within 20s (SudoVDA monitor not lit / no capture access?)"); - } - } - } - - fn try_latest(&mut self) -> Result> { - let target = self.signal.available.load(Ordering::Acquire); - if target <= self.consumed { - return Ok(None); - } - let mut last = None; - while self.consumed < target { - if let Ok(f) = self.pool.TryGetNextFrame() { - last = Some(f); - } - self.consumed += 1; - } - match last { - Some(frame) => self.process_frame(frame).map(Some), - None => Ok(None), - } - } - // set_active: the trait default (no-op) is correct — WGC keeps its session running across the - // active/idle gate (cheap; the frame pool just recycles), like the DDA duplication. -} - -impl Drop for WgcCapturer { - fn drop(&mut self) { - let _ = self.session.Close(); - let _ = self.pool.Close(); - // _keepalive drops after, REMOVEing the SudoVDA monitor. - } -} - -fn tex_desc( - width: u32, - height: u32, - format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT, - bind: u32, -) -> D3D11_TEXTURE2D_DESC { - D3D11_TEXTURE2D_DESC { - Width: width, - Height: height, - MipLevels: 1, - ArraySize: 1, - Format: format, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - Usage: D3D11_USAGE_DEFAULT, - BindFlags: bind, - CPUAccessFlags: 0, - MiscFlags: 0, - } -} - -fn now_ns() -> u64 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .map(|d| d.as_nanos() as u64) - .unwrap_or(0) -} diff --git a/crates/punktfunk-host/src/capture/windows/wgc_relay.rs b/crates/punktfunk-host/src/capture/windows/wgc_relay.rs deleted file mode 100644 index b2c1147..0000000 --- a/crates/punktfunk-host/src/capture/windows/wgc_relay.rs +++ /dev/null @@ -1,484 +0,0 @@ -//! Host-side WGC helper relay (Windows two-process secure-desktop design, -//! design/archive/windows-secure-desktop.md — step 4). -//! -//! WGC won't activate under the SYSTEM account, so the SYSTEM host can't capture the normal desktop -//! itself. Instead it spawns `punktfunk-host wgc-helper` in the **interactive user session** (so WGC works) -//! via `CreateProcessAsUserW`, with the helper's **stdout** redirected to an anonymous pipe the host -//! reads. The helper ships framed Annex-B access units; this module parses them back into AUs the -//! host relays onto the live QUIC session (same `EncodedFrame` flow, just sourced over a pipe instead -//! of a local encoder). A second pipe carries a tiny **control** channel to the helper (stdin: force -//! keyframe), and the helper's **stderr** is forwarded line-by-line into host tracing so its logs are -//! visible from the SYSTEM host's console. -//! -//! Wire framing (must match `wgc_helper::write_au`): per AU -//! `[u32 magic "PFAU" LE][u32 len LE][u64 pts_ns LE][u8 keyframe][len bytes data]`. - -// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). -#![deny(clippy::undocumented_unsafe_blocks)] - -use crate::capture::dxgi::WinCaptureTarget; -use anyhow::{bail, Context, Result}; -use std::io::{BufRead, BufReader, Read}; -use std::sync::mpsc::{Receiver, SyncSender}; -use std::sync::Mutex; -use windows::core::PWSTR; -use windows::Win32::Foundation::SetHandleInformation; -use windows::Win32::Foundation::{CloseHandle, HANDLE}; -use windows::Win32::Foundation::{HANDLE_FLAGS, HANDLE_FLAG_INHERIT}; -use windows::Win32::Security::{ - DuplicateTokenEx, SecurityImpersonation, TokenPrimary, SECURITY_ATTRIBUTES, TOKEN_ALL_ACCESS, -}; -use windows::Win32::System::Environment::{CreateEnvironmentBlock, DestroyEnvironmentBlock}; -use windows::Win32::System::Pipes::CreatePipe; -use windows::Win32::System::RemoteDesktop::{WTSGetActiveConsoleSessionId, WTSQueryUserToken}; -use windows::Win32::System::Threading::{ - CreateProcessAsUserW, TerminateProcess, CREATE_NO_WINDOW, CREATE_UNICODE_ENVIRONMENT, - PROCESS_INFORMATION, STARTF_USESTDHANDLES, STARTUPINFOW, -}; - -/// Must match [`crate::wgc_helper`]'s `AU_MAGIC` ("PFAU"). -const AU_MAGIC: u32 = 0x5046_4155; - -/// One access unit relayed from the helper, in the helper's (= the host's, same machine) monotonic -/// clock — `pts_ns` is directly comparable to the host's `now_ns()`. -pub struct RelayAu { - pub data: Vec, - pub pts_ns: u64, - pub keyframe: bool, -} - -/// A running USER-session WGC helper whose AUs the SYSTEM host relays. Drop kills the child + closes -/// the pipes; the reader threads then end on the broken pipe. -pub struct HelperRelay { - proc: HANDLE, - thread: HANDLE, - /// Host write end of the helper's stdin — control commands (force keyframe). Mutex so the relay - /// can be shared while the encode thread requests keyframes. - stdin_w: Mutex, - /// Parsed AUs from the helper's stdout reader thread. - rx: Receiver, -} - -// SAFETY: every field is itself `Send`: the `proc`/`thread` `HANDLE`s are process-global kernel -// handle values (plain integers valid from any thread, owned for the relay's lifetime and closed once -// on Drop), `stdin_w` is a `Mutex`, and `rx` is an mpsc `Receiver` (which is `Send`). -// The relay is moved to one thread and owned there, so transferring it across threads is sound. -unsafe impl Send for HelperRelay {} -// NOTE: `HelperRelay` is deliberately NOT `Sync`. Its `rx: Receiver` is `!Sync` (std mpsc -// is single-consumer), and the relay is only ever a single-owner local in the punktfunk1 two-process -// mux loop — never shared by `&` across threads — so `Sync` is neither sound nor needed. (A prior -// `unsafe impl Sync` here asserted more than the fields support; removed.) - -/// Control byte on the helper's stdin: force the next encoded frame to be an IDR (client decode -/// recovery). Mirrors `enc.request_keyframe()` in the single-process path. -const CTL_KEYFRAME: u8 = 0x01; - -impl HelperRelay { - /// Spawn the helper in the interactive user session and start relaying its AUs. `target` is the - /// SudoVDA output the host already created (captured by GDI name only — the helper never touches - /// display topology). `(w, h, hz)` is the negotiated mode; `bitrate_kbps` the negotiated bitrate. - pub fn spawn( - target: &WinCaptureTarget, - mode: (u32, u32, u32), - bitrate_kbps: u32, - bit_depth: u8, - ) -> Result { - let exe = std::env::current_exe().context("current_exe for helper spawn")?; - let exe = exe.to_string_lossy().into_owned(); - let (w, h, hz) = mode; - // CreateProcessAsUserW takes a single mutable command line (argv[0] = exe). - let cmdline = format!( - "\"{exe}\" wgc-helper --gdi \"{}\" --target-id {} --mode {w}x{h}x{hz} --bitrate {bitrate_kbps} --bit-depth {bit_depth}", - target.gdi_name, target.target_id - ); - tracing::info!(cmd = %cmdline, "spawning WGC helper in user session"); - - // SAFETY: `spawn_inner` is an `unsafe fn` only because it drives raw Win32 token/pipe/process - // FFI; it imposes no caller-side memory precondition beyond valid arguments. `cmdline` is a live - // `&str` borrowed for the synchronous call and `(w, h, hz)` are plain `u32`s. It validates its - // own runtime requirements (active console session, SYSTEM token) and returns `Err` otherwise. - unsafe { spawn_inner(&cmdline, w, h, hz) } - } - - /// Receive the next relayed AU. Distinguishes a `Timeout` (helper slow/stalled — keep waiting) - /// from `Disconnected` (helper exited → its stdout closed → reader thread ended → channel - /// dropped), which returns *immediately* and means the relay must stop, not spin. - pub fn recv_timeout( - &self, - dur: std::time::Duration, - ) -> Result { - self.rx.recv_timeout(dur) - } - - /// Non-blocking receive — used to drain stale buffered AUs (encoded while the secure desktop was - /// the live source) before resuming the relay. `Ok` while AUs remain, `Err` once empty. - pub fn try_recv(&self) -> Result { - self.rx.try_recv() - } - - /// Ask the helper's encoder for an IDR on the next frame (client decode recovery). Best-effort: - /// a write failure means the helper is gone — the caller's recv loop will see the disconnect. - pub fn request_keyframe(&self) { - let h = self.stdin_w.lock().unwrap(); - let mut written = 0u32; - // SAFETY: `*h` is the host's write end of the helper's stdin pipe — a live `HANDLE` owned by - // this `HelperRelay` (held under the `stdin_w` Mutex, locked here), closed only in Drop. - // `WriteFile` reads the 1-byte `&[CTL_KEYFRAME]` buffer and writes the byte count into - // `written`; both are live locals that outlive the synchronous call. A failure (helper gone) is - // discarded as documented. - unsafe { - let _ = windows::Win32::Storage::FileSystem::WriteFile( - *h, - Some(&[CTL_KEYFRAME]), - Some(&mut written), - None, - ); - } - } -} - -impl Drop for HelperRelay { - fn drop(&mut self) { - // SAFETY: `self.proc`/`self.thread` are the child process/thread `HANDLE`s from - // `CreateProcessAsUserW`, and `stdin_w` is the host's pipe write end — all owned by this - // `HelperRelay` and closed exactly once here in Drop (no double-close). `TerminateProcess` and - // the three `CloseHandle`s are FFI calls taking those handles by value, borrowing no Rust memory. - unsafe { - // Terminate the child first so its WGC capture + NVENC session tear down, then close our - // handles (the reader threads end on the resulting broken pipe). - let _ = TerminateProcess(self.proc, 1); - let _ = CloseHandle(*self.stdin_w.lock().unwrap()); - let _ = CloseHandle(self.proc); - let _ = CloseHandle(self.thread); - } - tracing::info!("WGC helper relay torn down"); - } -} - -/// Inheritable anonymous pipe (read, write). The caller marks whichever end the host keeps as -/// non-inheritable so the child only inherits its own end. -unsafe fn make_pipe() -> Result<(HANDLE, HANDLE)> { - let mut read = HANDLE::default(); - let mut write = HANDLE::default(); - let sa = SECURITY_ATTRIBUTES { - nLength: std::mem::size_of::() as u32, - lpSecurityDescriptor: std::ptr::null_mut(), - bInheritHandle: true.into(), - }; - CreatePipe(&mut read, &mut write, Some(&sa), 0).context("CreatePipe")?; - Ok((read, write)) -} - -/// Mark a handle non-inheritable (the host keeps it; the child must not get a copy). -unsafe fn no_inherit(h: HANDLE) { - let _ = SetHandleInformation(h, HANDLE_FLAG_INHERIT.0, HANDLE_FLAGS(0)); -} - -/// Build a child environment block: the target session's block (so DLL/PATH/SystemRoot resolve) with -/// this process's `PUNKTFUNK_*` vars overlaid, so the child runs with the SAME settings this process -/// has (`PUNKTFUNK_ENCODER=nvenc`, `PUNKTFUNK_ZEROCOPY`, …) instead of the target shell's. Returns a -/// UTF-16, double-null-terminated block suitable for `CREATE_UNICODE_ENVIRONMENT`. Shared by the WGC -/// helper spawn (here) and the Windows service launching the host into the active session. -pub(crate) unsafe fn merged_env_block(user_block: *const u16) -> Vec { - // Parse the user block ("VAR=VALUE\0" … "\0") into entries. - let mut entries: Vec = Vec::new(); - if !user_block.is_null() { - let mut p = user_block; - loop { - let mut len = 0isize; - while *p.offset(len) != 0 { - len += 1; - } - if len == 0 { - break; // the trailing empty string = end of block - } - let slice = std::slice::from_raw_parts(p, len as usize); - entries.push(String::from_utf16_lossy(slice)); - p = p.offset(len + 1); - } - } - // Overlay "our" settings — PUNKTFUNK_* and RUST_LOG — dropping whatever the target block had. - let is_ours = |k: &str| k.starts_with("PUNKTFUNK_") || k == "RUST_LOG"; - entries.retain(|e| !is_ours(e.split('=').next().unwrap_or(""))); - for (k, v) in std::env::vars().filter(|(k, _)| is_ours(k)) { - entries.push(format!("{k}={v}")); - } - // Serialize back to a UTF-16 double-null-terminated block. - let mut block: Vec = Vec::new(); - for e in entries { - block.extend(e.encode_utf16()); - block.push(0); - } - block.push(0); - block -} - -unsafe fn spawn_inner(cmdline: &str, w: u32, h: u32, hz: u32) -> Result { - // The user token of the active console session (requires the host to be SYSTEM). - let session = WTSGetActiveConsoleSessionId(); - if session == 0xFFFF_FFFF { - bail!("no active console session (WTSGetActiveConsoleSessionId)"); - } - let mut user_token = HANDLE::default(); - WTSQueryUserToken(session, &mut user_token) - .context("WTSQueryUserToken (host must run as SYSTEM)")?; - - // A primary token for CreateProcessAsUserW. - let mut primary = HANDLE::default(); - let dup = DuplicateTokenEx( - user_token, - TOKEN_ALL_ACCESS, - None, - SecurityImpersonation, - TokenPrimary, - &mut primary, - ); - let _ = CloseHandle(user_token); - dup.context("DuplicateTokenEx(TokenPrimary)")?; - - // The user's environment block (PATH, USERPROFILE, SystemRoot → DLL resolution), MERGED with the - // host's PUNKTFUNK_* vars. CreateProcessAsUserW would otherwise give the helper the *user's* env - // only, dropping PUNKTFUNK_ENCODER=nvenc / PUNKTFUNK_ZEROCOPY/… that the host runs with — so the - // helper would fall back to the software (H.264-only) encoder. We parse the user block, strip any - // PUNKTFUNK_* it has, append the host's, and pass the merged block. - let mut env_block: *mut core::ffi::c_void = std::ptr::null_mut(); - let _ = CreateEnvironmentBlock(&mut env_block, Some(primary), false); - let merged_env = merged_env_block(env_block as *const u16); - if !env_block.is_null() { - let _ = DestroyEnvironmentBlock(env_block); - } - - // Three pipes: stdout (helper→host AUs), stdin (host→helper control), stderr (helper→host logs). - let (out_r, out_w) = make_pipe().context("stdout pipe")?; - let (in_r, in_w) = make_pipe().context("stdin pipe")?; - let (err_r, err_w) = make_pipe().context("stderr pipe")?; - // The host keeps out_r / in_w / err_r — none inheritable; the child inherits out_w/in_r/err_w. - no_inherit(out_r); - no_inherit(in_w); - no_inherit(err_r); - - let mut si = STARTUPINFOW { - cb: std::mem::size_of::() as u32, - dwFlags: STARTF_USESTDHANDLES, - hStdInput: in_r, - hStdOutput: out_w, - hStdError: err_w, - ..Default::default() - }; - // WGC needs the interactive desktop. - let mut desktop: Vec = "winsta0\\default\0".encode_utf16().collect(); - si.lpDesktop = PWSTR(desktop.as_mut_ptr()); - - let mut cmd: Vec = cmdline.encode_utf16().chain(std::iter::once(0)).collect(); - let mut pi = PROCESS_INFORMATION::default(); - - let created = CreateProcessAsUserW( - Some(primary), - None, - Some(PWSTR(cmd.as_mut_ptr())), - None, - None, - true, // inherit handles (the child's std ends) - CREATE_UNICODE_ENVIRONMENT | CREATE_NO_WINDOW, - Some(merged_env.as_ptr() as *const core::ffi::c_void), - None, - &si, - &mut pi, - ); - - // Clean up regardless of outcome: the child now owns its inherited ends; close our copies. - let _ = CloseHandle(out_w); - let _ = CloseHandle(in_r); - let _ = CloseHandle(err_w); - let _ = CloseHandle(primary); - - if let Err(e) = created { - let _ = CloseHandle(out_r); - let _ = CloseHandle(in_w); - let _ = CloseHandle(err_r); - return Err(e).context("CreateProcessAsUserW(wgc-helper)"); - } - tracing::info!(pid = pi.dwProcessId, mode = %format!("{w}x{h}@{hz}"), "WGC helper spawned"); - - // The helper does the WGC capture + NVENC encode, but it runs under the user's UAC-FILTERED token - // (no SE_INC_BASE_PRIORITY), so it can't raise its OWN GPU scheduling-priority class — under a - // GPU-saturating game NVENC then gets starved (the "240→40 fps in-game collapse"). The SYSTEM host - // holds the privilege, so stamp the HIGH GPU priority class onto the child here, right after spawn - // (the process-level class applies to the GPU contexts the helper creates afterwards). - crate::capture::dxgi::set_child_gpu_priority_class(pi.hProcess); - - // stderr → host tracing, line by line. - let err_handle = HandleReader(err_r); - std::thread::Builder::new() - .name("wgc-helper-log".into()) - .spawn(move || { - let r = BufReader::new(err_handle); - for line in r.lines() { - match line { - Ok(l) if !l.trim().is_empty() => tracing::info!(target: "wgc_helper", "{l}"), - Ok(_) => {} - Err(_) => break, - } - } - }) - .ok(); - - // stdout → parsed AUs. Bounded so a stalled relay applies backpressure (the pipe then fills and - // the helper blocks on write — the same backpressure the single-process channel gives). - let (tx, rx) = std::sync::mpsc::sync_channel::(3); - let out_handle = HandleReader(out_r); - std::thread::Builder::new() - .name("wgc-helper-au".into()) - .spawn(move || au_reader(out_handle, tx)) - .ok(); - - Ok(HelperRelay { - proc: pi.hProcess, - thread: pi.hThread, - stdin_w: Mutex::new(in_w), - rx, - }) -} - -/// Parse the AU framing off the helper's stdout and forward each AU. Ends (returns) when the pipe -/// breaks (helper exit) or the channel's receiver is dropped (relay torn down). -fn au_reader(mut r: HandleReader, tx: SyncSender) { - loop { - let mut hdr = [0u8; 4 + 4 + 8 + 1]; - if r.read_exact(&mut hdr).is_err() { - break; - } - let magic = u32::from_le_bytes([hdr[0], hdr[1], hdr[2], hdr[3]]); - if magic != AU_MAGIC { - tracing::error!( - magic = format!("{magic:#x}"), - "WGC helper AU stream desync — aborting relay" - ); - break; - } - let len = u32::from_le_bytes([hdr[4], hdr[5], hdr[6], hdr[7]]) as usize; - let pts_ns = u64::from_le_bytes([ - hdr[8], hdr[9], hdr[10], hdr[11], hdr[12], hdr[13], hdr[14], hdr[15], - ]); - let keyframe = hdr[16] != 0; - // Bound the allocation — a corrupt length must not OOM the host. 64 MiB is far above any real - // AU (a 5K keyframe is a few MB). - if len > 64 * 1024 * 1024 { - tracing::error!(len, "WGC helper AU length implausible — aborting relay"); - break; - } - let mut data = vec![0u8; len]; - if r.read_exact(&mut data).is_err() { - break; - } - if tx - .send(RelayAu { - data, - pts_ns, - keyframe, - }) - .is_err() - { - break; // relay dropped - } - } -} - -/// Minimal `Read` over a Win32 pipe HANDLE (the windows crate doesn't impl `Read` on HANDLE). -struct HandleReader(HANDLE); -// SAFETY: `HandleReader` owns a single pipe `HANDLE` (a process-global kernel handle value, valid from -// any thread). It is moved into the dedicated reader thread and used only there (and closed once on -// Drop), never shared — so transferring ownership across threads is sound. -unsafe impl Send for HandleReader {} -impl Read for HandleReader { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let mut read = 0u32; - // SAFETY: `self.0` is the live read end of an anonymous pipe owned by this `HandleReader` - // (closed only in Drop). `ReadFile` fills the caller-provided `buf` (writing at most `buf.len()` - // bytes) and stores the count in `read`; both outlive the synchronous call. A broken pipe - // surfaces as `Err` and is mapped to EOF below. - let ok = unsafe { - windows::Win32::Storage::FileSystem::ReadFile(self.0, Some(buf), Some(&mut read), None) - }; - match ok { - Ok(()) => Ok(read as usize), - // A broken pipe (helper exited) reads as ERROR_BROKEN_PIPE → report EOF (0). - Err(_) => Ok(0), - } - } -} -impl Drop for HandleReader { - fn drop(&mut self) { - // SAFETY: `self.0` is the pipe `HANDLE` this `HandleReader` owns; `CloseHandle` (an FFI call - // taking the handle by value) is invoked exactly once here in Drop, so there is no double-close. - unsafe { - let _ = CloseHandle(self.0); - } - } -} - -/// Is this process running as the LOCAL SYSTEM account? Used to decide whether the two-process -/// secure-desktop path applies (only SYSTEM can `WTSQueryUserToken` + capture the Winlogon desktop). -pub fn running_as_system() -> bool { - use windows::Win32::Security::{GetTokenInformation, TokenUser, TOKEN_QUERY, TOKEN_USER}; - use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken}; - // SAFETY: `OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &mut token)` opens the current-process - // token (the pseudo-handle is always valid) into `token`, which is closed once before each return. - // The first `GetTokenInformation` (null buffer) queries the required `len`; `buf` is then a - // `Vec` of exactly `len` bytes and the second call fills it, so `&*(buf.as_ptr() as *const - // TOKEN_USER)` reads a `TOKEN_USER` the kernel just wrote into a sufficiently-sized buffer (the - // variable-length SID it points at also lies within `buf`, which outlives the borrow). - // `is_local_system_sid` is this module's `unsafe fn`, given that in-buffer `PSID`. Safe on any thread. - unsafe { - let mut token = HANDLE::default(); - if OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &mut token).is_err() { - return false; - } - let mut len = 0u32; - let _ = GetTokenInformation(token, TokenUser, None, 0, &mut len); - if len == 0 { - let _ = CloseHandle(token); - return false; - } - let mut buf = vec![0u8; len as usize]; - let ok = GetTokenInformation( - token, - TokenUser, - Some(buf.as_mut_ptr() as *mut _), - len, - &mut len, - ) - .is_ok(); - let _ = CloseHandle(token); - if !ok { - return false; - } - let tu = &*(buf.as_ptr() as *const TOKEN_USER); - // The well-known LocalSystem SID is S-1-5-18. - is_local_system_sid(tu.User.Sid) - } -} - -/// True iff `sid` is S-1-5-18 (LocalSystem). -unsafe fn is_local_system_sid(sid: windows::Win32::Security::PSID) -> bool { - use windows::Win32::Security::{ - GetSidIdentifierAuthority, GetSidSubAuthority, GetSidSubAuthorityCount, IsValidSid, - }; - if !IsValidSid(sid).as_bool() { - return false; - } - let auth = GetSidIdentifierAuthority(sid); - if auth.is_null() { - return false; - } - // NT Authority = {0,0,0,0,0,5}. - let a = (*auth).Value; - if a != [0, 0, 0, 0, 0, 5] { - return false; - } - let count = *GetSidSubAuthorityCount(sid); - if count != 1 { - return false; - } - *GetSidSubAuthority(sid, 0) == 18 // SECURITY_LOCAL_SYSTEM_RID -} diff --git a/crates/punktfunk-host/src/config.rs b/crates/punktfunk-host/src/config.rs index c67f326..91676ce 100644 --- a/crates/punktfunk-host/src/config.rs +++ b/crates/punktfunk-host/src/config.rs @@ -6,8 +6,8 @@ //! //! **Goal-1 stages 1–2** (`design/windows-host-rewrite.md` §2.2): stage 1 stood this up; stage 2 migrated the //! genuinely-constant operator/dispatch knobs onto it (the dispatch-disagreement bug class: `idd_push`, -//! `capture_backend`, `encoder_pref`, `render_adapter`, `no_wgc`, the vdisplay backend select — plus the -//! plan-named `secure_dda`/`idd_depth`/`zerocopy`/`ten_bit`/`four_four_four` and the multi-site `perf`/`compositor`/ +//! `encoder_pref`, `render_adapter`, the vdisplay backend select — plus the plan-named +//! `idd_depth`/`zerocopy`/`ten_bit`/`four_four_four` and the multi-site `perf`/`compositor`/ //! `video_source`/`gamepad`). `SessionPlan` (stage 3) consumes it as the single owner of the //! capture/topology/encoder decision. //! @@ -36,27 +36,17 @@ use std::sync::OnceLock; /// derived `Debug` impl, so the parser can stay a single platform-neutral function. #[derive(Debug, Clone, Default)] pub struct HostConfig { - /// `PUNKTFUNK_IDD_PUSH` — capture from the pf-vdisplay driver's shared ring (in-process Session-0 - /// capture; no WGC helper). **Value-aware** (`0`/`false`/`no`/`off`/empty ⇒ off, else on); unset ⇒ off. - /// The installer's default `host.env` sets it on, so a fresh install runs the validated IDD-push path - /// (it falls back to DDA if the driver can't attach — see [`crate::capture`]). NOT a bare presence flag - /// (so an operator can turn it OFF in `host.env` with `=0`, which a `var_os` presence check can't). + /// `PUNKTFUNK_IDD_PUSH` — IDD direct-push monitor mode (the per-session monitor + ring recreate and + /// the discrete-render-GPU pin in [`crate::vdisplay::manager`]). IDD-push is the sole Windows capture + /// path (DXGI Desktop Duplication and the WGC relay were removed), so this should stay on — the + /// installer's `host.env` sets it. **Value-aware** (`0`/`false`/`no`/`off`/empty ⇒ off, else on); + /// unset ⇒ off. NOT a bare presence flag (so an operator can turn it OFF with `=0`). pub idd_push: bool, /// `PUNKTFUNK_ENCODER` — explicit encoder-backend override (lowercased; empty = auto-detect by GPU vendor). pub encoder_pref: String, - /// `PUNKTFUNK_NO_HELPER` — never spawn the user-session WGC helper. - pub no_helper: bool, - /// `PUNKTFUNK_FORCE_HELPER` — force the WGC helper even when not running as SYSTEM. - pub force_helper: bool, - /// `PUNKTFUNK_NO_WGC` — force the pure single-process DDA path (skip WGC and the two-process relay). - pub no_wgc: bool, - /// `PUNKTFUNK_CAPTURE` — explicit Windows capture-backend override (lowercased; `dda`/`dxgi` vs the WGC default). - pub capture_backend: String, /// `PUNKTFUNK_RENDER_ADAPTER` — discrete render-GPU pin by description substring (`Some` even when empty: /// the empty string still counts as "set" for the presence checks, and the value reader filters it). pub render_adapter: Option, - /// `PUNKTFUNK_SECURE_DDA` — enable the experimental DDA-on-secure-desktop (Winlogon/UAC) mux leg. - pub secure_dda: bool, /// `PUNKTFUNK_IDD_DEPTH` — IDD-push pipeline depth override (default 2; the call site clamps to its `OUT_RING`). pub idd_depth: usize, /// `PUNKTFUNK_ZEROCOPY` — opt into the Windows D3D11 zero-copy encode path (presence semantics; see module docs). @@ -103,14 +93,7 @@ impl HostConfig { encoder_pref: std::env::var("PUNKTFUNK_ENCODER") .unwrap_or_default() .to_ascii_lowercase(), - no_helper: flag("PUNKTFUNK_NO_HELPER"), - force_helper: flag("PUNKTFUNK_FORCE_HELPER"), - no_wgc: flag("PUNKTFUNK_NO_WGC"), - capture_backend: std::env::var("PUNKTFUNK_CAPTURE") - .unwrap_or_default() - .to_ascii_lowercase(), render_adapter: val("PUNKTFUNK_RENDER_ADAPTER"), - secure_dda: flag("PUNKTFUNK_SECURE_DDA"), idd_depth: val("PUNKTFUNK_IDD_DEPTH") .and_then(|s| s.parse::().ok()) .unwrap_or(2), diff --git a/crates/punktfunk-host/src/main.rs b/crates/punktfunk-host/src/main.rs index fc7c360..d04c270 100644 --- a/crates/punktfunk-host/src/main.rs +++ b/crates/punktfunk-host/src/main.rs @@ -56,9 +56,6 @@ mod spike; mod stats_recorder; mod vdisplay; #[cfg(target_os = "windows")] -#[path = "windows/wgc_helper.rs"] -mod wgc_helper; -#[cfg(target_os = "windows")] #[path = "windows/win_adapter.rs"] mod win_adapter; #[cfg(target_os = "windows")] @@ -392,35 +389,6 @@ fn real_main() -> Result<()> { paired_store: None, }) } - // USER-session WGC helper (Windows two-process secure-desktop design): capture the EXISTING - // SudoVDA via WGC + NVENC, stream AUs on stdout to the SYSTEM host. Spawned by the host - // (CreateProcessAsUser), not run by hand. See design/archive/windows-secure-desktop.md. - #[cfg(target_os = "windows")] - Some("wgc-helper") => { - let get = |flag: &str| { - args.iter() - .skip_while(|a| *a != flag) - .nth(1) - .map(String::as_str) - }; - let (width, height, fps) = get("--mode") - .and_then(|m| { - let p: Vec = m.split('x').filter_map(|s| s.parse().ok()).collect(); - (p.len() == 3).then(|| (p[0], p[1], p[2])) - }) - .unwrap_or((1920, 1080, 60)); - wgc_helper::run(wgc_helper::HelperOptions { - target_id: get("--target-id").and_then(|s| s.parse().ok()).unwrap_or(0), - gdi_name: get("--gdi").unwrap_or("").to_string(), - width, - height, - fps, - bitrate_kbps: get("--bitrate") - .and_then(|s| s.parse().ok()) - .unwrap_or(20000), - bit_depth: get("--bit-depth").and_then(|s| s.parse().ok()).unwrap_or(8), - }) - } // Windows service control: install/uninstall/start/stop/status + the SCM `run` entry point. // Replaces the ad-hoc launch chain — `service install` registers an auto-start SYSTEM service // that launches the host into the active interactive session. diff --git a/crates/punktfunk-host/src/punktfunk1.rs b/crates/punktfunk-host/src/punktfunk1.rs index 4cdf6bc..aa23779 100644 --- a/crates/punktfunk-host/src/punktfunk1.rs +++ b/crates/punktfunk-host/src/punktfunk1.rs @@ -755,14 +755,18 @@ async fn serve_session( // opens a tiny encoder; it runs only when both opt-ins are set and is cached after the first. let host_wants_444 = crate::config::config().four_four_four; let client_supports_444 = hello.video_caps & punktfunk_core::quic::VIDEO_CAP_444 != 0; - let single_process = crate::session_plan::resolve_topology() - == crate::session_plan::SessionTopology::SingleProcess; + // The active capturer must be able to deliver a full-chroma (RGB) source — the honest-downgrade + // gate. Linux's portal capturer can; the Windows IDD-push path delivers subsampled NV12/P010 + // today (full-chroma IDD-push capture is a follow-up), so it returns false there and the host + // negotiates 4:2:0. (Replaces the old `single_process` gate — single-process is now the only + // topology, and 4:4:4 routed to DDA, which was removed.) + let capture_supports_444 = crate::capture::capturer_supports_444(); // The GPU probe opens a real (tiny) encoder on first use, so run it off the reactor like the // compositor probe above (blocking probes → spawn_blocking). Short-circuit so it only runs when // the cheap gates already pass. The result is cached process-wide (a negative latches until // restart — acceptable: a GPU either supports HEVC 4:4:4 or it doesn't, and a transient open // failure here is rare since the session's own encoder isn't open yet). - let gpu_supports_444 = if host_wants_444 && client_supports_444 && single_process { + let gpu_supports_444 = if host_wants_444 && client_supports_444 && capture_supports_444 { tokio::task::spawn_blocking(|| { crate::encode::can_encode_444(crate::encode::Codec::H265) }) @@ -780,7 +784,7 @@ async fn serve_session( chroma = ?chroma, host_wants_444, client_supports_444, - single_process, + capture_supports_444, "encode chroma" ); @@ -2696,7 +2700,7 @@ fn session_watcher_loop(tx: std::sync::mpsc::Sender, stop: Arc Result<()> { - // This thread runs the capture+encode loop (single-process: Linux / synthetic / NO_WGC DDA) — or - // tail-calls the relay below. Elevate it so a CPU-heavy game can't deschedule our GPU submission. + // This thread runs the capture+encode loop (single-process — the only topology: Linux portal / + // synthetic, Windows in-process IDD-push). Elevate it so a CPU-heavy game can't deschedule our GPU + // submission. boost_thread_priority(true); // Resolve the per-session capture / topology / encoder decision ONCE (Goal-1 stage 3): the deployed // path now reads this typed `SessionPlan` instead of re-deriving from config at each dispatch site @@ -2753,14 +2758,6 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> { // only per-session input — capture/topology/encoder are otherwise pure functions of `HostConfig`. let plan = crate::session_plan::SessionPlan::resolve(ctx.bit_depth, ctx.chroma); tracing::info!(?plan, "resolved session plan"); - // Windows two-process secure-desktop path: when the host runs as SYSTEM (required for the secure - // desktop + SendInput), WGC can't activate in-process, so we capture the normal desktop via a - // helper spawned in the user session and relay its AUs. (Single-process WGC/DDA is used as the - // user, and stays the path on Linux.) See design/archive/windows-secure-desktop.md. - #[cfg(target_os = "windows")] - if plan.topology == crate::session_plan::SessionTopology::TwoProcessRelay { - return virtual_stream_relay(ctx); - } // Single-process path: unpack the context into the locals the loop below uses (names unchanged, so the // body is byte-for-byte the same; the receivers are now owned but `try_recv()` is identical). let SessionContext { @@ -2810,20 +2807,7 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> { #[cfg(target_os = "windows")] drop(_idd_setup_guard); - // Windows single-process DDA path (PUNKTFUNK_NO_WGC=1): the SudoVDA virtual display, isolated as the - // SOLE active output, goes into fullscreen independent-flip (one plane on one display) which Desktop - // Duplication cannot capture → the born-lost ACCESS_LOST storm we measured on the RTX4090+iGPU box - // (hook verified-firing, DPI=2, yet 100% DuplicateOutput1 E_ACCESSDENIED + born-lost). A tiny topmost - // layered overlay disqualifies independent-flip and forces DWM composition, which DDA CAN capture. - // (Apollo never hits this because it runs WITH a physical monitor attached — multi-display is already - // DWM-composited; we isolate to sole-display, so we must force composition ourselves.) Unlike the WGC - // relay path — where WGC owns the normal desktop and the overlay is secure-only — here DDA owns the - // normal desktop too, so it must run unconditionally. Held for the session; Drop tears it down. - // Best-effort; disable with PUNKTFUNK_FORCE_COMPOSED=0. - #[cfg(target_os = "windows")] - let _composed_flip = crate::capture::composed_flip::ForceComposedFlip::start(); - - // Windows: capture is live (and composition forced) — launch the requested library title into the + // Windows: capture is live — launch the requested library title into the // interactive user session so it renders onto the captured desktop and grabs foreground. Linux // nests its launch in gamescope instead (the handshake `PUNKTFUNK_GAMESCOPE_APP` path). Best-effort: // a launch failure (no recipe for the kind, no interactive user) leaves the user on the desktop. @@ -3295,480 +3279,6 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> { Ok(()) } -/// Windows two-process video stream: the SYSTEM host creates the SudoVDA virtual output (and holds -/// its keepalive = the sole topology/isolation owner), spawns the WGC helper in the user session to -/// capture+encode the NORMAL desktop, and relays the helper's AUs onto the QUIC data plane via the -/// same send thread as the single-process path. A [`DesktopWatcher`](crate::capture::desktop_watch) -/// muxes the source: while the input desktop is Winlogon (UAC / lock / login — which WGC can't -/// capture), the host captures it with its OWN DDA encoder; back on Default it resumes the relay. -/// Every source switch latches a "wait for IDR" so the client's decoder resumes on a keyframe (the -/// two encoders keep independent infinite-GOP state). Reconfigure rebuilds the output + re-spawns the -/// helper at the new mode (and drops the stale-target DDA); keyframe requests forward to the active -/// source. -#[cfg(target_os = "windows")] -fn virtual_stream_relay(ctx: SessionContext) -> Result<()> { - use crate::capture::dxgi::WinCaptureTarget; - use crate::capture::wgc_relay::HelperRelay; - use crate::capture::Capturer; // trait methods (set_active/next_frame) on the concrete DuplCapturer - - // Unpack the context (names unchanged so the body is identical). The relay doesn't yet send the - // source's 0xCE HDR metadata — the helper's in-band SEI carries it (a Windows follow-up) — so `conn` - // is held unused. - let SessionContext { - session, - mode, - seconds, - stop, - reconfig, - keyframe, - compositor, - bitrate_kbps, - bit_depth, - // The two-process WGC relay encodes 4:2:0 in v1 — the handshake's `single_process` gate already - // forced `chroma` to Yuv420 for this topology, so the helper + secure-desktop DDA stay 4:2:0. - chroma: _, - probe_rx, - probe_result_tx, - fec_target, - conn: _conn, - stats, - client_label, - launch, - } = ctx; - tracing::info!( - ?mode, - bitrate_kbps, - bit_depth, - "punktfunk/1 two-process stream (SYSTEM host + user-session WGC helper)" - ); - - let mut vd = crate::vdisplay::open(compositor)?; - - // Create the SudoVDA output + spawn a helper capturing it by GDI name. Returns the keepalive - // (held for the output's life — the sole isolation owner), the running relay, the capture target - // (so the host can also open DDA on it for the secure desktop), and the achieved refresh. - type Built = (Box, HelperRelay, WinCaptureTarget, u32); - let build = |vd: &mut Box, - mode: punktfunk_core::Mode| - -> Result { - let vout = vd.create(mode).context("create virtual output")?; - let effective_hz = vout - .preferred_mode - .map(|(_, _, hz)| hz) - .filter(|&hz| hz > 0) - .unwrap_or(mode.refresh_hz); - let target = vout.win_capture.clone().ok_or_else(|| { - anyhow!("SudoVDA target not yet an active display (needs a WDDM GPU to activate it)") - })?; - // HDR is driven by the SudoVDA monitor's ACTUAL advanced-color state, not the handshake bit - // depth: the whole pipeline follows the monitor (WGC captures FP16 when HDR is on; NVENC forces - // Main10 + BT.2020 PQ from the 10-bit capture format regardless of the negotiated depth; the - // client auto-detects PQ from the HEVC VUI). So: - // - a negotiated 10-bit session PROACTIVELY enables HDR on the monitor (below), but - // - we must NEVER force HDR *off* here — that would wipe out a user's deliberate Windows HDR - // toggle on the virtual display on every build (the "HDR doesn't persist" bug). Leaving the - // monitor's state alone lets a user-enabled HDR session flow through end-to-end. - // The secure-desktop HDR drop (for the DDA leg) keys off the monitor's real state in the mux loop. - #[cfg(target_os = "windows")] - if bit_depth >= 10 { - // SAFETY: `set_advanced_color` is marked `unsafe` only because it drives the Win32 CCD API - // internally; it takes `target_id` by value (Copy `u32` — this session's live SudoVDA - // monitor's CCD target id) and sizes + owns every buffer it hands the OS on its own stack. - // We pass no pointers, so nothing must outlive the call and there is no aliasing; an - // unknown/absent target id simply returns false. - unsafe { - if crate::win_display::set_advanced_color(target.target_id, true) { - // Let the colorspace change settle before WGC creates its capture item / detects HDR. - std::thread::sleep(std::time::Duration::from_millis(250)); - } - } - } - let relay = HelperRelay::spawn( - &target, - (mode.width, mode.height, effective_hz), - bitrate_kbps, - bit_depth, - ) - .context("spawn WGC helper")?; - Ok((vout.keepalive, relay, target, effective_hz)) - }; - - let (mut _keepalive, mut relay, mut target, mut effective_hz) = build(&mut vd, mode)?; - let mut cur_mode = mode; - - // Capture is live (the WGC helper is relaying) — launch the requested library title into the - // interactive user session so it renders onto the captured desktop and grabs foreground. - // Best-effort: a failure (no recipe for the kind, no interactive user) leaves the user on the desktop. - if let Some(id) = launch.as_deref() { - if let Err(e) = crate::library::launch_title(id) { - tracing::warn!(launch_id = id, error = %e, "could not launch requested library title"); - } - } - - // O3.1: optionally observe the IDD-push ring alongside WGC (WGC = the presentation trigger) to - // confirm the 0257 driver pushes frames into a HOST-created ring. Diagnostic only; gated. - if std::env::var_os("PUNKTFUNK_IDD_PUSH_OBSERVE").is_some() { - crate::capture::idd_push::spawn_observer( - target.clone(), - Some((cur_mode.width, cur_mode.height, effective_hz)), - ); - } - - // The host's own DDA capturer+encoder for the SECURE (Winlogon) desktop, which WGC — and thus the - // helper — cannot capture. Opened lazily on the first secure transition (so a session that never - // hits a UAC/lock screen never pays for a second NVENC session), then kept for fast re-switch. - struct DdaPipe { - cap: Box, - enc: Box, - frame: crate::capture::CapturedFrame, - } - // Note: takes the dimensions as args rather than capturing `cur_mode` — `cur_mode` is reassigned - // on reconfig, and a closure holding a shared borrow of it for the whole fn would forbid that. - let open_dda = - |target: &WinCaptureTarget, w: u32, h: u32, hz: u32, hdr: bool| -> Result { - // The host already holds the real keepalive (sole isolation owner), so DDA gets a no-op one. - // `hdr` requests an FP16 DuplicateOutput1 so the secure desktop is captured in HDR (→ BT.2020 - // PQ Main10) instead of black — legacy DuplicateOutput can't capture an HDR/FP16 desktop. - let mut cap = crate::capture::dxgi::DuplCapturer::open( - target.clone(), - Some((w, h, hz)), - Box::new(()), - // The relay's host encoder is GPU (NVENC/AMF/QSV unless software) — pass `gpu` in (Goal-1 - // stage 5) so the DDA capturer doesn't re-derive it. - crate::capture::gpu_encode(), - hdr, - false, // the two-process relay path is 4:2:0 in v1 - ) - .context("open DDA for secure desktop")?; - cap.set_active(true); - let frame = cap.next_frame().context("DDA first frame")?; - let enc = crate::encode::open_video( - crate::encode::Codec::H265, - frame.format, - frame.width, - frame.height, - hz, - bitrate_kbps as u64 * 1000, - frame.is_cuda(), - bit_depth, - // Secure-desktop DDA on the two-process relay path: 4:2:0 in v1 (matches the helper). - crate::encode::ChromaFormat::Yuv420, - ) - .context("open video encoder for DDA")?; - Ok(DdaPipe { - cap: Box::new(cap), - enc, - frame, - }) - }; - - let perf = crate::config::config().perf; - let burst_cap = std::env::var("PUNKTFUNK_PACE_BURST_KB") - .ok() - .and_then(|s| s.parse::().ok()) - .unwrap_or(128) - * 1024; - - // Same encode|send split as the single-process path: this thread relays AUs, a dedicated send - // thread owns the Session and does FEC+seal+paced-send. The relay encodes in the helper process, - // so this path's FrameMsgs carry no cap/submit/encode split (those stages stay 0 in the sample); - // the send thread still emits fps/goodput/pacing/loss from `session.stats()`. - let send_stats = SendStats { - rec: stats, - width: mode.width, - height: mode.height, - fps: effective_hz, - codec: "hevc", - client: client_label, - bitrate_kbps, - }; - let (frame_tx, frame_rx) = std::sync::mpsc::sync_channel::(3); - let send_thread = std::thread::Builder::new() - .name("punktfunk-send".into()) - .spawn({ - let stop = stop.clone(); - move || { - send_loop( - session, - frame_rx, - probe_rx, - probe_result_tx, - stop, - perf, - burst_cap, - fec_target, - send_stats, - ) - } - }) - .context("spawn send thread")?; - - // Test hook: PUNKTFUNK_SECURE_TEST_PERIOD_MS=N drives a square-wave secure/normal toggle every N ms - // instead of the real watcher — exercises the mid-session helper↔DDA mux without a live UAC/lock. - let secure_test_ms: Option = std::env::var("PUNKTFUNK_SECURE_TEST_PERIOD_MS") - .ok() - .and_then(|s| s.parse().ok()) - .filter(|&n| n > 0); - // Switching to the host DDA on the secure (Winlogon) desktop is OPT-IN: DDA can't reliably capture - // the secure desktop's HDR independent-flip (it storms ACCESS_LOST → black), whereas the WGC helper - // STAYS LIVE through a lock/UAC. So by default the mux keeps WGC the whole time (no DesktopWatcher - // switch, no overlay). Enable the experimental DDA-on-secure path with PUNKTFUNK_SECURE_DDA=1. - let dda_secure = crate::config::config().secure_dda || secure_test_ms.is_some(); - // The authoritative Default↔Winlogon signal (requires SYSTEM to read the Winlogon desktop name); - // only needed when the DDA-on-secure path is enabled. - let watcher = dda_secure.then(crate::capture::desktop_watch::DesktopWatcher::start); - // Force-composed-flip overlay (only with DDA-on-secure): keeps the secure desktop out of fullscreen - // independent-flip so DDA can duplicate it. Off by default to avoid touching the normal desktop. - let _composed_flip = dda_secure - .then(crate::capture::composed_flip::ForceComposedFlip::start) - .flatten(); - let start = std::time::Instant::now(); - - let mut interval = std::time::Duration::from_secs_f64(1.0 / effective_hz.max(1) as f64); - let deadline = std::time::Instant::now() + std::time::Duration::from_secs(seconds as u64); - let mut sent: u64 = 0; - // Mux state: which source is live, the lazily-opened DDA pipe, a DDA pacing clock, and a - // "wait for the next IDR before forwarding" latch set on every source switch (the client's - // decoder must resume on a keyframe — the two encoders keep independent infinite-GOP state). - let mut dda: Option = None; - let mut on_secure = false; - let mut next = std::time::Instant::now(); - let mut await_idr = false; - // Step 6 relaunch watchdog: how many times in a row the helper has died without producing a frame. - // A console disconnect/reconnect or a helper crash kills it; we respawn (the new helper picks up - // the now-active session via WTSGetActiveConsoleSessionId). Reset on the first relayed frame; only - // give up (end the stream) after a run of failures spanning a few seconds. - let mut helper_fails = 0u32; - const MAX_HELPER_FAILS: u32 = 20; - - // Build a FrameMsg + hand it to the send thread; returns false if the send thread is gone (caller - // breaks the loop). Kept as a macro (not a closure) so each use borrows `frame_tx`/`sent`/`interval` - // at its own site without a long-lived capture, and `break 'outer` stays a literal at the call site - // (a `break 'outer` inside the macro body risks label-hygiene resolution failures). - macro_rules! forward { - ($data:expr, $capture_ns:expr, $keyframe:expr) => {{ - let flags = if $keyframe { - (FLAG_PIC | FLAG_SOF) as u32 - } else { - FLAG_PIC as u32 - }; - let capture_ns = $capture_ns; - let encode_us = (now_ns().saturating_sub(capture_ns) / 1000) as u32; - let msg = FrameMsg { - data: $data, - capture_ns, - flags, - deadline: std::time::Instant::now() + interval, - encode_us, - cap_us: 0, - submit_us: 0, - wait_us: 0, - repeat: false, - was_measured: false, - }; - let ok = frame_tx.send(msg).is_ok(); - if ok { - sent += 1; - } - ok - }}; - } - - 'outer: while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline { - // Mode switch: rebuild the output + re-spawn the helper at the new mode (drop the old relay + - // keepalive only after the new pair is up, so a failed rebuild keeps the current stream). The - // DDA pipe (on the old target) is dropped — it reopens on the next secure transition. - let mut want = None; - while let Ok(m) = reconfig.try_recv() { - want = Some(m); - } - if let Some(new_mode) = want { - tracing::info!(?new_mode, "two-process: rebuilding for mode switch"); - match build(&mut vd, new_mode) { - Ok((ka, rl, tg, hz)) => { - relay = rl; // drops the old relay (kills old helper) ... - _keepalive = ka; // ... then releases the old output - target = tg; - effective_hz = hz; - cur_mode = new_mode; - dda = None; // old-target DDA is stale; reopen on next secure - interval = std::time::Duration::from_secs_f64(1.0 / hz.max(1) as f64); - } - Err(e) => { - tracing::error!(error = %format!("{e:#}"), ?new_mode, - "two-process mode-switch rebuild failed — staying on the current mode"); - } - } - } - // Coalesce client decode-recovery keyframe requests and forward to the active source. - let mut want_kf = false; - while keyframe.try_recv().is_ok() { - want_kf = true; - } - - // Source mux: capture the secure (Winlogon) desktop via the host's DDA, the normal desktop via - // the helper relay. On a switch, latch await_idr + force the now-active source to emit an IDR - // so the client resumes cleanly. - let secure = dda_secure - && match secure_test_ms { - Some(p) => (start.elapsed().as_millis() / p) % 2 == 1, - None => watcher.as_ref().is_some_and(|w| w.is_secure()), - }; - if secure != on_secure { - on_secure = secure; - await_idr = true; - tracing::info!( - to = if secure { - "secure(DDA)" - } else { - "normal(WGC relay)" - }, - "two-process: source switch" - ); - if secure { - // Capture the secure (Winlogon) desktop in its NATIVE colorspace. Don't try to drop the - // SudoVDA out of HDR for the DDA leg — display-config changes are denied on the secure - // desktop (the drop just churned + still went black). Instead, if the monitor is in HDR, - // open DDA in HDR (FP16 DuplicateOutput1 → BT.2020 PQ Main10); the normal-desktop DDA - // overlay/flip issues that drove us to WGC don't apply to the composed Winlogon UI. - // SAFETY: `advanced_color_enabled` is `unsafe` only because it queries the Win32 CCD - // API; it takes `target_id` by value (the live SudoVDA monitor's CCD target id) and - // allocates + owns every buffer it passes the OS internally. No caller pointer is - // involved, so nothing must outlive the call and there is no aliasing; a missing - // target id just yields false. - let hdr = unsafe { crate::win_display::advanced_color_enabled(target.target_id) }; - dda = None; // reopen to capture the secure desktop - match open_dda(&target, cur_mode.width, cur_mode.height, effective_hz, hdr) { - Ok(mut p) => { - tracing::info!(hdr, "two-process: opened DDA for the secure desktop"); - p.enc.request_keyframe(); - dda = Some(p); - } - Err(e) => { - tracing::error!(error = %format!("{e:#}"), - "two-process: DDA open failed — secure desktop will freeze on last frame"); - } - } - next = std::time::Instant::now(); - } else { - // Returning to the normal desktop: RESUME from the still-alive WGC helper. Do NOT - // recreate the SudoVDA monitor or respawn the helper — build()'s vd.create() is an - // IOCTL_REMOVE+ADD of the monitor (the audible disconnect/connect chime + the - // teardown/recreate kernel stress that broke DDA, now applied to the mux). The monitor + - // helper persist for the WHOLE session; only the host-DDA leg opens (secure) and closes - // (normal). Apply the DDA learning here: reuse, don't tear down. - dda = None; // free the secure DDA encoder; the relay (helper) is the source again - while relay.try_recv().is_ok() {} // drop secure-dwell backlog - relay.request_keyframe(); // client decoder resumes on the helper's next IDR - // Nothing to restore: we no longer toggle the SudoVDA's HDR state for the DDA leg, so the - // monitor's colorspace is unchanged and the still-alive WGC helper just resumes. - next = std::time::Instant::now(); - } - } - if want_kf { - if secure { - if let Some(d) = dda.as_mut() { - d.enc.request_keyframe(); - } - } else { - relay.request_keyframe(); - } - await_idr = true; - } - - if secure { - // DDA capture+encode for the secure desktop, paced to the frame interval. - let Some(d) = dda.as_mut() else { - std::thread::sleep(interval); - continue; - }; - if let Some(f) = d.cap.try_latest().context("DDA capture")? { - d.frame = f; - } - let capture_ns = now_ns(); - d.enc.submit(&d.frame).context("DDA encoder submit")?; - next += interval; - while let Some(au) = d.enc.poll().context("DDA encoder poll")? { - if await_idr && !au.keyframe { - continue; - } - await_idr = false; - if !forward!(au.data, capture_ns, au.keyframe) { - break 'outer; // send thread gone - } - } - match next.checked_duration_since(std::time::Instant::now()) { - Some(dur) => std::thread::sleep(dur), - None => next = std::time::Instant::now(), - } - } else { - // Relay the helper's AUs for the normal desktop. Timeout → keep servicing the loop; - // Disconnected → the helper exited (step 6 adds the relaunch watchdog). - let au = match relay.recv_timeout(std::time::Duration::from_millis(500)) { - Ok(au) => au, - Err(std::sync::mpsc::RecvTimeoutError::Timeout) => { - if stop.load(Ordering::SeqCst) { - break; - } - tracing::warn!("two-process: no AU from helper within 500ms"); - continue; - } - Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => { - // The helper exited (crash, or a console disconnect killed its session). REBUILD - // the whole output + helper (not just respawn on the old target): an abruptly-killed - // helper leaves the SudoVDA's DXGI output briefly unresolvable ("no DXGI output for - // target N yet"), and a console reconnect needs a fresh output in the new session — - // `build` recreates both. Back off so a hard-failing rebuild (e.g. no active session - // yet) doesn't spin; give up only after a sustained run of failures. - helper_fails += 1; - if helper_fails > MAX_HELPER_FAILS { - tracing::error!( - fails = helper_fails, - "two-process: WGC helper keeps dying — ending stream" - ); - break; - } - std::thread::sleep(std::time::Duration::from_millis(500)); - match build(&mut vd, cur_mode) { - Ok((ka, rl, tg, hz)) => { - tracing::warn!( - fails = helper_fails, - "two-process: WGC helper exited — rebuilt output + helper" - ); - relay = rl; - _keepalive = ka; - target = tg; - effective_hz = hz; - dda = None; // old-target DDA is stale - interval = std::time::Duration::from_secs_f64(1.0 / hz.max(1) as f64); - await_idr = true; // resume on the new helper's opening IDR - } - Err(e) => { - tracing::warn!(error = %format!("{e:#}"), fails = helper_fails, - "two-process: helper rebuild failed — will retry"); - } - } - continue; - } - }; - if await_idr && !au.keyframe { - continue; // skip stale deltas until the post-switch IDR - } - await_idr = false; - helper_fails = 0; // a frame flowed → the helper is healthy again - // The helper's pts_ns is on this machine's monotonic clock (same `now_ns()` source). - if !forward!(au.data, au.pts_ns, au.keyframe) { - break 'outer; // send thread gone - } - } - } - drop(frame_tx); - let _ = send_thread.join(); - drop(watcher); - tracing::info!(sent, "punktfunk/1 two-process stream complete"); - Ok(()) -} - /// One mode's capture/encode pipeline: (capturer, encoder, first frame, frame interval). /// Dropping the capturer tears down the PipeWire stream and the virtual output with it. type Pipeline = ( diff --git a/crates/punktfunk-host/src/session_plan.rs b/crates/punktfunk-host/src/session_plan.rs index 7932797..55d5c98 100644 --- a/crates/punktfunk-host/src/session_plan.rs +++ b/crates/punktfunk-host/src/session_plan.rs @@ -26,12 +26,9 @@ pub enum CaptureBackend { /// Linux: the xdg ScreenCast portal → PipeWire (the only Linux capture path). Portal, /// Windows: IDD direct-push — frames pulled straight from the pf-vdisplay driver's shared ring - /// (in-process, Session 0; no Desktop Duplication, no WGC helper). + /// (in-process, Session 0; captures the secure desktop too). The sole Windows capture path — + /// DXGI Desktop Duplication (DDA) and the WGC two-process relay were removed. IddPush, - /// Windows: DXGI Desktop Duplication (`PUNKTFUNK_CAPTURE=dda|dxgi` or `PUNKTFUNK_NO_WGC`). - Dda, - /// Windows: Windows.Graphics.Capture (the composed-desktop default), with a DDA watchdog fallback. - Wgc, } impl CaptureBackend { @@ -42,20 +39,10 @@ impl CaptureBackend { CaptureBackend::Portal } - /// Windows precedence (identical to the pre-stage-3 `capture_virtual_output` branch order): - /// IDD-push wins; else an explicit `dda`/`dxgi` request or `PUNKTFUNK_NO_WGC` selects DDA; else WGC. + /// Windows: IDD direct-push is the sole capture path (DDA + the WGC two-process relay were removed). #[cfg(target_os = "windows")] pub fn resolve() -> Self { - let cfg = crate::config::config(); - if cfg.idd_push { - CaptureBackend::IddPush - } else if matches!(cfg.capture_backend.as_str(), "dda" | "dxgi") - || crate::capture::wgc_disabled() - { - CaptureBackend::Dda - } else { - CaptureBackend::Wgc - } + CaptureBackend::IddPush } #[cfg(not(any(target_os = "linux", target_os = "windows")))] @@ -67,11 +54,9 @@ impl CaptureBackend { /// How a session is structured across processes. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum SessionTopology { - /// One process captures + encodes (Linux; Windows non-SYSTEM / IDD-push / `NO_WGC`). + /// One process captures + encodes. The only topology: Linux (portal) and Windows (in-process + /// IDD-push in Session 0). The SYSTEM-host + user-session WGC relay was removed with DDA/WGC. SingleProcess, - /// SYSTEM host + a user-session WGC helper relay (the Windows normal-desktop path under SYSTEM, - /// where in-process WGC can't activate). See `virtual_stream_relay`. - TwoProcessRelay, } /// The resolved encode backend (recorded for logging / stages 4–5; the per-session encoder open still @@ -103,8 +88,8 @@ pub struct SessionPlan { pub encoder: EncoderBackend, /// Handshake-negotiated encode bit depth (8, or 10 = HEVC Main10). pub bit_depth: u8, - /// The IDD-push HDR hint (`bit_depth >= 10`) — the want-HDR flag the capturer was passed before. - /// Non-IDD-push Windows backends ignore it and auto-detect HDR from the monitor; Linux is 8-bit. + /// The IDD-push HDR hint (`bit_depth >= 10`) — the want-HDR flag handed to the capturer so it + /// proactively enables advanced color on the virtual display. Linux is 8-bit (HDR blocked upstream). pub hdr: bool, /// Handshake-negotiated chroma subsampling (4:2:0, or full-chroma 4:4:4 when the client + host + /// GPU all support it). Resolved before the Welcome; `Yuv420` on every backend that declined it. @@ -151,26 +136,8 @@ impl SessionPlan { } } -/// Process topology. On Windows this is the former `punktfunk1::should_use_helper` logic verbatim; on -/// every other platform the session is always single-process. -#[cfg(target_os = "windows")] -pub(crate) fn resolve_topology() -> SessionTopology { - let cfg = crate::config::config(); - // `NO_HELPER`/`NO_WGC` force single-process; IDD-push captures in-process in Session 0 (no helper); - // otherwise the helper runs when forced or when we're SYSTEM (in-process WGC can't activate there). - let helper = if cfg.no_helper || crate::capture::wgc_disabled() || cfg.idd_push { - false - } else { - cfg.force_helper || crate::capture::wgc_relay::running_as_system() - }; - if helper { - SessionTopology::TwoProcessRelay - } else { - SessionTopology::SingleProcess - } -} - -#[cfg(not(target_os = "windows"))] +/// Process topology. Single-process is the only topology now: Linux (portal) and Windows (in-process +/// IDD-push in Session 0). The Windows SYSTEM-host + user-session WGC relay was removed with DDA/WGC. pub(crate) fn resolve_topology() -> SessionTopology { SessionTopology::SingleProcess } diff --git a/crates/punktfunk-host/src/windows/interactive.rs b/crates/punktfunk-host/src/windows/interactive.rs index 1e8d0c8..69298a6 100644 --- a/crates/punktfunk-host/src/windows/interactive.rs +++ b/crates/punktfunk-host/src/windows/interactive.rs @@ -5,9 +5,8 @@ //! activation, and each store's auth/entitlement context resolve — the process must run in the //! interactive session under the **logged-in user's** token, not SYSTEM and not session 0. //! -//! This is the same `WTSGetActiveConsoleSessionId → WTSQueryUserToken → DuplicateTokenEx → -//! CreateProcessAsUserW(winsta0\\default)` primitive the WGC helper relay uses -//! ([`crate::capture::wgc_relay`]), factored out for the library launch path +//! This is the standard `WTSGetActiveConsoleSessionId → WTSQueryUserToken → DuplicateTokenEx → +//! CreateProcessAsUserW(winsta0\\default)` primitive, used for the library launch path //! ([`crate::library::launch_title`]). //! //! IMPORTANT — use the **user** token (`WTSQueryUserToken`), NOT a session-retargeted SYSTEM token @@ -36,7 +35,7 @@ use windows::Win32::System::Threading::{ /// /// Fire-and-forget: the launched game/launcher outlives this call, so the host does not track the /// child — its handles are closed before returning (the process keeps running). The environment is -/// the user's block merged with the host's `PUNKTFUNK_*`/`RUST_LOG` (same merge the WGC helper uses), +/// the user's block merged with the host's `PUNKTFUNK_*`/`RUST_LOG` (see [`merged_env_block`]), /// so `host.env` settings propagate. /// /// Requires the host to run as SYSTEM (`WTSQueryUserToken` needs `SE_TCB`). Fails when no interactive @@ -75,7 +74,7 @@ unsafe fn spawn_inner(cmdline: &str, workdir: Option<&Path>) -> Result { // with the host's PUNKTFUNK_*/RUST_LOG vars — same shared helper the WGC helper + service spawns use. let mut env_block: *mut core::ffi::c_void = std::ptr::null_mut(); let _ = CreateEnvironmentBlock(&mut env_block, Some(primary), false); - let merged_env = crate::capture::wgc_relay::merged_env_block(env_block as *const u16); + let merged_env = merged_env_block(env_block as *const u16); if !env_block.is_null() { let _ = DestroyEnvironmentBlock(env_block); } @@ -124,3 +123,48 @@ unsafe fn spawn_inner(cmdline: &str, workdir: Option<&Path>) -> Result { let _ = CloseHandle(pi.hThread); Ok(pid) } + +/// Build the environment block for a process launched into the interactive session: the target +/// session's block (`user_block`, from `CreateEnvironmentBlock`) with this process's `PUNKTFUNK_*` +/// vars overlaid, so the child runs with the SAME settings this process has +/// (`PUNKTFUNK_ENCODER=nvenc`, `PUNKTFUNK_ZEROCOPY`, …) instead of the target shell's. Returns a +/// UTF-16, double-null-terminated block suitable for `CREATE_UNICODE_ENVIRONMENT`. Shared by the +/// interactive library launch (here) and the Windows service launching the host into the active +/// session ([`crate::service`]). +/// +/// # Safety +/// `user_block` must be either null or a valid pointer to a UTF-16, double-null-terminated +/// environment block (the `CreateEnvironmentBlock` output), readable for its whole length. +pub(crate) unsafe fn merged_env_block(user_block: *const u16) -> Vec { + // Parse the user block ("VAR=VALUE\0" … "\0") into entries. + let mut entries: Vec = Vec::new(); + if !user_block.is_null() { + let mut p = user_block; + loop { + let mut len = 0isize; + while *p.offset(len) != 0 { + len += 1; + } + if len == 0 { + break; // the trailing empty string = end of block + } + let slice = std::slice::from_raw_parts(p, len as usize); + entries.push(String::from_utf16_lossy(slice)); + p = p.offset(len + 1); + } + } + // Overlay "our" settings — PUNKTFUNK_* and RUST_LOG — dropping whatever the target block had. + let is_ours = |k: &str| k.starts_with("PUNKTFUNK_") || k == "RUST_LOG"; + entries.retain(|e| !is_ours(e.split('=').next().unwrap_or(""))); + for (k, v) in std::env::vars().filter(|(k, _)| is_ours(k)) { + entries.push(format!("{k}={v}")); + } + // Serialize back to a UTF-16 double-null-terminated block. + let mut block: Vec = Vec::new(); + for e in entries { + block.extend(e.encode_utf16()); + block.push(0); + } + block.push(0); + block +} diff --git a/crates/punktfunk-host/src/windows/service.rs b/crates/punktfunk-host/src/windows/service.rs index 9bcc270..77bee04 100644 --- a/crates/punktfunk-host/src/windows/service.rs +++ b/crates/punktfunk-host/src/windows/service.rs @@ -3,12 +3,12 @@ //! for the ad-hoc PsExec / VBS / scheduled-task launch chain used during bring-up. //! //! Why a supervisor and not just "run the host as a service": the host must run **as SYSTEM in the -//! interactive session** (session 1+). Desktop Duplication of the secure (Winlogon/UAC/lock) desktop -//! and `SendInput` both need SYSTEM; capture and injection both need the *interactive* session, which +//! interactive session** (session 1+). Capturing the secure (Winlogon/UAC/lock) desktop and +//! `SendInput` both need SYSTEM; capture and injection both need the *interactive* session, which //! a plain session-0 service is not in. So this service (itself in session 0) never captures — it //! duplicates its own LocalSystem token, retargets it to the active console session, and -//! `CreateProcessAsUserW`s the host there. This is the Sunshine/Apollo model. The host in turn spawns -//! the WGC helper into the *user* session (see `capture::wgc_relay`) — two nested launches. +//! `CreateProcessAsUserW`s the host there. This is the Sunshine/Apollo model. The host captures the +//! virtual display in-process via IDD direct-push (no helper process). //! //! Subcommands (Windows only): //! ```text @@ -230,8 +230,9 @@ fn run_service() -> Result<()> { let _ = SESSION_EVENT.set(session_owned); // The control handler captures nothing — it reaches the events through the statics, so it stays - // `Fn + Send + 'static`. Session lock/unlock are handled inside the host (DesktopWatcher), so we - // only flag console connect/disconnect/logon — the events that change the active session. + // `Fn + Send + 'static`. Lock/unlock is handled by the in-process IDD-push capture (the driver + // composes the secure desktop into the ring), so we only flag console connect/disconnect/logon — + // the events that change the active session. let handler = move |control| -> ServiceControlHandlerResult { match control { ServiceControl::Stop | ServiceControl::Preshutdown | ServiceControl::Shutdown => { @@ -517,10 +518,10 @@ unsafe fn spawn_host( .context("SetTokenInformation(TokenSessionId)")?; // 2) The session's environment block, merged with this process's PUNKTFUNK_*/RUST_LOG (so the - // host runs with host.env's settings, not a bare block). Same merge the WGC helper uses. + // host runs with host.env's settings, not a bare block). Same merge the interactive launch uses. let mut env_block: *mut c_void = std::ptr::null_mut(); let _ = CreateEnvironmentBlock(&mut env_block, Some(primary), false); - let merged = crate::capture::wgc_relay::merged_env_block(env_block as *const u16); + let merged = crate::interactive::merged_env_block(env_block as *const u16); if !env_block.is_null() { let _ = DestroyEnvironmentBlock(env_block); } diff --git a/crates/punktfunk-host/src/windows/wgc_helper.rs b/crates/punktfunk-host/src/windows/wgc_helper.rs deleted file mode 100644 index 4224e96..0000000 --- a/crates/punktfunk-host/src/windows/wgc_helper.rs +++ /dev/null @@ -1,346 +0,0 @@ -//! USER-session WGC helper (Windows) — part of the two-process secure-desktop design -//! (design/archive/windows-secure-desktop.md). -//! -//! WGC won't activate under the SYSTEM account, but the host must run as SYSTEM for the secure -//! desktop. So the SYSTEM host spawns THIS helper in the interactive user session -//! (`CreateProcessAsUserW`) to do the WGC capture + NVENC encode that needs the user token, and the -//! helper ships the encoded Annex-B access units back over its **stdout** pipe (which the host -//! inherits + reads). The host relays them on the live QUIC session while the normal desktop is up, -//! and switches to its own DDA encoder on the secure desktop. The helper captures the SAME SudoVDA -//! output **by GDI name only** — it never creates a virtual output / touches display topology (a -//! second topology owner would re-trigger the ACCESS_LOST born-lost storm). -//! -//! Wire framing on stdout, per AU: `[u32 len LE][u64 pts_ns LE][u8 keyframe][len bytes data]`. - -// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). -#![deny(clippy::undocumented_unsafe_blocks)] - -use crate::capture::{dxgi::WinCaptureTarget, wgc::WgcCapturer, Capturer}; -use crate::encode::{self, Codec}; -use anyhow::{Context, Result}; -use std::io::{Read, Write}; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; - -pub struct HelperOptions { - pub target_id: u32, - pub gdi_name: String, - pub width: u32, - pub height: u32, - pub fps: u32, - pub bitrate_kbps: u32, - /// Negotiated encode bit depth (8, or 10 = HEVC Main10). HDR auto-upgrades to 10 from the - /// captured frame's `Rgb10a2` format regardless. - pub bit_depth: u8, -} - -/// AU framing magic + version, so the host can resync / detect a helper crash on its stdout stream. -const AU_MAGIC: u32 = 0x5046_4155; // "PFAU" - -/// Control byte the host writes on our stdin to force the next frame to be an IDR. Must match -/// `wgc_relay::CTL_KEYFRAME`. -const CTL_KEYFRAME: u8 = 0x01; - -pub fn run(opts: HelperOptions) -> Result<()> { - tracing::info!( - target_id = opts.target_id, - gdi = %opts.gdi_name, - mode = format!("{}x{}@{}", opts.width, opts.height, opts.fps), - "WGC helper starting (user session)" - ); - - // This thread does WGC capture + video-processor convert + NVENC submit — the GPU-submitting hot - // path. Elevate its OS priority so a CPU-heavy game can't deschedule it and delay submission (which - // would leave our HIGH GPU priority with nothing queued to prioritise). Apollo's capture thread is - // likewise CRITICAL. - crate::punktfunk1::boost_thread_priority(true); - - // Capture the EXISTING SudoVDA output by GDI name / target id — do NOT create one (the host owns - // the virtual output + its isolate/restore; a second topology owner breaks DDA recovery). - let target = WinCaptureTarget { - adapter_luid: 0, - gdi_name: opts.gdi_name.clone(), - target_id: opts.target_id, - }; - let mut cap = - WgcCapturer::open(target, Some((opts.width, opts.height, opts.fps))).context("WGC open")?; - cap.set_active(true); - - // O3 present-trigger experiment: spawn a thread that PRESENTS a D3D swapchain to the virtual - // display (a present SOURCE), testing whether that — unlike WGC's READ — makes the OS assign the - // driver's IddCx swap-chain (so the driver's run_core runs + can push). Gated; diagnostic. - if std::env::var_os("PUNKTFUNK_PRESENT_TRIGGER").is_some() { - let (w, h) = (opts.width, opts.height); - std::thread::Builder::new() - .name("pf-present-trigger".into()) - .spawn(move || { - tracing::info!("present-trigger: starting D3D present loop on the virtual display"); - // SAFETY: `present_trigger` is unsafe only for its Win32/D3D11 FFI; it has no caller - // preconditions (it creates and exclusively owns its own window, device, and swapchain on - // this dedicated thread), so the call is sound. - if let Err(e) = unsafe { present_trigger(w, h) } { - tracing::warn!("present-trigger error: {e:#}"); - } - }) - .ok(); - } - - // First frame establishes the real dimensions + whether the desktop is HDR (the encoder derives - // Main10/HDR from the frame's PixelFormat::Rgb10a2). Then open NVENC on the capture device. - let first = cap.next_frame().context("first WGC frame")?; - let (w, h) = (first.width, first.height); - let mut enc = encode::open_video( - Codec::H265, - first.format, - w, - h, - opts.fps, - opts.bitrate_kbps as u64 * 1000, - false, // not cuda - opts.bit_depth, // 8, or 10 = Main10 (HDR auto-upgrades from the Rgb10a2 frame regardless) - // The two-process WGC relay helper encodes 4:2:0 in v1 (4:4:4 over the relay is a follow-up); - // the host gates 4:4:4 to the single-process topology. - encode::ChromaFormat::Yuv420, - ) - .context("open NVENC")?; - - // Control channel: the host writes a single byte on our stdin to force an IDR (client decode - // recovery), mirroring `enc.request_keyframe()` in the single-process path. A reader thread sets - // a flag the encode loop checks; stdin EOF (host gone) just stops the thread. - let kf = Arc::new(AtomicBool::new(false)); - { - let kf = kf.clone(); - std::thread::Builder::new() - .name("wgc-helper-ctl".into()) - .spawn(move || { - let mut stdin = std::io::stdin(); - let mut byte = [0u8; 1]; - while let Ok(n) = stdin.read(&mut byte) { - if n == 0 { - break; // host closed our stdin - } - if byte[0] == CTL_KEYFRAME { - kf.store(true, Ordering::Relaxed); - } - } - }) - .ok(); - } - - // Binary stdout — lock it once + write framed AUs. A short write / broken pipe means the host - // (parent) went away → exit cleanly so the host's relaunch watchdog can respawn us. - let stdout = std::io::stdout(); - let mut out = stdout.lock(); - - // FIXED-CADENCE encode loop (mirrors the single-process `punktfunk1::virtual_stream` loop). The - // host runs as SYSTEM and relays our AUs; to deliver a STEADY `fps` to the client (the "fixed 240" - // goal) we must NOT gate on WGC's content-driven FrameArrived — `WgcCapturer::next_frame` blocks up - // to its ~8 ms static-repeat timeout when the desktop is quiet, capping a barely-changing desktop - // ~125 fps regardless of the GPU. Instead we pace to `1/fps` and take the FRESHEST frame with the - // non-blocking `try_latest`, repeating the last one when nothing newer arrived. Depth-1: NVENC's - // `poll` (lock_bitstream) blocks until the just-submitted frame is encoded, so exactly one frame is - // in flight per iteration. A deeper pipeline was measured to only stack latency under a - // GPU-saturating game (the encodes serialize on the contended GPU anyway) — the in-game lever is - // the GPU scheduling priority the SYSTEM host stamps on us, not pipeline depth. - let interval = std::time::Duration::from_secs_f64(1.0 / opts.fps.max(1) as f64); - - let perf = crate::config::config().perf; - let mut frames = 0u64; - let mut repeats = 0u64; // frames where no newer capture had arrived (duplicate re-encode) - let mut cap_ns = 0u64; // time in try_latest (capture + video-processor convert) - let mut encode_ns = 0u64; // time blocked in lock_bitstream - let mut write_ns = 0u64; // time writing the AU to the stdout pipe (relay backpressure) - let mut window = std::time::Instant::now(); - - // `frame` is held across iterations and repeated when `try_latest` has nothing newer, so a static - // desktop still clocks `fps`. The capturer's held-set / output ring keep its texture alive across - // the repeat; reassigning `frame` on a fresh capture drops the prior one (already drained by poll). - let mut frame = first; - let mut next = std::time::Instant::now(); - loop { - if kf.swap(false, Ordering::Relaxed) { - enc.request_keyframe(); - } - // Freshest captured frame, or repeat the last (no new composition: static desktop / between a - // game's presents). Non-blocking, so the cadence is OURS, not WGC's event rate. - let t0 = std::time::Instant::now(); - match cap.try_latest().context("WGC try_latest")? { - Some(f) => frame = f, - None => repeats += 1, - } - if perf { - cap_ns += t0.elapsed().as_nanos() as u64; - } - enc.submit(&frame).context("encoder submit")?; - // Drain the just-submitted frame. NVENC's poll blocks in lock_bitstream until it's encoded, so - // this returns exactly one AU (then None) — depth-1, no accumulation. - loop { - let p0 = std::time::Instant::now(); - let polled = enc.poll().context("encoder poll")?; - if perf { - encode_ns += p0.elapsed().as_nanos() as u64; - } - let Some(au) = polled else { break }; - let w0 = std::time::Instant::now(); - let wrote = write_au(&mut out, &au); - if perf { - write_ns += w0.elapsed().as_nanos() as u64; - } - if wrote.is_err() { - tracing::info!("WGC helper: stdout closed (host gone) — exiting"); - return Ok(()); - } - } - // Pace to this frame's due time. If we're already past it (encode couldn't keep up under a - // GPU-saturating game), skip the sleep and re-baseline so we don't spiral into catch-up. - next += interval; - match next.checked_duration_since(std::time::Instant::now()) { - Some(d) => std::thread::sleep(d), - None => next = std::time::Instant::now(), - } - - if perf { - frames += 1; - let since = window.elapsed(); - if since.as_secs() >= 2 { - let secs = since.as_secs_f64(); - let per = |ns: u64| format!("{:.2}", ns as f64 / frames as f64 / 1e6); - tracing::info!( - fps = format!("{:.1}", frames as f64 / secs), - repeats, - cap_ms = per(cap_ns), - encode_ms = per(encode_ns), - write_ms = per(write_ns), - "WGC helper perf (fixed-cadence depth-1; encode_ms=lock_bitstream; repeats=duplicated frames)" - ); - frames = 0; - repeats = 0; - cap_ns = 0; - encode_ns = 0; - write_ns = 0; - window = std::time::Instant::now(); - } - } - } -} - -fn write_au(out: &mut impl Write, au: &encode::EncodedFrame) -> std::io::Result<()> { - out.write_all(&AU_MAGIC.to_le_bytes())?; - out.write_all(&(au.data.len() as u32).to_le_bytes())?; - out.write_all(&au.pts_ns.to_le_bytes())?; - out.write_all(&[au.keyframe as u8])?; - out.write_all(&au.data)?; - out.flush() -} - -/// O3 present-trigger experiment (see the gated call in `run`). Creates a small swapchain-backed -/// window on the virtual display (the CCD-isolated primary) and presents continuously — an active -/// present SOURCE on the display — to test whether that makes the OS assign the driver's IddCx -/// swap-chain (which WGC's read does not). Runs forever on its own thread. -/// -/// # Safety -/// Win32/D3D11 FFI; called once on a dedicated helper thread. -unsafe fn present_trigger(disp_w: u32, disp_h: u32) -> Result<()> { - use windows::core::{w, Interface}; - use windows::Win32::Foundation::{HMODULE, HWND, LPARAM, LRESULT, WPARAM}; - use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE; - use windows::Win32::Graphics::Direct3D11::{ - D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, - ID3D11Texture2D, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_SDK_VERSION, - }; - use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC}; - use windows::Win32::Graphics::Dxgi::{ - IDXGIAdapter, IDXGIDevice, IDXGIFactory2, DXGI_PRESENT, DXGI_SWAP_CHAIN_DESC1, - DXGI_SWAP_EFFECT_FLIP_DISCARD, DXGI_USAGE_RENDER_TARGET_OUTPUT, - }; - use windows::Win32::System::LibraryLoader::GetModuleHandleW; - use windows::Win32::UI::WindowsAndMessaging::{ - CreateWindowExW, DefWindowProcW, DispatchMessageW, PeekMessageW, RegisterClassW, - ShowWindow, MSG, PM_REMOVE, SW_SHOWNOACTIVATE, WNDCLASSW, WS_EX_NOACTIVATE, WS_EX_TOPMOST, - WS_POPUP, WS_VISIBLE, - }; - - unsafe extern "system" fn wndproc(h: HWND, m: u32, wp: WPARAM, lp: LPARAM) -> LRESULT { - DefWindowProcW(h, m, wp, lp) - } - - let hinst: HMODULE = GetModuleHandleW(None)?; - let cls = w!("pfPresentTrigger"); - let wc = WNDCLASSW { - lpfnWndProc: Some(wndproc), - hInstance: hinst.into(), - lpszClassName: cls, - ..Default::default() - }; - RegisterClassW(&wc); - // Small window at the top-left of the (primary = virtual) display so it barely obscures the - // captured desktop; topmost + no-activate so it doesn't steal focus. - let win_w = disp_w.min(96) as i32; - let win_h = disp_h.min(96) as i32; - let hwnd: HWND = CreateWindowExW( - WS_EX_TOPMOST | WS_EX_NOACTIVATE, - cls, - w!("pf-present"), - WS_POPUP | WS_VISIBLE, - 0, - 0, - win_w, - win_h, - None, - None, - Some(hinst.into()), - None, - )?; - let _ = ShowWindow(hwnd, SW_SHOWNOACTIVATE); - - let mut device: Option = None; - let mut context: Option = None; - D3D11CreateDevice( - None, - D3D_DRIVER_TYPE_HARDWARE, - HMODULE::default(), - D3D11_CREATE_DEVICE_BGRA_SUPPORT, - None, - D3D11_SDK_VERSION, - Some(&mut device), - None, - Some(&mut context), - )?; - let device = device.context("present-trigger d3d11 device")?; - let context = context.context("present-trigger d3d11 context")?; - - let dxgi_dev: IDXGIDevice = device.cast()?; - let adapter: IDXGIAdapter = dxgi_dev.GetAdapter()?; - let factory: IDXGIFactory2 = adapter.GetParent()?; - let scd = DXGI_SWAP_CHAIN_DESC1 { - Width: win_w as u32, - Height: win_h as u32, - Format: DXGI_FORMAT_B8G8R8A8_UNORM, - SampleDesc: DXGI_SAMPLE_DESC { - Count: 1, - Quality: 0, - }, - BufferUsage: DXGI_USAGE_RENDER_TARGET_OUTPUT, - BufferCount: 2, - SwapEffect: DXGI_SWAP_EFFECT_FLIP_DISCARD, - ..Default::default() - }; - let swapchain = factory.CreateSwapChainForHwnd(&device, hwnd, &scd, None, None)?; - tracing::info!("present-trigger: swapchain created on the virtual display; presenting"); - - let mut frame = 0u32; - loop { - let mut msg = MSG::default(); - while PeekMessageW(&mut msg, None, 0, 0, PM_REMOVE).as_bool() { - let _ = DispatchMessageW(&msg); - } - let back: ID3D11Texture2D = swapchain.GetBuffer(0)?; - let mut rtv: Option = None; - device.CreateRenderTargetView(&back, None, Some(&mut rtv))?; - let rtv = rtv.context("present-trigger rtv")?; - let c = (frame % 120) as f32 / 120.0; - context.ClearRenderTargetView(&rtv, &[c, 0.1, 0.2, 1.0]); - let _ = swapchain.Present(1, DXGI_PRESENT(0)); - frame = frame.wrapping_add(1); - } -}