refactor(host/windows): collapse Windows capture to IDD-push only
apple / swift (push) Successful in 1m5s
ci / rust (push) Failing after 1m29s
windows-host / package (push) Failing after 1m11s
ci / web (push) Successful in 56s
ci / docs-site (push) Successful in 1m4s
android / android (push) Successful in 3m35s
apple / screenshots (push) Successful in 5m30s
deb / build-publish (push) Successful in 3m18s
decky / build-publish (push) Successful in 27s
ci / bench (push) Successful in 4m39s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 34s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 2m38s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 2m23s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 52s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 2m24s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 9m7s
docker / deploy-docs (push) Failing after 12m53s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Has been cancelled

Remove DXGI Desktop Duplication (DuplCapturer), Windows.Graphics.Capture
(WgcCapturer), the two-process SYSTEM+helper relay (virtual_stream_relay /
HelperRelay / DesktopWatcher / composed_flip), and the five source files that
implemented them. IDD direct-push is now the sole Windows capture path; the
session topology is always SingleProcess.

Deleted files: wgc.rs, wgc_relay.rs, desktop_watch.rs, composed_flip.rs,
windows/wgc_helper.rs (+ wgc-helper subcommand in main.rs).

dxgi.rs is kept but carved to shared GPU primitives only (make_device,
HdrP010Converter, VideoConverter, install_gpu_pref_hook, WinCaptureTarget,
pack_luid) — ~2237 lines of DDA-only code removed; imports cleaned.

capture.rs: IDD-push open failure fails the session cleanly (no fallback).
Adds capturer_supports_444() — returns false on Windows (IDD-push 4:4:4 is a
follow-up), replacing the stale single_process gate in 4:4:4 negotiation.
session_plan.rs: CaptureBackend{Dda,Wgc} and SessionTopology::TwoProcessRelay
removed. config.rs: no_helper/force_helper/no_wgc/capture_backend/secure_dda
removed. merged_env_block relocated from wgc_relay to windows/interactive.rs.

Linux cargo check clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-30 06:44:26 +00:00
parent 1c04e77293
commit 080c55dbf7
13 changed files with 149 additions and 5028 deletions
+34 -117
View File
@@ -59,7 +59,7 @@ pub struct OutputFormat {
/// Produce GPU-resident D3D11 frames (zero-copy for a GPU encoder — NVENC/AMF/QSV) rather than CPU
/// staging. `false` **only** for the GPU-less software encoder.
pub gpu: bool,
/// HDR: the capturer converts to 10-bit (IDD-push FP16 → `Rgb10a2`; the DDA secure-desktop HDR hint).
/// HDR: the capturer converts to 10-bit (IDD-push FP16 → `P010`, or `Rgb10a2` for a 4:4:4 source).
/// `false` = 8-bit SDR.
pub hdr: bool,
/// Full-chroma 4:4:4 session: the capturer must keep full chroma — deliver packed **RGB**
@@ -380,23 +380,12 @@ pub fn capture_virtual_output(
.map(|c| Box::new(c) as Box<dyn Capturer>)
}
/// `PUNKTFUNK_NO_WGC=1` forces the pure single-process DDA (Desktop Duplication) path everywhere: it
/// skips WGC in [`capture_virtual_output`] AND bypasses the two-process secure-desktop relay (so even a
/// SYSTEM host captures in-process via DDA, the way Apollo does — one capturer for the normal AND the
/// secure desktop). For bringing DDA up to parity / validating it on its own; all the WGC code stays
/// compiled and comes back the moment the flag is unset.
#[cfg(target_os = "windows")]
pub(crate) fn wgc_disabled() -> bool {
crate::config::config().no_wgc
}
#[cfg(target_os = "windows")]
pub fn capture_virtual_output(
vout: crate::vdisplay::VirtualOutput,
want: OutputFormat,
capture: crate::session_plan::CaptureBackend,
_capture: crate::session_plan::CaptureBackend,
) -> Result<Box<dyn Capturer>> {
use crate::session_plan::CaptureBackend;
let target = vout.win_capture.clone().ok_or_else(|| {
anyhow::anyhow!(
"SudoVDA target not yet an active display (needs a WDDM GPU to activate it)"
@@ -404,97 +393,36 @@ pub fn capture_virtual_output(
})?;
let pref = vout.preferred_mode;
let keep = vout.keepalive;
// Full-chroma 4:4:4 needs a full-chroma RGB source. The IDD-push and WGC paths emit subsampled
// NV12/P010 by default, which can't reconstruct 4:4:4; route a 4:4:4 session to DDA, which delivers
// RGB (Bgra) when its `chroma_444` flag is set. (IDD-push/WGC 4:4:4 capture is a follow-up.)
if want.chroma_444 && capture != CaptureBackend::Dda {
tracing::info!("4:4:4 session — using DDA capture (RGB source) instead of {capture:?}");
return dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444)
.map(|c| Box::new(c) as Box<dyn Capturer>);
}
// P2 direct frame push (kill DDA): consume frames straight from the pf-vdisplay driver's shared
// ring — no Desktop Duplication, no win32u reparenting hook. Resolved once in the `SessionPlan`
// (was re-derived from `config().idd_push` here); `IddPush` takes the keepalive (owns the virtual
// display) so there's no fall-through.
if capture == CaptureBackend::IddPush {
// Recreate the monitor + ring per session (fix-teardown): a FRESH monitor reliably gets a
// working IddCx swap-chain, whereas a REUSED monitor's swap-chain dies after ~2 sessions and
// the host can't revive it. The driver's recreate crash (target id resolved to 0) is fixed by
// stamping target_id onto the monitor context. The ring is always FP16 (the driver composes
// the IDD in FP16); `want_hdr` selects the per-frame conversion (FP16 → Rgb10a2 vs Bgra).
// If IDD-push can't open OR the driver doesn't attach to the ring within a few seconds (e.g. a
// hybrid-GPU render mismatch), fall back to DDA so the session is NEVER left black (audit §5.1).
// `open()` hands the keepalive back on failure so DDA can take ownership of the virtual display.
match idd_push::IddPushCapturer::open(target.clone(), pref, want.hdr, keep) {
Ok(c) => return Ok(Box::new(c) as Box<dyn Capturer>),
Err((e, keep)) => {
tracing::warn!(
error = %format!("{e:#}"),
"IDD-push open/attach failed — falling back to DDA"
);
return dxgi::DuplCapturer::open(
target,
pref,
keep,
want.gpu,
false,
want.chroma_444,
)
.map(|c| Box::new(c) as Box<dyn Capturer>);
}
}
}
// WGC (Windows.Graphics.Capture) is the default: it captures the COMPOSED desktop including the
// overlay/independent-flip planes DXGI Desktop Duplication misses (the frozen-HDR-animation bug),
// and has no ACCESS_LOST-on-overlay churn. DDA stays available via PUNKTFUNK_CAPTURE=dda and is
// the secure-desktop (lock/UAC) fallback (WGC can't capture those). `keep` is moved into the
// chosen backend (it owns the SudoVDA keepalive), so there's no open-time auto-fallback. The
// backend choice (`dda`/`dxgi`/`PUNKTFUNK_NO_WGC` → DDA, else WGC) is now resolved once in the plan.
if capture == CaptureBackend::Dda {
return dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444)
.map(|c| Box::new(c) as Box<dyn Capturer>);
}
// WGC default, with a watchdog'd DDA fallback. WGC's Direct3D11CaptureFramePool::CreateFreeThreaded
// intermittently HANGS on the headless SudoVDA (IddCx) display — a blocking call we can't error out
// of in place. So run WGC open on a dedicated thread and bound it: if it doesn't finish in time
// (hang) or errors, fall back to the reliable DDA path so the session is NEVER left black. WGC,
// when it opens, captures the composed desktop (overlay/MPO-correct HDR — fixes frozen animations);
// DDA is the safety net (+ the secure-desktop path). The encode thread is set MTA so the WGC
// objects built on the watchdog thread (also MTA) are usable here; the keepalive is handed to WGC
// only on success, else to DDA. A hung watchdog thread is abandoned (holds no keepalive).
// SAFETY: `RoInitialize` is a combase FFI call that initializes the WinRT apartment for the calling
// thread. It takes the `RO_INIT_MULTITHREADED` enum by value and borrows no memory, so there is no
// pointer/lifetime/aliasing obligation; it is safe on any thread and idempotent — a second call on a
// thread already in a compatible apartment returns S_FALSE / RPC_E_CHANGED_MODE, which we discard.
// Runs on the encode thread that goes on to use the WGC (WinRT) objects built by the watchdog thread.
unsafe {
let _ = windows::Win32::System::WinRT::RoInitialize(
windows::Win32::System::WinRT::RO_INIT_MULTITHREADED,
);
}
let (tx, rx) = std::sync::mpsc::channel();
let t = target.clone();
let _ = std::thread::Builder::new()
.name("wgc-open".into())
.spawn(move || {
let _ = tx.send(wgc::WgcCapturer::open(t, pref));
});
match rx.recv_timeout(std::time::Duration::from_secs(5)) {
Ok(Ok(mut c)) => {
c.attach_keepalive(keep);
Ok(Box::new(c) as Box<dyn Capturer>)
}
Ok(Err(e)) => {
tracing::warn!(error = %format!("{e:#}"), "WGC open failed — falling back to DDA");
dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444)
// IDD direct-push is the sole Windows capture path: consume frames straight from the pf-vdisplay
// driver's shared ring (in-process, Session 0 — it captures the secure desktop too; no Desktop
// Duplication, no WGC helper). A FRESH monitor + ring is created per session: a REUSED monitor's
// swap-chain dies after ~2 sessions and can't be revived. The ring is always FP16 when the display
// is HDR (the driver composes the IDD in FP16); `want.hdr` proactively enables advanced color and
// selects the per-frame conversion (FP16 → P010 vs BGRA → NV12). `IddPushCapturer` takes the
// keepalive (it owns the virtual display). There is NO fallback (DDA + the WGC relay were removed):
// if it can't open or the driver doesn't attach, the session fails cleanly and the client reconnects.
idd_push::IddPushCapturer::open(target, pref, want.hdr, keep)
.map(|c| Box::new(c) as Box<dyn Capturer>)
}
Err(_) => {
tracing::warn!("WGC open timed out (CreateFreeThreaded hang on the virtual display) — falling back to DDA");
dxgi::DuplCapturer::open(target, pref, keep, want.gpu, false, want.chroma_444)
.map(|c| Box::new(c) as Box<dyn Capturer>)
}
}
.map_err(|(e, _keep)| e.context("IDD-push capture open (no fallback)"))
}
/// Whether the active capturer can deliver a full-chroma (RGB) source for a 4:4:4 HEVC encode. The
/// negotiator gates 4:4:4 on this so the host honestly downgrades to 4:2:0 when the capturer can only
/// produce subsampled frames. Linux (the portal capturer feeding CPU RGB → `yuv444p`) can; the Windows
/// IDD-push path delivers subsampled NV12/P010 today, so full-chroma capture there is a follow-up.
#[cfg(target_os = "linux")]
pub(crate) fn capturer_supports_444() -> bool {
true
}
#[cfg(target_os = "windows")]
pub(crate) fn capturer_supports_444() -> bool {
// IDD-push 4:4:4 (full-chroma RGB from the FP16 ring) is the next step; until then the sole Windows
// capturer delivers subsampled NV12/P010 only, so the host honestly negotiates 4:2:0.
false
}
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
pub(crate) fn capturer_supports_444() -> bool {
false
}
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
@@ -506,14 +434,9 @@ pub fn capture_virtual_output(
anyhow::bail!("virtual-output capture requires Linux or Windows")
}
// Goal-1 stage 6: the Windows backends live under `capture/windows/`, the Linux one under `capture/linux/`
// (`#[path]` keeps the module names flat, so every `crate::capture::*` path is unchanged).
#[cfg(target_os = "windows")]
#[path = "capture/windows/composed_flip.rs"]
pub mod composed_flip;
#[cfg(target_os = "windows")]
#[path = "capture/windows/desktop_watch.rs"]
pub mod desktop_watch;
// Goal-1 stage 6: the Windows backend lives under `capture/windows/`, the Linux one under `capture/linux/`
// (`#[path]` keeps the module names flat, so every `crate::capture::*` path is unchanged). Windows capture
// is IDD direct-push only — DXGI Desktop Duplication (DDA) and the WGC two-process relay were removed.
#[cfg(target_os = "windows")]
#[path = "capture/windows/dxgi.rs"]
pub mod dxgi;
@@ -522,9 +445,3 @@ pub mod dxgi;
pub mod idd_push;
#[cfg(target_os = "linux")]
mod linux;
#[cfg(target_os = "windows")]
#[path = "capture/windows/wgc.rs"]
pub mod wgc;
#[cfg(target_os = "windows")]
#[path = "capture/windows/wgc_relay.rs"]
pub mod wgc_relay;
@@ -1,217 +0,0 @@
//! Force-composed-flip overlay (Windows) — make the secure (Winlogon: UAC / lock / login) desktop
//! capturable by Desktop Duplication.
//!
//! The secure desktop's dialog/wallpaper presents via **fullscreen independent-flip / MPO**: it scans
//! out directly, bypassing DWM composition, so `IDXGIOutputDuplication::AcquireNextFrame` returns
//! `DXGI_ERROR_ACCESS_LOST` (born-lost) — there is no composed frame to hand out (the client sees
//! black). Independent-flip requires the presenting app to own the ENTIRE output: putting ANY other
//! visible window on that output disqualifies it, forcing DWM to **composite**, which DDA can then
//! capture. So we keep a tiny, click-through, near-invisible **topmost layered window** alive on the
//! *current input desktop* (which is Winlogon while the secure desktop is up). On a desktop switch the
//! window is orphaned, so a dedicated thread tracks the input desktop and recreates it there.
//!
//! This is the non-input alternative to "tap a key to wake the lock screen": it needs no SendInput and
//! no system-wide registry change (it does NOT disable MPO globally — it only nudges OUR output to
//! composed while a session is live). Effectiveness can be build/driver-dependent; gated by
//! `PUNKTFUNK_FORCE_COMPOSED` (default ON; set =0 to disable).
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use windows::core::w;
use windows::Win32::Foundation::{HWND, LPARAM, LRESULT, WPARAM};
use windows::Win32::System::LibraryLoader::GetModuleHandleW;
use windows::Win32::System::StationsAndDesktops::{
CloseDesktop, GetUserObjectInformationW, OpenInputDesktop, SetThreadDesktop,
DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS, UOI_NAME,
};
use windows::Win32::UI::WindowsAndMessaging::{
CreateWindowExW, DefWindowProcW, DestroyWindow, DispatchMessageW, PeekMessageW, RegisterClassW,
SetLayeredWindowAttributes, SetWindowPos, ShowWindow, TranslateMessage, HWND_TOPMOST,
LWA_ALPHA, MSG, PM_REMOVE, SWP_NOACTIVATE, SWP_NOMOVE, SWP_NOSIZE, SW_SHOWNOACTIVATE,
WNDCLASSW, WS_EX_LAYERED, WS_EX_NOACTIVATE, WS_EX_TOOLWINDOW, WS_EX_TOPMOST, WS_EX_TRANSPARENT,
WS_POPUP,
};
/// A running force-composed-flip overlay. Drop signals the thread to tear down its window + exit.
pub struct ForceComposedFlip {
stop: Arc<AtomicBool>,
}
impl ForceComposedFlip {
/// Start the overlay (no-op + `None` if disabled via `PUNKTFUNK_FORCE_COMPOSED=0`).
pub fn start() -> Option<Self> {
if std::env::var("PUNKTFUNK_FORCE_COMPOSED").as_deref() == Ok("0") {
tracing::info!("force-composed-flip overlay disabled (PUNKTFUNK_FORCE_COMPOSED=0)");
return None;
}
let stop = Arc::new(AtomicBool::new(false));
let st = stop.clone();
std::thread::Builder::new()
.name("composed-flip".into())
// SAFETY: `run` is this module's `unsafe fn` (it owns a desktop+window lifecycle via Win32
// FFI); it takes ownership of `st` (the stop `Arc<AtomicBool>`) and has no caller-side memory
// precondition. It is designed to own its thread for its whole duration — exactly the
// dedicated `composed-flip` thread spawned here.
.spawn(move || unsafe { run(st) })
.ok()?;
tracing::info!("force-composed-flip overlay started (Winlogon-aware)");
Some(ForceComposedFlip { stop })
}
}
impl Drop for ForceComposedFlip {
fn drop(&mut self) {
self.stop.store(true, Ordering::Relaxed);
}
}
extern "system" fn wndproc(hwnd: HWND, msg: u32, wp: WPARAM, lp: LPARAM) -> LRESULT {
// SAFETY: this is the window procedure the OS invokes with the window's own `hwnd` and a real
// message `(msg, wp, lp)`. `DefWindowProcW` performs default processing for exactly those
// parameters (all passed straight through by value); it borrows no Rust memory and is synchronous.
unsafe { DefWindowProcW(hwnd, msg, wp, lp) }
}
/// Read the current input-desktop name (e.g. "Default" / "Winlogon"); `None` if it can't be read.
unsafe fn input_desktop_name() -> Option<String> {
let desk = OpenInputDesktop(
DESKTOP_CONTROL_FLAGS(0),
false,
DESKTOP_ACCESS_FLAGS(0x0001),
)
.ok()?;
let mut buf = [0u16; 64];
let mut needed = 0u32;
let ok = GetUserObjectInformationW(
windows::Win32::Foundation::HANDLE(desk.0),
UOI_NAME,
Some(buf.as_mut_ptr() as *mut _),
(buf.len() * 2) as u32,
Some(&mut needed),
)
.is_ok();
let _ = CloseDesktop(desk);
if !ok {
return None;
}
Some(
String::from_utf16_lossy(&buf)
.trim_end_matches('\u{0}')
.to_string(),
)
}
/// Create the tiny topmost layered click-through window on the CURRENT thread's desktop. Caller must
/// have `SetThreadDesktop`'d to the target input desktop first.
unsafe fn make_overlay() -> Option<HWND> {
let hinst = GetModuleHandleW(None).ok()?;
let class = w!("PunktfunkComposedFlip");
// RegisterClassW is idempotent-ish: a second register for the same name fails harmlessly; we
// ignore the result and rely on the class existing. (One process, so it registers once.)
let wc = WNDCLASSW {
lpfnWndProc: Some(wndproc),
hInstance: hinst.into(),
lpszClassName: class,
..Default::default()
};
let atom = RegisterClassW(&wc);
if atom == 0 {
let e = windows::Win32::Foundation::GetLastError();
// 1410 = ERROR_CLASS_ALREADY_EXISTS is fine (re-register after a desktop switch).
if e.0 != 1410 {
tracing::warn!(err = e.0, "force-composed-flip: RegisterClassW failed");
}
}
let hwnd = match CreateWindowExW(
WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOPMOST | WS_EX_NOACTIVATE | WS_EX_TOOLWINDOW,
class,
w!(""),
WS_POPUP,
0,
0,
1,
1,
None,
None,
Some(hinst.into()),
None,
) {
Ok(h) => h,
Err(e) => {
let le = windows::Win32::Foundation::GetLastError();
tracing::warn!(err = %format!("{e:?}"), last = le.0,
"force-composed-flip: CreateWindowExW failed");
return None;
}
};
// alpha=1: technically visible (so it disqualifies independent-flip) but imperceptible.
let _ = SetLayeredWindowAttributes(hwnd, windows::Win32::Foundation::COLORREF(0), 1, LWA_ALPHA);
let _ = ShowWindow(hwnd, SW_SHOWNOACTIVATE);
let _ = SetWindowPos(
hwnd,
Some(HWND_TOPMOST),
0,
0,
0,
0,
SWP_NOMOVE | SWP_NOSIZE | SWP_NOACTIVATE,
);
Some(hwnd)
}
unsafe fn run(stop: Arc<AtomicBool>) {
let mut cur_desktop: Option<String> = None;
let mut hwnd: Option<HWND> = None;
let mut ticks: u32 = 0;
while !stop.load(Ordering::Relaxed) {
// Follow the input desktop: if it changed (Default↔Winlogon), re-attach this thread and
// recreate the window there (a window is bound to the desktop it was created on).
let name = input_desktop_name();
if name != cur_desktop {
if let Some(h) = hwnd.take() {
let _ = DestroyWindow(h);
}
if let Ok(desk) = OpenInputDesktop(
DESKTOP_CONTROL_FLAGS(0),
false,
DESKTOP_ACCESS_FLAGS(0x1000_0000), // GENERIC_ALL (incl. DESKTOP_CREATEWINDOW=0x0002)
) {
if SetThreadDesktop(desk).is_ok() {
hwnd = make_overlay();
tracing::info!(desktop = ?name, created = hwnd.is_some(),
"force-composed-flip: overlay (re)created on input desktop");
}
// Leak `desk` while it's the thread desktop (closing the current thread desktop is UB).
}
cur_desktop = name;
}
// Re-assert topmost periodically (other windows on the secure desktop can push us down) and
// pump our message queue so the window stays responsive/composited.
if let Some(h) = hwnd {
let _ = SetWindowPos(
h,
Some(HWND_TOPMOST),
0,
0,
0,
0,
SWP_NOMOVE | SWP_NOSIZE | SWP_NOACTIVATE,
);
let mut msg = MSG::default();
while PeekMessageW(&mut msg, Some(h), 0, 0, PM_REMOVE).as_bool() {
let _ = TranslateMessage(&msg);
DispatchMessageW(&msg);
}
}
ticks = ticks.wrapping_add(1);
let _ = ticks;
std::thread::sleep(std::time::Duration::from_millis(200));
}
if let Some(h) = hwnd.take() {
let _ = DestroyWindow(h);
}
tracing::info!("force-composed-flip overlay stopped");
}
@@ -1,144 +0,0 @@
//! Input-desktop watcher (Windows) — the authoritative "normal vs secure desktop" signal for the
//! two-process secure-desktop design (design/archive/windows-secure-desktop.md).
//!
//! Windows switches the *input desktop* to "Winlogon" (the secure desktop) for UAC elevation, the
//! lock screen and the login screen, and back to "Default" for the normal session. WGC captures only
//! the normal desktop; DDA-as-SYSTEM captures the secure one. A dedicated thread polls the input
//! desktop's NAME (WTS session notifications miss UAC entirely, so the name is the reliable signal)
//! and publishes it as an atomic the capture mux + input path read.
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
use std::sync::Arc;
use std::time::Duration;
use windows::Win32::Foundation::HANDLE;
use windows::Win32::System::StationsAndDesktops::{
CloseDesktop, GetUserObjectInformationW, OpenInputDesktop, DESKTOP_ACCESS_FLAGS,
DESKTOP_CONTROL_FLAGS, UOI_NAME,
};
/// The normal interactive desktop ("Default") — WGC capture applies.
pub const DESKTOP_NORMAL: u8 = 0;
/// The secure desktop ("Winlogon": UAC / lock / login) — DDA-as-SYSTEM capture applies.
pub const DESKTOP_SECURE: u8 = 1;
/// Polls the input-desktop name on its own thread and publishes [`DESKTOP_NORMAL`]/[`DESKTOP_SECURE`].
pub struct DesktopWatcher {
state: Arc<AtomicU8>,
stop: Arc<AtomicBool>,
}
impl DesktopWatcher {
pub fn start() -> Self {
// Compute the CURRENT desktop synchronously before returning, so the first reader (the capture
// mux) sees the real state immediately. Otherwise a session that begins already on the secure
// desktop (e.g. a reconnect to a locked box) would read DESKTOP_NORMAL for the first poll
// interval and relay one stale normal-desktop frame — the "flash of the login screen" bug.
// SAFETY: `is_secure_desktop` is this module's `unsafe fn` — unsafe only because it calls Win32
// desktop FFI (`OpenInputDesktop`/`GetUserObjectInformationW`/`CloseDesktop`), with no caller
// precondition; it opens, names, and closes the input-desktop handle internally and is safe to
// call from any thread (here, on the thread running `DesktopWatcher::start`).
let initial = if unsafe { is_secure_desktop() } {
DESKTOP_SECURE
} else {
DESKTOP_NORMAL
};
let state = Arc::new(AtomicU8::new(initial));
let stop = Arc::new(AtomicBool::new(false));
let s = state.clone();
let st = stop.clone();
let _ = std::thread::Builder::new()
.name("desktop-watch".into())
.spawn(move || {
// Debounce: only publish a change after the raw reading has been stable for several
// polls. The input desktop flaps Default↔Winlogon transiently during a lock/UAC
// transition; publishing every flap makes the capture mux thrash (rebuild storms).
const STABLE_POLLS: u32 = 4; // ~80ms
let mut published = initial;
let mut candidate = initial;
let mut stable = 0u32;
while !st.load(Ordering::Relaxed) {
// SAFETY: same as in `start` — `is_secure_desktop` is self-contained Win32 desktop
// FFI with no caller precondition, called here on the dedicated `desktop-watch`
// polling thread.
let v = if unsafe { is_secure_desktop() } {
DESKTOP_SECURE
} else {
DESKTOP_NORMAL
};
if v == candidate {
stable = stable.saturating_add(1);
} else {
candidate = v;
stable = 1;
}
if stable >= STABLE_POLLS && candidate != published {
s.store(candidate, Ordering::Release);
published = candidate;
tracing::info!(
desktop = if candidate == DESKTOP_SECURE {
"Winlogon(secure)"
} else {
"Default"
},
"input desktop changed (debounced)"
);
}
std::thread::sleep(Duration::from_millis(20));
}
});
DesktopWatcher { state, stop }
}
/// The shared atomic ([`DESKTOP_NORMAL`]/[`DESKTOP_SECURE`]) for the capture mux to read.
pub fn state(&self) -> Arc<AtomicU8> {
self.state.clone()
}
/// True when the secure (Winlogon) desktop is the input desktop right now.
pub fn is_secure(&self) -> bool {
self.state.load(Ordering::Acquire) == DESKTOP_SECURE
}
}
impl Drop for DesktopWatcher {
fn drop(&mut self) {
self.stop.store(true, Ordering::Relaxed);
}
}
/// True if the current input desktop is "Winlogon" (the secure desktop). Best-effort: if the desktop
/// can't be opened or named, report not-secure (the safe default — keep WGC/normal capture).
pub(crate) unsafe fn is_secure_desktop() -> bool {
let desk = match OpenInputDesktop(
DESKTOP_CONTROL_FLAGS(0),
false,
DESKTOP_ACCESS_FLAGS(DESKTOP_READOBJECTS),
) {
Ok(d) => d,
Err(_) => return false,
};
let mut buf = [0u16; 64];
let mut needed = 0u32;
let ok = GetUserObjectInformationW(
HANDLE(desk.0),
UOI_NAME,
Some(buf.as_mut_ptr() as *mut _),
(buf.len() * 2) as u32,
Some(&mut needed),
)
.is_ok();
let _ = CloseDesktop(desk);
if !ok {
return false;
}
let name = String::from_utf16_lossy(&buf);
name.trim_end_matches('\u{0}')
.eq_ignore_ascii_case("Winlogon")
}
/// `DESKTOP_READOBJECTS` access right (the windows crate exposes it as a typed flag; we need the raw
/// bit for `OpenInputDesktop`'s access mask).
const DESKTOP_READOBJECTS: u32 = 0x0001;
File diff suppressed because it is too large Load Diff
@@ -1,816 +0,0 @@
//! Windows.Graphics.Capture (WGC) capture backend — the HDR/animation-correct path.
//!
//! Why WGC over DXGI Desktop Duplication: DDA duplicates only the DWM-composed primary surface, so
//! HDR desktop animations the OS routes onto hardware overlay / independent-flip / MPO planes (Start
//! menu, Win11 Mica/acrylic, window resize) never enter the surface DDA reads — the stream shows a
//! frozen desktop ("broken HDR animations"). Engaging WGC capture pulls that content back through DWM
//! composition, so the surface WGC hands back contains the animations. WGC also has no
//! ACCESS_LOST-on-overlay-flip churn.
//!
//! It reuses the rest of the pipeline UNCHANGED: the frame's GPU texture (the OS already composited
//! the cursor into it — `IsCursorCaptureEnabled(true)`) goes through the same scRGB→BT.2020-PQ shader
//! ([`super::dxgi::HdrConverter`]) into a host-owned `R10G10B10A2` texture (HDR) or is copied into a
//! BGRA texture (SDR), which is handed to NVENC zero-copy (registered by pointer, encoded in place).
//! Shares the D3D11 device with NVENC via `FramePayload::D3d11`.
//!
//! Limitation: WGC cannot capture the secure desktop (lock / UAC / login) — the caller falls back to
//! the DDA backend ([`super::dxgi::DuplCapturer`]) for those (see capture.rs).
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use super::dxgi::{
find_output, hdr_shader_p010_enabled, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter,
HdrP010Converter, VideoConverter, WinCaptureTarget,
};
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
use anyhow::{bail, Context, Result};
use std::collections::VecDeque;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex};
use std::time::{Duration, Instant};
use windows::core::{IInspectable, Interface};
use windows::Foundation::{TimeSpan, TypedEventHandler};
use windows::Graphics::Capture::{
Direct3D11CaptureFrame, Direct3D11CaptureFramePool, GraphicsCaptureItem, GraphicsCaptureSession,
};
use windows::Graphics::DirectX::DirectXPixelFormat;
use windows::Win32::Foundation::{CloseHandle, HANDLE};
use windows::Win32::Graphics::Direct3D11::{
ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView,
ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_TEXTURE2D_DESC,
D3D11_USAGE_DEFAULT,
};
use windows::Win32::Graphics::Dxgi::Common::{
DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_R10G10B10A2_UNORM,
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
};
use windows::Win32::Graphics::Dxgi::{IDXGIDevice, IDXGIOutput6};
use windows::Win32::Security::{ImpersonateLoggedOnUser, RevertToSelf};
use windows::Win32::System::RemoteDesktop::{WTSGetActiveConsoleSessionId, WTSQueryUserToken};
use windows::Win32::System::WinRT::Direct3D11::{
CreateDirect3D11DeviceFromDXGIDevice, IDirect3DDxgiInterfaceAccess,
};
use windows::Win32::System::WinRT::Graphics::Capture::IGraphicsCaptureItemInterop;
use windows::Win32::System::WinRT::{RoInitialize, RO_INIT_MULTITHREADED};
/// Output texture ring depth. The encode loop pipelines one frame deep (NVENC encodes frame N while
/// the capturer produces N+1), so two live textures suffice; three gives headroom against a slow
/// `lock_bitstream` and matches the WGC frame-pool depth.
// Sized for the deep encode pipeline (`PUNKTFUNK_ENCODE_DEPTH`, default 4, clamped ≤ 6): up to DEPTH
// frames are in flight in NVENC at once, so the HDR convert ring and the SDR held-frame set must each
// keep DEPTH(+headroom) live textures, and the WGC pool needs spare buffers beyond what we hold.
const OUT_RING: usize = 8;
/// SDR zero-copy: how many recent WGC frames to keep alive so NVENC can encode the pool texture in
/// place (no `CopyResource`). Each in-flight encode reads a distinct frame, so this must exceed the
/// pipeline depth; the oldest is released once `HELD_FRAMES` newer ones exist.
const HELD_FRAMES: usize = 8;
/// WGC frame-pool buffer count. Must exceed `HELD_FRAMES` so the compositor always has free buffers
/// to render into while we hold frames for in-place (zero-copy) SDR encode.
const WGC_POOL_BUFFERS: i32 = 10;
/// The host runs as SYSTEM (so the DDA secure-desktop path works), but WGC will NOT activate under
/// the SYSTEM account (`CreateForMonitor` → 0x80070424). Impersonate the interactive console user
/// for the WGC activation. Returns the user token (the caller reverts + closes it after activation)
/// or `None` (no active user, or the host already runs AS the user — WTSQueryUserToken then fails and
/// WGC works without impersonation). SYSTEM-only; harmless under a user-token host.
unsafe fn impersonate_active_user() -> Option<HANDLE> {
let session = WTSGetActiveConsoleSessionId();
if session == 0xFFFF_FFFF {
return None;
}
let mut token = HANDLE::default();
if WTSQueryUserToken(session, &mut token).is_ok() {
if ImpersonateLoggedOnUser(token).is_ok() {
return Some(token);
}
let _ = CloseHandle(token);
}
None
}
/// RAII: reverts the WGC-activation impersonation when it drops (covers every `?` early-return).
struct Deimpersonate(Option<HANDLE>);
impl Drop for Deimpersonate {
fn drop(&mut self) {
if let Some(tok) = self.0.take() {
// SAFETY: `RevertToSelf` takes no arguments and undoes the thread impersonation set during
// WGC activation; `tok` is the impersonation token `HANDLE` from `impersonate_active_user`,
// owned by this `Deimpersonate` and closed exactly once here (taken out of the `Option`, so
// no double-close). Both are FFI calls borrowing no Rust memory.
unsafe {
let _ = RevertToSelf();
let _ = CloseHandle(tok);
}
}
}
}
/// Signal from the free-threaded FrameArrived callback to the encode thread: a monotonically
/// increasing count of arrived frames + a condvar to wake `next_frame`. The encode thread tracks how
/// many it has consumed; `TryGetNextFrame` is called exactly `available - consumed` times so we never
/// hit the empty-pool ambiguity, and draining to the newest keeps latency at one frame.
struct WgcSignal {
available: AtomicU64,
mtx: Mutex<()>,
cv: Condvar,
}
pub struct WgcCapturer {
device: ID3D11Device,
context: ID3D11DeviceContext,
// WGC objects — kept alive for the session's lifetime.
pool: Direct3D11CaptureFramePool,
session: GraphicsCaptureSession,
_item: GraphicsCaptureItem,
_frame_arrived_token: i64,
signal: Arc<WgcSignal>,
consumed: u64,
width: u32,
height: u32,
timeout_ms: u64,
first_frame: bool,
hdr: bool,
/// The source display's static HDR mastering metadata (ST.2086 + content light level), read from
/// `IDXGIOutput6::GetDesc1` at open when the output is HDR. Forwarded to the encoder (in-band SEI)
/// and the client (0xCE) by the stream loop. `None` when SDR. (The helper relay path also encodes,
/// so this is what gives the secure/normal-desktop HDR stream its mastering SEI.)
hdr_meta: Option<punktfunk_core::quic::HdrMeta>,
hdr_conv: Option<HdrConverter>,
fp16_src: Option<ID3D11Texture2D>,
fp16_srv: Option<ID3D11ShaderResourceView>,
/// `PUNKTFUNK_HDR_SHADER_P010` path: emit P010 (BT.2020 PQ 10-bit limited range) DIRECTLY from our
/// own shader (`HdrP010Converter`) so NVENC takes native P010 and skips its SM-side RGB→YUV CSC.
/// Gated by [`hdr_shader_p010_enabled`] AND `self.hdr`; `None`/empty when off → the existing R10 +
/// VideoProcessor paths run unchanged. `p010_disabled` latches a runtime failure (e.g. a driver
/// that rejects the planar plane RTV) so we fall back to the R10 path and stop retrying.
hdr_p010_conv: Option<HdrP010Converter>,
p010_out: Vec<ID3D11Texture2D>,
p010_idx: usize,
p010_disabled: bool,
/// Ring of host-owned output textures (BGRA for SDR, R10G10B10A2 for HDR), rotated per processed
/// frame. A ring — not one texture — is required because the encode loop is PIPELINED: NVENC
/// encodes frame N (in place, registered by pointer) while this capturer produces frame N+1, so
/// N+1 must land in a DIFFERENT texture or it clobbers the in-flight encode. (`fp16_src` stays
/// single: it's only touched within the D3D11 immediate context, whose op ordering already
/// serializes the convert's read against the next copy's write — NVENC's async engine read is the
/// only consumer that escapes that ordering, and it reads the ring output, never `fp16_src`.)
out_ring: Vec<ID3D11Texture2D>,
ring_idx: usize,
/// Video-processor RGB→YUV converter (off the 3D engine where possible) + its NV12/P010 output
/// ring. Preferred path: the OS-composited capture (cursor already in it) is converted DIRECTLY to
/// NVENC's native YUV — no `CopyResource`, no cursor draw, and NVENC skips its internal RGB→YUV.
/// `None`/error → falls back to the legacy SDR-zero-copy / HDR-shader paths.
video_conv: Option<VideoConverter>,
yuv_out: Vec<ID3D11Texture2D>,
yuv_idx: usize,
yuv_is_hdr: bool,
vp_disabled: bool,
/// SDR zero-copy: the recent WGC frames we hand to NVENC in place. Held so the pool doesn't
/// recycle the texture mid-encode; the oldest is released once `HELD_FRAMES` newer ones exist.
held: VecDeque<Direct3D11CaptureFrame>,
/// Last presentable GPU texture + format, repeated when no new frame arrived (static desktop).
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
/// Owns the SudoVDA keepalive once attached (after WGC is confirmed open) — dropping the capturer
/// then REMOVEs the virtual output. `None` between open and attach so a WGC-open failure leaves
/// the keepalive with the caller for the DDA fallback.
_keepalive: Option<Box<dyn Send>>,
}
// SAFETY: like `DuplCapturer`. `WgcCapturer` holds D3D11 (free-threaded device/context) plus WGC WinRT
// objects (`Direct3D11CaptureFramePool` etc., created free-threaded via `CreateFreeThreaded`). COM/WinRT
// reference counting is interlocked, and the capturer is owned + used by exactly one encode thread,
// moved to it once and never shared (no `Sync`), so transferring ownership across threads is sound. The
// free-threaded `FrameArrived` callback touches only the `Arc<WgcSignal>` (itself `Send + Sync`), not
// the capturer's COM fields.
unsafe impl Send for WgcCapturer {}
impl WgcCapturer {
/// Open WGC capture. Does NOT take the keepalive — the caller attaches it via
/// [`attach_keepalive`](Self::attach_keepalive) only after open succeeds, so a failure leaves the
/// keepalive with the caller to hand to the DDA fallback.
pub fn open(target: WinCaptureTarget, preferred: Option<(u32, u32, u32)>) -> Result<Self> {
// SAFETY: runs on the thread opening the WGC session. `RoInitialize` inits this thread's WinRT
// apartment (idempotent; result ignored). `impersonate_active_user()` and `find_output()` are
// this module's `unsafe fn`s whose contracts (call on the activating thread; pass a GDI name)
// are met, and the impersonation is reverted by `_deimp`'s Drop on every return path. Every
// COM/WinRT call thereafter operates on an object obtained + `?`-checked earlier in this same
// block on this single thread — the `IDXGIOutput1` from `find_output`, the device/context from
// `make_device`, the factory/interop/item/pool/session — and the `TypedEventHandler` closure
// captures an `Arc<WgcSignal>` (Send+Sync) by move. No raw pointers are dereferenced; borrowed
// locals outlive their synchronous calls.
unsafe {
// WGC is WinRT — the calling thread needs a COM/WinRT apartment for the GraphicsCaptureItem
// activation factory (RoGetActivationFactory). Initialize MTA; ignore "already initialized"
// / "changed mode" (another component on this thread may have init'd a compatible apartment).
let ro = RoInitialize(RO_INIT_MULTITHREADED);
// Impersonate the interactive user for the duration of WGC activation (host runs as
// SYSTEM; WGC won't activate under SYSTEM). Reverted by the guard's Drop on return. The
// WGC objects, once created, are accessed from the (SYSTEM) encode thread thereafter.
let imp = impersonate_active_user();
let _deimp = Deimpersonate(imp);
tracing::info!(ro_result = ?ro, impersonated = imp.is_some(), "WGC: RoInitialize(MTA)");
// The SudoVDA output appears a beat after the display is created — settle-retry like DDA.
let deadline = Instant::now() + Duration::from_millis(2000);
let (adapter, output) = loop {
if let Some(n) = crate::win_display::resolve_gdi_name(target.target_id) {
if let Ok(found) = find_output(&n) {
break found;
}
}
if let Ok(found) = find_output(&target.gdi_name) {
break found;
}
if Instant::now() >= deadline {
bail!(
"WGC: no DXGI output for SudoVDA target {} yet",
target.target_id
);
}
std::thread::sleep(Duration::from_millis(100));
};
let (device, context) = make_device(&adapter)?;
let od = output.GetDesc().context("output GetDesc")?;
let hmonitor = od.Monitor;
// HDR iff the output's colour space is BT.2020 PQ (G2084) — matches the DDA FP16 detection.
// From the same desc, read the source display's mastering metadata (ST.2086) when HDR.
let desc1 = output
.cast::<IDXGIOutput6>()
.ok()
.and_then(|o6| o6.GetDesc1().ok());
let hdr = desc1
.as_ref()
.map(|d1| d1.ColorSpace == DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020)
.unwrap_or(false);
let hdr_meta = if hdr {
desc1.as_ref().map(|d| {
crate::hdr::hdr_meta_from_display(
(d.RedPrimary[0], d.RedPrimary[1]),
(d.GreenPrimary[0], d.GreenPrimary[1]),
(d.BluePrimary[0], d.BluePrimary[1]),
(d.WhitePoint[0], d.WhitePoint[1]),
d.MaxLuminance,
d.MinLuminance,
0, // MaxCLL: GetDesc1 has no content light level (Apollo zeroes it)
0, // MaxFALL
)
})
} else {
None
};
// Wrap our D3D11 device as a WinRT IDirect3DDevice so the frame pool allocates on it (the
// pool textures land on our device → CopyResource + NVENC are same-device, no readback).
let dxgi_device: IDXGIDevice = device.cast().context("ID3D11Device as IDXGIDevice")?;
let inspectable: IInspectable = CreateDirect3D11DeviceFromDXGIDevice(&dxgi_device)
.context("CreateDirect3D11DeviceFromDXGIDevice")?;
let d3d_device: windows::Graphics::DirectX::Direct3D11::IDirect3DDevice = inspectable
.cast()
.context("IInspectable as IDirect3DDevice")?;
tracing::info!(hdr, "WGC: device ready, creating capture item");
// GraphicsCaptureItem for the monitor (the SudoVDA output enumerates as a normal monitor).
let interop: IGraphicsCaptureItemInterop =
windows::core::factory::<GraphicsCaptureItem, IGraphicsCaptureItemInterop>()
.context("GraphicsCaptureItem interop factory")?;
let item: GraphicsCaptureItem = interop
.CreateForMonitor(hmonitor)
.context("CreateForMonitor")?;
let size = item.Size().context("item Size")?;
let (width, height) = (size.Width.max(0) as u32, size.Height.max(0) as u32);
tracing::info!(
width,
height,
"WGC: capture item created, creating frame pool"
);
let pixel_format = if hdr {
DirectXPixelFormat::R16G16B16A16Float // scRGB FP16 — same surface DDA gives on HDR
} else {
DirectXPixelFormat::B8G8R8A8UIntNormalized
};
// Extra buffers: SDR zero-copy holds the last `HELD_FRAMES` frames (encoded in place), so
// the pool needs headroom beyond that for the producer to keep rendering at 240 Hz.
let pool = Direct3D11CaptureFramePool::CreateFreeThreaded(
&d3d_device,
pixel_format,
WGC_POOL_BUFFERS,
size,
)
.context("CreateFreeThreaded frame pool")?;
let signal = Arc::new(WgcSignal {
available: AtomicU64::new(0),
mtx: Mutex::new(()),
cv: Condvar::new(),
});
let sig = signal.clone();
let handler = TypedEventHandler::<Direct3D11CaptureFramePool, IInspectable>::new(
move |_pool, _arg| {
sig.available.fetch_add(1, Ordering::Release);
sig.cv.notify_one();
Ok(())
},
);
let token = pool.FrameArrived(&handler).context("FrameArrived")?;
tracing::info!("WGC: creating capture session");
let session = pool
.CreateCaptureSession(&item)
.context("CreateCaptureSession")?;
// OS composites the cursor into the frame (HDR-correct, no manual composite pass).
let _ = session.SetIsCursorCaptureEnabled(true);
// Drop the yellow capture border (best-effort — older builds reject it).
let _ = session.SetIsBorderRequired(false);
// Lift the 60 Hz cap: allow up to the client's refresh (Win11 24H2+; below that this is a
// no-op and WGC caps ~60). 100 ns ticks per frame.
let refresh = preferred
.map(|(_, _, hz)| hz)
.filter(|&hz| hz > 0)
.unwrap_or(60);
let ticks = (10_000_000i64 / refresh.max(1) as i64).max(1);
let _ = session.SetMinUpdateInterval(TimeSpan { Duration: ticks });
tracing::info!("WGC: StartCapture");
session.StartCapture().context("StartCapture")?;
// WGC fires FrameArrived on CHANGE; a static desktop may never deliver the first frame
// (→ black, then the next_frame deadline ends the session). Nudge the cursor onto the
// output to force the first composition change, exactly like the DDA path does.
nudge_cursor_onto(&output);
let timeout_ms = (2000 / refresh.max(1) as u64).max(8);
tracing::info!(
width,
height,
hdr,
refresh,
"WGC capture started ({})",
if hdr {
"HDR FP16→BT.2020 PQ"
} else {
"SDR BGRA"
}
);
Ok(Self {
device,
context,
pool,
session,
_item: item,
_frame_arrived_token: token,
signal,
consumed: 0,
width,
height,
timeout_ms,
first_frame: true,
hdr,
hdr_meta,
hdr_conv: None,
fp16_src: None,
fp16_srv: None,
hdr_p010_conv: None,
p010_out: Vec::new(),
p010_idx: 0,
p010_disabled: false,
out_ring: Vec::new(),
ring_idx: 0,
video_conv: None,
yuv_out: Vec::new(),
yuv_idx: 0,
yuv_is_hdr: false,
vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(),
held: VecDeque::new(),
last_present: None,
_keepalive: None,
})
}
}
/// Take ownership of the SudoVDA keepalive once the WGC session is confirmed open.
pub fn attach_keepalive(&mut self, keepalive: Box<dyn Send>) {
self._keepalive = Some(keepalive);
}
/// Block until a new frame arrives (cv), then drain `TryGetNextFrame` to the NEWEST queued frame
/// (skip stale → one-frame latency). Returns `None` on timeout (no new frame → caller repeats).
fn wait_and_drain(&mut self) -> Option<Direct3D11CaptureFrame> {
let wait_ms = if self.first_frame {
2000
} else {
self.timeout_ms
};
{
let mut g = self.signal.mtx.lock().unwrap();
while self.signal.available.load(Ordering::Acquire) <= self.consumed {
let (ng, res) = self
.signal
.cv
.wait_timeout(g, Duration::from_millis(wait_ms))
.unwrap();
g = ng;
if res.timed_out() {
return None;
}
}
}
let target = self.signal.available.load(Ordering::Acquire);
let mut last = None;
while self.consumed < target {
if let Ok(f) = self.pool.TryGetNextFrame() {
last = Some(f);
}
self.consumed += 1;
}
last
}
unsafe fn ensure_fp16_src(&mut self) -> Result<()> {
if self.fp16_src.is_some() {
return Ok(());
}
let desc = tex_desc(
self.width,
self.height,
DXGI_FORMAT_R16G16B16A16_FLOAT,
(D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
);
let mut t = None;
self.device
.CreateTexture2D(&desc, None, Some(&mut t))
.context("CreateTexture2D(wgc fp16 src)")?;
let t = t.context("fp16 src")?;
let mut srv = None;
self.device
.CreateShaderResourceView(&t, None, Some(&mut srv))?;
self.fp16_srv = Some(srv.context("fp16 srv")?);
self.fp16_src = Some(t);
Ok(())
}
/// Lazily allocate the HDR output texture ring (R10G10B10A2, the convert pass's render target →
/// NVENC input), `RENDER_TARGET`-bindable. SDR is zero-copy (encodes the WGC pool texture in
/// place) and uses no ring.
unsafe fn ensure_out_ring(
&mut self,
format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT,
) -> Result<()> {
if !self.out_ring.is_empty() {
return Ok(());
}
let desc = tex_desc(
self.width,
self.height,
format,
D3D11_BIND_RENDER_TARGET.0 as u32,
);
for _ in 0..OUT_RING {
let mut t = None;
self.device
.CreateTexture2D(&desc, None, Some(&mut t))
.context("CreateTexture2D(wgc out ring)")?;
self.out_ring.push(t.context("wgc out ring tex")?);
}
Ok(())
}
/// Convert `input` (the OS-composited WGC pool texture: BGRA or scRGB FP16) → NVENC's native YUV
/// (NV12 / P010) on the video processor. Returns the YUV texture (from a ring so consecutive
/// encodes don't collide), or `None` to fall back to the legacy RGB paths.
unsafe fn convert_to_yuv(
&mut self,
input: &ID3D11Texture2D,
hdr: bool,
) -> Option<ID3D11Texture2D> {
if self.vp_disabled {
return None;
}
if self.video_conv.is_none() || self.yuv_out.is_empty() || self.yuv_is_hdr != hdr {
self.video_conv = None;
self.yuv_out.clear();
self.yuv_idx = 0;
let vc = match VideoConverter::new(
&self.device,
&self.context,
self.width,
self.height,
hdr,
) {
Ok(vc) => vc,
Err(e) => {
tracing::warn!(error = %format!("{e:#}"),
"WGC: video processor unavailable — falling back to RGB path");
self.vp_disabled = true;
return None;
}
};
let fmt = if hdr {
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010
} else {
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12
};
let desc = tex_desc(
self.width,
self.height,
fmt,
D3D11_BIND_RENDER_TARGET.0 as u32,
);
for _ in 0..OUT_RING {
let mut t = None;
if self
.device
.CreateTexture2D(&desc, None, Some(&mut t))
.is_err()
{
tracing::warn!("WGC: CreateTexture2D(YUV) failed — falling back to RGB path");
self.vp_disabled = true;
self.yuv_out.clear();
return None;
}
let Some(tex) = t else {
self.vp_disabled = true;
self.yuv_out.clear();
return None;
};
self.yuv_out.push(tex);
}
self.video_conv = Some(vc);
self.yuv_is_hdr = hdr;
tracing::info!(
hdr,
"WGC: video-processor YUV path active ({})",
if hdr { "P010" } else { "NV12" }
);
}
let slot = self.yuv_idx;
self.yuv_idx = (self.yuv_idx + 1) % self.yuv_out.len();
let out = self.yuv_out[slot].clone();
if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) {
tracing::warn!(error = %format!("{e:#}"),
"WGC: VideoProcessorBlt failed — falling back to RGB path");
self.vp_disabled = true;
self.video_conv = None;
self.yuv_out.clear();
return None;
}
Some(out)
}
/// `PUNKTFUNK_HDR_SHADER_P010` path: convert the OS-composited FP16 scRGB capture DIRECTLY to a
/// host-owned P010 texture (BT.2020 PQ, 10-bit limited range) via [`HdrP010Converter`] — two
/// shader passes writing the P010 planes. NVENC then takes native P010 and skips its internal
/// RGB→YUV CSC. Returns the next ring slot's P010 texture, or `Err` if the converter / a planar
/// plane RTV fails (the caller latches `p010_disabled` and falls back to the R10 path).
unsafe fn hdr_to_p010(&mut self, src: &ID3D11Texture2D) -> Result<ID3D11Texture2D> {
let slot = self.p010_idx;
// Lazily allocate the FP16 source (shared with the R10 path) + the P010 output ring.
self.ensure_fp16_src()?;
let fp16 = self.fp16_src.clone().context("fp16 src")?;
self.context.CopyResource(&fp16, src);
if self.p010_out.is_empty() {
let desc = tex_desc(
self.width,
self.height,
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010,
D3D11_BIND_RENDER_TARGET.0 as u32,
);
for _ in 0..OUT_RING {
let mut t = None;
self.device
.CreateTexture2D(&desc, None, Some(&mut t))
.context("CreateTexture2D(wgc p010 ring)")?;
self.p010_out.push(t.context("wgc p010 ring tex")?);
}
}
self.p010_idx = (self.p010_idx + 1) % self.p010_out.len();
let out = self.p010_out[slot].clone();
if self.hdr_p010_conv.is_none() {
self.hdr_p010_conv = Some(HdrP010Converter::new(&self.device)?);
}
let srv = self.fp16_srv.clone().context("fp16 srv")?;
self.hdr_p010_conv.as_ref().unwrap().convert(
&self.device,
&self.context,
&srv,
&out,
self.width,
self.height,
)?;
Ok(out)
}
fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result<CapturedFrame> {
// SAFETY: runs on the capturer's single owning thread. `frame` is a live
// `Direct3D11CaptureFrame` from `self.pool`; `frame.Surface().cast::<IDirect3DDxgiInterfaceAccess
// >().GetInterface()` yields the frame's backing `ID3D11Texture2D`, which belongs to
// `self.device` (the pool was created on it via `CreateDirect3D11DeviceFromDXGIDevice`). Every
// helper called here — `hdr_to_p010`, `convert_to_yuv`, `ensure_fp16_src`, `ensure_out_ring`,
// `HdrConverter::convert`, `CopyResource`, `CreateRenderTargetView` — operates on
// `self.device`/`self.context` and that same-device texture, so all resources share one device.
// The frame is held in `self.held` until its async GPU read completes for the zero-copy paths.
// Single-threaded immediate-context use; borrowed textures/SRVs/RTVs outlive each synchronous call.
unsafe {
let surface = frame.Surface().context("frame Surface")?;
let access: IDirect3DDxgiInterfaceAccess = surface
.cast()
.context("surface as IDirect3DDxgiInterfaceAccess")?;
let src: ID3D11Texture2D = access
.GetInterface()
.context("GetInterface ID3D11Texture2D")?;
// GATED P010-shader path (`PUNKTFUNK_HDR_SHADER_P010`): for HDR, emit P010 (BT.2020 PQ
// 10-bit limited range) DIRECTLY from our shader so NVENC takes native P010 and skips its
// SM-side RGB→YUV CSC. Runs BEFORE the R10 + VideoProcessor path. A converter/plane-RTV
// failure latches `p010_disabled` → we fall through to the unchanged R10 path for the rest
// of the session. Default OFF → none of this executes and behaviour is byte-for-byte as
// today.
if self.hdr && !self.p010_disabled && hdr_shader_p010_enabled() {
match self.hdr_to_p010(&src) {
Ok(p010) => {
// The P010 output is host-owned (the ring), and the FP16 CopyResource read
// `src` synchronously on the immediate context before the shader passes — so we
// do NOT need to hold `frame` past here (unlike the SDR/R10 in-place paths).
// Dropping it returns the pool buffer to WGC immediately.
drop(frame);
self.last_present = Some((p010.clone(), PixelFormat::P010));
return Ok(self.d3d11_frame(p010, PixelFormat::P010));
}
Err(e) => {
tracing::warn!(error = %format!("{e:#}"),
"WGC: HDR P010 shader path failed — disabling it, falling back to R10");
self.p010_disabled = true;
self.hdr_p010_conv = None;
self.p010_out.clear();
}
}
}
// Preferred path: convert the OS-composited capture (cursor already in it) DIRECTLY to
// NVENC's native YUV on the video processor — no CopyResource, no cursor draw, and NVENC
// skips its internal RGB→YUV (the contended 3D step). WGC's multi-buffer pool + held set
// means reading the pool texture directly does NOT serialize (unlike DDA's single-frame
// model). The frame is held until the async Blt finishes. (HDR: the video processor can't
// ingest FP16 scRGB, so the Blt fails and we fall back to the R10 path below; the
// `PUNKTFUNK_HDR_SHADER_P010` branch above is the off-the-SM HDR path.)
if let Some(yuv) = self.convert_to_yuv(&src, self.hdr) {
let fmt = if self.hdr {
PixelFormat::P010
} else {
PixelFormat::Nv12
};
self.last_present = Some((yuv.clone(), fmt));
let out = self.d3d11_frame(yuv, fmt);
self.held.push_back(frame);
while self.held.len() > HELD_FRAMES {
self.held.pop_front();
}
return Ok(out);
}
// --- fallback (video processor unavailable) ---
if self.hdr {
// Next ring slot — the in-flight encode reads the slot we handed out last time, so
// this capture must land in a different one (see `out_ring`).
let slot = self.ring_idx;
self.ring_idx = (self.ring_idx + 1) % OUT_RING;
// FP16 (cursor already composited by the OS) → BT.2020 PQ 10-bit for NVENC.
self.ensure_fp16_src()?;
let fp16 = self.fp16_src.clone().context("fp16 src")?;
self.context.CopyResource(&fp16, &src);
self.ensure_out_ring(DXGI_FORMAT_R10G10B10A2_UNORM)?;
let out = self.out_ring[slot].clone();
if self.hdr_conv.is_none() {
self.hdr_conv = Some(HdrConverter::new(&self.device)?);
}
let srv = self.fp16_srv.clone().context("fp16 srv")?;
let mut rtv: Option<ID3D11RenderTargetView> = None;
self.device
.CreateRenderTargetView(&out, None, Some(&mut rtv))?;
let rtv = rtv.context("hdr10 rtv")?;
self.hdr_conv.as_ref().unwrap().convert(
&self.context,
&srv,
&rtv,
self.width,
self.height,
);
self.last_present = Some((out.clone(), PixelFormat::Rgb10a2));
Ok(self.d3d11_frame(out, PixelFormat::Rgb10a2))
} else {
// SDR ZERO-COPY: hand NVENC the WGC pool texture DIRECTLY — no `CopyResource`. The
// per-frame copy otherwise queues on the graphics engine behind a GPU-saturating game
// and stalls `lock_bitstream` ~20 ms (NVENC sits idle waiting for its input). Encoding
// the pool texture in place removes that graphics-queue dependency (Apollo's model).
// We must keep the frame alive until its async encode finishes, so retain the last
// `HELD_FRAMES`; the pool has spare buffers so the producer never starves.
self.last_present = Some((src.clone(), PixelFormat::Bgra));
let out = self.d3d11_frame(src, PixelFormat::Bgra);
self.held.push_back(frame);
while self.held.len() > HELD_FRAMES {
self.held.pop_front();
}
Ok(out)
}
}
}
fn d3d11_frame(&self, texture: ID3D11Texture2D, format: PixelFormat) -> CapturedFrame {
CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format,
payload: FramePayload::D3d11(D3d11Frame {
texture,
device: self.device.clone(),
}),
}
}
}
impl Capturer for WgcCapturer {
fn hdr_meta(&self) -> Option<punktfunk_core::quic::HdrMeta> {
self.hdr_meta
}
fn next_frame(&mut self) -> Result<CapturedFrame> {
let overall = Instant::now() + Duration::from_secs(20);
loop {
if let Some(frame) = self.wait_and_drain() {
self.first_frame = false;
return self.process_frame(frame);
}
// No new frame within the wait — repeat the last presented frame (static desktop).
if let Some((tex, fmt)) = &self.last_present {
return Ok(self.d3d11_frame(tex.clone(), *fmt));
}
if Instant::now() > overall {
bail!("no WGC frame within 20s (SudoVDA monitor not lit / no capture access?)");
}
}
}
fn try_latest(&mut self) -> Result<Option<CapturedFrame>> {
let target = self.signal.available.load(Ordering::Acquire);
if target <= self.consumed {
return Ok(None);
}
let mut last = None;
while self.consumed < target {
if let Ok(f) = self.pool.TryGetNextFrame() {
last = Some(f);
}
self.consumed += 1;
}
match last {
Some(frame) => self.process_frame(frame).map(Some),
None => Ok(None),
}
}
// set_active: the trait default (no-op) is correct — WGC keeps its session running across the
// active/idle gate (cheap; the frame pool just recycles), like the DDA duplication.
}
impl Drop for WgcCapturer {
fn drop(&mut self) {
let _ = self.session.Close();
let _ = self.pool.Close();
// _keepalive drops after, REMOVEing the SudoVDA monitor.
}
}
fn tex_desc(
width: u32,
height: u32,
format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT,
bind: u32,
) -> D3D11_TEXTURE2D_DESC {
D3D11_TEXTURE2D_DESC {
Width: width,
Height: height,
MipLevels: 1,
ArraySize: 1,
Format: format,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
Usage: D3D11_USAGE_DEFAULT,
BindFlags: bind,
CPUAccessFlags: 0,
MiscFlags: 0,
}
}
fn now_ns() -> u64 {
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_nanos() as u64)
.unwrap_or(0)
}
@@ -1,484 +0,0 @@
//! Host-side WGC helper relay (Windows two-process secure-desktop design,
//! design/archive/windows-secure-desktop.md — step 4).
//!
//! WGC won't activate under the SYSTEM account, so the SYSTEM host can't capture the normal desktop
//! itself. Instead it spawns `punktfunk-host wgc-helper` in the **interactive user session** (so WGC works)
//! via `CreateProcessAsUserW`, with the helper's **stdout** redirected to an anonymous pipe the host
//! reads. The helper ships framed Annex-B access units; this module parses them back into AUs the
//! host relays onto the live QUIC session (same `EncodedFrame` flow, just sourced over a pipe instead
//! of a local encoder). A second pipe carries a tiny **control** channel to the helper (stdin: force
//! keyframe), and the helper's **stderr** is forwarded line-by-line into host tracing so its logs are
//! visible from the SYSTEM host's console.
//!
//! Wire framing (must match `wgc_helper::write_au`): per AU
//! `[u32 magic "PFAU" LE][u32 len LE][u64 pts_ns LE][u8 keyframe][len bytes data]`.
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use crate::capture::dxgi::WinCaptureTarget;
use anyhow::{bail, Context, Result};
use std::io::{BufRead, BufReader, Read};
use std::sync::mpsc::{Receiver, SyncSender};
use std::sync::Mutex;
use windows::core::PWSTR;
use windows::Win32::Foundation::SetHandleInformation;
use windows::Win32::Foundation::{CloseHandle, HANDLE};
use windows::Win32::Foundation::{HANDLE_FLAGS, HANDLE_FLAG_INHERIT};
use windows::Win32::Security::{
DuplicateTokenEx, SecurityImpersonation, TokenPrimary, SECURITY_ATTRIBUTES, TOKEN_ALL_ACCESS,
};
use windows::Win32::System::Environment::{CreateEnvironmentBlock, DestroyEnvironmentBlock};
use windows::Win32::System::Pipes::CreatePipe;
use windows::Win32::System::RemoteDesktop::{WTSGetActiveConsoleSessionId, WTSQueryUserToken};
use windows::Win32::System::Threading::{
CreateProcessAsUserW, TerminateProcess, CREATE_NO_WINDOW, CREATE_UNICODE_ENVIRONMENT,
PROCESS_INFORMATION, STARTF_USESTDHANDLES, STARTUPINFOW,
};
/// Must match [`crate::wgc_helper`]'s `AU_MAGIC` ("PFAU").
const AU_MAGIC: u32 = 0x5046_4155;
/// One access unit relayed from the helper, in the helper's (= the host's, same machine) monotonic
/// clock — `pts_ns` is directly comparable to the host's `now_ns()`.
pub struct RelayAu {
pub data: Vec<u8>,
pub pts_ns: u64,
pub keyframe: bool,
}
/// A running USER-session WGC helper whose AUs the SYSTEM host relays. Drop kills the child + closes
/// the pipes; the reader threads then end on the broken pipe.
pub struct HelperRelay {
proc: HANDLE,
thread: HANDLE,
/// Host write end of the helper's stdin — control commands (force keyframe). Mutex so the relay
/// can be shared while the encode thread requests keyframes.
stdin_w: Mutex<HANDLE>,
/// Parsed AUs from the helper's stdout reader thread.
rx: Receiver<RelayAu>,
}
// SAFETY: every field is itself `Send`: the `proc`/`thread` `HANDLE`s are process-global kernel
// handle values (plain integers valid from any thread, owned for the relay's lifetime and closed once
// on Drop), `stdin_w` is a `Mutex<HANDLE>`, and `rx` is an mpsc `Receiver<RelayAu>` (which is `Send`).
// The relay is moved to one thread and owned there, so transferring it across threads is sound.
unsafe impl Send for HelperRelay {}
// NOTE: `HelperRelay` is deliberately NOT `Sync`. Its `rx: Receiver<RelayAu>` is `!Sync` (std mpsc
// is single-consumer), and the relay is only ever a single-owner local in the punktfunk1 two-process
// mux loop — never shared by `&` across threads — so `Sync` is neither sound nor needed. (A prior
// `unsafe impl Sync` here asserted more than the fields support; removed.)
/// Control byte on the helper's stdin: force the next encoded frame to be an IDR (client decode
/// recovery). Mirrors `enc.request_keyframe()` in the single-process path.
const CTL_KEYFRAME: u8 = 0x01;
impl HelperRelay {
/// Spawn the helper in the interactive user session and start relaying its AUs. `target` is the
/// SudoVDA output the host already created (captured by GDI name only — the helper never touches
/// display topology). `(w, h, hz)` is the negotiated mode; `bitrate_kbps` the negotiated bitrate.
pub fn spawn(
target: &WinCaptureTarget,
mode: (u32, u32, u32),
bitrate_kbps: u32,
bit_depth: u8,
) -> Result<HelperRelay> {
let exe = std::env::current_exe().context("current_exe for helper spawn")?;
let exe = exe.to_string_lossy().into_owned();
let (w, h, hz) = mode;
// CreateProcessAsUserW takes a single mutable command line (argv[0] = exe).
let cmdline = format!(
"\"{exe}\" wgc-helper --gdi \"{}\" --target-id {} --mode {w}x{h}x{hz} --bitrate {bitrate_kbps} --bit-depth {bit_depth}",
target.gdi_name, target.target_id
);
tracing::info!(cmd = %cmdline, "spawning WGC helper in user session");
// SAFETY: `spawn_inner` is an `unsafe fn` only because it drives raw Win32 token/pipe/process
// FFI; it imposes no caller-side memory precondition beyond valid arguments. `cmdline` is a live
// `&str` borrowed for the synchronous call and `(w, h, hz)` are plain `u32`s. It validates its
// own runtime requirements (active console session, SYSTEM token) and returns `Err` otherwise.
unsafe { spawn_inner(&cmdline, w, h, hz) }
}
/// Receive the next relayed AU. Distinguishes a `Timeout` (helper slow/stalled — keep waiting)
/// from `Disconnected` (helper exited → its stdout closed → reader thread ended → channel
/// dropped), which returns *immediately* and means the relay must stop, not spin.
pub fn recv_timeout(
&self,
dur: std::time::Duration,
) -> Result<RelayAu, std::sync::mpsc::RecvTimeoutError> {
self.rx.recv_timeout(dur)
}
/// Non-blocking receive — used to drain stale buffered AUs (encoded while the secure desktop was
/// the live source) before resuming the relay. `Ok` while AUs remain, `Err` once empty.
pub fn try_recv(&self) -> Result<RelayAu, std::sync::mpsc::TryRecvError> {
self.rx.try_recv()
}
/// Ask the helper's encoder for an IDR on the next frame (client decode recovery). Best-effort:
/// a write failure means the helper is gone — the caller's recv loop will see the disconnect.
pub fn request_keyframe(&self) {
let h = self.stdin_w.lock().unwrap();
let mut written = 0u32;
// SAFETY: `*h` is the host's write end of the helper's stdin pipe — a live `HANDLE` owned by
// this `HelperRelay` (held under the `stdin_w` Mutex, locked here), closed only in Drop.
// `WriteFile` reads the 1-byte `&[CTL_KEYFRAME]` buffer and writes the byte count into
// `written`; both are live locals that outlive the synchronous call. A failure (helper gone) is
// discarded as documented.
unsafe {
let _ = windows::Win32::Storage::FileSystem::WriteFile(
*h,
Some(&[CTL_KEYFRAME]),
Some(&mut written),
None,
);
}
}
}
impl Drop for HelperRelay {
fn drop(&mut self) {
// SAFETY: `self.proc`/`self.thread` are the child process/thread `HANDLE`s from
// `CreateProcessAsUserW`, and `stdin_w` is the host's pipe write end — all owned by this
// `HelperRelay` and closed exactly once here in Drop (no double-close). `TerminateProcess` and
// the three `CloseHandle`s are FFI calls taking those handles by value, borrowing no Rust memory.
unsafe {
// Terminate the child first so its WGC capture + NVENC session tear down, then close our
// handles (the reader threads end on the resulting broken pipe).
let _ = TerminateProcess(self.proc, 1);
let _ = CloseHandle(*self.stdin_w.lock().unwrap());
let _ = CloseHandle(self.proc);
let _ = CloseHandle(self.thread);
}
tracing::info!("WGC helper relay torn down");
}
}
/// Inheritable anonymous pipe (read, write). The caller marks whichever end the host keeps as
/// non-inheritable so the child only inherits its own end.
unsafe fn make_pipe() -> Result<(HANDLE, HANDLE)> {
let mut read = HANDLE::default();
let mut write = HANDLE::default();
let sa = SECURITY_ATTRIBUTES {
nLength: std::mem::size_of::<SECURITY_ATTRIBUTES>() as u32,
lpSecurityDescriptor: std::ptr::null_mut(),
bInheritHandle: true.into(),
};
CreatePipe(&mut read, &mut write, Some(&sa), 0).context("CreatePipe")?;
Ok((read, write))
}
/// Mark a handle non-inheritable (the host keeps it; the child must not get a copy).
unsafe fn no_inherit(h: HANDLE) {
let _ = SetHandleInformation(h, HANDLE_FLAG_INHERIT.0, HANDLE_FLAGS(0));
}
/// Build a child environment block: the target session's block (so DLL/PATH/SystemRoot resolve) with
/// this process's `PUNKTFUNK_*` vars overlaid, so the child runs with the SAME settings this process
/// has (`PUNKTFUNK_ENCODER=nvenc`, `PUNKTFUNK_ZEROCOPY`, …) instead of the target shell's. Returns a
/// UTF-16, double-null-terminated block suitable for `CREATE_UNICODE_ENVIRONMENT`. Shared by the WGC
/// helper spawn (here) and the Windows service launching the host into the active session.
pub(crate) unsafe fn merged_env_block(user_block: *const u16) -> Vec<u16> {
// Parse the user block ("VAR=VALUE\0" … "\0") into entries.
let mut entries: Vec<String> = Vec::new();
if !user_block.is_null() {
let mut p = user_block;
loop {
let mut len = 0isize;
while *p.offset(len) != 0 {
len += 1;
}
if len == 0 {
break; // the trailing empty string = end of block
}
let slice = std::slice::from_raw_parts(p, len as usize);
entries.push(String::from_utf16_lossy(slice));
p = p.offset(len + 1);
}
}
// Overlay "our" settings — PUNKTFUNK_* and RUST_LOG — dropping whatever the target block had.
let is_ours = |k: &str| k.starts_with("PUNKTFUNK_") || k == "RUST_LOG";
entries.retain(|e| !is_ours(e.split('=').next().unwrap_or("")));
for (k, v) in std::env::vars().filter(|(k, _)| is_ours(k)) {
entries.push(format!("{k}={v}"));
}
// Serialize back to a UTF-16 double-null-terminated block.
let mut block: Vec<u16> = Vec::new();
for e in entries {
block.extend(e.encode_utf16());
block.push(0);
}
block.push(0);
block
}
unsafe fn spawn_inner(cmdline: &str, w: u32, h: u32, hz: u32) -> Result<HelperRelay> {
// The user token of the active console session (requires the host to be SYSTEM).
let session = WTSGetActiveConsoleSessionId();
if session == 0xFFFF_FFFF {
bail!("no active console session (WTSGetActiveConsoleSessionId)");
}
let mut user_token = HANDLE::default();
WTSQueryUserToken(session, &mut user_token)
.context("WTSQueryUserToken (host must run as SYSTEM)")?;
// A primary token for CreateProcessAsUserW.
let mut primary = HANDLE::default();
let dup = DuplicateTokenEx(
user_token,
TOKEN_ALL_ACCESS,
None,
SecurityImpersonation,
TokenPrimary,
&mut primary,
);
let _ = CloseHandle(user_token);
dup.context("DuplicateTokenEx(TokenPrimary)")?;
// The user's environment block (PATH, USERPROFILE, SystemRoot → DLL resolution), MERGED with the
// host's PUNKTFUNK_* vars. CreateProcessAsUserW would otherwise give the helper the *user's* env
// only, dropping PUNKTFUNK_ENCODER=nvenc / PUNKTFUNK_ZEROCOPY/… that the host runs with — so the
// helper would fall back to the software (H.264-only) encoder. We parse the user block, strip any
// PUNKTFUNK_* it has, append the host's, and pass the merged block.
let mut env_block: *mut core::ffi::c_void = std::ptr::null_mut();
let _ = CreateEnvironmentBlock(&mut env_block, Some(primary), false);
let merged_env = merged_env_block(env_block as *const u16);
if !env_block.is_null() {
let _ = DestroyEnvironmentBlock(env_block);
}
// Three pipes: stdout (helper→host AUs), stdin (host→helper control), stderr (helper→host logs).
let (out_r, out_w) = make_pipe().context("stdout pipe")?;
let (in_r, in_w) = make_pipe().context("stdin pipe")?;
let (err_r, err_w) = make_pipe().context("stderr pipe")?;
// The host keeps out_r / in_w / err_r — none inheritable; the child inherits out_w/in_r/err_w.
no_inherit(out_r);
no_inherit(in_w);
no_inherit(err_r);
let mut si = STARTUPINFOW {
cb: std::mem::size_of::<STARTUPINFOW>() as u32,
dwFlags: STARTF_USESTDHANDLES,
hStdInput: in_r,
hStdOutput: out_w,
hStdError: err_w,
..Default::default()
};
// WGC needs the interactive desktop.
let mut desktop: Vec<u16> = "winsta0\\default\0".encode_utf16().collect();
si.lpDesktop = PWSTR(desktop.as_mut_ptr());
let mut cmd: Vec<u16> = cmdline.encode_utf16().chain(std::iter::once(0)).collect();
let mut pi = PROCESS_INFORMATION::default();
let created = CreateProcessAsUserW(
Some(primary),
None,
Some(PWSTR(cmd.as_mut_ptr())),
None,
None,
true, // inherit handles (the child's std ends)
CREATE_UNICODE_ENVIRONMENT | CREATE_NO_WINDOW,
Some(merged_env.as_ptr() as *const core::ffi::c_void),
None,
&si,
&mut pi,
);
// Clean up regardless of outcome: the child now owns its inherited ends; close our copies.
let _ = CloseHandle(out_w);
let _ = CloseHandle(in_r);
let _ = CloseHandle(err_w);
let _ = CloseHandle(primary);
if let Err(e) = created {
let _ = CloseHandle(out_r);
let _ = CloseHandle(in_w);
let _ = CloseHandle(err_r);
return Err(e).context("CreateProcessAsUserW(wgc-helper)");
}
tracing::info!(pid = pi.dwProcessId, mode = %format!("{w}x{h}@{hz}"), "WGC helper spawned");
// The helper does the WGC capture + NVENC encode, but it runs under the user's UAC-FILTERED token
// (no SE_INC_BASE_PRIORITY), so it can't raise its OWN GPU scheduling-priority class — under a
// GPU-saturating game NVENC then gets starved (the "240→40 fps in-game collapse"). The SYSTEM host
// holds the privilege, so stamp the HIGH GPU priority class onto the child here, right after spawn
// (the process-level class applies to the GPU contexts the helper creates afterwards).
crate::capture::dxgi::set_child_gpu_priority_class(pi.hProcess);
// stderr → host tracing, line by line.
let err_handle = HandleReader(err_r);
std::thread::Builder::new()
.name("wgc-helper-log".into())
.spawn(move || {
let r = BufReader::new(err_handle);
for line in r.lines() {
match line {
Ok(l) if !l.trim().is_empty() => tracing::info!(target: "wgc_helper", "{l}"),
Ok(_) => {}
Err(_) => break,
}
}
})
.ok();
// stdout → parsed AUs. Bounded so a stalled relay applies backpressure (the pipe then fills and
// the helper blocks on write — the same backpressure the single-process channel gives).
let (tx, rx) = std::sync::mpsc::sync_channel::<RelayAu>(3);
let out_handle = HandleReader(out_r);
std::thread::Builder::new()
.name("wgc-helper-au".into())
.spawn(move || au_reader(out_handle, tx))
.ok();
Ok(HelperRelay {
proc: pi.hProcess,
thread: pi.hThread,
stdin_w: Mutex::new(in_w),
rx,
})
}
/// Parse the AU framing off the helper's stdout and forward each AU. Ends (returns) when the pipe
/// breaks (helper exit) or the channel's receiver is dropped (relay torn down).
fn au_reader(mut r: HandleReader, tx: SyncSender<RelayAu>) {
loop {
let mut hdr = [0u8; 4 + 4 + 8 + 1];
if r.read_exact(&mut hdr).is_err() {
break;
}
let magic = u32::from_le_bytes([hdr[0], hdr[1], hdr[2], hdr[3]]);
if magic != AU_MAGIC {
tracing::error!(
magic = format!("{magic:#x}"),
"WGC helper AU stream desync — aborting relay"
);
break;
}
let len = u32::from_le_bytes([hdr[4], hdr[5], hdr[6], hdr[7]]) as usize;
let pts_ns = u64::from_le_bytes([
hdr[8], hdr[9], hdr[10], hdr[11], hdr[12], hdr[13], hdr[14], hdr[15],
]);
let keyframe = hdr[16] != 0;
// Bound the allocation — a corrupt length must not OOM the host. 64 MiB is far above any real
// AU (a 5K keyframe is a few MB).
if len > 64 * 1024 * 1024 {
tracing::error!(len, "WGC helper AU length implausible — aborting relay");
break;
}
let mut data = vec![0u8; len];
if r.read_exact(&mut data).is_err() {
break;
}
if tx
.send(RelayAu {
data,
pts_ns,
keyframe,
})
.is_err()
{
break; // relay dropped
}
}
}
/// Minimal `Read` over a Win32 pipe HANDLE (the windows crate doesn't impl `Read` on HANDLE).
struct HandleReader(HANDLE);
// SAFETY: `HandleReader` owns a single pipe `HANDLE` (a process-global kernel handle value, valid from
// any thread). It is moved into the dedicated reader thread and used only there (and closed once on
// Drop), never shared — so transferring ownership across threads is sound.
unsafe impl Send for HandleReader {}
impl Read for HandleReader {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
let mut read = 0u32;
// SAFETY: `self.0` is the live read end of an anonymous pipe owned by this `HandleReader`
// (closed only in Drop). `ReadFile` fills the caller-provided `buf` (writing at most `buf.len()`
// bytes) and stores the count in `read`; both outlive the synchronous call. A broken pipe
// surfaces as `Err` and is mapped to EOF below.
let ok = unsafe {
windows::Win32::Storage::FileSystem::ReadFile(self.0, Some(buf), Some(&mut read), None)
};
match ok {
Ok(()) => Ok(read as usize),
// A broken pipe (helper exited) reads as ERROR_BROKEN_PIPE → report EOF (0).
Err(_) => Ok(0),
}
}
}
impl Drop for HandleReader {
fn drop(&mut self) {
// SAFETY: `self.0` is the pipe `HANDLE` this `HandleReader` owns; `CloseHandle` (an FFI call
// taking the handle by value) is invoked exactly once here in Drop, so there is no double-close.
unsafe {
let _ = CloseHandle(self.0);
}
}
}
/// Is this process running as the LOCAL SYSTEM account? Used to decide whether the two-process
/// secure-desktop path applies (only SYSTEM can `WTSQueryUserToken` + capture the Winlogon desktop).
pub fn running_as_system() -> bool {
use windows::Win32::Security::{GetTokenInformation, TokenUser, TOKEN_QUERY, TOKEN_USER};
use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};
// SAFETY: `OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &mut token)` opens the current-process
// token (the pseudo-handle is always valid) into `token`, which is closed once before each return.
// The first `GetTokenInformation` (null buffer) queries the required `len`; `buf` is then a
// `Vec<u8>` of exactly `len` bytes and the second call fills it, so `&*(buf.as_ptr() as *const
// TOKEN_USER)` reads a `TOKEN_USER` the kernel just wrote into a sufficiently-sized buffer (the
// variable-length SID it points at also lies within `buf`, which outlives the borrow).
// `is_local_system_sid` is this module's `unsafe fn`, given that in-buffer `PSID`. Safe on any thread.
unsafe {
let mut token = HANDLE::default();
if OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &mut token).is_err() {
return false;
}
let mut len = 0u32;
let _ = GetTokenInformation(token, TokenUser, None, 0, &mut len);
if len == 0 {
let _ = CloseHandle(token);
return false;
}
let mut buf = vec![0u8; len as usize];
let ok = GetTokenInformation(
token,
TokenUser,
Some(buf.as_mut_ptr() as *mut _),
len,
&mut len,
)
.is_ok();
let _ = CloseHandle(token);
if !ok {
return false;
}
let tu = &*(buf.as_ptr() as *const TOKEN_USER);
// The well-known LocalSystem SID is S-1-5-18.
is_local_system_sid(tu.User.Sid)
}
}
/// True iff `sid` is S-1-5-18 (LocalSystem).
unsafe fn is_local_system_sid(sid: windows::Win32::Security::PSID) -> bool {
use windows::Win32::Security::{
GetSidIdentifierAuthority, GetSidSubAuthority, GetSidSubAuthorityCount, IsValidSid,
};
if !IsValidSid(sid).as_bool() {
return false;
}
let auth = GetSidIdentifierAuthority(sid);
if auth.is_null() {
return false;
}
// NT Authority = {0,0,0,0,0,5}.
let a = (*auth).Value;
if a != [0, 0, 0, 0, 0, 5] {
return false;
}
let count = *GetSidSubAuthorityCount(sid);
if count != 1 {
return false;
}
*GetSidSubAuthority(sid, 0) == 18 // SECURITY_LOCAL_SYSTEM_RID
}
+7 -24
View File
@@ -6,8 +6,8 @@
//!
//! **Goal-1 stages 12** (`design/windows-host-rewrite.md` §2.2): stage 1 stood this up; stage 2 migrated the
//! genuinely-constant operator/dispatch knobs onto it (the dispatch-disagreement bug class: `idd_push`,
//! `capture_backend`, `encoder_pref`, `render_adapter`, `no_wgc`, the vdisplay backend select — plus the
//! plan-named `secure_dda`/`idd_depth`/`zerocopy`/`ten_bit`/`four_four_four` and the multi-site `perf`/`compositor`/
//! `encoder_pref`, `render_adapter`, the vdisplay backend select — plus the plan-named
//! `idd_depth`/`zerocopy`/`ten_bit`/`four_four_four` and the multi-site `perf`/`compositor`/
//! `video_source`/`gamepad`). `SessionPlan` (stage 3) consumes it as the single owner of the
//! capture/topology/encoder decision.
//!
@@ -36,27 +36,17 @@ use std::sync::OnceLock;
/// derived `Debug` impl, so the parser can stay a single platform-neutral function.
#[derive(Debug, Clone, Default)]
pub struct HostConfig {
/// `PUNKTFUNK_IDD_PUSH` — capture from the pf-vdisplay driver's shared ring (in-process Session-0
/// capture; no WGC helper). **Value-aware** (`0`/`false`/`no`/`off`/empty ⇒ off, else on); unset ⇒ off.
/// The installer's default `host.env` sets it on, so a fresh install runs the validated IDD-push path
/// (it falls back to DDA if the driver can't attach — see [`crate::capture`]). NOT a bare presence flag
/// (so an operator can turn it OFF in `host.env` with `=0`, which a `var_os` presence check can't).
/// `PUNKTFUNK_IDD_PUSH` — IDD direct-push monitor mode (the per-session monitor + ring recreate and
/// the discrete-render-GPU pin in [`crate::vdisplay::manager`]). IDD-push is the sole Windows capture
/// path (DXGI Desktop Duplication and the WGC relay were removed), so this should stay on — the
/// installer's `host.env` sets it. **Value-aware** (`0`/`false`/`no`/`off`/empty ⇒ off, else on);
/// unset ⇒ off. NOT a bare presence flag (so an operator can turn it OFF with `=0`).
pub idd_push: bool,
/// `PUNKTFUNK_ENCODER` — explicit encoder-backend override (lowercased; empty = auto-detect by GPU vendor).
pub encoder_pref: String,
/// `PUNKTFUNK_NO_HELPER` — never spawn the user-session WGC helper.
pub no_helper: bool,
/// `PUNKTFUNK_FORCE_HELPER` — force the WGC helper even when not running as SYSTEM.
pub force_helper: bool,
/// `PUNKTFUNK_NO_WGC` — force the pure single-process DDA path (skip WGC and the two-process relay).
pub no_wgc: bool,
/// `PUNKTFUNK_CAPTURE` — explicit Windows capture-backend override (lowercased; `dda`/`dxgi` vs the WGC default).
pub capture_backend: String,
/// `PUNKTFUNK_RENDER_ADAPTER` — discrete render-GPU pin by description substring (`Some` even when empty:
/// the empty string still counts as "set" for the presence checks, and the value reader filters it).
pub render_adapter: Option<String>,
/// `PUNKTFUNK_SECURE_DDA` — enable the experimental DDA-on-secure-desktop (Winlogon/UAC) mux leg.
pub secure_dda: bool,
/// `PUNKTFUNK_IDD_DEPTH` — IDD-push pipeline depth override (default 2; the call site clamps to its `OUT_RING`).
pub idd_depth: usize,
/// `PUNKTFUNK_ZEROCOPY` — opt into the Windows D3D11 zero-copy encode path (presence semantics; see module docs).
@@ -103,14 +93,7 @@ impl HostConfig {
encoder_pref: std::env::var("PUNKTFUNK_ENCODER")
.unwrap_or_default()
.to_ascii_lowercase(),
no_helper: flag("PUNKTFUNK_NO_HELPER"),
force_helper: flag("PUNKTFUNK_FORCE_HELPER"),
no_wgc: flag("PUNKTFUNK_NO_WGC"),
capture_backend: std::env::var("PUNKTFUNK_CAPTURE")
.unwrap_or_default()
.to_ascii_lowercase(),
render_adapter: val("PUNKTFUNK_RENDER_ADAPTER"),
secure_dda: flag("PUNKTFUNK_SECURE_DDA"),
idd_depth: val("PUNKTFUNK_IDD_DEPTH")
.and_then(|s| s.parse::<usize>().ok())
.unwrap_or(2),
-32
View File
@@ -56,9 +56,6 @@ mod spike;
mod stats_recorder;
mod vdisplay;
#[cfg(target_os = "windows")]
#[path = "windows/wgc_helper.rs"]
mod wgc_helper;
#[cfg(target_os = "windows")]
#[path = "windows/win_adapter.rs"]
mod win_adapter;
#[cfg(target_os = "windows")]
@@ -392,35 +389,6 @@ fn real_main() -> Result<()> {
paired_store: None,
})
}
// USER-session WGC helper (Windows two-process secure-desktop design): capture the EXISTING
// SudoVDA via WGC + NVENC, stream AUs on stdout to the SYSTEM host. Spawned by the host
// (CreateProcessAsUser), not run by hand. See design/archive/windows-secure-desktop.md.
#[cfg(target_os = "windows")]
Some("wgc-helper") => {
let get = |flag: &str| {
args.iter()
.skip_while(|a| *a != flag)
.nth(1)
.map(String::as_str)
};
let (width, height, fps) = get("--mode")
.and_then(|m| {
let p: Vec<u32> = m.split('x').filter_map(|s| s.parse().ok()).collect();
(p.len() == 3).then(|| (p[0], p[1], p[2]))
})
.unwrap_or((1920, 1080, 60));
wgc_helper::run(wgc_helper::HelperOptions {
target_id: get("--target-id").and_then(|s| s.parse().ok()).unwrap_or(0),
gdi_name: get("--gdi").unwrap_or("").to_string(),
width,
height,
fps,
bitrate_kbps: get("--bitrate")
.and_then(|s| s.parse().ok())
.unwrap_or(20000),
bit_depth: get("--bit-depth").and_then(|s| s.parse().ok()).unwrap_or(8),
})
}
// Windows service control: install/uninstall/start/stop/status + the SCM `run` entry point.
// Replaces the ad-hoc launch chain — `service install` registers an auto-start SYSTEM service
// that launches the host into the active interactive session.
+13 -503
View File
@@ -755,14 +755,18 @@ async fn serve_session(
// opens a tiny encoder; it runs only when both opt-ins are set and is cached after the first.
let host_wants_444 = crate::config::config().four_four_four;
let client_supports_444 = hello.video_caps & punktfunk_core::quic::VIDEO_CAP_444 != 0;
let single_process = crate::session_plan::resolve_topology()
== crate::session_plan::SessionTopology::SingleProcess;
// The active capturer must be able to deliver a full-chroma (RGB) source — the honest-downgrade
// gate. Linux's portal capturer can; the Windows IDD-push path delivers subsampled NV12/P010
// today (full-chroma IDD-push capture is a follow-up), so it returns false there and the host
// negotiates 4:2:0. (Replaces the old `single_process` gate — single-process is now the only
// topology, and 4:4:4 routed to DDA, which was removed.)
let capture_supports_444 = crate::capture::capturer_supports_444();
// The GPU probe opens a real (tiny) encoder on first use, so run it off the reactor like the
// compositor probe above (blocking probes → spawn_blocking). Short-circuit so it only runs when
// the cheap gates already pass. The result is cached process-wide (a negative latches until
// restart — acceptable: a GPU either supports HEVC 4:4:4 or it doesn't, and a transient open
// failure here is rare since the session's own encoder isn't open yet).
let gpu_supports_444 = if host_wants_444 && client_supports_444 && single_process {
let gpu_supports_444 = if host_wants_444 && client_supports_444 && capture_supports_444 {
tokio::task::spawn_blocking(|| {
crate::encode::can_encode_444(crate::encode::Codec::H265)
})
@@ -780,7 +784,7 @@ async fn serve_session(
chroma = ?chroma,
host_wants_444,
client_supports_444,
single_process,
capture_supports_444,
"encode chroma"
);
@@ -2696,7 +2700,7 @@ fn session_watcher_loop(tx: std::sync::mpsc::Sender<SessionSwitch>, stop: Arc<At
}
}
/// All per-session inputs for [`virtual_stream`] / [`virtual_stream_relay`], bundled so the session entry
/// All per-session inputs for [`virtual_stream`], bundled so the session entry
/// is one moved value instead of a 13-positional-argument `#[allow(too_many_arguments)]` signature
/// (Goal-1 stage 4, plan §2.4). Everything is **owned** — the receivers move in (`virtual_stream` is their
/// only consumer) — so the whole context moves into the stream thread and the borrow plumbing disappears.
@@ -2744,8 +2748,9 @@ struct SessionContext {
}
fn virtual_stream(ctx: SessionContext) -> Result<()> {
// This thread runs the capture+encode loop (single-process: Linux / synthetic / NO_WGC DDA) — or
// tail-calls the relay below. Elevate it so a CPU-heavy game can't deschedule our GPU submission.
// This thread runs the capture+encode loop (single-process — the only topology: Linux portal /
// synthetic, Windows in-process IDD-push). Elevate it so a CPU-heavy game can't deschedule our GPU
// submission.
boost_thread_priority(true);
// Resolve the per-session capture / topology / encoder decision ONCE (Goal-1 stage 3): the deployed
// path now reads this typed `SessionPlan` instead of re-deriving from config at each dispatch site
@@ -2753,14 +2758,6 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
// only per-session input — capture/topology/encoder are otherwise pure functions of `HostConfig`.
let plan = crate::session_plan::SessionPlan::resolve(ctx.bit_depth, ctx.chroma);
tracing::info!(?plan, "resolved session plan");
// Windows two-process secure-desktop path: when the host runs as SYSTEM (required for the secure
// desktop + SendInput), WGC can't activate in-process, so we capture the normal desktop via a
// helper spawned in the user session and relay its AUs. (Single-process WGC/DDA is used as the
// user, and stays the path on Linux.) See design/archive/windows-secure-desktop.md.
#[cfg(target_os = "windows")]
if plan.topology == crate::session_plan::SessionTopology::TwoProcessRelay {
return virtual_stream_relay(ctx);
}
// Single-process path: unpack the context into the locals the loop below uses (names unchanged, so the
// body is byte-for-byte the same; the receivers are now owned but `try_recv()` is identical).
let SessionContext {
@@ -2810,20 +2807,7 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
#[cfg(target_os = "windows")]
drop(_idd_setup_guard);
// Windows single-process DDA path (PUNKTFUNK_NO_WGC=1): the SudoVDA virtual display, isolated as the
// SOLE active output, goes into fullscreen independent-flip (one plane on one display) which Desktop
// Duplication cannot capture → the born-lost ACCESS_LOST storm we measured on the RTX4090+iGPU box
// (hook verified-firing, DPI=2, yet 100% DuplicateOutput1 E_ACCESSDENIED + born-lost). A tiny topmost
// layered overlay disqualifies independent-flip and forces DWM composition, which DDA CAN capture.
// (Apollo never hits this because it runs WITH a physical monitor attached — multi-display is already
// DWM-composited; we isolate to sole-display, so we must force composition ourselves.) Unlike the WGC
// relay path — where WGC owns the normal desktop and the overlay is secure-only — here DDA owns the
// normal desktop too, so it must run unconditionally. Held for the session; Drop tears it down.
// Best-effort; disable with PUNKTFUNK_FORCE_COMPOSED=0.
#[cfg(target_os = "windows")]
let _composed_flip = crate::capture::composed_flip::ForceComposedFlip::start();
// Windows: capture is live (and composition forced) — launch the requested library title into the
// Windows: capture is live — launch the requested library title into the
// interactive user session so it renders onto the captured desktop and grabs foreground. Linux
// nests its launch in gamescope instead (the handshake `PUNKTFUNK_GAMESCOPE_APP` path). Best-effort:
// a launch failure (no recipe for the kind, no interactive user) leaves the user on the desktop.
@@ -3295,480 +3279,6 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
Ok(())
}
/// Windows two-process video stream: the SYSTEM host creates the SudoVDA virtual output (and holds
/// its keepalive = the sole topology/isolation owner), spawns the WGC helper in the user session to
/// capture+encode the NORMAL desktop, and relays the helper's AUs onto the QUIC data plane via the
/// same send thread as the single-process path. A [`DesktopWatcher`](crate::capture::desktop_watch)
/// muxes the source: while the input desktop is Winlogon (UAC / lock / login — which WGC can't
/// capture), the host captures it with its OWN DDA encoder; back on Default it resumes the relay.
/// Every source switch latches a "wait for IDR" so the client's decoder resumes on a keyframe (the
/// two encoders keep independent infinite-GOP state). Reconfigure rebuilds the output + re-spawns the
/// helper at the new mode (and drops the stale-target DDA); keyframe requests forward to the active
/// source.
#[cfg(target_os = "windows")]
fn virtual_stream_relay(ctx: SessionContext) -> Result<()> {
use crate::capture::dxgi::WinCaptureTarget;
use crate::capture::wgc_relay::HelperRelay;
use crate::capture::Capturer; // trait methods (set_active/next_frame) on the concrete DuplCapturer
// Unpack the context (names unchanged so the body is identical). The relay doesn't yet send the
// source's 0xCE HDR metadata — the helper's in-band SEI carries it (a Windows follow-up) — so `conn`
// is held unused.
let SessionContext {
session,
mode,
seconds,
stop,
reconfig,
keyframe,
compositor,
bitrate_kbps,
bit_depth,
// The two-process WGC relay encodes 4:2:0 in v1 — the handshake's `single_process` gate already
// forced `chroma` to Yuv420 for this topology, so the helper + secure-desktop DDA stay 4:2:0.
chroma: _,
probe_rx,
probe_result_tx,
fec_target,
conn: _conn,
stats,
client_label,
launch,
} = ctx;
tracing::info!(
?mode,
bitrate_kbps,
bit_depth,
"punktfunk/1 two-process stream (SYSTEM host + user-session WGC helper)"
);
let mut vd = crate::vdisplay::open(compositor)?;
// Create the SudoVDA output + spawn a helper capturing it by GDI name. Returns the keepalive
// (held for the output's life — the sole isolation owner), the running relay, the capture target
// (so the host can also open DDA on it for the secure desktop), and the achieved refresh.
type Built = (Box<dyn Send>, HelperRelay, WinCaptureTarget, u32);
let build = |vd: &mut Box<dyn crate::vdisplay::VirtualDisplay>,
mode: punktfunk_core::Mode|
-> Result<Built> {
let vout = vd.create(mode).context("create virtual output")?;
let effective_hz = vout
.preferred_mode
.map(|(_, _, hz)| hz)
.filter(|&hz| hz > 0)
.unwrap_or(mode.refresh_hz);
let target = vout.win_capture.clone().ok_or_else(|| {
anyhow!("SudoVDA target not yet an active display (needs a WDDM GPU to activate it)")
})?;
// HDR is driven by the SudoVDA monitor's ACTUAL advanced-color state, not the handshake bit
// depth: the whole pipeline follows the monitor (WGC captures FP16 when HDR is on; NVENC forces
// Main10 + BT.2020 PQ from the 10-bit capture format regardless of the negotiated depth; the
// client auto-detects PQ from the HEVC VUI). So:
// - a negotiated 10-bit session PROACTIVELY enables HDR on the monitor (below), but
// - we must NEVER force HDR *off* here — that would wipe out a user's deliberate Windows HDR
// toggle on the virtual display on every build (the "HDR doesn't persist" bug). Leaving the
// monitor's state alone lets a user-enabled HDR session flow through end-to-end.
// The secure-desktop HDR drop (for the DDA leg) keys off the monitor's real state in the mux loop.
#[cfg(target_os = "windows")]
if bit_depth >= 10 {
// SAFETY: `set_advanced_color` is marked `unsafe` only because it drives the Win32 CCD API
// internally; it takes `target_id` by value (Copy `u32` — this session's live SudoVDA
// monitor's CCD target id) and sizes + owns every buffer it hands the OS on its own stack.
// We pass no pointers, so nothing must outlive the call and there is no aliasing; an
// unknown/absent target id simply returns false.
unsafe {
if crate::win_display::set_advanced_color(target.target_id, true) {
// Let the colorspace change settle before WGC creates its capture item / detects HDR.
std::thread::sleep(std::time::Duration::from_millis(250));
}
}
}
let relay = HelperRelay::spawn(
&target,
(mode.width, mode.height, effective_hz),
bitrate_kbps,
bit_depth,
)
.context("spawn WGC helper")?;
Ok((vout.keepalive, relay, target, effective_hz))
};
let (mut _keepalive, mut relay, mut target, mut effective_hz) = build(&mut vd, mode)?;
let mut cur_mode = mode;
// Capture is live (the WGC helper is relaying) — launch the requested library title into the
// interactive user session so it renders onto the captured desktop and grabs foreground.
// Best-effort: a failure (no recipe for the kind, no interactive user) leaves the user on the desktop.
if let Some(id) = launch.as_deref() {
if let Err(e) = crate::library::launch_title(id) {
tracing::warn!(launch_id = id, error = %e, "could not launch requested library title");
}
}
// O3.1: optionally observe the IDD-push ring alongside WGC (WGC = the presentation trigger) to
// confirm the 0257 driver pushes frames into a HOST-created ring. Diagnostic only; gated.
if std::env::var_os("PUNKTFUNK_IDD_PUSH_OBSERVE").is_some() {
crate::capture::idd_push::spawn_observer(
target.clone(),
Some((cur_mode.width, cur_mode.height, effective_hz)),
);
}
// The host's own DDA capturer+encoder for the SECURE (Winlogon) desktop, which WGC — and thus the
// helper — cannot capture. Opened lazily on the first secure transition (so a session that never
// hits a UAC/lock screen never pays for a second NVENC session), then kept for fast re-switch.
struct DdaPipe {
cap: Box<dyn crate::capture::Capturer>,
enc: Box<dyn crate::encode::Encoder>,
frame: crate::capture::CapturedFrame,
}
// Note: takes the dimensions as args rather than capturing `cur_mode` — `cur_mode` is reassigned
// on reconfig, and a closure holding a shared borrow of it for the whole fn would forbid that.
let open_dda =
|target: &WinCaptureTarget, w: u32, h: u32, hz: u32, hdr: bool| -> Result<DdaPipe> {
// The host already holds the real keepalive (sole isolation owner), so DDA gets a no-op one.
// `hdr` requests an FP16 DuplicateOutput1 so the secure desktop is captured in HDR (→ BT.2020
// PQ Main10) instead of black — legacy DuplicateOutput can't capture an HDR/FP16 desktop.
let mut cap = crate::capture::dxgi::DuplCapturer::open(
target.clone(),
Some((w, h, hz)),
Box::new(()),
// The relay's host encoder is GPU (NVENC/AMF/QSV unless software) — pass `gpu` in (Goal-1
// stage 5) so the DDA capturer doesn't re-derive it.
crate::capture::gpu_encode(),
hdr,
false, // the two-process relay path is 4:2:0 in v1
)
.context("open DDA for secure desktop")?;
cap.set_active(true);
let frame = cap.next_frame().context("DDA first frame")?;
let enc = crate::encode::open_video(
crate::encode::Codec::H265,
frame.format,
frame.width,
frame.height,
hz,
bitrate_kbps as u64 * 1000,
frame.is_cuda(),
bit_depth,
// Secure-desktop DDA on the two-process relay path: 4:2:0 in v1 (matches the helper).
crate::encode::ChromaFormat::Yuv420,
)
.context("open video encoder for DDA")?;
Ok(DdaPipe {
cap: Box::new(cap),
enc,
frame,
})
};
let perf = crate::config::config().perf;
let burst_cap = std::env::var("PUNKTFUNK_PACE_BURST_KB")
.ok()
.and_then(|s| s.parse::<usize>().ok())
.unwrap_or(128)
* 1024;
// Same encode|send split as the single-process path: this thread relays AUs, a dedicated send
// thread owns the Session and does FEC+seal+paced-send. The relay encodes in the helper process,
// so this path's FrameMsgs carry no cap/submit/encode split (those stages stay 0 in the sample);
// the send thread still emits fps/goodput/pacing/loss from `session.stats()`.
let send_stats = SendStats {
rec: stats,
width: mode.width,
height: mode.height,
fps: effective_hz,
codec: "hevc",
client: client_label,
bitrate_kbps,
};
let (frame_tx, frame_rx) = std::sync::mpsc::sync_channel::<FrameMsg>(3);
let send_thread = std::thread::Builder::new()
.name("punktfunk-send".into())
.spawn({
let stop = stop.clone();
move || {
send_loop(
session,
frame_rx,
probe_rx,
probe_result_tx,
stop,
perf,
burst_cap,
fec_target,
send_stats,
)
}
})
.context("spawn send thread")?;
// Test hook: PUNKTFUNK_SECURE_TEST_PERIOD_MS=N drives a square-wave secure/normal toggle every N ms
// instead of the real watcher — exercises the mid-session helper↔DDA mux without a live UAC/lock.
let secure_test_ms: Option<u128> = std::env::var("PUNKTFUNK_SECURE_TEST_PERIOD_MS")
.ok()
.and_then(|s| s.parse().ok())
.filter(|&n| n > 0);
// Switching to the host DDA on the secure (Winlogon) desktop is OPT-IN: DDA can't reliably capture
// the secure desktop's HDR independent-flip (it storms ACCESS_LOST → black), whereas the WGC helper
// STAYS LIVE through a lock/UAC. So by default the mux keeps WGC the whole time (no DesktopWatcher
// switch, no overlay). Enable the experimental DDA-on-secure path with PUNKTFUNK_SECURE_DDA=1.
let dda_secure = crate::config::config().secure_dda || secure_test_ms.is_some();
// The authoritative Default↔Winlogon signal (requires SYSTEM to read the Winlogon desktop name);
// only needed when the DDA-on-secure path is enabled.
let watcher = dda_secure.then(crate::capture::desktop_watch::DesktopWatcher::start);
// Force-composed-flip overlay (only with DDA-on-secure): keeps the secure desktop out of fullscreen
// independent-flip so DDA can duplicate it. Off by default to avoid touching the normal desktop.
let _composed_flip = dda_secure
.then(crate::capture::composed_flip::ForceComposedFlip::start)
.flatten();
let start = std::time::Instant::now();
let mut interval = std::time::Duration::from_secs_f64(1.0 / effective_hz.max(1) as f64);
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(seconds as u64);
let mut sent: u64 = 0;
// Mux state: which source is live, the lazily-opened DDA pipe, a DDA pacing clock, and a
// "wait for the next IDR before forwarding" latch set on every source switch (the client's
// decoder must resume on a keyframe — the two encoders keep independent infinite-GOP state).
let mut dda: Option<DdaPipe> = None;
let mut on_secure = false;
let mut next = std::time::Instant::now();
let mut await_idr = false;
// Step 6 relaunch watchdog: how many times in a row the helper has died without producing a frame.
// A console disconnect/reconnect or a helper crash kills it; we respawn (the new helper picks up
// the now-active session via WTSGetActiveConsoleSessionId). Reset on the first relayed frame; only
// give up (end the stream) after a run of failures spanning a few seconds.
let mut helper_fails = 0u32;
const MAX_HELPER_FAILS: u32 = 20;
// Build a FrameMsg + hand it to the send thread; returns false if the send thread is gone (caller
// breaks the loop). Kept as a macro (not a closure) so each use borrows `frame_tx`/`sent`/`interval`
// at its own site without a long-lived capture, and `break 'outer` stays a literal at the call site
// (a `break 'outer` inside the macro body risks label-hygiene resolution failures).
macro_rules! forward {
($data:expr, $capture_ns:expr, $keyframe:expr) => {{
let flags = if $keyframe {
(FLAG_PIC | FLAG_SOF) as u32
} else {
FLAG_PIC as u32
};
let capture_ns = $capture_ns;
let encode_us = (now_ns().saturating_sub(capture_ns) / 1000) as u32;
let msg = FrameMsg {
data: $data,
capture_ns,
flags,
deadline: std::time::Instant::now() + interval,
encode_us,
cap_us: 0,
submit_us: 0,
wait_us: 0,
repeat: false,
was_measured: false,
};
let ok = frame_tx.send(msg).is_ok();
if ok {
sent += 1;
}
ok
}};
}
'outer: while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
// Mode switch: rebuild the output + re-spawn the helper at the new mode (drop the old relay +
// keepalive only after the new pair is up, so a failed rebuild keeps the current stream). The
// DDA pipe (on the old target) is dropped — it reopens on the next secure transition.
let mut want = None;
while let Ok(m) = reconfig.try_recv() {
want = Some(m);
}
if let Some(new_mode) = want {
tracing::info!(?new_mode, "two-process: rebuilding for mode switch");
match build(&mut vd, new_mode) {
Ok((ka, rl, tg, hz)) => {
relay = rl; // drops the old relay (kills old helper) ...
_keepalive = ka; // ... then releases the old output
target = tg;
effective_hz = hz;
cur_mode = new_mode;
dda = None; // old-target DDA is stale; reopen on next secure
interval = std::time::Duration::from_secs_f64(1.0 / hz.max(1) as f64);
}
Err(e) => {
tracing::error!(error = %format!("{e:#}"), ?new_mode,
"two-process mode-switch rebuild failed — staying on the current mode");
}
}
}
// Coalesce client decode-recovery keyframe requests and forward to the active source.
let mut want_kf = false;
while keyframe.try_recv().is_ok() {
want_kf = true;
}
// Source mux: capture the secure (Winlogon) desktop via the host's DDA, the normal desktop via
// the helper relay. On a switch, latch await_idr + force the now-active source to emit an IDR
// so the client resumes cleanly.
let secure = dda_secure
&& match secure_test_ms {
Some(p) => (start.elapsed().as_millis() / p) % 2 == 1,
None => watcher.as_ref().is_some_and(|w| w.is_secure()),
};
if secure != on_secure {
on_secure = secure;
await_idr = true;
tracing::info!(
to = if secure {
"secure(DDA)"
} else {
"normal(WGC relay)"
},
"two-process: source switch"
);
if secure {
// Capture the secure (Winlogon) desktop in its NATIVE colorspace. Don't try to drop the
// SudoVDA out of HDR for the DDA leg — display-config changes are denied on the secure
// desktop (the drop just churned + still went black). Instead, if the monitor is in HDR,
// open DDA in HDR (FP16 DuplicateOutput1 → BT.2020 PQ Main10); the normal-desktop DDA
// overlay/flip issues that drove us to WGC don't apply to the composed Winlogon UI.
// SAFETY: `advanced_color_enabled` is `unsafe` only because it queries the Win32 CCD
// API; it takes `target_id` by value (the live SudoVDA monitor's CCD target id) and
// allocates + owns every buffer it passes the OS internally. No caller pointer is
// involved, so nothing must outlive the call and there is no aliasing; a missing
// target id just yields false.
let hdr = unsafe { crate::win_display::advanced_color_enabled(target.target_id) };
dda = None; // reopen to capture the secure desktop
match open_dda(&target, cur_mode.width, cur_mode.height, effective_hz, hdr) {
Ok(mut p) => {
tracing::info!(hdr, "two-process: opened DDA for the secure desktop");
p.enc.request_keyframe();
dda = Some(p);
}
Err(e) => {
tracing::error!(error = %format!("{e:#}"),
"two-process: DDA open failed — secure desktop will freeze on last frame");
}
}
next = std::time::Instant::now();
} else {
// Returning to the normal desktop: RESUME from the still-alive WGC helper. Do NOT
// recreate the SudoVDA monitor or respawn the helper — build()'s vd.create() is an
// IOCTL_REMOVE+ADD of the monitor (the audible disconnect/connect chime + the
// teardown/recreate kernel stress that broke DDA, now applied to the mux). The monitor +
// helper persist for the WHOLE session; only the host-DDA leg opens (secure) and closes
// (normal). Apply the DDA learning here: reuse, don't tear down.
dda = None; // free the secure DDA encoder; the relay (helper) is the source again
while relay.try_recv().is_ok() {} // drop secure-dwell backlog
relay.request_keyframe(); // client decoder resumes on the helper's next IDR
// Nothing to restore: we no longer toggle the SudoVDA's HDR state for the DDA leg, so the
// monitor's colorspace is unchanged and the still-alive WGC helper just resumes.
next = std::time::Instant::now();
}
}
if want_kf {
if secure {
if let Some(d) = dda.as_mut() {
d.enc.request_keyframe();
}
} else {
relay.request_keyframe();
}
await_idr = true;
}
if secure {
// DDA capture+encode for the secure desktop, paced to the frame interval.
let Some(d) = dda.as_mut() else {
std::thread::sleep(interval);
continue;
};
if let Some(f) = d.cap.try_latest().context("DDA capture")? {
d.frame = f;
}
let capture_ns = now_ns();
d.enc.submit(&d.frame).context("DDA encoder submit")?;
next += interval;
while let Some(au) = d.enc.poll().context("DDA encoder poll")? {
if await_idr && !au.keyframe {
continue;
}
await_idr = false;
if !forward!(au.data, capture_ns, au.keyframe) {
break 'outer; // send thread gone
}
}
match next.checked_duration_since(std::time::Instant::now()) {
Some(dur) => std::thread::sleep(dur),
None => next = std::time::Instant::now(),
}
} else {
// Relay the helper's AUs for the normal desktop. Timeout → keep servicing the loop;
// Disconnected → the helper exited (step 6 adds the relaunch watchdog).
let au = match relay.recv_timeout(std::time::Duration::from_millis(500)) {
Ok(au) => au,
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
if stop.load(Ordering::SeqCst) {
break;
}
tracing::warn!("two-process: no AU from helper within 500ms");
continue;
}
Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => {
// The helper exited (crash, or a console disconnect killed its session). REBUILD
// the whole output + helper (not just respawn on the old target): an abruptly-killed
// helper leaves the SudoVDA's DXGI output briefly unresolvable ("no DXGI output for
// target N yet"), and a console reconnect needs a fresh output in the new session —
// `build` recreates both. Back off so a hard-failing rebuild (e.g. no active session
// yet) doesn't spin; give up only after a sustained run of failures.
helper_fails += 1;
if helper_fails > MAX_HELPER_FAILS {
tracing::error!(
fails = helper_fails,
"two-process: WGC helper keeps dying — ending stream"
);
break;
}
std::thread::sleep(std::time::Duration::from_millis(500));
match build(&mut vd, cur_mode) {
Ok((ka, rl, tg, hz)) => {
tracing::warn!(
fails = helper_fails,
"two-process: WGC helper exited — rebuilt output + helper"
);
relay = rl;
_keepalive = ka;
target = tg;
effective_hz = hz;
dda = None; // old-target DDA is stale
interval = std::time::Duration::from_secs_f64(1.0 / hz.max(1) as f64);
await_idr = true; // resume on the new helper's opening IDR
}
Err(e) => {
tracing::warn!(error = %format!("{e:#}"), fails = helper_fails,
"two-process: helper rebuild failed — will retry");
}
}
continue;
}
};
if await_idr && !au.keyframe {
continue; // skip stale deltas until the post-switch IDR
}
await_idr = false;
helper_fails = 0; // a frame flowed → the helper is healthy again
// The helper's pts_ns is on this machine's monotonic clock (same `now_ns()` source).
if !forward!(au.data, au.pts_ns, au.keyframe) {
break 'outer; // send thread gone
}
}
}
drop(frame_tx);
let _ = send_thread.join();
drop(watcher);
tracing::info!(sent, "punktfunk/1 two-process stream complete");
Ok(())
}
/// One mode's capture/encode pipeline: (capturer, encoder, first frame, frame interval).
/// Dropping the capturer tears down the PipeWire stream and the virtual output with it.
type Pipeline = (
+9 -42
View File
@@ -26,12 +26,9 @@ pub enum CaptureBackend {
/// Linux: the xdg ScreenCast portal → PipeWire (the only Linux capture path).
Portal,
/// Windows: IDD direct-push — frames pulled straight from the pf-vdisplay driver's shared ring
/// (in-process, Session 0; no Desktop Duplication, no WGC helper).
/// (in-process, Session 0; captures the secure desktop too). The sole Windows capture path —
/// DXGI Desktop Duplication (DDA) and the WGC two-process relay were removed.
IddPush,
/// Windows: DXGI Desktop Duplication (`PUNKTFUNK_CAPTURE=dda|dxgi` or `PUNKTFUNK_NO_WGC`).
Dda,
/// Windows: Windows.Graphics.Capture (the composed-desktop default), with a DDA watchdog fallback.
Wgc,
}
impl CaptureBackend {
@@ -42,20 +39,10 @@ impl CaptureBackend {
CaptureBackend::Portal
}
/// Windows precedence (identical to the pre-stage-3 `capture_virtual_output` branch order):
/// IDD-push wins; else an explicit `dda`/`dxgi` request or `PUNKTFUNK_NO_WGC` selects DDA; else WGC.
/// Windows: IDD direct-push is the sole capture path (DDA + the WGC two-process relay were removed).
#[cfg(target_os = "windows")]
pub fn resolve() -> Self {
let cfg = crate::config::config();
if cfg.idd_push {
CaptureBackend::IddPush
} else if matches!(cfg.capture_backend.as_str(), "dda" | "dxgi")
|| crate::capture::wgc_disabled()
{
CaptureBackend::Dda
} else {
CaptureBackend::Wgc
}
}
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
@@ -67,11 +54,9 @@ impl CaptureBackend {
/// How a session is structured across processes.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SessionTopology {
/// One process captures + encodes (Linux; Windows non-SYSTEM / IDD-push / `NO_WGC`).
/// One process captures + encodes. The only topology: Linux (portal) and Windows (in-process
/// IDD-push in Session 0). The SYSTEM-host + user-session WGC relay was removed with DDA/WGC.
SingleProcess,
/// SYSTEM host + a user-session WGC helper relay (the Windows normal-desktop path under SYSTEM,
/// where in-process WGC can't activate). See `virtual_stream_relay`.
TwoProcessRelay,
}
/// The resolved encode backend (recorded for logging / stages 45; the per-session encoder open still
@@ -103,8 +88,8 @@ pub struct SessionPlan {
pub encoder: EncoderBackend,
/// Handshake-negotiated encode bit depth (8, or 10 = HEVC Main10).
pub bit_depth: u8,
/// The IDD-push HDR hint (`bit_depth >= 10`) — the want-HDR flag the capturer was passed before.
/// Non-IDD-push Windows backends ignore it and auto-detect HDR from the monitor; Linux is 8-bit.
/// The IDD-push HDR hint (`bit_depth >= 10`) — the want-HDR flag handed to the capturer so it
/// proactively enables advanced color on the virtual display. Linux is 8-bit (HDR blocked upstream).
pub hdr: bool,
/// Handshake-negotiated chroma subsampling (4:2:0, or full-chroma 4:4:4 when the client + host +
/// GPU all support it). Resolved before the Welcome; `Yuv420` on every backend that declined it.
@@ -151,26 +136,8 @@ impl SessionPlan {
}
}
/// Process topology. On Windows this is the former `punktfunk1::should_use_helper` logic verbatim; on
/// every other platform the session is always single-process.
#[cfg(target_os = "windows")]
pub(crate) fn resolve_topology() -> SessionTopology {
let cfg = crate::config::config();
// `NO_HELPER`/`NO_WGC` force single-process; IDD-push captures in-process in Session 0 (no helper);
// otherwise the helper runs when forced or when we're SYSTEM (in-process WGC can't activate there).
let helper = if cfg.no_helper || crate::capture::wgc_disabled() || cfg.idd_push {
false
} else {
cfg.force_helper || crate::capture::wgc_relay::running_as_system()
};
if helper {
SessionTopology::TwoProcessRelay
} else {
SessionTopology::SingleProcess
}
}
#[cfg(not(target_os = "windows"))]
/// Process topology. Single-process is the only topology now: Linux (portal) and Windows (in-process
/// IDD-push in Session 0). The Windows SYSTEM-host + user-session WGC relay was removed with DDA/WGC.
pub(crate) fn resolve_topology() -> SessionTopology {
SessionTopology::SingleProcess
}
@@ -5,9 +5,8 @@
//! activation, and each store's auth/entitlement context resolve — the process must run in the
//! interactive session under the **logged-in user's** token, not SYSTEM and not session 0.
//!
//! This is the same `WTSGetActiveConsoleSessionId → WTSQueryUserToken → DuplicateTokenEx →
//! CreateProcessAsUserW(winsta0\\default)` primitive the WGC helper relay uses
//! ([`crate::capture::wgc_relay`]), factored out for the library launch path
//! This is the standard `WTSGetActiveConsoleSessionId → WTSQueryUserToken → DuplicateTokenEx →
//! CreateProcessAsUserW(winsta0\\default)` primitive, used for the library launch path
//! ([`crate::library::launch_title`]).
//!
//! IMPORTANT — use the **user** token (`WTSQueryUserToken`), NOT a session-retargeted SYSTEM token
@@ -36,7 +35,7 @@ use windows::Win32::System::Threading::{
///
/// Fire-and-forget: the launched game/launcher outlives this call, so the host does not track the
/// child — its handles are closed before returning (the process keeps running). The environment is
/// the user's block merged with the host's `PUNKTFUNK_*`/`RUST_LOG` (same merge the WGC helper uses),
/// the user's block merged with the host's `PUNKTFUNK_*`/`RUST_LOG` (see [`merged_env_block`]),
/// so `host.env` settings propagate.
///
/// Requires the host to run as SYSTEM (`WTSQueryUserToken` needs `SE_TCB`). Fails when no interactive
@@ -75,7 +74,7 @@ unsafe fn spawn_inner(cmdline: &str, workdir: Option<&Path>) -> Result<u32> {
// with the host's PUNKTFUNK_*/RUST_LOG vars — same shared helper the WGC helper + service spawns use.
let mut env_block: *mut core::ffi::c_void = std::ptr::null_mut();
let _ = CreateEnvironmentBlock(&mut env_block, Some(primary), false);
let merged_env = crate::capture::wgc_relay::merged_env_block(env_block as *const u16);
let merged_env = merged_env_block(env_block as *const u16);
if !env_block.is_null() {
let _ = DestroyEnvironmentBlock(env_block);
}
@@ -124,3 +123,48 @@ unsafe fn spawn_inner(cmdline: &str, workdir: Option<&Path>) -> Result<u32> {
let _ = CloseHandle(pi.hThread);
Ok(pid)
}
/// Build the environment block for a process launched into the interactive session: the target
/// session's block (`user_block`, from `CreateEnvironmentBlock`) with this process's `PUNKTFUNK_*`
/// vars overlaid, so the child runs with the SAME settings this process has
/// (`PUNKTFUNK_ENCODER=nvenc`, `PUNKTFUNK_ZEROCOPY`, …) instead of the target shell's. Returns a
/// UTF-16, double-null-terminated block suitable for `CREATE_UNICODE_ENVIRONMENT`. Shared by the
/// interactive library launch (here) and the Windows service launching the host into the active
/// session ([`crate::service`]).
///
/// # Safety
/// `user_block` must be either null or a valid pointer to a UTF-16, double-null-terminated
/// environment block (the `CreateEnvironmentBlock` output), readable for its whole length.
pub(crate) unsafe fn merged_env_block(user_block: *const u16) -> Vec<u16> {
// Parse the user block ("VAR=VALUE\0" … "\0") into entries.
let mut entries: Vec<String> = Vec::new();
if !user_block.is_null() {
let mut p = user_block;
loop {
let mut len = 0isize;
while *p.offset(len) != 0 {
len += 1;
}
if len == 0 {
break; // the trailing empty string = end of block
}
let slice = std::slice::from_raw_parts(p, len as usize);
entries.push(String::from_utf16_lossy(slice));
p = p.offset(len + 1);
}
}
// Overlay "our" settings — PUNKTFUNK_* and RUST_LOG — dropping whatever the target block had.
let is_ours = |k: &str| k.starts_with("PUNKTFUNK_") || k == "RUST_LOG";
entries.retain(|e| !is_ours(e.split('=').next().unwrap_or("")));
for (k, v) in std::env::vars().filter(|(k, _)| is_ours(k)) {
entries.push(format!("{k}={v}"));
}
// Serialize back to a UTF-16 double-null-terminated block.
let mut block: Vec<u16> = Vec::new();
for e in entries {
block.extend(e.encode_utf16());
block.push(0);
}
block.push(0);
block
}
+9 -8
View File
@@ -3,12 +3,12 @@
//! for the ad-hoc PsExec / VBS / scheduled-task launch chain used during bring-up.
//!
//! Why a supervisor and not just "run the host as a service": the host must run **as SYSTEM in the
//! interactive session** (session 1+). Desktop Duplication of the secure (Winlogon/UAC/lock) desktop
//! and `SendInput` both need SYSTEM; capture and injection both need the *interactive* session, which
//! interactive session** (session 1+). Capturing the secure (Winlogon/UAC/lock) desktop and
//! `SendInput` both need SYSTEM; capture and injection both need the *interactive* session, which
//! a plain session-0 service is not in. So this service (itself in session 0) never captures — it
//! duplicates its own LocalSystem token, retargets it to the active console session, and
//! `CreateProcessAsUserW`s the host there. This is the Sunshine/Apollo model. The host in turn spawns
//! the WGC helper into the *user* session (see `capture::wgc_relay`) — two nested launches.
//! `CreateProcessAsUserW`s the host there. This is the Sunshine/Apollo model. The host captures the
//! virtual display in-process via IDD direct-push (no helper process).
//!
//! Subcommands (Windows only):
//! ```text
@@ -230,8 +230,9 @@ fn run_service() -> Result<()> {
let _ = SESSION_EVENT.set(session_owned);
// The control handler captures nothing — it reaches the events through the statics, so it stays
// `Fn + Send + 'static`. Session lock/unlock are handled inside the host (DesktopWatcher), so we
// only flag console connect/disconnect/logon — the events that change the active session.
// `Fn + Send + 'static`. Lock/unlock is handled by the in-process IDD-push capture (the driver
// composes the secure desktop into the ring), so we only flag console connect/disconnect/logon —
// the events that change the active session.
let handler = move |control| -> ServiceControlHandlerResult {
match control {
ServiceControl::Stop | ServiceControl::Preshutdown | ServiceControl::Shutdown => {
@@ -517,10 +518,10 @@ unsafe fn spawn_host(
.context("SetTokenInformation(TokenSessionId)")?;
// 2) The session's environment block, merged with this process's PUNKTFUNK_*/RUST_LOG (so the
// host runs with host.env's settings, not a bare block). Same merge the WGC helper uses.
// host runs with host.env's settings, not a bare block). Same merge the interactive launch uses.
let mut env_block: *mut c_void = std::ptr::null_mut();
let _ = CreateEnvironmentBlock(&mut env_block, Some(primary), false);
let merged = crate::capture::wgc_relay::merged_env_block(env_block as *const u16);
let merged = crate::interactive::merged_env_block(env_block as *const u16);
if !env_block.is_null() {
let _ = DestroyEnvironmentBlock(env_block);
}
@@ -1,346 +0,0 @@
//! USER-session WGC helper (Windows) — part of the two-process secure-desktop design
//! (design/archive/windows-secure-desktop.md).
//!
//! WGC won't activate under the SYSTEM account, but the host must run as SYSTEM for the secure
//! desktop. So the SYSTEM host spawns THIS helper in the interactive user session
//! (`CreateProcessAsUserW`) to do the WGC capture + NVENC encode that needs the user token, and the
//! helper ships the encoded Annex-B access units back over its **stdout** pipe (which the host
//! inherits + reads). The host relays them on the live QUIC session while the normal desktop is up,
//! and switches to its own DDA encoder on the secure desktop. The helper captures the SAME SudoVDA
//! output **by GDI name only** — it never creates a virtual output / touches display topology (a
//! second topology owner would re-trigger the ACCESS_LOST born-lost storm).
//!
//! Wire framing on stdout, per AU: `[u32 len LE][u64 pts_ns LE][u8 keyframe][len bytes data]`.
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use crate::capture::{dxgi::WinCaptureTarget, wgc::WgcCapturer, Capturer};
use crate::encode::{self, Codec};
use anyhow::{Context, Result};
use std::io::{Read, Write};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
pub struct HelperOptions {
pub target_id: u32,
pub gdi_name: String,
pub width: u32,
pub height: u32,
pub fps: u32,
pub bitrate_kbps: u32,
/// Negotiated encode bit depth (8, or 10 = HEVC Main10). HDR auto-upgrades to 10 from the
/// captured frame's `Rgb10a2` format regardless.
pub bit_depth: u8,
}
/// AU framing magic + version, so the host can resync / detect a helper crash on its stdout stream.
const AU_MAGIC: u32 = 0x5046_4155; // "PFAU"
/// Control byte the host writes on our stdin to force the next frame to be an IDR. Must match
/// `wgc_relay::CTL_KEYFRAME`.
const CTL_KEYFRAME: u8 = 0x01;
pub fn run(opts: HelperOptions) -> Result<()> {
tracing::info!(
target_id = opts.target_id,
gdi = %opts.gdi_name,
mode = format!("{}x{}@{}", opts.width, opts.height, opts.fps),
"WGC helper starting (user session)"
);
// This thread does WGC capture + video-processor convert + NVENC submit — the GPU-submitting hot
// path. Elevate its OS priority so a CPU-heavy game can't deschedule it and delay submission (which
// would leave our HIGH GPU priority with nothing queued to prioritise). Apollo's capture thread is
// likewise CRITICAL.
crate::punktfunk1::boost_thread_priority(true);
// Capture the EXISTING SudoVDA output by GDI name / target id — do NOT create one (the host owns
// the virtual output + its isolate/restore; a second topology owner breaks DDA recovery).
let target = WinCaptureTarget {
adapter_luid: 0,
gdi_name: opts.gdi_name.clone(),
target_id: opts.target_id,
};
let mut cap =
WgcCapturer::open(target, Some((opts.width, opts.height, opts.fps))).context("WGC open")?;
cap.set_active(true);
// O3 present-trigger experiment: spawn a thread that PRESENTS a D3D swapchain to the virtual
// display (a present SOURCE), testing whether that — unlike WGC's READ — makes the OS assign the
// driver's IddCx swap-chain (so the driver's run_core runs + can push). Gated; diagnostic.
if std::env::var_os("PUNKTFUNK_PRESENT_TRIGGER").is_some() {
let (w, h) = (opts.width, opts.height);
std::thread::Builder::new()
.name("pf-present-trigger".into())
.spawn(move || {
tracing::info!("present-trigger: starting D3D present loop on the virtual display");
// SAFETY: `present_trigger` is unsafe only for its Win32/D3D11 FFI; it has no caller
// preconditions (it creates and exclusively owns its own window, device, and swapchain on
// this dedicated thread), so the call is sound.
if let Err(e) = unsafe { present_trigger(w, h) } {
tracing::warn!("present-trigger error: {e:#}");
}
})
.ok();
}
// First frame establishes the real dimensions + whether the desktop is HDR (the encoder derives
// Main10/HDR from the frame's PixelFormat::Rgb10a2). Then open NVENC on the capture device.
let first = cap.next_frame().context("first WGC frame")?;
let (w, h) = (first.width, first.height);
let mut enc = encode::open_video(
Codec::H265,
first.format,
w,
h,
opts.fps,
opts.bitrate_kbps as u64 * 1000,
false, // not cuda
opts.bit_depth, // 8, or 10 = Main10 (HDR auto-upgrades from the Rgb10a2 frame regardless)
// The two-process WGC relay helper encodes 4:2:0 in v1 (4:4:4 over the relay is a follow-up);
// the host gates 4:4:4 to the single-process topology.
encode::ChromaFormat::Yuv420,
)
.context("open NVENC")?;
// Control channel: the host writes a single byte on our stdin to force an IDR (client decode
// recovery), mirroring `enc.request_keyframe()` in the single-process path. A reader thread sets
// a flag the encode loop checks; stdin EOF (host gone) just stops the thread.
let kf = Arc::new(AtomicBool::new(false));
{
let kf = kf.clone();
std::thread::Builder::new()
.name("wgc-helper-ctl".into())
.spawn(move || {
let mut stdin = std::io::stdin();
let mut byte = [0u8; 1];
while let Ok(n) = stdin.read(&mut byte) {
if n == 0 {
break; // host closed our stdin
}
if byte[0] == CTL_KEYFRAME {
kf.store(true, Ordering::Relaxed);
}
}
})
.ok();
}
// Binary stdout — lock it once + write framed AUs. A short write / broken pipe means the host
// (parent) went away → exit cleanly so the host's relaunch watchdog can respawn us.
let stdout = std::io::stdout();
let mut out = stdout.lock();
// FIXED-CADENCE encode loop (mirrors the single-process `punktfunk1::virtual_stream` loop). The
// host runs as SYSTEM and relays our AUs; to deliver a STEADY `fps` to the client (the "fixed 240"
// goal) we must NOT gate on WGC's content-driven FrameArrived — `WgcCapturer::next_frame` blocks up
// to its ~8 ms static-repeat timeout when the desktop is quiet, capping a barely-changing desktop
// ~125 fps regardless of the GPU. Instead we pace to `1/fps` and take the FRESHEST frame with the
// non-blocking `try_latest`, repeating the last one when nothing newer arrived. Depth-1: NVENC's
// `poll` (lock_bitstream) blocks until the just-submitted frame is encoded, so exactly one frame is
// in flight per iteration. A deeper pipeline was measured to only stack latency under a
// GPU-saturating game (the encodes serialize on the contended GPU anyway) — the in-game lever is
// the GPU scheduling priority the SYSTEM host stamps on us, not pipeline depth.
let interval = std::time::Duration::from_secs_f64(1.0 / opts.fps.max(1) as f64);
let perf = crate::config::config().perf;
let mut frames = 0u64;
let mut repeats = 0u64; // frames where no newer capture had arrived (duplicate re-encode)
let mut cap_ns = 0u64; // time in try_latest (capture + video-processor convert)
let mut encode_ns = 0u64; // time blocked in lock_bitstream
let mut write_ns = 0u64; // time writing the AU to the stdout pipe (relay backpressure)
let mut window = std::time::Instant::now();
// `frame` is held across iterations and repeated when `try_latest` has nothing newer, so a static
// desktop still clocks `fps`. The capturer's held-set / output ring keep its texture alive across
// the repeat; reassigning `frame` on a fresh capture drops the prior one (already drained by poll).
let mut frame = first;
let mut next = std::time::Instant::now();
loop {
if kf.swap(false, Ordering::Relaxed) {
enc.request_keyframe();
}
// Freshest captured frame, or repeat the last (no new composition: static desktop / between a
// game's presents). Non-blocking, so the cadence is OURS, not WGC's event rate.
let t0 = std::time::Instant::now();
match cap.try_latest().context("WGC try_latest")? {
Some(f) => frame = f,
None => repeats += 1,
}
if perf {
cap_ns += t0.elapsed().as_nanos() as u64;
}
enc.submit(&frame).context("encoder submit")?;
// Drain the just-submitted frame. NVENC's poll blocks in lock_bitstream until it's encoded, so
// this returns exactly one AU (then None) — depth-1, no accumulation.
loop {
let p0 = std::time::Instant::now();
let polled = enc.poll().context("encoder poll")?;
if perf {
encode_ns += p0.elapsed().as_nanos() as u64;
}
let Some(au) = polled else { break };
let w0 = std::time::Instant::now();
let wrote = write_au(&mut out, &au);
if perf {
write_ns += w0.elapsed().as_nanos() as u64;
}
if wrote.is_err() {
tracing::info!("WGC helper: stdout closed (host gone) — exiting");
return Ok(());
}
}
// Pace to this frame's due time. If we're already past it (encode couldn't keep up under a
// GPU-saturating game), skip the sleep and re-baseline so we don't spiral into catch-up.
next += interval;
match next.checked_duration_since(std::time::Instant::now()) {
Some(d) => std::thread::sleep(d),
None => next = std::time::Instant::now(),
}
if perf {
frames += 1;
let since = window.elapsed();
if since.as_secs() >= 2 {
let secs = since.as_secs_f64();
let per = |ns: u64| format!("{:.2}", ns as f64 / frames as f64 / 1e6);
tracing::info!(
fps = format!("{:.1}", frames as f64 / secs),
repeats,
cap_ms = per(cap_ns),
encode_ms = per(encode_ns),
write_ms = per(write_ns),
"WGC helper perf (fixed-cadence depth-1; encode_ms=lock_bitstream; repeats=duplicated frames)"
);
frames = 0;
repeats = 0;
cap_ns = 0;
encode_ns = 0;
write_ns = 0;
window = std::time::Instant::now();
}
}
}
}
fn write_au(out: &mut impl Write, au: &encode::EncodedFrame) -> std::io::Result<()> {
out.write_all(&AU_MAGIC.to_le_bytes())?;
out.write_all(&(au.data.len() as u32).to_le_bytes())?;
out.write_all(&au.pts_ns.to_le_bytes())?;
out.write_all(&[au.keyframe as u8])?;
out.write_all(&au.data)?;
out.flush()
}
/// O3 present-trigger experiment (see the gated call in `run`). Creates a small swapchain-backed
/// window on the virtual display (the CCD-isolated primary) and presents continuously — an active
/// present SOURCE on the display — to test whether that makes the OS assign the driver's IddCx
/// swap-chain (which WGC's read does not). Runs forever on its own thread.
///
/// # Safety
/// Win32/D3D11 FFI; called once on a dedicated helper thread.
unsafe fn present_trigger(disp_w: u32, disp_h: u32) -> Result<()> {
use windows::core::{w, Interface};
use windows::Win32::Foundation::{HMODULE, HWND, LPARAM, LRESULT, WPARAM};
use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE;
use windows::Win32::Graphics::Direct3D11::{
D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView,
ID3D11Texture2D, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_SDK_VERSION,
};
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
use windows::Win32::Graphics::Dxgi::{
IDXGIAdapter, IDXGIDevice, IDXGIFactory2, DXGI_PRESENT, DXGI_SWAP_CHAIN_DESC1,
DXGI_SWAP_EFFECT_FLIP_DISCARD, DXGI_USAGE_RENDER_TARGET_OUTPUT,
};
use windows::Win32::System::LibraryLoader::GetModuleHandleW;
use windows::Win32::UI::WindowsAndMessaging::{
CreateWindowExW, DefWindowProcW, DispatchMessageW, PeekMessageW, RegisterClassW,
ShowWindow, MSG, PM_REMOVE, SW_SHOWNOACTIVATE, WNDCLASSW, WS_EX_NOACTIVATE, WS_EX_TOPMOST,
WS_POPUP, WS_VISIBLE,
};
unsafe extern "system" fn wndproc(h: HWND, m: u32, wp: WPARAM, lp: LPARAM) -> LRESULT {
DefWindowProcW(h, m, wp, lp)
}
let hinst: HMODULE = GetModuleHandleW(None)?;
let cls = w!("pfPresentTrigger");
let wc = WNDCLASSW {
lpfnWndProc: Some(wndproc),
hInstance: hinst.into(),
lpszClassName: cls,
..Default::default()
};
RegisterClassW(&wc);
// Small window at the top-left of the (primary = virtual) display so it barely obscures the
// captured desktop; topmost + no-activate so it doesn't steal focus.
let win_w = disp_w.min(96) as i32;
let win_h = disp_h.min(96) as i32;
let hwnd: HWND = CreateWindowExW(
WS_EX_TOPMOST | WS_EX_NOACTIVATE,
cls,
w!("pf-present"),
WS_POPUP | WS_VISIBLE,
0,
0,
win_w,
win_h,
None,
None,
Some(hinst.into()),
None,
)?;
let _ = ShowWindow(hwnd, SW_SHOWNOACTIVATE);
let mut device: Option<ID3D11Device> = None;
let mut context: Option<ID3D11DeviceContext> = None;
D3D11CreateDevice(
None,
D3D_DRIVER_TYPE_HARDWARE,
HMODULE::default(),
D3D11_CREATE_DEVICE_BGRA_SUPPORT,
None,
D3D11_SDK_VERSION,
Some(&mut device),
None,
Some(&mut context),
)?;
let device = device.context("present-trigger d3d11 device")?;
let context = context.context("present-trigger d3d11 context")?;
let dxgi_dev: IDXGIDevice = device.cast()?;
let adapter: IDXGIAdapter = dxgi_dev.GetAdapter()?;
let factory: IDXGIFactory2 = adapter.GetParent()?;
let scd = DXGI_SWAP_CHAIN_DESC1 {
Width: win_w as u32,
Height: win_h as u32,
Format: DXGI_FORMAT_B8G8R8A8_UNORM,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
BufferUsage: DXGI_USAGE_RENDER_TARGET_OUTPUT,
BufferCount: 2,
SwapEffect: DXGI_SWAP_EFFECT_FLIP_DISCARD,
..Default::default()
};
let swapchain = factory.CreateSwapChainForHwnd(&device, hwnd, &scd, None, None)?;
tracing::info!("present-trigger: swapchain created on the virtual display; presenting");
let mut frame = 0u32;
loop {
let mut msg = MSG::default();
while PeekMessageW(&mut msg, None, 0, 0, PM_REMOVE).as_bool() {
let _ = DispatchMessageW(&msg);
}
let back: ID3D11Texture2D = swapchain.GetBuffer(0)?;
let mut rtv: Option<ID3D11RenderTargetView> = None;
device.CreateRenderTargetView(&back, None, Some(&mut rtv))?;
let rtv = rtv.context("present-trigger rtv")?;
let c = (frame % 120) as f32 / 120.0;
context.ClearRenderTargetView(&rtv, &[c, 0.1, 0.2, 1.0]);
let _ = swapchain.Present(1, DXGI_PRESENT(0));
frame = frame.wrapping_add(1);
}
}