feat(windows): pf-vdisplay IDD-push — HDR + pipelined zero-copy capture

HDR (display-driven, matching the WGC path):
- CTA-861.3 HDR EDID (BT.2020 primaries + HDR Static Metadata block) so Windows offers "Use HDR" on the virtual display. The host FOLLOWS the display's live advanced-color state, recreating the shared ring at the matching format (FP16 in HDR / BGRA in SDR) on a toggle — no freeze.
- Always emit Main10/BT.2020-PQ Rgb10a2 while the display is HDR; the client auto-detects PQ from the HEVC VUI (clients under-report VIDEO_CAP_10BIT). Generic HDR10 mastering SEI on every IDR.
- Generation-tagged `latest` (gen<<40|seq<<8|slot) + driver `is_stale` re-attach kill the toggle-time garbage frame and any stale-ring read.

Perf:
- Pipeline the encode loop (Capturer::pipeline_depth; IDD-push = 2): submit N+1 before polling N so the convert/copy on the 3D engine overlaps the NVENC encode of N on the ASIC. PUNKTFUNK_IDD_DEPTH overrides (1 = synchronous).
- Rotating host output ring (OUT_RING) so the in-flight encode and the next convert never touch the same texture.
- HDR converts directly from the keyed-mutex slot's SRV into the output ring (drops the redundant slot->fp16 scratch copy); SDR copies the BGRA slot in. The slot mutex is held only across the convert/copy, not the encode. RING_LEN 3->6 for publish headroom.
- Capture-health diagnostic: new_fps vs repeat_fps under PUNKTFUNK_PERF (a low new_fps at a high send rate means the source isn't compositing, not an encode stall).

Validated live on the RTX box: 5120x1440@240 HDR streams; driver composes ~180 new fps, encode 240 fps @ ~4.3 ms p50.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 00:35:52 +02:00
parent c5dab484df
commit e2c9bfd3d9
26 changed files with 2962 additions and 313 deletions
@@ -142,6 +142,16 @@ pub trait Capturer: Send {
    fn hdr_meta(&self) -> Option<punktfunk_core::quic::HdrMeta> {
        None
    }
+
+    /// How many frames the encode loop may keep in flight (submitted but not yet polled) before it
+    /// blocks. `1` (the default) is the synchronous loop: capture → submit → poll-blocks, so the
+    /// per-frame wall time is `capture+convert + encode`. A capturer that hands a fresh output texture
+    /// per frame (so the encode of N reads a different texture than the convert of N+1 writes) can return
+    /// `>1` to PIPELINE: the loop submits N+1 before polling N, overlapping the convert/copy on the 3D
+    /// engine with the NVENC-ASIC encode of the prior frame, dropping per-frame wall time toward
+    /// `max(capture+convert, encode)` instead of their sum.
+    fn pipeline_depth(&self) -> usize {
+        1
+    }
 }

 /// A deterministic moving test pattern (BGRx). Lets the spike exercise the encode → file →
@@ -302,7 +312,11 @@ pub fn open_portal_monitor() -> Result<Box<dyn Capturer>> {
 /// [`crate::vdisplay::VirtualDisplay`] backend. The captured size is the size the output was
 /// created at — native, no scaling.
 #[cfg(target_os = "linux")]
-pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result<Box<dyn Capturer>> {
+pub fn capture_virtual_output(
+    vout: crate::vdisplay::VirtualOutput,
+    _want_hdr: bool,
+) -> Result<Box<dyn Capturer>> {
+    // The Linux host stays 8-bit (HDR is blocked upstream), so `want_hdr` is unused here.
    linux::PortalCapturer::from_virtual_output(vout).map(|c| Box::new(c) as Box<dyn Capturer>)
 }

@@ -317,7 +331,10 @@ pub(crate) fn wgc_disabled() -> bool {
 }

 #[cfg(target_os = "windows")]
-pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result<Box<dyn Capturer>> {
+pub fn capture_virtual_output(
+    vout: crate::vdisplay::VirtualOutput,
+    want_hdr: bool,
+) -> Result<Box<dyn Capturer>> {
    let target = vout.win_capture.clone().ok_or_else(|| {
        anyhow::anyhow!(
            "SudoVDA target not yet an active display (needs a WDDM GPU to activate it)"
@@ -325,6 +342,18 @@ pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result<Bo
    })?;
    let pref = vout.preferred_mode;
    let keep = vout.keepalive;
+    // P2 direct frame push (kill DDA): consume frames straight from the pf-vdisplay driver's shared
+    // ring — no Desktop Duplication, no win32u reparenting hook. Opt-in while it's A/B'd against DDA;
+    // `idd_push` takes the keepalive (owns the virtual display) so there's no fall-through.
+    if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() {
+        // Recreate the monitor + ring per session (fix-teardown): a FRESH monitor reliably gets a
+        // working IddCx swap-chain, whereas a REUSED monitor's swap-chain dies after ~2 sessions and
+        // the host can't revive it. The driver's recreate crash (target id resolved to 0) is fixed by
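+        // stamping target_id onto the monitor context. The ring FORMAT follows the display's live
+        // advanced-color state (FP16 in HDR, BGRA in SDR); `want_hdr` is passed on as the client's
+        // 10-bit capability, which only proactively enables advanced color at open — it does not gate
+        // the per-frame conversion.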
+        return idd_push::IddPushCapturer::open(target, pref, want_hdr, keep)
+            .map(|c| Box::new(c) as Box<dyn Capturer>);
+    }
    // WGC (Windows.Graphics.Capture) is the default: it captures the COMPOSED desktop including the
    // overlay/independent-flip planes DXGI Desktop Duplication misses (the frozen-HDR-animation bug),
    // and has no ACCESS_LOST-on-overlay churn. DDA stays available via PUNKTFUNK_CAPTURE=dda and is
@@ -376,7 +405,10 @@ pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result<Bo
 }

 #[cfg(not(any(target_os = "linux", target_os = "windows")))]
-pub fn capture_virtual_output(_vout: crate::vdisplay::VirtualOutput) -> Result<Box<dyn Capturer>> {
+pub fn capture_virtual_output(
+    _vout: crate::vdisplay::VirtualOutput,
+    _want_hdr: bool,
+) -> Result<Box<dyn Capturer>> {
    anyhow::bail!("virtual-output capture requires Linux or Windows")
 }

@@ -386,6 +418,8 @@ pub mod composed_flip;
 pub mod desktop_watch;
 #[cfg(target_os = "windows")]
 pub mod dxgi;
+#[cfg(target_os = "windows")]
+pub mod idd_push;
 #[cfg(target_os = "linux")]
 mod linux;
 #[cfg(target_os = "windows")]
@@ -202,6 +202,87 @@ pub(crate) unsafe fn make_device(
    Ok((device, context))
 }

+/// Resolve the configured GPU scheduling-priority class from `PUNKTFUNK_GPU_PRIORITY_CLASS`
+/// (`off|normal|high|realtime`, default high). `None` = leave it at the OS default (the `off` opt-out).
+/// The value is trimmed and matched case-insensitively, so `OFF` / ` off ` still opt out instead of
+/// silently falling through to HIGH. An unset, non-Unicode or unrecognised value resolves to HIGH.
+/// D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4, REALTIME 5.
+fn configured_gpu_priority_class() -> Option<i32> {
+    let raw = std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS").ok();
+    gpu_priority_class_from(raw.as_deref())
+}
+
+/// Pure mapping behind [`configured_gpu_priority_class`]: turns the raw knob value (or `None` when
+/// it is unset/unreadable) into the D3DKMT class number, `None` meaning "do not touch the class".
+fn gpu_priority_class_from(value: Option<&str>) -> Option<i32> {
+    // Normalise once so the spelling the operator typed (case, stray whitespace) cannot defeat `off`.
+    let normalized = value.map(|v| v.trim().to_ascii_lowercase());
+    match normalized.as_deref() {
+        Some("off") => None,
+        Some("normal") => Some(2),
+        Some("realtime") => Some(5),
+        _ => Some(4), // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC)
+    }
+}
+
+/// Enable SE_INC_BASE_PRIORITY on the CURRENT process token (best-effort) — the kernel gates the
+/// HIGH/REALTIME GPU scheduling-priority bump on it. Held by SYSTEM/Administrators; a UAC-FILTERED
+/// token (what `CreateProcessAsUserW` hands the WGC helper) does NOT have it, which is why the helper
+/// can't elevate itself and the SYSTEM host stamps the class onto it cross-process instead (see
+/// [`set_child_gpu_priority_class`]).
+///
+/// `AdjustTokenPrivileges` reports SUCCESS even when the token does not hold the requested privilege;
+/// the only signal is the thread's last-error being `ERROR_NOT_ALL_ASSIGNED`. That case is detected
+/// and logged here, so the filtered-token situation is visible instead of silent. Nothing is
+/// returned: every failure is logged (or ignored, for the token open/lookup) and the caller proceeds.
+///
+/// # Safety
+/// Takes no caller-supplied pointers or handles; it is `unsafe` only because it calls raw Win32
+/// token APIs on the current process.
+unsafe fn enable_inc_base_priority() {
+    use windows::core::PCWSTR;
+    use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID};
+    use windows::Win32::Security::{
+        AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES,
+        SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES, TOKEN_PRIVILEGES,
+        TOKEN_QUERY,
+    };
+    use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};
+    // Win32 ERROR_NOT_ALL_ASSIGNED: the call "succeeded" but the token lacks the privilege.
+    const ERROR_NOT_ALL_ASSIGNED: i32 = 1300;
+    let mut token = HANDLE::default();
+    if OpenProcessToken(
+        GetCurrentProcess(),
+        TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
+        &mut token,
+    )
+    .is_ok()
+    {
+        let mut luid = LUID::default();
+        if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() {
+            let tp = TOKEN_PRIVILEGES {
+                PrivilegeCount: 1,
+                Privileges: [LUID_AND_ATTRIBUTES {
+                    Luid: luid,
+                    Attributes: SE_PRIVILEGE_ENABLED,
+                }],
+            };
+            let adjusted = AdjustTokenPrivileges(
+                token,
+                false,
+                Some(&tp as *const TOKEN_PRIVILEGES),
+                0,
+                None,
+                None,
+            );
+            // Read the last-error IMMEDIATELY after the call, before any other API (logging
+            // included) can overwrite it: on success it is ERROR_SUCCESS or ERROR_NOT_ALL_ASSIGNED.
+            let last_error = std::io::Error::last_os_error().raw_os_error();
+            if adjusted.is_err() {
+                tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority");
+            } else if last_error == Some(ERROR_NOT_ALL_ASSIGNED) {
+                tracing::warn!(
+                    "SE_INC_BASE_PRIORITY is not held by this process token \
+                     (ERROR_NOT_ALL_ASSIGNED) — cannot enable it for GPU priority"
+                );
+            }
+        }
+        // The token handle is only needed for the adjust call; close it on every path that opened it.
+        let _ = CloseHandle(token);
+    }
+}
+
+/// Call `gdi32!D3DKMTSetProcessSchedulingPriorityClass(process, prio)` (no stable windows-rs binding —
+/// loaded by name). Returns the NTSTATUS (0 = success) or `None` if the export can't be resolved
+/// (which includes gdi32.dll itself failing to load). The
+/// CALLING process must hold SE_INC_BASE_PRIORITY ([`enable_inc_base_priority`]) for HIGH/REALTIME; the
+/// kernel checks the caller's privilege whether the target is self or a child we created.
+///
+/// # Safety
+/// `process` is handed to the export unchecked, so it must be a valid process handle (or the
+/// current-process pseudo handle). The export is called through a transmuted pointer: the `SetPrio`
+/// signature below must match the real export's ABI — NOTE(review): confirm against d3dkmthk.h.
+unsafe fn d3dkmt_set_scheduling_priority_class(
+    process: windows::Win32::Foundation::HANDLE,
+    prio: i32,
+) -> Option<i32> {
+    use windows::core::s;
+    use windows::Win32::Foundation::HANDLE;
+    use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
+    // The module reference taken here is never released (no FreeLibrary in this function).
+    let gdi32 = LoadLibraryA(s!("gdi32.dll")).ok()?;
+    let p = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass"))?;
+    // (process handle, priority class) -> NTSTATUS, `system` calling convention.
+    type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32;
+    // Reinterpret the untyped export address as the typed function pointer declared above.
+    let f: SetPrio = std::mem::transmute(p);
+    Some(f(process, prio))
+}
+
 /// Apollo-style GPU scheduling-priority hardening (Sunshine `display_base.cpp:599-709`). On a
 /// GPU-saturated game our capture+encode process is starved of GPU time slices — NVENC sits ~idle but
 /// `lock_bitstream` waits ~20 ms for our context to be scheduled. Elevating the PROCESS GPU scheduling
@@ -209,89 +290,64 @@ pub(crate) unsafe fn make_device(
 /// alone, which we measured as no help) lets our brief encode preempt the game. Uses HIGH, NOT
 /// realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC (Apollo downgrades it for exactly this).
 /// Runs once per process; best-effort. `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime`
-/// (default high).
+/// (default high). NOTE: in the SYSTEM-host + user-session-helper deployment this self-set NO-OPs in
+/// the helper (filtered token), so the host also sets it on the helper via [`set_child_gpu_priority_class`].
 fn elevate_process_gpu_priority() {
    use std::sync::Once;
    static ONCE: Once = Once::new();
    ONCE.call_once(|| unsafe {
-        use windows::core::{s, PCWSTR};
-        use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID};
-        use windows::Win32::Security::{
-            AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES,
-            SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES,
-            TOKEN_PRIVILEGES, TOKEN_QUERY,
+        use windows::Win32::System::Threading::GetCurrentProcess;
+        let Some(prio) = configured_gpu_priority_class() else {
+            tracing::info!("GPU process scheduling priority class left at default (off)");
+            return;
        };
-        use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
-        use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};
-
-        // D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4,
-        // REALTIME 5.
-        let prio: i32 = match std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS").ok().as_deref() {
-            Some("off") => {
-                tracing::info!("GPU process scheduling priority class left at default (off)");
-                return;
-            }
-            Some("normal") => 2,
-            Some("realtime") => 5,
-            _ => 4, // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC)
-        };
-
-        // 1. Enable SE_INC_BASE_PRIORITY so the kernel permits the GPU priority bump.
-        let mut token = HANDLE::default();
-        if OpenProcessToken(
-            GetCurrentProcess(),
-            TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
-            &mut token,
-        )
-        .is_ok()
-        {
-            let mut luid = LUID::default();
-            if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() {
-                let tp = TOKEN_PRIVILEGES {
-                    PrivilegeCount: 1,
-                    Privileges: [LUID_AND_ATTRIBUTES {
-                        Luid: luid,
-                        Attributes: SE_PRIVILEGE_ENABLED,
-                    }],
-                };
-                if AdjustTokenPrivileges(
-                    token,
-                    false,
-                    Some(&tp as *const TOKEN_PRIVILEGES),
-                    0,
-                    None,
-                    None,
-                )
-                .is_err()
-                {
-                    tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority");
-                }
-            }
-            let _ = CloseHandle(token);
-        }
-
-        // 2. D3DKMTSetProcessSchedulingPriorityClass via gdi32 (no stable windows-rs binding).
-        if let Ok(gdi32) = LoadLibraryA(s!("gdi32.dll")) {
-            if let Some(p) = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass")) {
-                type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32;
-                let f: SetPrio = std::mem::transmute(p);
-                let st = f(GetCurrentProcess(), prio);
-                if st == 0 {
-                    tracing::info!(
-                        priority_class = prio,
-                        "GPU process scheduling priority class set (2=normal 4=high 5=realtime)"
-                    );
-                } else {
-                    tracing::warn!(
-                        status = format!("0x{st:08X}"),
-                        "D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)"
-                    );
-                }
-            }
+        enable_inc_base_priority();
+        match d3dkmt_set_scheduling_priority_class(GetCurrentProcess(), prio) {
+            Some(0) => tracing::info!(
+                priority_class = prio,
+                "GPU process scheduling priority class set (2=normal 4=high 5=realtime)"
+            ),
+            Some(st) => tracing::warn!(
+                status = format!("0x{st:08X}"),
+                "D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)"
+            ),
+            None => tracing::warn!("D3DKMTSetProcessSchedulingPriorityClass export not found"),
        }
    });
 }

+/// Set the GPU scheduling-priority class of ANOTHER process we created — the WGC capture+encode helper
+/// in the interactive user session. The helper is spawned with the user's UAC-FILTERED token, which
+/// lacks SE_INC_BASE_PRIORITY, so its own [`elevate_process_gpu_priority`] silently no-ops and NVENC
+/// gets starved under a GPU-saturating game (the "240→40 fps in-game collapse"). The SYSTEM host DOES
+/// hold the privilege, so it stamps the class onto the child's process handle right after spawn — the
+/// process-level class applies to GPU contexts the child creates afterwards. Best-effort; logged.
+/// `PUNKTFUNK_GPU_PRIORITY_CLASS=off` disables it (same knob as the self path).
+///
+/// Returns nothing: success, a non-zero NTSTATUS and a missing export are each reported through
+/// `tracing` only, so a failure here never aborts the helper spawn.
+///
+/// # Safety
+/// `process` must be a valid handle to a process we own with at least PROCESS_SET_INFORMATION access
+/// (the just-created helper, `PROCESS_INFORMATION::hProcess`).
+pub(crate) unsafe fn set_child_gpu_priority_class(process: windows::Win32::Foundation::HANDLE) {
+    // `off` → nothing to do. Unlike the self path, the opt-out is not logged here.
+    let Some(prio) = configured_gpu_priority_class() else {
+        return;
+    };
+    enable_inc_base_priority(); // the SYSTEM host holds SE_INC_BASE_PRIORITY; the helper does not
+    match d3dkmt_set_scheduling_priority_class(process, prio) {
+        // NTSTATUS 0 = success.
+        Some(0) => tracing::info!(
+            priority_class = prio,
+            "WGC helper GPU scheduling priority class set cross-process from the SYSTEM host \
+             (2=normal 4=high 5=realtime)"
+        ),
+        // Any other NTSTATUS is logged in hex.
+        Some(st) => tracing::warn!(
+            status = format!("0x{st:08X}"),
+            "cross-process D3DKMTSetProcessSchedulingPriorityClass on the WGC helper failed"
+        ),
+        // gdi32 or the export could not be resolved.
+        None => tracing::warn!(
+            "D3DKMTSetProcessSchedulingPriorityClass export not found — WGC helper has no GPU priority"
+        ),
+    }
+}
+
 /// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST
 /// recovery to rebuild the whole capture on the current (possibly secure) input desktop.
 unsafe fn reopen_duplication(
@@ -0,0 +1,922 @@
+//! P2 direct frame push (kill DDA) — HOST side. The pf-vdisplay driver runs in a restricted WUDFHost
+//! token that canNOT create named kernel objects, so — exactly like the gamepad UMDF drivers
+//! (`inject/dualsense_windows.rs`) — the HOST (privileged) CREATES the shared header + frame-ready
+//! event + ring of keyed-mutex textures (`Global\` names, permissive `D:(A;;GA;;;WD)` SDDL) on the
+//! discrete render GPU, and the driver only OPENS them and copies frames in. We then consume the ring
+//! straight into the zero-copy NVENC path — no DXGI Desktop Duplication, no `win32u` hook. Gated by
+//! `PUNKTFUNK_IDD_PUSH`. Driver counterpart: `packaging/windows/vdisplay-driver/pf-vdisplay/src/
+//! frame_transport.rs` — [`SharedHeader`], [`MAGIC`], [`RING_LEN`], the status codes and the `Global\`
+//! name scheme are DUPLICATED byte-identically there.
+
+use super::dxgi::{make_device, D3d11Frame, HdrConverter, WinCaptureTarget};
+use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
+use anyhow::{bail, Context, Result};
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::Mutex;
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+use windows::core::{w, Interface, HSTRING};
+use windows::Win32::Foundation::{CloseHandle, HANDLE, INVALID_HANDLE_VALUE, LUID};
+use windows::Win32::Graphics::Direct3D11::{
+    ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView,
+    ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE,
+    D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX, D3D11_RESOURCE_MISC_SHARED_NTHANDLE,
+    D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
+};
+use windows::Win32::Graphics::Dxgi::Common::{
+    DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
+    DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
+};
+use windows::Win32::Graphics::Dxgi::{
+    CreateDXGIFactory1, IDXGIAdapter1, IDXGIFactory4, IDXGIKeyedMutex, IDXGIResource1,
+};
+use windows::Win32::Security::Authorization::{
+    ConvertStringSecurityDescriptorToSecurityDescriptorW, SDDL_REVISION_1,
+};
+use windows::Win32::Security::{PSECURITY_DESCRIPTOR, SECURITY_ATTRIBUTES};
+use windows::Win32::System::Memory::{
+    CreateFileMappingW, MapViewOfFile, UnmapViewOfFile, FILE_MAP_ALL_ACCESS,
+    MEMORY_MAPPED_VIEW_ADDRESS, PAGE_READWRITE,
+};
+use windows::Win32::System::Threading::{CreateEventW, WaitForSingleObject};
+
+// --- kept byte-identical with the driver (frame_transport.rs) ---
+/// Magic stamped into the shared header; its little-endian bytes spell ASCII "PFVD".
+pub const MAGIC: u32 = 0x4456_4650;
+/// Version number carried in the shared header — presumably bumped on a layout change (TODO confirm
+/// against the driver).
+pub const VERSION: u32 = 1;
+/// Ring slots — MUST equal the driver's `RING_LEN` (frame_transport.rs). 6 (was 3) gives ample headroom
+/// so the driver's 0 ms-timeout publish always finds a free slot while the host briefly holds one across
+/// the convert/copy into its output ring and the depth-2 pipelined encode runs on the rest.
+pub const RING_LEN: u32 = 6;
+/// Access mask passed to `CreateSharedHandle`: `DXGI_SHARED_RESOURCE_READ` (0x8000_0000) OR'd with
+/// `DXGI_SHARED_RESOURCE_WRITE` (0x1).
+const DXGI_SHARED_RESOURCE_RW: u32 = 0x8000_0000 | 0x1;
+
+// driver_status codes (the driver writes these; we read+log them).
+const DRV_STATUS_OPENED: u32 = 1;
+const DRV_STATUS_TEX_FAIL: u32 = 2;
+const DRV_STATUS_NO_DEVICE1: u32 = 3;
+
+/// Host-owned output-ring depth: distinct NVENC-input textures rotated per frame so the in-flight
+/// encode of frame N and the convert/copy of frame N+1 never touch the same texture. 3 covers a
+/// pipeline depth of 2 with one slot of margin.
+/// NOTE(review): a depth override above 2 (`PUNKTFUNK_IDD_DEPTH`) would need more slots than this —
+/// confirm the override is clamped where it is read.
+const OUT_RING: usize = 3;
+
+/// Layout of the host-created shared header that the driver opens. `#[repr(C)]` with these field
+/// types gives 64 bytes and no implicit padding (the two `u64`s start at offset 32). The driver
+/// duplicates this layout, so any field change must be mirrored there.
+#[repr(C)]
+struct SharedHeader {
+    /// Expected to hold [`MAGIC`].
+    magic: u32,
+    /// Expected to hold [`VERSION`].
+    version: u32,
+    /// Ring generation; also part of the ring-texture names (see `tex_name`).
+    generation: u32,
+    /// Number of ring slots — presumably [`RING_LEN`] (TODO confirm where the header is filled in).
+    ring_len: u32,
+    /// Ring texture width, presumably in pixels.
+    width: u32,
+    /// Ring texture height, presumably in pixels.
+    height: u32,
+    /// Ring texture format — presumably the raw `DXGI_FORMAT` value.
+    dxgi_format: u32,
+    /// Explicit padding so the following `u64` is 8-byte aligned.
+    _pad: u32,
+    /// Latest published frame, generation-tagged as `gen<<40 | seq<<8 | slot` per the commit message.
+    latest: u64,
+    /// Presentation timestamp of the latest frame — presumably QueryPerformanceCounter ticks.
+    qpc_pts: u64,
+    /// Low part of the driver's render-adapter LUID (presumably written by the driver).
+    driver_render_luid_low: u32,
+    /// High part of the driver's render-adapter LUID.
+    driver_render_luid_high: i32,
+    /// One of the `DRV_STATUS_*` codes; written by the driver, read and logged by the host.
+    driver_status: u32,
+    /// Extra detail accompanying `driver_status` — meaning not visible here (TODO confirm).
+    driver_status_detail: u32,
+}
+
+/// Bring-up debug block (fixed name) — the host creates it; the driver writes diagnostics into it
+/// independent of the per-target header. Byte-identical with the driver's `DebugBlock`.
+/// Ten 4-byte fields under `#[repr(C)]`: 40 bytes, no implicit padding. The field meanings below
+/// are inferred from their names only — NOTE(review): confirm against the driver.
+#[repr(C)]
+struct DebugBlock {
+    /// Expected to hold [`DBG_MAGIC`].
+    magic: u32,
+    run_core_entries: u32,
+    resolved_target_id: u32,
+    header_open_attempts: u32,
+    last_open_error: u32,
+    header_opened: u32,
+    render_luid_low: u32,
+    render_luid_high: i32,
+    frames_acquired: u32,
+    _pad: u32,
+}
+/// Fixed (not per-target) name of the debug block's shared section.
+const DBG_NAME: &str = "Global\\pfvd-dbg";
+/// Debug-block magic; its little-endian bytes spell ASCII "PDBG".
+const DBG_MAGIC: u32 = 0x4742_4450;
+
+/// Name of the per-target shared header section: `Global\pfvd-hdr-<target_id>`.
+fn hdr_name(target_id: u32) -> String {
+    format!("Global\\pfvd-hdr-{target_id}")
+}
+/// Name of the per-target frame-ready event: `Global\pfvd-evt-<target_id>`.
+fn evt_name(target_id: u32) -> String {
+    format!("Global\\pfvd-evt-{target_id}")
+}
+/// Name of one shared ring texture: `Global\pfvd-tex-<target_id>-<generation>-<slot>`. Including the
+/// generation gives every ring generation distinct names.
+fn tex_name(target_id: u32, generation: u32, slot: u32) -> String {
+    format!("Global\\pfvd-tex-{target_id}-{generation}-{slot}")
+}
+// ----------------------------------------------------------------
+
+/// Monotonic per-process generation: each capturer instance stamps its ring-texture names with a
+/// fresh value so a retried/overlapping `open()` never collides with a previous attempt's not-yet-
+/// released shared-handle names (`DXGI_ERROR_NAME_ALREADY_EXISTS`). The driver reads it from the header.
+static IDD_GENERATION: AtomicU32 = AtomicU32::new(1);
+
+/// Wall-clock time as nanoseconds since the Unix epoch. Returns `0` if the system clock reads
+/// earlier than the epoch. This is `SystemTime`, not a monotonic clock, so it can jump when the
+/// clock is adjusted.
+fn now_ns() -> u64 {
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        // `as_nanos()` is u128; the cast truncates to 64 bits, which holds ~584 years of nanoseconds.
+        .map(|d| d.as_nanos() as u64)
+        .unwrap_or(0)
+}
+
+/// One slot of the shared ring as seen by the host: the texture plus the handles/views made on it.
+/// Built by `create_ring_slots`; the `Drop` impl closes `shared`.
+struct HostSlot {
+    /// The shared keyed-mutex texture itself.
+    tex: ID3D11Texture2D,
+    /// Keyed-mutex interface obtained from `tex`.
+    mutex: IDXGIKeyedMutex,
+    /// Named NT shared handle created for `tex`; owned by this slot and closed on drop.
+    shared: HANDLE,
+    /// SRV on the slot texture so the HDR path samples the FP16 slot DIRECTLY (no slot→scratch copy);
+    /// the convert pass writes the output ring while holding the slot's keyed mutex. Unused for SDR
+    /// (which CopyResource's the BGRA slot straight to the output).
+    srv: ID3D11ShaderResourceView,
+}
+
+/// Releases the slot's named shared handle. The COM fields (`tex`, `mutex`, `srv`) release
+/// themselves through their own drops.
+impl Drop for HostSlot {
+    fn drop(&mut self) {
+        unsafe {
+            // Best-effort: a failed close is ignored — there is nothing useful to do about it in drop.
+            let _ = CloseHandle(self.shared);
+        }
+    }
+}
+
+/// Creates + owns the shared ring; yields the driver's frames as [`FramePayload::D3d11`].
+pub struct IddPushCapturer {
+    /// D3D11 device the ring and output textures are created on.
+    device: ID3D11Device,
+    /// Device context paired with `device`.
+    context: ID3D11DeviceContext,
+    /// Virtual-display target id; part of every `Global\pfvd-*` object name.
+    target_id: u32,
+    /// Handle of the shared header's file mapping (presumably from `CreateFileMappingW`).
+    map: HANDLE,
+    /// Pointer to the mapped [`SharedHeader`] — presumably a view of `map`, valid while it stays mapped.
+    header: *mut SharedHeader,
+    /// Frame-ready event handle (see the module docs).
+    event: HANDLE,
+    /// Handle of the debug block's file mapping.
+    dbg_map: HANDLE,
+    /// Pointer to the mapped [`DebugBlock`] — presumably a view of `dbg_map`.
+    dbg_block: *mut DebugBlock,
+    /// Ring width (the negotiated mode's width).
+    width: u32,
+    /// Ring height (the negotiated mode's height).
+    height: u32,
+    /// The shared keyed-mutex ring, `RING_LEN` slots.
+    slots: Vec<HostSlot>,
+    /// The ring/texture generation, bumped every time the ring is recreated at a new format (the
+    /// display's HDR mode flipped). Stamped into the texture names + the header so the driver re-attaches.
+    generation: u32,
+    /// The CLIENT's advertised 10-bit capability (= negotiated `bit_depth >= 10`). Only used at `open`
+    /// to PROACTIVELY enable advanced color (so a 10-bit client gets HDR without a manual toggle); it
+    /// does NOT gate the per-frame conversion — that follows the display, like the WGC path (clients
+    /// under-report 10-bit yet all decode Main10 + auto-detect PQ from the VUI).
+    client_10bit: bool,
+    /// The DISPLAY's CURRENT HDR state (from `advanced_color_enabled`) — the user can flip "Use HDR" in
+    /// Windows mid-session. Drives the ring format (HDR → FP16 surfaces, SDR → BGRA) and the conversion.
+    /// Polled in the capture loop; a change recreates the ring (see [`Self::recreate_ring`]).
+    display_hdr: bool,
+    /// Throttle for the `advanced_color_enabled` poll (a CCD `QueryDisplayConfig`, ~ms — too costly per
+    /// frame at 240 Hz).
+    last_acm_poll: Instant,
+    /// Host-owned ROTATING output ring NVENC encodes (texture + RTV per slot). Rotating it per frame is
+    /// the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
+    /// ASIC, frame N+1's convert/copy writes a DIFFERENT texture on the 3D engine — the two overlap. The
+    /// HDR convert and the SDR copy both write into the current slot. Format = `out_format()` (Rgb10a2 in
+    /// HDR, Bgra in SDR); rebuilt on a display-mode flip. Built lazily.
+    out_ring: Vec<(ID3D11Texture2D, ID3D11RenderTargetView)>,
+    /// Index into `out_ring` used for the rotation — presumably the current or next slot (TODO confirm).
+    out_idx: usize,
+    /// FP16 scRGB → `Rgb10a2` BT.2020 PQ converter, used while the display is HDR. Built lazily.
+    hdr_conv: Option<HdrConverter>,
+    /// Presumably the last `latest` sequence value consumed, used to tell new frames from repeats.
+    last_seq: u64,
+    /// Presumably the most recently presented output texture and its format, kept so a repeat can be
+    /// re-sent when no new frame arrived (TODO confirm).
+    last_present: Option<(ID3D11Texture2D, PixelFormat)>,
+    /// Presumably set once the driver status has been logged, to avoid repeating the log line.
+    status_logged: bool,
+    /// The monitor generation this capturer was opened for. When the active monitor gen changes (a
+    /// reconnect preempted + recreated the monitor), `next_frame` bails immediately so this session
+    /// releases its NVENC encoder instead of lingering on the dead ring's 20s deadline.
+    my_gen: u64,
+    /// Opaque token held for as long as the capturer lives — the virtual display's keepalive/lease.
+    _keepalive: Box<dyn Send>,
+}
+// COM objects used only from the owning (encode) thread.
+// NOTE(review): the struct also holds raw pointers (`header`, `dbg_block`), and via `IDD_PERSIST` one
+// instance is shared across sessions behind a `Mutex` — access is serialized, but not necessarily from
+// a single thread. Confirm the D3D11 device/context tolerate that.
+unsafe impl Send for IddPushCapturer {}
+
+/// The persistent IDD-push capturer, kept alive for the host lifetime and SHARED across client
+/// sessions. The driver's per-session monitor TEARDOWN→RECREATE path is unstable (on session 2 the
+/// target-id resolves to 0, `IddCxSwapChainSetDevice` fails `0x80070057`, then an access violation),
+/// while the FIRST-session path is solid. So we create the monitor + ring + swap-chain ONCE and hand
+/// every later session a thin handle delegating to this one. The persistent capturer holds a monitor
+/// lease for the host lifetime, so `VirtualDisplay::create` always JOINs the same live monitor (same
+/// target id) and the reuse match always hits — no recreate, no driver crash. Prototype scope:
+/// single-client, single-mode (a different mode would need a recreate, the unstable path).
+static IDD_PERSIST: Mutex<Option<IddPushCapturer>> = Mutex::new(None);
+
+/// Open the IDD-push capturer, reusing the persistent one across sessions (see [`IDD_PERSIST`]).
+///
+/// Returns a thin [`IddReuseHandle`]; the real capturer stays in [`IDD_PERSIST`]. Three cases:
+/// * a persistent capturer with the same target id and WxH exists → reuse it, drop `keepalive`, and
+///   update only its client 10-bit flag;
+/// * one exists with a different target or size → error (no recreate in the persistent prototype);
+/// * none exists → create it with [`IddPushCapturer::open`], which takes ownership of `keepalive`.
+///
+/// # Errors
+/// Fails when `preferred` is `None`, on a target/size mismatch, or when `IddPushCapturer::open` fails.
+///
+/// # Panics
+/// Panics if the [`IDD_PERSIST`] mutex is poisoned (`lock().unwrap()`).
+///
+/// NOTE(review): the `capture_virtual_output` call site visible in this change calls
+/// `IddPushCapturer::open` directly, not this function — confirm which path is live.
+pub fn open_or_reuse(
+    target: WinCaptureTarget,
+    preferred: Option<(u32, u32, u32)>,
+    client_10bit: bool,
+    keepalive: Box<dyn Send>,
+) -> Result<Box<dyn Capturer>> {
+    let (w, h, _) =
+        preferred.context("IDD push needs the negotiated mode (WxH) to size the ring")?;
+    let mut slot = IDD_PERSIST.lock().unwrap();
+    // Computed up front from a shared borrow so the `match` below can take `as_mut()`.
+    let reuse = matches!(slot.as_ref(), Some(c) if c.target_id == target.target_id && c.width == w && c.height == h);
+    match slot.as_mut() {
+        Some(c) if reuse => {
+            // Reuse: the persistent capturer already owns the monitor + ring + driver attach. Drop the
+            // new per-session monitor lease (the persistent capturer's lease keeps the monitor live).
+            // The ring tracks the display, not the client; only the client's 10-bit cap can differ.
+            drop(keepalive);
+            c.set_client_10bit(client_10bit);
+            tracing::info!(
+                target_id = target.target_id,
+                client_10bit,
+                "IDD push: reusing the persistent capturer (no monitor/ring recreate)"
+            );
+        }
+        // A persistent capturer exists but for another target/size: refuse rather than recreate.
+        Some(c) => bail!(
+            "IDD-push persistent capturer is {}x{} target {}, this session wants {}x{} target {} — a \
+             mode/target change needs a recreate (the driver's recreate path is unstable); not \
+             supported in the persistent prototype",
+            c.width,
+            c.height,
+            c.target_id,
+            w,
+            h,
+            target.target_id
+        ),
+        None => {
+            tracing::info!(
+                target_id = target.target_id,
+                client_10bit,
+                "IDD push: creating the persistent capturer (first session)"
+            );
+            // On failure the slot stays `None`, so a later session retries the create.
+            *slot = Some(IddPushCapturer::open(target, preferred, client_10bit, keepalive)?);
+        }
+    }
+    Ok(Box::new(IddReuseHandle))
+}
+
+/// Thin per-session handle: every method delegates to the single persistent [`IddPushCapturer`].
+/// Dropping it (session end) does NOT tear down the ring/monitor — that's the whole point.
+///
+/// Each call takes the [`IDD_PERSIST`] lock for its whole duration, so calls on different handles
+/// are serialized — a blocking `next_frame` also holds up `set_active`/`hdr_meta` callers until it
+/// returns. A poisoned lock panics (`lock().unwrap()`).
+struct IddReuseHandle;
+impl Capturer for IddReuseHandle {
+    /// Delegates to the persistent capturer; errors if it is missing.
+    fn next_frame(&mut self) -> Result<CapturedFrame> {
+        IDD_PERSIST
+            .lock()
+            .unwrap()
+            .as_mut()
+            .context("IDD-push persistent capturer missing")?
+            .next_frame()
+    }
+    /// Delegates to the persistent capturer; errors if it is missing.
+    fn try_latest(&mut self) -> Result<Option<CapturedFrame>> {
+        IDD_PERSIST
+            .lock()
+            .unwrap()
+            .as_mut()
+            .context("IDD-push persistent capturer missing")?
+            .try_latest()
+    }
+    /// Delegates to the persistent capturer; a no-op if it is missing.
+    fn set_active(&self, active: bool) {
+        if let Some(c) = IDD_PERSIST.lock().unwrap().as_ref() {
+            c.set_active(active);
+        }
+    }
+    /// Delegates to the persistent capturer; `None` if it is missing.
+    fn hdr_meta(&self) -> Option<punktfunk_core::quic::HdrMeta> {
+        IDD_PERSIST
+            .lock()
+            .unwrap()
+            .as_ref()
+            .and_then(|c| c.hdr_meta())
+    }
+    /// Delegates to the persistent capturer so reuse-handle sessions keep its pipelining depth.
+    /// Without this override the trait default (`1`, synchronous) would apply to every session that
+    /// goes through the handle. Falls back to `1` if the capturer is missing.
+    fn pipeline_depth(&self) -> usize {
+        IDD_PERSIST
+            .lock()
+            .unwrap()
+            .as_ref()
+            .map_or(1, |c| c.pipeline_depth())
+    }
+}
+
+/// Build a permissive (Everyone:GenericAll) `SECURITY_ATTRIBUTES` so the restricted WUDFHost driver
+/// can OPEN the host-created objects — the same `D:(A;;GA;;;WD)` SDDL the gamepad shared section uses.
+/// The returned `psd` backing must outlive `sa`; both are dropped when the process exits.
+///
+/// NOTE(review): `sa.lpSecurityDescriptor` is a raw pointer into the buffer behind `psd`. That buffer
+/// is allocated by `ConvertStringSecurityDescriptorToSecurityDescriptorW`, and the Win32 docs say to
+/// release it with `LocalFree`. Nothing in this function frees it, so each call leaks one small
+/// descriptor unless a caller does — confirm this is acceptable wherever it runs repeatedly.
+///
+/// # Errors
+/// Fails if the SDDL string cannot be converted.
+///
+/// # Safety
+/// The caller must keep the returned `PSECURITY_DESCRIPTOR` alive (unfreed) for as long as the
+/// returned `SECURITY_ATTRIBUTES` is passed to any API.
+unsafe fn permissive_sa() -> Result<(SECURITY_ATTRIBUTES, PSECURITY_DESCRIPTOR)> {
+    let mut psd = PSECURITY_DESCRIPTOR::default();
+    ConvertStringSecurityDescriptorToSecurityDescriptorW(
+        w!("D:(A;;GA;;;WD)"),
+        SDDL_REVISION_1,
+        &mut psd,
+        None,
+    )
+    .context("build SDDL for IDD-push shared objects")?;
+    let sa = SECURITY_ATTRIBUTES {
+        nLength: std::mem::size_of::<SECURITY_ATTRIBUTES>() as u32,
+        lpSecurityDescriptor: psd.0,
+        // The handles created with this attribute block are not inherited by child processes.
+        bInheritHandle: false.into(),
+    };
+    Ok((sa, psd))
+}
+
+impl IddPushCapturer {
+    /// Create the `RING_LEN` shared keyed-mutex textures for one ring generation, at `format` (matched
+    /// to the display's composition format — FP16 in HDR, BGRA in SDR). Each is shared by the name
+    /// `pfvd-tex-<target>-<generation>-<k>` so the driver opens it; a fresh generation gives fresh names
+    /// (so a recreate never collides with the old ring's not-yet-released handles).
+    ///
+    /// Per slot, the named shared handle is created LAST, immediately before it is moved into the
+    /// [`HostSlot`] whose `Drop` closes it. An earlier failure (texture, keyed-mutex cast, SRV)
+    /// therefore cannot leak a raw handle, and slots already built are released when the partially
+    /// filled `Vec` drops on the error return.
+    ///
+    /// # Errors
+    /// Fails if the security attributes cannot be built or any per-slot D3D11/DXGI call fails.
+    ///
+    /// # Safety
+    /// `device` must be a live D3D11 device; the calls are raw D3D11/DXGI COM calls.
+    unsafe fn create_ring_slots(
+        device: &ID3D11Device,
+        target_id: u32,
+        generation: u32,
+        w: u32,
+        h: u32,
+        format: DXGI_FORMAT,
+    ) -> Result<Vec<HostSlot>> {
+        // `_psd` backs `sa.lpSecurityDescriptor`; it must stay alive for the whole loop.
+        let (sa, _psd) = permissive_sa()?;
+        // Every slot uses the same description, so build it once outside the loop.
+        let desc = D3D11_TEXTURE2D_DESC {
+            Width: w,
+            Height: h,
+            MipLevels: 1,
+            ArraySize: 1,
+            // Match the OS-composed swap-chain surfaces so the driver's CopyResource into the slot +
+            // its format-guard both succeed.
+            Format: format,
+            SampleDesc: DXGI_SAMPLE_DESC {
+                Count: 1,
+                Quality: 0,
+            },
+            Usage: D3D11_USAGE_DEFAULT,
+            BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
+            CPUAccessFlags: 0,
+            MiscFlags: (D3D11_RESOURCE_MISC_SHARED_NTHANDLE.0
+                | D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX.0) as u32,
+        };
+        let mut slots = Vec::with_capacity(RING_LEN as usize);
+        for k in 0..RING_LEN {
+            let mut tex: Option<ID3D11Texture2D> = None;
+            device
+                .CreateTexture2D(&desc, None, Some(&mut tex))
+                .context("CreateTexture2D(IDD-push ring slot)")?;
+            let tex = tex.context("null ring texture")?;
+            let mutex: IDXGIKeyedMutex = tex.cast()?;
+            let mut srv: Option<ID3D11ShaderResourceView> = None;
+            device
+                .CreateShaderResourceView(&tex, None, Some(&mut srv))
+                .context("CreateShaderResourceView(IDD-push ring slot)")?;
+            let srv = srv.context("null slot srv")?;
+            // Last fallible step: from here the raw handle goes straight into the owning `HostSlot`.
+            let res1: IDXGIResource1 = tex.cast()?;
+            let shared = res1
+                .CreateSharedHandle(
+                    Some(&sa as *const SECURITY_ATTRIBUTES),
+                    DXGI_SHARED_RESOURCE_RW,
+                    &HSTRING::from(tex_name(target_id, generation, k)),
+                )
+                .context("CreateSharedHandle(IDD-push ring slot)")?;
+            slots.push(HostSlot {
+                tex,
+                mutex,
+                shared,
+                srv,
+            });
+        }
+        Ok(slots)
+    }
+
+    /// Create the host side of the IDD-push channel for `target` and publish it for the driver.
+    ///
+    /// Steps, in order: optionally enable advanced color (only when `client_10bit`), sample the display's
+    /// live HDR state to pick the ring format, create the D3D11 device on the render adapter, create +
+    /// zero the named header mapping, create the frame-ready event and the ring textures, create the
+    /// best-effort debug block, and finally store `MAGIC` with Release ordering so the driver only sees
+    /// a fully initialised header.
+    ///
+    /// * `preferred` — the negotiated `(width, height, hz)`; required, it sizes the ring (hz is unused).
+    /// * `client_10bit` — whether to proactively switch the display to advanced color.
+    /// * `keepalive` — stored in the capturer and dropped with it.
+    ///
+    /// # Errors
+    /// Fails when `preferred` is `None` or any device/mapping/event/texture creation fails. Kernel
+    /// objects created before the failing step are released before returning.
+    pub fn open(
+        target: WinCaptureTarget,
+        preferred: Option<(u32, u32, u32)>,
+        client_10bit: bool,
+        keepalive: Box<dyn Send>,
+    ) -> Result<Self> {
+        let (w, h, _hz) = preferred
+            .context("IDD push needs the negotiated mode (WxH) to size the shared ring")?;
+        // The driver composes the virtual display in FP16 (R16G16B16A16_FLOAT scRGB) when the display is
+        // in advanced-color (HDR) mode, and 8-bit BGRA otherwise (per swap_chain_processor.rs + the
+        // COMMIT_MODES2 colorspace/rgb_bpc log). The user can flip "Use HDR" in Windows at any time, so
+        // the ring format must TRACK the display's ACTUAL mode (the driver's format-guard drops a
+        // mismatch). We poll the live state here and on every recreate. For a 10-bit-capable client we
+        // PROACTIVELY enable advanced color so HDR streams without the user toggling anything; an
+        // SDR-only client leaves the display alone (and still gets a tone-mapped picture, never a freeze,
+        // if the user does enable HDR).
+        unsafe {
+            if client_10bit && crate::vdisplay::sudovda::set_advanced_color(target.target_id, true)
+            {
+                // Let the colorspace change settle before the driver composes + we size the ring.
+                std::thread::sleep(Duration::from_millis(250));
+            }
+            let display_hdr = crate::vdisplay::sudovda::advanced_color_enabled(target.target_id);
+            let ring_fmt = if display_hdr {
+                DXGI_FORMAT_R16G16B16A16_FLOAT
+            } else {
+                DXGI_FORMAT_B8G8R8A8_UNORM
+            };
+            // Create our device on the discrete render GPU (where NVENC runs); the driver must render
+            // the swap-chain on the SAME adapter for the shared textures to open (it reports its actual
+            // render LUID into the header so we can detect a mismatch).
+            let luid = resolve_render_adapter_luid_or(target.adapter_luid);
+            let factory: IDXGIFactory4 = CreateDXGIFactory1().context("CreateDXGIFactory1")?;
+            let adapter: IDXGIAdapter1 = factory
+                .EnumAdapterByLuid(luid)
+                .context("EnumAdapterByLuid(render adapter) for IDD push")?;
+            let (device, context) = make_device(&adapter).context("make_device for IDD push")?;
+
+            let (sa, _psd) = permissive_sa()?;
+            let bytes = std::mem::size_of::<SharedHeader>().max(64);
+
+            // Header.
+            let map = CreateFileMappingW(
+                INVALID_HANDLE_VALUE,
+                Some(&sa),
+                PAGE_READWRITE,
+                0,
+                bytes as u32,
+                &HSTRING::from(hdr_name(target.target_id)),
+            )
+            .context("CreateFileMapping(IDD-push header)")?;
+            let view = MapViewOfFile(map, FILE_MAP_ALL_ACCESS, 0, 0, bytes);
+            if view.Value.is_null() {
+                let _ = CloseHandle(map);
+                bail!("MapViewOfFile failed for IDD-push header");
+            }
+            let generation = IDD_GENERATION.fetch_add(1, Ordering::Relaxed);
+            let header = view.Value.cast::<SharedHeader>();
+            // Zeroing also leaves `magic` at 0 until the explicit publish at the end.
+            std::ptr::write_bytes(header.cast::<u8>(), 0, bytes);
+            (*header).version = VERSION;
+            (*header).generation = generation;
+            (*header).ring_len = RING_LEN;
+            (*header).width = w;
+            (*header).height = h;
+            // Ring format = the display's composition format (FP16 in HDR, BGRA in SDR). The driver
+            // reads this into its `ring_format` and drops any surface that doesn't match.
+            (*header).dxgi_format = ring_fmt.0 as u32;
+
+            // From here until `Self` is built, `Drop` cannot clean up for us: every failure path below
+            // must release the header view + mapping (and the event once it exists) by hand, otherwise
+            // each failed open leaks kernel objects.
+
+            // Frame-ready event (auto-reset).
+            let event = match CreateEventW(
+                Some(&sa),
+                false,
+                false,
+                &HSTRING::from(evt_name(target.target_id)),
+            ) {
+                Ok(ev) => ev,
+                Err(e) => {
+                    let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                        Value: header.cast(),
+                    });
+                    let _ = CloseHandle(map);
+                    return Err(e).context("CreateEvent(IDD-push)");
+                }
+            };
+
+            // Ring of shared keyed-mutex textures, format matched to the display's current mode.
+            let slots = match Self::create_ring_slots(
+                &device,
+                target.target_id,
+                generation,
+                w,
+                h,
+                ring_fmt,
+            ) {
+                Ok(slots) => slots,
+                Err(e) => {
+                    let _ = CloseHandle(event);
+                    let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                        Value: header.cast(),
+                    });
+                    let _ = CloseHandle(map);
+                    return Err(e);
+                }
+            };
+
+            // Bring-up debug block (fixed name) — the driver writes diagnostics here. Best-effort: any
+            // failure just yields a null block, which `log_debug_block` and `Drop` both tolerate.
+            let dbg_bytes = std::mem::size_of::<DebugBlock>();
+            let (dbg_map, dbg_block) = match CreateFileMappingW(
+                INVALID_HANDLE_VALUE,
+                Some(&sa),
+                PAGE_READWRITE,
+                0,
+                dbg_bytes as u32,
+                &HSTRING::from(DBG_NAME),
+            ) {
+                Ok(dm) => {
+                    let dv = MapViewOfFile(dm, FILE_MAP_ALL_ACCESS, 0, 0, dbg_bytes);
+                    if dv.Value.is_null() {
+                        let _ = CloseHandle(dm);
+                        (HANDLE::default(), std::ptr::null_mut())
+                    } else {
+                        let p = dv.Value.cast::<DebugBlock>();
+                        std::ptr::write_bytes(p.cast::<u8>(), 0, dbg_bytes);
+                        (*p).magic = DBG_MAGIC;
+                        (dm, p)
+                    }
+                }
+                Err(_) => (HANDLE::default(), std::ptr::null_mut()),
+            };
+
+            // Publish: magic LAST (Release) — signals the driver the ring is ready to open.
+            std::sync::atomic::fence(Ordering::Release);
+            (*(std::ptr::addr_of!((*header).magic) as *const AtomicU32))
+                .store(MAGIC, Ordering::Release);
+
+            tracing::info!(
+                target_id = target.target_id,
+                render_luid = format!("{:08x}:{:08x}", luid.HighPart, luid.LowPart),
+                mode = format!("{w}x{h}"),
+                display_hdr,
+                client_10bit,
+                ring_fp16 = display_hdr,
+                "IDD push(host): created shared ring; waiting for the driver to attach + publish"
+            );
+            Ok(Self {
+                device,
+                context,
+                target_id: target.target_id,
+                map,
+                header,
+                event,
+                dbg_map,
+                dbg_block,
+                width: w,
+                height: h,
+                slots,
+                generation,
+                client_10bit,
+                display_hdr,
+                last_acm_poll: Instant::now(),
+                out_ring: Vec::new(),
+                out_idx: 0,
+                hdr_conv: None,
+                last_seq: 0,
+                last_present: None,
+                status_logged: false,
+                my_gen: crate::vdisplay::sudovda::CURRENT_MON_GEN.load(Ordering::Relaxed),
+                _keepalive: keepalive,
+            })
+        }
+    }
+
+    /// Acquire-load the header's `latest` publish word (decoded by `try_consume`).
+    #[inline]
+    fn latest(&self) -> u64 {
+        unsafe {
+            // View the plain header field as an atomic so the load carries Acquire ordering.
+            let word = std::ptr::addr_of!((*self.header).latest) as *const AtomicU64;
+            (*word).load(Ordering::Acquire)
+        }
+    }
+
+    /// Report the driver's self-declared status the first time it is non-zero, then never again
+    /// (`status_logged` latches). This is the only driver-visibility channel the host has.
+    fn log_driver_status_once(&mut self) {
+        if self.status_logged {
+            return;
+        }
+        // One snapshot of the four driver-written header fields.
+        let (status, detail, lo, hi) = unsafe {
+            let hdr = self.header;
+            (
+                (*hdr).driver_status,
+                (*hdr).driver_status_detail,
+                (*hdr).driver_render_luid_low,
+                (*hdr).driver_render_luid_high,
+            )
+        };
+        // 0 = the driver has not reported yet; keep polling on later calls.
+        if status == 0 {
+            return;
+        }
+        self.status_logged = true;
+        let render_luid = format!("{hi:08x}:{lo:08x}");
+        if status == DRV_STATUS_OPENED {
+            tracing::info!(
+                render_luid,
+                "IDD push: driver attached to the shared ring"
+            );
+        } else if status == DRV_STATUS_TEX_FAIL {
+            tracing::error!(
+                render_luid,
+                detail = format!("0x{detail:08x}"),
+                "IDD push: driver could NOT open our textures — render-adapter mismatch (it renders on \
+                 a different GPU than where we created the ring)"
+            );
+        } else if status == DRV_STATUS_NO_DEVICE1 {
+            tracing::error!("IDD push: driver has no ID3D11Device1 to open shared resources");
+        } else {
+            let other = status;
+            tracing::warn!(other, render_luid, "IDD push: driver reported an unknown status");
+        }
+    }
+
+    /// Dump the driver's bring-up counters from the fixed-name debug block. The block is independent of
+    /// the per-target header, so it still tells us whether the swap-chain processor ran, which target_id
+    /// it resolved, whether (and with what error) the header opened, and whether frames flowed. Warns
+    /// and returns when the block was never mapped.
+    fn log_debug_block(&self) {
+        // `as_ref` folds the null check and the dereference into one step.
+        let d = match unsafe { self.dbg_block.as_ref() } {
+            Some(block) => block,
+            None => {
+                tracing::warn!("IDD push DEBUG: no debug block");
+                return;
+            }
+        };
+        tracing::error!(
+            run_core_entries = d.run_core_entries,
+            resolved_target_id = d.resolved_target_id,
+            header_open_attempts = d.header_open_attempts,
+            last_open_error = format!("0x{:08x}", d.last_open_error),
+            header_opened = d.header_opened,
+            driver_render_luid = format!("{:08x}:{:08x}", d.render_luid_high, d.render_luid_low),
+            frames_acquired = d.frames_acquired,
+            "IDD push DEBUG: driver-reported diagnostics (run_core_entries=0 ⇒ swap-chain processor \
+             never ran; resolved_target_id≠ours ⇒ name mismatch; last_open_error 0x80070002 ⇒ header \
+             not found; frames_acquired=0 ⇒ idle display)"
+        );
+    }
+
+    /// Output-texture format and the [`PixelFormat`] it is presented as, chosen ONLY from the display's
+    /// HDR state (as on the WGC path): HDR → `Rgb10a2` (BT.2020 PQ → NVENC Main10, the client
+    /// auto-detects PQ from the HEVC VUI); SDR → 8-bit `Bgra`. The client's advertised
+    /// `VIDEO_CAP_10BIT` is deliberately NOT consulted — clients under-report it (e.g. the Mac advertises
+    /// 10-bit only when its OWN display is HDR), yet all decode Main10 + auto-switch.
+    fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) {
+        match self.display_hdr {
+            true => (DXGI_FORMAT_R10G10B10A2_UNORM, PixelFormat::Rgb10a2),
+            false => (DXGI_FORMAT_B8G8R8A8_UNORM, PixelFormat::Bgra),
+        }
+    }
+
+    /// Format of the shared ring textures, following the display's composition format: FP16 while the
+    /// display is HDR, BGRA while it is SDR.
+    fn ring_format(&self) -> DXGI_FORMAT {
+        match self.display_hdr {
+            true => DXGI_FORMAT_R16G16B16A16_FLOAT,
+            false => DXGI_FORMAT_B8G8R8A8_UNORM,
+        }
+    }
+
+    /// Update the client's 10-bit capability (the reuse path). Only affects whether a fresh `open`
+    /// proactively enables advanced color; the per-frame conversion follows the display, not the client.
+    ///
+    /// Within this impl the stored flag is otherwise only echoed in the HDR-flip log line of
+    /// `poll_display_hdr`; it does not touch the ring or the output format.
+    fn set_client_10bit(&mut self, client_10bit: bool) {
+        self.client_10bit = client_10bit;
+    }
+
+    /// Recreate the ring at the format for `new_display_hdr` (the user flipped "Use HDR"). Bumps the
+    /// generation so the driver re-attaches ([`is_stale`]) to the new-format textures; clears the
+    /// header's `latest` so we don't consume a stale slot from the old ring; drops the conversion
+    /// textures so they rebuild at the new format.
+    ///
+    /// # Errors
+    /// Returns the texture-creation error. In that case NOTHING is committed: `display_hdr`, the slots,
+    /// the header and the output ring are all left as they were, so the next `poll_display_hdr` still
+    /// sees the mismatch and retries.
+    fn recreate_ring(&mut self, new_display_hdr: bool) -> Result<()> {
+        // Derive the ring format from the ARGUMENT rather than flipping `self.display_hdr` first: if the
+        // slot creation below fails, the capturer must keep describing the ring it actually has
+        // (otherwise `out_format`/`hdr_meta` would report the new mode and the poll would never retry).
+        let fmt = if new_display_hdr {
+            DXGI_FORMAT_R16G16B16A16_FLOAT
+        } else {
+            DXGI_FORMAT_B8G8R8A8_UNORM
+        };
+        let new_gen = IDD_GENERATION.fetch_add(1, Ordering::Relaxed);
+        let new_slots = unsafe {
+            Self::create_ring_slots(
+                &self.device,
+                self.target_id,
+                new_gen,
+                self.width,
+                self.height,
+                fmt,
+            )?
+        };
+        // The only fallible step succeeded — commit the new mode.
+        self.display_hdr = new_display_hdr;
+        unsafe {
+            // Clear `latest` to the 0 sentinel (generation 0, which try_consume rejects). The real guard
+            // against consuming an unwritten new-ring slot is the generation tag in `latest`: a stale
+            // old-ring publish racing this recreate carries the OLD generation and is rejected. We wait
+            // for the driver's first NEW-generation publish.
+            (*(std::ptr::addr_of!((*self.header).latest) as *const AtomicU64))
+                .store(0, Ordering::Relaxed);
+            (*self.header).dxgi_format = fmt.0 as u32;
+            // Publish the new generation LAST (Release): when the driver observes it (Acquire) the new
+            // textures already exist and the format is already updated.
+            std::sync::atomic::fence(Ordering::Release);
+            (*(std::ptr::addr_of!((*self.header).generation) as *const AtomicU32))
+                .store(new_gen, Ordering::Release);
+        }
+        self.slots = new_slots; // drops the old slots → closes their shared handles + SRVs
+        self.generation = new_gen;
+        self.last_seq = 0;
+        self.out_ring.clear(); // the output format changed → rebuild lazily at the new format
+        self.out_idx = 0;
+        self.last_present = None;
+        Ok(())
+    }
+
+    /// Check the display's live HDR state at most once per 250 ms and recreate the ring when it differs
+    /// from `display_hdr` (the user flipped "Use HDR"). Runs from the capture loop — including while
+    /// frozen on a format mismatch — so a toggle is picked up within one poll interval. A failed
+    /// recreate is logged, not propagated.
+    fn poll_display_hdr(&mut self) {
+        const POLL_INTERVAL: Duration = Duration::from_millis(250);
+        if self.last_acm_poll.elapsed() < POLL_INTERVAL {
+            return;
+        }
+        self.last_acm_poll = Instant::now();
+        let live_hdr = unsafe { crate::vdisplay::sudovda::advanced_color_enabled(self.target_id) };
+        if live_hdr != self.display_hdr {
+            tracing::info!(
+                target_id = self.target_id,
+                display_hdr = live_hdr,
+                client_10bit = self.client_10bit,
+                "IDD push: display HDR mode flipped — recreating the ring at the new format"
+            );
+            if let Err(err) = self.recreate_ring(live_hdr) {
+                tracing::warn!(error = %format!("{err:#}"), "IDD push: ring recreate failed");
+            }
+        }
+    }
+
+    /// Build the host-owned output ring (`OUT_RING` textures at [`Self::out_format`] + RTVs) if not yet
+    /// built. Rotated per frame so the in-flight encode of N and the convert/copy of N+1 touch different
+    /// textures. Rebuilt (cleared) when the display-mode flip changes the output format.
+    ///
+    /// # Errors
+    /// Returns the texture / render-target-view creation error. The ring is installed all-or-nothing: on
+    /// failure `out_ring` stays empty, so the next call retries from scratch.
+    fn ensure_out_ring(&mut self) -> Result<()> {
+        if !self.out_ring.is_empty() {
+            return Ok(());
+        }
+        let (format, _) = self.out_format();
+        let desc = D3D11_TEXTURE2D_DESC {
+            Width: self.width,
+            Height: self.height,
+            MipLevels: 1,
+            ArraySize: 1,
+            Format: format,
+            SampleDesc: DXGI_SAMPLE_DESC {
+                Count: 1,
+                Quality: 0,
+            },
+            Usage: D3D11_USAGE_DEFAULT,
+            BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
+            CPUAccessFlags: 0,
+            MiscFlags: 0,
+        };
+        // Build into a local and install it only once complete. Pushing straight into `self.out_ring`
+        // would leave a SHORT ring behind after a mid-loop failure; the `is_empty` guard above would then
+        // accept it, and frames in flight could end up sharing a texture.
+        let mut ring = Vec::with_capacity(OUT_RING);
+        for _ in 0..OUT_RING {
+            let mut t: Option<ID3D11Texture2D> = None;
+            let mut rtv: Option<ID3D11RenderTargetView> = None;
+            unsafe {
+                self.device
+                    .CreateTexture2D(&desc, None, Some(&mut t))
+                    .context("CreateTexture2D(IDD out ring)")?;
+                let t = t.context("null out-ring texture")?;
+                self.device
+                    .CreateRenderTargetView(&t, None, Some(&mut rtv))
+                    .context("CreateRenderTargetView(IDD out ring)")?;
+                ring.push((t, rtv.context("null out-ring rtv")?));
+            }
+        }
+        self.out_ring = ring;
+        Ok(())
+    }
+
+    /// Lazily construct the HDR converter; a no-op once it exists. Only the HDR-display path needs it —
+    /// an SDR display is a plain copy.
+    fn ensure_converter(&mut self) -> Result<()> {
+        if self.hdr_conv.is_some() {
+            return Ok(());
+        }
+        let converter = unsafe { HdrConverter::new(&self.device)? };
+        self.hdr_conv = Some(converter);
+        Ok(())
+    }
+
+    /// Try to pick up the driver's most recent publish and turn it into a [`CapturedFrame`].
+    ///
+    /// Returns `Ok(None)` when there is nothing new to hand out: the publish belongs to another ring
+    /// generation, its sequence number equals the last one consumed, its slot index is out of range, or
+    /// the slot's keyed mutex could not be acquired within 8 ms. Otherwise the slot is converted (HDR)
+    /// or copied (SDR) into the next output-ring texture, which is returned and also remembered for
+    /// `repeat_last`.
+    ///
+    /// # Errors
+    /// Propagates failures from building the output ring or the HDR converter.
+    fn try_consume(&mut self) -> Result<Option<CapturedFrame>> {
+        self.log_driver_status_once();
+        // Follow the display: a "Use HDR" flip recreates the ring at the matching format.
+        self.poll_display_hdr();
+        let latest = self.latest();
+        // `latest` = (generation << 40) | (seq << 8) | slot. Reject any publish whose generation isn't
+        // our CURRENT ring (a stale old-ring publish racing a recreate, or the 0 sentinel we reset to) so
+        // we never consume an unwritten new-ring slot — eliminating the toggle-time garbage frame.
+        if (latest >> 40) as u32 != self.generation {
+            return Ok(None);
+        }
+        let seq = (latest >> 8) & 0xFFFF_FFFF;
+        let slot = (latest & 0xff) as usize;
+        if seq == self.last_seq || slot >= self.slots.len() {
+            return Ok(None);
+        }
+        self.ensure_out_ring()?;
+        // Build the HDR converter BEFORE acquiring the slot so nothing between Acquire and Release can
+        // `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot).
+        if self.display_hdr {
+            self.ensure_converter()?;
+        }
+        let i = self.out_idx;
+        let (out, out_rtv) = {
+            let (t, rtv) = &self.out_ring[i];
+            (t.clone(), rtv.clone())
+        };
+        let (_, pf) = self.out_format();
+
+        // Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the
+        // ~3 ms encode — NVENC reads the host out-ring slot, not the keyed-mutex slot), so the driver gets
+        // the slot back immediately and the encode of the PREVIOUS frame overlaps this convert.
+        let s = &self.slots[slot];
+        // `IDXGIKeyedMutex::AcquireSync` reports a timeout as WAIT_TIMEOUT (0x102) and an abandoned mutex
+        // as WAIT_ABANDONED (0x80). Both are NON-negative HRESULTs, so the `Result`-returning wrapper
+        // turns them into `Ok(())` and an `is_err()` test would treat a timed-out acquire as success —
+        // reading the slot without owning it. Call through the vtable to get the raw HRESULT and accept
+        // only S_OK (0); anything else means "not acquired", so skip this publish and retry next call.
+        let acquire_hr = unsafe {
+            (windows::core::Interface::vtable(&s.mutex).AcquireSync)(
+                windows::core::Interface::as_raw(&s.mutex),
+                0,
+                8,
+            )
+        };
+        if acquire_hr.0 != 0 {
+            return Ok(None);
+        }
+        unsafe {
+            if self.display_hdr {
+                // Sample the FP16 slot's SRV directly (no scratch copy) → BT.2020 PQ Rgb10a2.
+                if let Some(conv) = self.hdr_conv.as_ref() {
+                    conv.convert(&self.context, &s.srv, &out_rtv, self.width, self.height);
+                }
+            } else {
+                // SDR: the slot is already 8-bit BGRA — one copy into the out-ring (hidden by pipelining).
+                self.context.CopyResource(&out, &s.tex);
+            }
+            let _ = s.mutex.ReleaseSync(0);
+        }
+        // Advance the rotation only after a frame was actually produced.
+        self.out_idx = (i + 1) % self.out_ring.len();
+        self.last_seq = seq;
+        self.last_present = Some((out.clone(), pf));
+        Ok(Some(CapturedFrame {
+            width: self.width,
+            height: self.height,
+            pts_ns: now_ns(),
+            format: pf,
+            payload: FramePayload::D3d11(D3d11Frame {
+                texture: out,
+                device: self.device.clone(),
+            }),
+        }))
+    }
+
+    /// Re-wrap the most recently presented output texture as a new frame with a fresh timestamp, or
+    /// `None` when nothing has been presented yet (or since the last ring recreate).
+    fn repeat_last(&self) -> Option<CapturedFrame> {
+        let (tex, pf) = self.last_present.as_ref()?;
+        Some(CapturedFrame {
+            width: self.width,
+            height: self.height,
+            pts_ns: now_ns(),
+            format: *pf,
+            payload: FramePayload::D3d11(D3d11Frame {
+                texture: tex.clone(),
+                device: self.device.clone(),
+            }),
+        })
+    }
+}
+
+/// Diagnostic observer (O3.1): stand up the IDD-push ring + debug block from the SYSTEM host
+/// (LocalSystem — proper privileges, the gamepad pattern) NEXT TO the normal WGC path, which supplies
+/// the presentation trigger. It reports whether the driver's `run_core` ran and pushed frames into a
+/// host-created ring — settling the `run_core=0` ambiguity (a user-created ring may be unwritable by
+/// the driver). Gated by `PUNKTFUNK_IDD_PUSH_OBSERVE`. The spawned thread samples 40 times, 750 ms
+/// apart, then logs the totals and the debug block and exits.
+pub fn spawn_observer(target: WinCaptureTarget, preferred: Option<(u32, u32, u32)>) {
+    std::thread::spawn(move || {
+        let tid = target.target_id;
+        tracing::info!(
+            target_id = tid,
+            "IDD push OBSERVER: creating host ring (LocalSystem) + debug block alongside WGC"
+        );
+        // The observer never enables advanced color (`client_10bit = false`) and has nothing to keep
+        // alive, hence the unit keepalive.
+        let mut cap = match IddPushCapturer::open(target, preferred, false, Box::new(())) {
+            Ok(cap) => cap,
+            Err(e) => {
+                tracing::warn!(
+                    target_id = tid,
+                    "IDD push OBSERVER: ring open failed: {e:#}"
+                );
+                return;
+            }
+        };
+        let mut frames = 0u32;
+        for _ in 0..40 {
+            match cap.try_consume() {
+                Ok(Some(_)) => frames += 1,
+                Ok(None) => {}
+                Err(e) => tracing::warn!("IDD push OBSERVER: consume error: {e:#}"),
+            }
+            std::thread::sleep(Duration::from_millis(750));
+        }
+        tracing::info!(
+            target_id = tid,
+            frames_from_ring = frames,
+            "IDD push OBSERVER: sampling done"
+        );
+        cap.log_debug_block();
+    });
+}
+
+/// LUID of the discrete render GPU (where NVENC runs). When it cannot be resolved, unpack
+/// `fallback_packed` — the monitor's `OsAdapterLuid` — instead: low 32 bits → `LowPart`, high 32 bits
+/// → `HighPart`.
+fn resolve_render_adapter_luid_or(fallback_packed: i64) -> LUID {
+    let resolved = unsafe { crate::vdisplay::sudovda::resolve_render_adapter_luid() };
+    match resolved {
+        Some(luid) => luid,
+        None => LUID {
+            LowPart: (fallback_packed & 0xffff_ffff) as u32,
+            HighPart: (fallback_packed >> 32) as i32,
+        },
+    }
+}
+
+impl Capturer for IddPushCapturer {
+    /// Block until a frame can be returned: a freshly consumed one if available, otherwise a repeat of
+    /// the last presented frame. Each round waits up to 16 ms on the frame-ready event. Only while there
+    /// is neither a new frame nor anything to repeat does the 20 s timeout apply; on expiry the debug
+    /// block is logged and an error carrying the driver's status fields is returned.
+    fn next_frame(&mut self) -> Result<CapturedFrame> {
+        const FIRST_FRAME_TIMEOUT: Duration = Duration::from_secs(20);
+        let started = Instant::now();
+        loop {
+            // The wait result is ignored on purpose: signalled or timed out, we look at the header.
+            let _ = unsafe { WaitForSingleObject(self.event, 16) };
+            let fresh = self.try_consume()?;
+            if let Some(frame) = fresh.or_else(|| self.repeat_last()) {
+                return Ok(frame);
+            }
+            if started.elapsed() <= FIRST_FRAME_TIMEOUT {
+                continue;
+            }
+            self.log_debug_block();
+            let (st, detail, lo, hi) = unsafe {
+                let hdr = self.header;
+                (
+                    (*hdr).driver_status,
+                    (*hdr).driver_status_detail,
+                    (*hdr).driver_render_luid_low,
+                    (*hdr).driver_render_luid_high,
+                )
+            };
+            bail!(
+                "no IDD-push frame within 20s (target {}) — driver_status={st} detail=0x{detail:08x} \
+                 driver_render_luid={hi:08x}:{lo:08x}. 0=driver never attached (swap-chain not \
+                 assigned / driver not active), 1=attached but no frames (idle desktop?), 2=driver \
+                 couldn't open our textures (render-adapter mismatch).",
+                self.target_id
+            );
+        }
+    }
+
+    /// Non-blocking poll: delegates straight to `try_consume`, so `Ok(None)` means "no new frame right
+    /// now" (no event wait and no repeat of the last frame happen here).
+    fn try_latest(&mut self) -> Result<Option<CapturedFrame>> {
+        self.try_consume()
+    }
+
+    /// HDR mastering metadata for the stream: the generic HDR10 baseline while the display is HDR,
+    /// `None` while it is SDR.
+    fn hdr_meta(&self) -> Option<punktfunk_core::quic::HdrMeta> {
+        // While the display is HDR we emit BT.2020 PQ (Rgb10a2) → the encoder forces HEVC Main10 + the
+        // PQ VUI; pair that with a mastering-display SEI so any decoder tone-maps from a real grade. The
+        // driver doesn't (yet) forward the OS's IDDCX_HDR10_METADATA, so use the generic HDR10 baseline
+        // (the same metadata the native HDR path sends on the 0xCE datagram).
+        if self.display_hdr {
+            Some(crate::hdr::generic_hdr10())
+        } else {
+            None
+        }
+    }
+
+    /// Number of frames the encode loop may keep in flight. Defaults to 2; `PUNKTFUNK_IDD_DEPTH`
+    /// overrides it when it parses as an unsigned integer. The result is always within `1..=OUT_RING`.
+    fn pipeline_depth(&self) -> usize {
+        // 2 = one frame deferred: submit N+1 (capture + convert/copy into a fresh out-ring texture) while
+        // NVENC encodes N on the ASIC. We hand a rotating `OUT_RING` of output textures, so this is safe.
+        // `PUNKTFUNK_IDD_DEPTH` overrides (1 disables pipelining; clamp to ≤ OUT_RING so a frame in flight
+        // always has its own texture).
+        let requested = match std::env::var("PUNKTFUNK_IDD_DEPTH") {
+            Ok(raw) => raw.parse::<usize>().ok(),
+            Err(_) => None,
+        };
+        requested.unwrap_or(2).clamp(1, OUT_RING)
+    }
+}
+
+/// Teardown: release the ring slots first, then the debug-block view + mapping, then the header view,
+/// and finally the event and header-mapping handles. All Win32 results are ignored (best-effort).
+impl Drop for IddPushCapturer {
+    fn drop(&mut self) {
+        // Drop the slots explicitly so they go before the header is unmapped below.
+        self.slots.clear();
+        unsafe {
+            // The debug block is best-effort in `open`: both the pointer and the handle may be unset.
+            if !self.dbg_block.is_null() {
+                let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                    Value: self.dbg_block.cast(),
+                });
+            }
+            if !self.dbg_map.is_invalid() {
+                let _ = CloseHandle(self.dbg_map);
+            }
+            if !self.header.is_null() {
+                let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                    Value: self.header.cast(),
+                });
+            }
+            // Unlike the debug handles, these two are closed unconditionally.
+            let _ = CloseHandle(self.event);
+            let _ = CloseHandle(self.map);
+        }
+        // _keepalive drops after, REMOVEing the virtual display.
+    }
+}
@@ -278,6 +278,13 @@ unsafe fn spawn_inner(cmdline: &str, w: u32, h: u32, hz: u32) -> Result<HelperRe
    }
    tracing::info!(pid = pi.dwProcessId, mode = %format!("{w}x{h}@{hz}"), "WGC helper spawned");

+    // The helper does the WGC capture + NVENC encode, but it runs under the user's UAC-FILTERED token
+    // (no SE_INC_BASE_PRIORITY), so it can't raise its OWN GPU scheduling-priority class — under a
+    // GPU-saturating game NVENC then gets starved (the "240→40 fps in-game collapse"). The SYSTEM host
+    // holds the privilege, so stamp the HIGH GPU priority class onto the child here, right after spawn
+    // (the process-level class applies to the GPU contexts the helper creates afterwards).
+    crate::capture::dxgi::set_child_gpu_priority_class(pi.hProcess);
+
    // stderr → host tracing, line by line.
    let err_handle = HandleReader(err_r);
    std::thread::Builder::new()
@@ -127,8 +127,15 @@ fn run(
                refresh_hz: cfg.fps,
            })
            .context("create virtual output at client resolution")?;
+        // `want_hdr=false`: the IDD-push backend (opt-in PUNKTFUNK_IDD_PUSH) has no monitor-HDR
+        // auto-detection — it converts its always-FP16 ring per this flag — and GameStream HDR is not
+        // negotiated into StreamConfig here, so an IDD-push GameStream session streams SDR even on an
+        // HDR desktop. (The default WGC backend DOES auto-detect HDR from the output colorspace, but
+        // IDD-push bypasses WGC.) Acceptable for the experimental IDD-push A/B path; HDR over IDD-push
+        // is wired only for punktfunk/1 (want_hdr = negotiated bit_depth >= 10). TODO: derive want_hdr
+        // from a GameStream HDR flag once StreamConfig carries one.
        let mut capturer =
-            capture::capture_virtual_output(vout).context("capture virtual output")?;
+            capture::capture_virtual_output(vout, false).context("capture virtual output")?;
        capturer.set_active(true);
        return stream_body(&mut *capturer, &sock, cfg, running, force_idr, rfi_range);
    }
@@ -2149,6 +2149,22 @@ fn session_watcher_loop(tx: std::sync::mpsc::Sender<SessionSwitch>, stop: Arc<At
 /// keepalive, the virtual output) while the data-plane `session` continues untouched —
 /// the rebuilt encoder opens with an IDR + in-band parameter sets. `probe_rx`/`probe_result_tx`
 /// carry speed-test bursts (see [`service_probes`]).
+// NOTE(review): the `///` lines directly above appear to be the tail of the doc comment for the
+// function that follows these statics. Because this item was inserted between that comment and the
+// function's attributes, rustdoc attaches those lines to this static instead — consider moving the
+// statics above that doc comment.
+/// The stop flag of the current in-process IDD-push session, so a NEW connection can PREEMPT it.
+/// A fresh connection means the prior client is gone (a reconnect) and a reused IddCx monitor's
+/// swap-chain is dead — so we stop the prior session (it releases its monitor cleanly while frames
+/// still flow), then build a fresh one, instead of joining a dying session or tearing its monitor out
+/// from under it (which churns the driver's ADD/REMOVE path and wedges it under rapid reconnects).
+///
+/// `None` until the first IDD-push session registers its flag; each new session swaps its own flag in
+/// and sets the previous one.
+#[cfg(target_os = "windows")]
+static IDD_SESSION_STOP: std::sync::Mutex<Option<Arc<AtomicBool>>> = std::sync::Mutex::new(None);
+
+/// Serializes IDD-push session SETUP (preempt + monitor create + first frame). Held across setup,
+/// released before the encode loop — so a reconnect FLOOD can never run concurrent monitor
+/// create/teardown (the churn that fails the ADD IOCTL and wedges the driver). Each session finishes
+/// setup before the next acquires this and preempts it, by which point the preempted session is in its
+/// encode loop and releases its monitor promptly.
+///
+/// The mutex guards no data (`()`); only holding the guard matters.
+#[cfg(target_os = "windows")]
+static IDD_SETUP_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
 #[allow(clippy::too_many_arguments)]
 fn virtual_stream(
    session: Session,
@@ -2197,9 +2213,30 @@ fn virtual_stream(
        bit_depth,
        "punktfunk/1 virtual display"
    );
+    // IDD-push reconnect preempt: a fresh connection means the prior client is gone. Hold IDD_SETUP_LOCK
+    // across the preempt + pipeline build so a reconnect FLOOD can't run concurrent monitor
+    // create/teardown. Then STOP the prior session (it ends cleanly while its monitor still composites
+    // frames) and WAIT for it to release its monitor, before building a FRESH one — instead of the
+    // driver-churning teardown of a monitor under a still-live session. Register THIS session's stop so
+    // the next reconnect preempts it.
+    #[cfg(target_os = "windows")]
+    let idd_setup_guard = std::env::var_os("PUNKTFUNK_IDD_PUSH")
+        .is_some()
+        .then(|| IDD_SETUP_LOCK.lock().unwrap());
+    #[cfg(target_os = "windows")]
+    if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() {
+        let prev = IDD_SESSION_STOP.lock().unwrap().replace(stop.clone());
+        if let Some(prev_stop) = prev {
+            prev_stop.store(true, Ordering::SeqCst);
+            crate::vdisplay::sudovda::wait_for_monitor_released(std::time::Duration::from_secs(3));
+        }
+    }
    let mut vd = crate::vdisplay::open(compositor)?;
    let (mut capturer, mut enc, mut frame, mut interval) =
        build_pipeline_with_retry(&mut vd, mode, bitrate_kbps, bit_depth)?;
+    // Setup done — release the IDD-push setup lock so the next reconnect can begin (and preempt us).
+    #[cfg(target_os = "windows")]
+    drop(idd_setup_guard);

    // Windows single-process DDA path (PUNKTFUNK_NO_WGC=1): the SudoVDA virtual display, isolated as the
    // SOLE active output, goes into fullscreen independent-flip (one plane on one display) which Desktop
@@ -2276,6 +2313,17 @@ fn virtual_stream(
    let mut capture_rebuilds: u32 = 0;
    // Last HDR mastering metadata we forwarded — re-sent as 0xCE on change/keyframe (see below).
    let mut last_hdr_meta: Option<punktfunk_core::quic::HdrMeta> = None;
+    // Frames submitted to NVENC but not yet polled (capture_ns, pacing deadline). With a capturer that
+    // hands a fresh output texture per frame, the loop submits N+1 before polling N (pipeline depth > 1),
+    // overlapping the convert/copy of N+1 on the 3D engine with the encode of N on the NVENC ASIC.
+    let mut inflight: std::collections::VecDeque<(u64, std::time::Instant)> =
+        std::collections::VecDeque::new();
+    // Diagnostic: distinguish NEW captured frames (the source produced a fresh frame) from REPEATS (the
+    // loop re-encoded the last frame because `try_latest` had nothing). A low new-frame rate at a high
+    // send rate ⇒ the capture source isn't producing frames (e.g. an IDD virtual display that DWM
+    // isn't compositing), NOT an encoder problem. Logged every 2 s when `PUNKTFUNK_PERF` is set.
+    let (mut diag_new, mut diag_repeat) = (0u64, 0u64);
+    let mut diag_at = std::time::Instant::now();
    while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
        // Mid-stream session switch (the box flipped Gaming↔Desktop): rebuild the WHOLE backend in
        // place — a different compositor at the SAME client mode — keeping the Session + send thread
@@ -2384,9 +2432,10 @@ fn virtual_stream(
        match capturer.try_latest() {
            Ok(Some(f)) => {
                frame = f;
+                diag_new += 1;
                capture_rebuilds = 0; // a delivered frame clears the consecutive-loss counter
            }
-            Ok(None) => {} // no new frame (static desktop / mid-rebuild) — repeat the last frame
+            Ok(None) => diag_repeat += 1, // no new frame (static desktop / mid-rebuild) — repeat the last
            // The capture source died (PipeWire/compositor thread ended, virtual output gone). Rather
            // than tear the whole session down — the client has no reconnect path and would have to
            // cold-restart the handshake — rebuild the pipeline IN PLACE at the current mode, exactly
@@ -2411,6 +2460,18 @@ fn virtual_stream(
                next = std::time::Instant::now();
            }
        }
+        if perf && diag_at.elapsed() >= std::time::Duration::from_secs(2) {
+            let secs = diag_at.elapsed().as_secs_f64();
+            tracing::info!(
+                new_fps = format!("{:.0}", diag_new as f64 / secs),
+                repeat_fps = format!("{:.0}", diag_repeat as f64 / secs),
+                "capture diag: NEW frames from the source vs REPEATS (low new_fps at high send rate ⇒ \
+                 the source isn't producing frames, not an encode stall)"
+            );
+            diag_new = 0;
+            diag_repeat = 0;
+            diag_at = std::time::Instant::now();
+        }
        // The source's static HDR mastering metadata (Windows GetDesc1; None on Linux/SDR) is the
        // single source of truth: hand it to the encoder (in-band SEI on keyframes) and, when it
        // changes, to the client (0xCE). Re-sent on each keyframe below so a dropped best-effort
@@ -2421,13 +2482,26 @@ fn virtual_stream(
        if resend_meta {
            last_hdr_meta = hdr_meta;
        }
+        // How deep to pipeline (1 = synchronous submit→poll, the original behaviour). The IDD-push
+        // capturer hands a rotating ring of output textures, so it returns >1; other capturers default 1.
+        let depth = capturer.pipeline_depth().max(1);
        let capture_ns = now_ns();
        enc.submit(&frame).context("encoder submit")?;
-        // The deadline for this frame's packets (the next frame's due time); the send thread paces
-        // up to here so a high-bitrate frame spreads over the interval instead of bursting.
+        // This frame's pacing deadline (the next frame's due time); the send thread spreads a big frame
+        // up to here. Each in-flight frame carries its own (capture_ns, deadline) for when it's polled.
        next += interval;
+        inflight.push_back((capture_ns, next));
+        // Drain the OLDEST in-flight frames, keeping at most `depth - 1` deferred. At depth 1 this
+        // polls immediately after every submit (synchronous); at depth 2 it polls N right after
+        // submitting N+1, so the encode of N overlaps the convert/copy of N+1. NVENC's `pending` is
+        // FIFO, so poll() returns the oldest submitted frame's AU — matching `inflight.pop_front()`.
        let mut send_gone = false;
-        while let Some(au) = enc.poll().context("encoder poll")? {
+        while inflight.len() >= depth {
+            let au = match enc.poll().context("encoder poll")? {
+                Some(au) => au,
+                None => break, // no AU ready for a submitted frame (shouldn't happen — poll blocks)
+            };
+            let (cap_ns, deadline) = inflight.pop_front().expect("inflight non-empty");
            let flags = if au.keyframe {
                (FLAG_PIC | FLAG_SOF) as u32
            } else {
@@ -2442,12 +2516,12 @@ fn virtual_stream(
                    resend_meta = false;
                }
            }
-            let encode_us = (now_ns().saturating_sub(capture_ns) / 1000) as u32;
+            let encode_us = (now_ns().saturating_sub(cap_ns) / 1000) as u32;
            let msg = FrameMsg {
                data: au.data,
-                capture_ns,
+                capture_ns: cap_ns,
                flags,
-                deadline: next,
+                deadline,
                encode_us,
            };
            // Hand to the send thread; this blocks (backpressure) if it's behind. An Err means it
@@ -2466,6 +2540,28 @@ fn virtual_stream(
            None => next = std::time::Instant::now(),
        }
    }
+    // Drain the in-flight tail (the up-to-`depth - 1` frames submitted but not yet polled) so the last
+    // frames still reach the client instead of being dropped on the way out.
+    while let Some((cap_ns, deadline)) = inflight.pop_front() {
+        let Ok(Some(au)) = enc.poll() else { break };
+        let flags = if au.keyframe {
+            (FLAG_PIC | FLAG_SOF) as u32
+        } else {
+            FLAG_PIC as u32
+        };
+        let encode_us = (now_ns().saturating_sub(cap_ns) / 1000) as u32;
+        let msg = FrameMsg {
+            data: au.data,
+            capture_ns: cap_ns,
+            flags,
+            deadline,
+            encode_us,
+        };
+        if frame_tx.send(msg).is_err() {
+            break;
+        }
+        sent += 1;
+    }
    // Signal the send thread to drain + exit (drop the channel), then join it.
    drop(frame_tx);
    let _ = send_thread.join();
@@ -2484,6 +2580,14 @@ fn should_use_helper() -> bool {
    if std::env::var_os("PUNKTFUNK_NO_HELPER").is_some() || crate::capture::wgc_disabled() {
        return false;
    }
+    // IDD direct-push captures IN-PROCESS in Session 0: the pf-vdisplay driver delivers frames to the
+    // SYSTEM host's session via shared memory and NVENC is headless, so no user-session WGC helper is
+    // needed for VIDEO (and a Session-1 helper couldn't open the Session-0 shared textures anyway).
+    // NOTE: input injection (SendInput) from Session 0 can't reach the user's Session-1 desktop yet —
+    // a known follow-up; this path validates the video transport. See docs/windows-virtual-display-rust-port.md.
+    if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() {
+        return false;
+    }
    std::env::var_os("PUNKTFUNK_FORCE_HELPER").is_some()
        || crate::capture::wgc_relay::running_as_system()
 }
@@ -2576,6 +2680,15 @@ fn virtual_stream_relay(
    let (mut _keepalive, mut relay, mut target, mut effective_hz) = build(&mut vd, mode)?;
    let mut cur_mode = mode;

+    // O3.1: optionally observe the IDD-push ring alongside WGC (WGC = the presentation trigger) to
+    // confirm the 0257 driver pushes frames into a HOST-created ring. Diagnostic only; gated.
+    if std::env::var_os("PUNKTFUNK_IDD_PUSH_OBSERVE").is_some() {
+        crate::capture::idd_push::spawn_observer(
+            target.clone(),
+            Some((cur_mode.width, cur_mode.height, effective_hz)),
+        );
+    }
+
    // The host's own DDA capturer+encoder for the SECURE (Winlogon) desktop, which WGC — and thus the
    // helper — cannot capture. Opened lazily on the first secure transition (so a session that never
    // hits a UAC/lock screen never pays for a second NVENC session), then kept for fast re-switch.
@@ -3014,8 +3127,12 @@ fn build_pipeline(
            "compositor did not honor the requested refresh — encoding at the achieved rate"
        );
    }
-    let mut capturer =
-        crate::capture::capture_virtual_output(vout).context("capture virtual output")?;
+    // HDR vs SDR for the IDD-push conversion: a negotiated 10-bit session (client advertised
+    // VIDEO_CAP_10BIT + host opted in via PUNKTFUNK_10BIT) is our HDR path → BT.2020 PQ Rgb10a2;
+    // otherwise the FP16 IDD frames are converted to 8-bit SDR. (Ignored by non-IDD-push backends,
+    // which auto-detect HDR from the monitor state.)
+    let mut capturer = crate::capture::capture_virtual_output(vout, bit_depth >= 10)
+        .context("capture virtual output")?;
    capturer.set_active(true);
    let frame = capturer.next_frame().context("first frame")?;
    // `bit_depth` is the handshake-negotiated value (8, or 10 = HEVC Main10 when the client
@@ -76,7 +76,7 @@ pub fn run(opts: Options) -> Result<()> {
                    refresh_hz: opts.fps,
                })
                .context("create virtual output")?;
-            capture::capture_virtual_output(vout).context("capture virtual output")?
+            capture::capture_virtual_output(vout, false).context("capture virtual output")?
        }
    };

@@ -9,8 +9,25 @@

 use std::ffi::c_void;
 use std::mem::size_of;
-use std::sync::atomic::{AtomicBool, Ordering};
+use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
 use std::sync::{Arc, Mutex, Once};
+
+/// Monotonic monitor generation. Each [`create_monitor`] stamps the next value onto the [`Monitor`]
+/// and its [`MonitorLease`]s, so a lease whose monitor was already torn down + recreated (the IDD-push
+/// reconnect-preempt path) is ignored on drop instead of decrementing the NEW monitor's refcount.
+///
+/// Starts at 1 and is advanced with `fetch_add`, so the first monitor created gets generation 1.
+static MON_GEN: AtomicU64 = AtomicU64::new(1);
+
+/// The gen of the CURRENTLY-active monitor. A session capturer captures this at open and re-checks it
+/// each frame; when it changes (a reconnect preempted + recreated the monitor), the old session bails
+/// IMMEDIATELY instead of lingering on the dead ring's 20s frame deadline — which would otherwise hold
+/// its NVENC encoder open and exhaust the GPU's encode-session limit under rapid reconnects.
+///
+/// Written by `mgr_acquire` every time it hands out a lease (joined or freshly created monitor);
+/// it stays 0 until the first acquire.
+pub(crate) static CURRENT_MON_GEN: AtomicU64 = AtomicU64::new(0);
+
+/// IDD-push mode: a new client connection preempts + recreates the monitor (single-client reconnect),
+/// because a REUSED IddCx monitor's swap-chain is dead. Off → monitors are shared across sessions.
+///
+/// Returns `true` when the `PUNKTFUNK_IDD_PUSH` environment variable is present, whatever its value
+/// (only presence is tested). The environment is queried on every call; nothing is cached.
+fn idd_push_mode() -> bool {
+    std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some()
+}
 use std::thread::{self, JoinHandle};
 use std::time::{Duration, Instant};

@@ -27,7 +44,8 @@ use windows::Win32::Devices::Display::{
    DISPLAYCONFIG_DEVICE_INFO_GET_SOURCE_NAME, DISPLAYCONFIG_DEVICE_INFO_SET_ADVANCED_COLOR_STATE,
    DISPLAYCONFIG_GET_ADVANCED_COLOR_INFO, DISPLAYCONFIG_MODE_INFO, DISPLAYCONFIG_PATH_INFO,
    DISPLAYCONFIG_SET_ADVANCED_COLOR_STATE, DISPLAYCONFIG_SOURCE_DEVICE_NAME,
-    QDC_ONLY_ACTIVE_PATHS, SDC_ALLOW_CHANGES, SDC_APPLY, SDC_USE_SUPPLIED_DISPLAY_CONFIG,
+    QDC_ONLY_ACTIVE_PATHS, SDC_ALLOW_CHANGES, SDC_APPLY, SDC_FORCE_MODE_ENUMERATION,
+    SDC_SAVE_TO_DATABASE, SDC_USE_SUPPLIED_DISPLAY_CONFIG,
 };
 use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID};
 use windows::Win32::Graphics::Gdi::{
@@ -119,7 +137,9 @@ unsafe fn set_render_adapter(h: HANDLE, luid: LUID) -> Result<()> {
 /// Desktop Duplication (e.g. the RTX 4090). Default: the discrete adapter with the most
 /// `DedicatedVideoMemory`, skipping WARP / Basic-Render and the SudoVDA software adapter (≈0 VRAM).
 /// `PUNKTFUNK_RENDER_ADAPTER=<substring>` forces a match by Description (Apollo's `adapter_name`).
-unsafe fn resolve_render_adapter_luid() -> Option<LUID> {
+/// `pub(crate)` so the IDD direct-push capturer can create its shared textures on the same discrete
+/// GPU it pins here (and where NVENC runs).
+pub(crate) unsafe fn resolve_render_adapter_luid() -> Option<LUID> {
    use windows::Win32::Graphics::Dxgi::{CreateDXGIFactory1, IDXGIFactory1};
    let want = std::env::var("PUNKTFUNK_RENDER_ADAPTER")
        .ok()
@@ -497,13 +517,32 @@ unsafe fn isolate_displays_ccd(keep_target_id: u32) -> Option<SavedConfig> {
        }
    }
    if others == 0 {
-        tracing::info!("display isolate (CCD): SudoVDA target {keep_target_id} already the only active display");
+        // The virtual path shows active in the CCD database (from set_active_mode's legacy
+        // ChangeDisplaySettingsExW), but a legacy mode-set does NOT drive the IddCx adapter's
+        // EVT_IDD_CX_ADAPTER_COMMIT_MODES — and without COMMIT_MODES the OS never calls
+        // ASSIGN_SWAPCHAIN, so the driver never receives composed frames. Force an explicit CCD
+        // SetDisplayConfig commit of the (sole) virtual path so the IddCx path actually activates.
+        // SDC_FORCE_MODE_ENUMERATION makes the OS re-enumerate + re-commit even though the CCD DB
+        // already lists the path active.
+        let rc = SetDisplayConfig(
+            Some(paths.as_slice()),
+            Some(modes.as_slice()),
+            SDC_APPLY
+                | SDC_USE_SUPPLIED_DISPLAY_CONFIG
+                | SDC_ALLOW_CHANGES
+                | SDC_SAVE_TO_DATABASE
+                | SDC_FORCE_MODE_ENUMERATION,
+        );
+        tracing::info!("display isolate (CCD): forced CCD re-commit of sole virtual path {keep_target_id} rc={rc:#x} (drives IddCx COMMIT_MODES → ASSIGN_SWAPCHAIN)");
        return Some(saved);
    }
    let rc = SetDisplayConfig(
        Some(paths.as_slice()),
        Some(modes.as_slice()),
-        SDC_APPLY | SDC_USE_SUPPLIED_DISPLAY_CONFIG | SDC_ALLOW_CHANGES,
+        SDC_APPLY
+            | SDC_USE_SUPPLIED_DISPLAY_CONFIG
+            | SDC_ALLOW_CHANGES
+            | SDC_FORCE_MODE_ENUMERATION,
    );
    if rc == 0 {
        tracing::info!("display isolate (CCD): deactivated {others} other display(s) — SudoVDA target {keep_target_id} is now the sole desktop");
@@ -587,6 +626,8 @@ struct Monitor {
    stop: Arc<AtomicBool>,
    pinger: Option<JoinHandle<()>>,
    ccd_saved: Option<SavedConfig>,
+    /// Generation stamp ([`MON_GEN`]); a [`MonitorLease`] only releases if its gen still matches.
+    gen: u64,
 }

 enum MgrState {
@@ -670,6 +711,14 @@ unsafe fn create_monitor(device: isize, mode: Mode, watchdog_s: u32) -> Result<M
        // PUNKTFUNK_RENDER_ADAPTER=<name substring> only on a box that genuinely needs steering.
        let pinned = if std::env::var("PUNKTFUNK_RENDER_ADAPTER").is_ok() {
            unsafe { resolve_render_adapter_luid() }
+        } else if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() {
+            // P2 direct frame push: the host opens the driver's shared textures AND runs NVENC on the
+            // RENDER adapter, so on a hybrid box (4090 + iGPU) it MUST be the discrete encoder GPU —
+            // an iGPU-rendered surface is untouchable by NVENC. pf-vdisplay HONORS SET_RENDER_ADAPTER
+            // (SudoVDA ignored it), so pin the discrete GPU. The driver also reports the resulting
+            // render LUID in the shared header, so the host binds correctly even if this is overridden.
+            tracing::info!("IDD push: pinning the discrete render GPU (SET_RENDER_ADAPTER)");
+            unsafe { resolve_render_adapter_luid() }
        } else {
            tracing::info!(
                "SudoVDA SET_RENDER_ADAPTER skipped (Apollo-parity: no render pin — avoids cross-GPU \
@@ -735,7 +784,9 @@ unsafe fn create_monitor(device: isize, mode: Mode, watchdog_s: u32) -> Result<M
                    // (the old `let _ =` swallowed it, which masked exactly this during the bad-state churn).
                    Err(e) => {
                        if !warned {
-                            tracing::warn!("SudoVDA keepalive PING failed (control handle lost?): {e:#}");
+                            tracing::warn!(
+                                "SudoVDA keepalive PING failed (control handle lost?): {e:#}"
+                            );
                            warned = true;
                        }
                    }
@@ -796,6 +847,7 @@ unsafe fn create_monitor(device: isize, mode: Mode, watchdog_s: u32) -> Result<M
            stop,
            pinger: Some(pinger),
            ccd_saved,
+            gen: MON_GEN.fetch_add(1, Ordering::Relaxed),
        })
    }
 }
@@ -894,6 +946,39 @@ fn mgr_acquire(mode: Mode) -> Result<VirtualOutput> {
    let device = mgr_ensure_device(&mut g)?;
    let watchdog_s = g.watchdog_s;

+    // IDD-push: a new connection while a monitor is live = a single-client RECONNECT (the prior client
+    // is gone — IDD-push is one display, no concurrency). A REUSED IddCx monitor's swap-chain is DEAD,
+    // so joining it would hand the new client a black screen until the old session times out. PREEMPT:
+    // tear the old monitor down (`teardown` restores topology + sends IOCTL_REMOVE — `Monitor` has no
+    // `Drop` impl) and fall through to create a FRESH one. The old session's lease is gen-stamped, so
+    // its later drop is ignored (mgr_release no-op) and can't tear down the new monitor.
+    if idd_push_mode()
+        && matches!(
+            g.state,
+            MgrState::Active { .. } | MgrState::Lingering { .. }
+        )
+    {
+        if let MgrState::Active { mon, .. } | MgrState::Lingering { mon, .. } =
+            std::mem::replace(&mut g.state, MgrState::Idle)
+        {
+            tracing::info!(
+                old_target = mon.target_id,
+                "IDD-push reconnect — preempting the prior session, recreating a fresh monitor"
+            );
+            // teardown() — NOT drop() — sends IOCTL_REMOVE (and restores topology). `Monitor` has NO
+            // `Drop` impl, so a bare `drop(mon)` orphaned the IddCx monitor in the driver: it was never
+            // departed, so it kept a live D3D device + a stuck swap-chain processor thread, and these
+            // accumulated every reconnect (the driver-side churn leak: +1 device, ~36 nvwgf2umx threads,
+            // ~50 MB VRAM per session, until it choked). teardown frees it via the driver's do_remove.
+            unsafe { mon.teardown(device) };
+            // Let the OS finish the ASYNC IddCx monitor departure before the next ADD. A back-to-back
+            // REMOVE→ADD races the teardown and the ADD IOCTL is rejected (`DeviceIoControl failed`)
+            // under reconnect churn. Held under the MGR lock, but IDD-push setup is already serialized
+            // (IDD_SETUP_LOCK), so this only paces the recreate — exactly what a reconnect flood needs.
+            thread::sleep(Duration::from_millis(400));
+        }
+    }
+
    // A live monitor already exists — join it (refcount++). This covers a concurrent session AND the
    // build-then-drop overlap of a mid-stream Reconfigure / secure-return (the new lease is taken while
    // the old is still held). If the requested mode differs, reconfigure the shared monitor to it so a
@@ -912,11 +997,13 @@ fn mgr_acquire(mode: Mode) -> Result<VirtualOutput> {
        );
        let pm = Some((mon.mode.width, mon.mode.height, mon.mode.refresh_hz));
        let target = mon.target();
+        let gen = mon.gen;
+        CURRENT_MON_GEN.store(gen, Ordering::Relaxed);
        return Ok(VirtualOutput {
            node_id: 0,
            preferred_mode: pm,
            win_capture: target,
-            keepalive: Box::new(MonitorLease),
+            keepalive: Box::new(MonitorLease { gen }),
        });
    }

@@ -937,12 +1024,14 @@ fn mgr_acquire(mode: Mode) -> Result<VirtualOutput> {
    };
    let pm = Some((mon.mode.width, mon.mode.height, mon.mode.refresh_hz));
    let target = mon.target();
+    let gen = mon.gen;
+    CURRENT_MON_GEN.store(gen, Ordering::Relaxed);
    g.state = MgrState::Active { mon, refs: 1 };
    Ok(VirtualOutput {
        node_id: 0,
        preferred_mode: pm,
        win_capture: target,
-        keepalive: Box::new(MonitorLease),
+        keepalive: Box::new(MonitorLease { gen }),
    })
 }

@@ -966,8 +1055,18 @@ unsafe fn mgr_reconfigure(mon: &mut Monitor, mode: Mode) {
 }

 /// Release a session's hold: refcount-- ; when the last session leaves, LINGER before teardown.
-fn mgr_release() {
+/// `gen` is the lease's monitor generation: a STALE lease (its monitor was already torn down +
+/// recreated under it — the IDD-push reconnect-preempt path) does nothing, so it can't decrement the
+/// CURRENT (fresh) monitor's refcount and tear it down.
+fn mgr_release(gen: u64) {
    let mut g = MGR.lock().unwrap();
+    let stale = match &g.state {
+        MgrState::Active { mon, .. } | MgrState::Lingering { mon, .. } => mon.gen != gen,
+        MgrState::Idle => true,
+    };
+    if stale {
+        return;
+    }
    g.state = match std::mem::replace(&mut g.state, MgrState::Idle) {
        MgrState::Active { mon, refs } if refs > 1 => MgrState::Active {
            mon,
@@ -988,6 +1087,28 @@ fn mgr_release() {
    };
 }

+/// Block the calling thread until the manager no longer holds an `Active` monitor, or until
+/// `timeout` has elapsed — whichever comes first.
+///
+/// "Released" means the MGR state is anything other than `MgrState::Active` (the prior session
+/// dropped its lease, leaving the manager `Lingering` or `Idle`). The IDD-push reconnect preempt
+/// calls this right after signalling the old session to stop, so that session can let go of its
+/// monitor CLEANLY before a fresh one is acquired, rather than having the monitor pulled out from
+/// under a still-live session.
+///
+/// The state is sampled every 25 ms. On timeout a warning is logged and the function returns
+/// normally; it never fails, and the caller proceeds either way.
+pub(crate) fn wait_for_monitor_released(timeout: Duration) {
+    const POLL_EVERY: Duration = Duration::from_millis(25);
+    let give_up_at = Instant::now() + timeout;
+    // The lock guard is a temporary of the `while` condition, so MGR is unlocked again before the
+    // loop body runs — the lock is never held across the sleep.
+    while matches!(MGR.lock().unwrap().state, MgrState::Active { .. }) {
+        if Instant::now() >= give_up_at {
+            tracing::warn!(
+                "IDD-push preempt: prior session didn't release the monitor within {timeout:?} — \
+                 proceeding (mgr_acquire will preempt it)"
+            );
+            return;
+        }
+        thread::sleep(POLL_EVERY);
+    }
+}
+
 /// Background timer (started once): tear down a monitor that has lingered past its deadline (→ Idle),
 /// so a physical-screen user gets their screen back after they stop streaming.
 fn ensure_linger_timer() {
@@ -1012,11 +1133,15 @@ fn ensure_linger_timer() {
    });
 }

-/// A session's lease on the shared monitor. Drop releases the refcount (→ linger when it hits 0).
-struct MonitorLease;
+/// A session's lease on the shared monitor. Drop releases the refcount (→ linger when it hits 0),
+/// UNLESS the monitor was already torn down + recreated under it (gen mismatch — the IDD-push
+/// reconnect-preempt path), in which case the drop is a no-op so it can't tear down the new monitor.
+struct MonitorLease {
+    /// Generation of the monitor this lease was issued for (copied from `Monitor::gen` in
+    /// `mgr_acquire`). `mgr_release` compares it against the live monitor's gen to detect a stale lease.
+    gen: u64,
+}
 impl Drop for MonitorLease {
     fn drop(&mut self) {
-        mgr_release();
+        mgr_release(self.gen);
     }
 }

@@ -63,6 +63,22 @@ pub fn run(opts: HelperOptions) -> Result<()> {
        WgcCapturer::open(target, Some((opts.width, opts.height, opts.fps))).context("WGC open")?;
    cap.set_active(true);

+    // O3 present-trigger experiment: spawn a thread that PRESENTS a D3D swapchain to the virtual
+    // display (a present SOURCE), testing whether that — unlike WGC's READ — makes the OS assign the
+    // driver's IddCx swap-chain (so the driver's run_core runs + can push). Gated; diagnostic.
+    if std::env::var_os("PUNKTFUNK_PRESENT_TRIGGER").is_some() {
+        let (w, h) = (opts.width, opts.height);
+        std::thread::Builder::new()
+            .name("pf-present-trigger".into())
+            .spawn(move || {
+                tracing::info!("present-trigger: starting D3D present loop on the virtual display");
+                if let Err(e) = unsafe { present_trigger(w, h) } {
+                    tracing::warn!("present-trigger error: {e:#}");
+                }
+            })
+            .ok();
+    }
+
    // First frame establishes the real dimensions + whether the desktop is HDR (the encoder derives
    // Main10/HDR from the frame's PixelFormat::Rgb10a2). Then open NVENC on the capture device.
    let first = cap.next_frame().context("first WGC frame")?;
@@ -107,47 +123,55 @@ pub fn run(opts: HelperOptions) -> Result<()> {
    let stdout = std::io::stdout();
    let mut out = stdout.lock();

-    // Encode pipeline depth. The loop keeps DEPTH frames in flight so per-frame GPU-scheduling waits
-    // can overlap. NOTE: depth > 1 was measured to REGRESS under a GPU-saturating game — the encodes
-    // serialize on the contended GPU anyway, so a deeper queue just stacks latency (≈ depth × frame
-    // time) without raising throughput. Default 1 (the validated-best); `PUNKTFUNK_ENCODE_DEPTH` (1..=6)
-    // can raise it if a future workload is genuinely encode-throughput-bound rather than scheduling-bound.
-    let depth: usize = std::env::var("PUNKTFUNK_ENCODE_DEPTH")
-        .ok()
-        .and_then(|s| s.trim().parse::<usize>().ok())
-        .filter(|&d| (1..=6).contains(&d))
-        .unwrap_or(1);
-    tracing::info!(depth, "WGC helper: encode pipeline depth");
+    // FIXED-CADENCE encode loop (mirrors the single-process `punktfunk1::virtual_stream` loop). The
+    // host runs as SYSTEM and relays our AUs; to deliver a STEADY `fps` to the client (the "fixed 240"
+    // goal) we must NOT gate on WGC's content-driven FrameArrived — `WgcCapturer::next_frame` blocks up
+    // to its ~8 ms static-repeat timeout when the desktop is quiet, capping a barely-changing desktop
+    // ~125 fps regardless of the GPU. Instead we pace to `1/fps` and take the FRESHEST frame with the
+    // non-blocking `try_latest`, repeating the last one when nothing newer arrived. Depth-1: NVENC's
+    // `poll` (lock_bitstream) blocks until the just-submitted frame is encoded, so exactly one frame is
+    // in flight per iteration. A deeper pipeline was measured to only stack latency under a
+    // GPU-saturating game (the encodes serialize on the contended GPU anyway) — the in-game lever is
+    // the GPU scheduling priority the SYSTEM host stamps on us, not pipeline depth.
+    let interval = std::time::Duration::from_secs_f64(1.0 / opts.fps.max(1) as f64);

    let perf = std::env::var_os("PUNKTFUNK_PERF").is_some();
    let mut frames = 0u64;
-    let mut cap_wait_ns = 0u64;
-    let mut encode_ns = 0u64; // time blocked in lock_bitstream (the oldest in-flight encode)
-    let mut write_ns = 0u64; // time blocked writing the AU to the stdout pipe (relay backpressure)
+    let mut repeats = 0u64; // frames where no newer capture had arrived (duplicate re-encode)
+    let mut cap_ns = 0u64; // time in try_latest (capture + video-processor convert)
+    let mut encode_ns = 0u64; // time blocked in lock_bitstream
+    let mut write_ns = 0u64; // time writing the AU to the stdout pipe (relay backpressure)
    let mut window = std::time::Instant::now();

-    // Prime: submit `depth` frames before the first poll so NVENC has that many encodes in flight.
-    // We don't hold the `CapturedFrame`s past `submit`: NVENC keeps its own registered texture clone
-    // and the capturer's ring/held-set own the canonical refs (sized for `depth`), so the in-flight
-    // inputs stay valid after our clones drop.
-    enc.submit(&first).context("first encoder submit")?;
-    drop(first);
-    for _ in 1..depth {
-        let f = cap.next_frame().context("WGC prime frame")?;
-        enc.submit(&f).context("prime encoder submit")?;
-    }
+    // `frame` is held across iterations and repeated when `try_latest` has nothing newer, so a static
+    // desktop still clocks `fps`. The capturer's held-set / output ring keep its texture alive across
+    // the repeat; reassigning `frame` on a fresh capture drops the prior one (already drained by poll).
+    let mut frame = first;
+    let mut next = std::time::Instant::now();
    loop {
        if kf.swap(false, Ordering::Relaxed) {
            enc.request_keyframe();
        }
-        // Pop + forward the OLDEST in-flight frame (FIFO). With `depth` outstanding it has had
-        // depth-1 frames' worth of GPU slots to finish, so this rarely blocks under load.
-        let p0 = std::time::Instant::now();
-        let polled = enc.poll().context("encoder poll")?;
-        if perf {
-            encode_ns += p0.elapsed().as_nanos() as u64;
+        // Freshest captured frame, or repeat the last (no new composition: static desktop / between a
+        // game's presents). Non-blocking, so the cadence is OURS, not WGC's event rate.
+        let t0 = std::time::Instant::now();
+        match cap.try_latest().context("WGC try_latest")? {
+            Some(f) => frame = f,
+            None => repeats += 1,
        }
-        if let Some(au) = polled {
+        if perf {
+            cap_ns += t0.elapsed().as_nanos() as u64;
+        }
+        enc.submit(&frame).context("encoder submit")?;
+        // Drain the just-submitted frame. NVENC's poll blocks in lock_bitstream until it's encoded, so
+        // this returns exactly one AU (then None) — depth-1, no accumulation.
+        loop {
+            let p0 = std::time::Instant::now();
+            let polled = enc.poll().context("encoder poll")?;
+            if perf {
+                encode_ns += p0.elapsed().as_nanos() as u64;
+            }
+            let Some(au) = polled else { break };
            let w0 = std::time::Instant::now();
            let wrote = write_au(&mut out, &au);
            if perf {
@@ -158,13 +182,13 @@ pub fn run(opts: HelperOptions) -> Result<()> {
                return Ok(());
            }
        }
-        // Refill: capture + submit to keep `depth` frames in flight.
-        let t0 = std::time::Instant::now();
-        let next = cap.next_frame().context("WGC next frame")?;
-        if perf {
-            cap_wait_ns += t0.elapsed().as_nanos() as u64;
+        // Pace to this frame's due time. If we're already past it (encode couldn't keep up under a
+        // GPU-saturating game), skip the sleep and re-baseline so we don't spiral into catch-up.
+        next += interval;
+        match next.checked_duration_since(std::time::Instant::now()) {
+            Some(d) => std::thread::sleep(d),
+            None => next = std::time::Instant::now(),
        }
-        enc.submit(&next).context("encoder submit")?;

        if perf {
            frames += 1;
@@ -174,13 +198,15 @@ pub fn run(opts: HelperOptions) -> Result<()> {
                let per = |ns: u64| format!("{:.2}", ns as f64 / frames as f64 / 1e6);
                tracing::info!(
                    fps = format!("{:.1}", frames as f64 / secs),
-                    cap_wait_ms = per(cap_wait_ns),
+                    repeats,
+                    cap_ms = per(cap_ns),
                    encode_ms = per(encode_ns),
                    write_ms = per(write_ns),
-                    "WGC helper perf (depth-pipelined; encode_ms=lock_bitstream on the oldest)"
+                    "WGC helper perf (fixed-cadence depth-1; encode_ms=lock_bitstream; repeats=duplicated frames)"
                );
                frames = 0;
-                cap_wait_ns = 0;
+                repeats = 0;
+                cap_ns = 0;
                encode_ns = 0;
                write_ns = 0;
                window = std::time::Instant::now();
@@ -197,3 +223,115 @@ fn write_au(out: &mut impl Write, au: &encode::EncodedFrame) -> std::io::Result<
    out.write_all(&au.data)?;
    out.flush()
 }
+
+/// O3 present-trigger experiment (see the gated call in `run`). Creates a small swapchain-backed
+/// window on the virtual display (the CCD-isolated primary) and presents continuously — an active
+/// present SOURCE on the display — to test whether that makes the OS assign the driver's IddCx
+/// swap-chain (which WGC's read does not). Runs forever on its own thread; returns only on `Err`.
+///
+/// # Safety
+/// Win32/D3D11 FFI; called once on a dedicated helper thread.
+unsafe fn present_trigger(disp_w: u32, disp_h: u32) -> Result<()> {
+    use windows::core::{w, Interface};
+    use windows::Win32::Foundation::{HMODULE, HWND, LPARAM, LRESULT, WPARAM};
+    use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE;
+    use windows::Win32::Graphics::Direct3D11::{
+        D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView,
+        ID3D11Texture2D, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_SDK_VERSION,
+    };
+    use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
+    use windows::Win32::Graphics::Dxgi::{
+        IDXGIAdapter, IDXGIDevice, IDXGIFactory2, DXGI_PRESENT, DXGI_SWAP_CHAIN_DESC1,
+        DXGI_SWAP_EFFECT_FLIP_DISCARD, DXGI_USAGE_RENDER_TARGET_OUTPUT,
+    };
+    use windows::Win32::System::LibraryLoader::GetModuleHandleW;
+    use windows::Win32::UI::WindowsAndMessaging::{
+        CreateWindowExW, DefWindowProcW, DispatchMessageW, PeekMessageW, RegisterClassW,
+        ShowWindow, MSG, PM_REMOVE, SW_SHOWNOACTIVATE, WNDCLASSW, WS_EX_NOACTIVATE, WS_EX_TOPMOST,
+        WS_POPUP, WS_VISIBLE,
+    };
+
+    unsafe extern "system" fn wndproc(h: HWND, m: u32, wp: WPARAM, lp: LPARAM) -> LRESULT {
+        DefWindowProcW(h, m, wp, lp) // no custom handling: default processing for every message
+    }
+
+    let hinst: HMODULE = GetModuleHandleW(None)?; // `None` = module of the calling process
+    let cls = w!("pfPresentTrigger");
+    let wc = WNDCLASSW {
+        lpfnWndProc: Some(wndproc),
+        hInstance: hinst.into(),
+        lpszClassName: cls,
+        ..Default::default()
+    };
+    RegisterClassW(&wc); // NOTE(review): result unchecked; a failure should surface just below
+    // Window of at most 96x96 px at the top-left of the (primary = virtual) display so it barely
+    // obscures the captured desktop; topmost + no-activate so it doesn't steal focus.
+    let win_w = disp_w.min(96) as i32;
+    let win_h = disp_h.min(96) as i32;
+    let hwnd: HWND = CreateWindowExW(
+        WS_EX_TOPMOST | WS_EX_NOACTIVATE,
+        cls,
+        w!("pf-present"),
+        WS_POPUP | WS_VISIBLE,
+        0,
+        0,
+        win_w,
+        win_h,
+        None,
+        None,
+        Some(hinst.into()),
+        None,
+    )?;
+    let _ = ShowWindow(hwnd, SW_SHOWNOACTIVATE); // return is prior visibility, not an error code
+
+    let mut device: Option<ID3D11Device> = None;
+    let mut context: Option<ID3D11DeviceContext> = None;
+    D3D11CreateDevice(
+        None, // no explicit adapter; selection is left to the hardware driver type below
+        D3D_DRIVER_TYPE_HARDWARE,
+        HMODULE::default(),
+        D3D11_CREATE_DEVICE_BGRA_SUPPORT,
+        None,
+        D3D11_SDK_VERSION,
+        Some(&mut device),
+        None,
+        Some(&mut context),
+    )?;
+    let device = device.context("present-trigger d3d11 device")?;
+    let context = context.context("present-trigger d3d11 context")?;
+
+    let dxgi_dev: IDXGIDevice = device.cast()?; // device -> adapter -> factory, for the swapchain
+    let adapter: IDXGIAdapter = dxgi_dev.GetAdapter()?;
+    let factory: IDXGIFactory2 = adapter.GetParent()?;
+    let scd = DXGI_SWAP_CHAIN_DESC1 {
+        Width: win_w as u32,
+        Height: win_h as u32,
+        Format: DXGI_FORMAT_B8G8R8A8_UNORM,
+        SampleDesc: DXGI_SAMPLE_DESC {
+            Count: 1,
+            Quality: 0,
+        },
+        BufferUsage: DXGI_USAGE_RENDER_TARGET_OUTPUT,
+        BufferCount: 2,
+        SwapEffect: DXGI_SWAP_EFFECT_FLIP_DISCARD,
+        ..Default::default()
+    };
+    let swapchain = factory.CreateSwapChainForHwnd(&device, hwnd, &scd, None, None)?;
+    tracing::info!("present-trigger: swapchain created on the virtual display; presenting");
+
+    let mut frame = 0u32;
+    loop {
+        let mut msg = MSG::default(); // drain queued window messages (PeekMessageW doesn't block)
+        while PeekMessageW(&mut msg, None, 0, 0, PM_REMOVE).as_bool() {
+            let _ = DispatchMessageW(&msg);
+        }
+        let back: ID3D11Texture2D = swapchain.GetBuffer(0)?; // buffer 0, re-fetched every frame
+        let mut rtv: Option<ID3D11RenderTargetView> = None;
+        device.CreateRenderTargetView(&back, None, Some(&mut rtv))?;
+        let rtv = rtv.context("present-trigger rtv")?;
+        let c = (frame % 120) as f32 / 120.0; // red channel ramps 0..1 over a 120-frame cycle
+        context.ClearRenderTargetView(&rtv, &[c, 0.1, 0.2, 1.0]);
+        let _ = swapchain.Present(1, DXGI_PRESENT(0)); // sync interval 1; result is ignored
+        frame = frame.wrapping_add(1);
+    }
+}