diff --git a/crates/punktfunk-core/src/quic.rs b/crates/punktfunk-core/src/quic.rs index aff6fba..51407ab 100644 --- a/crates/punktfunk-core/src/quic.rs +++ b/crates/punktfunk-core/src/quic.rs @@ -1454,11 +1454,16 @@ pub mod endpoint { /// close, while a genuinely dead peer is still detected within `MAX_IDLE`. fn stream_transport() -> Arc { use std::time::Duration; - const MAX_IDLE: Duration = Duration::from_secs(20); + // 8s idle (was 20s): a vanished client is declared dead within 8s instead of 20, so its + // session tears down promptly — which the Windows IDD-push path needs so a RECONNECT recreates + // a fresh virtual monitor (a reused monitor's IddCx swap-chain dies) instead of joining the + // still-lingering old session. Active sessions are unaffected: video keeps the connection live, + // and the 4s keep-alive holds it open through quiet control periods. + const MAX_IDLE: Duration = Duration::from_secs(8); const KEEP_ALIVE: Duration = Duration::from_secs(4); let mut t = quinn::TransportConfig::default(); t.max_idle_timeout(Some( - quinn::IdleTimeout::try_from(MAX_IDLE).expect("20s is a valid QUIC idle timeout"), + quinn::IdleTimeout::try_from(MAX_IDLE).expect("8s is a valid QUIC idle timeout"), )); t.keep_alive_interval(Some(KEEP_ALIVE)); Arc::new(t) diff --git a/crates/punktfunk-host/src/capture.rs b/crates/punktfunk-host/src/capture.rs index 5b83fc2..4d200e5 100644 --- a/crates/punktfunk-host/src/capture.rs +++ b/crates/punktfunk-host/src/capture.rs @@ -142,6 +142,16 @@ pub trait Capturer: Send { fn hdr_meta(&self) -> Option { None } + + /// How many frames the encode loop may keep in flight (submitted but not yet polled) before it + /// blocks. `1` (the default) is the synchronous loop: capture → submit → poll-blocks, so the + /// per-frame wall time is `capture+convert + encode`. 
A capturer that hands a fresh output texture + /// per frame (so the encode of N reads a different texture than the convert of N+1 writes) can return + /// `>1` to PIPELINE: the loop submits N+1 before polling N, overlapping the convert/copy on the 3D + /// engine with the NVENC-ASIC encode of the prior frame, dropping per-frame wall toward `max(...)`. + fn pipeline_depth(&self) -> usize { + 1 + } } /// A deterministic moving test pattern (BGRx). Lets the spike exercise the encode → file → @@ -302,7 +312,11 @@ pub fn open_portal_monitor() -> Result> { /// [`crate::vdisplay::VirtualDisplay`] backend. The captured size is the size the output was /// created at — native, no scaling. #[cfg(target_os = "linux")] -pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result> { +pub fn capture_virtual_output( + vout: crate::vdisplay::VirtualOutput, + _want_hdr: bool, +) -> Result> { + // The Linux host stays 8-bit (HDR is blocked upstream), so `want_hdr` is unused here. linux::PortalCapturer::from_virtual_output(vout).map(|c| Box::new(c) as Box) } @@ -317,7 +331,10 @@ pub(crate) fn wgc_disabled() -> bool { } #[cfg(target_os = "windows")] -pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result> { +pub fn capture_virtual_output( + vout: crate::vdisplay::VirtualOutput, + want_hdr: bool, +) -> Result> { let target = vout.win_capture.clone().ok_or_else(|| { anyhow::anyhow!( "SudoVDA target not yet an active display (needs a WDDM GPU to activate it)" @@ -325,6 +342,18 @@ pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result); + } // WGC (Windows.Graphics.Capture) is the default: it captures the COMPOSED desktop including the // overlay/independent-flip planes DXGI Desktop Duplication misses (the frozen-HDR-animation bug), // and has no ACCESS_LOST-on-overlay churn. 
DDA stays available via PUNKTFUNK_CAPTURE=dda and is @@ -376,7 +405,10 @@ pub fn capture_virtual_output(vout: crate::vdisplay::VirtualOutput) -> Result Result> { +pub fn capture_virtual_output( + _vout: crate::vdisplay::VirtualOutput, + _want_hdr: bool, +) -> Result> { anyhow::bail!("virtual-output capture requires Linux or Windows") } @@ -386,6 +418,8 @@ pub mod composed_flip; pub mod desktop_watch; #[cfg(target_os = "windows")] pub mod dxgi; +#[cfg(target_os = "windows")] +pub mod idd_push; #[cfg(target_os = "linux")] mod linux; #[cfg(target_os = "windows")] diff --git a/crates/punktfunk-host/src/capture/dxgi.rs b/crates/punktfunk-host/src/capture/dxgi.rs index 4a3cd65..0fde7ca 100644 --- a/crates/punktfunk-host/src/capture/dxgi.rs +++ b/crates/punktfunk-host/src/capture/dxgi.rs @@ -202,6 +202,87 @@ pub(crate) unsafe fn make_device( Ok((device, context)) } +/// Resolve the configured GPU scheduling-priority class from `PUNKTFUNK_GPU_PRIORITY_CLASS` +/// (`off|normal|high|realtime`, default high). `None` = leave it at the OS default (the `off` opt-out). +/// D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4, REALTIME 5. +fn configured_gpu_priority_class() -> Option { + match std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS") + .ok() + .as_deref() + { + Some("off") => None, + Some("normal") => Some(2), + Some("realtime") => Some(5), + _ => Some(4), // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC) + } +} + +/// Enable SE_INC_BASE_PRIORITY on the CURRENT process token (best-effort) — the kernel gates the +/// HIGH/REALTIME GPU scheduling-priority bump on it. Held by SYSTEM/Administrators; a UAC-FILTERED +/// token (what `CreateProcessAsUserW` hands the WGC helper) does NOT have it, which is why the helper +/// can't elevate itself and the SYSTEM host stamps the class onto it cross-process instead (see +/// [`set_child_gpu_priority_class`]). 
+unsafe fn enable_inc_base_priority() {
+    use windows::core::PCWSTR;
+    use windows::Win32::Foundation::{
+        CloseHandle, GetLastError, ERROR_NOT_ALL_ASSIGNED, HANDLE, LUID,
+    };
+    use windows::Win32::Security::{
+        AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES,
+        SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES, TOKEN_PRIVILEGES,
+        TOKEN_QUERY,
+    };
+    use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};
+
+    let mut token = HANDLE::default();
+    if OpenProcessToken(
+        GetCurrentProcess(),
+        TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
+        &mut token,
+    )
+    .is_err()
+    {
+        // Best-effort: without a token handle there is nothing to adjust and nothing to close.
+        return;
+    }
+
+    let mut luid = LUID::default();
+    if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() {
+        let tp = TOKEN_PRIVILEGES {
+            PrivilegeCount: 1,
+            Privileges: [LUID_AND_ATTRIBUTES {
+                Luid: luid,
+                Attributes: SE_PRIVILEGE_ENABLED,
+            }],
+        };
+        let adjusted = AdjustTokenPrivileges(
+            token,
+            false,
+            Some(&tp as *const TOKEN_PRIVILEGES),
+            0,
+            None,
+            None,
+        );
+        // AdjustTokenPrivileges reports SUCCESS even when the token does not hold the requested
+        // privilege; that case is only visible as last-error ERROR_NOT_ALL_ASSIGNED. Read it
+        // immediately, before any other call can overwrite the thread's last-error value.
+        let last_error = GetLastError();
+        if adjusted.is_err() {
+            tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority");
+        } else if last_error == ERROR_NOT_ALL_ASSIGNED {
+            // The filtered-token case: the call "succeeded" but nothing was enabled.
+            tracing::warn!(
+                "SE_INC_BASE_PRIORITY is not held by this process token (not all privileges \
+                 assigned); a HIGH/REALTIME GPU priority request will likely be refused"
+            );
+        }
+    }
+    // The token handle was opened above, so it is always closed on this path.
+    let _ = CloseHandle(token);
+}
+
+/// Call `gdi32!D3DKMTSetProcessSchedulingPriorityClass(process, prio)` (no stable windows-rs binding —
+/// loaded by name). Returns the NTSTATUS (0 = success) or `None` if the export can't be resolved. The
+/// CALLING process must hold SE_INC_BASE_PRIORITY ([`enable_inc_base_priority`]) for HIGH/REALTIME; the
+/// kernel checks the caller's privilege whether the target is self or a child we created.
+unsafe fn d3dkmt_set_scheduling_priority_class( + process: windows::Win32::Foundation::HANDLE, + prio: i32, +) -> Option { + use windows::core::s; + use windows::Win32::Foundation::HANDLE; + use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA}; + let gdi32 = LoadLibraryA(s!("gdi32.dll")).ok()?; + let p = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass"))?; + type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32; + let f: SetPrio = std::mem::transmute(p); + Some(f(process, prio)) +} + /// Apollo-style GPU scheduling-priority hardening (Sunshine `display_base.cpp:599-709`). On a /// GPU-saturated game our capture+encode process is starved of GPU time slices — NVENC sits ~idle but /// `lock_bitstream` waits ~20 ms for our context to be scheduled. Elevating the PROCESS GPU scheduling @@ -209,89 +290,64 @@ pub(crate) unsafe fn make_device( /// alone, which we measured as no help) lets our brief encode preempt the game. Uses HIGH, NOT /// realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC (Apollo downgrades it for exactly this). /// Runs once per process; best-effort. `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime` -/// (default high). +/// (default high). NOTE: in the SYSTEM-host + user-session-helper deployment this self-set NO-OPs in +/// the helper (filtered token), so the host also sets it on the helper via [`set_child_gpu_priority_class`]. 
fn elevate_process_gpu_priority() { use std::sync::Once; static ONCE: Once = Once::new(); ONCE.call_once(|| unsafe { - use windows::core::{s, PCWSTR}; - use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID}; - use windows::Win32::Security::{ - AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES, - SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES, - TOKEN_PRIVILEGES, TOKEN_QUERY, + use windows::Win32::System::Threading::GetCurrentProcess; + let Some(prio) = configured_gpu_priority_class() else { + tracing::info!("GPU process scheduling priority class left at default (off)"); + return; }; - use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA}; - use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken}; - - // D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4, - // REALTIME 5. - let prio: i32 = match std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS").ok().as_deref() { - Some("off") => { - tracing::info!("GPU process scheduling priority class left at default (off)"); - return; - } - Some("normal") => 2, - Some("realtime") => 5, - _ => 4, // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC) - }; - - // 1. Enable SE_INC_BASE_PRIORITY so the kernel permits the GPU priority bump. 
- let mut token = HANDLE::default(); - if OpenProcessToken( - GetCurrentProcess(), - TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, - &mut token, - ) - .is_ok() - { - let mut luid = LUID::default(); - if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() { - let tp = TOKEN_PRIVILEGES { - PrivilegeCount: 1, - Privileges: [LUID_AND_ATTRIBUTES { - Luid: luid, - Attributes: SE_PRIVILEGE_ENABLED, - }], - }; - if AdjustTokenPrivileges( - token, - false, - Some(&tp as *const TOKEN_PRIVILEGES), - 0, - None, - None, - ) - .is_err() - { - tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority"); - } - } - let _ = CloseHandle(token); - } - - // 2. D3DKMTSetProcessSchedulingPriorityClass via gdi32 (no stable windows-rs binding). - if let Ok(gdi32) = LoadLibraryA(s!("gdi32.dll")) { - if let Some(p) = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass")) { - type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32; - let f: SetPrio = std::mem::transmute(p); - let st = f(GetCurrentProcess(), prio); - if st == 0 { - tracing::info!( - priority_class = prio, - "GPU process scheduling priority class set (2=normal 4=high 5=realtime)" - ); - } else { - tracing::warn!( - status = format!("0x{st:08X}"), - "D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)" - ); - } - } + enable_inc_base_priority(); + match d3dkmt_set_scheduling_priority_class(GetCurrentProcess(), prio) { + Some(0) => tracing::info!( + priority_class = prio, + "GPU process scheduling priority class set (2=normal 4=high 5=realtime)" + ), + Some(st) => tracing::warn!( + status = format!("0x{st:08X}"), + "D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)" + ), + None => tracing::warn!("D3DKMTSetProcessSchedulingPriorityClass export not found"), } }); } +/// Set the GPU scheduling-priority class of ANOTHER process we created — the WGC capture+encode helper +/// in the interactive user session. 
The helper is spawned with the user's UAC-FILTERED token, which +/// lacks SE_INC_BASE_PRIORITY, so its own [`elevate_process_gpu_priority`] silently no-ops and NVENC +/// gets starved under a GPU-saturating game (the "240→40 fps in-game collapse"). The SYSTEM host DOES +/// hold the privilege, so it stamps the class onto the child's process handle right after spawn — the +/// process-level class applies to GPU contexts the child creates afterwards. Best-effort; logged. +/// `PUNKTFUNK_GPU_PRIORITY_CLASS=off` disables it (same knob as the self path). +/// +/// # Safety +/// `process` must be a valid handle to a process we own with at least PROCESS_SET_INFORMATION access +/// (the just-created helper, `PROCESS_INFORMATION::hProcess`). +pub(crate) unsafe fn set_child_gpu_priority_class(process: windows::Win32::Foundation::HANDLE) { + let Some(prio) = configured_gpu_priority_class() else { + return; + }; + enable_inc_base_priority(); // the SYSTEM host holds SE_INC_BASE_PRIORITY; the helper does not + match d3dkmt_set_scheduling_priority_class(process, prio) { + Some(0) => tracing::info!( + priority_class = prio, + "WGC helper GPU scheduling priority class set cross-process from the SYSTEM host \ + (2=normal 4=high 5=realtime)" + ), + Some(st) => tracing::warn!( + status = format!("0x{st:08X}"), + "cross-process D3DKMTSetProcessSchedulingPriorityClass on the WGC helper failed" + ), + None => tracing::warn!( + "D3DKMTSetProcessSchedulingPriorityClass export not found — WGC helper has no GPU priority" + ), + } +} + /// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST /// recovery to rebuild the whole capture on the current (possibly secure) input desktop. unsafe fn reopen_duplication( diff --git a/crates/punktfunk-host/src/capture/idd_push.rs b/crates/punktfunk-host/src/capture/idd_push.rs new file mode 100644 index 0000000..129de1d --- /dev/null +++ b/crates/punktfunk-host/src/capture/idd_push.rs @@ -0,0 +1,922 @@ +//! 
P2 direct frame push (kill DDA) — HOST side. The pf-vdisplay driver runs in a restricted WUDFHost +//! token that canNOT create named kernel objects, so — exactly like the gamepad UMDF drivers +//! (`inject/dualsense_windows.rs`) — the HOST (privileged) CREATES the shared header + frame-ready +//! event + ring of keyed-mutex textures (`Global\` names, permissive `D:(A;;GA;;;WD)` SDDL) on the +//! discrete render GPU, and the driver only OPENS them and copies frames in. We then consume the ring +//! straight into the zero-copy NVENC path — no DXGI Desktop Duplication, no `win32u` hook. Gated by +//! `PUNKTFUNK_IDD_PUSH`. Driver counterpart: `packaging/windows/vdisplay-driver/pf-vdisplay/src/ +//! frame_transport.rs` — [`SharedHeader`], [`MAGIC`], [`RING_LEN`], the status codes and the `Global\` +//! name scheme are DUPLICATED byte-identically there. + +use super::dxgi::{make_device, D3d11Frame, HdrConverter, WinCaptureTarget}; +use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; +use anyhow::{bail, Context, Result}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::Mutex; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; +use windows::core::{w, Interface, HSTRING}; +use windows::Win32::Foundation::{CloseHandle, HANDLE, INVALID_HANDLE_VALUE, LUID}; +use windows::Win32::Graphics::Direct3D11::{ + ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView, + ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, + D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX, D3D11_RESOURCE_MISC_SHARED_NTHANDLE, + D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT, +}; +use windows::Win32::Graphics::Dxgi::Common::{ + DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, + DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, +}; +use windows::Win32::Graphics::Dxgi::{ + CreateDXGIFactory1, IDXGIAdapter1, IDXGIFactory4, IDXGIKeyedMutex, IDXGIResource1, +}; +use windows::Win32::Security::Authorization::{ 
+ ConvertStringSecurityDescriptorToSecurityDescriptorW, SDDL_REVISION_1, +}; +use windows::Win32::Security::{PSECURITY_DESCRIPTOR, SECURITY_ATTRIBUTES}; +use windows::Win32::System::Memory::{ + CreateFileMappingW, MapViewOfFile, UnmapViewOfFile, FILE_MAP_ALL_ACCESS, + MEMORY_MAPPED_VIEW_ADDRESS, PAGE_READWRITE, +}; +use windows::Win32::System::Threading::{CreateEventW, WaitForSingleObject}; + +// --- kept byte-identical with the driver (frame_transport.rs) --- +pub const MAGIC: u32 = 0x4456_4650; +pub const VERSION: u32 = 1; +/// Ring slots — MUST equal the driver's `RING_LEN` (frame_transport.rs). 6 (was 3) gives ample headroom +/// so the driver's 0 ms-timeout publish always finds a free slot while the host briefly holds one across +/// the convert/copy into its output ring and the depth-2 pipelined encode runs on the rest. +pub const RING_LEN: u32 = 6; +const DXGI_SHARED_RESOURCE_RW: u32 = 0x8000_0000 | 0x1; + +// driver_status codes (the driver writes these; we read+log them). +const DRV_STATUS_OPENED: u32 = 1; +const DRV_STATUS_TEX_FAIL: u32 = 2; +const DRV_STATUS_NO_DEVICE1: u32 = 3; + +/// Host-owned output-ring depth: distinct NVENC-input textures rotated per frame so the in-flight +/// encode of frame N and the convert/copy of frame N+1 never touch the same texture. 3 covers a +/// pipeline depth of 2 with one slot of margin. +const OUT_RING: usize = 3; + +#[repr(C)] +struct SharedHeader { + magic: u32, + version: u32, + generation: u32, + ring_len: u32, + width: u32, + height: u32, + dxgi_format: u32, + _pad: u32, + latest: u64, + qpc_pts: u64, + driver_render_luid_low: u32, + driver_render_luid_high: i32, + driver_status: u32, + driver_status_detail: u32, +} + +/// Bring-up debug block (fixed name) — the host creates it; the driver writes diagnostics into it +/// independent of the per-target header. Byte-identical with the driver's `DebugBlock`. 
+#[repr(C)]
+struct DebugBlock {
+    /// Set to [`DBG_MAGIC`] by the host right after the block is zeroed.
+    magic: u32,
+    /// Driver-written counter; the host's diagnostics log treats 0 as "swap-chain processor never ran".
+    run_core_entries: u32,
+    /// Target id the driver resolved; a value different from ours indicates a name mismatch.
+    resolved_target_id: u32,
+    /// How many times the driver tried to open the shared header.
+    header_open_attempts: u32,
+    /// Last error from the driver's header open (logged as hex; 0x80070002 = header not found).
+    last_open_error: u32,
+    /// Driver-written header-opened indicator (logged as-is by the host).
+    header_opened: u32,
+    /// Low / high parts of the adapter LUID the driver reports rendering on.
+    render_luid_low: u32,
+    render_luid_high: i32,
+    /// Driver-written frame counter; 0 is logged as "idle display".
+    frames_acquired: u32,
+    /// Explicit padding, kept so the layout stays identical to the driver's definition.
+    _pad: u32,
+}
+/// Fixed (not per-target) name of the debug-block file mapping.
+const DBG_NAME: &str = "Global\\pfvd-dbg";
+/// Debug-block magic; the little-endian bytes spell ASCII "PDBG".
+const DBG_MAGIC: u32 = 0x4742_4450;
+
+/// Name of the per-target shared-header file mapping.
+fn hdr_name(target_id: u32) -> String {
+    format!("Global\\pfvd-hdr-{target_id}")
+}
+/// Name of the per-target frame-ready event.
+fn evt_name(target_id: u32) -> String {
+    format!("Global\\pfvd-evt-{target_id}")
+}
+/// Name of one shared ring texture: target id, ring generation, then slot index.
+fn tex_name(target_id: u32, generation: u32, slot: u32) -> String {
+    format!("Global\\pfvd-tex-{target_id}-{generation}-{slot}")
+}
+// ----------------------------------------------------------------
+
+/// Monotonic per-process generation: each capturer instance stamps its ring-texture names with a
+/// fresh value so a retried/overlapping `open()` never collides with a previous attempt's not-yet-
+/// released shared-handle names (`DXGI_ERROR_NAME_ALREADY_EXISTS`). The driver reads it from the header.
+static IDD_GENERATION: AtomicU32 = AtomicU32::new(1);
+
+/// Wall-clock time in nanoseconds since the Unix epoch; returns 0 if the system clock reads earlier
+/// than the epoch. The `as u64` cast truncates the `u128` nanosecond count.
+fn now_ns() -> u64 {
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_nanos() as u64)
+        .unwrap_or(0)
+}
+
+/// One host-created ring slot: the shared texture, its keyed mutex, and the named shared handle.
+struct HostSlot {
+    tex: ID3D11Texture2D,
+    mutex: IDXGIKeyedMutex,
+    /// Handle returned by `CreateSharedHandle`; closed in `Drop`.
+    shared: HANDLE,
+    /// SRV on the slot texture so the HDR path samples the FP16 slot DIRECTLY (no slot→scratch copy);
+    /// the convert pass writes the output ring while holding the slot's keyed mutex. Unused for SDR
+    /// (which CopyResource's the BGRA slot straight to the output).
+    srv: ID3D11ShaderResourceView,
+}
+
+impl Drop for HostSlot {
+    /// Closes the shared handle; a failure from `CloseHandle` is deliberately ignored.
+    fn drop(&mut self) {
+        unsafe {
+            let _ = CloseHandle(self.shared);
+        }
+    }
+}
+
+/// Creates + owns the shared ring; yields the driver's frames as [`FramePayload::D3d11`].
+pub struct IddPushCapturer {
+    /// D3D11 device + immediate context created in `open` on the render adapter.
+    device: ID3D11Device,
+    context: ID3D11DeviceContext,
+    /// Target id of the virtual display; part of every shared-object name.
+    target_id: u32,
+    /// File mapping backing the shared header, and the mapped view of it.
+    map: HANDLE,
+    header: *mut SharedHeader,
+    /// Frame-ready event (auto-reset), created by the host.
+    event: HANDLE,
+    /// Debug-block mapping + view. Best-effort: `open` leaves these as a default handle and a null
+    /// pointer when the mapping could not be created.
+    dbg_map: HANDLE,
+    dbg_block: *mut DebugBlock,
+    /// Negotiated mode the ring was sized for.
+    width: u32,
+    height: u32,
+    /// The ring of shared keyed-mutex textures the driver copies frames into.
+    slots: Vec,
+    /// The ring/texture generation, bumped every time the ring is recreated at a new format (the
+    /// display's HDR mode flipped). Stamped into the texture names + the header so the driver re-attaches.
+    generation: u32,
+    /// The CLIENT's advertised 10-bit capability (= negotiated `bit_depth >= 10`). Only used at `open`
+    /// to PROACTIVELY enable advanced color (so a 10-bit client gets HDR without a manual toggle); it
+    /// does NOT gate the per-frame conversion — that follows the display, like the WGC path (clients
+    /// under-report 10-bit yet all decode Main10 + auto-detect PQ from the VUI).
+    client_10bit: bool,
+    /// The DISPLAY's CURRENT HDR state (from `advanced_color_enabled`) — the user can flip "Use HDR" in
+    /// Windows mid-session. Drives the ring format (HDR → FP16 surfaces, SDR → BGRA) and the conversion.
+    /// Polled in the capture loop; a change recreates the ring (see [`Self::recreate_ring`]).
+    display_hdr: bool,
+    /// Throttle for the `advanced_color_enabled` poll (a CCD `QueryDisplayConfig`, ~ms — too costly per
+    /// frame at 240 Hz).
+    last_acm_poll: Instant,
+    /// Host-owned ROTATING output ring NVENC encodes (texture + RTV per slot). Rotating it per frame is
+    /// the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
+    /// ASIC, frame N+1's convert/copy writes a DIFFERENT texture on the 3D engine — the two overlap. The
+    /// HDR convert and the SDR copy both write into the current slot. Format = `out_format()` (Rgb10a2 in
+    /// HDR, Bgra in SDR); rebuilt on a display-mode flip. Built lazily.
+    out_ring: Vec<(ID3D11Texture2D, ID3D11RenderTargetView)>,
+    /// Index into `out_ring` (presumably the slot used for the current frame — confirm at the use site).
+    out_idx: usize,
+    /// FP16 scRGB → `Rgb10a2` BT.2020 PQ converter, used while the display is HDR. Built lazily.
+    hdr_conv: Option,
+    /// Starts at 0 in `open`; presumably the last driver sequence number consumed — confirm in
+    /// `next_frame`.
+    last_seq: u64,
+    /// Starts as `None`; presumably the most recently presented output texture and its format.
+    last_present: Option<(ID3D11Texture2D, PixelFormat)>,
+    /// Set once the driver's status has been logged, so it is reported only one time.
+    status_logged: bool,
+    /// The monitor generation this capturer was opened for. When the active monitor gen changes (a
+    /// reconnect preempted + recreated the monitor), `next_frame` bails immediately so this session
+    /// releases its NVENC encoder instead of lingering on the dead ring's 20s deadline.
+    my_gen: u64,
+    /// Opaque value handed to `open` and held only so it lives as long as the capturer.
+    _keepalive: Box,
+}
+// COM objects used only from the owning (encode) thread.
+// NOTE(review): the capturer is also stored in the process-wide `IDD_PERSIST` mutex and reached from
+// whichever session thread calls in, so "owning thread" really means "whoever holds that lock".
+// Confirm that every contained COM object and both raw view pointers tolerate that.
+unsafe impl Send for IddPushCapturer {}
+
+/// The persistent IDD-push capturer, kept alive for the host lifetime and SHARED across client
+/// sessions. The driver's per-session monitor TEARDOWN→RECREATE path is unstable (on session 2 the
+/// target-id resolves to 0, `IddCxSwapChainSetDevice` fails `0x80070057`, then an access violation),
+/// while the FIRST-session path is solid. So we create the monitor + ring + swap-chain ONCE and hand
+/// every later session a thin handle delegating to this one. The persistent capturer holds a monitor
+/// lease for the host lifetime, so `VirtualDisplay::create` always JOINs the same live monitor (same
+/// target id) and the reuse match always hits — no recreate, no driver crash. Prototype scope:
+/// single-client, single-mode (a different mode would need a recreate, the unstable path).
+static IDD_PERSIST: Mutex> = Mutex::new(None);
+
+/// Open the IDD-push capturer, reusing the persistent one across sessions (see [`IDD_PERSIST`]).
+pub fn open_or_reuse(
+    target: WinCaptureTarget,
+    preferred: Option<(u32, u32, u32)>,
+    client_10bit: bool,
+    keepalive: Box,
+) -> Result> {
+    let (want_w, want_h, _) =
+        preferred.context("IDD push needs the negotiated mode (WxH) to size the ring")?;
+    let mut persist = IDD_PERSIST.lock().unwrap();
+    if let Some(existing) = persist.as_mut() {
+        let same_target_and_mode = existing.target_id == target.target_id
+            && existing.width == want_w
+            && existing.height == want_h;
+        if !same_target_and_mode {
+            bail!(
+                "IDD-push persistent capturer is {}x{} target {}, this session wants {}x{} target {} — a \
+                 mode/target change needs a recreate (the driver's recreate path is unstable); not \
+                 supported in the persistent prototype",
+                existing.width,
+                existing.height,
+                existing.target_id,
+                want_w,
+                want_h,
+                target.target_id
+            );
+        }
+        // Reuse path: the persistent capturer already owns the monitor, the ring and the driver
+        // attach, so this session's own lease is released right away. The ring follows the
+        // display; the only per-session input that can differ is the client's 10-bit capability.
+        drop(keepalive);
+        existing.set_client_10bit(client_10bit);
+        tracing::info!(
+            target_id = target.target_id,
+            client_10bit,
+            "IDD push: reusing the persistent capturer (no monitor/ring recreate)"
+        );
+    } else {
+        tracing::info!(
+            target_id = target.target_id,
+            client_10bit,
+            "IDD push: creating the persistent capturer (first session)"
+        );
+        *persist = Some(IddPushCapturer::open(target, preferred, client_10bit, keepalive)?);
+    }
+    Ok(Box::new(IddReuseHandle))
+}
+
+/// Per-session stand-in for the one persistent [`IddPushCapturer`]: it carries no state and forwards
+/// every call through [`IDD_PERSIST`]. Dropping it at session end leaves the ring and monitor alive.
+struct IddReuseHandle;
+impl Capturer for IddReuseHandle {
+    fn next_frame(&mut self) -> Result {
+        // The lock stays held for the whole delegated call, exactly as a chained expression would.
+        let mut persist = IDD_PERSIST.lock().unwrap();
+        let capturer = persist
+            .as_mut()
+            .context("IDD-push persistent capturer missing")?;
+        capturer.next_frame()
+    }
+    fn try_latest(&mut self) -> Result> {
+        let mut persist = IDD_PERSIST.lock().unwrap();
+        let capturer = persist
+            .as_mut()
+            .context("IDD-push persistent capturer missing")?;
+        capturer.try_latest()
+    }
+    fn set_active(&self, active: bool) {
+        let persist = IDD_PERSIST.lock().unwrap();
+        if let Some(capturer) = persist.as_ref() {
+            capturer.set_active(active);
+        }
+    }
+    fn hdr_meta(&self) -> Option {
+        let persist = IDD_PERSIST.lock().unwrap();
+        match persist.as_ref() {
+            Some(capturer) => capturer.hdr_meta(),
+            None => None,
+        }
+    }
+}
+
+/// Build `SECURITY_ATTRIBUTES` from the permissive `D:(A;;GA;;;WD)` SDDL (Everyone: GenericAll) so the
+/// restricted WUDFHost driver can OPEN objects the host creates — the same SDDL the gamepad shared
+/// section uses. The returned descriptor is the backing storage `sa` points into and must outlive it;
+/// nothing here frees it, so it stays allocated until the process exits.
+unsafe fn permissive_sa() -> Result<(SECURITY_ATTRIBUTES, PSECURITY_DESCRIPTOR)> {
+    let mut descriptor = PSECURITY_DESCRIPTOR::default();
+    ConvertStringSecurityDescriptorToSecurityDescriptorW(
+        w!("D:(A;;GA;;;WD)"),
+        SDDL_REVISION_1,
+        &mut descriptor,
+        None,
+    )
+    .context("build SDDL for IDD-push shared objects")?;
+    Ok((
+        SECURITY_ATTRIBUTES {
+            nLength: std::mem::size_of::() as u32,
+            lpSecurityDescriptor: descriptor.0,
+            bInheritHandle: false.into(),
+        },
+        descriptor,
+    ))
+}
+
+impl IddPushCapturer {
+    /// Create the `RING_LEN` shared keyed-mutex textures for one ring generation, at `format` (matched
+    /// to the display's composition format — FP16 in HDR, BGRA in SDR). Each is shared by the name
+    /// `pfvd-tex-{target_id}-{generation}-{slot}` so the driver opens it; a fresh generation gives
+    /// fresh names (so a recreate never collides with the old ring's not-yet-released handles).
+    unsafe fn create_ring_slots(
+        device: &ID3D11Device,
+        target_id: u32,
+        generation: u32,
+        w: u32,
+        h: u32,
+        format: DXGI_FORMAT,
+    ) -> Result> {
+        let (sa, _psd) = permissive_sa()?;
+        let mut slots = Vec::new();
+        for k in 0..RING_LEN {
+            let desc = D3D11_TEXTURE2D_DESC {
+                Width: w,
+                Height: h,
+                MipLevels: 1,
+                ArraySize: 1,
+                // Match the OS-composed swap-chain surfaces so the driver's CopyResource into the slot +
+                // its format-guard both succeed.
+                Format: format,
+                SampleDesc: DXGI_SAMPLE_DESC {
+                    Count: 1,
+                    Quality: 0,
+                },
+                Usage: D3D11_USAGE_DEFAULT,
+                BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
+                CPUAccessFlags: 0,
+                MiscFlags: (D3D11_RESOURCE_MISC_SHARED_NTHANDLE.0
+                    | D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX.0) as u32,
+            };
+            let mut tex: Option = None;
+            device
+                .CreateTexture2D(&desc, None, Some(&mut tex))
+                .context("CreateTexture2D(IDD-push ring slot)")?;
+            let tex = tex.context("null ring texture")?;
+            // Do every fallible step that yields a self-releasing COM object FIRST, and create the
+            // raw shared NT handle LAST. The handle is only closed by `HostSlot::drop`, so an early
+            // `?` return between its creation and the `push` below would leak it.
+            let mutex: IDXGIKeyedMutex = tex.cast()?;
+            let mut srv: Option = None;
+            device
+                .CreateShaderResourceView(&tex, None, Some(&mut srv))
+                .context("CreateShaderResourceView(IDD-push ring slot)")?;
+            let srv = srv.context("null slot srv")?;
+            let res1: IDXGIResource1 = tex.cast()?;
+            let shared = res1
+                .CreateSharedHandle(
+                    Some(&sa as *const SECURITY_ATTRIBUTES),
+                    DXGI_SHARED_RESOURCE_RW,
+                    &HSTRING::from(tex_name(target_id, generation, k)),
+                )
+                .context("CreateSharedHandle(IDD-push ring slot)")?;
+            slots.push(HostSlot {
+                tex,
+                mutex,
+                shared,
+                srv,
+            });
+        }
+        Ok(slots)
+    }
+
+    /// Create the host side of the transport for `target`: the D3D11 device, the named shared header,
+    /// the frame-ready event, the ring of shared textures and the best-effort debug block; then
+    /// publish the header magic so the driver can attach.
+    ///
+    /// `preferred` is the negotiated mode and must be `Some` (its width/height size the ring).
+    /// `client_10bit` only decides whether advanced color is proactively enabled. `keepalive` is
+    /// stored so it lives as long as the capturer.
+    ///
+    /// # Errors
+    /// Fails if `preferred` is `None` or if any device/mapping/event/texture creation fails. Every
+    /// kernel object created before the failing step is released again before returning.
+    pub fn open(
+        target: WinCaptureTarget,
+        preferred: Option<(u32, u32, u32)>,
+        client_10bit: bool,
+        keepalive: Box,
+    ) -> Result {
+        let (w, h, _hz) = preferred
+            .context("IDD push needs the negotiated mode (WxH) to size the shared ring")?;
+        // The driver composes the virtual display in FP16 (R16G16B16A16_FLOAT scRGB) when the display is
+        // in advanced-color (HDR) mode, and 8-bit BGRA otherwise (per swap_chain_processor.rs + the
+        // COMMIT_MODES2 colorspace/rgb_bpc log). The user can flip "Use HDR" in Windows at any time, so
+        // the ring format must TRACK the display's ACTUAL mode (the driver's format-guard drops a
+        // mismatch). We poll the live state here and on every recreate. For a 10-bit-capable client we
+        // PROACTIVELY enable advanced color so HDR streams without the user toggling anything; an
+        // SDR-only client leaves the display alone (and still gets a tone-mapped picture, never a freeze,
+        // if the user does enable HDR).
+        unsafe {
+            if client_10bit && crate::vdisplay::sudovda::set_advanced_color(target.target_id, true)
+            {
+                // Let the colorspace change settle before the driver composes + we size the ring.
+                std::thread::sleep(Duration::from_millis(250));
+            }
+            let display_hdr = crate::vdisplay::sudovda::advanced_color_enabled(target.target_id);
+            let ring_fmt = if display_hdr {
+                DXGI_FORMAT_R16G16B16A16_FLOAT
+            } else {
+                DXGI_FORMAT_B8G8R8A8_UNORM
+            };
+            // Create our device on the discrete render GPU (where NVENC runs); the driver must render
+            // the swap-chain on the SAME adapter for the shared textures to open (it reports its actual
+            // render LUID into the header so we can detect a mismatch).
+            let luid = resolve_render_adapter_luid_or(target.adapter_luid);
+            let factory: IDXGIFactory4 = CreateDXGIFactory1().context("CreateDXGIFactory1")?;
+            let adapter: IDXGIAdapter1 = factory
+                .EnumAdapterByLuid(luid)
+                .context("EnumAdapterByLuid(render adapter) for IDD push")?;
+            let (device, context) = make_device(&adapter).context("make_device for IDD push")?;
+
+            let (sa, _psd) = permissive_sa()?;
+            let bytes = std::mem::size_of::().max(64);
+
+            // Header.
+            let map = CreateFileMappingW(
+                INVALID_HANDLE_VALUE,
+                Some(&sa),
+                PAGE_READWRITE,
+                0,
+                bytes as u32,
+                &HSTRING::from(hdr_name(target.target_id)),
+            )
+            .context("CreateFileMapping(IDD-push header)")?;
+            let view = MapViewOfFile(map, FILE_MAP_ALL_ACCESS, 0, 0, bytes);
+            if view.Value.is_null() {
+                let _ = CloseHandle(map);
+                bail!("MapViewOfFile failed for IDD-push header");
+            }
+            let generation = IDD_GENERATION.fetch_add(1, Ordering::Relaxed);
+            let header = view.Value.cast::();
+            std::ptr::write_bytes(header.cast::(), 0, bytes);
+            (*header).version = VERSION;
+            (*header).generation = generation;
+            (*header).ring_len = RING_LEN;
+            (*header).width = w;
+            (*header).height = h;
+            // Ring format = the display's composition format (FP16 in HDR, BGRA in SDR). The driver
+            // reads this into its `ring_format` and drops any surface that doesn't match.
+            (*header).dxgi_format = ring_fmt.0 as u32;
+
+            // Frame-ready event (auto-reset). From here on an early return must undo the header
+            // mapping by hand: no `Self` exists yet, so nothing else would release it.
+            let event = match CreateEventW(
+                Some(&sa),
+                false,
+                false,
+                &HSTRING::from(evt_name(target.target_id)),
+            ) {
+                Ok(event) => event,
+                Err(e) => {
+                    let _ = UnmapViewOfFile(view);
+                    let _ = CloseHandle(map);
+                    return Err(e).context("CreateEvent(IDD-push)");
+                }
+            };
+
+            // Ring of shared keyed-mutex textures, format matched to the display's current mode.
+            let slots = match Self::create_ring_slots(
+                &device,
+                target.target_id,
+                generation,
+                w,
+                h,
+                ring_fmt,
+            ) {
+                Ok(slots) => slots,
+                Err(e) => {
+                    // Release everything created so far (event, view, mapping) before failing.
+                    let _ = CloseHandle(event);
+                    let _ = UnmapViewOfFile(view);
+                    let _ = CloseHandle(map);
+                    return Err(e);
+                }
+            };
+
+            // Bring-up debug block (fixed name) — the driver writes diagnostics here. Best-effort.
+            let dbg_bytes = std::mem::size_of::();
+            let (dbg_map, dbg_block) = match CreateFileMappingW(
+                INVALID_HANDLE_VALUE,
+                Some(&sa),
+                PAGE_READWRITE,
+                0,
+                dbg_bytes as u32,
+                &HSTRING::from(DBG_NAME),
+            ) {
+                Ok(dm) => {
+                    let dv = MapViewOfFile(dm, FILE_MAP_ALL_ACCESS, 0, 0, dbg_bytes);
+                    if dv.Value.is_null() {
+                        let _ = CloseHandle(dm);
+                        (HANDLE::default(), std::ptr::null_mut())
+                    } else {
+                        let p = dv.Value.cast::();
+                        std::ptr::write_bytes(p.cast::(), 0, dbg_bytes);
+                        (*p).magic = DBG_MAGIC;
+                        (dm, p)
+                    }
+                }
+                Err(_) => (HANDLE::default(), std::ptr::null_mut()),
+            };
+
+            // Publish: magic LAST (Release) — signals the driver the ring is ready to open.
+            std::sync::atomic::fence(Ordering::Release);
+            (*(std::ptr::addr_of!((*header).magic) as *const AtomicU32))
+                .store(MAGIC, Ordering::Release);
+
+            tracing::info!(
+                target_id = target.target_id,
+                render_luid = format!("{:08x}:{:08x}", luid.HighPart, luid.LowPart),
+                mode = format!("{w}x{h}"),
+                display_hdr,
+                client_10bit,
+                ring_fp16 = display_hdr,
+                "IDD push(host): created shared ring; waiting for the driver to attach + publish"
+            );
+            Ok(Self {
+                device,
+                context,
+                target_id: target.target_id,
+                map,
+                header,
+                event,
+                dbg_map,
+                dbg_block,
+                width: w,
+                height: h,
+                slots,
+                generation,
+                client_10bit,
+                display_hdr,
+                last_acm_poll: Instant::now(),
+                out_ring: Vec::new(),
+                out_idx: 0,
+                hdr_conv: None,
+                last_seq: 0,
+                last_present: None,
+                status_logged: false,
+                my_gen: crate::vdisplay::sudovda::CURRENT_MON_GEN.load(Ordering::Relaxed),
+                _keepalive: keepalive,
+            })
+        }
+    }
+
+    /// Acquire-load the header's `latest` field, which the driver side updates.
+    #[inline]
+    fn latest(&self) -> u64 {
+        // `self.header` is the view mapped in `open`; the field is read through an atomic so the
+        // cross-process update is observed with Acquire ordering.
+        unsafe {
+            (*(std::ptr::addr_of!((*self.header).latest) as *const AtomicU64))
+                .load(Ordering::Acquire)
+        }
+    }
+
+    /// Log the driver's status once it first reports (the only driver-visibility channel we have).
+ fn log_driver_status_once(&mut self) { + if self.status_logged { + return; + } + let (status, detail, lo, hi) = unsafe { + ( + (*self.header).driver_status, + (*self.header).driver_status_detail, + (*self.header).driver_render_luid_low, + (*self.header).driver_render_luid_high, + ) + }; + if status == 0 { + return; + } + self.status_logged = true; + let render_luid = format!("{hi:08x}:{lo:08x}"); + match status { + DRV_STATUS_OPENED => tracing::info!( + render_luid, + "IDD push: driver attached to the shared ring" + ), + DRV_STATUS_TEX_FAIL => tracing::error!( + render_luid, + detail = format!("0x{detail:08x}"), + "IDD push: driver could NOT open our textures — render-adapter mismatch (it renders on \ + a different GPU than where we created the ring)" + ), + DRV_STATUS_NO_DEVICE1 => { + tracing::error!("IDD push: driver has no ID3D11Device1 to open shared resources") + } + other => tracing::warn!(other, render_luid, "IDD push: driver reported an unknown status"), + } + } + + /// Log the driver's bring-up diagnostics (the fixed-name debug block) — independent of the + /// per-target header, so it tells us whether the swap-chain processor ran, what target_id it + /// resolved, whether the header opened (+ error), and whether frames flowed. 
+ fn log_debug_block(&self) { + if self.dbg_block.is_null() { + tracing::warn!("IDD push DEBUG: no debug block"); + return; + } + let d = unsafe { &*self.dbg_block }; + tracing::error!( + run_core_entries = d.run_core_entries, + resolved_target_id = d.resolved_target_id, + header_open_attempts = d.header_open_attempts, + last_open_error = format!("0x{:08x}", d.last_open_error), + header_opened = d.header_opened, + driver_render_luid = format!("{:08x}:{:08x}", d.render_luid_high, d.render_luid_low), + frames_acquired = d.frames_acquired, + "IDD push DEBUG: driver-reported diagnostics (run_core_entries=0 ⇒ swap-chain processor \ + never ran; resolved_target_id≠ours ⇒ name mismatch; last_open_error 0x80070002 ⇒ header \ + not found; frames_acquired=0 ⇒ idle display)" + ); + } + + /// The output texture format + the [`PixelFormat`] it presents as, driven SOLELY by the DISPLAY's + /// HDR state (like the WGC path): HDR → `Rgb10a2` BT.2020 PQ → NVENC Main10, and the client + /// auto-detects PQ from the HEVC VUI; SDR → 8-bit `Bgra`. We do NOT gate HDR on the client's + /// advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit only when + /// its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path. + fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) { + if self.display_hdr { + (DXGI_FORMAT_R10G10B10A2_UNORM, PixelFormat::Rgb10a2) + } else { + (DXGI_FORMAT_B8G8R8A8_UNORM, PixelFormat::Bgra) + } + } + + /// The ring (shared-texture) format, matched to the display's composition format: FP16 when the + /// display is HDR, BGRA when SDR. + fn ring_format(&self) -> DXGI_FORMAT { + if self.display_hdr { + DXGI_FORMAT_R16G16B16A16_FLOAT + } else { + DXGI_FORMAT_B8G8R8A8_UNORM + } + } + + /// Update the client's 10-bit capability (the reuse path). Only affects whether a fresh `open` + /// proactively enables advanced color; the per-frame conversion follows the display, not the client. 
+ fn set_client_10bit(&mut self, client_10bit: bool) { + self.client_10bit = client_10bit; + } + + /// Recreate the ring at the format for `new_display_hdr` (the user flipped "Use HDR"). Bumps the + /// generation so the driver re-attaches ([`is_stale`]) to the new-format textures; clears the + /// header's `latest` so we don't consume a stale slot from the old ring; drops the conversion + /// textures so they rebuild at the new format. + fn recreate_ring(&mut self, new_display_hdr: bool) -> Result<()> { + self.display_hdr = new_display_hdr; + let fmt = self.ring_format(); + let new_gen = IDD_GENERATION.fetch_add(1, Ordering::Relaxed); + let new_slots = unsafe { + Self::create_ring_slots( + &self.device, + self.target_id, + new_gen, + self.width, + self.height, + fmt, + )? + }; + unsafe { + // Clear `latest` to the 0 sentinel (generation 0, which try_consume rejects). The real guard + // against consuming an unwritten new-ring slot is the generation tag in `latest`: a stale + // old-ring publish racing this recreate carries the OLD generation and is rejected. We wait + // for the driver's first NEW-generation publish. + (*(std::ptr::addr_of!((*self.header).latest) as *const AtomicU64)) + .store(0, Ordering::Relaxed); + (*self.header).dxgi_format = fmt.0 as u32; + // Publish the new generation LAST (Release): when the driver observes it (Acquire) the new + // textures already exist and the format is already updated. + std::sync::atomic::fence(Ordering::Release); + (*(std::ptr::addr_of!((*self.header).generation) as *const AtomicU32)) + .store(new_gen, Ordering::Release); + } + self.slots = new_slots; // drops the old slots → closes their shared handles + SRVs + self.generation = new_gen; + self.last_seq = 0; + self.out_ring.clear(); // the output format changed → rebuild lazily at the new format + self.out_idx = 0; + self.last_present = None; + Ok(()) + } + + /// Throttled poll of the display's live HDR state; recreate the ring if the user flipped "Use HDR". 
+ /// Called from the capture loop (incl. while frozen on a format mismatch) so a toggle recovers within + /// a poll interval. + fn poll_display_hdr(&mut self) { + if self.last_acm_poll.elapsed() < Duration::from_millis(250) { + return; + } + self.last_acm_poll = Instant::now(); + let now_hdr = unsafe { crate::vdisplay::sudovda::advanced_color_enabled(self.target_id) }; + if now_hdr == self.display_hdr { + return; + } + tracing::info!( + target_id = self.target_id, + display_hdr = now_hdr, + client_10bit = self.client_10bit, + "IDD push: display HDR mode flipped — recreating the ring at the new format" + ); + if let Err(e) = self.recreate_ring(now_hdr) { + tracing::warn!(error = %format!("{e:#}"), "IDD push: ring recreate failed"); + } + } + + /// Build the host-owned output ring (`OUT_RING` textures at [`Self::out_format`] + RTVs) if not yet + /// built. Rotated per frame so the in-flight encode of N and the convert/copy of N+1 touch different + /// textures. Rebuilt (cleared) when the display-mode flip changes the output format. 
+ fn ensure_out_ring(&mut self) -> Result<()> { + if !self.out_ring.is_empty() { + return Ok(()); + } + let (format, _) = self.out_format(); + let desc = D3D11_TEXTURE2D_DESC { + Width: self.width, + Height: self.height, + MipLevels: 1, + ArraySize: 1, + Format: format, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32, + CPUAccessFlags: 0, + MiscFlags: 0, + }; + for _ in 0..OUT_RING { + let mut t: Option = None; + let mut rtv: Option = None; + unsafe { + self.device + .CreateTexture2D(&desc, None, Some(&mut t)) + .context("CreateTexture2D(IDD out ring)")?; + let t = t.context("null out-ring texture")?; + self.device + .CreateRenderTargetView(&t, None, Some(&mut rtv)) + .context("CreateRenderTargetView(IDD out ring)")?; + self.out_ring.push((t, rtv.context("null out-ring rtv")?)); + } + } + Ok(()) + } + + /// Build the HDR converter if not already built (HDR-display path only — an SDR display is a copy). + fn ensure_converter(&mut self) -> Result<()> { + if self.hdr_conv.is_none() { + self.hdr_conv = Some(unsafe { HdrConverter::new(&self.device)? }); + } + Ok(()) + } + + fn try_consume(&mut self) -> Result> { + self.log_driver_status_once(); + // Follow the display: a "Use HDR" flip recreates the ring at the matching format. + self.poll_display_hdr(); + let latest = self.latest(); + // `latest` = (generation << 40) | (seq << 8) | slot. Reject any publish whose generation isn't + // our CURRENT ring (a stale old-ring publish racing a recreate, or the 0 sentinel we reset to) so + // we never consume an unwritten new-ring slot — eliminating the toggle-time garbage frame. 
+ if (latest >> 40) as u32 != self.generation { + return Ok(None); + } + let seq = (latest >> 8) & 0xFFFF_FFFF; + let slot = (latest & 0xff) as usize; + if seq == self.last_seq || slot >= self.slots.len() { + return Ok(None); + } + self.ensure_out_ring()?; + // Build the HDR converter BEFORE acquiring the slot so nothing between Acquire and Release can + // `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot). + if self.display_hdr { + self.ensure_converter()?; + } + let i = self.out_idx; + let (out, out_rtv) = { + let (t, rtv) = &self.out_ring[i]; + (t.clone(), rtv.clone()) + }; + let (_, pf) = self.out_format(); + + // Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the + // ~3 ms encode — NVENC reads the host out-ring slot, not the keyed-mutex slot), so the driver gets + // the slot back immediately and the encode of the PREVIOUS frame overlaps this convert. + let s = &self.slots[slot]; + if unsafe { s.mutex.AcquireSync(0, 8) }.is_err() { + return Ok(None); + } + unsafe { + if self.display_hdr { + // Sample the FP16 slot's SRV directly (no scratch copy) → BT.2020 PQ Rgb10a2. + if let Some(conv) = self.hdr_conv.as_ref() { + conv.convert(&self.context, &s.srv, &out_rtv, self.width, self.height); + } + } else { + // SDR: the slot is already 8-bit BGRA — one copy into the out-ring (hidden by pipelining). 
+ self.context.CopyResource(&out, &s.tex); + } + let _ = s.mutex.ReleaseSync(0); + } + self.out_idx = (i + 1) % self.out_ring.len(); + self.last_seq = seq; + self.last_present = Some((out.clone(), pf)); + Ok(Some(CapturedFrame { + width: self.width, + height: self.height, + pts_ns: now_ns(), + format: pf, + payload: FramePayload::D3d11(D3d11Frame { + texture: out, + device: self.device.clone(), + }), + })) + } + + fn repeat_last(&self) -> Option { + self.last_present.as_ref().map(|(tex, pf)| CapturedFrame { + width: self.width, + height: self.height, + pts_ns: now_ns(), + format: *pf, + payload: FramePayload::D3d11(D3d11Frame { + texture: tex.clone(), + device: self.device.clone(), + }), + }) + } +} + +/// Diagnostic observer (O3.1): create the IDD-push ring + debug block as the SYSTEM host (LocalSystem +/// — proper privileges, the gamepad pattern) ALONGSIDE the normal WGC path, which provides the +/// presentation trigger. Logs whether the driver's `run_core` ran and pushed frames into a +/// host-created ring — resolving the `run_core=0` ambiguity (a user-created ring may be unwritable by +/// the driver). Gated by `PUNKTFUNK_IDD_PUSH_OBSERVE`; spawns a short-lived sampling thread. 
+pub fn spawn_observer(target: WinCaptureTarget, preferred: Option<(u32, u32, u32)>) { + std::thread::spawn(move || { + let tid = target.target_id; + tracing::info!( + target_id = tid, + "IDD push OBSERVER: creating host ring (LocalSystem) + debug block alongside WGC" + ); + match IddPushCapturer::open(target, preferred, false, Box::new(())) { + Ok(mut cap) => { + let mut frames = 0u32; + for _ in 0..40 { + match cap.try_consume() { + Ok(Some(_)) => frames += 1, + Ok(None) => {} + Err(e) => tracing::warn!("IDD push OBSERVER: consume error: {e:#}"), + } + std::thread::sleep(Duration::from_millis(750)); + } + tracing::info!( + target_id = tid, + frames_from_ring = frames, + "IDD push OBSERVER: sampling done" + ); + cap.log_debug_block(); + } + Err(e) => tracing::warn!( + target_id = tid, + "IDD push OBSERVER: ring open failed: {e:#}" + ), + } + }); +} + +/// The discrete render GPU LUID (where NVENC runs), falling back to the monitor's `OsAdapterLuid`. +fn resolve_render_adapter_luid_or(fallback_packed: i64) -> LUID { + if let Some(l) = unsafe { crate::vdisplay::sudovda::resolve_render_adapter_luid() } { + return l; + } + LUID { + LowPart: (fallback_packed & 0xffff_ffff) as u32, + HighPart: (fallback_packed >> 32) as i32, + } +} + +impl Capturer for IddPushCapturer { + fn next_frame(&mut self) -> Result { + let deadline = Instant::now() + Duration::from_secs(20); + loop { + let _ = unsafe { WaitForSingleObject(self.event, 16) }; + if let Some(f) = self.try_consume()? { + return Ok(f); + } + if let Some(f) = self.repeat_last() { + return Ok(f); + } + if Instant::now() > deadline { + self.log_debug_block(); + let (st, detail, lo, hi) = unsafe { + ( + (*self.header).driver_status, + (*self.header).driver_status_detail, + (*self.header).driver_render_luid_low, + (*self.header).driver_render_luid_high, + ) + }; + bail!( + "no IDD-push frame within 20s (target {}) — driver_status={st} detail=0x{detail:08x} \ + driver_render_luid={hi:08x}:{lo:08x}. 
0=driver never attached (swap-chain not \ + assigned / driver not active), 1=attached but no frames (idle desktop?), 2=driver \ + couldn't open our textures (render-adapter mismatch).", + self.target_id + ); + } + } + } + + fn try_latest(&mut self) -> Result> { + self.try_consume() + } + + fn hdr_meta(&self) -> Option { + // While the display is HDR we emit BT.2020 PQ (Rgb10a2) → the encoder forces HEVC Main10 + the + // PQ VUI; pair that with a mastering-display SEI so any decoder tone-maps from a real grade. The + // driver doesn't (yet) forward the OS's IDDCX_HDR10_METADATA, so use the generic HDR10 baseline + // (the same metadata the native HDR path sends on the 0xCE datagram). + self.display_hdr.then(crate::hdr::generic_hdr10) + } + + fn pipeline_depth(&self) -> usize { + // 2 = one frame deferred: submit N+1 (capture + convert/copy into a fresh out-ring texture) while + // NVENC encodes N on the ASIC. We hand a rotating `OUT_RING` of output textures, so this is safe. + // `PUNKTFUNK_IDD_DEPTH` overrides (1 disables pipelining; clamp to ≤ OUT_RING so a frame in flight + // always has its own texture). + std::env::var("PUNKTFUNK_IDD_DEPTH") + .ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(2) + .clamp(1, OUT_RING) + } +} + +impl Drop for IddPushCapturer { + fn drop(&mut self) { + self.slots.clear(); + unsafe { + if !self.dbg_block.is_null() { + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { + Value: self.dbg_block.cast(), + }); + } + if !self.dbg_map.is_invalid() { + let _ = CloseHandle(self.dbg_map); + } + if !self.header.is_null() { + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { + Value: self.header.cast(), + }); + } + let _ = CloseHandle(self.event); + let _ = CloseHandle(self.map); + } + // _keepalive drops after, REMOVEing the virtual display. 
+ } +} diff --git a/crates/punktfunk-host/src/capture/wgc_relay.rs b/crates/punktfunk-host/src/capture/wgc_relay.rs index 2b78845..614196f 100644 --- a/crates/punktfunk-host/src/capture/wgc_relay.rs +++ b/crates/punktfunk-host/src/capture/wgc_relay.rs @@ -278,6 +278,13 @@ unsafe fn spawn_inner(cmdline: &str, w: u32, h: u32, hz: u32) -> Result= 10). TODO: derive want_hdr + // from a GameStream HDR flag once StreamConfig carries one. let mut capturer = - capture::capture_virtual_output(vout).context("capture virtual output")?; + capture::capture_virtual_output(vout, false).context("capture virtual output")?; capturer.set_active(true); return stream_body(&mut *capturer, &sock, cfg, running, force_idr, rfi_range); } diff --git a/crates/punktfunk-host/src/punktfunk1.rs b/crates/punktfunk-host/src/punktfunk1.rs index 8e18382..2d09261 100644 --- a/crates/punktfunk-host/src/punktfunk1.rs +++ b/crates/punktfunk-host/src/punktfunk1.rs @@ -2149,6 +2149,22 @@ fn session_watcher_loop(tx: std::sync::mpsc::Sender, stop: Arc>> = std::sync::Mutex::new(None); + +/// Serializes IDD-push session SETUP (preempt + monitor create + first frame). Held across setup, +/// released before the encode loop — so a reconnect FLOOD can never run concurrent monitor +/// create/teardown (the churn that fails the ADD IOCTL and wedges the driver). Each session finishes +/// setup before the next acquires this and preempts it, by which point the preempted session is in its +/// encode loop and releases its monitor promptly. +#[cfg(target_os = "windows")] +static IDD_SETUP_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + #[allow(clippy::too_many_arguments)] fn virtual_stream( session: Session, @@ -2197,9 +2213,30 @@ fn virtual_stream( bit_depth, "punktfunk/1 virtual display" ); + // IDD-push reconnect preempt: a fresh connection means the prior client is gone. Hold IDD_SETUP_LOCK + // across the preempt + pipeline build so a reconnect FLOOD can't run concurrent monitor + // create/teardown. 
Then STOP the prior session (it ends cleanly while its monitor still composites + // frames) and WAIT for it to release its monitor, before building a FRESH one — instead of the + // driver-churning teardown of a monitor under a still-live session. Register THIS session's stop so + // the next reconnect preempts it. + #[cfg(target_os = "windows")] + let idd_setup_guard = std::env::var_os("PUNKTFUNK_IDD_PUSH") + .is_some() + .then(|| IDD_SETUP_LOCK.lock().unwrap()); + #[cfg(target_os = "windows")] + if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() { + let prev = IDD_SESSION_STOP.lock().unwrap().replace(stop.clone()); + if let Some(prev_stop) = prev { + prev_stop.store(true, Ordering::SeqCst); + crate::vdisplay::sudovda::wait_for_monitor_released(std::time::Duration::from_secs(3)); + } + } let mut vd = crate::vdisplay::open(compositor)?; let (mut capturer, mut enc, mut frame, mut interval) = build_pipeline_with_retry(&mut vd, mode, bitrate_kbps, bit_depth)?; + // Setup done — release the IDD-push setup lock so the next reconnect can begin (and preempt us). + #[cfg(target_os = "windows")] + drop(idd_setup_guard); // Windows single-process DDA path (PUNKTFUNK_NO_WGC=1): the SudoVDA virtual display, isolated as the // SOLE active output, goes into fullscreen independent-flip (one plane on one display) which Desktop @@ -2276,6 +2313,17 @@ fn virtual_stream( let mut capture_rebuilds: u32 = 0; // Last HDR mastering metadata we forwarded — re-sent as 0xCE on change/keyframe (see below). let mut last_hdr_meta: Option = None; + // Frames submitted to NVENC but not yet polled (capture_ns, pacing deadline). With a capturer that + // hands a fresh output texture per frame, the loop submits N+1 before polling N (pipeline depth > 1), + // overlapping the convert/copy of N+1 on the 3D engine with the encode of N on the NVENC ASIC. 
+ let mut inflight: std::collections::VecDeque<(u64, std::time::Instant)> = + std::collections::VecDeque::new(); + // Diagnostic: distinguish NEW captured frames (the source produced a fresh frame) from REPEATS (the + // loop re-encoded the last frame because `try_latest` had nothing). A low new-frame rate at a high + // send rate ⇒ the capture source isn't producing frames (e.g. an IDD virtual display DWM isn't + // compositing), NOT an encoder problem. Logged every 2 s when `PUNKTFUNK_PERF`. + let (mut diag_new, mut diag_repeat) = (0u64, 0u64); + let mut diag_at = std::time::Instant::now(); while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline { // Mid-stream session switch (the box flipped Gaming↔Desktop): rebuild the WHOLE backend in // place — a different compositor at the SAME client mode — keeping the Session + send thread @@ -2384,9 +2432,10 @@ fn virtual_stream( match capturer.try_latest() { Ok(Some(f)) => { frame = f; + diag_new += 1; capture_rebuilds = 0; // a delivered frame clears the consecutive-loss counter } - Ok(None) => {} // no new frame (static desktop / mid-rebuild) — repeat the last frame + Ok(None) => diag_repeat += 1, // no new frame (static desktop / mid-rebuild) — repeat the last // The capture source died (PipeWire/compositor thread ended, virtual output gone). 
Rather // than tear the whole session down — the client has no reconnect path and would have to // cold-restart the handshake — rebuild the pipeline IN PLACE at the current mode, exactly @@ -2411,6 +2460,18 @@ fn virtual_stream( next = std::time::Instant::now(); } } + if perf && diag_at.elapsed() >= std::time::Duration::from_secs(2) { + let secs = diag_at.elapsed().as_secs_f64(); + tracing::info!( + new_fps = format!("{:.0}", diag_new as f64 / secs), + repeat_fps = format!("{:.0}", diag_repeat as f64 / secs), + "capture diag: NEW frames from the source vs REPEATS (low new_fps at high send rate ⇒ \ + the source isn't producing frames, not an encode stall)" + ); + diag_new = 0; + diag_repeat = 0; + diag_at = std::time::Instant::now(); + } // The source's static HDR mastering metadata (Windows GetDesc1; None on Linux/SDR) is the // single source of truth: hand it to the encoder (in-band SEI on keyframes) and, when it // changes, to the client (0xCE). Re-sent on each keyframe below so a dropped best-effort @@ -2421,13 +2482,26 @@ fn virtual_stream( if resend_meta { last_hdr_meta = hdr_meta; } + // How deep to pipeline (1 = synchronous submit→poll, the original behaviour). The IDD-push + // capturer hands a rotating ring of output textures, so it returns >1; other capturers default 1. + let depth = capturer.pipeline_depth().max(1); let capture_ns = now_ns(); enc.submit(&frame).context("encoder submit")?; - // The deadline for this frame's packets (the next frame's due time); the send thread paces - // up to here so a high-bitrate frame spreads over the interval instead of bursting. + // This frame's pacing deadline (the next frame's due time); the send thread spreads a big frame + // up to here. Each in-flight frame carries its own (capture_ns, deadline) for when it's polled. next += interval; + inflight.push_back((capture_ns, next)); + // Drain the OLDEST in-flight frames, keeping at most depth-1 deferred. 
At depth 1 this polls + // immediately after every submit (synchronous); at depth 2 it polls N right after submitting N+1, + // so the encode of N overlaps the convert/copy of N+1. NVENC's `pending` is FIFO, so poll() returns + // the oldest submitted frame's AU — matching `inflight.pop_front()`. let mut send_gone = false; - while let Some(au) = enc.poll().context("encoder poll")? { + while inflight.len() >= depth { + let au = match enc.poll().context("encoder poll")? { + Some(au) => au, + None => break, // no AU ready for a submitted frame (shouldn't happen — poll blocks) + }; + let (cap_ns, deadline) = inflight.pop_front().expect("inflight non-empty"); let flags = if au.keyframe { (FLAG_PIC | FLAG_SOF) as u32 } else { @@ -2442,12 +2516,12 @@ fn virtual_stream( resend_meta = false; } } - let encode_us = (now_ns().saturating_sub(capture_ns) / 1000) as u32; + let encode_us = (now_ns().saturating_sub(cap_ns) / 1000) as u32; let msg = FrameMsg { data: au.data, - capture_ns, + capture_ns: cap_ns, flags, - deadline: next, + deadline, encode_us, }; // Hand to the send thread; this blocks (backpressure) if it's behind. An Err means it @@ -2466,6 +2540,28 @@ fn virtual_stream( None => next = std::time::Instant::now(), } } + // Drain the in-flight tail (the depth-1 frames submitted but not yet polled) so the last frames still + // reach the client instead of being dropped on the way out. + while let Some((cap_ns, deadline)) = inflight.pop_front() { + let Ok(Some(au)) = enc.poll() else { break }; + let flags = if au.keyframe { + (FLAG_PIC | FLAG_SOF) as u32 + } else { + FLAG_PIC as u32 + }; + let encode_us = (now_ns().saturating_sub(cap_ns) / 1000) as u32; + let msg = FrameMsg { + data: au.data, + capture_ns: cap_ns, + flags, + deadline, + encode_us, + }; + if frame_tx.send(msg).is_err() { + break; + } + sent += 1; + } // Signal the send thread to drain + exit (drop the channel), then join it. 
drop(frame_tx); let _ = send_thread.join(); @@ -2484,6 +2580,14 @@ fn should_use_helper() -> bool { if std::env::var_os("PUNKTFUNK_NO_HELPER").is_some() || crate::capture::wgc_disabled() { return false; } + // IDD direct-push captures IN-PROCESS in Session 0: the pf-vdisplay driver delivers frames to the + // SYSTEM host's session via shared memory and NVENC is headless, so no user-session WGC helper is + // needed for VIDEO (and a Session-1 helper couldn't open the Session-0 shared textures anyway). + // NOTE: input injection (SendInput) from Session 0 can't reach the user's Session-1 desktop yet — + // a known follow-up; this path validates the video transport. See docs/windows-virtual-display-rust-port.md. + if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() { + return false; + } std::env::var_os("PUNKTFUNK_FORCE_HELPER").is_some() || crate::capture::wgc_relay::running_as_system() } @@ -2576,6 +2680,15 @@ fn virtual_stream_relay( let (mut _keepalive, mut relay, mut target, mut effective_hz) = build(&mut vd, mode)?; let mut cur_mode = mode; + // O3.1: optionally observe the IDD-push ring alongside WGC (WGC = the presentation trigger) to + // confirm the 0257 driver pushes frames into a HOST-created ring. Diagnostic only; gated. + if std::env::var_os("PUNKTFUNK_IDD_PUSH_OBSERVE").is_some() { + crate::capture::idd_push::spawn_observer( + target.clone(), + Some((cur_mode.width, cur_mode.height, effective_hz)), + ); + } + // The host's own DDA capturer+encoder for the SECURE (Winlogon) desktop, which WGC — and thus the // helper — cannot capture. Opened lazily on the first secure transition (so a session that never // hits a UAC/lock screen never pays for a second NVENC session), then kept for fast re-switch. 
@@ -3014,8 +3127,12 @@ fn build_pipeline( "compositor did not honor the requested refresh — encoding at the achieved rate" ); } - let mut capturer = - crate::capture::capture_virtual_output(vout).context("capture virtual output")?; + // HDR vs SDR for the IDD-push conversion: a negotiated 10-bit session (client advertised + // VIDEO_CAP_10BIT + host opted in via PUNKTFUNK_10BIT) is our HDR path → BT.2020 PQ Rgb10a2; + // otherwise the FP16 IDD frames are converted to 8-bit SDR. (Ignored by non-IDD-push backends, + // which auto-detect HDR from the monitor state.) + let mut capturer = crate::capture::capture_virtual_output(vout, bit_depth >= 10) + .context("capture virtual output")?; capturer.set_active(true); let frame = capturer.next_frame().context("first frame")?; // `bit_depth` is the handshake-negotiated value (8, or 10 = HEVC Main10 when the client diff --git a/crates/punktfunk-host/src/spike.rs b/crates/punktfunk-host/src/spike.rs index 9e31ce3..9d1ba03 100644 --- a/crates/punktfunk-host/src/spike.rs +++ b/crates/punktfunk-host/src/spike.rs @@ -76,7 +76,7 @@ pub fn run(opts: Options) -> Result<()> { refresh_hz: opts.fps, }) .context("create virtual output")?; - capture::capture_virtual_output(vout).context("capture virtual output")? + capture::capture_virtual_output(vout, false).context("capture virtual output")? } }; diff --git a/crates/punktfunk-host/src/vdisplay/sudovda.rs b/crates/punktfunk-host/src/vdisplay/sudovda.rs index f85b95e..db21005 100644 --- a/crates/punktfunk-host/src/vdisplay/sudovda.rs +++ b/crates/punktfunk-host/src/vdisplay/sudovda.rs @@ -9,8 +9,25 @@ use std::ffi::c_void; use std::mem::size_of; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Mutex, Once}; + +/// Monotonic monitor generation. 
Each [`create_monitor`] stamps the next value onto the [`Monitor`] +/// and its [`MonitorLease`]s, so a lease whose monitor was already torn down + recreated (the IDD-push +/// reconnect-preempt path) is ignored on drop instead of decrementing the NEW monitor's refcount. +static MON_GEN: AtomicU64 = AtomicU64::new(1); + +/// The gen of the CURRENTLY-active monitor. A session capturer captures this at open and re-checks it +/// each frame; when it changes (a reconnect preempted + recreated the monitor), the old session bails +/// IMMEDIATELY instead of lingering on the dead ring's 20s frame deadline — which would otherwise hold +/// its NVENC encoder open and exhaust the GPU's encode-session limit under rapid reconnects. +pub(crate) static CURRENT_MON_GEN: AtomicU64 = AtomicU64::new(0); + +/// IDD-push mode: a new client connection preempts + recreates the monitor (single-client reconnect), +/// because a REUSED IddCx monitor's swap-chain is dead. Off → monitors are shared across sessions. +fn idd_push_mode() -> bool { + std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() +} use std::thread::{self, JoinHandle}; use std::time::{Duration, Instant}; @@ -27,7 +44,8 @@ use windows::Win32::Devices::Display::{ DISPLAYCONFIG_DEVICE_INFO_GET_SOURCE_NAME, DISPLAYCONFIG_DEVICE_INFO_SET_ADVANCED_COLOR_STATE, DISPLAYCONFIG_GET_ADVANCED_COLOR_INFO, DISPLAYCONFIG_MODE_INFO, DISPLAYCONFIG_PATH_INFO, DISPLAYCONFIG_SET_ADVANCED_COLOR_STATE, DISPLAYCONFIG_SOURCE_DEVICE_NAME, - QDC_ONLY_ACTIVE_PATHS, SDC_ALLOW_CHANGES, SDC_APPLY, SDC_USE_SUPPLIED_DISPLAY_CONFIG, + QDC_ONLY_ACTIVE_PATHS, SDC_ALLOW_CHANGES, SDC_APPLY, SDC_FORCE_MODE_ENUMERATION, + SDC_SAVE_TO_DATABASE, SDC_USE_SUPPLIED_DISPLAY_CONFIG, }; use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID}; use windows::Win32::Graphics::Gdi::{ @@ -119,7 +137,9 @@ unsafe fn set_render_adapter(h: HANDLE, luid: LUID) -> Result<()> { /// Desktop Duplication (e.g. the RTX 4090). 
Default: the discrete adapter with the most /// `DedicatedVideoMemory`, skipping WARP / Basic-Render and the SudoVDA software adapter (≈0 VRAM). /// `PUNKTFUNK_RENDER_ADAPTER=` forces a match by Description (Apollo's `adapter_name`). -unsafe fn resolve_render_adapter_luid() -> Option { +/// `pub(crate)` so the IDD direct-push capturer can create its shared textures on the same discrete +/// GPU it pins here (and where NVENC runs). +pub(crate) unsafe fn resolve_render_adapter_luid() -> Option { use windows::Win32::Graphics::Dxgi::{CreateDXGIFactory1, IDXGIFactory1}; let want = std::env::var("PUNKTFUNK_RENDER_ADAPTER") .ok() @@ -497,13 +517,32 @@ unsafe fn isolate_displays_ccd(keep_target_id: u32) -> Option { } } if others == 0 { - tracing::info!("display isolate (CCD): SudoVDA target {keep_target_id} already the only active display"); + // The virtual path shows active in the CCD database (from set_active_mode's legacy + // ChangeDisplaySettingsExW), but a legacy mode-set does NOT drive the IddCx adapter's + // EVT_IDD_CX_ADAPTER_COMMIT_MODES — and without COMMIT_MODES the OS never calls + // ASSIGN_SWAPCHAIN, so the driver never receives composed frames. Force an explicit CCD + // SetDisplayConfig commit of the (sole) virtual path so the IddCx path actually activates. + // SDC_FORCE_MODE_ENUMERATION makes the OS re-enumerate + re-commit even though the CCD DB + // already lists the path active. 
+ let rc = SetDisplayConfig( + Some(paths.as_slice()), + Some(modes.as_slice()), + SDC_APPLY + | SDC_USE_SUPPLIED_DISPLAY_CONFIG + | SDC_ALLOW_CHANGES + | SDC_SAVE_TO_DATABASE + | SDC_FORCE_MODE_ENUMERATION, + ); + tracing::info!("display isolate (CCD): forced CCD re-commit of sole virtual path {keep_target_id} rc={rc:#x} (drives IddCx COMMIT_MODES → ASSIGN_SWAPCHAIN)"); return Some(saved); } let rc = SetDisplayConfig( Some(paths.as_slice()), Some(modes.as_slice()), - SDC_APPLY | SDC_USE_SUPPLIED_DISPLAY_CONFIG | SDC_ALLOW_CHANGES, + SDC_APPLY + | SDC_USE_SUPPLIED_DISPLAY_CONFIG + | SDC_ALLOW_CHANGES + | SDC_FORCE_MODE_ENUMERATION, ); if rc == 0 { tracing::info!("display isolate (CCD): deactivated {others} other display(s) — SudoVDA target {keep_target_id} is now the sole desktop"); @@ -587,6 +626,8 @@ struct Monitor { stop: Arc, pinger: Option>, ccd_saved: Option, + /// Generation stamp ([`MON_GEN`]); a [`MonitorLease`] only releases if its gen still matches. + gen: u64, } enum MgrState { @@ -670,6 +711,14 @@ unsafe fn create_monitor(device: isize, mode: Mode, watchdog_s: u32) -> Result only on a box that genuinely needs steering. let pinned = if std::env::var("PUNKTFUNK_RENDER_ADAPTER").is_ok() { unsafe { resolve_render_adapter_luid() } + } else if std::env::var_os("PUNKTFUNK_IDD_PUSH").is_some() { + // P2 direct frame push: the host opens the driver's shared textures AND runs NVENC on the + // RENDER adapter, so on a hybrid box (4090 + iGPU) it MUST be the discrete encoder GPU — + // an iGPU-rendered surface is untouchable by NVENC. pf-vdisplay HONORS SET_RENDER_ADAPTER + // (SudoVDA ignored it), so pin the discrete GPU. The driver also reports the resulting + // render LUID in the shared header, so the host binds correctly even if this is overridden. 
+ tracing::info!("IDD push: pinning the discrete render GPU (SET_RENDER_ADAPTER)"); + unsafe { resolve_render_adapter_luid() } } else { tracing::info!( "SudoVDA SET_RENDER_ADAPTER skipped (Apollo-parity: no render pin — avoids cross-GPU \ @@ -735,7 +784,9 @@ unsafe fn create_monitor(device: isize, mode: Mode, watchdog_s: u32) -> Result { if !warned { - tracing::warn!("SudoVDA keepalive PING failed (control handle lost?): {e:#}"); + tracing::warn!( + "SudoVDA keepalive PING failed (control handle lost?): {e:#}" + ); warned = true; } } @@ -796,6 +847,7 @@ unsafe fn create_monitor(device: isize, mode: Mode, watchdog_s: u32) -> Result Result { let device = mgr_ensure_device(&mut g)?; let watchdog_s = g.watchdog_s; + // IDD-push: a new connection while a monitor is live = a single-client RECONNECT (the prior client + // is gone — IDD-push is one display, no concurrency). A REUSED IddCx monitor's swap-chain is DEAD, + // so joining it would hand the new client a black screen until the old session times out. PREEMPT: + // tear the old monitor down (an explicit `teardown()` restores topology + sends IOCTL_REMOVE) and fall through to create + // a FRESH one. The old session's lease is gen-stamped, so its later drop is ignored (mgr_release + // no-op) and can't tear down the new monitor. + if idd_push_mode() + && matches!( + g.state, + MgrState::Active { .. } | MgrState::Lingering { .. } + ) + { + if let MgrState::Active { mon, .. } | MgrState::Lingering { mon, .. } = + std::mem::replace(&mut g.state, MgrState::Idle) + { + tracing::info!( + old_target = mon.target_id, + "IDD-push reconnect — preempting the prior session, recreating a fresh monitor" + ); + // teardown() — NOT drop() — sends IOCTL_REMOVE (and restores topology).
`Monitor` has NO + // `Drop` impl, so a bare `drop(mon)` orphaned the IddCx monitor in the driver: it was never + // departed, so it kept a live D3D device + a stuck swap-chain processor thread, and these + // accumulated every reconnect (the driver-side churn leak: +1 device, ~36 nvwgf2umx threads, + // ~50 MB VRAM per session, until it choked). teardown frees it via the driver's do_remove. + unsafe { mon.teardown(device) }; + // Let the OS finish the ASYNC IddCx monitor departure before the next ADD. A back-to-back + // REMOVE→ADD races the teardown and the ADD IOCTL is rejected (`DeviceIoControl failed`) + // under reconnect churn. Held under the MGR lock, but IDD-push setup is already serialized + // (IDD_SETUP_LOCK), so this only paces the recreate — exactly what a reconnect flood needs. + thread::sleep(Duration::from_millis(400)); + } + } + // A live monitor already exists — join it (refcount++). This covers a concurrent session AND the // build-then-drop overlap of a mid-stream Reconfigure / secure-return (the new lease is taken while // the old is still held). 
If the requested mode differs, reconfigure the shared monitor to it so a @@ -912,11 +997,13 @@ fn mgr_acquire(mode: Mode) -> Result { ); let pm = Some((mon.mode.width, mon.mode.height, mon.mode.refresh_hz)); let target = mon.target(); + let gen = mon.gen; + CURRENT_MON_GEN.store(gen, Ordering::Relaxed); return Ok(VirtualOutput { node_id: 0, preferred_mode: pm, win_capture: target, - keepalive: Box::new(MonitorLease), + keepalive: Box::new(MonitorLease { gen }), }); } @@ -937,12 +1024,14 @@ fn mgr_acquire(mode: Mode) -> Result { }; let pm = Some((mon.mode.width, mon.mode.height, mon.mode.refresh_hz)); let target = mon.target(); + let gen = mon.gen; + CURRENT_MON_GEN.store(gen, Ordering::Relaxed); g.state = MgrState::Active { mon, refs: 1 }; Ok(VirtualOutput { node_id: 0, preferred_mode: pm, win_capture: target, - keepalive: Box::new(MonitorLease), + keepalive: Box::new(MonitorLease { gen }), }) } @@ -966,8 +1055,18 @@ unsafe fn mgr_reconfigure(mon: &mut Monitor, mode: Mode) { } /// Release a session's hold: refcount-- ; when the last session leaves, LINGER before teardown. -fn mgr_release() { +/// `gen` is the lease's monitor generation: a STALE lease (its monitor was already torn down + +/// recreated under it — the IDD-push reconnect-preempt path) does nothing, so it can't decrement the +/// CURRENT (fresh) monitor's refcount and tear it down. +fn mgr_release(gen: u64) { let mut g = MGR.lock().unwrap(); + let stale = match &g.state { + MgrState::Active { mon, .. } | MgrState::Lingering { mon, .. } => mon.gen != gen, + MgrState::Idle => true, + }; + if stale { + return; + } g.state = match std::mem::replace(&mut g.state, MgrState::Idle) { MgrState::Active { mon, refs } if refs > 1 => MgrState::Active { mon, @@ -988,6 +1087,28 @@ fn mgr_release() { }; } +/// Wait (up to `timeout`) for the active monitor to be RELEASED — i.e. the MGR is no longer `Active` +/// (the prior session dropped its lease → `Lingering`/`Idle`). 
Used by the IDD-push reconnect preempt: +/// after signalling the old session to stop, we wait here so it tears its monitor down CLEANLY (while +/// frames still flow) before we acquire a fresh one — instead of dropping the monitor out from under a +/// still-live session, which churns the driver's ADD/REMOVE path and wedges it under rapid reconnects. +pub(crate) fn wait_for_monitor_released(timeout: Duration) { + let deadline = Instant::now() + timeout; + loop { + if !matches!(MGR.lock().unwrap().state, MgrState::Active { .. }) { + return; + } + if Instant::now() >= deadline { + tracing::warn!( + "IDD-push preempt: prior session didn't release the monitor within {timeout:?} — \ + proceeding (mgr_acquire will preempt it)" + ); + return; + } + thread::sleep(Duration::from_millis(25)); + } +} + /// Background timer (started once): tear down a monitor that has lingered past its deadline (→ Idle), /// so a physical-screen user gets their screen back after they stop streaming. fn ensure_linger_timer() { @@ -1012,11 +1133,15 @@ fn ensure_linger_timer() { }); } -/// A session's lease on the shared monitor. Drop releases the refcount (→ linger when it hits 0). -struct MonitorLease; +/// A session's lease on the shared monitor. Drop releases the refcount (→ linger when it hits 0), +/// UNLESS the monitor was already torn down + recreated under it (gen mismatch — the IDD-push +/// reconnect-preempt path), in which case the drop is a no-op so it can't tear down the new monitor. 
+struct MonitorLease { + gen: u64, +} impl Drop for MonitorLease { fn drop(&mut self) { - mgr_release(); + mgr_release(self.gen); } } diff --git a/crates/punktfunk-host/src/wgc_helper.rs b/crates/punktfunk-host/src/wgc_helper.rs index 7a08f99..bbabf7a 100644 --- a/crates/punktfunk-host/src/wgc_helper.rs +++ b/crates/punktfunk-host/src/wgc_helper.rs @@ -63,6 +63,22 @@ pub fn run(opts: HelperOptions) -> Result<()> { WgcCapturer::open(target, Some((opts.width, opts.height, opts.fps))).context("WGC open")?; cap.set_active(true); + // O3 present-trigger experiment: spawn a thread that PRESENTS a D3D swapchain to the virtual + // display (a present SOURCE), testing whether that — unlike WGC's READ — makes the OS assign the + // driver's IddCx swap-chain (so the driver's run_core runs + can push). Gated; diagnostic. + if std::env::var_os("PUNKTFUNK_PRESENT_TRIGGER").is_some() { + let (w, h) = (opts.width, opts.height); + std::thread::Builder::new() + .name("pf-present-trigger".into()) + .spawn(move || { + tracing::info!("present-trigger: starting D3D present loop on the virtual display"); + if let Err(e) = unsafe { present_trigger(w, h) } { + tracing::warn!("present-trigger error: {e:#}"); + } + }) + .ok(); + } + // First frame establishes the real dimensions + whether the desktop is HDR (the encoder derives // Main10/HDR from the frame's PixelFormat::Rgb10a2). Then open NVENC on the capture device. let first = cap.next_frame().context("first WGC frame")?; @@ -107,47 +123,55 @@ pub fn run(opts: HelperOptions) -> Result<()> { let stdout = std::io::stdout(); let mut out = stdout.lock(); - // Encode pipeline depth. The loop keeps DEPTH frames in flight so per-frame GPU-scheduling waits - // can overlap. NOTE: depth > 1 was measured to REGRESS under a GPU-saturating game — the encodes - // serialize on the contended GPU anyway, so a deeper queue just stacks latency (≈ depth × frame - // time) without raising throughput. 
Default 1 (the validated-best); `PUNKTFUNK_ENCODE_DEPTH` (1..=6) - // can raise it if a future workload is genuinely encode-throughput-bound rather than scheduling-bound. - let depth: usize = std::env::var("PUNKTFUNK_ENCODE_DEPTH") - .ok() - .and_then(|s| s.trim().parse::().ok()) - .filter(|&d| (1..=6).contains(&d)) - .unwrap_or(1); - tracing::info!(depth, "WGC helper: encode pipeline depth"); + // FIXED-CADENCE encode loop (mirrors the single-process `punktfunk1::virtual_stream` loop). The + // host runs as SYSTEM and relays our AUs; to deliver a STEADY `fps` to the client (the "fixed 240" + // goal) we must NOT gate on WGC's content-driven FrameArrived — `WgcCapturer::next_frame` blocks up + // to its ~8 ms static-repeat timeout when the desktop is quiet, capping a barely-changing desktop + // at ~125 fps regardless of the GPU. Instead we pace to `1/fps` and take the FRESHEST frame with the + // non-blocking `try_latest`, repeating the last one when nothing newer arrived. Depth-1: NVENC's + // `poll` (lock_bitstream) blocks until the just-submitted frame is encoded, so exactly one frame is + // in flight per iteration. A deeper pipeline was measured to only stack latency under a + // GPU-saturating game (the encodes serialize on the contended GPU anyway) — the in-game lever is + // the GPU scheduling priority the SYSTEM host stamps on us, not pipeline depth.
+ let interval = std::time::Duration::from_secs_f64(1.0 / opts.fps.max(1) as f64); let perf = std::env::var_os("PUNKTFUNK_PERF").is_some(); let mut frames = 0u64; - let mut cap_wait_ns = 0u64; - let mut encode_ns = 0u64; // time blocked in lock_bitstream (the oldest in-flight encode) - let mut write_ns = 0u64; // time blocked writing the AU to the stdout pipe (relay backpressure) + let mut repeats = 0u64; // frames where no newer capture had arrived (duplicate re-encode) + let mut cap_ns = 0u64; // time in try_latest (capture + video-processor convert) + let mut encode_ns = 0u64; // time blocked in lock_bitstream + let mut write_ns = 0u64; // time writing the AU to the stdout pipe (relay backpressure) let mut window = std::time::Instant::now(); - // Prime: submit `depth` frames before the first poll so NVENC has that many encodes in flight. - // We don't hold the `CapturedFrame`s past `submit`: NVENC keeps its own registered texture clone - // and the capturer's ring/held-set own the canonical refs (sized for `depth`), so the in-flight - // inputs stay valid after our clones drop. - enc.submit(&first).context("first encoder submit")?; - drop(first); - for _ in 1..depth { - let f = cap.next_frame().context("WGC prime frame")?; - enc.submit(&f).context("prime encoder submit")?; - } + // `frame` is held across iterations and repeated when `try_latest` has nothing newer, so a static + // desktop still clocks `fps`. The capturer's held-set / output ring keep its texture alive across + // the repeat; reassigning `frame` on a fresh capture drops the prior one (already drained by poll). + let mut frame = first; + let mut next = std::time::Instant::now(); loop { if kf.swap(false, Ordering::Relaxed) { enc.request_keyframe(); } - // Pop + forward the OLDEST in-flight frame (FIFO). With `depth` outstanding it has had - // depth-1 frames' worth of GPU slots to finish, so this rarely blocks under load. 
- let p0 = std::time::Instant::now(); - let polled = enc.poll().context("encoder poll")?; - if perf { - encode_ns += p0.elapsed().as_nanos() as u64; + // Freshest captured frame, or repeat the last (no new composition: static desktop / between a + // game's presents). Non-blocking, so the cadence is OURS, not WGC's event rate. + let t0 = std::time::Instant::now(); + match cap.try_latest().context("WGC try_latest")? { + Some(f) => frame = f, + None => repeats += 1, } - if let Some(au) = polled { + if perf { + cap_ns += t0.elapsed().as_nanos() as u64; + } + enc.submit(&frame).context("encoder submit")?; + // Drain the just-submitted frame. NVENC's poll blocks in lock_bitstream until it's encoded, so + // this returns exactly one AU (then None) — depth-1, no accumulation. + loop { + let p0 = std::time::Instant::now(); + let polled = enc.poll().context("encoder poll")?; + if perf { + encode_ns += p0.elapsed().as_nanos() as u64; + } + let Some(au) = polled else { break }; let w0 = std::time::Instant::now(); let wrote = write_au(&mut out, &au); if perf { @@ -158,13 +182,13 @@ pub fn run(opts: HelperOptions) -> Result<()> { return Ok(()); } } - // Refill: capture + submit to keep `depth` frames in flight. - let t0 = std::time::Instant::now(); - let next = cap.next_frame().context("WGC next frame")?; - if perf { - cap_wait_ns += t0.elapsed().as_nanos() as u64; + // Pace to this frame's due time. If we're already past it (encode couldn't keep up under a + // GPU-saturating game), skip the sleep and re-baseline so we don't spiral into catch-up. 
+ next += interval; + match next.checked_duration_since(std::time::Instant::now()) { + Some(d) => std::thread::sleep(d), + None => next = std::time::Instant::now(), } - enc.submit(&next).context("encoder submit")?; if perf { frames += 1; @@ -174,13 +198,15 @@ pub fn run(opts: HelperOptions) -> Result<()> { let per = |ns: u64| format!("{:.2}", ns as f64 / frames as f64 / 1e6); tracing::info!( fps = format!("{:.1}", frames as f64 / secs), - cap_wait_ms = per(cap_wait_ns), + repeats, + cap_ms = per(cap_ns), encode_ms = per(encode_ns), write_ms = per(write_ns), - "WGC helper perf (depth-pipelined; encode_ms=lock_bitstream on the oldest)" + "WGC helper perf (fixed-cadence depth-1; encode_ms=lock_bitstream; repeats=duplicated frames)" ); frames = 0; - cap_wait_ns = 0; + repeats = 0; + cap_ns = 0; encode_ns = 0; write_ns = 0; window = std::time::Instant::now(); @@ -197,3 +223,115 @@ fn write_au(out: &mut impl Write, au: &encode::EncodedFrame) -> std::io::Result< out.write_all(&au.data)?; out.flush() } + +/// O3 present-trigger experiment (see the gated call in `run`). Creates a small swapchain-backed +/// window on the virtual display (the CCD-isolated primary) and presents continuously — an active +/// present SOURCE on the display — to test whether that makes the OS assign the driver's IddCx +/// swap-chain (which WGC's read does not). Runs forever on its own thread. +/// +/// # Safety +/// Win32/D3D11 FFI; called once on a dedicated helper thread. 
+unsafe fn present_trigger(disp_w: u32, disp_h: u32) -> Result<()> { + use windows::core::{w, Interface}; + use windows::Win32::Foundation::{HMODULE, HWND, LPARAM, LRESULT, WPARAM}; + use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE; + use windows::Win32::Graphics::Direct3D11::{ + D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, + ID3D11Texture2D, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_SDK_VERSION, + }; + use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC}; + use windows::Win32::Graphics::Dxgi::{ + IDXGIAdapter, IDXGIDevice, IDXGIFactory2, DXGI_PRESENT, DXGI_SWAP_CHAIN_DESC1, + DXGI_SWAP_EFFECT_FLIP_DISCARD, DXGI_USAGE_RENDER_TARGET_OUTPUT, + }; + use windows::Win32::System::LibraryLoader::GetModuleHandleW; + use windows::Win32::UI::WindowsAndMessaging::{ + CreateWindowExW, DefWindowProcW, DispatchMessageW, PeekMessageW, RegisterClassW, + ShowWindow, MSG, PM_REMOVE, SW_SHOWNOACTIVATE, WNDCLASSW, WS_EX_NOACTIVATE, WS_EX_TOPMOST, + WS_POPUP, WS_VISIBLE, + }; + + unsafe extern "system" fn wndproc(h: HWND, m: u32, wp: WPARAM, lp: LPARAM) -> LRESULT { + DefWindowProcW(h, m, wp, lp) + } + + let hinst: HMODULE = GetModuleHandleW(None)?; + let cls = w!("pfPresentTrigger"); + let wc = WNDCLASSW { + lpfnWndProc: Some(wndproc), + hInstance: hinst.into(), + lpszClassName: cls, + ..Default::default() + }; + RegisterClassW(&wc); + // Small window at the top-left of the (primary = virtual) display so it barely obscures the + // captured desktop; topmost + no-activate so it doesn't steal focus. 
+ let win_w = disp_w.min(96) as i32; + let win_h = disp_h.min(96) as i32; + let hwnd: HWND = CreateWindowExW( + WS_EX_TOPMOST | WS_EX_NOACTIVATE, + cls, + w!("pf-present"), + WS_POPUP | WS_VISIBLE, + 0, + 0, + win_w, + win_h, + None, + None, + Some(hinst.into()), + None, + )?; + let _ = ShowWindow(hwnd, SW_SHOWNOACTIVATE); + + let mut device: Option = None; + let mut context: Option = None; + D3D11CreateDevice( + None, + D3D_DRIVER_TYPE_HARDWARE, + HMODULE::default(), + D3D11_CREATE_DEVICE_BGRA_SUPPORT, + None, + D3D11_SDK_VERSION, + Some(&mut device), + None, + Some(&mut context), + )?; + let device = device.context("present-trigger d3d11 device")?; + let context = context.context("present-trigger d3d11 context")?; + + let dxgi_dev: IDXGIDevice = device.cast()?; + let adapter: IDXGIAdapter = dxgi_dev.GetAdapter()?; + let factory: IDXGIFactory2 = adapter.GetParent()?; + let scd = DXGI_SWAP_CHAIN_DESC1 { + Width: win_w as u32, + Height: win_h as u32, + Format: DXGI_FORMAT_B8G8R8A8_UNORM, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + BufferUsage: DXGI_USAGE_RENDER_TARGET_OUTPUT, + BufferCount: 2, + SwapEffect: DXGI_SWAP_EFFECT_FLIP_DISCARD, + ..Default::default() + }; + let swapchain = factory.CreateSwapChainForHwnd(&device, hwnd, &scd, None, None)?; + tracing::info!("present-trigger: swapchain created on the virtual display; presenting"); + + let mut frame = 0u32; + loop { + let mut msg = MSG::default(); + while PeekMessageW(&mut msg, None, 0, 0, PM_REMOVE).as_bool() { + let _ = DispatchMessageW(&msg); + } + let back: ID3D11Texture2D = swapchain.GetBuffer(0)?; + let mut rtv: Option = None; + device.CreateRenderTargetView(&back, None, Some(&mut rtv))?; + let rtv = rtv.context("present-trigger rtv")?; + let c = (frame % 120) as f32 / 120.0; + context.ClearRenderTargetView(&rtv, &[c, 0.1, 0.2, 1.0]); + let _ = swapchain.Present(1, DXGI_PRESENT(0)); + frame = frame.wrapping_add(1); + } +} diff --git a/docs/windows-virtual-display-rust-port.md 
b/docs/windows-virtual-display-rust-port.md index 0797525..5769f03 100644 --- a/docs/windows-virtual-display-rust-port.md +++ b/docs/windows-virtual-display-rust-port.md @@ -115,6 +115,225 @@ binary keeps running (silently). **Devnode hygiene:** create the root devnode wi on every `pnputil /add-driver` (they have `hwid root\pf_vdisplay`, so the driver install re-materializes them). The production installer must use a single `nefconc`/INF-created node and never `devgen`. +## P2 — direct frame push (kill DDA): design & decision record + +Status: **in progress.** P1 ships frames the old way (the driver drains its swap-chain and DDA/WGC +re-captures the composited desktop). P2 makes the driver *publish* each swap-chain frame to the host +directly, so we can retire Desktop Duplication and its multi-GPU survival code. Built behind +`PUNKTFUNK_IDD_PUSH`, A/B'd against DDA, and only then made the default. + +### The decisive finding: producer and consumer are both in Session 0 + +The whole transport design hinged on one unknown — same-session or cross-session? **Measured on the +RTX box (2026-06-22):** the pf-vdisplay host process is `WUDFHost.exe` with +`-DeviceGroupId:pfVDisplayGroup`, running in **Session 0**; the punktfunk host service is `LocalSystem`, +also **Session 0**. So the swap-chain processor thread (spawned by our own `thread::spawn` inside the +driver, i.e. in `WUDFHost`) and the encoder live in the **same session**. This is the easy case: + +- A D3D11 **shared keyed-mutex texture** created in the driver can be opened by name in the host with + `ID3D11Device1::OpenSharedResourceByName` — both devices created on the **same render-adapter LUID** + (which the driver already reports out of the `ADD` IOCTL via `OsAdapterLuid`, surfaced as + `WinCaptureTarget::adapter_luid`). 
+- Named kernel objects resolve through Session 0's shared `\BaseNamedObjects`, so **no `Global\` + prefix / `SeCreateGlobalPrivilege` gymnastics** are needed (kept the names unprefixed; documented + that this relies on both processes being Session 0). The Looking-Glass cross-*VM* shared-memory + device is unnecessary — this is cross-*process*, same-session, on one GPU. + +This collapses the "Session-0 cross-process transport is the long pole" risk from the original plan. + +### Transport: a ring of shared keyed-mutex textures + a metadata header + an event + +A single ping-pong keyed mutex would couple the driver's present rate to the host's consume rate — and +**the swap-chain thread must never block** (a stalled `IddCxSwapChainReleaseAndAcquire`/processing loop +freezes DWM compositing system-wide). So, the Looking-Glass shape — multiple frame buffers, newest +wins: + +- **Ring** of `N` (default 3) shared textures, `RESOURCE_MISC_SHARED_NTHANDLE | + SHARED_KEYEDMUTEX`, fixed size for the session. A **generation** counter bumps on a mode change + (resize): the driver tears down + recreates the ring at the new size, the host notices the + generation change and re-opens. +- **Named metadata header** (`CreateFileMapping`): `{magic, version, generation, width, height, + dxgi_format, ring_len, latest}` where `latest` packs `{write_index, monotonic sequence}` published + *after* the copy completes. Plain (unprefixed) names — Session-0 shared namespace. +- **Frame-ready auto-reset event** so the consumer waits instead of spinning. +- **Producer (driver, per acquired frame):** pick `(latest_index + 1) % N`; **try**-acquire that + slot's keyed mutex with a 0 ms timeout (if the host still holds it — rare with 3 slots — reuse the + current slot or skip, **never block**); `CopyResource` the acquired `MetaData.pSurface` into the + slot; release the mutex; publish `{index, ++seq}`; `SetEvent`. Then `FinishedProcessingFrame` as + today. 
+- **Consumer (host `IddPushCapturer`):** `WaitForSingleObject(event, timeout)`; read `latest`; if `seq` + advanced, acquire that slot's mutex, `CopyResource` into an owned NVENC-input texture, release, yield + `FramePayload::D3d11{texture, device}` — straight into the existing zero-copy NVENC path. No DDA, no + CPU readback. + +### What P2 removes vs. keeps + +- **Removes:** `capture/dxgi.rs`'s `DXGI_ERROR_ACCESS_LOST`/`MODE_CHANGE_IN_PROGRESS` re-duplication + churn, the legacy-`DuplicateOutput` fallback, and **`install_gpu_pref_hook()` (the `win32u.dll` + patch)** — by **pinning the render adapter to the encoder GPU** (`IddCxAdapterSetRenderAdapter`, the + existing `SET_RENDER_ADAPTER` IOCTL, driven before `ADD`), so the OS never reparents the output and + the shared texture + NVENC share one device by construction. +- **Keeps:** display **topology** (making the virtual display the composited desktop) and the + **watchdog** (now ours). The **two-process WGC secure-desktop relay** stays until we confirm the IDD + push also delivers the secure (Winlogon) desktop; if it does, that retires too. + +### On-glass attempt 2026-06-22 — code complete, blocked at driver load + +The full transport (driver publisher + host `IddPushCapturer` + render-LUID robustness + in-process +routing) is written and compiles clean. The first on-glass A/B exposed several real things and one +hard blocker: + +- **The service captures in a Session-1 WGC helper, not in-process.** `should_use_helper()` returns + true for a SYSTEM service, so it spawns a user-session helper that does capture **and input + injection**. IDD-push must capture **in-process in Session 0** (where the driver publishes) — wired + via `should_use_helper()` returning false for `PUNKTFUNK_IDD_PUSH`. **Caveat:** `SendInput` from + Session 0 can't reach the user's Session-1 desktop, so in-process IDD-push has **no working input** + yet. 
Production needs either a Session-1 input-only helper, or `Global\`-namespaced shared textures + so a Session-1 helper consumes IDD-push for both video + input. +- **`SET_RENDER_ADAPTER` is ignored by the driver** (the IDD lands on a different adapter than pinned: + observed IDD adapter `0xd60722` vs pinned 4090 `0x15de1`). The render-LUID-in-header path makes the + host bind correctly regardless, but the driver should be made to actually honor the pin (or the host + must copy across adapters) so NVENC gets a 4090 surface. +- **Cursor is included** in the IddCx composited frame (DDA strips it) — so the host-side cursor + compositor (P2.5) is likely unnecessary for this path. +- **`FAILED_POST_START` was a red herring (churn, not the binary).** Comparing the 2157 (works) and + the `frame_transport` DLL import tables: **identical** (same 8 DLLs; the size/hash delta is just the + Authenticode signature). A clean install **+ reboot** (no `restart-device`/`disable-enable`/kill in + between) loads the `frame_transport` driver to **`OK`**. The earlier `FAILED_POST_START` was the + device wedging from the hot-reload churn (the deploy gotchas above). **Lesson: deploy = install + + reboot, full stop.** +- **THE REAL BLOCKER — the driver can't CREATE the shared objects.** With the driver loaded clean and + the monitor active, the host's `IddPushCapturer` still times out: `pfvd-hdr- never appeared`. + The driver's own `OutputDebugString` is invisible (UMDF redirects it to ETW, not DebugView — verified + with a working DBWIN self-test), so a **file-logging** driver build was tried — and it wrote **no + file at all**, even though `init()` runs in `DriverEntry`, the device is `OK`, WUDFHost runs as + `LocalService`, and `C:\Users\Public` is world-writable. **WUDFHost runs with a restricted token: it + can neither write the filesystem nor create named kernel objects** (`CreateFileMappingW`/`CreateEventW`/ + `CreateSharedHandle`), so `FramePublisher::new` fails silently. 
This is exactly why the **gamepad UMDF + drivers invert it**: `inject/dualsense_windows.rs` — *"the host creates the section (privileged → a + permissive SDDL so the WUDFHost can open it); the driver maps it"* — `Global\pfds-shm-` + SDDL + `D:(A;;GA;;;WD)`. **Fix: invert frame-push to match.** The HOST creates the header + event + ring + textures (`Global\` names, `D:(A;;GA;;;WD)` SDDL); the DRIVER only OPENS them, writes its actual + render LUID + a status code back into the host-created header (so we get driver visibility through the + host log), and runs the copy loop. The host creates the textures on the render adapter the driver + reports. +- **Also unresolved: `SET_RENDER_ADAPTER` appears ignored** (the host's pin to the 4090 vs the ADD-reply + adapter differ every time). The inverted header carries the driver's *actual* render LUID so the host + can create textures + run NVENC on the right adapter — but if that's the iGPU, NVENC (NVIDIA) can't + encode it, so the driver must be made to honor the pin (or the host must cross-adapter copy). Needs its + own investigation. + +**Driver deploy gotchas learned (this box):** hot-reloading a UMDF display driver is unreliable — +`pnputil /restart-device` does NOT restart WUDFHost (old image stays mapped), `Disable/Enable-PnpDevice` +errors on the root-enumerated IDD, and **killing WUDFHost invalidates the host's cached `{e5bcc234}` +control handle** (every ADD then fails `0x80070006`, and the device can wedge to `FAILED_POST_START`). +A **reboot** loads a freshly-installed build cleanly. **Recovery** from a broken build is clean and +reboot-free: `pnputil /delete-driver .inf /uninstall` removes the bad package and the device +rebinds the previous (validated) package in the DriverStore — restored 2157 → `OK` immediately. 
+ +### On-glass attempt 2 (2026-06-23) — inversion works; in-process Session-0 path is a dead end + +Implemented the **inversion** (host creates the header + event + ring textures with the +`D:(A;;GA;;;WD)` SDDL, driver only opens them) + a per-attempt **generation** (kills the +`DXGI_ERROR_NAME_ALREADY_EXISTS` retry collisions) + a fixed-name **`Global\pfvd-dbg` debug channel** +(structured counters the driver writes, since UMDF/ETW + the restricted token block its other logs). +Results on the RTX box: + +- ✅ The host **creates the shared ring every time** (`created shared ring … render_luid=…`) — the + privileged-create / restricted-open split is sound. +- ✅ No more name collisions (generation fix). +- ❌ **The driver writes NOTHING** — debug block all zeros, crucially `run_core_entries=0`. The + swap-chain processor **never runs**, i.e. the OS **never assigns a swap-chain** to the virtual + monitor in this path. + +**Root cause: an IddCx monitor only gets a swap-chain when something PRESENTS to it, and the in-process +path has no presenter.** The host + the CCD topology-isolate run in **Session 0, which has no DWM / +compositor**. The WGC path works because its capture helper lives in **Session 1**, where DWM composes +the desktop onto the display (that composition is the swap-chain trigger). So in-process Session-0 +IDD-push gets no frames to push, full stop — a **fundamental** barrier, not a fixable bug. The original +plan's "Session-0 transport is the long pole" was right, but the long pole turned out to be *triggering +presentation*, not the shared-memory mechanics (those work). + +**Consequence:** the only viable IDD-push shape is **option 3 — a Session-1 helper drives presentation + +consumes the `Global\` ring** (the inversion built here is exactly what it needs). But it carries an +unretired risk: it's still unproven whether the swap-chain gets assigned even with a Session-1 consumer +that isn't WGC. 
Until that's answered, **DDA/WGC stays the shipping Windows capture path** — it works. +All the IDD-push code (driver open-side + host create-side + debug channel) is written, compiles, and is +gated behind `PUNKTFUNK_IDD_PUSH` (off), so it's dormant and harmless. + +### CONCLUSION (2026-06-23): IDD-push is not viable for bare-metal capture — the swap-chain is never assigned + +After the inversion + a fixed-name debug channel + a host-created-ring observer + an autonomous +loopback test harness (`punktfunk-probe` → the SYSTEM service, paired via the mgmt API), the question +"does the driver's swap-chain processor ever run?" was answered **definitively: no.** The driver's +`run_core` is **never entered** — `run_core_entries=0` in *every* configuration tested: + +- in-process (Session 0) and WGC-triggered (Session 1 helper) sessions, +- a user-created ring AND a host-created (LocalSystem) ring with a permissive `D:(A;;GA;;;WD)` SDDL, +- with and without a Low-IL (`S:(ML;;NW;;;LW)`) mandatory label, +- with WUDFHost confirmed **not** an AppContainer (`IsAppContainer=0`), + +— even while WGC simultaneously captured the same virtual monitor's composition and streamed multi-MB +of HEVC. The gamepad UMDF drivers prove a UMDF driver *can* open + write a host-created `Global\` +section on this box, so the driver writing nothing is **not** an access problem — `run_core` simply +does not run. + +**Root cause (researched + ecosystem-confirmed):** an IddCx virtual monitor only receives a swap-chain +(`EVT_IDD_CX_MONITOR_ASSIGN_SWAPCHAIN`) when the OS **presents/scans-out** to it, which requires a real +presentation consumer. **WGC/DDA capture of the composed desktop does NOT count** — it reads DWM's +composition, bypassing the driver's swap-chain. With no physical scanout and no consumer that routes +*through the driver*, the path stays inactive (`IDDCX_PATH_FLAGS=0`) and `ASSIGN_SWAPCHAIN` never fires. 
+Confirming evidence: + +- **Every bare-metal virtual-display capture project uses WGC/DDA, not the driver swap-chain:** SudoVDA + (its swap-chain loop acquires-and-discards), Apollo/Sunshine (DDA + WGC backends), virtual-display-rs + (discards), parsec-vdd (no frame path). Only **Looking Glass** consumes the driver swap-chain — and + only because a **VM guest scans out** the display (the consumer). We have no equivalent on bare metal. +- Microsoft's own unanswered Q&A (learn.microsoft.com/answers 4096179) reports the identical symptom for + the IddSampleDriver: virtual display "always inactive," `ASSIGN_SWAPCHAIN` never runs. + +**Verdict:** the "driver consumes its swap-chain and pushes frames" architecture (P2 / Looking-Glass +style) **cannot get frames** for punktfunk's bare-metal, whole-desktop, capture-only use case. The +shared-memory transport machinery (host-creates / driver-opens, the gamepad pattern) is all sound and +proven to *create*, but there is nothing for the driver to publish. **DDA/WGC remains the only viable +Windows capture path**, which is exactly what the entire ecosystem does. The IDD-push code stays +in-tree, compiles, and is gated `off` (`PUNKTFUNK_IDD_PUSH`) — dormant and harmless — documenting the +attempt so it isn't re-tried. "Better performance/lower overhead" must come from optimizing the WGC/DDA +path (e.g. trimming the Session-0↔Session-1 relay, zero-copy encode), not from IDD-push. + +The only unexplored avenue is **driver-side** (a different adapter/monitor/path setup that might make the +OS treat the virtual display as a presentation target) — but it needs a reboot to test, the MS Q&A +suggests it's unsolved, and the unanimous ecosystem choice of WGC/DDA argues it's a dead end. 
+ +**Final exhaustion (2026-06-23, follow-up): both remaining avenues closed.** + +- **Option 3 (present source) — TESTED, failed.** Added a present-trigger to the Session-1 WGC helper: + it successfully created a D3D11 swapchain on the virtual display and presented continuously (WGC even + captured the flashing window). The driver stayed `run_core_entries=0` / `frames_acquired=0`. So an + active *present source* on the display does NOT make the OS assign the driver's swap-chain either — + DWM composes the present onto the display (capturable) without routing it through the driver's + swap-chain. +- **Option 2 (driver flag) — closed by analysis.** The present-trigger succeeding proves the **path is + already active** (a swapchain presents to the display fine); the missing piece is **scanout routed + through the driver**, which the OS does only for a real consumer (physical display / VM guest / RDP). + The one IddCx flag for that — `IDDCX_ADAPTER_FLAGS_REMOTE_SESSION_DRIVER` — requires the **RDP + protocol stack** as the consumer, which bare-metal console capture has no equivalent of. + +**Verdict is final:** IDD-push needs a presentation consumer (scanout / VM guest / RDP) that bare-metal +console desktop-capture fundamentally cannot provide. No host-side capture, no in-process path, no +present source, and no available driver flag overcomes it. WGC (normal desktop) + DDA (secure desktop) +is the only viable Windows capture path — as the entire ecosystem already does. The IDD-push + +present-trigger code stays in-tree, gated off, as the documented record of the attempt. + +### Known gaps the build-out must close (tracked as P2.* tasks) + +- **Cursor.** DDA/WGC composite the HW cursor host-side from frame-info; the IDD path delivers the + cursor separately (`IddCxMonitorSetupHardwareCursor` event → `QueryHardwareCursor`). The prototype + may ship cursor-less; the build-out wires the IDD cursor into the existing `CursorCompositor`. 
+- **HDR.** The default IddCx swap-chain surface is 8-bit `B8G8R8A8`; FP16/HDR needs the **IddCx 1.11 + D3D12 acquire path** (`SetDevice2`/`ReleaseAndAcquireBuffer2` → `ID3D12Resource`). Build against + 1.10, runtime-gate 1.11. SDR-only for the prototype. + ## Why we'd do this The user's goals, mapped to outcomes: diff --git a/packaging/windows/README.md b/packaging/windows/README.md index 6f9e52d..2554699 100644 --- a/packaging/windows/README.md +++ b/packaging/windows/README.md @@ -55,10 +55,11 @@ read it from `%ProgramData%\punktfunk\web-password`. ## Prerequisites on the target box -- An **NVIDIA GPU + driver** — the installer's exe is built `--features nvenc` and load-depends on the - driver's `nvEncodeAPI64.dll`. -- **ViGEmBus** (optional) for virtual gamepads — still a manual prerequisite (not bundled yet): - . +- A **GPU for hardware encode**: an NVIDIA GPU + driver (NVENC), or an AMD/Intel GPU (AMF/QSV) — the + exe is built `--features nvenc,amf-qsv`. Software H.264 is the GPU-less fallback. +- **Virtual gamepads need no prerequisite.** The DualSense / DualShock 4 / Xbox 360 (XUSB) UMDF drivers + are **bundled** in the installer (the *Install the virtual gamepad drivers* task) and + `pnputil`-installed. 
**ViGEmBus is no longer used.** ## Files here diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/Cargo.toml b/packaging/windows/vdisplay-driver/pf-vdisplay/Cargo.toml index ef63918..2fbdcca 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/Cargo.toml +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/Cargo.toml @@ -21,6 +21,7 @@ features = [ "Win32_Security", "Win32_System_SystemServices", "Win32_System_Threading", + "Win32_System_Memory", "Win32_System_Diagnostics_Debug", "Win32_Graphics_Direct3D", "Win32_Graphics_Direct3D11", diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/callbacks.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/callbacks.rs index 2fe1dd4..f48dfaf 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/callbacks.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/callbacks.rs @@ -11,10 +11,17 @@ use wdf_umdf_sys::{ DISPLAYCONFIG_TARGET_MODE, DISPLAYCONFIG_VIDEO_SIGNAL_INFO, IDARG_IN_ADAPTER_INIT_FINISHED, IDARG_IN_COMMITMODES, IDARG_IN_GETDEFAULTDESCRIPTIONMODES, IDARG_IN_PARSEMONITORDESCRIPTION, IDARG_IN_QUERYTARGETMODES, IDARG_IN_SETSWAPCHAIN, IDARG_OUT_GETDEFAULTDESCRIPTIONMODES, - IDARG_OUT_PARSEMONITORDESCRIPTION, IDARG_OUT_QUERYTARGETMODES, IDDCX_ADAPTER__, + IDARG_OUT_PARSEMONITORDESCRIPTION, IDARG_OUT_QUERYTARGETMODES, IDDCX_ADAPTER__, IDDCX_PATH, IDDCX_MONITOR_MODE, IDDCX_MONITOR_MODE_ORIGIN, IDDCX_MONITOR__, IDDCX_TARGET_MODE, NTSTATUS, WDFDEVICE, WDF_POWER_DEVICE_STATE, }; +// IddCx 1.10 *2 DDIs (HDR-capable). For B1 we advertise SDR (8 bpc) so behaviour is unchanged; B2 +// flips the bit depth + adapter flag to enable HDR. 
+use wdf_umdf_sys::{ + IDARG_IN_COMMITMODES2, IDARG_IN_PARSEMONITORDESCRIPTION2, IDARG_IN_QUERYTARGETMODES2, + IDARG_IN_QUERYTARGET_INFO, IDARG_OUT_QUERYTARGET_INFO, IDDCX_BITS_PER_COMPONENT, IDDCX_MONITOR_MODE2, + IDDCX_PATH2, IDDCX_TARGET_CAPS, IDDCX_TARGET_MODE2, IDDCX_WIRE_BITS_PER_COMPONENT, +}; use crate::{ context::{DeviceContext, MonitorContext}, @@ -179,6 +186,7 @@ pub extern "C-unwind" fn monitor_get_default_modes( _p_in_args: *const IDARG_IN_GETDEFAULTDESCRIPTIONMODES, _p_out_args: *mut IDARG_OUT_GETDEFAULTDESCRIPTIONMODES, ) -> NTSTATUS { + info!("GET_DEFAULT_MODES called (we return NOT_IMPLEMENTED — only valid for a monitor with NO EDID)"); NTSTATUS::STATUS_NOT_IMPLEMENTED } @@ -287,9 +295,20 @@ pub extern "C-unwind" fn monitor_query_modes( pub extern "C-unwind" fn adapter_commit_modes( _adapter_object: *mut IDDCX_ADAPTER__, - _p_in_args: *const IDARG_IN_COMMITMODES, + p_in_args: *const IDARG_IN_COMMITMODES, ) -> NTSTATUS { - // The swap-chain is managed by IddCx; there is nothing device-specific to reconfigure on a commit. + // DIAGNOSTIC: does the OS commit an ACTIVE path for our monitor? IDDCX_PATH_FLAGS_ACTIVE = 2. If + // no active path is ever committed, the OS never calls ASSIGN_SWAPCHAIN (the bug we're chasing). + let in_args = unsafe { &*p_in_args }; + info!("COMMIT_MODES: path_count={}", in_args.PathCount); + for i in 0..in_args.PathCount { + let path: &IDDCX_PATH = unsafe { &*in_args.pPaths.add(i as usize) }; + let active = (path.Flags.0 & 2) != 0; + info!( + " path[{i}] monitor={:p} flags=0x{:x} active={active}", + path.MonitorObject, path.Flags.0 + ); + } NTSTATUS::STATUS_SUCCESS } @@ -320,3 +339,194 @@ pub extern "C-unwind" fn unassign_swap_chain(monitor_object: *mut IDDCX_MONITOR_ .into() } } + +// ===== IddCx 1.10 *2 DDIs (HDR-capable path) ============================================ +// These mirror the 1.x callbacks above but advertise per-mode wire bit-depth. 
B1 reports SDR (8 bpc); +// B2 bumps `wire_bits()` to add 10 bpc + sets CAN_PROCESS_FP16 to actually enable HDR. + +/// Wire bit-depth advertised per mode. B2: advertise BOTH 8 and 10 bpc RGB so the OS offers HDR10 +/// modes (the bitfield: 8 = 0x2, 10 = 0x4). +fn wire_bits() -> IDDCX_WIRE_BITS_PER_COMPONENT { + let rgb = IDDCX_BITS_PER_COMPONENT( + IDDCX_BITS_PER_COMPONENT::IDDCX_BITS_PER_COMPONENT_8.0 + | IDDCX_BITS_PER_COMPONENT::IDDCX_BITS_PER_COMPONENT_10.0, + ); + IDDCX_WIRE_BITS_PER_COMPONENT { + Rgb: rgb, + YCbCr444: IDDCX_BITS_PER_COMPONENT::IDDCX_BITS_PER_COMPONENT_NONE, + YCbCr422: IDDCX_BITS_PER_COMPONENT::IDDCX_BITS_PER_COMPONENT_NONE, + YCbCr420: IDDCX_BITS_PER_COMPONENT::IDDCX_BITS_PER_COMPONENT_NONE, + } +} + +/// 1.10 variant of [`parse_monitor_description`] — writes `IDDCX_MONITOR_MODE2` (adds bit-depth). +pub extern "C-unwind" fn parse_monitor_description2( + p_in_args: *const IDARG_IN_PARSEMONITORDESCRIPTION2, + p_out_args: *mut IDARG_OUT_PARSEMONITORDESCRIPTION, +) -> NTSTATUS { + let in_args = unsafe { &*p_in_args }; + let out_args = unsafe { &mut *p_out_args }; + + let Ok(monitors) = MONITOR_MODES.lock() else { + error!("MONITOR_MODES mutex poisoned"); + return NTSTATUS::STATUS_DRIVER_INTERNAL_ERROR; + }; + + let edid = unsafe { + std::slice::from_raw_parts( + in_args.MonitorDescription.pData as *const u8, + in_args.MonitorDescription.DataSize as usize, + ) + }; + let Ok(monitor_index) = Edid::get_serial(edid) else { + error!("bad edid ({} bytes)", edid.len()); + return NTSTATUS::STATUS_INVALID_VIEW_SIZE; + }; + let Some(monitor) = monitors.iter().find(|&m| m.data.id == monitor_index) else { + error!("Failed to find monitor id {monitor_index}"); + return NTSTATUS::STATUS_DRIVER_INTERNAL_ERROR; + }; + + let number_of_modes: u32 = monitor + .data + .modes + .iter() + .map(|m| u32::try_from(m.refresh_rates.len()).expect("Cannot use > u32::MAX refresh rates")) + .sum(); + + out_args.MonitorModeBufferOutputCount = number_of_modes; + if 
in_args.MonitorModeBufferInputCount < number_of_modes { + return if in_args.MonitorModeBufferInputCount > 0 { + NTSTATUS::STATUS_BUFFER_TOO_SMALL + } else { + NTSTATUS::STATUS_SUCCESS + }; + } + + let monitor_modes = unsafe { + std::slice::from_raw_parts_mut( + in_args.pMonitorModes.cast::>(), + number_of_modes as usize, + ) + }; + for (mode, out_mode) in monitor.data.modes.flatten().zip(monitor_modes.iter_mut()) { + out_mode.write(IDDCX_MONITOR_MODE2 { + #[allow(clippy::cast_possible_truncation)] + Size: mem::size_of::() as u32, + Origin: IDDCX_MONITOR_MODE_ORIGIN::IDDCX_MONITOR_MODE_ORIGIN_MONITORDESCRIPTOR, + MonitorVideoSignalInfo: display_info(mode.width, mode.height, mode.refresh_rate), + BitsPerComponent: wire_bits(), + }); + } + out_args.PreferredMonitorModeIdx = 0; + NTSTATUS::STATUS_SUCCESS +} + +fn target_mode2(width: u32, height: u32, refresh_rate: u32) -> IDDCX_TARGET_MODE2 { + let m1 = target_mode(width, height, refresh_rate); + IDDCX_TARGET_MODE2 { + #[allow(clippy::cast_possible_truncation)] + Size: mem::size_of::() as u32, + TargetVideoSignalInfo: m1.TargetVideoSignalInfo, + BitsPerComponent: wire_bits(), + ..Default::default() + } +} + +/// 1.10 variant of [`monitor_query_modes`] — writes `IDDCX_TARGET_MODE2`. 
+pub extern "C-unwind" fn monitor_query_modes2( + monitor_object: *mut IDDCX_MONITOR__, + p_in_args: *const IDARG_IN_QUERYTARGETMODES2, + p_out_args: *mut IDARG_OUT_QUERYTARGETMODES, +) -> NTSTATUS { + let Ok(monitors) = MONITOR_MODES.lock() else { + error!("MONITOR_MODES mutex poisoned"); + return NTSTATUS::STATUS_DRIVER_INTERNAL_ERROR; + }; + let Some(monitor) = monitors + .iter() + .find(|&m| m.object.is_some_and(|p| p.as_ptr() == monitor_object)) + else { + error!("Failed to find monitor object in cache for {monitor_object:?}"); + return NTSTATUS::STATUS_DRIVER_INTERNAL_ERROR; + }; + + let number_of_modes = monitor + .data + .modes + .iter() + .map(|m| u32::try_from(m.refresh_rates.len()).expect("Cannot use > u32::MAX modes")) + .sum(); + + let out_args = unsafe { &mut *p_out_args }; + out_args.TargetModeBufferOutputCount = number_of_modes; + + let in_args = unsafe { &*p_in_args }; + if in_args.TargetModeBufferInputCount >= number_of_modes { + let out_target_modes = unsafe { + std::slice::from_raw_parts_mut( + in_args.pTargetModes.cast::>(), + number_of_modes as usize, + ) + }; + for (mode, out_target) in monitor.data.modes.flatten().zip(out_target_modes.iter_mut()) { + out_target.write(target_mode2(mode.width, mode.height, mode.refresh_rate)); + } + } + NTSTATUS::STATUS_SUCCESS +} + +/// 1.10 variant of [`adapter_commit_modes`] — `IDDCX_PATH2` carries the committed wire format. 
+pub extern "C-unwind" fn adapter_commit_modes2( + _adapter_object: *mut IDDCX_ADAPTER__, + p_in_args: *const IDARG_IN_COMMITMODES2, +) -> NTSTATUS { + let in_args = unsafe { &*p_in_args }; + info!("COMMIT_MODES2: path_count={}", in_args.PathCount); + for i in 0..in_args.PathCount { + let path: &IDDCX_PATH2 = unsafe { &*in_args.pPaths.add(i as usize) }; + let active = (path.Flags.0 & 2) != 0; + info!( + " path2[{i}] monitor={:p} flags=0x{:x} active={active} colorspace={} rgb_bpc=0x{:x}", + path.MonitorObject, + path.Flags.0, + path.WireFormatInfo.ColorSpace.0, + path.WireFormatInfo.BitsPerComponent.Rgb.0 + ); + } + NTSTATUS::STATUS_SUCCESS +} + +/// 1.10 NEW: per-target capabilities. B2 reports `HIGH_COLOR_SPACE` so the OS enables HDR10 (transfer +/// curve + wide gamut) on this target. +pub extern "C-unwind" fn query_target_info( + _adapter_object: *mut IDDCX_ADAPTER__, + _p_in_args: *mut IDARG_IN_QUERYTARGET_INFO, + p_out_args: *mut IDARG_OUT_QUERYTARGET_INFO, +) -> NTSTATUS { + let out_args = unsafe { &mut *p_out_args }; + out_args.TargetCaps = IDDCX_TARGET_CAPS::IDDCX_TARGET_CAPS_HIGH_COLOR_SPACE; + out_args.DitheringSupport = IDDCX_WIRE_BITS_PER_COMPONENT::default(); + NTSTATUS::STATUS_SUCCESS +} + +/// 1.10 NEW (HDR): the OS hands us the default HDR10 static metadata for the monitor. B2 accepts it +/// (the host/client own the final HDR metadata for the stream); B3 will forward it to the host for the +/// HEVC mastering-display SEI. Stub keeps the OS's HDR setup happy. +pub extern "C-unwind" fn set_default_hdr_metadata( + _monitor_object: *mut IDDCX_MONITOR__, + _p_in_args: *const wdf_umdf_sys::IDARG_IN_MONITOR_SET_DEFAULT_HDR_METADATA, +) -> NTSTATUS { + NTSTATUS::STATUS_SUCCESS +} + +/// 1.10 HDR: the OS hands us the gamma ramp (a 3x4 colour-space matrix in HDR mode). We do NOT apply it +/// server-side — the host streams the scRGB FP16 and the CLIENT's display applies its own transform — +/// so we accept it. 
Wiring this is OBLIGATED once CAN_PROCESS_FP16 is set; without it the OS rejects +/// the adapter at init (`IddCxAdapterInitAsync` → "Failed to get adapter"). +pub extern "C-unwind" fn set_gamma_ramp( + _monitor_object: *mut IDDCX_MONITOR__, + _p_in_args: *const wdf_umdf_sys::IDARG_IN_SET_GAMMARAMP, +) -> NTSTATUS { + NTSTATUS::STATUS_SUCCESS +} diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/context.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/context.rs index 203c7ef..686edd2 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/context.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/context.rs @@ -2,6 +2,7 @@ use std::{ mem::{self, size_of}, num::{ParseIntError, TryFromIntError}, ptr::{addr_of_mut, NonNull}, + sync::{Arc, Mutex}, }; use anyhow::anyhow; @@ -13,7 +14,7 @@ use wdf_umdf::{ use wdf_umdf_sys::{ DISPLAYCONFIG_VIDEO_OUTPUT_TECHNOLOGY, HANDLE, IDARG_IN_ADAPTER_INIT, IDARG_IN_MONITORCREATE, IDARG_IN_SETUP_HWCURSOR, IDARG_OUT_ADAPTER_INIT, IDARG_OUT_MONITORARRIVAL, - IDARG_OUT_MONITORCREATE, IDDCX_ADAPTER, IDDCX_ADAPTER_CAPS, IDDCX_CURSOR_CAPS, + IDARG_OUT_MONITORCREATE, IDDCX_ADAPTER, IDDCX_ADAPTER_CAPS, IDDCX_ADAPTER_FLAGS, IDDCX_CURSOR_CAPS, IDDCX_ENDPOINT_DIAGNOSTIC_INFO, IDDCX_ENDPOINT_VERSION, IDDCX_FEATURE_IMPLEMENTATION, IDDCX_MONITOR, IDDCX_MONITOR_DESCRIPTION, IDDCX_MONITOR_DESCRIPTION_TYPE, IDDCX_MONITOR_INFO, IDDCX_SWAPCHAIN, IDDCX_TRANSMISSION_TYPE, IDDCX_XOR_CURSOR_SUPPORT, LUID, NTSTATUS, WDFDEVICE, @@ -34,6 +35,37 @@ use crate::{ // Maximum amount of monitors that can be connected pub const MAX_MONITORS: u8 = 16; +/// ONE shared D3D render device, reused across every swap-chain assignment (keyed by render LUID). 
+/// Creating a fresh `Direct3DDevice` per assign — and the swap-chain flap fires several assigns per +/// session — spawned a new NVIDIA UMD worker-thread set each time that was NEVER reclaimed on release +/// (proven on the RTX box: ~70 `nvwgf2umx` threads + ~50 MB VRAM leaked per reconnect, permanently, +/// even though our `Direct3DDevice` refcount dropped to 0). Pooling one device keeps a single, stable +/// thread set: the processors borrow an `Arc`, so the device outlives them and is never re-created. +static DEVICE_POOL: Mutex<Option<(i64, Arc<Direct3DDevice>)>> = Mutex::new(None); + +/// Get-or-create the pooled D3D device for `luid`. Re-creates only if the render adapter changes +/// (e.g. a GPU hot-swap), which drops the old `Arc` once its last processor releases it. +fn pooled_device(luid: windows::Win32::Foundation::LUID) -> Option<Arc<Direct3DDevice>> { + let key = (i64::from(luid.HighPart) << 32) | i64::from(luid.LowPart as u32); + let mut pool = DEVICE_POOL.lock().ok()?; + if let Some((k, dev)) = pool.as_ref() { + if *k == key { + return Some(dev.clone()); + } + } + match Direct3DDevice::init(luid) { + Ok(d) => { + let a = Arc::new(d); + *pool = Some((key, a.clone())); + Some(a) + } + Err(e) => { + error!("pooled Direct3DDevice::init failed: {e:?}"); + None + } + } +} + pub struct DeviceContext { device: WDFDEVICE, adapter: Option, @@ -48,6 +80,11 @@ unsafe impl Sync for DeviceContext {} pub struct MonitorContext { device: IDDCX_MONITOR, swap_chain_processor: Option<SwapChainProcessor>, + /// OS target id (from IddCxMonitorArrival), stamped on this context at creation. assign_swap_chain + /// uses THIS instead of a MONITOR_MODES pointer lookup — the lookup returns 0 for a recreated + /// (session-2+) monitor, which broke the shared-ring naming and cascaded into SetDevice + /// E_INVALIDARG + an access violation (the fix-teardown crash).
+ target_id: u32, } // SAFETY: Raw ptr is managed by external library @@ -98,6 +135,10 @@ impl DeviceContext { #[allow(clippy::cast_possible_truncation)] Size: size_of::() as u32, + // B2 HDR: declare we can process FP16 (scRGB) desktop surfaces — enables HDR10 / SDR WCG. + // This OBLIGATES the *2 mode DDIs (done) + ReleaseAndAcquireBuffer2 (done in run_core). + Flags: IDDCX_ADAPTER_FLAGS::IDDCX_ADAPTER_FLAGS_CAN_PROCESS_FP16, + MaxMonitorsSupported: u32::from(MAX_MONITORS), EndPointDiagnostics: IDDCX_ENDPOINT_DIAGNOSTIC_INFO { @@ -231,6 +272,14 @@ impl DeviceContext { } } + // Stamp the OS target id onto the monitor's CONTEXT so assign_swap_chain reads it directly + // (no MONITOR_MODES pointer lookup, which returns 0 for a recreated monitor). + unsafe { + let _ = MonitorContext::get_mut(monitor_create_out.MonitorObject.cast(), |ctx| { + ctx.target_id = arrival_out.OsTargetId; + }); + } + Ok(()) } } @@ -240,6 +289,7 @@ impl MonitorContext { Self { device, swap_chain_processor: None, + target_id: 0, } } @@ -265,20 +315,37 @@ impl MonitorContext { render_adapter.HighPart, render_adapter.LowPart ); - let device = Direct3DDevice::init(luid); + // The OS target id keys the per-monitor shared frame-push objects (header/event/textures) the + // host opens. Read it from THIS context (stamped at creation after IddCxMonitorArrival) — the + // old MONITOR_MODES pointer lookup returned 0 for a recreated (session-2+) monitor, which broke + // the ring naming and cascaded into SetDevice E_INVALIDARG + an access violation. 
+ let target_id = self.target_id; - if let Ok(device) = device { + let device = pooled_device(luid); + + if let Some(device) = device { let mut processor = SwapChainProcessor::new(); - processor.run(swap_chain, device, new_frame_event); + processor.run( + swap_chain, + device, + new_frame_event, + target_id, + render_adapter.LowPart, + render_adapter.HighPart, + ); self.swap_chain_processor = Some(processor); - self.setup_hw_cursor(); + // Cursor is BAKED into the captured video: for IDD-push we deliberately do NOT advertise a + // hardware cursor, so DWM software-composites the mouse cursor into the swapchain surface we + // capture — the client then sees the cursor in the stream. (A future separate-plane cursor + // would re-enable setup_hw_cursor + IddCxMonitorQueryHardwareCursor.) Not advertising one + // also stops leaking a CreateEventA handle per assign. } else { - // It's important to delete the swap-chain if D3D initialization fails, so that the OS knows to generate a new - // swap-chain and try again. - error!("Direct3DDevice::init FAILED on render LUID: {device:?} — deleting swap chain for OS retry"); + // It's important to delete the swap-chain if D3D init fails, so the OS generates a fresh + // swap-chain and retries. + error!("pooled Direct3DDevice unavailable for render LUID — deleting swap chain for OS retry"); unsafe { let _ = WdfObjectDelete(swap_chain.cast()); @@ -287,9 +354,15 @@ impl MonitorContext { } pub fn unassign_swap_chain(&mut self) { - self.swap_chain_processor.take(); + let had = self.swap_chain_processor.take().is_some(); + error!("unassign_swap_chain (target={}) — dropped live processor: {had}", self.target_id); } + /// Advertise a HARDWARE cursor. NOT called for IDD-push — we bake the cursor into the video + /// instead (see `assign_swap_chain`). Kept for a future separate-plane cursor (which would pair it + /// with `IddCxMonitorQueryHardwareCursor`). 
Leaks a `CreateEventA` handle per call, so only wire it + /// back up alongside a real cursor-plane consumer. + #[allow(dead_code)] pub fn setup_hw_cursor(&mut self) { let mouse_event = unsafe { CreateEventA(None, false, false, s!("vdd_mouse_event")) }; let Ok(mouse_event) = mouse_event else { diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/control.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/control.rs index 91e38da..cccdd1d 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/control.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/control.rs @@ -6,8 +6,9 @@ use std::ffi::c_void; use std::mem::size_of; use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Mutex; use std::thread; -use std::time::Duration; +use std::time::{Duration, Instant}; use log::{error, info}; use wdf_umdf::{ @@ -16,7 +17,7 @@ use wdf_umdf::{ }; use wdf_umdf_sys::{IDARG_IN_ADAPTERSETRENDERADAPTER, LUID, NTSTATUS, WDFDEVICE, WDFREQUEST}; -use crate::context::DeviceContext; +use crate::context::{DeviceContext, MonitorContext}; use crate::monitor::{ default_modes, Mode, MonitorData, MonitorObject, ADAPTER, MONITOR_MODES, NEXT_ID, PREFERRED_RENDER_ADAPTER, PROTOCOL_VERSION, WATCHDOG_COUNTDOWN, WATCHDOG_TIMEOUT, @@ -37,6 +38,16 @@ const IOCTL_CLEAR_ALL: u32 = ctl(0x804); const IOCTL_PING: u32 = ctl(0x888); const IOCTL_GET_VERSION: u32 = ctl(0x8FF); +/// Serializes monitor lifecycle ops — ADD / REMOVE / watchdog-teardown — against each other. Without +/// it, a watchdog expiry can drain an entry out from under an in-flight `do_add` (which releases the +/// `MONITOR_MODES` lock before the slow `create_monitor`), leaving `do_add` to return +/// `STATUS_UNSUCCESSFUL` → the host sees `ERROR_GEN_FAILURE`. This was the reconnect-churn fault. 
+static MONITOR_OP_LOCK: Mutex<()> = Mutex::new(()); +/// A monitor created less than this ago is still in its host-side setup window (CCD commit + GDI-name +/// resolve + topology settle, ~5 s) and is never reaped by the watchdog — only by an explicit +/// CLEAR_ALL. Protects a freshly-born monitor from a transient PING gap during reconnect churn. +const MONITOR_GRACE: Duration = Duration::from_secs(6); + #[repr(C)] struct AddParams { width: u32, @@ -117,7 +128,7 @@ pub extern "C-unwind" fn device_io_control( IOCTL_GET_WATCHDOG => do_get_watchdog(request, output_len, &mut bytes), IOCTL_PING => NTSTATUS::STATUS_SUCCESS, IOCTL_CLEAR_ALL => { - disconnect_all_monitors(); + disconnect_all_monitors(true); NTSTATUS::STATUS_SUCCESS } IOCTL_GET_VERSION => do_get_version(request, output_len, &mut bytes), @@ -136,6 +147,11 @@ unsafe fn do_add( output_len: usize, bytes: &mut usize, ) -> NTSTATUS { + // Serialize the whole ADD (push entry → create_monitor → verify) against the watchdog teardown + + // REMOVE, so an expiry can never drain this entry mid-flight. `create_monitor` is fast (the slow + // CCD/GDI work is host-side, after this returns), and PING/GET_WATCHDOG don't take this lock, so + // the host keeps the watchdog reset while we hold it. + let _op = MONITOR_OP_LOCK.lock().unwrap(); if input_len < size_of::() || output_len < size_of::() { return NTSTATUS::STATUS_BUFFER_TOO_SMALL; } @@ -182,6 +198,7 @@ unsafe fn do_add( target_id: 0, adapter_luid_low: 0, adapter_luid_high: 0, + created_at: Instant::now(), }); // Create the IddCx monitor via the device context (captures target id + LUID into the entry). 
@@ -226,18 +243,37 @@ unsafe fn do_remove(request: WDFREQUEST, input_len: usize) -> NTSTATUS { let params = unsafe { &*pin.cast::() }; let guid = guid_key(&params.guid); - let mut lock = MONITOR_MODES.lock().unwrap(); - if let Some(pos) = lock.iter().position(|m| m.guid == guid) { - let mon = lock.remove(pos); - if let Some(obj) = mon.object { - if let Err(e) = unsafe { IddCxMonitorDeparture(obj.as_ptr()) } { - error!("REMOVE: departure failed: {e:?}"); - } + // Serialize against ADD + watchdog teardown (lock order: OP_LOCK → MONITOR_MODES). + let _op = MONITOR_OP_LOCK.lock().unwrap(); + let mon = { + let mut lock = MONITOR_MODES.lock().unwrap(); + match lock.iter().position(|m| m.guid == guid) { + Some(pos) => lock.remove(pos), + None => return NTSTATUS::STATUS_NOT_FOUND, } - info!("REMOVE target_id={}", mon.target_id); - NTSTATUS::STATUS_SUCCESS - } else { - NTSTATUS::STATUS_NOT_FOUND + // MONITOR_MODES released here — the processor-join + departure below must not hold it. + }; + if let Some(obj) = mon.object { + free_swap_chain_processor(obj.as_ptr()); + if let Err(e) = unsafe { IddCxMonitorDeparture(obj.as_ptr()) } { + error!("REMOVE: departure failed: {e:?}"); + } + } + info!("REMOVE target_id={}", mon.target_id); + NTSTATUS::STATUS_SUCCESS +} + +/// Drop a monitor's live swap-chain processor BEFORE departure. The WDF context is an +/// `Arc<RwLock<MonitorContext>>` that WDF frees WITHOUT running Rust `Drop` (no `EvtCleanupCallback` +/// is wired), and the OS does not reliably call UNASSIGN on a host-initiated departure — so the +/// streaming `Direct3DDevice` (its ~dozens of D3D worker threads + tens of MB of VRAM) was orphaned +/// once per session, the dominant reconnect-churn leak. `get_mut` takes the context `RwLock`, so this +/// is safe against a concurrent OS unassign callback (whichever runs second sees `None`). +fn free_swap_chain_processor(monitor: *mut wdf_umdf_sys::IDDCX_MONITOR__) { + // SAFETY: `monitor` is a live IddCx monitor object whose context was init'd at creation.
+ let r = unsafe { MonitorContext::get_mut(monitor.cast(), |ctx| ctx.unassign_swap_chain()) }; + if let Err(e) = r { + error!("free_swap_chain_processor: get_mut FAILED: {e:?}"); } } @@ -295,22 +331,46 @@ unsafe fn do_get_version(request: WDFREQUEST, output_len: usize, bytes: &mut usi NTSTATUS::STATUS_SUCCESS } -/// Tear down every monitor (watchdog expiry — the host is gone). Mirrors SudoVDA's DisconnectAllMonitors. -fn disconnect_all_monitors() { - let mut lock = MONITOR_MODES.lock().unwrap(); - if lock.is_empty() { - return; - } - for mon in lock.drain(..) { +/// Tear down monitors. `force` (CLEAR_ALL) reaps EVERYTHING — orphans from a crashed previous host; +/// the watchdog passes `false`, which spares any monitor still inside its creation grace +/// (`MONITOR_GRACE`) so a freshly-born monitor is never reaped mid-setup. Caller MUST hold +/// `MONITOR_OP_LOCK` (lock order: OP_LOCK → MONITOR_MODES). Mirrors SudoVDA's DisconnectAllMonitors. +fn disconnect_all_monitors_locked(force: bool) { + // Drain under the lock (fast); free processors + depart OUTSIDE it (the processor-join blocks). + let to_depart: Vec = { + let mut lock = MONITOR_MODES.lock().unwrap(); + if lock.is_empty() { + return; + } + let mut keep: Vec = Vec::new(); + let mut depart: Vec = Vec::new(); + for mon in lock.drain(..) { + if !force && mon.created_at.elapsed() < MONITOR_GRACE { + keep.push(mon); // still in its host-side setup window — leave it alone + } else { + depart.push(mon); + } + } + *lock = keep; + depart + }; + for mon in to_depart { if let Some(obj) = mon.object { + free_swap_chain_processor(obj.as_ptr()); // SAFETY: `obj` is a live IddCx monitor object. if let Err(e) = unsafe { IddCxMonitorDeparture(obj.as_ptr()) } { - error!("watchdog: monitor departure failed: {e:?}"); + error!("teardown: monitor departure failed: {e:?}"); } } } } +/// Public entry: takes `MONITOR_OP_LOCK`, then tears down. Used by CLEAR_ALL (`force = true`). 
+fn disconnect_all_monitors(force: bool) { + let _op = MONITOR_OP_LOCK.lock().unwrap(); + disconnect_all_monitors_locked(force); +} + /// Start the watchdog thread (once). The host reads the timeout via GET_WATCHDOG and PINGs every /// timeout/3; if it stops, the countdown reaches 0 and every monitor is torn down — so a crashed/gone /// host never leaves a phantom display. Mirrors SudoVDA's RunWatchdog. @@ -340,8 +400,14 @@ pub fn start_watchdog() { .is_ok() && prev - 1 == 0 { - error!("watchdog expired (host stopped pinging) — tearing down all monitors"); - disconnect_all_monitors(); + // About to fire. Serialize against do_add/do_remove (so we never tear an entry out from + // under an in-flight ADD), then RE-CHECK the countdown under the lock: if a concurrent + // IOCTL (PING/ADD) reset it while we were acquiring the lock, the host is alive — abort. + let _op = MONITOR_OP_LOCK.lock().unwrap(); + if WATCHDOG_COUNTDOWN.load(Ordering::Relaxed) == 0 { + error!("watchdog expired (host stopped pinging) — tearing down stale monitors"); + disconnect_all_monitors_locked(false); + } } }); } diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/direct_3d_device.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/direct_3d_device.rs index 80280aa..75c2708 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/direct_3d_device.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/direct_3d_device.rs @@ -1,3 +1,5 @@ +use std::sync::atomic::{AtomicI32, Ordering}; + use windows::{ core::Error, Win32::{ @@ -29,13 +31,19 @@ impl From<&'static str> for Direct3DError { } } +/// DIAGNOSTIC: live `Direct3DDevice` count. Each one holds an `ID3D11Device` whose NVIDIA UMD spawns +/// ~dozens of worker threads; if this climbs without bound across reconnects, devices are leaking. 
+pub static LIVE_DEVICES: AtomicI32 = AtomicI32::new(0); + #[derive(Debug)] pub struct Direct3DDevice { // The following are already refcounted, so they're safe to use directly without additional drop impls _dxgi_factory: IDXGIFactory5, _adapter: IDXGIAdapter1, pub device: ID3D11Device, - _device_context: ID3D11DeviceContext, + /// The single (SINGLETHREADED) immediate context — used by the frame-push publisher's + /// `CopyResource` on the swap-chain processor thread (the one thread this device is touched from). + pub device_context: ID3D11DeviceContext, } impl Direct3DDevice { @@ -67,11 +75,21 @@ impl Direct3DDevice { let device = device.ok_or("ID3D11Device not found")?; let device_context = device_context.ok_or("ID3D11DeviceContext not found")?; + let live = LIVE_DEVICES.fetch_add(1, Ordering::Relaxed) + 1; + log::error!("Direct3DDevice::init OK — live D3D devices = {live}"); + Ok(Self { _dxgi_factory: dxgi_factory, _adapter: adapter, device, - _device_context: device_context, + device_context, }) } } + +impl Drop for Direct3DDevice { + fn drop(&mut self) { + let live = LIVE_DEVICES.fetch_sub(1, Ordering::Relaxed) - 1; + log::error!("Direct3DDevice::drop — live D3D devices = {live}"); + } +} diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/edid.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/edid.rs index 8387cb4..9ee80e4 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/edid.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/edid.rs @@ -1,114 +1,118 @@ -use std::{array::TryFromSliceError, ops::Deref}; +//! The 256-byte EDID the pf-vdisplay driver hands IddCx for each virtual monitor: a 128-byte EDID 1.4 +//! base block + a **CTA-861.3 extension** that advertises HDR — a BT.2020 Colorimetry Data Block and an +//! HDR Static Metadata Data Block declaring the SMPTE ST 2084 (PQ) EOTF. Windows reads a display's HDR +//! capability from this CTA HDR block; without it the monitor is treated as SDR-only regardless of the +//! 
IddCx adapter's `CAN_PROCESS_FP16` / `HIGH_COLOR_SPACE` / 10-bit mode caps (the missing piece that +//! made "Use HDR" never appear for the virtual display). The base block declares EDID 1.4 + 10-bit +//! digital so the panel's bit depth is unambiguous. +//! +//! Identity: manufacturer "PNK" (bytes 8-9), product name "punktfunk" (the 0xFC display descriptor). The +//! serial-number field (base offset 0x0C, little-endian) encodes the per-monitor index so +//! `parse_monitor_description` can map an EDID the OS hands back to its monitor; [`Edid::generate_with`] +//! patches that serial and recomputes BOTH block checksums (base byte 127 + extension byte 255). The +//! detailed-timing / range-limit descriptors are placeholders — the modes we actually advertise come +//! from the monitor's stored mode list (`monitor.rs` / `callbacks.rs`), not from parsing this EDID. -use bytemuck::{Pod, Zeroable}; +use std::array::TryFromSliceError; -// A clean, self-contained 128-byte EDID carrying punktfunk's own identity — manufacturer ID "PNK" -// (bytes 8-9) and product name "punktfunk" (the 0xFC display-descriptor). Derived from the -// virtual-display-rs base block (a standard, widely-deployed virtual EDID); it deliberately carries NO -// other driver's bytes or branding. The serial-number field (offset 0x0C) encodes the per-monitor -// index, so `parse_monitor_description` can map an EDID the OS hands back to its monitor; -// `generate_with` patches that serial and `gen_checksum` recomputes byte 127 before the EDID reaches -// IddCx. The detailed-timing / range-limit descriptors are placeholders: the modes we actually -// advertise come from the monitor's stored mode list (`monitor.rs` / `callbacks.rs`), not from parsing -// this EDID. 
-const _EDID: [u8; 128] = [ - 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x41, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xFF, 0x21, 0x01, 0x03, 0x80, 0x32, 0x1F, 0x78, 0x07, 0xEE, 0x95, 0xA3, 0x54, 0x4C, 0x99, 0x26, - 0x0F, 0x50, 0x54, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x3A, 0x80, 0x18, 0x71, 0x38, 0x2D, 0x40, 0x58, 0x2C, - 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x17, 0xF0, 0x0F, - 0xFF, 0x0F, 0x00, 0x0A, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0xFC, 0x00, 0x70, - 0x75, 0x6E, 0x6B, 0x74, 0x66, 0x75, 0x6E, 0x6B, 0x0A, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +/// Per-monitor serial number, base-block offset 0x0C, little-endian u32. +const SERIAL_OFFSET: usize = 0x0C; + +/// EDID 1.4 base block (128 bytes). Differs from a plain SDR virtual EDID only by: revision 1.4 (byte +/// 19 = 0x04), 10-bit digital video input (byte 20 = 0xB0), and one extension present (byte 126 = 0x01). +/// Byte 127 (checksum) and the serial (0x0C) are filled/patched in [`Edid::generate_with`]. +#[rustfmt::skip] +const BASE: [u8; 128] = [ + 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, // fixed header + 0x41, 0xCB, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // mfr "PNK", product, serial (patched) + 0xFF, 0x21, 0x01, 0x04, 0xB0, 0x32, 0x1F, 0x78, // week/year, EDID 1.4, 10-bit digital, size, gamma + 0x03, 0x78, 0xB1, 0xB5, 0x4A, 0x2B, 0xCC, 0x21, // feature (sRGB-default CLEARED), BT.2020 primaries... 
+ 0x0B, 0x50, 0x54, 0x00, 0x00, 0x00, 0x01, 0x01, // ...BT.2020 primaries, established timings, std timings + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x3A, // std timings, DTD 1 (placeholder preferred timing) + 0x80, 0x18, 0x71, 0x38, 0x2D, 0x40, 0x58, 0x2C, + 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, + 0x00, 0x00, 0x00, 0xFD, 0x00, 0x17, 0xF0, 0x0F, // display range-limits descriptor + 0xFF, 0x0F, 0x00, 0x0A, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x00, 0x00, 0x00, 0xFC, 0x00, 0x70, // name descriptor "punktfunk" + 0x75, 0x6E, 0x6B, 0x74, 0x66, 0x75, 0x6E, 0x6B, + 0x0A, 0x20, 0x20, 0x20, 0x00, 0x00, 0x00, 0x00, // empty 4th descriptor... + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, // ...byte 126 = 1 extension, byte 127 = checksum ]; -const EDID_LEN: usize = _EDID.len(); +/// CTA-861.3 extension block (128 bytes), block 1. Header + a Data Block Collection holding the +/// Colorimetry and HDR Static Metadata data blocks; the rest is padding up to the checksum (byte 255). +/// `D` (byte 130) marks where DTDs would start (= end of the data blocks); we carry none. +#[rustfmt::skip] +const CTA_HEADER: [u8; 4] = [ + 0x02, // CTA Extension tag + 0x03, // revision 3 (CTA-861.3 — required for the extended-tag data blocks below) + 0x0F, // D = 15: the (empty) DTD region starts at block byte 15, i.e. data blocks occupy bytes 4..15 + 0x00, // 0 native DTDs; no basic audio; no YCbCr 4:4:4/4:2:2 (RGB-only, matching the wire format) +]; -static EDID: AlignedEdid = AlignedEdid { - data: _EDID, - _align: [], -}; +/// Colorimetry Data Block (CTA extended tag 0x05): declare BT.2020 RGB (bit 7). YCbCr variants are left +/// clear — the IddCx wire format is RGB-only — and the gamut-metadata flags are 0. 
+#[rustfmt::skip] +const COLORIMETRY_DB: [u8; 4] = [ + 0xE3, // tag 0b111 (use-extended-tag) | length 3 + 0x05, // extended tag: Colorimetry + 0x80, // BT2020RGB (bit 7); xvYCC/sYCC/opRGB/BT2020 YCC/cYCC all clear + 0x00, // gamut metadata profiles MD0..MD3: none +]; -#[repr(C)] -struct AlignedEdid { - data: [u8; N], - // required to make this type aligned to Edid - _align: [Edid; 0], -} +/// HDR Static Metadata Data Block (CTA extended tag 0x06): EOTFs = Traditional SDR (ET_0) + SMPTE ST +/// 2084 / PQ (ET_2); Static Metadata Type 1 (SM_0). Plus the optional desired-content luminance hints +/// (~993 nit max, ~400 nit max-frame-average, ~0.05 nit min) so the block is complete. +#[rustfmt::skip] +const HDR_STATIC_METADATA_DB: [u8; 7] = [ + 0xE6, // tag 0b111 (use-extended-tag) | length 6 + 0x06, // extended tag: HDR Static Metadata + 0x05, // Supported EOTFs: ET_0 (traditional SDR) | ET_2 (SMPTE ST 2084 / PQ) + 0x01, // Supported Static Metadata Descriptors: SM_0 (Static Metadata Type 1) + 0x8A, // Desired Content Max Luminance (code 138 ≈ 993 nits) + 0x60, // Desired Content Max Frame-avg Lum. (code 96 = 400 nits) + 0x12, // Desired Content Min Luminance (code 18 ≈ 0.05 nits) +]; -impl AlignedEdid { - fn new(data: &[u8]) -> Result { - let data: [u8; N] = data.try_into()?; - Ok(Self { data, _align: [] }) - } -} - -impl Deref for AlignedEdid { - type Target = Edid; - - fn deref(&self) -> &Self::Target { - let header = &self.data[..EDID_SIZE]; - bytemuck::from_bytes(header) - } -} - -const EDID_SIZE: usize = std::mem::size_of::(); - -#[repr(C)] -#[derive(Debug, Copy, Clone, Pod, Zeroable)] -pub struct Edid { - header: [u8; 8], - manufacturer_id: [u8; 2], - product_code: u16, - serial_number: u32, - manufacture_week: u8, - manufacture_year: u8, - version: u8, - revision: u8, -} +#[derive(Debug, Clone, Copy)] +pub struct Edid; impl Edid { + /// Build the full 256-byte EDID for monitor `serial`, with both block checksums recomputed. 
pub fn generate_with(serial: u32) -> Vec { - // change serial number in the header - let mut header = *EDID; - header.serial_number = serial; - - header.generate() + let mut edid = [0u8; 256]; + // Block 0: base. + edid[..128].copy_from_slice(&BASE); + edid[SERIAL_OFFSET..SERIAL_OFFSET + 4].copy_from_slice(&serial.to_le_bytes()); + // Block 1: CTA-861.3 extension (header + colorimetry + HDR static metadata; rest stays 0). + edid[128..132].copy_from_slice(&CTA_HEADER); + edid[132..136].copy_from_slice(&COLORIMETRY_DB); + edid[136..143].copy_from_slice(&HDR_STATIC_METADATA_DB); + // Each 128-byte block ends in a checksum byte that makes the block sum ≡ 0 (mod 256). + Self::fix_block_checksum(&mut edid, 0); + Self::fix_block_checksum(&mut edid, 128); + edid.to_vec() } + /// Read the per-monitor serial (base offset 0x0C, little-endian) from an EDID the OS handed back. + /// Works for the full 256-byte EDID or just the 128-byte base block. Errors (rather than panics) on + /// a too-short buffer so the caller can reject a malformed descriptor. 
pub fn get_serial(edid: &[u8]) -> Result { - let edid = AlignedEdid::::new(edid)?; - Ok(edid.serial_number) + let bytes: [u8; 4] = edid + .get(SERIAL_OFFSET..SERIAL_OFFSET + 4) + .unwrap_or(&[]) + .try_into()?; + Ok(u32::from_le_bytes(bytes)) } - fn generate(&self) -> Vec { - let header = bytemuck::bytes_of(self); - - // slice of monitor edid minus header - let data = &EDID.data[EDID_SIZE..]; - - // splice together header and the rest of the EDID - let mut edid: Vec = header.iter().chain(data).copied().collect(); - // regenerate checksum - Self::gen_checksum(&mut edid); - - edid - } - - fn gen_checksum(data: &mut [u8]) { - // important, this is the bare minimum length - assert!(data.len() >= 128); - - // slice to the entire data minus the last checksum byte - let edid_data = &data[..=126]; - - // do checksum calculation - let sum: u32 = edid_data.iter().copied().map(u32::from).sum(); - // this wont ever truncate - #[allow(clippy::cast_possible_truncation)] - let checksum = (256 - (sum % 256)) as u8; - - // update last byte with new checksum - data[127] = checksum; + /// Set the trailing byte of the 128-byte block at `start` so the block's bytes sum to 0 (mod 256) — + /// the standard EDID block checksum. 
+ fn fix_block_checksum(edid: &mut [u8], start: usize) { + let sum = edid[start..start + 127] + .iter() + .fold(0u8, |acc, &b| acc.wrapping_add(b)); + edid[start + 127] = 0u8.wrapping_sub(sum); } } diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/entry.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/entry.rs index 93e616e..95f4dac 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/entry.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/entry.rs @@ -12,8 +12,10 @@ use wdf_umdf_sys::{ }; use crate::callbacks::{ - adapter_commit_modes, adapter_init_finished, assign_swap_chain, device_d0_entry, - monitor_get_default_modes, monitor_query_modes, parse_monitor_description, unassign_swap_chain, + adapter_commit_modes, adapter_commit_modes2, adapter_init_finished, assign_swap_chain, + device_d0_entry, monitor_get_default_modes, monitor_query_modes, monitor_query_modes2, + parse_monitor_description, parse_monitor_description2, query_target_info, + set_default_hdr_metadata, set_gamma_ramp, unassign_swap_chain, }; use crate::context::DeviceContext; use crate::control::device_io_control; @@ -73,6 +75,15 @@ extern "C-unwind" fn driver_add( config.EvtIddCxMonitorGetDefaultDescriptionModes = Some(monitor_get_default_modes); config.EvtIddCxMonitorQueryTargetModes = Some(monitor_query_modes); config.EvtIddCxAdapterCommitModes = Some(adapter_commit_modes); + // IddCx 1.10 *2 mode DDIs (HDR-capable path). The OS prefers these on 1.10; the 1.x callbacks + // above stay as the down-level fallback. B1 advertises SDR through them (so behaviour is unchanged); + // B2 enables HDR by adding 10 bpc in `wire_bits()`, HIGH_COLOR_SPACE caps, and CAN_PROCESS_FP16. 
+ config.EvtIddCxParseMonitorDescription2 = Some(parse_monitor_description2); + config.EvtIddCxMonitorQueryTargetModes2 = Some(monitor_query_modes2); + config.EvtIddCxAdapterCommitModes2 = Some(adapter_commit_modes2); + config.EvtIddCxAdapterQueryTargetInfo = Some(query_target_info); + config.EvtIddCxMonitorSetDefaultHdrMetaData = Some(set_default_hdr_metadata); + config.EvtIddCxMonitorSetGammaRamp = Some(set_gamma_ramp); config.EvtIddCxMonitorAssignSwapChain = Some(assign_swap_chain); config.EvtIddCxMonitorUnassignSwapChain = Some(unassign_swap_chain); // IddCx redirects device IOCTLs to this callback — our SudoVDA-compatible control plane. diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/frame_transport.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/frame_transport.rs new file mode 100644 index 0000000..81396c7 --- /dev/null +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/frame_transport.rs @@ -0,0 +1,424 @@ +//! P2 direct frame push — DRIVER side. The restricted WUDFHost token canNOT create named kernel +//! objects (proven on the RTX box: it can't even write a world-writable file), so — exactly like the +//! gamepad UMDF drivers (`crates/punktfunk-host/src/inject/dualsense_windows.rs`: *"the host creates +//! the section, privileged, with a permissive SDDL so the WUDFHost can open it; the driver maps it"*) +//! — the **host** creates the shared header + frame-ready event + ring of keyed-mutex textures, and +//! the driver only **OPENS** them. The driver writes its actual render-adapter LUID + a status code +//! back into the host-created header (our only driver-visibility channel: UMDF hides OutputDebugString +//! in ETW and the token can't write files), then copies each acquired swap-chain surface into the next +//! ring slot and signals the host. +//! +//! Host counterpart: `crates/punktfunk-host/src/capture/idd_push.rs` — [`SharedHeader`], [`MAGIC`], +//! 
[`RING_LEN`], the driver-status codes and the `Global\` object-name scheme are DUPLICATED +//! byte-identically there. + +use std::sync::atomic::{AtomicPtr, AtomicU32, AtomicU64, Ordering}; + +use log::info; +use windows::core::{Interface, HSTRING}; +use windows::Win32::Foundation::{CloseHandle, HANDLE}; +use windows::Win32::Graphics::Direct3D11::{ + ID3D11Device, ID3D11Device1, ID3D11DeviceContext, ID3D11Texture2D, D3D11_TEXTURE2D_DESC, +}; +use windows::Win32::Graphics::Dxgi::IDXGIKeyedMutex; +use windows::Win32::System::Memory::{ + MapViewOfFile, OpenFileMappingW, UnmapViewOfFile, FILE_MAP_ALL_ACCESS, + MEMORY_MAPPED_VIEW_ADDRESS, +}; +use windows::Win32::System::Threading::{OpenEventW, SetEvent, SYNCHRONIZATION_ACCESS_RIGHTS}; + +// --- kept byte-identical with the host (idd_push.rs) --- +pub const MAGIC: u32 = 0x4456_4650; +/// Kept for parity with the host's duplicated protocol header (the host writes it). +#[allow(dead_code)] +pub const VERSION: u32 = 1; +/// Ring slots. 6 (was 3) gives ample headroom so this 0 ms-timeout publish always finds a free slot +/// while the host briefly holds one across the convert/copy into its output ring and the depth-2 +/// pipelined encode runs. MUST equal the host's `RING_LEN` (idd_push.rs) — both are rebuilt together; +/// a mismatch corrupts the slot mapping. +pub const RING_LEN: u32 = 6; +const DXGI_SHARED_RESOURCE_RW: u32 = 0x8000_0000 | 0x1; +/// SYNCHRONIZE | EVENT_MODIFY_STATE — the driver waits on (no) and SIGNALS the event. +const EVENT_ACCESS: u32 = 0x0010_0000 | 0x0002; +const WAIT_TIMEOUT_HRESULT: i32 = 0x0000_0102; + +/// `driver_status` values the driver writes into the host header (the host logs them on a timeout). +/// `NONE` is the host's initial value (kept for parity). 
+#[allow(dead_code)] +pub const DRV_STATUS_NONE: u32 = 0; +pub const DRV_STATUS_OPENED: u32 = 1; +pub const DRV_STATUS_TEX_FAIL: u32 = 2; +pub const DRV_STATUS_NO_DEVICE1: u32 = 3; + +#[repr(C)] +pub struct SharedHeader { + pub magic: u32, + pub version: u32, + pub generation: u32, + pub ring_len: u32, + pub width: u32, + pub height: u32, + pub dxgi_format: u32, + pub _pad: u32, + /// `(generation << 40) | (seq << 8) | slot` — DRIVER-written after each copy; host loads it `Acquire`. + pub latest: u64, + pub qpc_pts: u64, + /// DRIVER-written: the adapter the swap-chain actually renders on (so the host can detect a + /// mismatch with the textures it created and report it). + pub driver_render_luid_low: u32, + pub driver_render_luid_high: i32, + /// DRIVER-written status (visibility channel). + pub driver_status: u32, + pub driver_status_detail: u32, +} + +pub fn hdr_name(target_id: u32) -> String { + format!("Global\\pfvd-hdr-{target_id}") +} +pub fn evt_name(target_id: u32) -> String { + format!("Global\\pfvd-evt-{target_id}") +} +pub fn tex_name(target_id: u32, generation: u32, slot: u32) -> String { + format!("Global\\pfvd-tex-{target_id}-{generation}-{slot}") +} +// -------------------------------------------------------- + +// ===== Bring-up debug channel (fixed-name, host-created) ===== +// UMDF hides the driver's OutputDebugString (ETW) and the restricted token can't write files, so this +// fixed-name `Global\pfvd-dbg` block — created by the host with the permissive SDDL — is how the driver +// reports what it's doing, INDEPENDENT of the per-target header (which is the thing under test). The +// host reads + logs these counters. Duplicated in `idd_push.rs`. +#[repr(C)] +pub struct DebugBlock { + pub magic: u32, + /// ++ each `run_core` entry — proves the swap-chain processor runs at all. + pub run_core_entries: u32, + /// The `target_id` the driver resolved for naming (mismatch vs the host = the bug). + pub resolved_target_id: u32, + /// ++ each header-open attempt. 
+ pub header_open_attempts: u32, + /// Last header-open error (win32/HRESULT). + pub last_open_error: u32, + /// 1 once the driver opened the per-target header. + pub header_opened: u32, + pub render_luid_low: u32, + pub render_luid_high: i32, + /// ++ each acquired swap-chain frame — proves frames flow (or the display is idle). + pub frames_acquired: u32, + pub _pad: u32, +} + +static DBG_PTR: AtomicPtr = AtomicPtr::new(std::ptr::null_mut()); + +/// Map the host-created debug block on first use (fixed name). Returns null until the host creates it. +fn dbg_block() -> *mut DebugBlock { + let p = DBG_PTR.load(Ordering::Acquire); + if !p.is_null() { + return p; + } + let Ok(map) = (unsafe { + OpenFileMappingW(FILE_MAP_ALL_ACCESS.0, false, &HSTRING::from("Global\\pfvd-dbg")) + }) else { + return std::ptr::null_mut(); + }; + let view = unsafe { MapViewOfFile(map, FILE_MAP_ALL_ACCESS, 0, 0, std::mem::size_of::()) }; + if view.Value.is_null() { + unsafe { + let _ = CloseHandle(map); + } + return std::ptr::null_mut(); + } + let np = view.Value.cast::(); + match DBG_PTR.compare_exchange(std::ptr::null_mut(), np, Ordering::AcqRel, Ordering::Acquire) { + Ok(_) => np, // we win; intentionally leak the handle (diagnostic, process-lifetime) + Err(existing) => { + unsafe { + let _ = UnmapViewOfFile(view); + let _ = CloseHandle(map); + } + existing + } + } +} + +pub fn dbg_run_core_entry() { + let p = dbg_block(); + if !p.is_null() { + unsafe { + (*(std::ptr::addr_of_mut!((*p).run_core_entries) as *const AtomicU32)) + .fetch_add(1, Ordering::Relaxed); + } + } +} + +pub fn dbg_frame() { + let p = dbg_block(); + if !p.is_null() { + unsafe { + (*(std::ptr::addr_of_mut!((*p).frames_acquired) as *const AtomicU32)) + .fetch_add(1, Ordering::Relaxed); + } + } +} + +/// Record the target id + render LUID the driver will use to name the shared objects. 
+pub fn dbg_set_target(target_id: u32, render_luid_low: u32, render_luid_high: i32) { + let p = dbg_block(); + if !p.is_null() { + unsafe { + (*p).resolved_target_id = target_id; + (*p).render_luid_low = render_luid_low; + (*p).render_luid_high = render_luid_high; + } + } +} + +/// Record a header-open attempt + its error (0 = success). +pub fn dbg_header_attempt(error: u32, opened: bool) { + let p = dbg_block(); + if !p.is_null() { + unsafe { + (*(std::ptr::addr_of_mut!((*p).header_open_attempts) as *const AtomicU32)) + .fetch_add(1, Ordering::Relaxed); + (*p).last_open_error = error; + if opened { + (*p).header_opened = 1; + } + } + } +} + +struct Slot { + tex: ID3D11Texture2D, + mutex: IDXGIKeyedMutex, +} + +/// Publishes acquired swap-chain surfaces into the HOST-created ring. Owned by the swap-chain +/// processor thread; attached lazily once the host has created the shared objects. +pub struct FramePublisher { + context: ID3D11DeviceContext, + map: HANDLE, + header: *mut SharedHeader, + event: HANDLE, + slots: Vec, + next: u32, + seq: u64, + /// The host-created ring textures' DXGI format (from the shared header). A swap-chain surface whose + /// format differs (e.g. an FP16 HDR frame vs a BGRA ring) is dropped in `publish` — CopyResource + /// needs matching formats. + ring_format: u32, + /// The ring generation this publisher attached to. The host BUMPS the header generation when it + /// recreates the ring at a new format mid-session (the display's HDR mode flipped) — [`Self::is_stale`] + /// detects that so `run_core` re-attaches to the new-format textures instead of dropping every frame. + generation: u32, +} + +// SAFETY: created and used only on the swap-chain processor thread. +unsafe impl Send for FramePublisher {} + +impl FramePublisher { + /// Try ONCE to attach to the host-created shared objects. 
Returns `Err` cheaply if the host hasn't + /// created/published them yet — the drain loop retries periodically, so a non-IDD-push session + /// just keeps draining with no stall. + pub fn try_open( + target_id: u32, + render_luid_low: u32, + render_luid_high: i32, + device: &ID3D11Device, + context: &ID3D11DeviceContext, + ) -> windows::core::Result { + // 1. Open the host-created header (RW). Err if the host hasn't created it yet. + let map = unsafe { + OpenFileMappingW( + FILE_MAP_ALL_ACCESS.0, + false, + &HSTRING::from(hdr_name(target_id)), + )? + }; + let view = + unsafe { MapViewOfFile(map, FILE_MAP_ALL_ACCESS, 0, 0, std::mem::size_of::()) }; + if view.Value.is_null() { + unsafe { + let _ = CloseHandle(map); + } + return Err(windows::core::Error::from_win32()); + } + let header = view.Value.cast::(); + + // 2. Report our render adapter to the host immediately (lets it detect a mismatch). + unsafe { + (*header).driver_render_luid_low = render_luid_low; + (*header).driver_render_luid_high = render_luid_high; + } + + // 3. The host sets magic==MAGIC only once the ring textures exist. Not ready → retry later. + let magic = + unsafe { (*(std::ptr::addr_of!((*header).magic) as *const AtomicU32)).load(Ordering::Acquire) }; + if magic != MAGIC { + unsafe { + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { Value: header.cast() }); + let _ = CloseHandle(map); + } + return Err(windows::core::Error::from_win32()); + } + let (generation, ring_len) = + unsafe { ((*header).generation, (*header).ring_len.min(RING_LEN)) }; + + // 4. Open the event (SYNCHRONIZE | EVENT_MODIFY_STATE so we can SetEvent). + let event = match unsafe { + OpenEventW( + SYNCHRONIZATION_ACCESS_RIGHTS(EVENT_ACCESS), + false, + &HSTRING::from(evt_name(target_id)), + ) + } { + Ok(e) => e, + Err(e) => { + unsafe { + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { Value: header.cast() }); + let _ = CloseHandle(map); + } + return Err(e); + } + }; + + // 5. 
Open device1 + the ring textures the host created (same render adapter required). + let device1: ID3D11Device1 = match device.cast() { + Ok(d) => d, + Err(e) => { + unsafe { + (*header).driver_status = DRV_STATUS_NO_DEVICE1; + let _ = CloseHandle(event); + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { Value: header.cast() }); + let _ = CloseHandle(map); + } + return Err(e); + } + }; + let mut slots = Vec::new(); + for k in 0..ring_len { + let name = HSTRING::from(tex_name(target_id, generation, k)); + let opened: windows::core::Result = + unsafe { device1.OpenSharedResourceByName(&name, DXGI_SHARED_RESOURCE_RW) }; + match opened { + Ok(tex) => match tex.cast::() { + Ok(mutex) => slots.push(Slot { tex, mutex }), + Err(e) => { + unsafe { + (*header).driver_status = DRV_STATUS_TEX_FAIL; + (*header).driver_status_detail = e.code().0 as u32; + let _ = CloseHandle(event); + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { Value: header.cast() }); + let _ = CloseHandle(map); + } + return Err(e); + } + }, + Err(e) => { + // Most likely a render-adapter mismatch (the host made the textures on a different + // GPU than the swap-chain renders on). Tell the host so it can report it. 
+ unsafe { + (*header).driver_status = DRV_STATUS_TEX_FAIL; + (*header).driver_status_detail = e.code().0 as u32; + let _ = CloseHandle(event); + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { Value: header.cast() }); + let _ = CloseHandle(map); + } + return Err(e); + } + } + } + + unsafe { + (*header).driver_status = DRV_STATUS_OPENED; + } + info!("frame-push(driver): attached to host ring gen {generation} ({ring_len} slots)"); + Ok(Self { + context: context.clone(), + map, + header, + event, + slots, + next: 0, + seq: 0, + ring_format: unsafe { (*header).dxgi_format }, + generation, + }) + } + + #[inline] + fn latest_cell(&self) -> &AtomicU64 { + unsafe { &*(std::ptr::addr_of!((*self.header).latest) as *const AtomicU64) } + } + + /// True once the host has recreated the ring (bumped the header generation) — e.g. the display's + /// HDR mode flipped, so the ring format changed (FP16 ⇄ BGRA) and the texture names now carry a new + /// generation. `run_core` drops the publisher on this so it re-attaches to the new ring. + pub fn is_stale(&self) -> bool { + let cur = unsafe { + (*(std::ptr::addr_of!((*self.header).generation) as *const AtomicU32)) + .load(Ordering::Acquire) + }; + cur != self.generation + } + + /// Copy `surface` into the next free ring slot and signal the host. Never blocks (0 ms try-acquire). + pub fn publish(&mut self, surface: &ID3D11Texture2D) { + let ring_len = self.slots.len() as u32; + if ring_len == 0 { + return; + } + // B2 format guard: CopyResource needs the surface + ring textures to share a DXGI format. Drop + // a frame that doesn't match (e.g. an FP16 HDR surface arriving while the ring is still BGRA, + // before B3 makes the ring FP16) instead of corrupting / failing the copy. 
+ let mut desc = D3D11_TEXTURE2D_DESC::default(); + unsafe { surface.GetDesc(&mut desc) }; + if desc.Format.0 as u32 != self.ring_format { + return; + } + let start = self.next; + for attempt in 0..ring_len { + let slot = (start + attempt) % ring_len; + let s = &self.slots[slot as usize]; + match unsafe { s.mutex.AcquireSync(0, 0) } { + Ok(()) => { + unsafe { + self.context.CopyResource(&s.tex, surface); + let _ = s.mutex.ReleaseSync(0); + } + self.seq = self.seq.wrapping_add(1); + // `latest` = (generation << 40) | (seq << 8) | slot. Stamping the generation lets the + // host REJECT a publish from a stale ring (an old-generation publisher racing the + // host's mid-session ring recreate) so it never consumes an unwritten new-ring slot. + let latest = (u64::from(self.generation) << 40) + | ((self.seq & 0xFFFF_FFFF) << 8) + | u64::from(slot & 0xff); + self.latest_cell().store(latest, Ordering::Release); + unsafe { + let _ = SetEvent(self.event); + } + self.next = (slot + 1) % ring_len; + return; + } + Err(e) if e.code().0 == WAIT_TIMEOUT_HRESULT => continue, + Err(_) => return, + } + } + // All slots busy — drop this frame (never block the swap-chain thread). 
+ } +} + +impl Drop for FramePublisher { + fn drop(&mut self) { + self.slots.clear(); + unsafe { + if !self.header.is_null() { + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { + Value: self.header.cast(), + }); + } + let _ = CloseHandle(self.event); + let _ = CloseHandle(self.map); + } + } +} diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/lib.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/lib.rs index 4d0f5b1..a2fc138 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/lib.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/lib.rs @@ -12,6 +12,7 @@ mod control; mod direct_3d_device; mod edid; mod entry; +mod frame_transport; mod helpers; mod logger; mod monitor; diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/logger.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/logger.rs index 9e251c2..fdb9191 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/logger.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/logger.rs @@ -1,12 +1,22 @@ -//! Minimal `log` backend that writes to `OutputDebugString` — no `driver-logger`/event-log/`tokio`. -//! View with DebugView/WinDbg. Keeping the `log` facade lets the ported callbacks/context use -//! `error!`/`info!`/`debug!` unchanged. +//! Minimal `log` backend that writes to `OutputDebugString` AND tees to a file — UMDF redirects a +//! hosted driver's `OutputDebugString` to ETW (invisible to DebugView), so the file tee is how we +//! actually read driver logs during bring-up. Keeping the `log` facade lets the ported +//! callbacks/context use `error!`/`info!`/`debug!` unchanged. + +use std::fs::OpenOptions; +use std::io::Write; +use std::sync::Mutex; use log::{LevelFilter, Metadata, Record}; use windows::core::PCSTR; use windows::Win32::System::Diagnostics::Debug::OutputDebugStringA; -struct DbgLogger; +/// World-writable so the restricted WUDFHost token can append. Read it during bring-up. 
+const LOG_PATH: &str = r"C:\Users\Public\pfvd-driver.log"; + +struct DbgLogger { + file: Mutex<()>, +} impl log::Log for DbgLogger { fn enabled(&self, _metadata: &Metadata) -> bool { @@ -17,12 +27,19 @@ impl log::Log for DbgLogger { let msg = format!("[pf-vdisplay] {:<5} {}\0", record.level(), record.args()); // SAFETY: `msg` is a NUL-terminated byte string valid for the call. unsafe { OutputDebugStringA(PCSTR(msg.as_ptr())) }; + // Tee to the file (best-effort): the real channel during bring-up. + let _guard = self.file.lock(); + if let Ok(mut f) = OpenOptions::new().create(true).append(true).open(LOG_PATH) { + let _ = writeln!(f, "{:<5} {}", record.level(), record.args()); + } } fn flush(&self) {} } -static LOGGER: DbgLogger = DbgLogger; +static LOGGER: DbgLogger = DbgLogger { + file: Mutex::new(()), +}; pub fn init() { let _ = log::set_logger(&LOGGER); @@ -31,4 +48,8 @@ pub fn init() { } else { LevelFilter::Info }); + // Boot marker so each load is distinguishable in the file. + if let Ok(mut f) = OpenOptions::new().create(true).append(true).open(LOG_PATH) { + let _ = writeln!(f, "==== pf-vdisplay logger init ===="); + } } diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/monitor.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/monitor.rs index dad10ab..ba20b0a 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/monitor.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/monitor.rs @@ -6,6 +6,7 @@ use std::ptr::NonNull; use std::sync::atomic::{AtomicU32, AtomicU64}; use std::sync::{Mutex, OnceLock}; +use std::time::Instant; use wdf_umdf_sys::{IDDCX_ADAPTER__, IDDCX_MONITOR__}; @@ -37,6 +38,10 @@ pub struct MonitorObject { pub target_id: u32, pub adapter_luid_low: u32, pub adapter_luid_high: i32, + /// When the entry was pushed (`do_add`). 
The watchdog skips monitors younger than the host's + /// setup window (CCD commit + GDI-name resolve + settle) so a still-initializing monitor is never + /// torn down mid-birth during reconnect churn. + pub created_at: Instant, } // SAFETY: the raw IddCx object ptr is framework-managed; access is serialized by MONITOR_MODES. unsafe impl Send for MonitorObject {} @@ -53,9 +58,12 @@ pub static MONITOR_MODES: Mutex> = Mutex::new(Vec::new()); /// Monitor id / EDID-serial counter (unique per created monitor). pub static NEXT_ID: AtomicU32 = AtomicU32::new(1); -/// Watchdog (seconds). The host reads the timeout via GET_WATCHDOG and PINGs to keep alive. -pub static WATCHDOG_TIMEOUT: AtomicU32 = AtomicU32::new(3); -pub static WATCHDOG_COUNTDOWN: AtomicU32 = AtomicU32::new(3); +/// Watchdog (seconds). The host reads the timeout via GET_WATCHDOG and PINGs to keep alive. 8 s (was +/// 3) gives the host's between-session teardown gap — stop old pinger → CCD display re-attach (a slow +/// `SetDisplayConfig`) → REMOVE — headroom, so the watchdog doesn't spuriously fire during reconnect +/// churn. The host derives its PING interval from this (timeout/3), so it auto-adjusts. +pub static WATCHDOG_TIMEOUT: AtomicU32 = AtomicU32::new(8); +pub static WATCHDOG_COUNTDOWN: AtomicU32 = AtomicU32::new(8); /// The preferred render adapter LUID set via SET_RENDER_ADAPTER, packed `(high<<32)|low`. 0 = none. 
pub static PREFERRED_RENDER_ADAPTER: AtomicU64 = AtomicU64::new(0); diff --git a/packaging/windows/vdisplay-driver/pf-vdisplay/src/swap_chain_processor.rs b/packaging/windows/vdisplay-driver/pf-vdisplay/src/swap_chain_processor.rs index de038be..e148edb 100644 --- a/packaging/windows/vdisplay-driver/pf-vdisplay/src/swap_chain_processor.rs +++ b/packaging/windows/vdisplay-driver/pf-vdisplay/src/swap_chain_processor.rs @@ -4,29 +4,39 @@ use std::{ Arc, }, thread::{self, JoinHandle}, + time::Duration, }; use log::{debug, error}; use wdf_umdf::{ - IddCxSwapChainFinishedProcessingFrame, IddCxSwapChainReleaseAndAcquireBuffer, + IddCxSwapChainFinishedProcessingFrame, IddCxSwapChainReleaseAndAcquireBuffer2, IddCxSwapChainSetDevice, WdfObjectDelete, }; use wdf_umdf_sys::{ - HANDLE, IDARG_IN_SWAPCHAINSETDEVICE, IDARG_OUT_RELEASEANDACQUIREBUFFER, IDDCX_SWAPCHAIN, - NTSTATUS, WAIT_TIMEOUT, WDFOBJECT, + HANDLE, IDARG_IN_RELEASEANDACQUIREBUFFER2, IDARG_IN_SWAPCHAINSETDEVICE, + IDARG_OUT_RELEASEANDACQUIREBUFFER2, IDDCX_SWAPCHAIN, NTSTATUS, WAIT_TIMEOUT, WDFOBJECT, }; use windows::{ core::{w, Interface}, Win32::{ Foundation::HANDLE as WHANDLE, - Graphics::Dxgi::IDXGIDevice, + Graphics::{ + Direct3D11::ID3D11Texture2D, + Dxgi::{IDXGIDevice, IDXGIResource}, + }, System::Threading::{ AvRevertMmThreadCharacteristics, AvSetMmThreadCharacteristicsW, WaitForSingleObject, }, }, }; -use crate::{direct_3d_device::Direct3DDevice, helpers::Sendable}; +use crate::{ + direct_3d_device::Direct3DDevice, + frame_transport::{ + dbg_frame, dbg_header_attempt, dbg_run_core_entry, dbg_set_target, FramePublisher, + }, + helpers::Sendable, +}; pub struct SwapChainProcessor { terminate: Arc, @@ -47,8 +57,11 @@ impl SwapChainProcessor { pub fn run( &mut self, swap_chain: IDDCX_SWAPCHAIN, - device: Direct3DDevice, + device: Arc, available_buffer_event: HANDLE, + target_id: u32, + render_luid_low: u32, + render_luid_high: i32, ) { let available_buffer_event = unsafe { Sendable::new(available_buffer_event) 
}; let swap_chain = unsafe { Sendable::new(swap_chain) }; @@ -64,7 +77,17 @@ impl SwapChainProcessor { return; }; - Self::run_core(*swap_chain, &device, *available_buffer_event, &terminate); + Self::run_core( + *swap_chain, + &device, + *available_buffer_event, + &terminate, + target_id, + render_luid_low, + render_luid_high, + ); + + error!("run_core RETURNED (target={target_id}) — deleting swap-chain, device drops next"); let res = unsafe { WdfObjectDelete(*swap_chain as WDFOBJECT) }; if let Err(e) = res { @@ -87,31 +110,140 @@ impl SwapChainProcessor { device: &Direct3DDevice, available_buffer_event: HANDLE, terminate: &AtomicBool, + target_id: u32, + render_luid_low: u32, + render_luid_high: i32, ) { - let dxgi_device = device.device.cast::<IDXGIDevice>(); - let Ok(dxgi_device) = dxgi_device else { - error!("Failed to cast ID3D11Device to IDXGIDevice: {dxgi_device:?}"); - return; - }; + // P2 direct frame push: lazily ATTACH to the HOST-created shared ring. The restricted UMDF + // token can't create named objects, so the host creates the header + event + textures and we + // only OPEN them once they appear (`try_open`). Until then we just drain — exactly the P1 + // behaviour — so a non-IDD-push session never stalls. Retried every ~30 frames. + let mut publisher: Option<FramePublisher> = None; + let mut frames_since_try: u32 = u32::MAX; // attach attempt on the first loop iteration + // Bring-up debug: prove run_core ran + record the target/render LUID we'll name objects with. + dbg_run_core_entry(); + dbg_set_target(target_id, render_luid_low, render_luid_high); + + // SetDevice fails (0x887A0026, FACILITY_DXGI) when the monitor briefly flaps INACTIVE during + // topology activation — the OS unassigns + re-assigns the swap-chain, and a fresh run_core thread + // can lose the race to the unassign. Retry briefly so a stable re-assign binds the device instead + // of giving up on the first transient failure. 
`terminate` (set when the OS unassigns + drops the + // processor) breaks us out promptly. + // Cast to IDXGIDevice ONCE and BORROW it to the swap-chain across all retries. The previous + // code re-cast + `into_raw()`'d on EVERY attempt — and a flapping monitor fails several + // attempts per session — so each failure orphaned one IDXGIDevice reference, pinning the D3D + // device so it (and its ~dozen D3D worker threads + tens of MB of VRAM) was NEVER freed when + // the processor dropped. That leaked ~71 threads / ~57 MB VRAM per reconnect until the driver + // choked and sessions fell to 0 bytes. `as_raw()` keeps our single reference (released right + // after the loop); IddCx AddRefs its own on success, and `device` keeps the object alive for + // the drain loop regardless. + let dxgi_device = match device.device.cast::<IDXGIDevice>() { + Ok(d) => d, + Err(e) => { + error!("Failed to cast ID3D11Device to IDXGIDevice: {e:?}"); + return; + } + }; let set_device = IDARG_IN_SWAPCHAINSETDEVICE { - pDevice: dxgi_device.into_raw().cast(), + pDevice: dxgi_device.as_raw().cast(), }; - - let res = unsafe { IddCxSwapChainSetDevice(swap_chain, &set_device) }; - if res.is_err() { - debug!("Failed to set swapchain device: {res:?}"); + let mut set_ok = false; + let mut terminated = false; + for attempt in 0..60u32 { + if terminate.load(Ordering::Relaxed) { + error!("run_core: terminated during SetDevice (attempt {attempt}, target={target_id})"); + terminated = true; + break; + } + let res = unsafe { IddCxSwapChainSetDevice(swap_chain, &set_device) }; + if res.is_ok() { + set_ok = true; + error!("run_core: SetDevice OK (target={target_id}, attempt={attempt}) — entering drain loop"); + break; + } + if attempt == 0 { + debug!("run_core: SetDevice attempt 0 failed ({res:?}) — retrying up to 60x@50ms (monitor may be flapping)"); + } + thread::sleep(Duration::from_millis(50)); + } + // Release our borrowed device reference — IddCx holds its own now, or we gave up. 
(Explicit + // drop releases it here instead of at the end of `run_core`; a live binding is never dropped early, so the raw ptr stayed valid for every retry.) + drop(dxgi_device); + if !set_ok { + if !terminated { + error!("run_core: SetDevice never succeeded after retries (target={target_id}) — giving up"); + } return; } + let mut logged_pending = false; + let mut logged_frame = false; loop { - let mut buffer = IDARG_OUT_RELEASEANDACQUIREBUFFER::default(); - let hr: NTSTATUS = - unsafe { IddCxSwapChainReleaseAndAcquireBuffer(swap_chain, &mut buffer).into() }; + // Check terminate at the TOP, every iteration. The success branch below does NOT re-check + // it, so during a CONTINUOUS frame burst (DWM rendering the freshly-activated desktop) a + // thread that the OS unassigns — or that `free_swap_chain_processor` is dropping — never + // sees the flag and loops on, pinning its D3D device (and ~36 NVIDIA worker threads). That + // is THE reconnect leak: it only reproduced at full speed, because cdb's pacing forced + // E_PENDING gaps (which DO check terminate) and masked it. Without this, `SwapChainProcessor::drop`'s + // join can also block until the burst ends. + if terminate.load(Ordering::Relaxed) { + break; + } + // The host recreates the shared ring (new format) mid-session when the display's HDR mode + // flips — it bumps the header generation. Detect that and drop the publisher so we re-attach + // to the new-format textures below; otherwise we'd keep CopyResource'ing into the stale ring, + // whose format now mismatches the surface → the publish() format-guard drops every frame and + // the stream freezes until the next swap-chain recreate. + if publisher.as_ref().is_some_and(FramePublisher::is_stale) { + publisher = None; + frames_since_try = u32::MAX; // re-attach immediately + } + // Lazy-attach (rate-limited) at the loop TOP so we keep trying even while the display is + // idle (E_PENDING / no frames presented yet), not only when a frame is acquired. 
`try_open` + // is a cheap OpenFileMapping that fails fast until the host has created the ring. + if publisher.is_none() { + if frames_since_try >= 30 { + frames_since_try = 0; + match FramePublisher::try_open( + target_id, + render_luid_low, + render_luid_high, + &device.device, + &device.device_context, + ) { + Ok(p) => { + dbg_header_attempt(0, true); + publisher = Some(p); + } + Err(e) => dbg_header_attempt(e.code().0 as u32, false), + } + } else { + frames_since_try += 1; + } + } + + // B2: ...Buffer2 is required once CAN_PROCESS_FP16 is set. AcquireSystemMemoryBuffer=FALSE + // keeps the GPU surface (out.MetaData.pSurface). The surface format varies per-frame — + // FP16 (R16G16B16A16_FLOAT) in HDR, BGRA in SDR — and the publisher's format guard handles + // a frame that doesn't match the ring until B3 makes the ring FP16. + let mut in_args = IDARG_IN_RELEASEANDACQUIREBUFFER2 { + #[allow(clippy::cast_possible_truncation)] + Size: std::mem::size_of::<IDARG_IN_RELEASEANDACQUIREBUFFER2>() as u32, + AcquireSystemMemoryBuffer: 0, + }; + let mut buffer = IDARG_OUT_RELEASEANDACQUIREBUFFER2::default(); + let hr: NTSTATUS = unsafe { + IddCxSwapChainReleaseAndAcquireBuffer2(swap_chain, &mut in_args, &mut buffer).into() + }; #[allow(clippy::items_after_statements)] const E_PENDING: u32 = 0x8000_000A; if u32::from(hr) == E_PENDING { + if !logged_pending { + error!("run_core: E_PENDING (target={target_id}) — swap-chain valid but DWM has composed NO frame yet"); + logged_pending = true; + } let wait_result = unsafe { WaitForSingleObject(WHANDLE(available_buffer_event.cast()), 16).0 }; @@ -130,8 +262,29 @@ impl SwapChainProcessor { // The wait was cancelled or something unexpected happened break; } else if hr.is_success() { + if !logged_frame { + error!("run_core: FIRST FRAME acquired (target={target_id}) — DWM IS compositing the virtual display!"); + logged_frame = true; + } + dbg_frame(); // bring-up: prove frames actually flow (vs an idle display) // This is the most performance-critical section of code 
in an IddCx driver. It's important that whatever // is done with the acquired surface be finished as quickly as possible. + // + // P2: copy the acquired surface into the shared ring BEFORE FinishedProcessingFrame + // (the surface is valid until the next ReleaseAndAcquire). The pointer is BORROWED — + // `from_raw_borrowed` does not take IddCx's refcount — and the GPU-side copy is ordered + // before the consumer via the slot keyed mutex. (Attach happens at the loop top.) + if let Some(pub_) = publisher.as_mut() { + let raw = buffer.MetaData.pSurface as *mut core::ffi::c_void; + if !raw.is_null() { + if let Some(res) = unsafe { IDXGIResource::from_raw_borrowed(&raw) } { + if let Ok(tex) = res.cast::<ID3D11Texture2D>() { + pub_.publish(&tex); + } + } + } + } + let hr = unsafe { IddCxSwapChainFinishedProcessingFrame(swap_chain) }; if hr.is_err() { diff --git a/packaging/windows/vdisplay-driver/wdf-umdf-sys/build.rs b/packaging/windows/vdisplay-driver/wdf-umdf-sys/build.rs index 5641363..699cd3c 100644 --- a/packaging/windows/vdisplay-driver/wdf-umdf-sys/build.rs +++ b/packaging/windows/vdisplay-driver/wdf-umdf-sys/build.rs @@ -7,7 +7,10 @@ use winreg::enums::HKEY_LOCAL_MACHINE; use winreg::RegKey; const UMDF_V: &str = "2.31"; -const IDDCX_V: &str = "1.4"; +// Bumped 1.4 -> 1.10 for HDR/FP16 support (IDDCX_ADAPTER_FLAGS_CAN_PROCESS_FP16, +// IddCxSwapChainReleaseAndAcquireBuffer2, the *2 mode/metadata DDIs). 1.10 is a superset of 1.4, so +// existing call sites keep working; the new HDR DDIs become available to bind. 
+const IDDCX_V: &str = "1.10"; #[derive(Debug, thiserror::Error)] enum Error { diff --git a/packaging/windows/vdisplay-driver/wdf-umdf/src/iddcx.rs b/packaging/windows/vdisplay-driver/wdf-umdf/src/iddcx.rs index 9e0784b..1339637 100644 --- a/packaging/windows/vdisplay-driver/wdf-umdf/src/iddcx.rs +++ b/packaging/windows/vdisplay-driver/wdf-umdf/src/iddcx.rs @@ -7,7 +7,8 @@ use wdf_umdf_sys::{ IDARG_IN_ADAPTERSETRENDERADAPTER, IDARG_IN_ADAPTER_INIT, IDARG_IN_MONITORCREATE, IDARG_IN_QUERY_HWCURSOR, IDARG_IN_SETUP_HWCURSOR, IDARG_IN_SWAPCHAINSETDEVICE, IDARG_OUT_ADAPTER_INIT, IDARG_OUT_MONITORARRIVAL, IDARG_OUT_MONITORCREATE, - IDARG_OUT_QUERY_HWCURSOR, IDARG_OUT_RELEASEANDACQUIREBUFFER, IDDCX_ADAPTER, IDDCX_MONITOR, + IDARG_IN_RELEASEANDACQUIREBUFFER2, IDARG_OUT_QUERY_HWCURSOR, IDARG_OUT_RELEASEANDACQUIREBUFFER, + IDARG_OUT_RELEASEANDACQUIREBUFFER2, IDDCX_ADAPTER, IDDCX_MONITOR, IDDCX_SWAPCHAIN, IDD_CX_CLIENT_CONFIG, NTSTATUS, WDFDEVICE, WDFDEVICE_INIT, }; @@ -236,6 +237,30 @@ pub unsafe fn IddCxSwapChainReleaseAndAcquireBuffer( ) } +/// IddCx 1.10 HDR variant — required once the adapter sets `CAN_PROCESS_FP16`. Provides per-frame +/// `IDDCX_METADATA2` (surface colour space, HDR metadata, SDR white level). +/// +/// # Safety +/// None. User is responsible for safety. +#[rustfmt::skip] +pub unsafe fn IddCxSwapChainReleaseAndAcquireBuffer2( + // in + SwapChainObject: IDDCX_SWAPCHAIN, + // in + pInArgs: &mut IDARG_IN_RELEASEANDACQUIREBUFFER2, + // out + pOutArgs: &mut IDARG_OUT_RELEASEANDACQUIREBUFFER2 +) -> Result { + IddCxCall!( + true, + IddCxSwapChainReleaseAndAcquireBuffer2( + SwapChainObject, + pInArgs, + pOutArgs + ) + ) +} + /// # Safety /// /// None. User is responsible for safety.