diff --git a/crates/punktfunk-host/src/capture/idd_push.rs b/crates/punktfunk-host/src/capture/idd_push.rs index 129de1d..b5cdfec 100644 --- a/crates/punktfunk-host/src/capture/idd_push.rs +++ b/crates/punktfunk-host/src/capture/idd_push.rs @@ -4,13 +4,16 @@ //! event + ring of keyed-mutex textures (`Global\` names, permissive `D:(A;;GA;;;WD)` SDDL) on the //! discrete render GPU, and the driver only OPENS them and copies frames in. We then consume the ring //! straight into the zero-copy NVENC path — no DXGI Desktop Duplication, no `win32u` hook. Gated by -//! `PUNKTFUNK_IDD_PUSH`. Driver counterpart: `packaging/windows/vdisplay-driver/pf-vdisplay/src/ -//! frame_transport.rs` — [`SharedHeader`], [`MAGIC`], [`RING_LEN`], the status codes and the `Global\` -//! name scheme are DUPLICATED byte-identically there. +//! `PUNKTFUNK_IDD_PUSH`. Driver counterpart: `packaging/windows/drivers/pf-vdisplay/src/ +//! frame_transport.rs`. The shared `SharedHeader` layout, `MAGIC`/`VERSION`/`RING_LEN`, the +//! `DRV_STATUS_*` codes, the `Global\` name scheme and the publish token all come from +//! [`pf_vdisplay_proto::frame`] (which OWNS the contract, with `const` size asserts) — both sides +//! `use` it, so drift is a compile error rather than a "must match" comment. use super::dxgi::{make_device, D3d11Frame, HdrConverter, WinCaptureTarget}; use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; use anyhow::{bail, Context, Result}; +use pf_vdisplay_proto::frame; use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Mutex; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; @@ -39,45 +42,26 @@ use windows::Win32::System::Memory::{ }; use windows::Win32::System::Threading::{CreateEventW, WaitForSingleObject}; -// --- kept byte-identical with the driver (frame_transport.rs) --- -pub const MAGIC: u32 = 0x4456_4650; -pub const VERSION: u32 = 1; -/// Ring slots — MUST equal the driver's `RING_LEN` (frame_transport.rs). 
6 (was 3) gives ample headroom -/// so the driver's 0 ms-timeout publish always finds a free slot while the host briefly holds one across -/// the convert/copy into its output ring and the depth-2 pipelined encode runs on the rest. -pub const RING_LEN: u32 = 6; -const DXGI_SHARED_RESOURCE_RW: u32 = 0x8000_0000 | 0x1; +// The frame-transport contract — `SharedHeader` layout, `MAGIC`/`VERSION`/`RING_LEN`, the +// `DRV_STATUS_*` codes and the `Global\` name helpers — lives in `pf_vdisplay_proto::frame`; both sides +// `use frame::*`, so a layout/name/code drift is a compile error (the proto has `const` size asserts). +use frame::{ + event_name, header_name, texture_name, SharedHeader, DRV_STATUS_NO_DEVICE1, DRV_STATUS_OPENED, + DRV_STATUS_TEX_FAIL, MAGIC, RING_LEN, VERSION, +}; -// driver_status codes (the driver writes these; we read+log them). -const DRV_STATUS_OPENED: u32 = 1; -const DRV_STATUS_TEX_FAIL: u32 = 2; -const DRV_STATUS_NO_DEVICE1: u32 = 3; +/// `DXGI_SHARED_RESOURCE_READ | _WRITE` for `CreateSharedHandle`/`OpenSharedResourceByName`. Local (not +/// part of the proto contract — it is a DXGI sharing-API arg, mirrored on the driver side). +const DXGI_SHARED_RESOURCE_RW: u32 = 0x8000_0000 | 0x1; /// Host-owned output-ring depth: distinct NVENC-input textures rotated per frame so the in-flight /// encode of frame N and the convert/copy of frame N+1 never touch the same texture. 3 covers a /// pipeline depth of 2 with one slot of margin. const OUT_RING: usize = 3; -#[repr(C)] -struct SharedHeader { - magic: u32, - version: u32, - generation: u32, - ring_len: u32, - width: u32, - height: u32, - dxgi_format: u32, - _pad: u32, - latest: u64, - qpc_pts: u64, - driver_render_luid_low: u32, - driver_render_luid_high: i32, - driver_status: u32, - driver_status_detail: u32, -} - /// Bring-up debug block (fixed name) — the host creates it; the driver writes diagnostics into it -/// independent of the per-target header. Byte-identical with the driver's `DebugBlock`. 
+/// independent of the per-target header. NOT part of `pf_vdisplay_proto` (a host-side bring-up channel, +/// not the data path); the matching `DebugBlock` lives in the OLD oracle driver's `frame_transport.rs`. #[repr(C)] struct DebugBlock { magic: u32, @@ -94,17 +78,6 @@ struct DebugBlock { const DBG_NAME: &str = "Global\\pfvd-dbg"; const DBG_MAGIC: u32 = 0x4742_4450; -fn hdr_name(target_id: u32) -> String { - format!("Global\\pfvd-hdr-{target_id}") -} -fn evt_name(target_id: u32) -> String { - format!("Global\\pfvd-evt-{target_id}") -} -fn tex_name(target_id: u32, generation: u32, slot: u32) -> String { - format!("Global\\pfvd-tex-{target_id}-{generation}-{slot}") -} -// ---------------------------------------------------------------- - /// Monotonic per-process generation: each capturer instance stamps its ring-texture names with a /// fresh value so a retried/overlapping `open()` never collides with a previous attempt's not-yet- /// released shared-handle names (`DXGI_ERROR_NAME_ALREADY_EXISTS`). The driver reads it from the header. 
@@ -339,7 +312,7 @@ impl IddPushCapturer { .CreateSharedHandle( Some(&sa as *const SECURITY_ATTRIBUTES), DXGI_SHARED_RESOURCE_RW, - &HSTRING::from(tex_name(target_id, generation, k)), + &HSTRING::from(texture_name(target_id, generation, k)), ) .context("CreateSharedHandle(IDD-push ring slot)")?; let mutex: IDXGIKeyedMutex = tex.cast()?; @@ -406,7 +379,7 @@ impl IddPushCapturer { PAGE_READWRITE, 0, bytes as u32, - &HSTRING::from(hdr_name(target.target_id)), + &HSTRING::from(header_name(target.target_id)), ) .context("CreateFileMapping(IDD-push header)")?; let view = MapViewOfFile(map, FILE_MAP_ALL_ACCESS, 0, 0, bytes); @@ -431,7 +404,7 @@ impl IddPushCapturer { Some(&sa), false, false, - &HSTRING::from(evt_name(target.target_id)), + &HSTRING::from(event_name(target.target_id)), ) .context("CreateEvent(IDD-push)")?; @@ -719,14 +692,16 @@ impl IddPushCapturer { // Follow the display: a "Use HDR" flip recreates the ring at the matching format. self.poll_display_hdr(); let latest = self.latest(); - // `latest` = (generation << 40) | (seq << 8) | slot. Reject any publish whose generation isn't - // our CURRENT ring (a stale old-ring publish racing a recreate, or the 0 sentinel we reset to) so - // we never consume an unwritten new-ring slot — eliminating the toggle-time garbage frame. - if (latest >> 40) as u32 != self.generation { + // `latest` is the proto publish token `(generation << 40) | (seq << 8) | slot`. Reject any publish + // whose generation isn't our CURRENT ring (a stale old-ring publish racing a recreate, or the 0 + // sentinel we reset to) so we never consume an unwritten new-ring slot — eliminating the + // toggle-time garbage frame. 
+ let tok = frame::FrameToken::unpack(latest); + if tok.generation != self.generation { return Ok(None); } - let seq = (latest >> 8) & 0xFFFF_FFFF; - let slot = (latest & 0xff) as usize; + let seq = u64::from(tok.seq); + let slot = tok.slot as usize; if seq == self.last_seq || slot >= self.slots.len() { return Ok(None); } diff --git a/packaging/windows/drivers/pf-vdisplay/Cargo.toml b/packaging/windows/drivers/pf-vdisplay/Cargo.toml index fb75014..39ac53f 100644 --- a/packaging/windows/drivers/pf-vdisplay/Cargo.toml +++ b/packaging/windows/drivers/pf-vdisplay/Cargo.toml @@ -33,6 +33,7 @@ thiserror = "2.0" version = "0.58.0" features = [ "Win32_Foundation", + "Win32_System_Memory", "Win32_System_Threading", "Win32_Graphics_Direct3D", "Win32_Graphics_Direct3D11", diff --git a/packaging/windows/drivers/pf-vdisplay/src/callbacks.rs b/packaging/windows/drivers/pf-vdisplay/src/callbacks.rs index 3825cc6..05e5f19 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/callbacks.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/callbacks.rs @@ -214,7 +214,17 @@ pub unsafe extern "C" fn assign_swap_chain( if let Some(device) = crate::direct_3d_device::pooled_device(luid) { let mut processor = crate::swap_chain_processor::SwapChainProcessor::new(); - processor.run(swap_chain, device, new_frame_event, target_id); + // STEP 6: the publisher reports this render LUID into the host header so the host detects a + // render-adapter mismatch (it created the ring textures on its own GPU). `luid` is the OS-picked + // render adapter built above. + processor.run( + swap_chain, + device, + new_frame_event, + target_id, + luid.LowPart, + luid.HighPart, + ); // Install on the monitor; drop any processor it replaced (a race lost above) OUTSIDE the lock. 
drop(crate::monitor::set_swap_chain_processor(monitor, processor)); } else { diff --git a/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs b/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs index 4428b56..fe4ce5d 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs @@ -5,8 +5,9 @@ //! D3D/DXGI types are the `windows` crate (refcounted COM, no manual Drop); the swap-chain/LUID hand-off //! to the wdk-sys IddCx world happens via raw pointers in `swap_chain_processor.rs`. //! -//! STEP 5 only DRAINS the swap-chain to keep the monitor a live display — there is no frame publisher, -//! so the device's immediate context is unused here (it returns to use in STEP 6's `CopyResource`). +//! STEP 5 binds this device to the swap-chain to keep the monitor a live display; STEP 6 reuses the +//! device's immediate context in the frame publisher's `CopyResource` (both on the swap-chain processor +//! thread, the one thread this device is touched from). use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::{Arc, Mutex}; @@ -54,8 +55,6 @@ pub struct Direct3DDevice { pub device: ID3D11Device, /// The single (SINGLETHREADED) immediate context — used by STEP 6's frame-push publisher's /// `CopyResource` on the swap-chain processor thread (the one thread this device is touched from). - /// Unused in STEP 5 (drain-only); kept so the device matches the oracle exactly. - #[allow(dead_code)] pub device_context: ID3D11DeviceContext, } diff --git a/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs b/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs new file mode 100644 index 0000000..34e2358 --- /dev/null +++ b/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs @@ -0,0 +1,317 @@ +//! STEP 6 — IDD-push frame publisher (DRIVER side). +//! +//! 
The restricted WUDFHost token canNOT create named kernel objects (proven on the RTX box: it can't +//! even write a world-writable file), so — exactly like the gamepad UMDF drivers +//! (`crates/punktfunk-host/src/inject/dualsense_windows.rs`: *"the host creates the section, privileged, +//! with a permissive SDDL so the WUDFHost can open it; the driver maps it"*) — the **host** creates the +//! shared header + frame-ready event + ring of keyed-mutex textures, and the driver only **OPENS** them. +//! The driver writes its actual render-adapter LUID + a status code back into the host-created header (our +//! only driver-visibility channel: UMDF hides OutputDebugString in ETW and the token can't write files), +//! then copies each acquired swap-chain surface into the next ring slot and signals the host. +//! +//! Host counterpart: `crates/punktfunk-host/src/capture/idd_push.rs`. The shared `SharedHeader` layout, +//! the [`FrameToken`] packing, the `Global\` object-name scheme, the `MAGIC`/`RING_LEN` and the +//! `DRV_STATUS_*` codes are NOT hand-duplicated here: both sides `use pf_vdisplay_proto::frame::*`, which +//! OWNS the contract (with `const` size asserts so any drift is a compile error). +//! +//! Ported from the proven oracle (`packaging/windows/vdisplay-driver/pf-vdisplay/src/frame_transport.rs`). +//! Differences from the oracle: +//! * the layout/consts/names/token come from `pf_vdisplay_proto::frame` instead of being re-declared; +//! * `dbglog!` replaces `log::info!`; +//! * the optional fixed-name `Global\pfvd-dbg` `DebugBlock` bring-up channel is SKIPPED (not on the data +//! path). FOLLOW-UP: if the host bring-up diagnostics are needed again, port the oracle's `DebugBlock` +//! here too (it is owned by `idd_push.rs`, not the proto). 
+
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use pf_vdisplay_proto::frame::{
+    DRV_STATUS_NO_DEVICE1, DRV_STATUS_OPENED, DRV_STATUS_TEX_FAIL, FrameToken, MAGIC, RING_LEN,
+    SharedHeader, event_name, header_name, texture_name,
+};
+use windows::Win32::Foundation::{CloseHandle, E_PENDING, HANDLE};
+use windows::Win32::Graphics::Direct3D11::{
+    D3D11_TEXTURE2D_DESC, ID3D11Device, ID3D11Device1, ID3D11DeviceContext, ID3D11Texture2D,
+};
+use windows::Win32::Graphics::Dxgi::IDXGIKeyedMutex;
+use windows::Win32::System::Memory::{
+    FILE_MAP_ALL_ACCESS, MEMORY_MAPPED_VIEW_ADDRESS, MapViewOfFile, OpenFileMappingW,
+    UnmapViewOfFile,
+};
+use windows::Win32::System::Threading::{OpenEventW, SYNCHRONIZATION_ACCESS_RIGHTS, SetEvent};
+use windows::core::{HSTRING, Interface};
+
+/// `DXGI_SHARED_RESOURCE_READ | _WRITE` — passed to `OpenSharedResourceByName` (matches the host's
+/// `CreateSharedHandle` access). Kept local: it is an `OpenSharedResourceByName` arg, not part of the
+/// proto contract. (Same value the host uses in `idd_push.rs`.)
+const DXGI_SHARED_RESOURCE_RW: u32 = 0x8000_0000 | 0x1;
+/// SYNCHRONIZE | EVENT_MODIFY_STATE — the driver does not wait on the event, only SIGNALS it.
+const EVENT_ACCESS: u32 = 0x0010_0000 | 0x0002;
+/// `WAIT_TIMEOUT` as a raw HRESULT value — `AcquireSync` returns this when the slot is held by the
+/// consumer. It is NON-NEGATIVE (a "success"-class HRESULT), so it never surfaces as an `Err` from the
+/// windows-rs `Result` wrapper; `publish` compares it against the raw return value instead.
+const WAIT_TIMEOUT_HRESULT: i32 = 0x0000_0102;
+
+/// One ring entry: the host-created shared texture plus its keyed mutex.
+struct Slot {
+    tex: ID3D11Texture2D,
+    mutex: IDXGIKeyedMutex,
+}
+
+/// Publishes acquired swap-chain surfaces into the HOST-created ring. Owned by the swap-chain processor
+/// thread; attached lazily once the host has created the shared objects.
+pub struct FramePublisher {
+    context: ID3D11DeviceContext,
+    map: HANDLE,
+    header: *mut SharedHeader,
+    event: HANDLE,
+    slots: Vec<Slot>,
+    next: u32,
+    seq: u64,
+    /// The host-created ring textures' DXGI format (from the shared header). A swap-chain surface whose
+    /// format differs (e.g. an FP16 HDR frame vs a BGRA ring) is dropped in `publish` — `CopyResource`
+    /// needs matching formats.
+    ring_format: u32,
+    /// The ring generation this publisher attached to. The host BUMPS the header generation when it
+    /// recreates the ring at a new format mid-session (the display's HDR mode flipped) — [`Self::is_stale`]
+    /// detects that so `run_core` re-attaches to the new-format textures instead of dropping every frame.
+    generation: u32,
+}
+
+// SAFETY: created and used only on the swap-chain processor thread.
+unsafe impl Send for FramePublisher {}
+
+impl FramePublisher {
+    /// Try ONCE to attach to the host-created shared objects. Returns `Err` cheaply if the host hasn't
+    /// created/published them yet — the drain loop retries periodically, so a non-IDD-push session just
+    /// keeps draining with no stall. All early-return paths clean up the handles/mapping they opened
+    /// explicitly (raw-handle style, no RAII — matches the rest of this driver).
+    ///
+    /// # Errors
+    /// * the Win32 error of `OpenFileMappingW` / `MapViewOfFile` / `OpenEventW` when the host objects
+    ///   do not exist (yet) or cannot be opened;
+    /// * `E_PENDING` when the header exists but the host has not stamped `MAGIC` into it yet;
+    /// * the COM error of the `ID3D11Device1` cast or of opening a ring texture / its keyed mutex (the
+    ///   matching `DRV_STATUS_*` code is also written into the header for the host to read).
+    pub fn try_open(
+        target_id: u32,
+        render_luid_low: u32,
+        render_luid_high: i32,
+        device: &ID3D11Device,
+        context: &ID3D11DeviceContext,
+    ) -> windows::core::Result<Self> {
+        // 1. Open the host-created header (RW). Err if the host hasn't created it yet.
+        let map = unsafe {
+            OpenFileMappingW(
+                FILE_MAP_ALL_ACCESS.0,
+                false,
+                &HSTRING::from(header_name(target_id)),
+            )?
+        };
+        let view = unsafe {
+            MapViewOfFile(
+                map,
+                FILE_MAP_ALL_ACCESS,
+                0,
+                0,
+                core::mem::size_of::<SharedHeader>(),
+            )
+        };
+        if view.Value.is_null() {
+            // Read GetLastError FIRST: the cleanup call below may overwrite the thread's last-error
+            // value, and we want to report why MapViewOfFile failed.
+            let err = windows::core::Error::from_win32();
+            unsafe {
+                let _ = CloseHandle(map);
+            }
+            return Err(err);
+        }
+        let header = view.Value.cast::<SharedHeader>();
+
+        // 2. Report our render adapter to the host immediately (lets it detect a mismatch).
+        unsafe {
+            (*header).driver_render_luid_low = render_luid_low;
+            (*header).driver_render_luid_high = render_luid_high;
+        }
+
+        // 3. The host sets magic==MAGIC only once the ring textures exist. Not ready → retry later.
+        let magic = unsafe {
+            (*(core::ptr::addr_of!((*header).magic) as *const AtomicU32)).load(Ordering::Acquire)
+        };
+        if magic != MAGIC {
+            unsafe {
+                let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                    Value: header.cast(),
+                });
+                let _ = CloseHandle(map);
+            }
+            // No Win32 call failed on this path, so GetLastError holds nothing meaningful — return an
+            // explicit "not ready yet" error instead of whatever stale last-error happens to be set.
+            return Err(windows::core::Error::from(E_PENDING));
+        }
+        let (generation, ring_len) =
+            unsafe { ((*header).generation, (*header).ring_len.min(RING_LEN)) };
+
+        // 4. Open the event (SYNCHRONIZE | EVENT_MODIFY_STATE so we can SetEvent).
+        let event = match unsafe {
+            OpenEventW(
+                SYNCHRONIZATION_ACCESS_RIGHTS(EVENT_ACCESS),
+                false,
+                &HSTRING::from(event_name(target_id)),
+            )
+        } {
+            Ok(e) => e,
+            Err(e) => {
+                unsafe {
+                    let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                        Value: header.cast(),
+                    });
+                    let _ = CloseHandle(map);
+                }
+                return Err(e);
+            }
+        };
+
+        // 5. Open device1 + the ring textures the host created (same render adapter required).
+        let device1: ID3D11Device1 = match device.cast() {
+            Ok(d) => d,
+            Err(e) => {
+                unsafe {
+                    (*header).driver_status = DRV_STATUS_NO_DEVICE1;
+                    let _ = CloseHandle(event);
+                    let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                        Value: header.cast(),
+                    });
+                    let _ = CloseHandle(map);
+                }
+                return Err(e);
+            }
+        };
+        let mut slots = Vec::new();
+        for k in 0..ring_len {
+            let name = HSTRING::from(texture_name(target_id, generation, k));
+            let opened: windows::core::Result<ID3D11Texture2D> =
+                unsafe { device1.OpenSharedResourceByName(&name, DXGI_SHARED_RESOURCE_RW) };
+            match opened {
+                Ok(tex) => match tex.cast::<IDXGIKeyedMutex>() {
+                    Ok(mutex) => slots.push(Slot { tex, mutex }),
+                    Err(e) => {
+                        unsafe {
+                            (*header).driver_status = DRV_STATUS_TEX_FAIL;
+                            (*header).driver_status_detail = e.code().0 as u32;
+                            let _ = CloseHandle(event);
+                            let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                                Value: header.cast(),
+                            });
+                            let _ = CloseHandle(map);
+                        }
+                        return Err(e);
+                    }
+                },
+                Err(e) => {
+                    // Most likely a render-adapter mismatch (the host made the textures on a different
+                    // GPU than the swap-chain renders on). Tell the host so it can report it.
+                    unsafe {
+                        (*header).driver_status = DRV_STATUS_TEX_FAIL;
+                        (*header).driver_status_detail = e.code().0 as u32;
+                        let _ = CloseHandle(event);
+                        let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS {
+                            Value: header.cast(),
+                        });
+                        let _ = CloseHandle(map);
+                    }
+                    return Err(e);
+                }
+            }
+        }
+
+        unsafe {
+            (*header).driver_status = DRV_STATUS_OPENED;
+        }
+        dbglog!(
+            "[pf-vd] frame-push(driver): attached to host ring gen {generation} ({ring_len} slots)"
+        );
+        Ok(Self {
+            context: context.clone(),
+            map,
+            header,
+            event,
+            slots,
+            next: 0,
+            seq: 0,
+            ring_format: unsafe { (*header).dxgi_format },
+            generation,
+        })
+    }
+
+    /// The header's `latest` field viewed as an atomic (the publish token is stored through it).
+    #[inline]
+    fn latest_cell(&self) -> &AtomicU64 {
+        unsafe { &*(core::ptr::addr_of!((*self.header).latest) as *const AtomicU64) }
+    }
+
+    /// True once the host has recreated the ring (bumped the header generation) — e.g. the display's HDR
+    /// mode flipped, so the ring format changed (FP16 ⇄ BGRA) and the texture names now carry a new
+    /// generation. `run_core` drops the publisher on this so it re-attaches to the new ring.
+    pub fn is_stale(&self) -> bool {
+        let cur = unsafe {
+            (*(core::ptr::addr_of!((*self.header).generation) as *const AtomicU32))
+                .load(Ordering::Acquire)
+        };
+        cur != self.generation
+    }
+
+    /// Copy `surface` into the next free ring slot and signal the host. Never blocks (0 ms try-acquire).
+    ///
+    /// The frame is silently dropped when the ring is empty, when the surface format differs from the
+    /// ring format, when every slot is currently held by the consumer, or when a slot's keyed mutex
+    /// reports anything other than "acquired" / "timed out".
+    pub fn publish(&mut self, surface: &ID3D11Texture2D) {
+        let ring_len = self.slots.len() as u32;
+        if ring_len == 0 {
+            return;
+        }
+        // Format guard: `CopyResource` needs the surface + ring textures to share a DXGI format. Drop a
+        // frame that doesn't match (e.g. an FP16 HDR surface arriving while the ring is still BGRA, before
+        // the host recreates the ring as FP16) instead of corrupting / failing the copy.
+        let mut desc = D3D11_TEXTURE2D_DESC::default();
+        unsafe { surface.GetDesc(&mut desc) };
+        if desc.Format.0 as u32 != self.ring_format {
+            return;
+        }
+        let start = self.next;
+        for attempt in 0..ring_len {
+            let slot = (start + attempt) % ring_len;
+            let s = &self.slots[slot as usize];
+            // `AcquireSync` reports "slot busy" as WAIT_TIMEOUT (0x102) and a broken mutex as
+            // WAIT_ABANDONED (0x80). Both are NON-NEGATIVE HRESULTs, and the windows-rs wrapper maps
+            // every non-negative HRESULT to `Ok(())` — so going through the `Result` API would treat a
+            // slot the host still holds as acquired (copying into it without ownership and publishing
+            // it). Call the vtable entry directly to see the raw HRESULT.
+            // SAFETY: `s.mutex` is a live COM interface owned by this publisher; this is the same call
+            // the generated wrapper makes, minus the HRESULT → Result conversion.
+            let hr = unsafe {
+                (Interface::vtable(&s.mutex).AcquireSync)(Interface::as_raw(&s.mutex), 0, 0)
+            };
+            match hr.0 {
+                // S_OK: we own the slot.
+                0 => {
+                    // STRAIGHT-LINE, NO `?` between acquire + release — a `?`-return here would leak the
+                    // keyed-mutex lock and wedge the host on this slot. The ordering below is load-bearing:
+                    // the CopyResource is GPU-ordered before the consumer via the slot keyed mutex, and the
+                    // `latest` store (Release) publishes the slot only AFTER the copy is queued + the mutex
+                    // released.
+                    unsafe {
+                        self.context.CopyResource(&s.tex, surface);
+                        let _ = s.mutex.ReleaseSync(0);
+                    }
+                    self.seq = self.seq.wrapping_add(1);
+                    // `latest` = (generation << 40) | (seq << 8) | slot, packed by the proto's `FrameToken`
+                    // (single source of truth — the host unpacks with the same type). Stamping the generation
+                    // lets the host REJECT a publish from a stale ring (an old-generation publisher racing the
+                    // host's mid-session ring recreate) so it never consumes an unwritten new-ring slot.
+                    // The `as` casts truncate on purpose: the token carries the low 32 bits of `seq`, and
+                    // `slot < ring_len <= RING_LEN` (assumed to fit in a `u8`).
+                    let latest = FrameToken {
+                        generation: self.generation,
+                        seq: self.seq as u32,
+                        slot: slot as u8,
+                    }
+                    .pack();
+                    self.latest_cell().store(latest, Ordering::Release);
+                    unsafe {
+                        let _ = SetEvent(self.event);
+                    }
+                    self.next = (slot + 1) % ring_len;
+                    return;
+                }
+                // Slot held by the consumer — try the next one.
+                WAIT_TIMEOUT_HRESULT => continue,
+                // WAIT_ABANDONED or a real failure HRESULT: we do NOT own the slot, so neither copy nor
+                // release — drop this frame.
+                _ => return,
+            }
+        }
+        // All slots busy — drop this frame (never block the swap-chain thread).
+    }
+}
+
+impl Drop for FramePublisher {
+    fn drop(&mut self) {
+        // Slots FIRST (release the shared textures + keyed mutexes), THEN unmap the header, THEN the
+        // handles.
+ self.slots.clear(); + unsafe { + if !self.header.is_null() { + let _ = UnmapViewOfFile(MEMORY_MAPPED_VIEW_ADDRESS { + Value: self.header.cast(), + }); + } + let _ = CloseHandle(self.event); + let _ = CloseHandle(self.map); + } + } +} diff --git a/packaging/windows/drivers/pf-vdisplay/src/lib.rs b/packaging/windows/drivers/pf-vdisplay/src/lib.rs index 305f521..73e8c80 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/lib.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/lib.rs @@ -18,6 +18,7 @@ mod control; mod direct_3d_device; mod edid; mod entry; +mod frame_transport; mod monitor; mod swap_chain_processor; diff --git a/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs b/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs index 57cc2fc..50c196f 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs @@ -1,18 +1,20 @@ -//! The swap-chain processor (STEP 5): a worker thread that DRAINS the IddCx swap-chain so the virtual -//! monitor stays a usable display. +//! The swap-chain processor (STEP 5 + STEP 6): a worker thread that DRAINS the IddCx swap-chain (so the +//! virtual monitor stays a usable display) and PUBLISHES each acquired surface into the host-created +//! shared ring (the IDD-push path). //! -//! The OS presents the composited desktop to the driver through a swap-chain; the driver MUST consume -//! it (acquire → finished-processing) or the monitor stalls. STEP 5 binds our render device to the -//! swap-chain (`IddCxSwapChainSetDevice`) and loops acquire/finish, discarding each frame. It does NOT -//! publish frames to the host — that is STEP 6 (the `CopyResource` of `out.MetaData.pSurface` into a -//! shared ring), deliberately omitted here. +//! The OS presents the composited desktop to the driver through a swap-chain; the driver MUST consume it +//! (acquire → finished-processing) or the monitor stalls. 
STEP 5 binds our render device to the swap-chain +//! (`IddCxSwapChainSetDevice`) and loops acquire/finish. STEP 6 lazily attaches a [`FramePublisher`] to +//! the host's shared ring and, on each acquired frame, `CopyResource`s `out.MetaData.pSurface` into the +//! next ring slot before finishing the frame (a non-IDD-push session simply never attaches and keeps +//! draining). //! //! Ported from the proven oracle (`packaging/windows/vdisplay-driver/pf-vdisplay/src/ //! swap_chain_processor.rs`) onto wdk-sys + wdk-iddcx. The oracle's `wdf_umdf`/`wdf_umdf_sys` are //! replaced by `wdk_sys::iddcx::*` + the `wdk_iddcx` DDI wrappers. Those wrappers return a RAW //! `NTSTATUS` (`i32`) that is HRESULT-shaped for the swap-chain DDIs, so we classify it by hand //! (`hr >= 0` = success; `0x8000_000A` = E_PENDING; `hr < 0 && != E_PENDING` = error) rather than with -//! `nt_success`. The publisher + `render_luid_low/high` params are dropped (STEP 6). +//! `nt_success`. use std::{ mem::size_of, @@ -35,7 +37,10 @@ use wdk_sys::{HANDLE, NTSTATUS, WDFOBJECT, call_unsafe_wdf_function_binding}; use windows::{ Win32::{ Foundation::HANDLE as WHANDLE, - Graphics::Dxgi::IDXGIDevice, + Graphics::{ + Direct3D11::ID3D11Texture2D, + Dxgi::{IDXGIDevice, IDXGIResource}, + }, System::Threading::{ AvRevertMmThreadCharacteristics, AvSetMmThreadCharacteristicsW, WaitForSingleObject, }, @@ -43,7 +48,7 @@ use windows::{ core::{Interface, w}, }; -use crate::direct_3d_device::Direct3DDevice; +use crate::{direct_3d_device::Direct3DDevice, frame_transport::FramePublisher}; /// E_PENDING — `ReleaseAndAcquireBuffer2` returns this (HRESULT-shaped) when the swap-chain is valid but /// DWM has composed no new frame yet; wait on the surface-available event and retry. 
@@ -89,6 +94,8 @@ impl SwapChainProcessor { device: Arc, available_buffer_event: HANDLE, target_id: u32, + render_luid_low: u32, + render_luid_high: i32, ) { let available_buffer_event = Sendable(available_buffer_event); let swap_chain = Sendable(swap_chain); @@ -117,6 +124,8 @@ impl SwapChainProcessor { available_buffer_event.0, &terminate, target_id, + render_luid_low, + render_luid_high, ); dbglog!( @@ -147,6 +156,8 @@ impl SwapChainProcessor { available_buffer_event: HANDLE, terminate: &AtomicBool, target_id: u32, + render_luid_low: u32, + render_luid_high: i32, ) { // SetDevice fails (0x887A0026, FACILITY_DXGI) when the monitor briefly flaps INACTIVE during // topology activation — the OS unassigns + re-assigns the swap-chain, and a fresh run_core thread @@ -208,6 +219,13 @@ impl SwapChainProcessor { return; } + // STEP 6 IDD-push: lazily ATTACH to the HOST-created shared ring. The restricted UMDF token can't + // create named objects, so the host creates the header + event + textures and we only OPEN them + // once they appear (`try_open`). Until then we just drain — exactly the STEP-5 behaviour — so a + // non-IDD-push session never stalls. Retried every ~30 loop iterations. + let mut publisher: Option = None; + let mut frames_since_try: u32 = u32::MAX; // attach attempt on the first loop iteration + let mut logged_pending = false; let mut logged_frame = false; loop { @@ -221,9 +239,40 @@ impl SwapChainProcessor { break; } + // The host recreates the shared ring (new format) mid-session when the display's HDR mode + // flips — it bumps the header generation. Detect that and drop the publisher so we re-attach to + // the new-format textures below; otherwise we'd keep CopyResource'ing into the stale ring, whose + // format now mismatches the surface → the publish() format-guard drops every frame and the + // stream freezes until the next swap-chain recreate. 
+ if publisher.as_ref().is_some_and(FramePublisher::is_stale) { + publisher = None; + frames_since_try = u32::MAX; // re-attach immediately + } + // Lazy-attach (rate-limited) at the loop TOP so we keep trying even while the display is idle + // (E_PENDING / no frames presented yet), not only when a frame is acquired. `try_open` is a + // cheap OpenFileMapping that fails fast until the host has created the ring. + if publisher.is_none() { + if frames_since_try >= 30 { + frames_since_try = 0; + // `if let Ok` (not a `match` with an empty `Err` arm) keeps clippy's `single_match` + // happy under `-D warnings`; semantics are identical — attach on success, retry on Err. + if let Ok(p) = FramePublisher::try_open( + target_id, + render_luid_low, + render_luid_high, + &device.device, + &device.device_context, + ) { + publisher = Some(p); + } + } else { + frames_since_try += 1; + } + } + // ...Buffer2 is required once CAN_PROCESS_FP16 is set. AcquireSystemMemoryBuffer=FALSE keeps - // the GPU surface (out.MetaData.pSurface). STEP 5 only drains — it does NOT publish the - // surface (STEP 6 will). Built zeroed + field-assigned (driver style) so a bindgen field-set + // the GPU surface (out.MetaData.pSurface) — STEP 6 publishes it into the shared ring in the + // success branch below. Built zeroed + field-assigned (driver style) so a bindgen field-set // difference can't break a positional struct literal. let mut in_args: IDARG_IN_RELEASEANDACQUIREBUFFER2 = unsafe { core::mem::zeroed() }; #[allow(clippy::cast_possible_truncation)] @@ -275,9 +324,23 @@ impl SwapChainProcessor { ); logged_frame = true; } - // STEP 6 publishes `buffer.MetaData.pSurface` into the shared ring HERE (the surface is - // valid until the next ReleaseAndAcquire). STEP 5 only drains, so we immediately finish - // the frame. + // STEP 6: copy the acquired surface into the shared ring BEFORE FinishedProcessingFrame + // (the surface is valid until the next ReleaseAndAcquire). 
The pointer is BORROWED — + // `from_raw_borrowed` does NOT take IddCx's refcount — and the GPU-side copy is ordered + // before the consumer via the slot keyed mutex. (Attach happens at the loop top.) + if let Some(p) = publisher.as_mut() { + let raw = buffer.MetaData.pSurface as *mut core::ffi::c_void; + if !raw.is_null() { + // SAFETY: `raw` is IddCx's live surface pointer (valid until the next + // ReleaseAndAcquire); `from_raw_borrowed` does not consume the refcount. + if let Some(res) = unsafe { IDXGIResource::from_raw_borrowed(&raw) } { + if let Ok(tex) = res.cast::() { + p.publish(&tex); + } + } + } + } + // SAFETY: driver is loaded; `swap_chain` is valid. let hr = unsafe { wdk_iddcx::IddCxSwapChainFinishedProcessingFrame(swap_chain) }; if !hr_success(hr) { diff --git a/packaging/windows/drivers/wdk-probe/build.rs b/packaging/windows/drivers/wdk-probe/build.rs index b294607..f97e5bd 100644 --- a/packaging/windows/drivers/wdk-probe/build.rs +++ b/packaging/windows/drivers/wdk-probe/build.rs @@ -37,5 +37,7 @@ fn link_iddcx_stub() { } } } - panic!("IddCxStub.lib not found under any Windows Kits Lib\\\\um\\{ARCH}\\iddcx\\\\"); + panic!( + "IddCxStub.lib not found under any Windows Kits Lib\\\\um\\{ARCH}\\iddcx\\\\" + ); } diff --git a/packaging/windows/drivers/wdk-probe/src/iddcx_rt.rs b/packaging/windows/drivers/wdk-probe/src/iddcx_rt.rs index 9381d8c..289a707 100644 --- a/packaging/windows/drivers/wdk-probe/src/iddcx_rt.rs +++ b/packaging/windows/drivers/wdk-probe/src/iddcx_rt.rs @@ -8,9 +8,9 @@ #![allow(non_snake_case)] use wdk_sys::iddcx::{ - IDARG_IN_ADAPTER_INIT, IDARG_OUT_ADAPTER_INIT, IDD_CX_CLIENT_CONFIG, IddDriverGlobals, - IddFunctions, PFN_IDDCXADAPTERINITASYNC, PFN_IDDCXDEVICEINITCONFIG, PFN_IDDCXDEVICEINITIALIZE, - PFN_IDD_CX, PIDD_DRIVER_GLOBALS, _IDDFUNCENUM, + _IDDFUNCENUM, IDARG_IN_ADAPTER_INIT, IDARG_OUT_ADAPTER_INIT, IDD_CX_CLIENT_CONFIG, + IddDriverGlobals, IddFunctions, PFN_IDD_CX, PFN_IDDCXADAPTERINITASYNC, + PFN_IDDCXDEVICEINITCONFIG, 
PFN_IDDCXDEVICEINITIALIZE, PIDD_DRIVER_GLOBALS, }; use wdk_sys::{NTSTATUS, PWDFDEVICE_INIT, WDFDEVICE}; diff --git a/packaging/windows/drivers/wdk-probe/src/lib.rs b/packaging/windows/drivers/wdk-probe/src/lib.rs index a67ee2d..a1fae99 100644 --- a/packaging/windows/drivers/wdk-probe/src/lib.rs +++ b/packaging/windows/drivers/wdk-probe/src/lib.rs @@ -11,8 +11,9 @@ mod iddcx_surface_assert; use wdk_sys::iddcx::{IDARG_IN_ADAPTER_INIT, IDARG_OUT_ADAPTER_INIT, IDD_CX_CLIENT_CONFIG}; use wdk_sys::{ - call_unsafe_wdf_function_binding, NTSTATUS, PCUNICODE_STRING, PDRIVER_OBJECT, PWDFDEVICE_INIT, - ULONG, WDFDEVICE, WDFDRIVER, WDF_DRIVER_CONFIG, WDF_NO_HANDLE, WDF_NO_OBJECT_ATTRIBUTES, + NTSTATUS, PCUNICODE_STRING, PDRIVER_OBJECT, PWDFDEVICE_INIT, ULONG, WDF_DRIVER_CONFIG, + WDF_NO_HANDLE, WDF_NO_OBJECT_ATTRIBUTES, WDFDEVICE, WDFDRIVER, + call_unsafe_wdf_function_binding, }; const STATUS_SUCCESS: NTSTATUS = 0;