From 0da9d8ec10e369a622582564720e5eed93370f42 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Fri, 3 Jul 2026 16:27:13 +0000 Subject: [PATCH] =?UTF-8?q?fix(windows):=20IDD-push=20audit=20highs=20?= =?UTF-8?q?=E2=80=94=20keyed-mutex=20timeout,=20two=20per-frame=20leaks,?= =?UTF-8?q?=20IDD=5FPUSH=20knob,=20pooled-device=20threading?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five verified findings from the IDD-push/pf-vdisplay deep audit: - Keyed-mutex acquire (BOTH endpoints): AcquireSync returns WAIT_TIMEOUT (0x102) / WAIT_ABANDONED (0x80) as SUCCESS-severity HRESULTs, which the windows-rs Result wrapper erases — a busy slot read as "acquired", so driver and host could race the same ring texture (torn frames) and the designed busy-skip backpressure was dead code. Both sides now classify the raw vtable HRESULT; WAIT_ABANDONED counts as acquired (ownership transfers — refusing it would wedge the slot forever). - Host SDR hot path leaked one ID3D11VideoProcessorInputView per converted frame: the D3D11_VIDEO_PROCESSOR_STREAM ManuallyDrop field suppressed the release after VideoProcessorBlt. Released by hand now, success or not. - Driver leaked IddCx's per-acquire surface reference (from_raw_borrowed on a TRANSFERRED reference — the MS sample Attach/Reset's it): the swap-chain surface set survived swap-chain destruction, the likely true root cause of the ~50 MB-per-reconnect VRAM loss that device pooling only mitigated. Now adopted via from_raw (publisher or not) and dropped pre-Finished. - PUNKTFUNK_IDD_PUSH removed: capture is unconditionally IDD-push, but the vdisplay manager still gated the lingering-monitor preempt (and render pin) on the knob, whose default was OFF — dev/CLI runs reused a lingering monitor whose IddCx swap-chain is dead (black reconnect). The preempt and the render-GPU pin are now unconditional; host.env comments no longer promise the removed DDA/WGC fallback. - Driver D3D device: dropped D3D11_CREATE_DEVICE_SINGLETHREADED (unsound since DEVICE_POOL shares one device across processors) and the pooled immediate context is now SetMultithreadProtected — two concurrent monitors' workers otherwise race an unlocked context (UB in the UMD). No wire-contract change (pf-driver-proto untouched); the driver fixes take effect on the next pf-vdisplay redeploy. Co-Authored-By: Claude Fable 5 --- .../src/capture/windows/dxgi.rs | 12 ++++-- .../src/capture/windows/idd_push.rs | 26 +++++++++--- crates/punktfunk-host/src/config.rs | 21 ++-------- .../src/vdisplay/windows/manager.rs | 34 ++++----------- crates/punktfunk-host/src/windows/service.rs | 5 +-- docs-site/content/docs/configuration.md | 3 +- .../pf-vdisplay/src/direct_3d_device.rs | 42 +++++++++++++++---- .../pf-vdisplay/src/frame_transport.rs | 26 +++++++++--- .../pf-vdisplay/src/swap_chain_processor.rs | 23 ++++++---- scripts/windows/host.env.example | 6 +-- 10 files changed, 119 insertions(+), 79 deletions(-) diff --git a/crates/punktfunk-host/src/capture/windows/dxgi.rs b/crates/punktfunk-host/src/capture/windows/dxgi.rs index 807b28d..12c0bd7 100644 --- a/crates/punktfunk-host/src/capture/windows/dxgi.rs +++ b/crates/punktfunk-host/src/capture/windows/dxgi.rs @@ -1129,8 +1129,14 @@ impl VideoConverter { pInputSurface: std::mem::ManuallyDrop::new(in_view), ..Default::default() }; - self.vctx - .VideoProcessorBlt(&self.vp, &out_view, 0, &[stream]) - .context("VideoProcessorBlt") + let blt = + self.vctx + .VideoProcessorBlt(&self.vp, &out_view, 0, std::slice::from_ref(&stream)); + // COM in-params never transfer ownership: the Blt only borrowed the input view, and the + // struct's `ManuallyDrop` field suppressed its release — drop it by hand, success or not. + // (Skipping this leaked one view + its UMD allocation PER CONVERTED FRAME — the SDR hot + // path; D3D11 defers the actual destruction until the GPU is done with the blit.) + drop(std::mem::ManuallyDrop::into_inner(stream.pInputSurface)); + blt.context("VideoProcessorBlt") } } diff --git a/crates/punktfunk-host/src/capture/windows/idd_push.rs b/crates/punktfunk-host/src/capture/windows/idd_push.rs index 003dcc8..3e2a97b 100644 --- a/crates/punktfunk-host/src/capture/windows/idd_push.rs +++ b/crates/punktfunk-host/src/capture/windows/idd_push.rs @@ -10,7 +10,7 @@ //! target process's handle table, so the bootstrap's ACL is not load-bearing; the only way to reach the //! frames is to already be one of the two endpoint processes. The driver copies frames in; we consume //! the ring straight into the zero-copy NVENC path — no DXGI Desktop Duplication, no `win32u` hook. -//! Gated by `PUNKTFUNK_IDD_PUSH`. Driver counterpart: `packaging/windows/drivers/pf-vdisplay/src/ +//! The SOLE Windows capture path. Driver counterpart: `packaging/windows/drivers/pf-vdisplay/src/ //! frame_transport.rs`. The shared `SharedHeader` layout, `MAGIC`/`VERSION`/`RING_LEN`, the //! `DRV_STATUS_*` codes, the channel-delivery struct and the publish token all come from //! [`pf_driver_proto`] (which OWNS the contract, with `const` size asserts) — both sides `use` it, so @@ -133,7 +133,7 @@ struct HostSlot { shared: OwnedHandle, /// SRV on the slot texture so the HDR path samples the FP16 slot DIRECTLY (no slot→scratch copy); /// the convert pass writes the output ring while holding the slot's keyed mutex. Unused for SDR - /// (which CopyResource's the BGRA slot straight to the output). + /// (which converts the BGRA slot → NV12 on the video engine, via its own per-frame input view). srv: ID3D11ShaderResourceView, } @@ -147,6 +147,13 @@ struct KeyedMutexGuard<'a> { key: u64, } +/// `WAIT_ABANDONED` as an HRESULT: the driver died while holding the slot's keyed mutex — ownership +/// still transferred to this caller. SUCCESS-severity (positive), like `WAIT_TIMEOUT` (0x102): the +/// windows-rs `Result` wrapper erases both (`.ok()` maps every non-negative HRESULT to `Ok(())`), so +/// acquisition MUST be classified on the raw vtable HRESULT. Mirrors the driver's constants +/// (`frame_transport.rs`). +const WAIT_ABANDONED_HRESULT: i32 = 0x0000_0080; + impl<'a> KeyedMutexGuard<'a> { /// Acquire `mutex` at `key`, waiting up to `timeout_ms`. `None` if the acquire times out / errors /// (the caller skips the frame), so the guard is only ever held when the lock is genuinely held. @@ -156,10 +163,19 @@ impl<'a> KeyedMutexGuard<'a> { timeout_ms: u32, ) -> Option> { // SAFETY: `mutex` is a live `IDXGIKeyedMutex` on this thread's immediate-context device. - if unsafe { mutex.AcquireSync(key, timeout_ms) }.is_err() { - return None; + // Raw vtable call, NOT the `Result` wrapper: `.is_err()` treated WAIT_TIMEOUT (positive = + // `Ok`) as acquired, handing out a guard for a slot the DRIVER still held — converting from + // a texture mid-copy (torn frame) and `ReleaseSync`ing a key this side never took. + let hr = unsafe { + (Interface::vtable(mutex).AcquireSync)(Interface::as_raw(mutex), key, timeout_ms) + }; + match hr.0 { + // Acquired — S_OK, or WAIT_ABANDONED (the driver died holding the slot: the lock is + // OURS now, and refusing the guard would leave the key held forever, wedging the slot). + 0 | WAIT_ABANDONED_HRESULT => Some(KeyedMutexGuard { mutex, key }), + // WAIT_TIMEOUT (slot busy — the caller skips this frame) or a genuine error: never held. + _ => None, } - Some(KeyedMutexGuard { mutex, key }) } } diff --git a/crates/punktfunk-host/src/config.rs b/crates/punktfunk-host/src/config.rs index 91676ce..b0b7e35 100644 --- a/crates/punktfunk-host/src/config.rs +++ b/crates/punktfunk-host/src/config.rs @@ -5,7 +5,7 @@ //! process lifetime**, so a lazily-parsed global is equivalent to "parsed once at startup". //! //! **Goal-1 stages 1–2** (`design/windows-host-rewrite.md` §2.2): stage 1 stood this up; stage 2 migrated the -//! genuinely-constant operator/dispatch knobs onto it (the dispatch-disagreement bug class: `idd_push`, +//! genuinely-constant operator/dispatch knobs onto it (the dispatch-disagreement bug class: //! `encoder_pref`, `render_adapter`, the vdisplay backend select — plus the plan-named //! `idd_depth`/`zerocopy`/`ten_bit`/`four_four_four` and the multi-site `perf`/`compositor`/ //! `video_source`/`gamepad`). `SessionPlan` (stage 3) consumes it as the single owner of the @@ -36,12 +36,6 @@ use std::sync::OnceLock; /// derived `Debug` impl, so the parser can stay a single platform-neutral function. #[derive(Debug, Clone, Default)] pub struct HostConfig { - /// `PUNKTFUNK_IDD_PUSH` — IDD direct-push monitor mode (the per-session monitor + ring recreate and - /// the discrete-render-GPU pin in [`crate::vdisplay::manager`]). IDD-push is the sole Windows capture - /// path (DXGI Desktop Duplication and the WGC relay were removed), so this should stay on — the - /// installer's `host.env` sets it. **Value-aware** (`0`/`false`/`no`/`off`/empty ⇒ off, else on); - /// unset ⇒ off. NOT a bare presence flag (so an operator can turn it OFF with `=0`). - pub idd_push: bool, /// `PUNKTFUNK_ENCODER` — explicit encoder-backend override (lowercased; empty = auto-detect by GPU vendor). pub encoder_pref: String, /// `PUNKTFUNK_RENDER_ADAPTER` — discrete render-GPU pin by description substring (`Some` even when empty: @@ -80,16 +74,9 @@ impl HostConfig { // String value: `var(k).ok()` — `Some` (possibly empty) when set with valid UTF-8, else `None`. let val = |k: &str| std::env::var(k).ok(); Self { - // Value-aware (not a bare presence flag): the shipped default `host.env` turns it ON, and an - // operator turns it OFF with `PUNKTFUNK_IDD_PUSH=0` (a `var_os` presence check would read `=0` - // as "on"). Unset ⇒ off (the dev / non-pf-driver default). - idd_push: match std::env::var("PUNKTFUNK_IDD_PUSH") { - Ok(v) => !matches!( - v.trim().to_ascii_lowercase().as_str(), - "" | "0" | "false" | "no" | "off" - ), - Err(_) => false, - }, + // (`PUNKTFUNK_IDD_PUSH` was removed: IDD-push is the sole Windows capture path, so the knob + // only split dispatch — capture ignored it while the vdisplay manager obeyed it, and `=0` + // produced dead-swap-chain reuse on reconnect. A stale setting in an old host.env is ignored.) encoder_pref: std::env::var("PUNKTFUNK_ENCODER") .unwrap_or_default() .to_ascii_lowercase(), diff --git a/crates/punktfunk-host/src/vdisplay/windows/manager.rs b/crates/punktfunk-host/src/vdisplay/windows/manager.rs index 522a809..3f4743e 100644 --- a/crates/punktfunk-host/src/vdisplay/windows/manager.rs +++ b/crates/punktfunk-host/src/vdisplay/windows/manager.rs @@ -240,7 +240,7 @@ impl VirtualDisplayManager { // NOT a reconnect. Preempting Active would tear a live session down AND churn REMOVE→ADD on every // retry — the per-cold-start monitor churn that exhausts the IddCx slot pool and wedges ADD at // 0x80070490. Active falls through to the JOIN path below (refcount++, no ADD). - if idd_push_mode() && matches!(*state, MgrState::Lingering { .. }) { + if matches!(*state, MgrState::Lingering { .. }) { if let MgrState::Lingering { mon, .. } = std::mem::replace(&mut *state, MgrState::Idle) { tracing::info!( @@ -676,31 +676,15 @@ impl Drop for MonitorLease { } } -/// IDD-push mode: a new client connection preempts + recreates the monitor (single-client reconnect), -/// because a REUSED IddCx monitor's swap-chain is dead. Off → monitors are shared across sessions. -fn idd_push_mode() -> bool { - crate::config::config().idd_push -} - -/// The render-GPU pin decision (backend-neutral): pin the discrete render GPU when explicitly requested, -/// or under IDD-push (the host runs NVENC on the render adapter, so it MUST be the discrete encoder GPU -/// on a hybrid box). `None` = let the IDD use its natural adapter (Apollo parity — avoids the cross-GPU -/// ACCESS_LOST storm SudoVDA hit when pinned). +/// The render-GPU pin (backend-neutral): IDD-push — the sole Windows capture path — runs NVENC on the +/// render adapter, so it must always be pinned to the selected encoder GPU (a hybrid box would +/// otherwise render on the wrong one). The selection itself (web-console preference > +/// `PUNKTFUNK_RENDER_ADAPTER` > max VRAM) lives in [`crate::win_adapter::resolve_render_adapter_luid`]. +/// (This was gated on the removed `PUNKTFUNK_IDD_PUSH` knob — a dispatch disagreement, since capture +/// stopped consulting it when DDA/WGC were removed.) fn resolve_render_pin() -> Option { - // A web-console manual GPU preference pins exactly like PUNKTFUNK_RENDER_ADAPTER: the whole - // pipeline (driver render device, capture ring, encoder) must sit on the chosen adapter. - let manual_pref = crate::gpu::prefs().get().mode == crate::gpu::GpuMode::Manual; - if crate::config::config().render_adapter.is_some() || manual_pref { - crate::win_adapter::resolve_render_adapter_luid() - } else if crate::config::config().idd_push { - tracing::info!("IDD push: pinning the discrete render GPU (SET_RENDER_ADAPTER)"); - crate::win_adapter::resolve_render_adapter_luid() - } else { - tracing::info!( - "SET_RENDER_ADAPTER skipped (Apollo-parity: no render pin; set PUNKTFUNK_RENDER_ADAPTER= to force one)" - ); - None - } + tracing::info!("IDD push: pinning the render GPU (SET_RENDER_ADAPTER)"); + crate::win_adapter::resolve_render_adapter_luid() } /// Linger window before a session-less monitor is torn down (default 10 s; `PUNKTFUNK_MONITOR_LINGER_MS`). diff --git a/crates/punktfunk-host/src/windows/service.rs b/crates/punktfunk-host/src/windows/service.rs index e015952..f756ca6 100644 --- a/crates/punktfunk-host/src/windows/service.rs +++ b/crates/punktfunk-host/src/windows/service.rs @@ -797,10 +797,9 @@ fn ensure_default_host_env() -> Result<()> { # Force one with nvenc | amf | qsv | sw (software H.264). amf/qsv need an FFmpeg-built host.\n\ PUNKTFUNK_ENCODER=auto\n\ PUNKTFUNK_VIDEO_SOURCE=virtual\n\ - # Virtual display = the bundled pf-vdisplay driver; capture from its shared ring (the validated\n\ - # zero-copy IDD-push path; falls back to DDA if it can't attach). Set PUNKTFUNK_IDD_PUSH=0 to force WGC/DDA.\n\ + # Virtual display = the bundled pf-vdisplay driver; capture is IDD-push from its shared ring\n\ + # (the sole capture path — zero-copy, includes the secure desktop; DDA/WGC were removed).\n\ PUNKTFUNK_VDISPLAY=pf\n\ - PUNKTFUNK_IDD_PUSH=1\n\ PUNKTFUNK_SECURE_DDA=1\n\ RUST_LOG=info\n\ \n\ diff --git a/docs-site/content/docs/configuration.md b/docs-site/content/docs/configuration.md index c5e33b6..657a3bb 100644 --- a/docs-site/content/docs/configuration.md +++ b/docs-site/content/docs/configuration.md @@ -98,9 +98,8 @@ picture. | Setting | Values | Meaning | |---|---|---| | `PUNKTFUNK_VDISPLAY` | `pf` | Virtual-display backend. The bundled pf-vdisplay IddCx driver is the only backend now — informational; leave as `pf`. | -| `PUNKTFUNK_IDD_PUSH` | `1` · `0` | Capture straight from the pf-vdisplay driver's shared ring (the validated zero-copy path, incl. the secure desktop). Set `0` to force WGC/DDA capture. | | `PUNKTFUNK_SECURE_DDA` | `1` | Capture the secure desktop (UAC / lock / login) so the stream survives those transitions. | -| `PUNKTFUNK_MONITOR_LINGER_MS` | ms (default `10000`) | Keep a per-client virtual display alive briefly after disconnect so a quick reconnect reuses it (no display connect/disconnect chime). | +| `PUNKTFUNK_MONITOR_LINGER_MS` | ms (default `10000`) | Defer tearing a per-client virtual display down after disconnect. A reconnect inside the window preempts it and creates a fresh one (a reused IddCx swap-chain is dead); the stable per-client monitor id keeps Windows' saved display config applying either way. | | `PUNKTFUNK_RENDER_ADAPTER` | description substring | Multi-GPU boxes only: force the NVENC/capture GPU by adapter Description substring (e.g. `4090`). Leave unset on single-GPU machines. | | `PUNKTFUNK_HOST_CMD` | e.g. `serve --gamestream` | The host subcommand the service launches. Default `serve --gamestream`; use `serve` for a secure native-only host. | diff --git a/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs b/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs index 9c5e2ce..57d2e27 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/direct_3d_device.rs @@ -6,27 +6,30 @@ //! to the wdk-sys IddCx world happens via raw pointers in `swap_chain_processor.rs`. //! //! STEP 5 binds this device to the swap-chain to keep the monitor a live display; STEP 6 reuses the -//! device's immediate context in the frame publisher's `CopyResource` (both on the swap-chain processor -//! thread, the one thread this device is touched from). +//! device's immediate context in the frame publisher's `CopyResource` on each swap-chain processor +//! thread. The device is POOLED across processors (one per render LUID, [`pooled_device`]), so with +//! two live monitors two worker threads share it concurrently — creation must NOT pass +//! `D3D11_CREATE_DEVICE_SINGLETHREADED` (that was sound only pre-pooling, device-per-processor), and +//! the immediate context is `SetMultithreadProtected` (it has no internal locking of its own). use std::sync::atomic::{AtomicI32, Ordering}; use std::sync::{Arc, Mutex}; use windows::{ Win32::{ - Foundation::LUID, + Foundation::{BOOL, LUID}, Graphics::{ Direct3D::D3D_DRIVER_TYPE_UNKNOWN, Direct3D11::{ D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_CREATE_DEVICE_PREVENT_ALTERING_LAYER_SETTINGS_FROM_REGISTRY, - D3D11_CREATE_DEVICE_SINGLETHREADED, D3D11_SDK_VERSION, D3D11CreateDevice, - ID3D11Device, ID3D11DeviceContext, + D3D11_SDK_VERSION, D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, + ID3D11Multithread, }, Dxgi::{CreateDXGIFactory2, DXGI_CREATE_FACTORY_FLAGS, IDXGIAdapter1, IDXGIFactory5}, }, }, - core::Error, + core::{Error, Interface}, }; #[derive(thiserror::Error, Debug)] @@ -53,8 +56,10 @@ pub struct Direct3DDevice { _dxgi_factory: IDXGIFactory5, _adapter: IDXGIAdapter1, pub device: ID3D11Device, - /// The single (SINGLETHREADED) immediate context — used by STEP 6's frame-push publisher's - /// `CopyResource` on the swap-chain processor thread (the one thread this device is touched from). + /// The shared immediate context — used by STEP 6's frame-push publisher's `CopyResource` on each + /// swap-chain processor thread. Pooled across processors, so it is `SetMultithreadProtected` at + /// init: an immediate context has no internal locking, and two concurrent monitors' workers would + /// otherwise race it (undefined behavior inside the UMD). pub device_context: ID3D11DeviceContext, } @@ -77,8 +82,10 @@ impl Direct3DDevice { &adapter, D3D_DRIVER_TYPE_UNKNOWN, None, + // NO `D3D11_CREATE_DEVICE_SINGLETHREADED`: the DEVICE_POOL shares this device (and + // its immediate context) across every swap-chain processor on the LUID, so the + // single-caller guarantee that flag declares no longer holds with >1 monitor. D3D11_CREATE_DEVICE_BGRA_SUPPORT - | D3D11_CREATE_DEVICE_SINGLETHREADED | D3D11_CREATE_DEVICE_PREVENT_ALTERING_LAYER_SETTINGS_FROM_REGISTRY, None, D3D11_SDK_VERSION, @@ -91,6 +98,23 @@ impl Direct3DDevice { let device = device.ok_or("ID3D11Device not found")?; let device_context = device_context.ok_or("ID3D11DeviceContext not found")?; + // The pool hands this device (and its immediate context) to every processor on the LUID, and + // an immediate context is not thread-safe by itself — turn on the runtime's per-call critical + // section. (D3D11.4 interface, guaranteed on the Win11-22H2 OS floor; if the cast ever fails + // we log and continue — a single monitor is still safe, concurrent ones would not be.) + match device_context.cast::() { + Ok(mt) => { + // SAFETY: plain setter on the live context's multithread interface; the returned + // previous-state BOOL carries no obligation. + unsafe { + let _ = mt.SetMultithreadProtected(BOOL::from(true)); + } + } + Err(e) => dbglog!( + "[pf-vd] ID3D11Multithread unavailable ({e:?}) — immediate context left unprotected" + ), + } + let live = LIVE_DEVICES.fetch_add(1, Ordering::Relaxed) + 1; dbglog!("[pf-vd] Direct3DDevice::init OK — live D3D devices = {live}"); diff --git a/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs b/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs index 3e55951..9f2027e 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/frame_transport.rs @@ -38,7 +38,12 @@ use windows::Win32::System::Threading::SetEvent; use windows::core::Interface; /// `WAIT_TIMEOUT` as an HRESULT — `AcquireSync` returns this when the slot is held by the consumer. +/// SUCCESS-severity (positive), so the windows-rs `Result` wrapper can never surface it (`.ok()` maps +/// every non-negative HRESULT to `Ok(())`) — the publish loop reads the raw vtable HRESULT instead. const WAIT_TIMEOUT_HRESULT: i32 = 0x0000_0102; +/// `WAIT_ABANDONED` as an HRESULT — the host died while holding the slot's keyed mutex. Also +/// SUCCESS-severity, and ownership DID transfer to the caller. +const WAIT_ABANDONED_HRESULT: i32 = 0x0000_0080; /// One monitor's sealed-channel bootstrap: the handle VALUES the host duplicated into THIS process /// (`IOCTL_SET_FRAME_CHANNEL`). Owning a `FrameChannel` means owning those handles — exactly one of @@ -375,9 +380,18 @@ impl FramePublisher { let slot = (start + attempt) % ring_len; let s = &self.slots[slot as usize]; // SAFETY: `s.mutex` is the live keyed mutex on this ring slot's shared texture; a 0 ms - // try-acquire of key 0 (released below or on WAIT_TIMEOUT it's never held). - match unsafe { s.mutex.AcquireSync(0, 0) } { - Ok(()) => { + // try-acquire of key 0 (released below; on WAIT_TIMEOUT it's never held). Raw vtable + // call, NOT the `Result` wrapper: `.ok()` erases success codes, so through `Result` a + // WAIT_TIMEOUT (host holds the slot) is indistinguishable from a real acquire — the + // wrapper made the busy-skip arm below dead code and had us copying into (and + // publishing) a slot the host was still reading. + let hr = unsafe { + (Interface::vtable(&s.mutex).AcquireSync)(Interface::as_raw(&s.mutex), 0, 0) + }; + match hr.0 { + // Acquired — S_OK, or WAIT_ABANDONED (the host died holding the slot: ownership + // still transferred; publish normally, a dead host consumes nothing either way). + 0 | WAIT_ABANDONED_HRESULT => { // STRAIGHT-LINE, NO `?` between acquire + release — a `?`-return here would leak the // keyed-mutex lock and wedge the host on this slot. The ordering below is load-bearing: // the CopyResource is GPU-ordered before the consumer via the slot keyed mutex, and the @@ -409,8 +423,10 @@ impl FramePublisher { self.next = (slot + 1) % ring_len; return; } - Err(e) if e.code().0 == WAIT_TIMEOUT_HRESULT => continue, - Err(_) => return, + // Busy — the host holds this slot (the designed backpressure): try the next one. + WAIT_TIMEOUT_HRESULT => continue, + // Genuine failure (negative HRESULT — device removed / invalid call): drop the frame. + _ => return, } } // All slots busy — drop this frame (never block the swap-chain thread). diff --git a/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs b/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs index 188e16c..0163e5d 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/swap_chain_processor.rs @@ -342,19 +342,28 @@ impl SwapChainProcessor { logged_frame = true; } // STEP 6: copy the acquired surface into the shared ring BEFORE FinishedProcessingFrame - // (the surface is valid until the next ReleaseAndAcquire). The pointer is BORROWED — - // `from_raw_borrowed` does NOT take IddCx's refcount — and the GPU-side copy is ordered - // before the consumer via the slot keyed mutex. (Attach happens at the loop top.) - if let Some(p) = publisher.as_mut() { + // (the surface is valid until the next ReleaseAndAcquire). Every successful acquire + // TRANSFERS one surface reference to the driver — the MS sample `Attach`es it into a + // ComPtr and `Reset`s BEFORE FinishedProcessingFrame, warning that a driver which + // "forgets to release the reference" leaves the surfaces alive after the swap-chain is + // destroyed. Holding it (the old `from_raw_borrowed`) leaked the swap-chain's whole + // surface set per assign/unassign cycle (reconnect, mode change, HDR flip) — so adopt + // the reference UNCONDITIONALLY (publisher or not); it is released when `res` drops at + // the end of this block. (Publisher attach happens at the loop top.) + { let raw = buffer.MetaData.pSurface as *mut core::ffi::c_void; if !raw.is_null() { - // SAFETY: `raw` is IddCx's live surface pointer (valid until the next - // ReleaseAndAcquire); `from_raw_borrowed` does not consume the refcount. - if let Some(res) = unsafe { IDXGIResource::from_raw_borrowed(&raw) } + // SAFETY: `raw` is the live surface IddCx just handed us, carrying the acquire's + // transferred reference; `from_raw` adopts exactly that reference (released on + // drop, below — the queued GPU copy is unaffected: D3D defers destruction, and + // the copy is ordered before the consumer via the slot keyed mutex). + let res = unsafe { IDXGIResource::from_raw(raw) }; + if let Some(p) = publisher.as_mut() && let Ok(tex) = res.cast::() { p.publish(&tex); } + // `res` drops here → the acquire's surface reference is released, pre-Finished. } } diff --git a/scripts/windows/host.env.example b/scripts/windows/host.env.example index 6fda3cb..d7c8b20 100644 --- a/scripts/windows/host.env.example +++ b/scripts/windows/host.env.example @@ -23,9 +23,9 @@ PUNKTFUNK_VIDEO_SOURCE=virtual # backend now (the legacy SudoVDA backend was removed). This is informational; leave it as `pf`. PUNKTFUNK_VDISPLAY=pf -# Capture straight from the pf-vdisplay driver's shared ring — the validated zero-copy path (incl. the -# secure desktop). Falls back to DDA if the driver can't attach. Set to 0 to force WGC/DDA capture. -PUNKTFUNK_IDD_PUSH=1 +# Capture is IDD-push: straight from the pf-vdisplay driver's shared ring — zero-copy, includes the +# secure desktop. It is the SOLE capture path (DDA/WGC were removed; the former PUNKTFUNK_IDD_PUSH +# knob is gone — a stale setting is ignored). # Capture the secure desktop (UAC / lock / login) so the stream survives those transitions. PUNKTFUNK_SECURE_DDA=1