diff --git a/crates/pf-driver-proto/src/lib.rs b/crates/pf-driver-proto/src/lib.rs index 75e33c5..4961542 100644 --- a/crates/pf-driver-proto/src/lib.rs +++ b/crates/pf-driver-proto/src/lib.rs @@ -151,9 +151,13 @@ pub mod control { } /// `IOCTL_SET_FRAME_CHANNEL` input — the sealed frame channel's bootstrap. Every handle field is a - /// handle VALUE already duplicated into the driver's WUDFHost process by the host; receiving it, the - /// driver OWNS those handles (it closes whatever it doesn't consume — a replaced, invalid, or - /// unmatched delivery must not leak entries in its own handle table). + /// handle VALUE already duplicated into the driver's WUDFHost process by the host. Ownership is + /// **adopt-on-success-only** (`design/idd-push-security.md` invariant 5): the driver owns (and + /// eventually closes) the handles IFF it completes the IOCTL successfully — a replaced or + /// later-unconsumed delivery is then the driver's to close. On ANY error completion (malformed + /// request, unknown `target_id`) the driver must NOT close them: the HOST reaps its remote + /// duplicates (`DUPLICATE_CLOSE_SOURCE`). Exactly one side closes each value; a driver that closed + /// on error would double-close possibly-reused handle values against the host's reap. /// /// Handle values are only meaningful inside the target process's handle table, so this struct is /// harmless to any third party: reading it leaks nothing openable, and spoofing it (were the control diff --git a/crates/punktfunk-host/src/capture/windows/idd_push.rs b/crates/punktfunk-host/src/capture/windows/idd_push.rs index 3e2a97b..c9b1448 100644 --- a/crates/punktfunk-host/src/capture/windows/idd_push.rs +++ b/crates/punktfunk-host/src/capture/windows/idd_push.rs @@ -29,7 +29,7 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use windows::core::{w, Interface, PCWSTR, PWSTR}; use windows::Win32::Foundation::{ DuplicateHandle, DUPLICATE_CLOSE_SOURCE, DUPLICATE_HANDLE_OPTIONS, DUPLICATE_SAME_ACCESS, - HANDLE, INVALID_HANDLE_VALUE, LUID, + HANDLE, INVALID_HANDLE_VALUE, LUID, WAIT_OBJECT_0, }; use windows::Win32::Graphics::Direct3D11::{ ID3D11Device, ID3D11DeviceContext, ID3D11ShaderResourceView, ID3D11Texture2D, @@ -53,7 +53,7 @@ use windows::Win32::System::Memory::{ }; use windows::Win32::System::Threading::{ CreateEventW, GetCurrentProcess, OpenProcess, QueryFullProcessImageNameW, WaitForSingleObject, - PROCESS_DUP_HANDLE, PROCESS_NAME_WIN32, PROCESS_QUERY_LIMITED_INFORMATION, + PROCESS_DUP_HANDLE, PROCESS_NAME_WIN32, PROCESS_QUERY_LIMITED_INFORMATION, PROCESS_SYNCHRONIZE, }; // The frame-transport contract — `SharedHeader` layout, `MAGIC`/`VERSION`/`RING_LEN`, the @@ -234,11 +234,14 @@ pub(crate) unsafe fn verify_is_wudfhost(process: HANDLE, wudf_pid: u32, what: &s /// duplicates (it closes them); on any failure [`Self::send`] reaps every duplicate it already made /// (`DUPLICATE_CLOSE_SOURCE`), so a half-delivered channel never leaks handles in WUDFHost. struct ChannelBroker { - /// `PROCESS_DUP_HANDLE` handle to the driver's WUDFHost (pid from the ADD reply; - /// `ProcessSharingDisabled` makes that process exclusively pf-vdisplay's). + /// `PROCESS_DUP_HANDLE | SYNCHRONIZE` handle to the driver's WUDFHost (pid from the ADD reply; + /// `ProcessSharingDisabled` makes that process exclusively pf-vdisplay's). `SYNCHRONIZE` lets the + /// handle double as the driver-death probe ([`Self::driver_alive`]). process: OwnedHandle, + /// The WUDFHost pid `process` refers to (diagnostics for the driver-death bail). + wudf_pid: u32, /// The pf-vdisplay control device — owned by the `VirtualDisplayManager`, never closed for the - /// process lifetime, so holding the bare `HANDLE` is sound. + /// process lifetime (a dead one is retired, kept alive), so holding the bare `HANDLE` is sound. control: HANDLE, } @@ -264,7 +267,7 @@ impl ChannelBroker { // for the duration of the synchronous check and forms no lasting alias. let process = unsafe { let h = OpenProcess( - PROCESS_DUP_HANDLE | PROCESS_QUERY_LIMITED_INFORMATION, + PROCESS_DUP_HANDLE | PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_SYNCHRONIZE, false, wudf_pid, ) @@ -273,7 +276,21 @@ impl ChannelBroker { verify_is_wudfhost(HANDLE(process.as_raw_handle()), wudf_pid, "frame-channel")?; process }; - Ok(Self { process, control }) + Ok(Self { + process, + wudf_pid, + control, + }) + } + + /// Whether the driver's WUDFHost is still alive. The pinned process handle doubles as the + /// liveness probe (`SYNCHRONIZE` requested at open): signaled ⇔ the process exited. This is the + /// definitive "driver died mid-session" signal — at the ring, a dead driver and an idle desktop + /// are indistinguishable (both simply stop publishing). + fn driver_alive(&self) -> bool { + // SAFETY: `process` is the live `OwnedHandle` this broker owns (borrowed for this synchronous + // call); a 0 ms wait only reads the handle's signaled state. + unsafe { WaitForSingleObject(HANDLE(self.process.as_raw_handle()), 0) != WAIT_OBJECT_0 } } /// Duplicate `h` into the WUDFHost handle table, returning the handle VALUE valid there (and only @@ -437,6 +454,12 @@ pub struct IddPushCapturer { /// cleared when a fresh frame resumes. If it stays set past the recovery window, `try_consume` drops /// the session (recover-or-drop, no DDA). recovering_since: Option, + /// When the last FRESH driver frame was consumed — feeds the driver-death watch in + /// [`Self::try_consume`] (a dead WUDFHost is otherwise indistinguishable from an idle desktop: + /// both stop publishing, and the encode loop would repeat the last frame forever). + last_fresh: Instant, + /// Rate-limits the WUDFHost liveness probe (one 0 ms wait per second, and only while stale). + last_liveness: Instant, /// Host-owned ROTATING output ring NVENC encodes (one YUV texture per slot). Rotating it per frame /// is the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the /// ASIC, frame N+1's convert writes a DIFFERENT texture — the two overlap. Format = `out_format()`: @@ -753,6 +776,8 @@ impl IddPushCapturer { display_hdr, last_acm_poll: Instant::now(), recovering_since: None, + last_fresh: Instant::now(), + last_liveness: Instant::now(), out_ring: Vec::new(), out_idx: 0, video_conv: None, @@ -1074,6 +1099,24 @@ impl IddPushCapturer { ); } } + // Driver-death watch (the SDR path has no other signal): a dead WUDFHost stops publishing, + // which at the ring is indistinguishable from an idle desktop — the encode loop would repeat + // the last frame forever (frozen video + live audio) and `next_frame`'s 20 s bail is + // unreachable once anything ever presented. While no fresh frame is arriving, probe the + // broker's pinned process handle (rate-limited) and fail the capturer so the session's + // rebuild path recreates output + ring against the restarted device. + if self.last_fresh.elapsed() > Duration::from_secs(2) + && self.last_liveness.elapsed() > Duration::from_secs(1) + { + self.last_liveness = Instant::now(); + if !self.broker.driver_alive() { + bail!( + "IDD-push: the pf-vdisplay WUDFHost (pid {}) exited mid-session — driver died; \ + failing the capturer so the session rebuilds the virtual output", + self.broker.wudf_pid + ); + } + } let latest = self.latest(); // `latest` is the proto publish token `(generation << 40) | (seq << 8) | slot`. Reject any publish // whose generation isn't our CURRENT ring (a stale old-ring publish racing a recreate, or the 0 @@ -1136,6 +1179,7 @@ impl IddPushCapturer { self.last_seq = seq; self.last_present = Some((out.clone(), pf)); self.recovering_since = None; // a fresh frame resumed → recovered + self.last_fresh = Instant::now(); // feeds the driver-death watch Ok(Some(CapturedFrame { width: self.width, height: self.height, diff --git a/crates/punktfunk-host/src/vdisplay/windows/manager.rs b/crates/punktfunk-host/src/vdisplay/windows/manager.rs index 3f4743e..7f7c6c8 100644 --- a/crates/punktfunk-host/src/vdisplay/windows/manager.rs +++ b/crates/punktfunk-host/src/vdisplay/windows/manager.rs @@ -23,7 +23,8 @@ use std::thread::{self, JoinHandle}; use std::time::{Duration, Instant}; use anyhow::Result; -use windows::Win32::Foundation::{HANDLE, LUID}; +use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID, WAIT_OBJECT_0}; +use windows::Win32::System::Threading::{OpenProcess, WaitForSingleObject, PROCESS_SYNCHRONIZE}; use super::{Mode, VirtualOutput}; use crate::win_display::{ @@ -54,13 +55,15 @@ pub(crate) struct AddedMonitor { /// `&'static` singleton reached from the pinger + linger threads. pub(crate) trait VdisplayDriver: Send + Sync { fn name(&self) -> &'static str; - /// Find + open the control device, validate it (version handshake), read the watchdog timeout, and - /// reap monitors orphaned by a crashed previous host (`CLEAR_ALL`). Returns the owned handle + - /// watchdog seconds. + /// Find + open the control device, validate it (version handshake), and read the watchdog + /// timeout. `reap_orphans` (the FIRST open of the process only) additionally `CLEAR_ALL`s + /// monitors orphaned by a crashed previous host — a REOPEN (after a dead handle was retired) + /// must NOT, since sessions this process still considers live may be racing it. Returns the + /// owned handle + watchdog seconds. /// /// # Safety /// Issues setup-API + `DeviceIoControl` calls; runs in the caller's apartment. - unsafe fn open(&self) -> Result<(OwnedHandle, u32)>; + unsafe fn open(&self, reap_orphans: bool) -> Result<(OwnedHandle, u32)>; /// ADD a virtual monitor at `mode`, pinning the IDD render GPU to `render_luid` first if `Some`, and /// requesting `preferred_monitor_id` (the host's per-client stable id; `0` = auto). Returns the REMOVE /// key + target id + the adapter LUID the driver actually used. @@ -125,12 +128,31 @@ enum MgrState { Lingering { mon: Monitor, until: Instant }, } +/// The manager's control-device cache. Reopenable: a driver upgrade / WUDFHost restart kills the +/// cached handle (every IOCTL fails with a gone-class code forever), so such a failure RETIRES it and +/// the next [`VirtualDisplayManager::ensure_device`] reopens the (new) device interface, re-running +/// the version handshake. Retired handles are deliberately kept alive — never closed — for the +/// process lifetime: the pinger/linger threads and every capturer's `ChannelBroker` hold BARE +/// `HANDLE` copies whose soundness contract is "never closed"; a retired handle only ever FAILS +/// IOCTLs, which every holder already tolerates. Reopens are rare (a driver restart), so the retained +/// list is bounded in practice. +#[derive(Default)] +struct DeviceSlot { + current: Option>, + /// Never dropped — see the type doc (bare-`HANDLE` holders rely on no-close). + retired: Vec>, + /// `CLEAR_ALL` (crashed-host orphan reap) runs only on the FIRST open of the process; a reopen + /// races sessions this process still considers live and must not raze them. + opened_once: bool, +} + /// The host-lifetime virtual-display manager: the single owner of the monitor lifecycle. pub(crate) struct VirtualDisplayManager { driver: Box, - /// Control device, opened once on first acquire. Typed + `Send+Sync`, so the pinger/linger threads - /// share it via the `&'static` singleton with no raw-handle smuggling. - device: OnceLock>, + /// Control device, opened on first acquire and REOPENED after a gone-classified failure retired + /// it (see [`DeviceSlot`]). Typed + `Send+Sync`, so the pinger/linger threads share it via the + /// `&'static` singleton with no raw-handle smuggling. + device: Mutex, watchdog_s: AtomicU32, /// Monotonic lease-generation counter (was the `MON_GEN` global). gen: AtomicU64, @@ -155,7 +177,7 @@ static VDM: OnceLock = OnceLock::new(); pub(crate) fn init(driver: Box) -> &'static VirtualDisplayManager { VDM.get_or_init(|| VirtualDisplayManager { driver, - device: OnceLock::new(), + device: Mutex::new(DeviceSlot::default()), watchdog_s: AtomicU32::new(3), gen: AtomicU64::new(1), state: Mutex::new(MgrState::Idle), @@ -173,39 +195,109 @@ pub(crate) fn vdm() -> &'static VirtualDisplayManager { } /// The live pf-vdisplay control-device handle, for the IDD-push capturer's sealed-channel delivery -/// (`IOCTL_SET_FRAME_CHANNEL`). Safe to hand out as a bare `HANDLE`: the device lives in a `OnceLock` -/// that is never cleared or closed for the process lifetime. `None` before the first backend open — -/// impossible for a capturer, which only exists on a monitor the manager created. +/// (`IOCTL_SET_FRAME_CHANNEL`). Safe to hand out as a bare `HANDLE`: cached handles are never closed +/// for the process lifetime — a dead one is RETIRED (kept alive, see [`DeviceSlot`]), so a stale copy +/// can only fail IOCTLs, never dangle. `None` before the first backend open — impossible for a +/// capturer, which only exists on a monitor the manager created. pub(crate) fn control_device_handle() -> Option { VDM.get().and_then(VirtualDisplayManager::device_handle) } +/// True when an IOCTL failure means the CONTROL DEVICE itself is gone (driver upgrade, WUDFHost +/// restart, device disable) — the cached handle can only keep failing and must be retired so the +/// next use reopens. The root `windows` error survives anyhow `.context` chains via `downcast_ref`. +/// NOTE: 0x80070490 (ERROR_NOT_FOUND, the ADD slot-exhaustion wedge) is deliberately NOT here — it +/// has its own reap-and-retry handling and the device is alive when it fires. +/// Best-effort "is this WUDFHost pid still alive?" — the monitor-liveness probe for the JOIN path. +/// `OpenProcess` failing (pid reaped) or the process being signaled ⇒ dead. Pid reuse could +/// theoretically alias a fresh process and read "alive"; the joining session then just retries into +/// its rebuild budget — acceptable for a sub-second reuse window that realistically never hits. +fn wudf_alive(pid: u32) -> bool { + if pid == 0 { + return true; // pre-v2 driver reports no pid — never preempt on the probe's account + } + // SAFETY: plain FFI probe; the opened handle (checked) is closed exactly once below, and the + // 0 ms wait only reads its signaled state. + unsafe { + let Ok(h) = OpenProcess(PROCESS_SYNCHRONIZE, false, pid) else { + return false; + }; + let alive = WaitForSingleObject(h, 0) != WAIT_OBJECT_0; + let _ = CloseHandle(h); + alive + } +} + +fn is_device_gone(e: &anyhow::Error) -> bool { + let Some(w) = e.downcast_ref::() else { + return false; + }; + // Win32 codes as HRESULTs: FILE_NOT_FOUND(2), INVALID_HANDLE(6), BAD_COMMAND(22), + // GEN_FAILURE(31), DEV_NOT_EXIST(55), OPERATION_ABORTED(995), DEVICE_NOT_CONNECTED(1167 = + // 0x48F — one below the 0x490 wedge), DEVICE_REMOVED(1617). + const GONE: [i32; 8] = [ + 0x8007_0002u32 as i32, + 0x8007_0006u32 as i32, + 0x8007_0016u32 as i32, + 0x8007_001Fu32 as i32, + 0x8007_0037u32 as i32, + 0x8007_03E3u32 as i32, + 0x8007_048Fu32 as i32, + 0x8007_0651u32 as i32, + ]; + GONE.contains(&w.code().0) +} + impl VirtualDisplayManager { pub(crate) fn backend_name(&self) -> &'static str { self.driver.name() } - /// Open + cache the control device (once). Called under the `state` lock so two racing acquires can't - /// double-open. + /// Open + cache the control device; REOPEN when a gone-classified failure retired the cached one + /// (driver upgrade / WUDFHost restart). The `device` mutex serializes racing opens. fn ensure_device(&self) -> Result { - if let Some(d) = self.device.get() { + let mut slot = self.device.lock().unwrap(); + if let Some(d) = &slot.current { return Ok(HANDLE(d.as_raw_handle())); } + let reap = !slot.opened_once; // SAFETY: `VdisplayDriver::open` is `unsafe` only because it issues SetupAPI + `DeviceIoControl` - // FFI in the caller's apartment; `ensure_device` runs that on the acquiring thread under the - // `state` lock (callers hold it), so there is no concurrent open. `open` has no handle - // precondition to uphold, and the `OwnedHandle` it returns is the sole owner of the device. - let (handle, watchdog_s) = unsafe { self.driver.open()? }; + // FFI in the caller's apartment; the `device` mutex (held here) serializes it, so there is no + // concurrent open. `open` has no handle precondition to uphold, and the `OwnedHandle` it + // returns is the sole owner of the device. + let (handle, watchdog_s) = unsafe { self.driver.open(reap)? }; + slot.opened_once = true; self.watchdog_s.store(watchdog_s, Ordering::Relaxed); let raw = HANDLE(handle.as_raw_handle()); - let _ = self.device.set(Arc::new(handle)); + slot.current = Some(Arc::new(handle)); + if !reap { + tracing::info!("virtual-display control device reopened (retired handle replaced)"); + } Ok(raw) } - /// The live control handle for the pinger/linger threads (lock-free: the device never changes once - /// opened). `None` only before the first acquire opened it. + /// The live control handle for the pinger/linger threads. `None` before the first acquire opened + /// it, or between a retire and the next reopen. fn device_handle(&self) -> Option { - self.device.get().map(|d| HANDLE(d.as_raw_handle())) + self.device + .lock() + .unwrap() + .current + .as_ref() + .map(|d| HANDLE(d.as_raw_handle())) + } + + /// Retire the cached control handle after a gone-classified IOCTL failure. The handle is retained + /// un-closed (see [`DeviceSlot`]); the next [`ensure_device`](Self::ensure_device) reopens the + /// (new) device interface and re-runs the version handshake. + fn invalidate_device(&self, why: &anyhow::Error) { + let mut slot = self.device.lock().unwrap(); + if let Some(cur) = slot.current.take() { + tracing::warn!( + "virtual-display control device retired — reopening on next use (cause: {why:#})" + ); + slot.retired.push(cur); + } } /// Open + initialise the backend (validates the driver is present). Mirrors the old @@ -247,9 +339,9 @@ impl VirtualDisplayManager { old_target = mon.target_id, "IDD-push reconnect — preempting the lingering monitor, recreating a fresh one" ); - // SAFETY: `teardown` requires `dev` to be the live control handle; `dev` is the value - // `ensure_device()` returned above (the device is cached in the `OnceLock` and never - // closed for the manager's lifetime). `mon` was moved out of the prior `Lingering` + // SAFETY: `teardown` requires `dev` to be a valid control handle; `dev` is the value + // `ensure_device()` returned above (cached handles are never closed — a dead one is + // retired, kept alive; see `DeviceSlot`). `mon` was moved out of the prior `Lingering` // state by `mem::replace`, so it is exclusively owned here — no aliasing. unsafe { self.teardown(dev, mon) }; // Let the OS finish the ASYNC monitor departure before the next ADD; a back-to-back @@ -258,6 +350,30 @@ impl VirtualDisplayManager { } } + // An ACTIVE monitor whose WUDFHost has EXITED is dead driver-side (driver crash / upgrade): + // the capturer's driver-death watch failed its session, and that session's in-place rebuild + // re-acquires here while its old lease is STILL held — so the state is Active. Joining would + // hand the rebuild the dead monitor's target (stale wudf_pid) and starve it to the rebuild + // budget. Preempt instead: best-effort teardown (REMOVE fails harmlessly on a dead/retired + // device) and fall through to a fresh create on the auto-restarted device. Held leases are + // gen-stamped, so their eventual release is a no-op. + if matches!(&*state, MgrState::Active { mon, .. } if !wudf_alive(mon.wudf_pid)) { + if let MgrState::Active { mon, .. } = std::mem::replace(&mut *state, MgrState::Idle) { + tracing::warn!( + old_target = mon.target_id, + wudf_pid = mon.wudf_pid, + "virtual monitor's WUDFHost is gone — preempting the dead monitor, recreating" + ); + // SAFETY: `teardown` requires a valid control handle; `dev` is the value + // `ensure_device()` returned above (cached handles are never closed — a dead one is + // retired, kept alive; see `DeviceSlot`). `mon` was moved out of the replaced state, + // so it is exclusively owned here — no aliasing. + unsafe { self.teardown(dev, mon) }; + // Same async-departure settle as the reconnect preempt above. + thread::sleep(Duration::from_millis(400)); + } + } + // A live monitor already exists — join it (refcount++). Covers concurrent sessions AND the // build-then-drop overlap of a mid-stream Reconfigure (the new lease is taken while the old is // still held). Reconfigure the shared monitor if the requested mode differs. @@ -292,10 +408,26 @@ impl VirtualDisplayManager { } mon } - // SAFETY: `create_monitor` requires `dev` to be the live control handle; `dev` is the - // handle `ensure_device()` returned above (cached in the `OnceLock`, never closed for the - // manager's lifetime), and we hold the `state` lock. - MgrState::Idle => unsafe { self.create_monitor(dev, mode, client_fp)? }, + // SAFETY: `create_monitor` requires `dev` to be a valid control handle; `dev` is the + // handle `ensure_device()` returned above (cached handles are never closed — a dead one + // is retired, kept alive; see `DeviceSlot`), and we hold the `state` lock. + MgrState::Idle => match unsafe { self.create_monitor(dev, mode, client_fp) } { + // The cached device died under us (driver upgrade / WUDFHost restart, detected only + // now — e.g. the host sat idle past the pinger-less window). Retire it, reopen, and + // retry ONCE so the reconnect-after-driver-restart succeeds first try instead of + // burning one failed session per restart. + Err(e) if is_device_gone(&e) => { + self.invalidate_device(&e); + let dev = self.ensure_device()?; + tracing::info!( + "virtual-display control device reopened — retrying the monitor create" + ); + // SAFETY: as above — `dev` is the handle the reopening `ensure_device` just + // returned, and the `state` lock is still held. + unsafe { self.create_monitor(dev, mode, client_fp)? } + } + r => r?, + }, MgrState::Active { .. } => unreachable!("handled above"), }; let out = self.output_for(&mon); @@ -353,13 +485,20 @@ impl VirtualDisplayManager { let mut warned = false; while !stop_t.load(Ordering::Relaxed) { if let Some(h) = vdm().device_handle() { - // SAFETY: `ping` requires `dev` to be the live control handle. `h` is from - // `device_handle()` (the `Some` branch) — the `OnceLock>` that, - // once set, is never cleared or closed for the process lifetime, so the handle is - // live for this call. The pinger thread only spins while the `&'static` manager - // singleton (and thus the device) lives. + // SAFETY: `ping` requires `dev` to be a valid control handle. `h` is from + // `device_handle()` (the `Some` branch) — cached handles are NEVER closed for the + // process lifetime (a dead one is retired, kept alive; see `DeviceSlot`), so the + // handle stays valid for this call even if it was retired concurrently — at worst + // the IOCTL fails. The pinger thread only spins while the `&'static` manager + // singleton lives. match unsafe { vdm().driver.ping(h) } { Ok(()) => warned = false, + Err(e) if is_device_gone(&e) => { + // The device itself is gone (driver upgrade / WUDFHost restart) — pings + // can only keep failing on this handle. Retire it so the next session's + // `ensure_device` reopens; this monitor is already dead driver-side. + vdm().invalidate_device(&e); + } Err(e) => { if !warned { tracing::warn!("virtual-display keepalive PING failed (control handle lost?): {e:#}"); @@ -501,6 +640,11 @@ impl VirtualDisplayManager { // `remove_monitor` requires exactly that. `&mon.key` borrows the `MonitorKey` inside the // still-owned `mon`, alive for this synchronous IOCTL, so the pointer the driver reads stays valid. if let Err(e) = unsafe { self.driver.remove_monitor(dev, &mon.key) } { + // A gone-classified failure means the device died under this monitor (driver upgrade / + // WUDFHost restart) — retire the handle so the NEXT session reopens instead of failing. + if is_device_gone(&e) { + self.invalidate_device(&e); + } tracing::warn!("virtual-display REMOVE failed: {e:#}"); } else { tracing::info!( diff --git a/crates/punktfunk-host/src/vdisplay/windows/pf_vdisplay.rs b/crates/punktfunk-host/src/vdisplay/windows/pf_vdisplay.rs index 96c4c18..9b5a052 100644 --- a/crates/punktfunk-host/src/vdisplay/windows/pf_vdisplay.rs +++ b/crates/punktfunk-host/src/vdisplay/windows/pf_vdisplay.rs @@ -19,14 +19,14 @@ use std::ffi::c_void; use std::mem::size_of; -use std::os::windows::io::{FromRawHandle, OwnedHandle}; +use std::os::windows::io::{AsRawHandle, FromRawHandle, OwnedHandle}; use std::sync::atomic::{AtomicU64, Ordering}; use anyhow::{Context, Result}; use windows::core::{GUID, PCWSTR}; use windows::Win32::Devices::DeviceAndDriverInstallation::{ SetupDiDestroyDeviceInfoList, SetupDiEnumDeviceInterfaces, SetupDiGetClassDevsW, - SetupDiGetDeviceInterfaceDetailW, DIGCF_DEVICEINTERFACE, DIGCF_PRESENT, + SetupDiGetDeviceInterfaceDetailW, DIGCF_DEVICEINTERFACE, DIGCF_PRESENT, HDEVINFO, SPINT_ACTIVE, SP_DEVICE_INTERFACE_DATA, SP_DEVICE_INTERFACE_DETAIL_DATA_W, }; use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID}; @@ -137,11 +137,9 @@ fn is_slot_exhaustion_wedge(e: &anyhow::Error) -> bool { /// Pin the pf-vdisplay IddCx's RENDER GPU to `luid` (the analogue of Apollo's `SetRenderAdapter`). No /// output buffer. Issued on the driver handle BEFORE `IOCTL_ADD` to steer which GPU the new target /// renders on — on a multi-adapter box this stops DXGI from reparenting the virtual output onto a -/// different adapter than the one we duplicate/encode on (the ACCESS_LOST storm). -/// -/// NOTE: the pf-vdisplay driver currently returns `STATUS_NOT_IMPLEMENTED` for this IOCTL (a STEP-4 -/// stub), so this call WILL fail today. Callers tolerate the `Err` (warn + continue) — exactly as the -/// SudoVDA backend tolerated the driver IGNORING the pin. +/// different adapter than the one we duplicate/encode on (the ACCESS_LOST storm). The driver +/// implements it (`control.rs` → `adapter::set_render_adapter`); callers still tolerate an `Err` +/// (warn + continue) since the driver reports its real render LUID in the shared header either way. unsafe fn set_render_adapter(h: HANDLE, luid: LUID) -> Result<()> { let req = control::SetRenderAdapterRequest { luid_low: luid.LowPart, @@ -185,42 +183,102 @@ pub(crate) unsafe fn send_frame_channel( .context("pf-vdisplay SET_FRAME_CHANNEL") } +/// RAII over a SetupAPI device-info list: every exit path of [`open_device`] destroys it (the error +/// paths used to leak one `HDEVINFO` per failed open — and a driverless / mid-upgrade box probes +/// repeatedly). +struct DevInfoList(HDEVINFO); + +impl Drop for DevInfoList { + fn drop(&mut self) { + // SAFETY: `self.0` is the live device-info list this wrapper solely owns; destroyed exactly + // once here. + unsafe { + let _ = SetupDiDestroyDeviceInfoList(self.0); + } + } +} + unsafe fn open_device() -> Result { - let hdev = SetupDiGetClassDevsW( - Some(&PF_VDISPLAY_INTERFACE), - PCWSTR::null(), - None, - DIGCF_DEVICEINTERFACE | DIGCF_PRESENT, - ) - .context("SetupDiGetClassDevsW(pf-vdisplay) — is the pf-vdisplay driver installed?")?; + // SAFETY: plain SetupAPI enumeration call; the returned list is solely owned by the RAII wrapper. + let hdev = DevInfoList( + unsafe { + SetupDiGetClassDevsW( + Some(&PF_VDISPLAY_INTERFACE), + PCWSTR::null(), + None, + DIGCF_DEVICEINTERFACE | DIGCF_PRESENT, + ) + } + .context("SetupDiGetClassDevsW(pf-vdisplay) — is the pf-vdisplay driver installed?")?, + ); - let mut idata = SP_DEVICE_INTERFACE_DATA { - cbSize: size_of::() as u32, - ..Default::default() - }; - SetupDiEnumDeviceInterfaces(hdev, None, &PF_VDISPLAY_INTERFACE, 0, &mut idata) - .context("SetupDiEnumDeviceInterfaces(pf-vdisplay)")?; - - let mut required = 0u32; - let _ = SetupDiGetDeviceInterfaceDetailW(hdev, &idata, None, 0, Some(&mut required), None); - let mut buf = vec![0u8; required as usize]; - let detail = buf.as_mut_ptr() as *mut SP_DEVICE_INTERFACE_DETAIL_DATA_W; - (*detail).cbSize = size_of::() as u32; - SetupDiGetDeviceInterfaceDetailW(hdev, &idata, Some(detail), required, None, None) - .context("SetupDiGetDeviceInterfaceDetailW(pf-vdisplay)")?; - - let handle = CreateFileW( - PCWSTR((*detail).DevicePath.as_ptr()), - 0xC000_0000, // GENERIC_READ | GENERIC_WRITE - FILE_SHARE_READ | FILE_SHARE_WRITE, - None, - OPEN_EXISTING, - FILE_FLAGS_AND_ATTRIBUTES(0), - None, - ) - .context("CreateFileW(pf-vdisplay device)")?; - let _ = SetupDiDestroyDeviceInfoList(hdev); - Ok(handle) + // Enumerate EVERY interface instance, not just index 0: after a driver upgrade a present-but- + // failed devnode (Code 10) can hold index 0 while the LIVE node's interface sits at a later + // index — the old single-index read then failed every session with "driver not installed" + // even though a working interface existed. `SPINT_ACTIVE` filters dead interfaces (an interface + // is active only while its owning device is started); the first active + openable one wins. + let mut inactive = 0u32; + let mut last_err: Option = None; + for index in 0..64u32 { + let mut idata = SP_DEVICE_INTERFACE_DATA { + cbSize: size_of::() as u32, + ..Default::default() + }; + // SAFETY: `hdev.0` is the live list; `idata` is a valid, size-stamped out-param. + if unsafe { + SetupDiEnumDeviceInterfaces(hdev.0, None, &PF_VDISPLAY_INTERFACE, index, &mut idata) + } + .is_err() + { + break; // ERROR_NO_MORE_ITEMS — no further candidates + } + if idata.Flags & SPINT_ACTIVE == 0 { + inactive += 1; + continue; + } + let mut required = 0u32; + // SAFETY: sizing call — null buffer plus a valid `required` out-param; the expected + // ERROR_INSUFFICIENT_BUFFER "failure" is ignored and only `required` is consumed. + let _ = unsafe { + SetupDiGetDeviceInterfaceDetailW(hdev.0, &idata, None, 0, Some(&mut required), None) + }; + if (required as usize) < size_of::() { + continue; // sizing failed — never stamp a cbSize through an under-sized buffer + } + let mut buf = vec![0u8; required as usize]; + let detail = buf.as_mut_ptr() as *mut SP_DEVICE_INTERFACE_DETAIL_DATA_W; + // SAFETY: `buf` is `required` bytes (>= 4, checked above), so stamping `cbSize` and letting + // the API fill up to `required` bytes stays in bounds; `detail` aliases `buf` only within + // this iteration, and the `DevicePath` pointer is read before `buf` is dropped. + let opened = unsafe { + (*detail).cbSize = size_of::() as u32; + SetupDiGetDeviceInterfaceDetailW(hdev.0, &idata, Some(detail), required, None, None) + .context("SetupDiGetDeviceInterfaceDetailW(pf-vdisplay)") + .and_then(|()| { + CreateFileW( + PCWSTR((*detail).DevicePath.as_ptr()), + 0xC000_0000, // GENERIC_READ | GENERIC_WRITE + FILE_SHARE_READ | FILE_SHARE_WRITE, + None, + OPEN_EXISTING, + FILE_FLAGS_AND_ATTRIBUTES(0), + None, + ) + .context("CreateFileW(pf-vdisplay device)") + }) + }; + match opened { + Ok(h) => return Ok(h), + // A raced-away or wedged device — remember the error, try the next interface. + Err(e) => last_err = Some(e), + } + } + Err(last_err.unwrap_or_else(|| { + anyhow::anyhow!( + "no ACTIVE pf-vdisplay device interface found ({inactive} inactive) — is the \ + pf-vdisplay driver installed and its device started?" + ) + })) } /// The pf-vdisplay IOCTL surface behind the shared [`VirtualDisplayManager`](super::manager::VirtualDisplayManager) @@ -232,30 +290,30 @@ impl VdisplayDriver for PfVdisplayDriver { "pf-vdisplay" } - unsafe fn open(&self) -> Result<(OwnedHandle, u32)> { + unsafe fn open(&self, reap_orphans: bool) -> Result<(OwnedHandle, u32)> { // SAFETY: `open_device` is `unsafe` only because it issues SetupAPI enumeration + `CreateFileW` // FFI; it takes no arguments and returns an owned raw `HANDLE` (or `Err`). Called here on the // backend-init thread, with no precondition beyond a valid thread context. let device = unsafe { open_device()? }; + // Wrap IMMEDIATELY: every `?` below must close the device exactly once — the old + // wrap-on-success-only shape leaked the raw handle whenever GET_INFO itself failed. + // SAFETY: `device` is the valid handle `open_device` just returned; ownership transfers into + // the `OwnedHandle` (single owner, `CloseHandle` on drop). + let device = unsafe { OwnedHandle::from_raw_handle(device.0 as _) }; + let raw = HANDLE(device.as_raw_handle()); // HARD protocol-version check (unlike SudoVDA's best-effort log): a mismatched host/driver pair // fails loudly here rather than corrupting the IOCTL stream. let mut info_buf = [0u8; size_of::()]; // SAFETY: `ioctl` requires `h` to be a valid device handle and its slices to be valid for the - // call. `device` is the live handle just returned by `open_device`. `IOCTL_GET_INFO` takes no - // input (`&[]`) and writes into `info_buf`, a stack `[u8; size_of::()]` whose length - // is passed as the output size — so `DeviceIoControl` can't write OOB — and which outlives this - // synchronous call. - unsafe { ioctl(device, control::IOCTL_GET_INFO, &[], &mut info_buf) } + // call. `raw` borrows the live `OwnedHandle` above for this synchronous call. `IOCTL_GET_INFO` + // takes no input (`&[]`) and writes into `info_buf`, a stack `[u8; size_of::()]` + // whose length is passed as the output size — so `DeviceIoControl` can't write OOB — and which + // outlives this synchronous call. + unsafe { ioctl(raw, control::IOCTL_GET_INFO, &[], &mut info_buf) } .context("pf-vdisplay IOCTL_GET_INFO (version handshake)")?; let info: control::InfoReply = bytemuck::pod_read_unaligned(&info_buf[..size_of::()]); if info.protocol_version != pf_driver_proto::PROTOCOL_VERSION { - // SAFETY: `device` is the valid raw handle from `open_device` and has NOT yet been wrapped - // in an `OwnedHandle` (that happens only on the success path below), so this error path is - // the sole owner closing it exactly once — no double-close. - unsafe { - let _ = CloseHandle(device); - } anyhow::bail!( "pf-vdisplay protocol mismatch: host expects {}, driver reports {} — install matching \ host + driver", @@ -269,12 +327,19 @@ impl VdisplayDriver for PfVdisplayDriver { info.protocol_version, watchdog_s ); - // Reap monitors orphaned by a crashed previous host — a FIRST-CLASS op (driver returns SUCCESS). + // Reap monitors orphaned by a crashed previous host — a FIRST-CLASS op (driver returns + // SUCCESS). FIRST open of the process only: a REOPEN (the manager retired a dead handle after + // a driver upgrade / WUDFHost restart) can race sessions that still believe they are live, and + // an unconditional CLEAR_ALL there would raze them. + if !reap_orphans { + reap_ghost_monitors(); + return Ok((device, watchdog_s)); + } let mut none: [u8; 0] = []; - // SAFETY: `device` is the live handle from `open_device` (still owned here, before it is wrapped - // below). `IOCTL_CLEAR_ALL` has no input and no output: `&[]` and the empty `none` slice pass - // zero-length buffers, so nothing is read or written through them. - if unsafe { ioctl(device, control::IOCTL_CLEAR_ALL, &[], &mut none) }.is_ok() { + // SAFETY: `raw` borrows the live `OwnedHandle` above. `IOCTL_CLEAR_ALL` has no input and no + // output: `&[]` and the empty `none` slice pass zero-length buffers, so nothing is read or + // written through them. + if unsafe { ioctl(raw, control::IOCTL_CLEAR_ALL, &[], &mut none) }.is_ok() { tracing::info!("cleared orphaned virtual monitors on host startup"); } else { tracing::warn!("pf-vdisplay IOCTL_CLEAR_ALL failed on startup (continuing)"); @@ -285,14 +350,7 @@ impl VdisplayDriver for PfVdisplayDriver { // monitor-slot budget — prevents the 0x80070490 slot-exhaustion wedge from carrying across // restarts (the reason a restart's CLEAR_ALL alone never recovered it before). reap_ghost_monitors(); - Ok(( - // SAFETY: `device` is the valid handle from `open_device`, still owned here and NOT closed - // on this success path (the error paths above close it and return). `from_raw_handle`'s - // contract — caller owns a valid handle — holds, so ownership transfers cleanly into the - // `OwnedHandle`: exactly one owner, which `CloseHandle`s it on drop. - unsafe { OwnedHandle::from_raw_handle(device.0 as _) }, - watchdog_s, - )) + Ok((device, watchdog_s)) } unsafe fn add_monitor( diff --git a/crates/punktfunk-host/src/windows/install.rs b/crates/punktfunk-host/src/windows/install.rs index fd6bd91..258f3b7 100644 --- a/crates/punktfunk-host/src/windows/install.rs +++ b/crates/punktfunk-host/src/windows/install.rs @@ -324,10 +324,17 @@ fn read_inf_text(path: &Path) -> String { } } -/// Is a punktfunk virtual-display device already enumerated? Matches the device ID / description, which +/// Is a punktfunk virtual-display device already enumerated AND connected? `/connected` is +/// load-bearing: without it a PHANTOM (disconnected) devnode left by an earlier uninstall satisfies +/// this check, the install skips creating a live ROOT node, and every session then fails "driver not +/// installed" (the host enumerates present devices only). Matches the device ID / description, which /// are NOT localized, so the substring check is locale-safe. fn pf_vdisplay_present() -> bool { - let lo = run_capture("pnputil", &["/enum-devices", "/class", "Display"]).to_ascii_lowercase(); + let lo = run_capture( + "pnputil", + &["/enum-devices", "/connected", "/class", "Display"], + ) + .to_ascii_lowercase(); lo.contains("pf_vdisplay") || lo.contains("punktfunk virtual display") }