From fe54aff6583941e01d403c43a2433ca5fc0400a9 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Fri, 3 Jul 2026 17:28:22 +0000 Subject: [PATCH] fix(windows-host): cross-plane IDD serialization, linger-expiry race, second-host guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Batch C of the audit's medium tier (M7+M8+M9): - M7: GameStream sessions now run the same begin_idd_setup dance as punktfunk/1 before creating the shared monitor. A GS connect could previously ADD/reconfigure the monitor while a native session was mid-build (and vice versa), and its sealed-channel delivery replaced the native ring (newest-wins) — each plane could freeze the other. GS has no cooperative stop plumbing, so it registers a flag nobody reads: a later session signals it, waits the 3 s grace, then force-preempts — the intended handover. - M8: the linger-expiry teardown now runs UNDER the state lock. Running it outside let a concurrent acquire see Idle and ADD+isolate while the old monitor's pinger-join / CCD-restore / REMOVE were still in flight — a failed or de-isolated session exactly at the expiry boundary. A racing acquire now waits the few teardown seconds instead. Lock order stays state → device everywhere; the pinger takes only the device lock. - M9: a named mutex (Global\punktfunk-vdisplay-manager) makes a SECOND host process fail its vdisplay open loudly instead of firing a startup CLEAR_ALL that razes the live host's monitors mid-stream (the admin footgun the shared watchdog then masked). Co-Authored-By: Claude Fable 5 --- .../punktfunk-host/src/gamestream/stream.rs | 17 ++++ .../src/vdisplay/windows/manager.rs | 92 +++++++++++++------ 2 files changed, 79 insertions(+), 30 deletions(-) diff --git a/crates/punktfunk-host/src/gamestream/stream.rs b/crates/punktfunk-host/src/gamestream/stream.rs index 63ecd0f..50fb532 100644 --- a/crates/punktfunk-host/src/gamestream/stream.rs +++ b/crates/punktfunk-host/src/gamestream/stream.rs @@ -269,6 +269,23 @@ fn open_gs_virtual_source( )); #[cfg(not(target_os = "linux"))] vd.set_launch_command(app.and_then(|a| a.cmd.clone())); + // Serialize with the punktfunk/1 plane's IDD-push setup dance (Goal-1 §2.5). A GameStream + // connect used to skip it entirely, so it could ADD/reconfigure the shared monitor while a + // native session was mid-build (and vice versa), and its sealed-channel delivery would replace + // the native session's ring (newest-wins) — each plane could freeze the other. GameStream has + // no cooperative stop-flag plumbing, so it registers a flag nobody reads: a LATER session that + // preempts this one signals it, waits the 3 s release grace, then force-preempts the monitor — + // this session then fails on capture and tears down cleanly (the intended handover). + #[cfg(target_os = "windows")] + let _idd_setup_guard = matches!( + crate::session_plan::CaptureBackend::resolve(), + crate::session_plan::CaptureBackend::IddPush + ) + .then(|| { + crate::vdisplay::manager::vdm().begin_idd_setup(std::sync::Arc::new( + std::sync::atomic::AtomicBool::new(false), + )) + }); let vout = vd .create(punktfunk_core::Mode { width: cfg.width, diff --git a/crates/punktfunk-host/src/vdisplay/windows/manager.rs b/crates/punktfunk-host/src/vdisplay/windows/manager.rs index 7f7c6c8..dd3ae89 100644 --- a/crates/punktfunk-host/src/vdisplay/windows/manager.rs +++ b/crates/punktfunk-host/src/vdisplay/windows/manager.rs @@ -16,15 +16,20 @@ // Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). #![deny(clippy::undocumented_unsafe_blocks)] -use std::os::windows::io::{AsRawHandle, OwnedHandle}; +use std::os::windows::io::{AsRawHandle, FromRawHandle, OwnedHandle}; use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Mutex, Once, OnceLock}; use std::thread::{self, JoinHandle}; use std::time::{Duration, Instant}; -use anyhow::Result; -use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID, WAIT_OBJECT_0}; -use windows::Win32::System::Threading::{OpenProcess, WaitForSingleObject, PROCESS_SYNCHRONIZE}; +use anyhow::{Context, Result}; +use windows::core::w; +use windows::Win32::Foundation::{ + CloseHandle, GetLastError, ERROR_ALREADY_EXISTS, HANDLE, LUID, WAIT_OBJECT_0, +}; +use windows::Win32::System::Threading::{ + CreateMutexW, OpenProcess, WaitForSingleObject, PROCESS_SYNCHRONIZE, +}; use super::{Mode, VirtualOutput}; use crate::win_display::{ @@ -144,6 +149,9 @@ struct DeviceSlot { /// `CLEAR_ALL` (crashed-host orphan reap) runs only on the FIRST open of the process; a reopen /// races sessions this process still considers live and must not raze them. opened_once: bool, + /// The cross-process single-instance mutex (`Global\punktfunk-vdisplay-manager`), acquired on + /// the first open and held — never released — for the process lifetime. + instance_guard: Option, } /// The host-lifetime virtual-display manager: the single owner of the monitor lifecycle. @@ -208,6 +216,33 @@ pub(crate) fn control_device_handle() -> Option { /// next use reopens. The root `windows` error survives anyhow `.context` chains via `downcast_ref`. /// NOTE: 0x80070490 (ERROR_NOT_FOUND, the ADD slot-exhaustion wedge) is deliberately NOT here — it /// has its own reap-and-retry handling and the device is alive when it fires. +/// The cross-process single-instance guard for pf-vdisplay management. A SECOND host process's +/// first device open used to fire `IOCTL_CLEAR_ALL` and raze the live host's monitors mid-stream — +/// an admin footgun (run `punktfunk-host serve` while the SCM service streams), masked afterwards +/// because both processes' pings satisfy the shared driver watchdog. The named mutex makes the +/// second process fail its vdisplay open LOUDLY instead. Held, never released, for the process +/// lifetime; the OS reclaims it (and frees the name) when the process exits, however it exits. +fn acquire_single_instance() -> Result { + // SAFETY: plain FFI create of a named mutex; the returned handle (checked by `?`) is solely + // owned by the `OwnedHandle`, and `GetLastError` is read immediately after the create — the + // documented ERROR_ALREADY_EXISTS protocol for pre-existing named objects. + unsafe { + let h = CreateMutexW(None, false, w!("Global\\punktfunk-vdisplay-manager")) + .context("CreateMutexW(punktfunk-vdisplay single-instance guard)")?; + let already = GetLastError() == ERROR_ALREADY_EXISTS; + let owned = OwnedHandle::from_raw_handle(h.0 as _); + if already { + anyhow::bail!( + "another punktfunk-host process is already managing pf-vdisplay on this machine — \ + refusing to touch the driver (a second manager's startup CLEAR_ALL would raze the \ + live host's monitors mid-stream). Stop the other instance (e.g. `punktfunk-host \ + service stop`) first." + ); + } + Ok(owned) + } +} + /// Best-effort "is this WUDFHost pid still alive?" — the monitor-liveness probe for the JOIN path. /// `OpenProcess` failing (pid reaped) or the process being signaled ⇒ dead. Pid reuse could /// theoretically alias a fresh process and read "alive"; the joining session then just retries into @@ -261,6 +296,9 @@ impl VirtualDisplayManager { return Ok(HANDLE(d.as_raw_handle())); } let reap = !slot.opened_once; + if slot.instance_guard.is_none() { + slot.instance_guard = Some(acquire_single_instance()?); + } // SAFETY: `VdisplayDriver::open` is `unsafe` only because it issues SetupAPI + `DeviceIoControl` // FFI in the caller's apartment; the `device` mutex (held here) serializes it, so there is no // concurrent open. `open` has no handle precondition to uphold, and the `OwnedHandle` it @@ -770,35 +808,29 @@ impl VirtualDisplayManager { .name("vdisplay-linger".into()) .spawn(move || loop { thread::sleep(Duration::from_millis(500)); - let due = { - let g = self.state.lock().unwrap(); - matches!(&*g, MgrState::Lingering { until, .. } if Instant::now() >= *until) - }; - if !due { - continue; - } let Some(dev) = self.device_handle() else { continue; }; - let taken = { - let mut g = self.state.lock().unwrap(); - if matches!(&*g, MgrState::Lingering { until, .. } if Instant::now() >= *until) { - if let MgrState::Lingering { mon, .. } = - std::mem::replace(&mut *g, MgrState::Idle) - { - Some(mon) - } else { - None - } - } else { - None - } - }; - if let Some(mon) = taken { - // SAFETY: `teardown` requires `dev` to be the live control handle; `dev` is from - // `self.device_handle()` (the `Some` checked just above), i.e. the cached - // `OwnedHandle` live for the process lifetime. `mon` was moved out of the - // `Lingering` state under the `state` lock, so it is exclusively owned here. + let mut g = self.state.lock().unwrap(); + if !matches!(&*g, MgrState::Lingering { until, .. } if Instant::now() >= *until) + { + continue; + } + if let MgrState::Lingering { mon, .. } = + std::mem::replace(&mut *g, MgrState::Idle) + { + // Teardown UNDER the state lock. Dropping the lock first (the old shape) let a + // concurrent `acquire` see Idle and run its ADD + CCD isolate while this + // monitor's pinger-join / CCD-restore / REMOVE were still in flight — the late + // restore then de-isolated (or the REMOVE churn-rejected) the fresh session at + // the linger-expiry boundary. Holding the lock makes the racing acquire WAIT + // the few teardown seconds instead of failing its session. Lock order stays + // state → device (teardown's invalidate path), same as every other holder; the + // pinger takes only the device lock — no inversion. + // SAFETY: `teardown` requires a valid control handle; `dev` is from + // `self.device_handle()` (cached handles are never closed — a dead one is + // retired, kept alive; see `DeviceSlot`). `mon` was moved out of the replaced + // state under the lock, so it is exclusively owned here. unsafe { self.teardown(dev, mon) }; } })