fix(windows-host): cross-plane IDD serialization, linger-expiry race, second-host guard

Batch C of the audit's medium tier (M7+M8+M9):

- M7: GameStream sessions now run the same begin_idd_setup dance as
  punktfunk/1 before creating the shared monitor. A GS connect could
  previously ADD/reconfigure the monitor while a native session was
  mid-build (and vice versa), and its sealed-channel delivery replaced the
  native ring (newest-wins) — each plane could freeze the other. GS has no
  cooperative stop plumbing, so it registers a flag nobody reads: a later
  session signals it, waits the 3 s grace, then force-preempts — the
  intended handover.
- M8: the linger-expiry teardown now runs UNDER the state lock. Running it
  outside let a concurrent acquire see Idle and ADD+isolate while the old
  monitor's pinger-join / CCD-restore / REMOVE were still in flight — a
  failed or de-isolated session exactly at the expiry boundary. A racing
  acquire now waits the few teardown seconds instead. Lock order stays
  state → device everywhere; the pinger takes only the device lock.
- M9: a named mutex (Global\punktfunk-vdisplay-manager) makes a SECOND host
  process fail its vdisplay open loudly instead of firing a startup
  CLEAR_ALL that razes the live host's monitors mid-stream (the admin
  footgun the shared watchdog then masked).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-03 17:28:22 +00:00
parent b46aa15afb
commit fe54aff658
2 changed files with 79 additions and 30 deletions
@@ -269,6 +269,23 @@ fn open_gs_virtual_source(
));
#[cfg(not(target_os = "linux"))]
vd.set_launch_command(app.and_then(|a| a.cmd.clone()));
// Serialize with the punktfunk/1 plane's IDD-push setup dance (Goal-1 §2.5). A GameStream
// connect used to skip it entirely, so it could ADD/reconfigure the shared monitor while a
// native session was mid-build (and vice versa), and its sealed-channel delivery would replace
// the native session's ring (newest-wins) — each plane could freeze the other. GameStream has
// no cooperative stop-flag plumbing, so it registers a flag nobody reads: a LATER session that
// preempts this one signals it, waits the 3 s release grace, then force-preempts the monitor —
// this session then fails on capture and tears down cleanly (the intended handover).
#[cfg(target_os = "windows")]
let _idd_setup_guard = matches!(
crate::session_plan::CaptureBackend::resolve(),
crate::session_plan::CaptureBackend::IddPush
)
.then(|| {
crate::vdisplay::manager::vdm().begin_idd_setup(std::sync::Arc::new(
std::sync::atomic::AtomicBool::new(false),
))
});
let vout = vd
.create(punktfunk_core::Mode {
width: cfg.width,
@@ -16,15 +16,20 @@
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use std::os::windows::io::{AsRawHandle, OwnedHandle};
use std::os::windows::io::{AsRawHandle, FromRawHandle, OwnedHandle};
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Mutex, Once, OnceLock};
use std::thread::{self, JoinHandle};
use std::time::{Duration, Instant};
use anyhow::Result;
use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID, WAIT_OBJECT_0};
use windows::Win32::System::Threading::{OpenProcess, WaitForSingleObject, PROCESS_SYNCHRONIZE};
use anyhow::{Context, Result};
use windows::core::w;
use windows::Win32::Foundation::{
CloseHandle, GetLastError, ERROR_ALREADY_EXISTS, HANDLE, LUID, WAIT_OBJECT_0,
};
use windows::Win32::System::Threading::{
CreateMutexW, OpenProcess, WaitForSingleObject, PROCESS_SYNCHRONIZE,
};
use super::{Mode, VirtualOutput};
use crate::win_display::{
@@ -144,6 +149,9 @@ struct DeviceSlot {
/// `CLEAR_ALL` (crashed-host orphan reap) runs only on the FIRST open of the process; a reopen
/// races sessions this process still considers live and must not raze them.
opened_once: bool,
/// The cross-process single-instance mutex (`Global\punktfunk-vdisplay-manager`), acquired on
/// the first open and held — never released — for the process lifetime.
instance_guard: Option<OwnedHandle>,
}
/// The host-lifetime virtual-display manager: the single owner of the monitor lifecycle.
@@ -208,6 +216,33 @@ pub(crate) fn control_device_handle() -> Option<HANDLE> {
/// next use reopens. The root `windows` error survives anyhow `.context` chains via `downcast_ref`.
/// NOTE: 0x80070490 (ERROR_NOT_FOUND, the ADD slot-exhaustion wedge) is deliberately NOT here — it
/// has its own reap-and-retry handling and the device is alive when it fires.
/// The cross-process single-instance guard for pf-vdisplay management. A SECOND host process's
/// first device open used to fire `IOCTL_CLEAR_ALL` and raze the live host's monitors mid-stream —
/// an admin footgun (run `punktfunk-host serve` while the SCM service streams), masked afterwards
/// because both processes' pings satisfy the shared driver watchdog. The named mutex makes the
/// second process fail its vdisplay open LOUDLY instead. Held, never released, for the process
/// lifetime; the OS reclaims it (and frees the name) when the process exits, however it exits.
fn acquire_single_instance() -> Result<OwnedHandle> {
// SAFETY: plain FFI create of a named mutex; the returned handle (checked by `?`) is solely
// owned by the `OwnedHandle`, and `GetLastError` is read immediately after the create — the
// documented ERROR_ALREADY_EXISTS protocol for pre-existing named objects.
unsafe {
let h = CreateMutexW(None, false, w!("Global\\punktfunk-vdisplay-manager"))
.context("CreateMutexW(punktfunk-vdisplay single-instance guard)")?;
let already = GetLastError() == ERROR_ALREADY_EXISTS;
let owned = OwnedHandle::from_raw_handle(h.0 as _);
if already {
anyhow::bail!(
"another punktfunk-host process is already managing pf-vdisplay on this machine — \
refusing to touch the driver (a second manager's startup CLEAR_ALL would raze the \
live host's monitors mid-stream). Stop the other instance (e.g. `punktfunk-host \
service stop`) first."
);
}
Ok(owned)
}
}
/// Best-effort "is this WUDFHost pid still alive?" — the monitor-liveness probe for the JOIN path.
/// `OpenProcess` failing (pid reaped) or the process being signaled ⇒ dead. Pid reuse could
/// theoretically alias a fresh process and read "alive"; the joining session then just retries into
@@ -261,6 +296,9 @@ impl VirtualDisplayManager {
return Ok(HANDLE(d.as_raw_handle()));
}
let reap = !slot.opened_once;
if slot.instance_guard.is_none() {
slot.instance_guard = Some(acquire_single_instance()?);
}
// SAFETY: `VdisplayDriver::open` is `unsafe` only because it issues SetupAPI + `DeviceIoControl`
// FFI in the caller's apartment; the `device` mutex (held here) serializes it, so there is no
// concurrent open. `open` has no handle precondition to uphold, and the `OwnedHandle` it
@@ -770,35 +808,29 @@ impl VirtualDisplayManager {
.name("vdisplay-linger".into())
.spawn(move || loop {
thread::sleep(Duration::from_millis(500));
let due = {
let g = self.state.lock().unwrap();
matches!(&*g, MgrState::Lingering { until, .. } if Instant::now() >= *until)
};
if !due {
continue;
}
let Some(dev) = self.device_handle() else {
continue;
};
let taken = {
let mut g = self.state.lock().unwrap();
if matches!(&*g, MgrState::Lingering { until, .. } if Instant::now() >= *until) {
if !matches!(&*g, MgrState::Lingering { until, .. } if Instant::now() >= *until)
{
continue;
}
if let MgrState::Lingering { mon, .. } =
std::mem::replace(&mut *g, MgrState::Idle)
{
Some(mon)
} else {
None
}
} else {
None
}
};
if let Some(mon) = taken {
// SAFETY: `teardown` requires `dev` to be the live control handle; `dev` is from
// `self.device_handle()` (the `Some` checked just above), i.e. the cached
// `OwnedHandle` live for the process lifetime. `mon` was moved out of the
// `Lingering` state under the `state` lock, so it is exclusively owned here.
// Teardown UNDER the state lock. Dropping the lock first (the old shape) let a
// concurrent `acquire` see Idle and run its ADD + CCD isolate while this
// monitor's pinger-join / CCD-restore / REMOVE were still in flight — the late
// restore then de-isolated (or the REMOVE churn-rejected) the fresh session at
// the linger-expiry boundary. Holding the lock makes the racing acquire WAIT
// the few teardown seconds instead of failing its session. Lock order stays
// state → device (teardown's invalidate path), same as every other holder; the
// pinger takes only the device lock — no inversion.
// SAFETY: `teardown` requires a valid control handle; `dev` is from
// `self.device_handle()` (cached handles are never closed — a dead one is
// retired, kept alive; see `DeviceSlot`). `mon` was moved out of the replaced
// state under the lock, so it is exclusively owned here.
unsafe { self.teardown(dev, mon) };
}
})