From 8cde8621cef8f882422f2e5e0f65c2ba239da408 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Thu, 25 Jun 2026 22:11:36 +0000 Subject: [PATCH] fix(windows-drivers): reclaim pf-vdisplay monitor ids on REMOVE (P1, slot-reclaim) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver assigned each virtual monitor a monotonically-increasing NEXT_ID used as the EDID serial / IddCx ConnectorIndex / container GUID, and never reclaimed it on REMOVE. Under sustained ADD/REMOVE churn the connector index kept climbing, so IddCx/PnP allocated a NEW OS target slot every cycle and orphaned the old one (ghost "Generic Monitor (punktfunk)" nodes) until the adapter's target capacity was exhausted and ADD failed 0x80070490 ERROR_NOT_FOUND. Fix: `create_monitor` now allocates the LOWEST free id (`alloc_monitor_id`, computed under the MONITOR_MODES lock with the push) instead of a counter, so a departed monitor's id is reclaimed and a fresh ADD reuses its target slot rather than orphaning it. With <= N live monitors the id stays bounded to 1..=N+1. Deleted the now-unused NEXT_ID + AtomicU32/Ordering import. CI-compile-gated only — the wedge reproduces solely under sustained churn on the RTX box, so this needs an on-glass reconnect-storm A/B to confirm (box is ephemeral/down). Marked on-glass-pending in windows-host-rewrite.md §4; keep reset-pf-vdisplay.ps1 as the recovery until validated. NOT to be relied on (or merged to main) until that A/B passes. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/windows-host-rewrite.md | 12 ++++--- .../drivers/pf-vdisplay/src/monitor.rs | 34 +++++++++++++------ 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/docs/windows-host-rewrite.md b/docs/windows-host-rewrite.md index e94ee7f..15453c2 100644 --- a/docs/windows-host-rewrite.md +++ b/docs/windows-host-rewrite.md @@ -211,10 +211,14 @@ These are expensive empirical wins; keep them intact when touching the code: 2. **Make IDD-push the default** — today it is gated behind `PUNKTFUNK_IDD_PUSH` (`config.rs` default `false`); deployment sets it in `host.env`. Flip the code default (with the WGC/DDA fallback already in place) so a fresh install runs the validated path, or document the `host.env` requirement explicitly. -3. **pf-vdisplay slot reclaim on REMOVE** (driver robustness) — sustained ADD/REMOVE churn wedges the - driver (`ADD → 0x80070490 ERROR_NOT_FOUND`) because IddCx monitor slots aren't reclaimed (ghost nodes - accumulate). Workaround today: `packaging/windows/reset-pf-vdisplay.ps1`. Real fix in `control.rs`/ - `monitor.rs`. On-glass-gated. +3. **pf-vdisplay slot reclaim on REMOVE** (driver robustness) — 🟡 **fix landed, on-glass-validation + pending.** Sustained ADD/REMOVE churn wedged the driver (`ADD → 0x80070490 ERROR_NOT_FOUND`) because the + monitor id (EDID serial / `ConnectorIndex` / container GUID) was a **monotonic** `NEXT_ID`, never + reclaimed → IddCx accumulated a new OS target slot per cycle until exhaustion. `monitor.rs` now allocates + the **lowest free id** (`alloc_monitor_id`), reused on REMOVE, so a fresh ADD reuses the departed + monitor's target slot instead of orphaning it. CI-compile-gated; the wedge only reproduces under + sustained churn on the RTX box, so this needs an **on-glass reconnect-storm A/B** to confirm (the box is + ephemeral). Keep `packaging/windows/reset-pf-vdisplay.ps1` as the recovery until validated. **P2 — hygiene / architecture completion** 4. **D1-host — host-crate P0 lints.** Add `#![deny(unsafe_op_in_unsafe_fn)]` + diff --git a/packaging/windows/drivers/pf-vdisplay/src/monitor.rs b/packaging/windows/drivers/pf-vdisplay/src/monitor.rs index 0172ff3..2c4620b 100644 --- a/packaging/windows/drivers/pf-vdisplay/src/monitor.rs +++ b/packaging/windows/drivers/pf-vdisplay/src/monitor.rs @@ -5,7 +5,6 @@ //! `guid: u128` → `session_id: u64` for the owned `pf_vdisplay_proto` control plane. use std::sync::Mutex; -use std::sync::atomic::{AtomicU32, Ordering}; use std::time::{Duration, Instant}; use wdk_sys::iddcx; @@ -69,8 +68,6 @@ unsafe impl Send for MonitorObject {} /// thread ([`crate::control::start_watchdog`]) races device cleanup — for no real gain. Cleanup of the /// heavy per-monitor resources on device removal is instead done explicitly ([`cleanup_for_device_removal`]). pub static MONITOR_MODES: Mutex> = Mutex::new(Vec::new()); -/// Monitor id / EDID-serial counter (unique per created monitor). -static NEXT_ID: AtomicU32 = AtomicU32::new(1); /// True if any virtual monitor currently exists — the host-gone watchdog only reaps when there's /// something to reap (see [`crate::control::start_watchdog`]). @@ -323,8 +320,6 @@ pub fn create_monitor( dbglog!("[pf-vd] create_monitor: session {session_id} already live — departing the stale monitor"); remove_monitor(session_id); } - let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); - let mut modes = vec![Mode { width, height, @@ -332,8 +327,17 @@ pub fn create_monitor( }]; modes.extend(default_modes()); - // Register the (pending) monitor so the mode DDIs can find it by EDID-serial id before arrival. - if let Ok(mut lock) = MONITOR_MODES.lock() { + // Register the (pending) monitor so the mode DDIs can find it by EDID-serial id before arrival, under a + // REUSED id (the lowest not currently live). Reclaiming the id on REMOVE — instead of a monotonic + // counter — keeps the connector index / EDID serial / container GUID bounded, so IddCx reuses the same + // OS target slot on a fresh ADD rather than leaving a ghost monitor node behind (the slot-exhaustion + // wedge: sustained ADD/REMOVE churn eventually makes ADD fail 0x80070490 ERROR_NOT_FOUND). Allocated + // under the lock with the push so two concurrent ADDs can't pick the same id. + let id = { + let Ok(mut lock) = MONITOR_MODES.lock() else { + return None; + }; + let id = alloc_monitor_id(&lock); lock.push(MonitorObject { object: None, id, @@ -345,9 +349,8 @@ pub fn create_monitor( swap_chain_processor: None, created_at: Instant::now(), }); - } else { - return None; - } + id + }; // EDID (serial = id) describes the monitor; the OS calls back into parse_monitor_description. let mut edid = crate::edid::Edid::generate_with(id); @@ -505,6 +508,17 @@ fn remove_by_id(id: u32) { } } +/// The lowest monitor id (≥1) not currently live. Reusing freed ids (instead of a monotonic counter) keeps +/// the connector index / EDID serial / container GUID bounded to the number of concurrent monitors, so a +/// fresh ADD reuses a departed monitor's OS target slot rather than allocating a new one and orphaning the +/// old (the ghost-monitor accumulation that wedges ADD at 0x80070490 ERROR_NOT_FOUND). Caller holds +/// `MONITOR_MODES`. With ≤ N live ids, a free one always exists in `1..=N+1` (pigeonhole). +fn alloc_monitor_id(modes: &[MonitorObject]) -> u32 { + (1u32..=modes.len() as u32 + 1) + .find(|id| !modes.iter().any(|m| m.id == *id)) + .unwrap_or(1) +} + /// A deterministic, monitor-unique container GUID (groups targets into a physical device). Derived from /// `id` so it is stable + collision-free without a random source. fn container_guid(id: u32) -> wdk_sys::GUID {