feat(host): KDE-reliability phase 2 — pipeline retry, graceful capture teardown, refresh reconcile

Hardens the virtual-display → capture → encode bring-up against the transient
failures that surfaced as black screens / wrong refresh on cold KDE sessions.

- m3: build_pipeline_with_retry wraps the initial vd.create() + first-frame with
  bounded exponential backoff (4 attempts, 500ms→2s). is_permanent_build_error
  classifies config/version/missing-tool failures so they fail fast instead of
  burning the retry budget. Encoder + frame clock now pace to the *achieved*
  refresh reported in VirtualOutput::preferred_mode, not the requested rate.
- capture/linux: PortalCapturer::Drop sends a pipewire channel quit and joins the
  thread, so a dropped/failed/retried capturer releases its PipeWire thread + EGL/
  CUDA context promptly instead of leaking it to process exit. First-frame timeout
  now reports the node id and distinguishes "format never negotiated" from
  "negotiated but no buffers arrived" via a negotiated flag set in param_changed.
- vdisplay/kwin: set_custom_refresh reads back the active mode from kscreen-doctor
  and returns the refresh KWin actually gave us (a rejected custom mode silently
  leaves the output at 60Hz); create() carries it into preferred_mode.
- vdisplay/gamescope: find_gamescope_node requires the Video/Source object (the
  node.name=gamescope tag is on two objects; the other wedges the link); a version
  check warns on <3.16.22 (the PipeWire-1.6 capture-deadlock signature).

Live-validated against headless KWin: 720p120 build with requested=120 achieved=120,
zero-copy CUDA frames, and no per-session thread accumulation across back-to-back
sessions. Tests: +3 unit (retry classifier, gamescope version parse); 49 host tests
green, clippy/fmt clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-10 20:03:11 +00:00
parent a4eacabecd
commit 75eb8fa0d6
4 changed files with 427 additions and 53 deletions
+100 -6
View File
@@ -48,6 +48,7 @@ impl VirtualDisplay for GamescopeDisplay {
keepalive: Box::new(()),
});
}
check_gamescope_version(); // diagnostic only — warns on known-deadlock-prone versions
let proc = GamescopeProc(spawn(mode.width, mode.height, mode.refresh_hz.max(1))?);
// gamescope creates its PipeWire node a moment after start; poll for it (the proc is held
// alive meanwhile, and killed if we give up).
@@ -147,24 +148,92 @@ fn node_from_log() -> Option<u32> {
}
/// Find the `gamescope` `Video/Source` node id in a `pw-dump` snapshot of the default daemon.
///
/// `node.name=gamescope` appears on TWO objects (the adapter *and* the inner stream node); only
/// the one whose `media.class` is `Video/Source` is a valid capture target — connecting to the
/// other wedges the link. So we require `Video/Source` first and fall back to a bare name match
/// only if no class-tagged node is present (older gamescope that doesn't set media.class).
fn find_gamescope_node() -> Option<u32> {
let out = Command::new("pw-dump").output().ok()?;
let dump: serde_json::Value = serde_json::from_slice(&out.stdout).ok()?;
for obj in dump.as_array()? {
let nodes = dump.as_array()?;
let node_props = |obj: &serde_json::Value| -> Option<(u32, String, String)> {
if obj.get("type").and_then(|t| t.as_str()) != Some("PipeWire:Interface:Node") {
continue;
return None;
}
let id = obj.get("id").and_then(|i| i.as_u64())? as u32;
let props = obj.get("info").and_then(|i| i.get("props"));
let name = props
.and_then(|p| p.get("node.name"))
.and_then(|n| n.as_str())
.unwrap_or("");
.unwrap_or("")
.to_string();
let class = props
.and_then(|p| p.get("media.class"))
.and_then(|n| n.as_str())
.unwrap_or("");
if name == "gamescope" || (class == "Video/Source" && name.contains("gamescope")) {
return obj.get("id").and_then(|i| i.as_u64()).map(|x| x as u32);
.unwrap_or("")
.to_string();
Some((id, name, class))
};
// Preferred: a Video/Source node named (or containing) "gamescope".
for obj in nodes {
if let Some((id, name, class)) = node_props(obj) {
if class == "Video/Source" && (name == "gamescope" || name.contains("gamescope")) {
return Some(id);
}
}
}
// Fallback: a node literally named "gamescope" with no usable class tag.
for obj in nodes {
if let Some((id, name, _)) = node_props(obj) {
if name == "gamescope" {
tracing::warn!(
node_id = id,
"gamescope node has no media.class=Video/Source tag — capturing it anyway"
);
return Some(id);
}
}
}
None
}
/// Minimum gamescope that captures reliably: below 3.16.22, headless PipeWire capture deadlocks
/// against PipeWire ≥ 1.6 (a loop-lock bug) and a stuck link head-blocks the whole daemon.
///
/// Stored as `(major, minor, patch)` so a parsed version triple can be compared against it
/// directly — tuple ordering is lexicographic: major first, then minor, then patch.
const MIN_GAMESCOPE: (u32, u32, u32) = (3, 16, 22);
/// Diagnostic probe for the installed gamescope: runs `gamescope --version`, pulls the first
/// `X.Y.Z` triple out of what it printed, and logs a warning when that triple sorts below
/// [`MIN_GAMESCOPE`]. This never gates start-up — if the binary cannot be spawned or no version
/// can be parsed (e.g. a custom build with an unusual banner) it stays silent and yields `None`;
/// otherwise the parsed version is handed back to the caller.
fn check_gamescope_version() -> Option<(u32, u32, u32)> {
    let probe = Command::new("gamescope").arg("--version").output().ok()?;
    // Depending on the build the banner lands on stdout or on stderr, so scan both (stdout first).
    let mut banner = String::from_utf8_lossy(&probe.stdout).into_owned();
    banner.push_str(&String::from_utf8_lossy(&probe.stderr));
    let installed = parse_version(&banner)?;
    let too_old = installed < MIN_GAMESCOPE;
    if too_old {
        let (major, minor, patch) = installed;
        let (min_major, min_minor, min_patch) = MIN_GAMESCOPE;
        tracing::warn!(
            found = %format!("{}.{}.{}", major, minor, patch),
            min = %format!("{}.{}.{}", min_major, min_minor, min_patch),
            "gamescope is older than the minimum for reliable headless capture — expect a \
             capture deadlock against PipeWire ≥ 1.6 (a wedged link head-blocks the daemon); \
             upgrade gamescope or use PUNKTFUNK_COMPOSITOR=kwin|mutter"
        );
    }
    Some(installed)
}
/// Extract the first `X.Y.Z` version triple from arbitrary text (e.g. `gamescope version 3.16.22`).
///
/// The text is cut into runs of ASCII digits and dots; the first run whose leading three
/// dot-separated components all parse as `u32` wins. Extra trailing components are ignored
/// (`10.0.0.1` yields `(10, 0, 0)`), and runs with fewer than three components are skipped.
/// Separator dots stuck to either end of a run (`release.3.16.22`, or a sentence-ending
/// `3.17.0.`) are trimmed first so they cannot hide an otherwise valid triple.
///
/// Returns `None` when no run in `text` contains a full triple.
fn parse_version(text: &str) -> Option<(u32, u32, u32)> {
    text.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        .find_map(|token| {
            // `?` here only rejects the current token; `find_map` then moves on to the next one.
            let mut parts = token.trim_matches('.').split('.');
            let major = parts.next()?.parse::<u32>().ok()?;
            let minor = parts.next()?.parse::<u32>().ok()?;
            let patch = parts.next()?.parse::<u32>().ok()?;
            Some((major, minor, patch))
        })
}
@@ -179,3 +248,28 @@ impl Drop for GamescopeProc {
let _ = self.0.wait();
}
}
#[cfg(test)]
mod tests {
    use super::{parse_version, MIN_GAMESCOPE};

    /// Each banner shape maps to the expected triple, or to `None` when no full triple exists.
    #[test]
    fn parses_version_banner() {
        let cases: [(&str, Option<(u32, u32, u32)>); 5] = [
            ("gamescope version 3.16.22", Some((3, 16, 22))),
            ("gamescope: version v3.15.9 (no PipeWire)", Some((3, 15, 9))),
            ("3.16.20-1.fc41", Some((3, 16, 20))),
            ("no version here", None),
            // Two components are not enough — a full triple is required.
            ("only 3.16 here", None),
        ];
        for &(banner, expected) in cases.iter() {
            assert_eq!(parse_version(banner), expected);
        }
    }

    /// Versions on either side of the minimum compare the way the warning relies on.
    #[test]
    fn flags_known_bad_versions() {
        let version_of = |banner: &str| parse_version(banner).unwrap();
        // The 26.04-shipped 3.16.20 is below the minimum (PipeWire 1.6 deadlock).
        assert!(version_of("gamescope version 3.16.20") < MIN_GAMESCOPE);
        assert!(version_of("gamescope version 3.16.22") >= MIN_GAMESCOPE);
        assert!(version_of("gamescope version 3.17.0") >= MIN_GAMESCOPE);
    }
}
+79 -18
View File
@@ -91,17 +91,22 @@ impl VirtualDisplay for KwinDisplay {
};
tracing::info!(node_id, width, height, "KWin virtual output ready");
// KWin creates virtual outputs at a hardcoded 60 Hz and `stream_virtual_output` has no
// refresh argument, so when the client wants more we install + select a custom mode
// (supported on virtual outputs since KWin 6.6). Done before capture connects PipeWire so
// the stream negotiates at the higher rate. First cut shells out to kscreen-doctor; the
// in-process kde_output_management_v2 client is a follow-up.
if mode.refresh_hz > 60 {
set_custom_refresh(width, height, mode.refresh_hz);
}
// refresh argument, so above 60 Hz we install + select a custom mode (supported on virtual
// outputs since KWin 6.6) before capture connects PipeWire, so the stream negotiates at the
// higher rate. First cut shells out to kscreen-doctor; the in-process
// kde_output_management_v2 client is a follow-up. `set_custom_refresh` reads back and
// returns what KWin *actually* achieved so the encoder paces to the real source rate (a
// rejected custom mode leaves the output at 60 Hz). At ≤60 Hz there's nothing to install —
// the source runs 60 Hz and the encoder downsamples — so carry the requested rate through.
let achieved_hz = if mode.refresh_hz > 60 {
set_custom_refresh(width, height, mode.refresh_hz)
} else {
mode.refresh_hz
};
Ok(VirtualOutput {
node_id,
remote_fd: None,
preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)),
preferred_mode: Some((mode.width, mode.height, achieved_hz)),
keepalive: Box::new(StopGuard(stop)),
})
}
@@ -109,8 +114,11 @@ impl VirtualDisplay for KwinDisplay {
/// Best-effort: raise the just-created virtual output's refresh above KWin's default 60 Hz by
/// installing + selecting a custom mode via `kscreen-doctor` (the output is `Virtual-<VOUT_NAME>`,
/// refresh given in mHz). Failure leaves the source at 60 Hz — the stream still works, just capped.
fn set_custom_refresh(width: u32, height: u32, hz: u32) {
/// refresh given in mHz), then **read back the active mode** and return the refresh KWin actually
/// gave us. The apply command can report success yet leave the output at 60 Hz (mode rejected),
/// and a silent rate mismatch surfaces downstream as judder / duplicated frames — so the caller
/// paces the encoder to the *achieved* rate, not the requested one.
fn set_custom_refresh(width: u32, height: u32, hz: u32) -> u32 {
let output = format!("Virtual-{VOUT_NAME}");
let mhz = hz.saturating_mul(1000);
let run = |arg: String| {
@@ -124,17 +132,70 @@ fn set_custom_refresh(width: u32, height: u32, hz: u32) {
let _ = run(format!(
"output.{output}.addCustomMode.{width}.{height}.{mhz}.full"
));
if run(format!("output.{output}.mode.{width}x{height}@{hz}")) {
tracing::info!(output, hz, "KWin virtual output: custom refresh applied");
} else {
tracing::warn!(
output,
hz,
"kscreen-doctor refresh set failed — source stays 60 Hz (is kscreen-doctor installed?)"
);
let applied = run(format!("output.{output}.mode.{width}x{height}@{hz}"));
match read_active_refresh(&output) {
Some(achieved) if achieved >= hz => {
tracing::info!(
output,
requested = hz,
achieved,
"KWin virtual output: custom refresh applied"
);
achieved
}
Some(achieved) => {
tracing::warn!(
output,
requested = hz,
achieved,
applied,
"KWin virtual output refresh below requested — pacing the encoder to the achieved \
rate (custom-mode install rejected? is kscreen-doctor up to date?)"
);
achieved.max(1)
}
None => {
tracing::warn!(
output,
requested = hz,
applied,
"could not read back KWin virtual output refresh — assuming 60 Hz (is \
kscreen-doctor installed?)"
);
60
}
}
}
/// Ask `kscreen-doctor -j` which mode `output` is currently running and return that mode's
/// refresh rate in Hz, rounded to the nearest integer.
///
/// Yields `None` when the tool cannot be spawned, its stdout is not JSON, the named output is
/// absent, or the output's current mode (or that mode's `refreshRate`) cannot be resolved.
/// Output/mode ids arrive as JSON strings on some KWin versions and as numbers on others, so
/// both spellings are normalised to text before they are compared.
fn read_active_refresh(output: &str) -> Option<u32> {
    /// Render a mode id — JSON string or unsigned number — as text.
    fn id_text(value: &serde_json::Value) -> Option<String> {
        match value.as_str() {
            Some(text) => Some(text.to_string()),
            None => value.as_u64().map(|num| num.to_string()),
        }
    }

    let dump = std::process::Command::new("kscreen-doctor")
        .arg("-j")
        .output()
        .ok()?;
    let root: serde_json::Value = serde_json::from_slice(&dump.stdout).ok()?;

    // Locate the entry for the requested output by its exact name.
    let mut wanted = None;
    for candidate in root.get("outputs")?.as_array()? {
        if candidate.get("name").and_then(|n| n.as_str()) == Some(output) {
            wanted = Some(candidate);
            break;
        }
    }
    let target = wanted?;

    // Resolve the active mode id against the output's mode list; the first match decides.
    let active_id = id_text(target.get("currentModeId")?)?;
    for mode in target.get("modes")?.as_array()? {
        let Some(mode_id) = mode.get("id").and_then(id_text) else {
            continue;
        };
        if mode_id == active_id {
            let rate = mode.get("refreshRate")?.as_f64()?;
            return Some(rate.round() as u32);
        }
    }
    None
}
/// Dropping this releases the KWin virtual output: it flips the keepalive thread's `stop`, which
/// drops the Wayland connection and makes KWin reclaim the output.
///
/// Wraps the `stop` flag that `create()` boxes into `VirtualOutput::keepalive`, so the output
/// lives exactly as long as the returned `VirtualOutput` holds this guard.
/// NOTE(review): the `Drop` impl and the keepalive thread are outside this view — confirm the
/// release sequence described above against them.
struct StopGuard(Arc<AtomicBool>);