feat(host): KDE-reliability phase 2 — pipeline retry, graceful capture teardown, refresh reconcile
Hardens the virtual-display → capture → encode bring-up against the transient failures that surfaced as black screens / wrong refresh on cold KDE sessions. - m3: build_pipeline_with_retry wraps the initial vd.create() + first-frame with bounded exponential backoff (4 attempts, 500ms→2s). is_permanent_build_error classifies config/version/missing-tool failures so they fail fast instead of burning the retry budget. Encoder + frame clock now pace to the *achieved* refresh reported in VirtualOutput::preferred_mode, not the requested rate. - capture/linux: PortalCapturer::Drop sends a pipewire channel quit and joins the thread, so a dropped/failed/retried capturer releases its PipeWire thread + EGL/CUDA context promptly instead of leaking it to process exit. First-frame timeout now reports the node id and distinguishes "format never negotiated" from "negotiated but no buffers arrived" via a negotiated flag set in param_changed. - vdisplay/kwin: set_custom_refresh reads back the active mode from kscreen-doctor and returns the refresh KWin actually gave us (a rejected custom mode silently leaves the output at 60Hz); create() carries it into preferred_mode. - vdisplay/gamescope: find_gamescope_node requires the Video/Source object (the node.name=gamescope tag is on two objects; the other wedges the link); a version check warns on <3.16.22 (the PipeWire-1.6 capture-deadlock signature). Live-validated against headless KWin: 720p120 build with requested=120 achieved=120, zero-copy CUDA frames, and no per-session thread accumulation across back-to-back sessions. Tests: +3 unit (retry classifier, gamescope version parse); 49 host tests green, clippy/fmt clean. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -822,7 +822,8 @@ fn virtual_stream(
|
||||
let compositor = crate::vdisplay::detect().context("detect compositor")?;
|
||||
tracing::info!(?compositor, ?mode, "punktfunk/1 virtual display");
|
||||
let mut vd = crate::vdisplay::open(compositor)?;
|
||||
let (mut capturer, mut enc, mut frame, mut interval) = build_pipeline(&mut vd, mode)?;
|
||||
let (mut capturer, mut enc, mut frame, mut interval) =
|
||||
build_pipeline_with_retry(&mut vd, mode)?;
|
||||
|
||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(seconds as u64);
|
||||
let mut next = std::time::Instant::now();
|
||||
@@ -885,11 +886,98 @@ type Pipeline = (
|
||||
std::time::Duration,
|
||||
);
|
||||
|
||||
/// Build the pipeline, retrying *transient* failures with bounded exponential backoff.
|
||||
///
|
||||
/// Bringing a virtual output to first-frame races several async steps — the compositor parenting
|
||||
/// the output, the portal/RemoteDesktop grant, PipeWire format negotiation — any of which can
|
||||
/// momentarily time out on a cold session. A single timed-out attempt shouldn't abort the whole
|
||||
/// punktfunk/1 session. But a *permanent* failure (unsupported compositor/mode, a KWin too old to
|
||||
/// create virtual outputs, a missing tool) must fail fast instead of burning the budget — so the
|
||||
/// error chain is classified and permanent ones short-circuit. Each failed attempt drops its
|
||||
/// capturer, which (via `PortalCapturer::Drop`) tears the PipeWire thread + virtual output down
|
||||
/// before the next attempt — no leak across retries.
|
||||
fn build_pipeline_with_retry(
|
||||
vd: &mut Box<dyn crate::vdisplay::VirtualDisplay>,
|
||||
mode: punktfunk_core::Mode,
|
||||
) -> Result<Pipeline> {
|
||||
const MAX_ATTEMPTS: u32 = 4;
|
||||
let mut backoff = std::time::Duration::from_millis(500);
|
||||
for attempt in 1..=MAX_ATTEMPTS {
|
||||
match build_pipeline(vd, mode) {
|
||||
Ok(pipe) => {
|
||||
if attempt > 1 {
|
||||
tracing::info!(attempt, "pipeline up after retry");
|
||||
}
|
||||
return Ok(pipe);
|
||||
}
|
||||
Err(e) => {
|
||||
let chain = format!("{e:#}");
|
||||
let permanent = is_permanent_build_error(&chain);
|
||||
if permanent || attempt == MAX_ATTEMPTS {
|
||||
let why = if permanent {
|
||||
"permanent"
|
||||
} else {
|
||||
"out of retries"
|
||||
};
|
||||
return Err(e).with_context(|| {
|
||||
format!("pipeline build failed ({why}) after {attempt} attempt(s)")
|
||||
});
|
||||
}
|
||||
tracing::warn!(
|
||||
attempt,
|
||||
max = MAX_ATTEMPTS,
|
||||
backoff_ms = backoff.as_millis() as u64,
|
||||
error = %chain,
|
||||
"pipeline build failed — retrying"
|
||||
);
|
||||
std::thread::sleep(backoff);
|
||||
backoff = (backoff * 2).min(std::time::Duration::from_secs(2));
|
||||
}
|
||||
}
|
||||
}
|
||||
unreachable!("the final attempt returns inside the loop")
|
||||
}
|
||||
|
||||
/// Decide whether a rendered pipeline-build error chain describes a *permanent* failure, i.e.
/// one that another attempt within the same session will not fix.
///
/// The chain is lower-cased (ASCII only) and searched for a fixed set of signature substrings
/// covering configuration, version and missing-tool failures. Any chain that contains none of
/// them — timeouts, negotiation races, busy devices — is reported as transient (`false`). The
/// list is deliberately short: wrongly treating a permanent error as transient only costs a
/// few seconds of retries before it fails anyway.
fn is_permanent_build_error(chain: &str) -> bool {
    // Signatures are stored lower-case because the haystack is lower-cased before matching.
    const SIGNATURES: &[&str] = &[
        "virtual displays require linux",
        "unknown punktfunk_compositor",
        "could not detect compositor",
        "could not find output", // KWin < 6.5.6: createVirtualOutput unsupported
        "must be a node id",     // PUNKTFUNK_GAMESCOPE_NODE not an integer
        "is it installed",       // gamescope / kscreen-doctor not on PATH
    ];

    let haystack = chain.to_ascii_lowercase();
    for &needle in SIGNATURES {
        if haystack.contains(needle) {
            return true;
        }
    }
    false
}
|
||||
|
||||
fn build_pipeline(
|
||||
vd: &mut Box<dyn crate::vdisplay::VirtualDisplay>,
|
||||
mode: punktfunk_core::Mode,
|
||||
) -> Result<Pipeline> {
|
||||
let vout = vd.create(mode).context("create virtual output")?;
|
||||
// The backend reports the refresh it actually achieved in `preferred_mode.2` (KWin may cap a
|
||||
// virtual output at 60 Hz if the custom-mode install was rejected). Pace the encoder + frame
|
||||
// clock to that, not the requested rate, so we don't emit phantom duplicate frames over a
|
||||
// slower source. Falls back to the requested rate when a backend reports nothing.
|
||||
let effective_hz = vout
|
||||
.preferred_mode
|
||||
.map(|(_, _, hz)| hz)
|
||||
.filter(|&hz| hz > 0)
|
||||
.unwrap_or(mode.refresh_hz);
|
||||
if effective_hz != mode.refresh_hz {
|
||||
tracing::warn!(
|
||||
requested = mode.refresh_hz,
|
||||
effective = effective_hz,
|
||||
"compositor did not honor the requested refresh — encoding at the achieved rate"
|
||||
);
|
||||
}
|
||||
let mut capturer =
|
||||
crate::capture::capture_virtual_output(vout).context("capture virtual output")?;
|
||||
capturer.set_active(true);
|
||||
@@ -899,12 +987,12 @@ fn build_pipeline(
|
||||
frame.format,
|
||||
frame.width,
|
||||
frame.height,
|
||||
mode.refresh_hz,
|
||||
effective_hz,
|
||||
20_000_000,
|
||||
frame.is_cuda(),
|
||||
)
|
||||
.context("open NVENC")?;
|
||||
let interval = std::time::Duration::from_secs_f64(1.0 / mode.refresh_hz.max(1) as f64);
|
||||
let interval = std::time::Duration::from_secs_f64(1.0 / effective_hz.max(1) as f64);
|
||||
Ok((capturer, enc, frame, interval))
|
||||
}
|
||||
|
||||
@@ -912,6 +1000,29 @@ fn build_pipeline(
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
fn permanent_errors_short_circuit_retry() {
    // Config / version / missing-tool failures: another attempt in the same session cannot
    // fix these, so the classifier must flag every one of them as permanent.
    let permanent = [
        "create virtual output: KWin virtual output failed: Could not find output",
        "unknown PUNKTFUNK_COMPOSITOR 'foo' (kwin|wlroots|mutter|gamescope)",
        "spawn gamescope (is it installed? `apt install gamescope`)",
        "virtual displays require Linux",
    ];
    for chain in permanent.iter() {
        assert!(
            is_permanent_build_error(chain),
            "expected permanent: {chain}"
        );
    }

    // Negotiation / timeout races: these are what the backoff loop exists for, so they must
    // stay classified as transient.
    let transient = [
        "first frame: no PipeWire frame within 10s (node 42): format negotiation never completed",
        "create virtual output: timed out creating the KWin virtual output",
        "open NVENC: device busy",
    ];
    for chain in transient.iter() {
        assert!(
            !is_permanent_build_error(chain),
            "expected transient: {chain}"
        );
    }
}
|
||||
|
||||
fn gp(kind: InputKind, code: u32, x: i32, pad: u32) -> InputEvent {
|
||||
InputEvent {
|
||||
kind,
|
||||
|
||||
Reference in New Issue
Block a user