feat(client): request a recovery keyframe on unrecoverable loss
apple / swift (push) Successful in 54s
windows-msix / package (push) Successful in 1m0s
windows / build (push) Successful in 54s
android / android (push) Successful in 2m30s
ci / web (push) Successful in 37s
ci / docs-site (push) Successful in 38s
ci / rust (push) Successful in 4m24s
deb / build-publish (push) Successful in 2m5s
decky / build-publish (push) Successful in 25s
ci / bench (push) Successful in 4m25s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 16s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 2m38s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 2m24s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 22s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 2m15s
flatpak / build-publish (push) Failing after 5m13s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 4m37s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m26s

Under infinite GOP the punktfunk/1 plane has no periodic IDR — the only recovery
keyframe is one the client requests. But the reassembler drops unrecoverable AUs
silently (frames_dropped) and hands the decoder reference-missing delta frames
that libavcodec conceals and returns Ok for, so keying recovery off a decode
error almost never fires under real loss → a long/permanent freeze.

Surface the data-plane pump's Session.frames_dropped to NativeClient via a shared
atomic (NativeClient::frames_dropped()), updated every pump iteration so it stays
current through a total-loss drought. The Linux and Windows client video loops
watch it and call request_keyframe() when it climbs, throttled to 100 ms (the
decode stays wedged for several frames until the IDR lands). macOS already does
this; client-rs doesn't decode.

Resolves reliability backlog #2.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 11:04:42 +00:00
parent 55d5a4278f
commit be18797df8
3 changed files with 62 additions and 1 deletions
@@ -157,6 +157,9 @@ fn pump(
let mut decode_us_sum = 0u64; let mut decode_us_sum = 0u64;
let mut lat_us: Vec<u64> = Vec::with_capacity(256); let mut lat_us: Vec<u64> = Vec::with_capacity(256);
let mut pcm = vec![0f32; 5760 * 2]; // decode scratch: max Opus frame (120 ms stereo) let mut pcm = vec![0f32; 5760 * 2]; // decode scratch: max Opus frame (120 ms stereo)
// Loss recovery: watch the host→client unrecoverable-drop count and ask for an IDR when it climbs.
let mut last_dropped = connector.frames_dropped();
let mut last_kf_req: Option<Instant> = None;
let end: Option<String> = loop { let end: Option<String> = loop {
if stop.load(Ordering::SeqCst) { if stop.load(Ordering::SeqCst) {
@@ -197,6 +200,22 @@ fn pump(
Err(e) => break Some(format!("session: {e:?}")), Err(e) => break Some(format!("session: {e:?}")),
} }
// Loss recovery: under infinite GOP the only recovery keyframe is one we request. The
// reassembler drops unrecoverable AUs (frames_dropped); the decoder then conceals the
// reference-missing delta frames that follow and returns Ok, so keying off a decode error
// rarely fires. Request an IDR when the drop count climbs, throttled — the decode stays
// wedged for several frames until the IDR lands, so requesting every frame would flood.
//
// `last_dropped` only advances when a request is actually issued. A drop that lands inside
// the 100 ms throttle window therefore stays pending and is retried on the first iteration
// after the window expires, instead of being swallowed — which would leave the decoder
// wedged if the lost AU was the recovery IDR itself and no further loss followed.
let dropped = connector.frames_dropped();
if dropped > last_dropped {
    let now = Instant::now();
    if last_kf_req.is_none_or(|t| now.duration_since(t) >= Duration::from_millis(100)) {
        // This request covers every drop observed so far.
        last_dropped = dropped;
        last_kf_req = Some(now);
        // Best-effort: a failed send is not fatal here.
        let _ = connector.request_keyframe();
        tracing::debug!(dropped, "requested keyframe (loss recovery)");
    }
}
// Drain audio between frames (packets land every 5 ms; the queue holds 320 ms). // Drain audio between frames (packets land every 5 ms; the queue holds 320 ms).
while let Ok(pkt) = connector.next_audio(Duration::ZERO) { while let Ok(pkt) = connector.next_audio(Duration::ZERO) {
if let (Some(player), Some(dec)) = (&player, opus_dec.as_mut()) { if let (Some(player), Some(dec)) = (&player, opus_dec.as_mut()) {
@@ -160,6 +160,9 @@ fn pump(
let mut decode_us_sum = 0u64; let mut decode_us_sum = 0u64;
let mut lat_us: Vec<u64> = Vec::with_capacity(256); let mut lat_us: Vec<u64> = Vec::with_capacity(256);
let mut pcm = vec![0f32; 5760 * 2]; // decode scratch: max Opus frame (120 ms stereo) let mut pcm = vec![0f32; 5760 * 2]; // decode scratch: max Opus frame (120 ms stereo)
// Loss recovery: watch the host→client unrecoverable-drop count and ask for an IDR when it climbs.
let mut last_dropped = connector.frames_dropped();
let mut last_kf_req: Option<Instant> = None;
let end: Option<String> = loop { let end: Option<String> = loop {
if stop.load(Ordering::SeqCst) { if stop.load(Ordering::SeqCst) {
@@ -202,6 +205,21 @@ fn pump(
Err(e) => break Some(format!("session: {e:?}")), Err(e) => break Some(format!("session: {e:?}")),
} }
// Loss recovery: under infinite GOP the only recovery keyframe is one we request. The
// reassembler drops unrecoverable AUs (frames_dropped); the decoder conceals the
// reference-missing delta frames that follow and returns Ok, so keying off a decode error
// rarely fires. Request an IDR when the drop count climbs, throttled.
//
// `last_dropped` only advances when a request is actually issued. A drop that lands inside
// the 100 ms throttle window therefore stays pending and is retried on the first iteration
// after the window expires, instead of being swallowed — which would leave the decoder
// wedged if the lost AU was the recovery IDR itself and no further loss followed.
let dropped = connector.frames_dropped();
if dropped > last_dropped {
    let now = Instant::now();
    if last_kf_req.is_none_or(|t| now.duration_since(t) >= Duration::from_millis(100)) {
        // This request covers every drop observed so far.
        last_dropped = dropped;
        last_kf_req = Some(now);
        // Best-effort: a failed send is not fatal here.
        let _ = connector.request_keyframe();
        tracing::debug!(dropped, "requested keyframe (loss recovery)");
    }
}
// Drain audio between frames (packets land every 5 ms; the queue holds 320 ms). // Drain audio between frames (packets land every 5 ms; the queue holds 320 ms).
while let Ok(pkt) = connector.next_audio(Duration::ZERO) { while let Ok(pkt) = connector.next_audio(Duration::ZERO) {
if let (Some(player), Some(dec)) = (&player, opus_dec.as_mut()) { if let (Some(player), Some(dec)) = (&player, opus_dec.as_mut()) {
+25 -1
View File
@@ -21,7 +21,7 @@ use crate::quic::{
}; };
use crate::session::{Frame, Session}; use crate::session::{Frame, Session};
use crate::transport::UdpTransport; use crate::transport::UdpTransport;
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::mpsc::{Receiver, RecvTimeoutError, SyncSender}; use std::sync::mpsc::{Receiver, RecvTimeoutError, SyncSender};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant}; use std::time::{Duration, Instant};
@@ -128,6 +128,11 @@ pub struct NativeClient {
/// Speed-test accumulator, shared with the data-plane pump + control task. /// Speed-test accumulator, shared with the data-plane pump + control task.
probe: Arc<Mutex<ProbeState>>, probe: Arc<Mutex<ProbeState>>,
shutdown: Arc<AtomicBool>, shutdown: Arc<AtomicBool>,
/// Cumulative count of access units the reassembler gave up on (FEC couldn't recover), mirrored
/// from the data-plane pump's `Session`. A client video loop watches this for increases to request
/// a recovery keyframe under infinite GOP — the correct loss trigger, since unrecoverable loss
/// yields reference-missing frames the decoder silently conceals (a decode-error trigger misses them).
frames_dropped: Arc<AtomicU64>,
worker: Option<std::thread::JoinHandle<()>>, worker: Option<std::thread::JoinHandle<()>>,
/// The currently active session mode (the Welcome's, then updated by every accepted /// The currently active session mode (the Welcome's, then updated by every accepted
/// [`NativeClient::request_mode`]). /// [`NativeClient::request_mode`]).
@@ -208,11 +213,13 @@ impl NativeClient {
let shutdown = Arc::new(AtomicBool::new(false)); let shutdown = Arc::new(AtomicBool::new(false));
let mode_slot = Arc::new(std::sync::Mutex::new(mode)); let mode_slot = Arc::new(std::sync::Mutex::new(mode));
let probe = Arc::new(Mutex::new(ProbeState::default())); let probe = Arc::new(Mutex::new(ProbeState::default()));
let frames_dropped = Arc::new(AtomicU64::new(0));
let host = host.to_string(); let host = host.to_string();
let shutdown_w = shutdown.clone(); let shutdown_w = shutdown.clone();
let mode_slot_w = mode_slot.clone(); let mode_slot_w = mode_slot.clone();
let probe_w = probe.clone(); let probe_w = probe.clone();
let frames_dropped_w = frames_dropped.clone();
let worker = std::thread::Builder::new() let worker = std::thread::Builder::new()
.name("punktfunk-client".into()) .name("punktfunk-client".into())
.spawn(move || { .spawn(move || {
@@ -253,6 +260,7 @@ impl NativeClient {
shutdown: shutdown_w, shutdown: shutdown_w,
mode_slot: mode_slot_w, mode_slot: mode_slot_w,
probe: probe_w, probe: probe_w,
frames_dropped: frames_dropped_w,
})); }));
}) })
.map_err(PunktfunkError::Io)?; .map_err(PunktfunkError::Io)?;
@@ -285,6 +293,7 @@ impl NativeClient {
probe, probe,
shutdown, shutdown,
worker: Some(worker), worker: Some(worker),
frames_dropped,
mode: mode_slot, mode: mode_slot,
host_fingerprint: fingerprint, host_fingerprint: fingerprint,
resolved_compositor, resolved_compositor,
@@ -412,6 +421,15 @@ impl NativeClient {
.map_err(|_| PunktfunkError::Closed) .map_err(|_| PunktfunkError::Closed)
} }
/// Cumulative access units the host→client reassembler dropped as unrecoverable (FEC couldn't
/// rebuild them). A video loop polls this and calls [`request_keyframe`](Self::request_keyframe)
/// when it increases — the correct loss trigger under infinite GOP, where unrecoverable loss
/// produces reference-missing delta frames the decoder silently conceals (so a decode-error
/// trigger would rarely fire). Monotonic for the session; compare against the last observed value.
///
/// The value is a mirror that the data-plane pump refreshes once per loop iteration and that is
/// read here with `Ordering::Relaxed`, so it may briefly lag the reassembler's own counter. Treat
/// it as a polling signal, not a synchronization point.
#[must_use]
pub fn frames_dropped(&self) -> u64 {
    self.frames_dropped.load(Ordering::Relaxed)
}
/// Start a bandwidth speed test: ask the host to burst filler over the data plane at /// Start a bandwidth speed test: ask the host to burst filler over the data plane at
/// `target_kbps` of goodput for `duration_ms`, *briefly pausing video*. Non-blocking — the /// `target_kbps` of goodput for `duration_ms`, *briefly pausing video*. Non-blocking — the
/// measurement accumulates in the background; poll [`NativeClient::probe_result`] until its /// measurement accumulates in the background; poll [`NativeClient::probe_result`] until its
@@ -566,6 +584,7 @@ struct WorkerArgs {
shutdown: Arc<AtomicBool>, shutdown: Arc<AtomicBool>,
mode_slot: Arc<std::sync::Mutex<Mode>>, mode_slot: Arc<std::sync::Mutex<Mode>>,
probe: Arc<Mutex<ProbeState>>, probe: Arc<Mutex<ProbeState>>,
frames_dropped: Arc<AtomicU64>,
} }
/// The worker: QUIC handshake, then the input/datagram/control tasks + the blocking /// The worker: QUIC handshake, then the input/datagram/control tasks + the blocking
@@ -593,6 +612,7 @@ async fn worker_main(args: WorkerArgs) {
shutdown, shutdown,
mode_slot, mode_slot,
probe, probe,
frames_dropped,
} = args; } = args;
let setup = async { let setup = async {
let remote: std::net::SocketAddr = format!("{host}:{port}") let remote: std::net::SocketAddr = format!("{host}:{port}")
@@ -864,6 +884,10 @@ async fn worker_main(args: WorkerArgs) {
let _ = tokio::task::spawn_blocking(move || { let _ = tokio::task::spawn_blocking(move || {
pin_thread_user_interactive(); // feeds frame_tx → the client's user-interactive video pump pin_thread_user_interactive(); // feeds frame_tx → the client's user-interactive video pump
while !pump_shutdown.load(Ordering::SeqCst) { while !pump_shutdown.load(Ordering::SeqCst) {
// Mirror the reassembler's unrecoverable-drop count for the client's keyframe-recovery
// loop. Updated every iteration (not just on a produced frame) so it stays current through
// a total-loss drought where no AU completes. Cheap: a few relaxed atomic loads.
frames_dropped.store(session.stats().frames_dropped, Ordering::Relaxed);
match session.poll_frame() { match session.poll_frame() {
Ok(frame) => { Ok(frame) => {
if frame.flags & FLAG_PROBE as u32 != 0 { if frame.flags & FLAG_PROBE as u32 != 0 {