diff --git a/README.md b/README.md index 2274e56..dd104d2 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ The **GameStream host works with a stock Moonlight client** — validated live o and **video at the client's exact resolution and refresh** via a per-session virtual output (KWin, gamescope, Mutter, and Sway/wlroots backends), encoded with GPU **zero-copy** (dmabuf → CUDA/Vulkan → NVENC) up to 5120×1440@240. The native **`punktfunk/1`** protocol adds a QUIC control plane and a -GF(2¹⁶) Leopard-FEC + AES-GCM data plane (p50 ~0.8 ms capture→reassembled at 720p120), with +GF(2¹⁶) Leopard-FEC + AES-GCM data plane (p50 ~0.8 ms capture→received at 720p120), with mid-stream mode renegotiation and a wall-clock skew handshake so latency stays valid across machines. Both run from **one process**: bare `punktfunk-host serve` is the **secure native-only default** (`punktfunk/1` + the management API/web console), and `serve --gamestream` additionally enables the diff --git a/clients/android/app/src/main/kotlin/io/unom/punktfunk/StatsOverlay.kt b/clients/android/app/src/main/kotlin/io/unom/punktfunk/StatsOverlay.kt index d56e413..f989c83 100644 --- a/clients/android/app/src/main/kotlin/io/unom/punktfunk/StatsOverlay.kt +++ b/clients/android/app/src/main/kotlin/io/unom/punktfunk/StatsOverlay.kt @@ -15,11 +15,13 @@ import io.unom.punktfunk.kit.NativeBridge import kotlin.math.roundToInt /** - * The live stats overlay — mirrors the Apple client's HUD. Reads the 14-double layout from + * The live stats overlay — the unified HUD (`design/stats-unification.md`, Android v1: headline is + * `capture→decoded`, tiled by `host+network` + `decode`). Reads the 16-double layout from * [NativeBridge.nativeVideoStats]: - * `[fps, mbps, latP50Ms, latP95Ms, latValid, skew, w, h, hz, dropped, bitDepth, colorPrimaries, - * colorTransfer, chromaFormatIdc]`. 
The trailing four (present on a current native lib) describe the - * negotiated video feed and render as a codec/depth/colour/chroma line; older layouts just omit it. + * `[fps, mbps, e2eP50Ms, e2eP95Ms, latValid, skew, w, h, hz, lost, bitDepth, colorPrimaries, + * colorTransfer, chromaFormatIdc, hostNetP50Ms, decodeP50Ms]`. Indexes 10–13 (present on a current + * native lib) describe the negotiated video feed and render as a codec/depth/colour/chroma line; + * 14/15 render as the stage equation; older layouts just omit those lines. */ @Composable internal fun StatsOverlay(s: DoubleArray, modifier: Modifier = Modifier) { @@ -29,7 +31,7 @@ internal fun StatsOverlay(s: DoubleArray, modifier: Modifier = Modifier) { val hz = s[8].toInt() val latValid = s[4] != 0.0 val skew = s[5] != 0.0 - val dropped = s[9].toLong() + val lost = s[9].toLong() Column( modifier = modifier .background(Color.Black.copy(alpha = 0.45f), RoundedCornerShape(6.dp)) @@ -50,17 +52,25 @@ internal fun StatsOverlay(s: DoubleArray, modifier: Modifier = Modifier) { ) } if (latValid) { - val tag = if (skew) "" else " (same-host)" + val tag = if (skew) "" else " (same-host clock)" Text( - "capture→client ${"%.1f".format(s[2])}/${"%.1f".format(s[3])} ms p50/p95$tag", + "end-to-end ${"%.1f".format(s[2])} ms p50 · ${"%.1f".format(s[3])} p95 · capture→decoded$tag", color = Color.White, fontFamily = FontFamily.Monospace, fontSize = 12.sp, ) + if (s.size >= 16) { + Text( + "= host+network ${"%.1f".format(s[14])} + decode ${"%.1f".format(s[15])}", + color = Color.White, + fontFamily = FontFamily.Monospace, + fontSize = 12.sp, + ) + } } - if (dropped > 0) { + if (lost > 0) { Text( - "dropped $dropped", + "lost $lost", color = Color(0xFFFFB0B0), fontFamily = FontFamily.Monospace, fontSize = 12.sp, diff --git a/clients/android/kit/src/main/kotlin/io/unom/punktfunk/kit/NativeBridge.kt b/clients/android/kit/src/main/kotlin/io/unom/punktfunk/kit/NativeBridge.kt index 20c0c43..c7e624a 100644 --- 
a/clients/android/kit/src/main/kotlin/io/unom/punktfunk/kit/NativeBridge.kt +++ b/clients/android/kit/src/main/kotlin/io/unom/punktfunk/kit/NativeBridge.kt @@ -105,12 +105,14 @@ object NativeBridge { /** * Drain ~1 s of live decode stats for the on-stream HUD, or `null` when no decode thread runs. - * Returns 14 doubles: - * `[fps, mbps, latP50Ms, latP95Ms, latValid, skewCorrected, width, height, refreshHz, framesDropped, - * bitDepth, colorPrimaries, colorTransfer, chromaFormatIdc]` - * (the two flags are 1.0/0.0; the trailing four describe the negotiated video feed — bit depth - * 8/10, CICP primaries/transfer, and the HEVC chroma_format_idc 1=4:2:0 / 3=4:4:4). Poll ~1 Hz; - * each call resets the measurement window. + * Returns 16 doubles (unified stats spec, `design/stats-unification.md`): + * `[fps, mbps, e2eP50Ms, e2eP95Ms, latValid, skewCorrected, width, height, refreshHz, framesLost, + * bitDepth, colorPrimaries, colorTransfer, chromaFormatIdc, hostNetP50Ms, decodeP50Ms]` + * (the two flags are 1.0/0.0; indexes 2/3 are the end-to-end capture→decoded headline; 10–13 + * describe the negotiated video feed — bit depth 8/10, CICP primaries/transfer, and the HEVC + * chroma_format_idc 1=4:2:0 / 3=4:4:4; 14/15 are the stage p50s tiling the headline — + * `host+network` = capture→received, `decode` = received→decoded). Poll ~1 Hz; each call + * resets the measurement window. */ external fun nativeVideoStats(handle: Long): DoubleArray? 
diff --git a/clients/android/native/src/decode.rs b/clients/android/native/src/decode.rs index 81bc009..8fc67d7 100644 --- a/clients/android/native/src/decode.rs +++ b/clients/android/native/src/decode.rs @@ -9,16 +9,22 @@ use ndk::data_space::DataSpace; use ndk::media::media_codec::{ DequeuedInputBufferResult, DequeuedOutputBufferInfoResult, MediaCodec, MediaCodecDirection, + OutputBuffer, }; use ndk::media::media_format::MediaFormat; use ndk::native_window::{FrameRateCompatibility, NativeWindow}; use punktfunk_core::client::NativeClient; use punktfunk_core::error::PunktfunkError; use punktfunk_core::session::Frame; +use std::collections::VecDeque; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::{Duration, Instant}; +/// Cap on the pts→received-timestamp map below: MediaCodec holds only a handful of frames in +/// flight, so anything beyond this is stale (codec flushed / HUD toggled) and gets evicted. +const IN_FLIGHT_CAP: usize = 64; + /// The decode loop. Runs on the `pf-decode` thread until `shutdown` is set or the session closes. pub fn run( client: Arc, @@ -141,9 +147,14 @@ pub fn run( // climbs. let mut last_dropped = client.frames_dropped(); let mut last_kf_req: Option = None; - // Capture→client-receipt latency uses the negotiated host-minus-client clock offset (0 if the - // host didn't answer the skew handshake — then the HUD flags it "same-host"). + // Skew-corrected latency stats (spec: design/stats-unification.md) use the negotiated + // host-minus-client clock offset (0 if the host didn't answer the skew handshake — then the + // HUD flags it "(same-host clock)"). let clock_offset = client.clock_offset_ns; + // HUD stage split: receipt timestamps keyed by the pts we queue into the codec, so the decoded + // point (output-buffer dequeue — MediaCodec round-trips presentationTimeUs) can be paired back + // to its receipt for the `decode` stage. Only fed while the HUD is visible. 
+ let mut in_flight: VecDeque<(u64, i128)> = VecDeque::new(); // The dataspace we've signalled on the Surface so far (None = default/SDR). Set reactively once // the decoder reports an HDR stream (see `drain`); avoids re-applying every format event. let mut applied_ds: Option = None; @@ -164,15 +175,21 @@ pub fn run( &p[..p.len().min(6)] ); } - // HUD stat: capture→client-receipt latency = client_now + (host−client) − + // HUD stat, `received` point: host+network = client_now + (host−client) − // capture_pts. Gated on the HUD being visible — `enabled` first so the hidden - // steady state skips the wall-clock read and the lock entirely. + // steady state skips the wall-clock read and the lock entirely. The receipt + // stamp is also parked in `in_flight` (keyed by the pts the codec will echo on + // the output buffer) for the decoded-point pairing in `drain`. if stats.enabled() { - let lat_ns = - now_realtime_ns() + clock_offset as i128 - frame.pts_ns as i128; + let received_ns = now_realtime_ns(); + let lat_ns = received_ns + clock_offset as i128 - frame.pts_ns as i128; let lat_us = (lat_ns > 0 && lat_ns < 10_000_000_000) .then_some((lat_ns / 1000) as u64); - stats.note(frame.data.len(), lat_us, clock_offset != 0); + stats.note_received(frame.data.len(), lat_us, clock_offset != 0); + in_flight.push_back((frame.pts_ns / 1000, received_ns)); + if in_flight.len() > IN_FLIGHT_CAP { + in_flight.pop_front(); // stale — codec never echoed it back + } } pending = Some(frame); } @@ -202,7 +219,15 @@ pub fn run( } else { Duration::ZERO }; - let (r, d) = drain(&codec, &window, &mut applied_ds, wait); + let (r, d) = drain( + &codec, + &window, + &mut applied_ds, + wait, + &stats, + &mut in_flight, + clock_offset, + ); rendered += r; discarded += d; @@ -330,11 +355,19 @@ fn feed(codec: &MediaCodec, au: &[u8], pts_us: u64) -> bool { /// the caller's input is blocked so the loop waits on decoder progress instead of busy-spinning. /// Returns `(rendered, discarded)`. 
Also reacts to `OutputFormatChanged` (which can interleave /// between buffers — handled without losing the held buffer) to signal HDR on the Surface. +/// +/// Each dequeued buffer is also the HUD's `decoded` measurement point (rendered or not — the frame +/// finished decoding either way): end-to-end = decoded + clock_offset − capture pts, and the +/// `decode` stage pairs the buffer's echoed presentationTimeUs back to the receipt stamp in +/// `in_flight` (single-clock local difference, no skew involved). fn drain( codec: &MediaCodec, window: &NativeWindow, applied_ds: &mut Option, first_wait: Duration, + stats: &crate::stats::VideoStats, + in_flight: &mut VecDeque<(u64, i128)>, + clock_offset: i64, ) -> (u64, u64) { let mut held = None; // newest ready buffer so far, presented after the loop let mut discarded: u64 = 0; @@ -343,6 +376,9 @@ fn drain( match codec.dequeue_output_buffer(wait) { Ok(DequeuedOutputBufferInfoResult::Buffer(buf)) => { wait = Duration::ZERO; // only the first dequeue may block + if stats.enabled() { + note_decoded(stats, in_flight, clock_offset, &buf); + } if let Some(stale) = held.replace(buf) { // A newer frame is ready — drop the held one without rendering. if let Err(e) = codec.release_output_buffer(stale, false) { @@ -392,6 +428,40 @@ fn drain( (rendered, discarded) } +/// HUD `decoded` point for one dequeued output buffer: build the end-to-end (capture→decoded, +/// skew-corrected, clamped to (0, 10 s)) and `decode` (received→decoded, single-clock local, ≥ 0) +/// samples and hand them to [`crate::stats::VideoStats::note_decoded`]. The codec echoes the input +/// `presentationTimeUs` on the output buffer, which keys the receipt stamp in `in_flight`; entries +/// older than the echoed pts are evicted (decode order == input order here — low-latency, no +/// B-frames — so anything before it was dropped inside the codec or stamped before a flush). 
+fn note_decoded( + stats: &crate::stats::VideoStats, + in_flight: &mut VecDeque<(u64, i128)>, + clock_offset: i64, + buf: &OutputBuffer<'_>, +) { + let pts_us = buf.info().presentation_time_us().max(0) as u64; + let decoded_ns = now_realtime_ns(); + // Pair the echoed pts back to its receipt stamp, evicting stale (older) entries as we go. + let mut received_ns = None; + while let Some(&(p, r)) = in_flight.front() { + if p > pts_us { + break; // future frame — leave it for its own output buffer + } + in_flight.pop_front(); + if p == pts_us { + received_ns = Some(r); + break; + } + } + // pts_us is the truncated frame.pts_ns/1000 we queued, so ×1000 re-approximates capture time + // to < 1 µs — negligible against the ms-scale figures shown. + let e2e_ns = decoded_ns + clock_offset as i128 - pts_us as i128 * 1000; + let e2e_us = (e2e_ns > 0 && e2e_ns < 10_000_000_000).then_some((e2e_ns / 1000) as u64); + let decode_us = received_ns.map(|r| ((decoded_ns - r).max(0) / 1000) as u64); + stats.note_decoded(e2e_us, decode_us); +} + /// Map the decoder's reported output colour to a BT.2020 HDR dataspace, or `None` for SDR. The /// integer values are the Android MediaFormat colour constants the NDK shares: COLOR_TRANSFER /// ST2084 = 6 (PQ/HDR10), HLG = 7; COLOR_RANGE FULL = 1, LIMITED = 2 (the host encodes limited). diff --git a/clients/android/native/src/session/planes.rs b/clients/android/native/src/session/planes.rs index 3c63ace..baab09f 100644 --- a/clients/android/native/src/session/planes.rs +++ b/clients/android/native/src/session/planes.rs @@ -72,14 +72,16 @@ pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeStopVideo( }) } -/// `NativeBridge.nativeVideoStats(handle): DoubleArray?` — drain ~1 s of decode stats for the HUD. 
-/// Returns 14 doubles -/// `[fps, mbps, latP50Ms, latP95Ms, latValid, skewCorrected, width, height, refreshHz, framesDropped, -/// bitDepth, colorPrimaries, colorTransfer, chromaFormatIdc]` -/// (the two flags are 1.0/0.0; the trailing four describe the negotiated video feed — see below), or -/// `null` when no decode thread is running. Poll ~1 Hz from the UI; each call resets the measurement -/// window. Not android-gated — pure `jni` + connector reads, so it links on the host build too -/// (Kotlin only ever calls it on device). +/// `NativeBridge.nativeVideoStats(handle): DoubleArray?` — drain ~1 s of decode stats for the HUD +/// (unified stats spec, `design/stats-unification.md`). Returns 16 doubles +/// `[fps, mbps, e2eP50Ms, e2eP95Ms, latValid, skewCorrected, width, height, refreshHz, framesLost, +/// bitDepth, colorPrimaries, colorTransfer, chromaFormatIdc, hostNetP50Ms, decodeP50Ms]` +/// (the two flags are 1.0/0.0; indexes 0–13 match the previous 14-double layout with the latency +/// pair re-based from capture→received to the end-to-end capture→decoded headline; the two stage +/// p50s tiling it — `host+network` = capture→received, `decode` = received→decoded — are appended +/// at the end), or `null` when no decode thread is running. Poll ~1 Hz from the UI; each call +/// resets the measurement window. Not android-gated — pure `jni` + connector reads, so it links on +/// the host build too (Kotlin only ever calls it on device). 
#[no_mangle] pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeVideoStats( env: JNIEnv, @@ -98,11 +100,11 @@ pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeVideoStats( let snap = h.stats.drain(); let mode = h.client.mode(); let color = h.client.color; - let buf: [f64; 14] = [ + let buf: [f64; 16] = [ snap.fps, snap.mbps, - snap.lat_p50_ms, - snap.lat_p95_ms, + snap.e2e_p50_ms, + snap.e2e_p95_ms, if snap.lat_valid { 1.0 } else { 0.0 }, if snap.skew_corrected { 1.0 } else { 0.0 }, mode.width as f64, @@ -117,6 +119,9 @@ pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeVideoStats( color.primaries as f64, color.transfer as f64, h.client.chroma_format as f64, + // Stage p50s tiling the end-to-end headline (appended to keep 0–13 index-compatible). + snap.hostnet_p50_ms, + snap.decode_p50_ms, ]; let arr = match env.new_double_array(buf.len() as jsize) { Ok(a) => a, diff --git a/clients/android/native/src/stats.rs b/clients/android/native/src/stats.rs index 770dce0..071aff1 100644 --- a/clients/android/native/src/stats.rs +++ b/clients/android/native/src/stats.rs @@ -1,8 +1,11 @@ -//! Live decode stats for the on-stream HUD (mirrors the Apple client's stats overlay): FPS, -//! receive throughput, and capture→client-receipt latency (p50/p95). The decode thread is the sole -//! writer (`note` per access unit); the JNI accessor `nativeVideoStats` drains a snapshot ~1 Hz and -//! resets the window. Sampling is gated on the HUD actually being visible (`set_enabled`, driven by -//! `nativeSetVideoStatsEnabled`) so the hidden steady state costs one relaxed atomic load per frame. +//! Live decode stats for the on-stream HUD, following the unified stats spec +//! (`design/stats-unification.md`): FPS, receive throughput, and the Android v1 stage split — +//! headline `end-to-end` = capture→decoded (p50/p95) tiled by `host+network` = capture→received +//! and `decode` = received→decoded (stage p50s). 
The decode thread is the sole writer +//! (`note_received` per access unit at receipt, `note_decoded` per decoder output buffer); the JNI +//! accessor `nativeVideoStats` drains a snapshot ~1 Hz and resets the window. Sampling is gated on +//! the HUD actually being visible (`set_enabled`, driven by `nativeSetVideoStatsEnabled`) so the +//! hidden steady state costs one relaxed atomic load per frame. //! Pure `std` so it compiles on the host build too (the decode thread is android-only, but //! `SessionHandle` holds the shared handle unconditionally). @@ -13,9 +16,9 @@ use std::time::Instant; /// Rolling per-window accumulator. Rates are computed over the actual elapsed wall-time at drain /// (robust to poll jitter), so a poll that lands at 0.9 s or 1.1 s still reports the right FPS. pub struct VideoStats { - /// HUD gate: `note` runs on the per-frame decode path, so while the overlay is hidden it (and - /// the caller's latency computation — see `enabled`) early-outs on this flag alone. Off until - /// Kotlin shows the HUD. + /// HUD gate: the samplers run on the per-frame decode path, so while the overlay is hidden + /// they (and the caller's latency computation — see `enabled`) early-out on this flag alone. + /// Off until Kotlin shows the HUD. enabled: AtomicBool, inner: Mutex, } @@ -24,23 +27,42 @@ struct Inner { window_start: Instant, frames: u64, bytes: u64, - /// capture→client-receipt latency samples for this window, in microseconds. - lat_us: Vec, + /// `end-to-end` = capture→decoded latency samples for this window, in microseconds + /// (skew-corrected clock base). + e2e_us: Vec, + /// `host+network` stage = capture→received samples, in microseconds (skew-corrected). + hostnet_us: Vec, + /// `decode` stage = received→decoded samples, in microseconds (client-local, single clock). + decode_us: Vec, /// Whether the host answered the clock-skew handshake (latency is cross-machine valid). skew_corrected: bool, } -/// A drained, computed view of one window. 
`lat_valid` is false when no in-range latency sample -/// landed (then p50/p95 are 0 and the HUD hides the latency line, exactly like the Apple client). +/// A drained, computed view of one window. `lat_valid` is false when no in-range end-to-end sample +/// landed (then the latency figures are 0 and the HUD hides the latency lines, exactly like the +/// Apple client). pub struct Snapshot { pub fps: f64, pub mbps: f64, - pub lat_p50_ms: f64, - pub lat_p95_ms: f64, + /// Headline `end-to-end` (capture→decoded) percentiles, ms. + pub e2e_p50_ms: f64, + pub e2e_p95_ms: f64, + /// Stage p50s (ms): `host+network` (capture→received) and `decode` (received→decoded). + pub hostnet_p50_ms: f64, + pub decode_p50_ms: f64, pub lat_valid: bool, pub skew_corrected: bool, } +/// Percentile over a sorted-in-place µs sample vec, in ms. 0.0 when empty. +fn pctl_ms(sorted_us: &[u64], p: f64) -> f64 { + if sorted_us.is_empty() { + return 0.0; + } + let n = sorted_us.len(); + sorted_us[((n as f64 * p) as usize).min(n - 1)] as f64 / 1000.0 +} + impl VideoStats { pub fn new() -> VideoStats { VideoStats { @@ -49,14 +71,16 @@ impl VideoStats { window_start: Instant::now(), frames: 0, bytes: 0, - lat_us: Vec::with_capacity(256), + e2e_us: Vec::with_capacity(256), + hostnet_us: Vec::with_capacity(256), + decode_us: Vec::with_capacity(256), skew_corrected: false, }), } } /// Whether the HUD wants samples. The decode thread checks this BEFORE building a latency - /// sample, so the per-frame wall-clock read is skipped too while hidden. + /// sample, so the per-frame wall-clock reads are skipped too while hidden. // Read only by the android-only decode thread; unreferenced on the host build — expected. 
#[cfg_attr(not(target_os = "android"), allow(dead_code))] pub fn enabled(&self) -> bool { @@ -75,18 +99,21 @@ impl VideoStats { g.window_start = Instant::now(); g.frames = 0; g.bytes = 0; - g.lat_us.clear(); + g.e2e_us.clear(); + g.hostnet_us.clear(); + g.decode_us.clear(); } } - /// Record one decoded access unit: its wire size and (if in range) its capture→client latency. + /// Record one received access unit: its wire size and (if in range) its capture→received + /// `host+network` stage sample. Receipt is the fps/goodput counting point per the spec. // Driven only by the android-only decode thread; unreferenced on the host build — expected. #[cfg_attr(not(target_os = "android"), allow(dead_code))] - pub fn note(&self, bytes: usize, lat_us: Option, skew_corrected: bool) { + pub fn note_received(&self, bytes: usize, hostnet_us: Option, skew_corrected: bool) { if !self.enabled.load(Ordering::Relaxed) { return; // HUD hidden — skip the lock (the caller already skipped the clock read) } - // Poison-proof: `note` runs per-frame on the decode thread, which has no catch_unwind — + // Poison-proof: this runs per-frame on the decode thread, which has no catch_unwind — // a panic elsewhere must not turn every later lock into a second panic (the counters // stay consistent regardless). let mut g = self @@ -96,14 +123,37 @@ impl VideoStats { g.frames += 1; g.bytes += bytes as u64; g.skew_corrected = skew_corrected; - if let Some(l) = lat_us { - g.lat_us.push(l); + if let Some(l) = hostnet_us { + g.hostnet_us.push(l); + } + } + + /// Record one decoded output frame: its capture→decoded `end-to-end` sample and its + /// received→decoded `decode` stage sample (either may be absent — e.g. the receipt stamp for + /// this pts predates the HUD being shown). + // Driven only by the android-only decode thread; unreferenced on the host build — expected. 
+ #[cfg_attr(not(target_os = "android"), allow(dead_code))] + pub fn note_decoded(&self, e2e_us: Option, decode_us: Option) { + if !self.enabled.load(Ordering::Relaxed) { + return; // HUD hidden — skip the lock (the caller already skipped the clock read) + } + // Poison-proof for the same reason as `note_received`. + let mut g = self + .inner + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + if let Some(l) = e2e_us { + g.e2e_us.push(l); + } + if let Some(l) = decode_us { + g.decode_us.push(l); } } /// Compute the window's rates + latency percentiles, then reset for the next window. pub fn drain(&self) -> Snapshot { - // Poison-proof for the same reason as `note` — a poisoned window still drains fine. + // Poison-proof for the same reason as `note_received` — a poisoned window still drains + // fine. let mut g = self .inner .lock() @@ -111,26 +161,25 @@ impl VideoStats { let elapsed = g.window_start.elapsed().as_secs_f64().max(1e-3); let fps = g.frames as f64 / elapsed; let mbps = g.bytes as f64 * 8.0 / 1_000_000.0 / elapsed; - let (p50, p95, valid) = if g.lat_us.is_empty() { - (0.0, 0.0, false) - } else { - g.lat_us.sort_unstable(); - let n = g.lat_us.len(); - let at = |p: f64| g.lat_us[((n as f64 * p) as usize).min(n - 1)] as f64 / 1000.0; - (at(0.50), at(0.95), true) + g.e2e_us.sort_unstable(); + g.hostnet_us.sort_unstable(); + g.decode_us.sort_unstable(); + let snap = Snapshot { + fps, + mbps, + e2e_p50_ms: pctl_ms(&g.e2e_us, 0.50), + e2e_p95_ms: pctl_ms(&g.e2e_us, 0.95), + hostnet_p50_ms: pctl_ms(&g.hostnet_us, 0.50), + decode_p50_ms: pctl_ms(&g.decode_us, 0.50), + lat_valid: !g.e2e_us.is_empty(), + skew_corrected: g.skew_corrected, }; - let skew = g.skew_corrected; g.window_start = Instant::now(); g.frames = 0; g.bytes = 0; - g.lat_us.clear(); - Snapshot { - fps, - mbps, - lat_p50_ms: p50, - lat_p95_ms: p95, - lat_valid: valid, - skew_corrected: skew, - } + g.e2e_us.clear(); + g.hostnet_us.clear(); + g.decode_us.clear(); + snap } } diff --git 
a/clients/apple/Sources/PunktfunkClient/ContentView.swift b/clients/apple/Sources/PunktfunkClient/ContentView.swift index 75dc7cb..d09bd35 100644 --- a/clients/apple/Sources/PunktfunkClient/ContentView.swift +++ b/clients/apple/Sources/PunktfunkClient/ContentView.swift @@ -333,8 +333,9 @@ struct ContentView: View { onSessionEnd: { [weak model] in Task { @MainActor in model?.sessionEnded() } }, - presentMeter: model.presentLatency, - presentTailMeter: model.presentTail + endToEndMeter: model.endToEnd, + decodeMeter: model.decodeStage, + displayMeter: model.displayStage ) .overlay(alignment: placement.alignment) { if captureEnabled && hudEnabled { diff --git a/clients/apple/Sources/PunktfunkClient/Screenshots/ScreenshotScenes.swift b/clients/apple/Sources/PunktfunkClient/Screenshots/ScreenshotScenes.swift index f06e378..3b33efa 100644 --- a/clients/apple/Sources/PunktfunkClient/Screenshots/ScreenshotScenes.swift +++ b/clients/apple/Sources/PunktfunkClient/Screenshots/ScreenshotScenes.swift @@ -170,7 +170,10 @@ private struct ShotHUD: View { Text("5120×1440@240 240 fps 812.4 Mb/s") .font(.system(.caption, design: .monospaced)) } - Text("capture→client 1.3/2.1 ms p50/p95") + Text("end-to-end 2.9 ms p50 · 3.8 p95 · capture→on-glass") + .font(.system(.caption2, design: .monospaced)) + .foregroundStyle(.secondary) + Text("= host+network 1.3 + decode 0.7 + display 0.9") .font(.system(.caption2, design: .monospaced)) .foregroundStyle(.secondary) #if os(macOS) diff --git a/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift b/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift index e80847a..b5f1547 100644 --- a/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift +++ b/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift @@ -59,36 +59,50 @@ final class SessionModel: ObservableObject { @Published var fps = 0 @Published var mbps = 0.0 @Published var totalFrames = 0 - /// Capture→client-receipt latency (ms), skew-corrected 
across machines via the connect-time - /// clock offset — p50/p95 for the HUD. `latencyValid` is false until the first sample drains - /// (and whenever no host frames arrived in the last interval). `latencySkewCorrected` = the host + /// The unified latency stages (design/stats-unification.md), ms per 1 s window. `host+network` + /// = capture→received, skew-corrected across machines via the connect-time clock offset: the + /// stage-2 HUD shows its p50 in the equation line; the stage-1 fallback shows p50/p95 as its + /// `capture→received` headline. `hostNetworkValid` is false until the first sample drains (and + /// whenever no host frames arrived in the last interval). `hostNetworkSkewCorrected` = the host /// answered the skew handshake (the number is cross-machine valid, not just same-host). - @Published var latencyP50Ms = 0.0 - @Published var latencyP95Ms = 0.0 - @Published var latencyValid = false - @Published var latencySkewCorrected = false - /// Capture→present (glass-to-glass, modulo the host render→capture term) — only the stage-2 - /// presenter can stamp this (it owns decode + a CAMetalLayer/display-link present). Stays - /// invalid under stage-1, where the layer presents internally with no per-frame callback. - @Published var presentLatencyP50Ms = 0.0 - @Published var presentLatencyP95Ms = 0.0 - @Published var presentLatencyValid = false - @Published var presentLatencySkewCorrected = false - /// Decode-completion→present (the "present tail": ring wait + render + vsync) — the term the - /// stage-2 presenter exists to shorten. Both instants are client-side, so no skew applies. 
- @Published var presentTailP50Ms = 0.0 - @Published var presentTailP95Ms = 0.0 - @Published var presentTailValid = false + @Published var hostNetworkP50Ms = 0.0 + @Published var hostNetworkP95Ms = 0.0 + @Published var hostNetworkValid = false + @Published var hostNetworkSkewCorrected = false + /// End-to-end = capture→on-glass, measured directly per frame (never summed from the stages) — + /// the HUD headline. Only the stage-2 presenter can stamp it (it owns decode + a + /// CAMetalLayer/display-link present); stays invalid under stage-1, where the layer presents + /// internally with no per-frame callback. + @Published var endToEndP50Ms = 0.0 + @Published var endToEndP95Ms = 0.0 + @Published var endToEndValid = false + @Published var endToEndSkewCorrected = false + /// The client-local stage terms of the HUD's equation line (single clock, no skew; p50 only): + /// decode = received→decoded, display = decoded→on-glass (ring wait + render + vsync — the + /// term the stage-2 presenter exists to shorten). + @Published var decodeP50Ms = 0.0 + @Published var decodeValid = false + @Published var displayP50Ms = 0.0 + @Published var displayValid = false + /// Unrecoverable network frame drops in the last window (FEC couldn't rebuild them) and their + /// share of frames offered, `lost/(received+lost)`. The HUD hides the line while zero. + @Published var lostFrames = 0 + @Published var lostPct = 0.0 /// Mirrors StreamView's capture state (it owns the input capture; this drives the /// HUD's "click to capture" / "⌘⎋ releases" hint). @Published var mouseCaptured = false let meter = FrameMeter() + /// Capture→received (the host+network stage), fed per AU at receipt by the stream view's + /// onFrame — under both presenters. let latency = LatencyMeter() - /// Fed by the stage-2 presenter's display link (capture→present). Passed to StreamView. - let presentLatency = LatencyMeter() - /// Fed by the same present stamp (decode-completion→present). Passed to StreamView. 
- let presentTail = LatencyMeter() + /// The stage-2 meters, passed to StreamView: end-to-end (capture→on-glass, stamped at + /// present), decode (received→decoded), display (decoded→on-glass). + let endToEnd = LatencyMeter() + let decodeStage = LatencyMeter() + let displayStage = LatencyMeter() + /// Cumulative reassembler-drop counter at the last stats drain (per-window `lost` delta). + private var lastFramesDropped: UInt64 = 0 private var statsTimer: Timer? private var audio: SessionAudio? private var gamepadCapture: GamepadCapture? @@ -281,7 +295,12 @@ final class SessionModel: ObservableObject { phase = .idle fps = 0 mbps = 0 - latencyValid = false + hostNetworkValid = false + endToEndValid = false + decodeValid = false + displayValid = false + lostFrames = 0 + lostPct = 0 mouseCaptured = false } @@ -321,6 +340,7 @@ final class SessionModel: ObservableObject { } private func startStatsTimer() { + lastFramesDropped = 0 // a fresh connection's cumulative drop counter starts at 0 let timer = Timer(timeInterval: 1.0, repeats: true) { [weak self] _ in guard let self else { return } Task { @MainActor in @@ -328,28 +348,41 @@ final class SessionModel: ObservableObject { self.fps = frames self.mbps = Double(bytes) * 8 / 1_000_000 self.totalFrames = total + // Per-window `lost` = the delta of the connector's cumulative reassembler-drop + // counter (0 after close — treat a rewind as no loss rather than underflowing). + let dropped = self.connection?.framesDropped() ?? 0 + let lost = dropped >= self.lastFramesDropped + ? Int(dropped - self.lastFramesDropped) : 0 + self.lastFramesDropped = dropped + self.lostFrames = lost + self.lostPct = lost > 0 ? 
Double(lost) / Double(frames + lost) * 100 : 0 if let lat = self.latency.drain() { - self.latencyP50Ms = lat.p50Ms - self.latencyP95Ms = lat.p95Ms - self.latencySkewCorrected = lat.skewCorrected - self.latencyValid = true + self.hostNetworkP50Ms = lat.p50Ms + self.hostNetworkP95Ms = lat.p95Ms + self.hostNetworkSkewCorrected = lat.skewCorrected + self.hostNetworkValid = true } else { - self.latencyValid = false + self.hostNetworkValid = false } - if let p = self.presentLatency.drain() { - self.presentLatencyP50Ms = p.p50Ms - self.presentLatencyP95Ms = p.p95Ms - self.presentLatencySkewCorrected = p.skewCorrected - self.presentLatencyValid = true + if let e = self.endToEnd.drain() { + self.endToEndP50Ms = e.p50Ms + self.endToEndP95Ms = e.p95Ms + self.endToEndSkewCorrected = e.skewCorrected + self.endToEndValid = true } else { - self.presentLatencyValid = false + self.endToEndValid = false } - if let t = self.presentTail.drain() { - self.presentTailP50Ms = t.p50Ms - self.presentTailP95Ms = t.p95Ms - self.presentTailValid = true + if let d = self.decodeStage.drain() { + self.decodeP50Ms = d.p50Ms + self.decodeValid = true } else { - self.presentTailValid = false + self.decodeValid = false + } + if let d = self.displayStage.drain() { + self.displayP50Ms = d.p50Ms + self.displayValid = true + } else { + self.displayValid = false } } } diff --git a/clients/apple/Sources/PunktfunkClient/Session/StreamHUDView.swift b/clients/apple/Sources/PunktfunkClient/Session/StreamHUDView.swift index 1ebde7c..5d5a65a 100644 --- a/clients/apple/Sources/PunktfunkClient/Session/StreamHUDView.swift +++ b/clients/apple/Sources/PunktfunkClient/Session/StreamHUDView.swift @@ -1,5 +1,7 @@ -// The streaming overlay HUD: mode + fps/throughput, the capture→client (and, under the stage-2 -// presenter, capture→present) latency lines, the platform input hint, and disconnect. 
+// The streaming overlay HUD: mode + fps/throughput, the unified latency lines +// (design/stats-unification.md — end-to-end headline + the stage equation under stage-2, the +// capture→received headline under the stage-1 fallback), the loss counter, the platform input +// hint, and disconnect. import PunktfunkKit import SwiftUI @@ -18,24 +20,32 @@ struct StreamHUDView: View { Text("\(connection.width)×\(connection.height)@\(connection.refreshHz) \(model.fps) fps \(model.mbps, specifier: "%.1f") Mb/s") .font(.system(.caption, design: .monospaced)) } - if model.latencyValid { - // Capture→client-receipt (skew-corrected); excludes the layer's decode+present — - // see LatencyMeter. "(same-host)" when the host didn't answer the skew handshake. - Text("capture→client \(model.latencyP50Ms, specifier: "%.1f")/\(model.latencyP95Ms, specifier: "%.1f") ms p50/p95\(model.latencySkewCorrected ? "" : " (same-host)")") + if model.endToEndValid { + // Stage-2: the end-to-end headline (capture→on-glass, measured directly, skew- + // corrected) — "(same-host clock)" when the host didn't answer the skew handshake. + Text("end-to-end \(model.endToEndP50Ms, specifier: "%.1f") ms p50 · \(model.endToEndP95Ms, specifier: "%.1f") p95 · capture→on-glass\(model.endToEndSkewCorrected ? "" : " (same-host clock)")") + .font(.system(.caption2, design: .monospaced)) + .foregroundStyle(.secondary) + // The equation: the three stages tiling the headline interval (per-window p50s — + // they only approximately sum to the directly-measured total). 
+ if model.hostNetworkValid && model.decodeValid && model.displayValid { + Text("= host+network \(model.hostNetworkP50Ms, specifier: "%.1f") + decode \(model.decodeP50Ms, specifier: "%.1f") + display \(model.displayP50Ms, specifier: "%.1f")") + .font(.system(.caption2, design: .monospaced)) + .foregroundStyle(.secondary) + } + } else if model.hostNetworkValid { + // Stage-1 fallback presenter: the layer decodes + presents internally with no + // per-frame stamp, so the honest headline ends at receipt — and there is no + // equation line (host+network is the whole measured interval). + Text("capture→received \(model.hostNetworkP50Ms, specifier: "%.1f") ms p50 · \(model.hostNetworkP95Ms, specifier: "%.1f") p95\(model.hostNetworkSkewCorrected ? "" : " (same-host clock)")") .font(.system(.caption2, design: .monospaced)) .foregroundStyle(.secondary) } - if model.presentLatencyValid { - // Capture→present (glass-to-glass, modulo host render→capture) — stage-2 presenter - // only; stage-1's layer presents internally with no per-frame stamp. - Text("capture→present \(model.presentLatencyP50Ms, specifier: "%.1f")/\(model.presentLatencyP95Ms, specifier: "%.1f") ms p50/p95\(model.presentLatencySkewCorrected ? "" : " (same-host)")") - .font(.system(.caption2, design: .monospaced)) - .foregroundStyle(.secondary) - } - if model.presentTailValid { - // Decode→present (the client-local "present tail": ring wait + render + vsync) — - // the term the stage-2 presenter shortens; no skew applies (one clock). - Text("decode→present \(model.presentTailP50Ms, specifier: "%.1f")/\(model.presentTailP95Ms, specifier: "%.1f") ms p50/p95") + if model.lostFrames > 0 { + // Unrecoverable network drops this window; hidden while the link is clean. + // String(format:) rather than specifier interpolation: the literal % would + // otherwise land in the LocalizedStringKey's format string as a bogus conversion. 
+ Text(String(format: "lost %d (%.1f%%)", model.lostFrames, model.lostPct)) .font(.system(.caption2, design: .monospaced)) .foregroundStyle(.secondary) } diff --git a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift index 82a04f5..088fa63 100644 --- a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift +++ b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift @@ -310,10 +310,11 @@ extension SettingsView { Text("Video presenter · debug") } footer: { Text("Stage 2 (default) decodes explicitly and presents through Metal with a display " - + "link — it adds a capture→present (glass-to-glass) latency line in the HUD and " - + "self-recovers from decode stalls. Stage 1 feeds compressed video straight to the " - + "system display layer; it freezes on a lost HEVC reference frame, so it's a debug " - + "fallback only. Applies from the next session.") + + "link — it gives the HUD the end-to-end (capture→on-glass) headline with the " + + "host+network/decode/display stage equation and self-recovers from decode " + + "stalls. Stage 1 feeds compressed video straight to the system display layer; " + + "it freezes on a lost HEVC reference frame, so it's a debug fallback only. 
" + + "Applies from the next session.") .font(.geist(12, relativeTo: .caption)) .foregroundStyle(.secondary) } diff --git a/clients/apple/Sources/PunktfunkKit/Connection/PunktfunkConnection.swift b/clients/apple/Sources/PunktfunkKit/Connection/PunktfunkConnection.swift index 76c39d3..cc1740a 100644 --- a/clients/apple/Sources/PunktfunkKit/Connection/PunktfunkConnection.swift +++ b/clients/apple/Sources/PunktfunkKit/Connection/PunktfunkConnection.swift @@ -35,6 +35,10 @@ public struct AccessUnit: Sendable { public let ptsNs: UInt64 public let frameIndex: UInt32 public let flags: UInt32 + /// Client `CLOCK_REALTIME` instant the AU was handed over by the core (post-FEC, decrypted) + /// — the **received** measurement point of design/stats-unification.md. The decode stage is + /// `decodedNs - receivedNs`, both client-local (no skew offset applies). + public let receivedNs: Int64 } /// One Opus audio packet (48 kHz stereo, 5 ms frames) — decode with AVAudioConverter @@ -419,9 +423,13 @@ public final class PunktfunkConnection { case statusOK: guard let base = frame.data, frame.len > 0 else { return nil } let data = Data(bytes: base, count: Int(frame.len)) // copy: ptr valid only until next call + var ts = timespec() + clock_gettime(CLOCK_REALTIME, &ts) + let receivedNs = Int64(ts.tv_sec) * 1_000_000_000 + Int64(ts.tv_nsec) return AccessUnit( data: data, ptsNs: frame.pts_ns, - frameIndex: frame.frame_index, flags: frame.flags) + frameIndex: frame.frame_index, flags: frame.flags, + receivedNs: receivedNs) case statusNoFrame: return nil case statusClosed: diff --git a/clients/apple/Sources/PunktfunkKit/Video/LatencyMeter.swift b/clients/apple/Sources/PunktfunkKit/Video/LatencyMeter.swift index 7c9b6da..a316204 100644 --- a/clients/apple/Sources/PunktfunkKit/Video/LatencyMeter.swift +++ b/clients/apple/Sources/PunktfunkKit/Video/LatencyMeter.swift @@ -1,23 +1,25 @@ -// Per-frame latency sampler for the live HUD: records capture->client-receipt latency and drains -// 
percentiles on demand. NSLock rather than an actor — the writer is the non-async pump/arrival -// path (same pattern as the app's FrameMeter). +// Per-frame latency-stage sampler for the live HUD: records one interval per frame (an end +// instant minus a start instant, both CLOCK_REALTIME ns) and drains percentiles on demand. +// NSLock rather than an actor — the writers are the non-async pump/decode/present paths (same +// pattern as the app's FrameMeter). import Foundation -/// Samples the **capture->client-receipt** latency of each access unit and reports percentiles. +/// Samples one **latency stage** per frame and reports percentiles. One instance per stage of the +/// unified stats model (design/stats-unification.md): /// -/// The latency is `now - pts_ns`, where `pts_ns` is the host's capture wall clock (the AU's pts) and -/// `now` is the client's `CLOCK_REALTIME` instant the AU was received, shifted by the connect-time -/// **clock-skew offset** (`PunktfunkConnection.clockOffsetNs`, host minus client) so the difference -/// is valid across machines. `offsetNs == 0` means an old host that didn't answer the skew handshake -/// (or genuinely synced clocks) — the number is then only meaningful same-host. +/// - `host+network` = capture→received: `record(ptsNs:offsetNs:)` at AU receipt. +/// - `decode` = received→decoded and `display` = decoded→displayed: client-local single-clock +/// stages — `record(ptsNs:atNs:offsetNs:)` with the start instant as `ptsNs` and `offsetNs: 0`. +/// - `end-to-end` = capture→displayed, measured directly (never summed from the stages): +/// `record(ptsNs:atNs:offsetNs:)` at present. /// -/// SCOPE (stage-1 presenter): this covers host capture -> encode -> FEC -> network -> reassembly -> -/// decrypt -> handed to the presenter. It does **not** include the on-device VideoToolbox decode or -/// the `AVSampleBufferDisplayLayer` present — that layer decodes and presents compressed samples -/// internally with no per-frame callback. 
True decode->present (the full glass-to-glass) needs the -/// stage-2 presenter (`VTDecompressionSession` decode-completion + `CAMetalLayer`/display-link -/// present); this meter is the substrate it will extend. +/// For the host-anchored intervals (capture→…) the sample is `end + offset - pts_ns`, where +/// `pts_ns` is the host's capture wall clock (the AU's pts) and the connect-time **clock-skew +/// offset** (`PunktfunkConnection.clockOffsetNs`, host minus client) makes the difference valid +/// across machines. `offsetNs == 0` means an old host that didn't answer the skew handshake (or +/// genuinely synced clocks) — the number is then only meaningful same-host, and the HUD tags the +/// end-to-end line `(same-host clock)`. public final class LatencyMeter: @unchecked Sendable { private let lock = NSLock() private var samplesUs: [Int64] = [] @@ -34,12 +36,16 @@ public final class LatencyMeter: @unchecked Sendable { record(ptsNs: ptsNs, atNs: nowNs, offsetNs: offsetNs) } - /// Record one frame whose latency is `atNs + offsetNs - ptsNs` — an EXPLICIT client instant - /// rather than now. The stage-2 presenter uses this to stamp capture→present at the display - /// link's target present time (not the moment the present call ran). All in `CLOCK_REALTIME`. + /// Record one frame whose sample is `atNs + offsetNs - ptsNs` — an EXPLICIT end instant + /// rather than now. `ptsNs` is the stage's start point: the AU pts for the host-anchored + /// intervals, or a client stamp (receivedNs / decodedNs, with `offsetNs: 0`) for the local + /// decode/display stages. The stage-2 presenter stamps its present-side samples at the + /// display link's target present time (not the moment the present call ran). All in + /// `CLOCK_REALTIME`. public func record(ptsNs: UInt64, atNs: Int64, offsetNs: Int64) { let latNs = atNs &+ offsetNs &- Int64(bitPattern: ptsNs) - // Drop absurd values (a clock step, a wildly wrong offset, or garbage pts). 
+ // Drop absurd values (a clock step, a wildly wrong offset, garbage pts, or a stage whose + // start stamp is missing/after its end) — samples are clamped to (0, 10 s). guard latNs > 0, latNs < 10_000_000_000 else { return } lock.lock() samplesUs.append(latNs / 1000) diff --git a/clients/apple/Sources/PunktfunkKit/Video/SessionPresenter.swift b/clients/apple/Sources/PunktfunkKit/Video/SessionPresenter.swift index f8b2166..ae305e2 100644 --- a/clients/apple/Sources/PunktfunkKit/Video/SessionPresenter.swift +++ b/clients/apple/Sources/PunktfunkKit/Video/SessionPresenter.swift @@ -38,8 +38,9 @@ final class SessionPresenter { func start( connection: PunktfunkConnection, baseLayer: AVSampleBufferDisplayLayer, - presentMeter: LatencyMeter?, - presentTailMeter: LatencyMeter? = nil, + endToEndMeter: LatencyMeter?, + decodeMeter: LatencyMeter? = nil, + displayMeter: LatencyMeter? = nil, makeDisplayLink: (AnyObject, Selector) -> CADisplayLink, onFrame: (@Sendable (AccessUnit) -> Void)?, onSessionEnd: (@Sendable () -> Void)? @@ -59,7 +60,8 @@ final class SessionPresenter { #endif if !forceStage1, let pipeline = Stage2Pipeline( - presentMeter: presentMeter, presentTailMeter: presentTailMeter) { + endToEndMeter: endToEndMeter, decodeMeter: decodeMeter, + displayMeter: displayMeter) { let metal = pipeline.layer // The opaque metal layer composites OVER the AVSampleBufferDisplayLayer base, which // sits idle (un-enqueued) in stage-2. contentsScale + frame are set in layout(). 
diff --git a/clients/apple/Sources/PunktfunkKit/Video/Stage2Pipeline.swift b/clients/apple/Sources/PunktfunkKit/Video/Stage2Pipeline.swift index 9ed89ca..6634ca1 100644 --- a/clients/apple/Sources/PunktfunkKit/Video/Stage2Pipeline.swift +++ b/clients/apple/Sources/PunktfunkKit/Video/Stage2Pipeline.swift @@ -1,7 +1,8 @@ // Stage-2 presenter orchestrator: a pump thread pulls AUs → VideoDecoder; the decoder's async output // drops the newest decoded frame into a 1-slot ring; the hosting view's display link calls `renderTick` -// once per vsync to draw + present the newest ready frame and stamp capture→present. Mirrors -// StreamPump's lifecycle (one per start; cancel is permanent). +// once per vsync to draw + present the newest ready frame and stamp the unified latency stages +// (end-to-end capture→on-glass, plus the decode and display stage terms — +// design/stats-unification.md). Mirrors StreamPump's lifecycle (one per start; cancel is permanent). // // Threading: the pump runs on its own thread; the decoder callback on a VT thread; `renderTick` + // `start`/`stop` on the MAIN thread (the view's CADisplayLink fires there). Only the ring (lock-guarded) @@ -40,8 +41,8 @@ public final class Stage2Pipeline { private let ring = ReadyRing() private let presenter: MetalVideoPresenter private let decoder: VideoDecoder - private let presentMeter: LatencyMeter? - private let presentTailMeter: LatencyMeter? + private let endToEndMeter: LatencyMeter? + private let displayMeter: LatencyMeter? private let recovery = KeyframeRecovery() private var token = StopFlag() private var offsetNs: Int64 = 0 @@ -56,28 +57,41 @@ public final class Stage2Pipeline { /// The Metal layer the hosting view installs + sizes. public var layer: CAMetalLayer { presenter.layer } - /// `presentMeter` records capture→present (the glass-to-glass term); `presentTailMeter` - /// records decode-completion→present (the ring wait + render — the tail stage-2 exists to - /// shorten). 
Both optional: metering never gates the presenter choice. Returns nil if Metal - /// can't be set up (headless / no GPU) — caller falls back to the stage-1 presenter. - public init?(presentMeter: LatencyMeter?, presentTailMeter: LatencyMeter? = nil) { + /// Unified-stats meters (design/stats-unification.md): `endToEndMeter` records the headline + /// end-to-end (capture→on-glass, skew-corrected); `decodeMeter` the decode stage + /// (received→decoded); `displayMeter` the display stage (decoded→on-glass, the ring wait + + /// render + vsync — the tail stage-2 exists to shorten). All optional: metering never gates + /// the presenter choice. Returns nil if Metal can't be set up (headless / no GPU) — caller + /// falls back to the stage-1 presenter. + public init?( + endToEndMeter: LatencyMeter?, + decodeMeter: LatencyMeter? = nil, + displayMeter: LatencyMeter? = nil + ) { guard let presenter = MetalVideoPresenter.make() else { return nil } self.presenter = presenter - self.presentMeter = presentMeter - self.presentTailMeter = presentTailMeter + self.endToEndMeter = endToEndMeter + self.displayMeter = displayMeter let ring = ring let recovery = recovery self.decoder = VideoDecoder( - onDecoded: { ring.submit($0) }, + onDecoded: { frame in + // Decode stage = received→decoded, both client CLOCK_REALTIME (offset 0 — no + // skew applies). Stamped at decode completion, so it covers every decoded frame, + // including ones the newest-wins ring drops before present. + decodeMeter?.record( + ptsNs: UInt64(frame.receivedNs), atNs: frame.decodedNs, offsetNs: 0) + ring.submit(frame) + }, // Async decode failure (a bad P-frame referencing a lost/corrupt IDR): the pump resets to // re-gate on the next IDR, and we ask the host to send one now (infinite GOP — it wouldn't // otherwise come soon). Throttled in KeyframeRecovery. onDecodeError: { _ in recovery.request() }) } - /// Start pulling AUs into the decoder. MAIN THREAD. 
`onFrame` fires per AU at receipt (capture→client - /// meter, exactly as stage-1); `onSessionEnd` on close. `clockOffsetNs` (host minus client) makes the - /// present stamp cross-machine valid. + /// Start pulling AUs into the decoder. MAIN THREAD. `onFrame` fires per AU at receipt (the + /// host+network / capture→received meter, exactly as stage-1); `onSessionEnd` on close. + /// `clockOffsetNs` (host minus client) makes the end-to-end stamp cross-machine valid. public func start( connection: PunktfunkConnection, onFrame: (@Sendable (AccessUnit) -> Void)?, @@ -174,14 +188,16 @@ public final class Stage2Pipeline { public func renderTick(targetPresentNs: Int64) { guard let frame = ring.take() else { return } let offsetNs = offsetNs - let presentMeter = presentMeter - let presentTailMeter = presentTailMeter + let endToEndMeter = endToEndMeter + let displayMeter = displayMeter let rendered = presenter.render(frame.pixelBuffer, isHDR: frame.isHDR) { presentedNs in let atNs = presentedNs ?? targetPresentNs - presentMeter?.record(ptsNs: frame.ptsNs, atNs: atNs, offsetNs: offsetNs) - // Present tail = decode-completion → on-glass. Both instants are client - // CLOCK_REALTIME, so no skew offset applies. - presentTailMeter?.record(ptsNs: UInt64(frame.decodedNs), atNs: atNs, offsetNs: 0) + // End-to-end = capture→on-glass, measured directly (skew-corrected via the + // connect-time clock offset) — the HUD headline. + endToEndMeter?.record(ptsNs: frame.ptsNs, atNs: atNs, offsetNs: offsetNs) + // Display stage = decoded → on-glass. Both instants are client CLOCK_REALTIME, + // so no skew offset applies. 
+ displayMeter?.record(ptsNs: UInt64(frame.decodedNs), atNs: atNs, offsetNs: 0) } if !rendered { ring.putBack(frame) } } diff --git a/clients/apple/Sources/PunktfunkKit/Video/Stage444Probe.swift b/clients/apple/Sources/PunktfunkKit/Video/Stage444Probe.swift index 18bb8b4..7d0112a 100644 --- a/clients/apple/Sources/PunktfunkKit/Video/Stage444Probe.swift +++ b/clients/apple/Sources/PunktfunkKit/Video/Stage444Probe.swift @@ -61,7 +61,7 @@ public enum Stage444Probe { guard created == noErr, let session else { return false } defer { VTDecompressionSessionInvalidate(session) } - let au = AccessUnit(data: data, ptsNs: 0, frameIndex: 0, flags: 0) + let au = AccessUnit(data: data, ptsNs: 0, frameIndex: 0, flags: 0, receivedNs: 0) guard let sample = AnnexB.sampleBuffer(au: au, format: format, codec: .hevc) else { return false } var produced: OSType = 0 diff --git a/clients/apple/Sources/PunktfunkKit/Video/VideoDecoder.swift b/clients/apple/Sources/PunktfunkKit/Video/VideoDecoder.swift index 1327d89..853e83e 100644 --- a/clients/apple/Sources/PunktfunkKit/Video/VideoDecoder.swift +++ b/clients/apple/Sources/PunktfunkKit/Video/VideoDecoder.swift @@ -15,6 +15,10 @@ import VideoToolbox public struct ReadyFrame: @unchecked Sendable { /// Host capture clock (the AU's pts), in nanoseconds. public let ptsNs: UInt64 + /// Client `CLOCK_REALTIME` instant the AU was received (`AccessUnit.receivedNs`, threaded + /// through the decode via the frame refcon), in nanoseconds. 0 when unknown (a caller that + /// didn't stamp receipt) — the decode-stage meter then drops the sample via its sanity guard. + public let receivedNs: Int64 /// Client `CLOCK_REALTIME` instant decode completed, in nanoseconds. public let decodedNs: Int64 /// The decoded image — 8-bit NV12 biplanar (SDR) or 10-bit P010 biplanar (HDR), Metal-compatible. 
@@ -25,13 +29,16 @@ public struct ReadyFrame: @unchecked Sendable { } /// The C output callback can't capture context, so VideoToolbox hands it the refcon we set at -/// session creation — a pointer back to the owning `VideoDecoder`. +/// session creation — a pointer back to the owning `VideoDecoder`. The per-frame refcon carries +/// the AU's `receivedNs` as a pointer bit pattern (a scalar smuggled through the C void*, never +/// dereferenced) so the decode stage can be computed against decode-completion. private let decoderOutputCallback: VTDecompressionOutputCallback = { - refcon, _, status, _, imageBuffer, pts, _ in + refcon, frameRefcon, status, _, imageBuffer, pts, _ in guard let refcon else { return } + let receivedNs = frameRefcon.map { Int64(Int(bitPattern: $0)) } ?? 0 Unmanaged.fromOpaque(refcon) .takeUnretainedValue() - .handleDecoded(status: status, imageBuffer: imageBuffer, pts: pts) + .handleDecoded(status: status, imageBuffer: imageBuffer, pts: pts, receivedNs: receivedNs) } /// Owns a `VTDecompressionSession` rebuilt whenever the format description changes (every IDR / @@ -112,7 +119,9 @@ public final class VideoDecoder: @unchecked Sendable { session, sampleBuffer: sample, flags: [._EnableAsynchronousDecompression], - frameRefcon: nil, + // The AU's receipt instant rides through as a bit pattern (nil for 0 — the output + // callback maps that back to 0); the callback needs it to stamp the decode stage. + frameRefcon: UnsafeMutableRawPointer(bitPattern: Int(au.receivedNs)), infoFlagsOut: &infoOut) lock.unlock() if status != noErr { @@ -218,8 +227,11 @@ public final class VideoDecoder: @unchecked Sendable { return true } - /// VT thread. Stamp decode-completion and enqueue, or report the error. - fileprivate func handleDecoded(status: OSStatus, imageBuffer: CVImageBuffer?, pts: CMTime) { + /// VT thread. Stamp decode-completion and enqueue, or report the error. 
`receivedNs` is the + /// AU's receipt instant threaded through the frame refcon (0 = unknown). + fileprivate func handleDecoded( + status: OSStatus, imageBuffer: CVImageBuffer?, pts: CMTime, receivedNs: Int64 + ) { guard status == noErr, let imageBuffer else { onDecodeError(status) return @@ -242,6 +254,8 @@ public final class VideoDecoder: @unchecked Sendable { || fmt == kCVPixelFormatType_444YpCbCr10BiPlanarVideoRange || fmt == kCVPixelFormatType_444YpCbCr10BiPlanarFullRange onDecoded( - ReadyFrame(ptsNs: ptsNs, decodedNs: decodedNs, pixelBuffer: imageBuffer, isHDR: isHDR)) + ReadyFrame( + ptsNs: ptsNs, receivedNs: receivedNs, decodedNs: decodedNs, + pixelBuffer: imageBuffer, isHDR: isHDR)) } } diff --git a/clients/apple/Sources/PunktfunkKit/Views/StreamView.swift b/clients/apple/Sources/PunktfunkKit/Views/StreamView.swift index c6964ef..441211c 100644 --- a/clients/apple/Sources/PunktfunkKit/Views/StreamView.swift +++ b/clients/apple/Sources/PunktfunkKit/Views/StreamView.swift @@ -85,39 +85,45 @@ public struct StreamView: NSViewRepresentable { private let onCaptureChange: ((Bool) -> Void)? private let onFrame: (@Sendable (AccessUnit) -> Void)? private let onSessionEnd: (@Sendable () -> Void)? - private let presentMeter: LatencyMeter? - private let presentTailMeter: LatencyMeter? + private let endToEndMeter: LatencyMeter? + private let decodeMeter: LatencyMeter? + private let displayMeter: LatencyMeter? /// `onFrame`/`onSessionEnd` fire on the pump thread — hop to the main actor for UI. /// `captureEnabled: false` disables input capture entirely while UI (e.g. a trust /// prompt) is layered over the stream; flipping it to true auto-engages capture /// once. `onCaptureChange` (main thread) reports engage/release — drive the HUD's - /// "click to capture" / "⌘⎋ releases" hint with it. `presentMeter` records capture→present - /// and `presentTailMeter` decode→present when the stage-2 presenter is active. + /// "click to capture" / "⌘⎋ releases" hint with it. 
The meters record the unified latency + /// stages when the stage-2 presenter is active (design/stats-unification.md): + /// `endToEndMeter` capture→on-glass, `decodeMeter` received→decoded, `displayMeter` + /// decoded→on-glass. public init( connection: PunktfunkConnection, captureEnabled: Bool = true, onCaptureChange: ((Bool) -> Void)? = nil, onFrame: (@Sendable (AccessUnit) -> Void)? = nil, onSessionEnd: (@Sendable () -> Void)? = nil, - presentMeter: LatencyMeter? = nil, - presentTailMeter: LatencyMeter? = nil + endToEndMeter: LatencyMeter? = nil, + decodeMeter: LatencyMeter? = nil, + displayMeter: LatencyMeter? = nil ) { self.connection = connection self.captureEnabled = captureEnabled self.onCaptureChange = onCaptureChange self.onFrame = onFrame self.onSessionEnd = onSessionEnd - self.presentMeter = presentMeter - self.presentTailMeter = presentTailMeter + self.endToEndMeter = endToEndMeter + self.decodeMeter = decodeMeter + self.displayMeter = displayMeter } public func makeNSView(context: Context) -> StreamLayerView { let view = StreamLayerView() view.onCaptureChange = onCaptureChange view.captureEnabled = captureEnabled - view.presentMeter = presentMeter - view.presentTailMeter = presentTailMeter + view.endToEndMeter = endToEndMeter + view.decodeMeter = decodeMeter + view.displayMeter = displayMeter view.start(connection: connection, onFrame: onFrame, onSessionEnd: onSessionEnd) return view } @@ -125,8 +131,9 @@ public struct StreamView: NSViewRepresentable { public func updateNSView(_ view: StreamLayerView, context: Context) { view.onCaptureChange = onCaptureChange view.captureEnabled = captureEnabled - view.presentMeter = presentMeter - view.presentTailMeter = presentTailMeter + view.endToEndMeter = endToEndMeter + view.decodeMeter = decodeMeter + view.displayMeter = displayMeter // SwiftUI reuses the NSView across state changes — repoint the pump only when the // connection identity actually changed. 
if view.connection !== connection { @@ -141,10 +148,11 @@ public struct StreamView: NSViewRepresentable { public final class StreamLayerView: NSView { private let displayLayer = AVSampleBufferDisplayLayer() - /// Record capture→present / decode→present when the stage-2 presenter is active. - /// Consulted at start(). - var presentMeter: LatencyMeter? - var presentTailMeter: LatencyMeter? + /// Record the unified latency stages (end-to-end / decode / display) when the stage-2 + /// presenter is active. Consulted at start(). + var endToEndMeter: LatencyMeter? + var decodeMeter: LatencyMeter? + var displayMeter: LatencyMeter? /// The shared presenter stack: stage-2 (CAMetalLayer sublayer + display link) with the /// stage-1 StreamPump → displayLayer path as the Metal-unavailable / DEBUG fallback. private let presenter = SessionPresenter() @@ -571,8 +579,9 @@ public final class StreamLayerView: NSView { presenter.start( connection: connection, baseLayer: displayLayer, - presentMeter: presentMeter, - presentTailMeter: presentTailMeter, + endToEndMeter: endToEndMeter, + decodeMeter: decodeMeter, + displayMeter: displayMeter, makeDisplayLink: { displayLink(target: $0, selector: $1) }, onFrame: onFrame, onSessionEnd: onSessionEnd) diff --git a/clients/apple/Sources/PunktfunkKit/Views/StreamViewIOS.swift b/clients/apple/Sources/PunktfunkKit/Views/StreamViewIOS.swift index 03e1af2..8c32aab 100644 --- a/clients/apple/Sources/PunktfunkKit/Views/StreamViewIOS.swift +++ b/clients/apple/Sources/PunktfunkKit/Views/StreamViewIOS.swift @@ -50,8 +50,9 @@ public struct StreamView: UIViewControllerRepresentable { private let onCaptureChange: ((Bool) -> Void)? private let onFrame: (@Sendable (AccessUnit) -> Void)? private let onSessionEnd: (@Sendable () -> Void)? - private let presentMeter: LatencyMeter? - private let presentTailMeter: LatencyMeter? + private let endToEndMeter: LatencyMeter? + private let decodeMeter: LatencyMeter? + private let displayMeter: LatencyMeter? 
public init( connection: PunktfunkConnection, @@ -59,24 +60,27 @@ public struct StreamView: UIViewControllerRepresentable { onCaptureChange: ((Bool) -> Void)? = nil, onFrame: (@Sendable (AccessUnit) -> Void)? = nil, onSessionEnd: (@Sendable () -> Void)? = nil, - presentMeter: LatencyMeter? = nil, - presentTailMeter: LatencyMeter? = nil + endToEndMeter: LatencyMeter? = nil, + decodeMeter: LatencyMeter? = nil, + displayMeter: LatencyMeter? = nil ) { self.connection = connection self.captureEnabled = captureEnabled self.onCaptureChange = onCaptureChange self.onFrame = onFrame self.onSessionEnd = onSessionEnd - self.presentMeter = presentMeter - self.presentTailMeter = presentTailMeter + self.endToEndMeter = endToEndMeter + self.decodeMeter = decodeMeter + self.displayMeter = displayMeter } public func makeUIViewController(context: Context) -> StreamViewController { let controller = StreamViewController() controller.onCaptureChange = onCaptureChange controller.captureEnabled = captureEnabled - controller.presentMeter = presentMeter - controller.presentTailMeter = presentTailMeter + controller.endToEndMeter = endToEndMeter + controller.decodeMeter = decodeMeter + controller.displayMeter = displayMeter controller.start(connection: connection, onFrame: onFrame, onSessionEnd: onSessionEnd) return controller } @@ -84,8 +88,9 @@ public struct StreamView: UIViewControllerRepresentable { public func updateUIViewController(_ controller: StreamViewController, context: Context) { controller.onCaptureChange = onCaptureChange controller.captureEnabled = captureEnabled - controller.presentMeter = presentMeter - controller.presentTailMeter = presentTailMeter + controller.endToEndMeter = endToEndMeter + controller.decodeMeter = decodeMeter + controller.displayMeter = displayMeter if controller.connection !== connection { controller.start(connection: connection, onFrame: onFrame, onSessionEnd: onSessionEnd) } @@ -101,10 +106,11 @@ public struct StreamView: UIViewControllerRepresentable 
{ public final class StreamViewController: UIViewController { public private(set) var connection: PunktfunkConnection? private var observers: [NSObjectProtocol] = [] - /// Record capture→present / decode→present when the stage-2 presenter is active. - /// Consulted at start(). - var presentMeter: LatencyMeter? - var presentTailMeter: LatencyMeter? + /// Record the unified latency stages (end-to-end / decode / display) when the stage-2 + /// presenter is active. Consulted at start(). + var endToEndMeter: LatencyMeter? + var decodeMeter: LatencyMeter? + var displayMeter: LatencyMeter? /// The shared presenter stack: stage-2 (CAMetalLayer sublayer + display link) with the /// stage-1 StreamPump → displayLayer path as the Metal-unavailable / DEBUG fallback. private let presenter = SessionPresenter() @@ -285,8 +291,9 @@ public final class StreamViewController: UIViewController { presenter.start( connection: connection, baseLayer: streamView.displayLayer, - presentMeter: presentMeter, - presentTailMeter: presentTailMeter, + endToEndMeter: endToEndMeter, + decodeMeter: decodeMeter, + displayMeter: displayMeter, makeDisplayLink: { CADisplayLink(target: $0, selector: $1) }, onFrame: onFrame, onSessionEnd: onSessionEnd) diff --git a/clients/apple/Tests/PunktfunkKitTests/LatencyMeterTests.swift b/clients/apple/Tests/PunktfunkKitTests/LatencyMeterTests.swift index c276747..bb64709 100644 --- a/clients/apple/Tests/PunktfunkKitTests/LatencyMeterTests.swift +++ b/clients/apple/Tests/PunktfunkKitTests/LatencyMeterTests.swift @@ -1,6 +1,10 @@ -// Unit tests for LatencyMeter: percentiles, the skew-corrected flag, reset-on-drain, and the -// absurd-value guard. Latencies are constructed by stamping a pts a known interval in the past, so -// the result is that interval plus the (tiny) clock advance between reads — asserted with tolerance. 
+// Unit tests for LatencyMeter (one instance per unified-stats stage — see +// design/stats-unification.md): percentiles, the skew-corrected flag, reset-on-drain, the +// absurd-value guard, and the explicit-instant stage form (record(ptsNs:atNs:offsetNs:), used for +// the client-local decode/display stages and the at-present end-to-end stamp). Receipt-path +// latencies are constructed by stamping a pts a known interval in the past, so the result is that +// interval plus the (tiny) clock advance between reads — asserted with tolerance; the explicit +// form is exact. import Foundation import XCTest @@ -38,6 +42,26 @@ final class LatencyMeterTests: XCTestCase { XCTAssertEqual(m.drain()?.skewCorrected, true) } + func testExplicitStageRecordIsExact() { + let m = LatencyMeter() + // A client-local stage (decode: received→decoded) — start instant as ptsNs, offset 0. + let receivedNs: Int64 = 1_000_000_000_000 + m.record(ptsNs: UInt64(receivedNs), atNs: receivedNs + 3_000_000, offsetNs: 0) + guard let s = m.drain() else { return XCTFail("expected a sample") } + XCTAssertEqual(s.count, 1) + XCTAssertEqual(s.p50Ms, 3.0, "explicit instants make the sample exact") + XCTAssertFalse(s.skewCorrected, "local stages record with offset 0") + } + + func testExplicitStageDropsNonPositiveInterval() { + let m = LatencyMeter() + // A stage whose start stamp is missing (0) or after its end must not pollute the window. 
+ let decodedNs: Int64 = 1_000_000_000_000 + m.record(ptsNs: 0, atNs: decodedNs, offsetNs: 0) // "start unknown" → > 10 s → dropped + m.record(ptsNs: UInt64(decodedNs + 1), atNs: decodedNs, offsetNs: 0) // negative → dropped + XCTAssertNil(m.drain()) + } + func testDropsAbsurdValues() { let m = LatencyMeter() let now = nowRealtimeNs() diff --git a/clients/apple/Tests/PunktfunkKitTests/Stage444Tests.swift b/clients/apple/Tests/PunktfunkKitTests/Stage444Tests.swift index 328152a..b13fc09 100644 --- a/clients/apple/Tests/PunktfunkKitTests/Stage444Tests.swift +++ b/clients/apple/Tests/PunktfunkKitTests/Stage444Tests.swift @@ -31,7 +31,7 @@ final class Stage444Tests: XCTestCase { let data = Data(Probe444Blobs.au444_8bit) let format = try XCTUnwrap( AnnexB.formatDescription(fromIDR: data, codec: .hevc), "the 4:4:4 blob must yield a format description") - let au = AccessUnit(data: data, ptsNs: 7_000_000, frameIndex: 0, flags: 0) + let au = AccessUnit(data: data, ptsNs: 7_000_000, frameIndex: 0, flags: 0, receivedNs: 0) let box = FrameBox() let done = DispatchSemaphore(value: 0) diff --git a/clients/apple/Tests/PunktfunkKitTests/VideoToolboxRoundTripTests.swift b/clients/apple/Tests/PunktfunkKitTests/VideoToolboxRoundTripTests.swift index c989943..3b3a15c 100644 --- a/clients/apple/Tests/PunktfunkKitTests/VideoToolboxRoundTripTests.swift +++ b/clients/apple/Tests/PunktfunkKitTests/VideoToolboxRoundTripTests.swift @@ -38,7 +38,8 @@ final class VideoToolboxRoundTripTests: XCTestCase { XCTAssertEqual(AnnexB.avcc(from: annexB, codec: .hevc), avccSample) // 3) Sample buffer → real decoder → pixels. - let au = AccessUnit(data: annexB, ptsNs: 1_000_000, frameIndex: 0, flags: 0) + let au = AccessUnit( + data: annexB, ptsNs: 1_000_000, frameIndex: 0, flags: 0, receivedNs: 0) let sample = try XCTUnwrap(AnnexB.sampleBuffer(au: au, format: rebuilt, codec: .hevc)) var session: VTDecompressionSession? 
@@ -67,13 +68,14 @@ final class VideoToolboxRoundTripTests: XCTestCase { } /// Stage-2 decode half: the same known IDR through `VideoDecoder` — assert its async output - /// callback fires with a CVPixelBuffer of the right dimensions, the pts round-trips, and - /// decode-completion is stamped. + /// callback fires with a CVPixelBuffer of the right dimensions, the pts and the receipt stamp + /// round-trip (the latter rides the frame refcon), and decode-completion is stamped. func testVideoDecoderAsyncCallbackDeliversPixels() throws { let (formatDesc, avccSample) = try encodeOneHEVCKeyframe() let annexB = try annexBAU(formatDesc: formatDesc, avccSample: avccSample) let format = try XCTUnwrap(AnnexB.formatDescription(fromIDR: annexB, codec: .hevc)) - let au = AccessUnit(data: annexB, ptsNs: 42_000_000, frameIndex: 0, flags: 0) + let au = AccessUnit( + data: annexB, ptsNs: 42_000_000, frameIndex: 0, flags: 0, receivedNs: 41_000_000) let box = FrameBox() let done = DispatchSemaphore(value: 0) @@ -100,6 +102,8 @@ final class VideoToolboxRoundTripTests: XCTestCase { XCTAssertEqual(CVPixelBufferGetWidth(ready.pixelBuffer), width) XCTAssertEqual(CVPixelBufferGetHeight(ready.pixelBuffer), height) XCTAssertEqual(ready.ptsNs, 42_000_000, "pts round-trips through the decoder") + XCTAssertEqual( + ready.receivedNs, 41_000_000, "receivedNs round-trips through the frame refcon") XCTAssertGreaterThan(ready.decodedNs, 0, "decode-completion is stamped") } diff --git a/clients/linux/src/session.rs b/clients/linux/src/session.rs index 261f427..4fe86d8 100644 --- a/clients/linux/src/session.rs +++ b/clients/linux/src/session.rs @@ -45,18 +45,40 @@ pub struct SessionParams { pub connect_timeout: Duration, } +/// The session pump's share of the unified stats window (design/stats-unification.md): +/// stream facts plus the two stages measured before the presenter. The frame consumer in +/// `ui_stream` contributes the `display` stage and the end-to-end percentiles. 
#[derive(Clone, Copy, Default)] pub struct Stats { + /// AUs received (reassembled) per second, actual-elapsed-time denominator. pub fps: f32, + /// Received payload bytes × 8 / elapsed (goodput, excludes FEC overhead). pub mbps: f32, + /// p50 `host+network` stage: capture → received, host-clock corrected (ms). + pub host_net_ms: f32, + /// p50 `decode` stage: received → decoded, single-clock client-local (ms). pub decode_ms: f32, - /// Median capture→decoded latency over the last window (host-clock corrected). - pub latency_ms: f32, + /// Unrecoverable network frame drops this window, and their share of + /// received+lost (%). The OSD renders the counter line only when nonzero. + pub lost: u32, + pub lost_pct: f32, /// The decode path frames actually took this window (`"vaapi"`/`"software"`, empty /// until the first frame) — the OSD's trailing tag; tracks a mid-session fallback. pub decoder: &'static str, } +/// Sort a window of µs samples in place and return `(p50, p95)` per the spec's index +/// rules (`sorted[len/2]`, `sorted[min(len*95/100, len-1)]`); an empty window reads 0. +pub fn window_percentiles(samples: &mut [u64]) -> (u64, u64) { + if samples.is_empty() { + return (0, 0); + } + samples.sort_unstable(); + let p50 = samples[samples.len() / 2]; + let p95 = samples[(samples.len() * 95 / 100).min(samples.len() - 1)]; + (p50, p95) +} + pub enum SessionEvent { Connected { connector: Arc, @@ -219,13 +241,17 @@ fn pump( let mut window_start = Instant::now(); let mut frames_n = 0u32; let mut bytes_n = 0u64; - let mut decode_us_sum = 0u64; - let mut lat_us: Vec = Vec::with_capacity(256); + // Stage windows (µs samples): `host+network` = capture→received (host-clock + // corrected), `decode` = received→decoded (client-local). p50 per 1 s window. 
+ let mut hostnet_us: Vec = Vec::with_capacity(256); + let mut decode_us: Vec = Vec::with_capacity(256); // What actually decoded the last frame — a VAAPI failure demotes mid-session, so // this is read off each frame's image variant rather than fixed at startup. let mut dec_path: &'static str = ""; // Loss recovery: watch the host→client unrecoverable-drop count and ask for an IDR when it climbs. let mut last_dropped = connector.frames_dropped(); + // The stats window keeps its own drop cursor — the OSD shows the per-window delta. + let mut window_dropped = last_dropped; let mut last_kf_req: Option = None; let end: Option = loop { @@ -237,7 +263,11 @@ fn pump( // every ~8–16 ms at 60–120 Hz anyway, so this rarely times out mid-stream). match connector.next_frame(Duration::from_millis(20)) { Ok(frame) => { - let t0 = Instant::now(); + // The `received` point: AU fully reassembled, in hand, before decode. + let received_ns = now_ns(); + // fps / goodput count every received AU (spec), decoded or not. + frames_n += 1; + bytes_n += frame.data.len() as u64; match decoder.decode(&frame.data) { Ok(Some(image)) => { total_frames += 1; @@ -252,18 +282,21 @@ fn pump( }; tracing::info!(width = w, height = h, path, "first frame decoded"); } - // Latency: our wall clock expressed in the host's capture clock, - // minus the host-stamped capture pts (same math as client-rs). - let lat = (now_ns() as i128 + clock_offset as i128 - frame.pts_ns as i128) + // The `decoded` point — travels with the frame so the presenter + // can measure its `display` stage against it. + let decoded_ns = now_ns(); + // `host+network` stage: received expressed in the host's capture + // clock, minus the host-stamped capture pts (clamped (0, 10 s)). 
+ let hn = (received_ns as i128 + clock_offset as i128 - frame.pts_ns as i128) .max(0) as u64; - if lat > 0 && lat < 10_000_000_000 { - lat_us.push(lat / 1000); + if hn > 0 && hn < 10_000_000_000 { + hostnet_us.push(hn / 1000); } - decode_us_sum += t0.elapsed().as_micros() as u64; - frames_n += 1; - bytes_n += frame.data.len() as u64; + // `decode` stage: received→decoded, single clock, no skew. + decode_us.push(decoded_ns.saturating_sub(received_ns) / 1000); let _ = frame_tx.force_send(DecodedFrame { pts_ns: frame.pts_ns, + decoded_ns, image, }); } @@ -295,30 +328,36 @@ fn pump( if window_start.elapsed() >= Duration::from_secs(1) { let secs = window_start.elapsed().as_secs_f32(); - lat_us.sort_unstable(); - let p50 = lat_us.get(lat_us.len() / 2).copied().unwrap_or(0); + let (hn_p50, _) = window_percentiles(&mut hostnet_us); + let (dec_p50, _) = window_percentiles(&mut decode_us); + let lost = dropped.saturating_sub(window_dropped) as u32; + window_dropped = dropped; tracing::debug!( fps = frames_n, - lat_p50_us = p50, + hostnet_p50_us = hn_p50, + decode_p50_us = dec_p50, + lost, total_frames, "stream window" ); let _ = ev_tx.try_send(SessionEvent::Stats(Stats { fps: frames_n as f32 / secs, mbps: bytes_n as f32 * 8.0 / 1e6 / secs, - decode_ms: if frames_n > 0 { - decode_us_sum as f32 / frames_n as f32 / 1000.0 + host_net_ms: hn_p50 as f32 / 1000.0, + decode_ms: dec_p50 as f32 / 1000.0, + lost, + lost_pct: if lost > 0 { + lost as f32 * 100.0 / (frames_n + lost) as f32 } else { 0.0 }, - latency_ms: p50 as f32 / 1000.0, decoder: dec_path, })); window_start = Instant::now(); frames_n = 0; bytes_n = 0; - decode_us_sum = 0; - lat_us.clear(); + hostnet_us.clear(); + decode_us.clear(); } }; diff --git a/clients/linux/src/ui_stream.rs b/clients/linux/src/ui_stream.rs index 550492a..5b76248 100644 --- a/clients/linux/src/ui_stream.rs +++ b/clients/linux/src/ui_stream.rs @@ -31,33 +31,63 @@ use std::time::{Duration, Instant}; pub struct StreamPage { pub page: 
adw::NavigationPage, stats_label: gtk::Label, - /// Median capture→paintable-set latency (ms) over the frame consumer's last 1 s - /// window — written there, folded into the OSD on each `Stats` event. - present_ms: Rc>, + /// The frame consumer's share of the stats window (end-to-end percentiles + the + /// `display` stage) — written there each 1 s window, folded into the OSD on each + /// `Stats` event. + presented: Rc, /// The stream is HDR (PQ) right now — set by the frame consumer from each frame's /// signaling (the host can flip SDR↔HDR mid-session, in-band). hdr: Rc>, + /// `clock_offset_ns == 0`: the skew handshake didn't run (or same host) — the + /// end-to-end line carries the `(same-host clock)` flag (spec clock rules). + same_host: bool, + /// `W×H@Hz` for the OSD's first line — fixed at connect, per-session. + mode_line: String, +} + +/// Presenter-side window results (design/stats-unification.md): end-to-end = +/// capture→displayed measured directly (p50 + p95), `display` stage = decoded→displayed +/// p50. All ms, refreshed once per 1 s window by the frame consumer. +#[derive(Default)] +struct PresentedStats { + e2e_p50_ms: Cell, + e2e_p95_ms: Cell, + display_ms: Cell, } impl StreamPage { + /// Render the canonical unified-stats OSD (design/stats-unification.md — Linux + /// endpoint is paintable-set, headline reads `capture→displayed`). pub fn update_stats(&self, s: Stats) { - let mut line = format!( - "{:.0} fps · {:.1} Mbit/s · dec {:.1} ms · lat {:.1} ms · present {:.1} ms", - s.fps, - s.mbps, - s.decode_ms, - s.latency_ms, - self.present_ms.get() - ); + let mut line1 = format!("{} · {:.0} fps · {:.1} Mb/s", self.mode_line, s.fps, s.mbps); // Which decoder actually ran this window (vaapi/software) — tracks a fallback. 
if !s.decoder.is_empty() { - line.push_str(" · "); - line.push_str(s.decoder); + line1.push_str(" · "); + line1.push_str(s.decoder); } if self.hdr.get() { - line.push_str(" · HDR"); + line1.push_str(" · HDR"); } - self.stats_label.set_text(&line); + let mut text = format!( + "{line1}\n\ + end-to-end {:.1} ms p50 · {:.1} p95 · capture→displayed{}\n\ + = host+network {:.1} + decode {:.1} + display {:.1}", + self.presented.e2e_p50_ms.get(), + self.presented.e2e_p95_ms.get(), + if self.same_host { + " (same-host clock)" + } else { + "" + }, + s.host_net_ms, + s.decode_ms, + self.presented.display_ms.get(), + ); + // Counters — only rendered when nonzero this window. + if s.lost > 0 { + text.push_str(&format!("\nlost {} ({:.1}%)", s.lost, s.lost_pct)); + } + self.stats_label.set_text(&text); } } @@ -206,6 +236,13 @@ pub fn new(args: StreamPageArgs) -> StreamPage { let w = build_widgets(&window, &title, chromeless, pad_connected); w.stats_label.set_visible(show_stats); + // OSD line-1 facts, fixed for the session (the mode is negotiated per-session). + let mode = connector.mode(); + let mode_line = format!("{}×{}@{}", mode.width, mode.height, mode.refresh_hz); + // Offset 0 = the host didn't answer the skew handshake / same host — flagged on the + // end-to-end line so an uncorrected cross-machine number is never shown silently. 
+ let same_host = clock_offset_ns == 0; + let capture = Rc::new(Capture { connector, window: window.clone(), @@ -218,13 +255,13 @@ pub fn new(args: StreamPageArgs) -> StreamPage { held_buttons: RefCell::new(HashSet::new()), }); - let present_ms = Rc::new(Cell::new(0.0f32)); + let presented = Rc::new(PresentedStats::default()); let hdr = Rc::new(Cell::new(false)); spawn_frame_consumer( &w.picture, frames, clock_offset_ns, - present_ms.clone(), + presented.clone(), hdr.clone(), ); attach_keyboard(&w.overlay, &window, &capture, &stop, &w.stats_label); @@ -248,8 +285,10 @@ pub fn new(args: StreamPageArgs) -> StreamPage { StreamPage { page: w.page, stats_label: w.stats_label, - present_ms, + presented, hdr, + same_host, + mode_line, } } @@ -456,12 +495,13 @@ fn attach_edge_reveal( /// then draws whatever paintable is current on its own frame clock. Ends itself when the /// channel closes or the picture is gone. /// -/// Also the capture→present-ish measurement point: at each paintable set the frame's -/// host capture pts is compared against the local wall clock expressed in the host clock -/// (`clock_offset_ns`, same math as the session's decode latency). This is -/// capture→paintable-SET — GTK's own present adds one compositor cycle after this. The -/// 1 s p50 lands on the stats OSD (via `present_ms`) and in a "present window" debug -/// line for headless validation. +/// Also the `displayed` measurement point (design/stats-unification.md): each paintable +/// set stamps the local wall clock, yielding end-to-end = capture→displayed (host-clock +/// corrected via `clock_offset_ns`, p50+p95, measured directly) and the client-local +/// `display` stage = decoded→displayed. This is capture→paintable-SET — GTK's own +/// present adds one compositor cycle after this. The 1 s window results land on the +/// stats OSD (via `PresentedStats`) and in a "present window" debug line for headless +/// validation. 
/// One-entry cache of `ColorDesc` → `GdkColorState` (signaling changes at most on an /// SDR↔HDR flip, never per frame). #[derive(Default)] @@ -516,7 +556,7 @@ fn spawn_frame_consumer( picture: >k::Picture, frames: async_channel::Receiver, clock_offset_ns: i64, - present_ms: Rc>, + presented_stats: Rc, hdr: Rc>, ) { let picture = picture.downgrade(); @@ -528,7 +568,10 @@ fn spawn_frame_consumer( let mut yuv_state = ColorStateCache::default(); let mut rgb_state = ColorStateCache::default(); glib::spawn_future_local(async move { - let mut win_lat_us: Vec = Vec::with_capacity(256); + // Window samples (µs): end-to-end capture→displayed (host-clock corrected) and + // the client-local display stage decoded→displayed. + let mut win_e2e_us: Vec = Vec::with_capacity(256); + let mut win_disp_us: Vec = Vec::with_capacity(256); let mut win_start = Instant::now(); while let Ok(f) = frames.recv().await { let Some(picture) = picture.upgrade() else { @@ -601,26 +644,34 @@ fn spawn_frame_consumer( } } } - // Capture→paintable-set latency, host-clock corrected (same math and sanity - // bound as the session's decode-latency window). + // The `displayed` stamp: end-to-end = capture→displayed host-clock corrected + // (same clamp as the session's stage windows); display = decoded→displayed, + // single clock, no skew. 
if presented { - let lat = (crate::session::now_ns() as i128 + clock_offset_ns as i128 - - f.pts_ns as i128) - .max(0) as u64; - if lat > 0 && lat < 10_000_000_000 { - win_lat_us.push(lat / 1000); + let displayed_ns = crate::session::now_ns(); + let e2e = (displayed_ns as i128 + clock_offset_ns as i128 - f.pts_ns as i128).max(0) + as u64; + if e2e > 0 && e2e < 10_000_000_000 { + win_e2e_us.push(e2e / 1000); } + win_disp_us.push(displayed_ns.saturating_sub(f.decoded_ns) / 1000); } if win_start.elapsed() >= Duration::from_secs(1) { - win_lat_us.sort_unstable(); - let p50 = win_lat_us.get(win_lat_us.len() / 2).copied().unwrap_or(0); + let frames = win_e2e_us.len(); + let (e2e_p50, e2e_p95) = crate::session::window_percentiles(&mut win_e2e_us); + let (disp_p50, _) = crate::session::window_percentiles(&mut win_disp_us); tracing::debug!( - frames = win_lat_us.len(), - present_p50_us = p50, + frames, + e2e_p50_us = e2e_p50, + e2e_p95_us = e2e_p95, + display_p50_us = disp_p50, "present window" ); - present_ms.set(p50 as f32 / 1000.0); - win_lat_us.clear(); + presented_stats.e2e_p50_ms.set(e2e_p50 as f32 / 1000.0); + presented_stats.e2e_p95_ms.set(e2e_p95 as f32 / 1000.0); + presented_stats.display_ms.set(disp_p50 as f32 / 1000.0); + win_e2e_us.clear(); + win_disp_us.clear(); win_start = Instant::now(); } } diff --git a/clients/linux/src/video.rs b/clients/linux/src/video.rs index 268051b..aadb91b 100644 --- a/clients/linux/src/video.rs +++ b/clients/linux/src/video.rs @@ -24,11 +24,15 @@ use std::os::fd::RawFd; use std::ptr; /// One decoded frame headed for the presenter, carrying the host capture timestamp so the -/// UI can measure capture→paintable-set latency at the moment it presents. +/// UI can measure capture→displayed latency at the moment it presents. pub struct DecodedFrame { /// Host-clock capture pts (ns) of the AU this image decoded from — compare against /// the local wall clock + `clock_offset_ns` at paintable-set time. 
pub pts_ns: u64, + /// Local wall clock (ns) when the decoder emitted this image — the `decoded` + /// measurement point (design/stats-unification.md); the presenter subtracts it from + /// its paintable-set stamp for the client-local `display` stage. + pub decoded_ns: u64, pub image: DecodedImage, } diff --git a/clients/probe/README.md b/clients/probe/README.md index 8cc4dde..baf0875 100644 --- a/clients/probe/README.md +++ b/clients/probe/README.md @@ -14,7 +14,7 @@ example of driving the protocol end to end: QUIC control plane, UDP data plane, - **Receives a real stream**, writes a playable elementary stream (`.h265`/`.h264`/`.av1` — the extension tracks the **negotiated codec**; the probe advertises all three and the host picks), and - reports per-frame **capture→…→reassembled latency** percentiles (the host stamps each frame with + reports per-frame **capture→received latency** percentiles (the host stamps each frame with its capture clock). - **Verification mode** against a synthetic host — byte-checks deterministic test frames. - **Exercises every plane** with scripted test traffic: diff --git a/clients/probe/src/main.rs b/clients/probe/src/main.rs index 75c3427..1bcb674 100644 --- a/clients/probe/src/main.rs +++ b/clients/probe/src/main.rs @@ -4,7 +4,7 @@ //! * **verification** (`frames > 0`, synthetic host): byte-checks deterministic test frames; //! * **stream** (`frames == 0`, virtual host): receives real encoded AUs, writes a playable //! elementary stream (the dump extension follows the negotiated codec — `.h265`/`.h264`/`.av1`; -//! the probe advertises all three), and reports per-frame **capture→…→reassembled latency** +//! the probe advertises all three), and reports per-frame **capture→received latency** //! percentiles (the host stamps each frame with its capture wall clock; same-host runs share //! that clock). //! 
@@ -481,7 +481,7 @@ async fn session(args: Args) -> Result<()> { .await?; // Wall-clock skew handshake on the still-private control stream (before --remode/--speed-test - // take it): align our clock to the host's so the per-frame capture→reassembled latency is valid + // take it): align our clock to the host's so the per-frame capture→received latency is valid // across machines. `None` ⇒ an old host that doesn't answer — fall back to a shared clock (0). let clock_offset_ns = match punktfunk_core::quic::clock_sync(&mut send, &mut recv).await { Some(skew) => { @@ -1051,7 +1051,7 @@ async fn session(args: Args) -> Result<()> { continue; } bytes += frame.data.len() as u64; - // capture→reassembled: our receive instant in the host clock (now + offset) + // capture→received: our receive instant in the host clock (now + offset) // minus the host's capture pts. offset is 0 same-host / old host. let lat = (now_ns() as i128 + clock_offset as i128 - frame.pts_ns as i128) .max(0) as u64; @@ -1100,7 +1100,7 @@ async fn session(args: Args) -> Result<()> { lat_p99_us = pct(0.99), lat_max_us = latencies_us.last().copied().unwrap_or(0), skew_corrected, - "punktfunk/1 stream complete (capture→reassembled latency; skew_corrected=true ⇒ \ + "punktfunk/1 stream complete (capture→received latency; skew_corrected=true ⇒ \ cross-machine valid, false ⇒ same-host clock)" ); if expected > 0 { diff --git a/clients/windows/src/app/stream.rs b/clients/windows/src/app/stream.rs index b7e18f7..a5ae9f6 100644 --- a/clients/windows/src/app/stream.rs +++ b/clients/windows/src/app/stream.rs @@ -2,7 +2,7 @@ //! the UI thread, then handed — presenter and all — to the dedicated render thread //! ([`crate::render`]), which presents decoded frames at stream cadence. The page itself only //! forwards panel size/DPI changes and draws the status-chip HUD overlay (mode · decode path · -//! HDR · fps/throughput/latency · capture hint). +//! 
HDR · fps/goodput · end-to-end latency + stage equation · capture hint). use super::style::{edges, uniform}; use super::Svc; @@ -22,8 +22,9 @@ use windows_reactor::*; pub(crate) struct HudSample { pub(crate) stats: Stats, pub(crate) captured: bool, - /// `(presents/s, skipped/s, capture→presented p50 ms)` — see [`crate::render::present_stats`]. - pub(crate) present: (u32, u32, f32), + /// The render thread's glass-side window (presents/s, skips, end-to-end p50/p95, display + /// stage p50) — see [`crate::render::present_stats`]. + pub(crate) present: crate::render::PresentStats, } /// Props for the stream page: the services plus the live HUD sample that drives the overlay @@ -171,13 +172,15 @@ fn fmt_uptime(secs: u32) -> String { } } -/// The streaming HUD overlay (top-right), mirroring the Apple client: a chip row (mode · codec · -/// decode path · HDR), a stream line (decode fps / bitrate / decode time), a glass line (display -/// presents + end-to-end latency decoded vs on-glass), a session line (host · time · loss), and -/// the shortcut hints. Layered over the `SwapChainPanel` in the same grid cell. +/// The streaming HUD overlay (top-right), unified stats vocabulary (design/stats-unification.md): +/// a chip row (mode · codec · decode path · HDR), a stream line (received fps · goodput · +/// presenter fps), the end-to-end headline (capture→on-glass p50/p95, host-clock corrected), the +/// stage equation (= host+network + decode + display, stage p50s), a session line +/// (host · time · loss/skips), and the shortcut hints. Layered over the `SwapChainPanel` in the +/// same grid cell. 
fn hud_overlay(hud: &HudSample, mode: Option, host: &str) -> Element { let stats = &hud.stats; - let (pfps, skipped, glass_ms) = hud.present; + let present = &hud.present; let res = mode .map(|m| format!("{}\u{00D7}{}@{}", m.width, m.height, m.refresh_hz)) .unwrap_or_else(|| "\u{2014}".into()); @@ -193,25 +196,38 @@ fn hud_overlay(hud: &HudSample, mode: Option, host: &str) -> Element { if stats.hdr { chips.push(hud_chip("HDR", Color::rgb(255, 205, 90)).into()); } + // Received fps + goodput, plus the presenter's own rate (Moonlight's "Rendering frame rate" + // analog — how often the display actually gets a new frame). let stream_line = format!( - "{:.0} fps \u{00B7} {:.1} Mb/s \u{00B7} decode {:.1} ms", - stats.fps, stats.mbps, stats.decode_ms + "{:.0} fps \u{00B7} {:.1} Mb/s \u{00B7} display {} fps", + stats.fps, stats.mbps, present.fps ); - // End-to-end latency (host-clock corrected): capture→decoded from the pump, capture→on-glass - // from the render thread's post-Present stamp. `skipped` = newest-wins drops (expected when - // the stream outpaces the display); `lost` = unrecoverable network drops. - let glass_line = format!( - "display {pfps} fps \u{00B7} latency {:.1} ms decoded / {glass_ms:.1} ms on-glass", - stats.latency_ms + // The headline: end-to-end capture→displayed, measured directly post-Present (never the sum + // of the stage percentiles). `(same-host clock)` flags an uncorrected clock (offset == 0: + // same host, or the host skipped the skew handshake). + let mut e2e_line = format!( + "end-to-end {:.1} ms p50 \u{00B7} {:.1} p95 \u{00B7} capture\u{2192}on-glass", + present.e2e_p50_ms, present.e2e_p95_ms + ); + if stats.same_host { + e2e_line.push_str(" (same-host clock)"); + } + // The equation: the three stages tile the headline interval per frame; the window p50s only + // approximately sum (percentiles aren't additive). 
+ let stage_line = format!( + "= host+network {:.1} + decode {:.1} + display {:.1}", + stats.hostnet_ms, stats.decode_ms, present.display_p50_ms ); let mut session_bits: Vec = Vec::new(); if !host.is_empty() { session_bits.push(host.to_string()); } + // `lost` = unrecoverable network drops (session-cumulative); `skipped` = the render thread's + // newest-wins drops last window (expected when the stream outpaces the display). session_bits.push(fmt_uptime(stats.uptime_secs)); session_bits.push(format!("{} lost", stats.dropped)); - if skipped > 0 { - session_bits.push(format!("{skipped} skipped")); + if present.skipped > 0 { + session_bits.push(format!("{} skipped", present.skipped)); } let session_line = session_bits.join(" \u{00B7} "); let hint = if hud.captured { @@ -228,7 +244,8 @@ fn hud_overlay(hud: &HudSample, mode: Option, host: &str) -> Element { vstack(( hstack(chips).spacing(6.0), dim(&stream_line), - dim(&glass_line), + dim(&e2e_line), + dim(&stage_line), dim(&session_line), text_block(hint) .font_size(11.0) diff --git a/clients/windows/src/main.rs b/clients/windows/src/main.rs index 64bf1a3..8882610 100644 --- a/clients/windows/src/main.rs +++ b/clients/windows/src/main.rs @@ -241,8 +241,8 @@ fn run_headless_cli(args: &[String], identity: (String, String)) { session::SessionEvent::Stats(s) => tracing::info!( fps = format!("{:.0}", s.fps), mbps = format!("{:.1}", s.mbps), - decode_ms = format!("{:.2}", s.decode_ms), - lat_ms = format!("{:.2}", s.latency_ms), + decode_p50_ms = format!("{:.2}", s.decode_ms), + hostnet_p50_ms = format!("{:.2}", s.hostnet_ms), frames_seen, "stats" ), diff --git a/clients/windows/src/render.rs b/clients/windows/src/render.rs index 1c8aba5..92bb341 100644 --- a/clients/windows/src/render.rs +++ b/clients/windows/src/render.rs @@ -10,27 +10,46 @@ //! draw (and redraws the held frame after a resize — fresh back buffers are blank). 
use crate::present::Presenter; -use crate::session::FrameRx; +use crate::session::{FrameRx, FrameTimes}; use crossbeam_channel::RecvTimeoutError; use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; use std::time::{Duration, Instant}; /// The last 1-second render window, published for the HUD (one render thread at a time): -/// presents/s, frames skipped by the newest-wins drain, and the capture→presented p50 in µs. +/// presents/s, frames skipped by the newest-wins drain, the end-to-end (capture→on-glass) +/// p50/p95 and the `display` stage (decoded→displayed) p50, all stamped post-`Present()`, in µs. /// Zeroed when a render thread starts so a new session never shows the previous one's numbers. static PRESENT_FPS: AtomicU32 = AtomicU32::new(0); static PRESENT_SKIPPED: AtomicU32 = AtomicU32::new(0); -static PRESENT_P50_US: AtomicU64 = AtomicU64::new(0); +static E2E_P50_US: AtomicU64 = AtomicU64::new(0); +static E2E_P95_US: AtomicU64 = AtomicU64::new(0); +static DISPLAY_P50_US: AtomicU64 = AtomicU64::new(0); -/// `(presents/s, skipped/s, capture→presented p50 ms)` of the last render window — the HUD's -/// display-side line. -pub fn present_stats() -> (u32, u32, f32) { - ( - PRESENT_FPS.load(Ordering::Relaxed), - PRESENT_SKIPPED.load(Ordering::Relaxed), - PRESENT_P50_US.load(Ordering::Relaxed) as f32 / 1000.0, - ) +/// The last render window's glass-side numbers (see the statics above) — the HUD's headline +/// (end-to-end) and trailing stage (display) come from here. +#[derive(Clone, Copy, Default, PartialEq)] +pub struct PresentStats { + /// Presents per second (includes resize redraws of a held frame). + pub fps: u32, + /// Frames dropped by the newest-wins drain this window (client-side pacing skips). + pub skipped: u32, + /// End-to-end capture→displayed p50, ms (host-clock corrected, measured directly). + pub e2e_p50_ms: f32, + /// End-to-end capture→displayed p95, ms. 
+ pub e2e_p95_ms: f32, + /// `display` stage p50, ms: decoded → displayed, single-clock client-local. + pub display_p50_ms: f32, +} + +pub fn present_stats() -> PresentStats { + PresentStats { + fps: PRESENT_FPS.load(Ordering::Relaxed), + skipped: PRESENT_SKIPPED.load(Ordering::Relaxed), + e2e_p50_ms: E2E_P50_US.load(Ordering::Relaxed) as f32 / 1000.0, + e2e_p95_ms: E2E_P95_US.load(Ordering::Relaxed) as f32 / 1000.0, + display_p50_ms: DISPLAY_P50_US.load(Ordering::Relaxed) as f32 / 1000.0, + } } /// UI-thread → render-thread state. Size is packed into ONE atomic (w<<32|h) so a resize never @@ -101,8 +120,9 @@ impl Drop for RenderThread { struct SendPresenter(Presenter); unsafe impl Send for SendPresenter {} -/// Spawn the render thread. `frames` carries `(frame, capture pts_ns)`; `clock_offset_ns` maps our -/// wall clock onto the host's so the logged present latency is end-to-end (same math as the pump). +/// Spawn the render thread. `frames` carries `(frame, FrameTimes)`; `clock_offset_ns` maps our +/// wall clock onto the host's so the end-to-end (capture→on-glass) number is cross-machine valid +/// (same math as the pump's host+network stage). pub fn spawn( presenter: Presenter, frames: FrameRx, @@ -147,12 +167,17 @@ fn run(presenter: SendPresenter, frames: FrameRx, shared: Arc, clo let mut applied = (0u32, 0u32, 0u32); // last (w, h, dpi) handed to the presenter let mut presented = 0u32; let mut dropped = 0u32; - let mut lat_us: Vec = Vec::with_capacity(256); + // 1 s tumbling windows: end-to-end (capture→displayed) and the display stage + // (decoded→displayed), sampled post-Present. Percentiles only (spec: stats-unification.md). 
+ let mut e2e_us: Vec = Vec::with_capacity(256); + let mut display_us: Vec = Vec::with_capacity(256); let mut window_start = Instant::now(); let mut last_dpi_poll = Instant::now(); PRESENT_FPS.store(0, Ordering::Relaxed); PRESENT_SKIPPED.store(0, Ordering::Relaxed); - PRESENT_P50_US.store(0, Ordering::Relaxed); + E2E_P50_US.store(0, Ordering::Relaxed); + E2E_P95_US.store(0, Ordering::Relaxed); + DISPLAY_P50_US.store(0, Ordering::Relaxed); loop { if shared.stop.load(Ordering::SeqCst) { @@ -198,29 +223,55 @@ fn run(presenter: SendPresenter, frames: FrameRx, shared: Arc, clo p.set_hdr_metadata(meta); } - let pts_ns = newest.as_ref().map(|(_, pts)| *pts); + let times: Option = newest.as_ref().map(|(_, t)| *t); p.present(newest.map(|(f, _)| f)); presented += 1; - if let Some(pts) = pts_ns { - // Capture→presented, host-clock corrected — the glass-side companion to the pump's - // capture→decoded p50. - let lat = (now_ns() as i128 + clock_offset_ns as i128 - pts as i128).max(0) as u64; - if lat > 0 && lat < 10_000_000_000 { - lat_us.push(lat / 1000); + if let Some(t) = times { + // The `displayed` point: post-Present() on this thread (the honest best-effort + // presentation instant on Windows — endpoint label `capture→on-glass`). + let displayed_ns = now_ns(); + // End-to-end = capture → displayed, host-clock corrected, measured directly + // (never the sum of stage percentiles). Clamped (0, 10 s). + let e2e = + (displayed_ns as i128 + clock_offset_ns as i128 - t.pts_ns as i128).max(0) as u64; + if e2e > 0 && e2e < 10_000_000_000 { + e2e_us.push(e2e / 1000); + } + // `display` stage = decoded → displayed, single-clock client-local. 
+ let disp = displayed_ns.saturating_sub(t.decoded_ns); + if disp < 10_000_000_000 { + display_us.push(disp / 1000); } } if window_start.elapsed() >= Duration::from_secs(1) { - lat_us.sort_unstable(); - let p50 = lat_us.get(lat_us.len() / 2).copied().unwrap_or(0); - tracing::debug!(presented, dropped, present_p50_us = p50, "render window"); + e2e_us.sort_unstable(); + display_us.sort_unstable(); + let p50 = |v: &[u64]| v.get(v.len() / 2).copied().unwrap_or(0); + // p95 = sorted[min(len*95/100, len-1)] — the empty-window case falls to 0 via `get`. + let p95 = |v: &[u64]| { + v.get((v.len() * 95 / 100).min(v.len().saturating_sub(1))) + .copied() + .unwrap_or(0) + }; + tracing::debug!( + presented, + dropped, + e2e_p50_us = p50(&e2e_us), + e2e_p95_us = p95(&e2e_us), + display_p50_us = p50(&display_us), + "render window" + ); PRESENT_FPS.store(presented, Ordering::Relaxed); PRESENT_SKIPPED.store(dropped, Ordering::Relaxed); - PRESENT_P50_US.store(p50, Ordering::Relaxed); + E2E_P50_US.store(p50(&e2e_us), Ordering::Relaxed); + E2E_P95_US.store(p95(&e2e_us), Ordering::Relaxed); + DISPLAY_P50_US.store(p50(&display_us), Ordering::Relaxed); window_start = Instant::now(); presented = 0; dropped = 0; - lat_us.clear(); + e2e_us.clear(); + display_us.clear(); } } tracing::info!("render thread exiting"); diff --git a/clients/windows/src/session.rs b/clients/windows/src/session.rs index 9502c8d..f2b89c6 100644 --- a/clients/windows/src/session.rs +++ b/clients/windows/src/session.rs @@ -46,11 +46,18 @@ pub struct SessionParams { #[derive(Clone, Copy, Default, PartialEq)] pub struct Stats { + /// AUs received (reassembled) per second — actual-elapsed-time denominator. pub fps: f32, + /// Received payload goodput (excludes FEC overhead). pub mbps: f32, + /// `decode` stage p50 over the last 1 s window: received → decoded, client-local clock. pub decode_ms: f32, - /// Median capture→decoded latency over the last window (host-clock corrected). 
- pub latency_ms: f32, + /// `host+network` stage p50 over the last 1 s window: capture (`pts_ns`) → received, + /// host-clock corrected via `clock_offset_ns`. + pub hostnet_ms: f32, + /// True when `clock_offset_ns == 0` (host didn't answer the skew handshake / same host) — + /// the HUD appends `(same-host clock)` to the end-to-end line. + pub same_host: bool, /// True when decoding on the GPU (D3D11VA) vs. CPU (software). pub hardware: bool, /// True when the stream is BT.2020 PQ HDR10 (last decoded frame). @@ -81,9 +88,19 @@ pub enum SessionEvent { Stats(Stats), } -/// Decoded frames + their host-capture `pts_ns`, session pump → render thread (crossbeam so that +/// Per-frame measurement points carried with a decoded frame to the render thread: the host +/// capture clock (`pts_ns`) and our local `decoded` stamp (wall-clock ns). Post-`Present()` the +/// render thread derives the `display` stage (displayed − decoded, single-clock) and the +/// end-to-end headline (displayed + clock_offset − pts) from them. +#[derive(Clone, Copy)] +pub struct FrameTimes { + pub pts_ns: u64, + pub decoded_ns: u64, +} + +/// Decoded frames + their measurement points, session pump → render thread (crossbeam so that /// thread can block with a timeout — async-channel has no `recv_timeout`). 
-pub type FrameRx = crossbeam_channel::Receiver<(DecodedFrame, u64)>; +pub type FrameRx = crossbeam_channel::Receiver<(DecodedFrame, FrameTimes)>; pub struct SessionHandle { pub events: async_channel::Receiver, @@ -205,7 +222,7 @@ impl AudioDec { fn pump( params: SessionParams, ev_tx: async_channel::Sender, - frame_tx: crossbeam_channel::Sender<(DecodedFrame, u64)>, + frame_tx: crossbeam_channel::Sender<(DecodedFrame, FrameTimes)>, frame_rx: FrameRx, stop: Arc, ) { @@ -310,8 +327,9 @@ fn pump( let mut window_start = Instant::now(); let mut frames_n = 0u32; let mut bytes_n = 0u64; - let mut decode_us_sum = 0u64; - let mut lat_us: Vec = Vec::with_capacity(256); + // 1 s tumbling stage windows (spec: design/stats-unification.md — percentiles, never means). + let mut hostnet_us: Vec = Vec::with_capacity(256); + let mut decode_us: Vec = Vec::with_capacity(256); let mut pcm = vec![0f32; 5760 * channels as usize]; // scratch: max Opus frame (120 ms) × channels // Loss recovery: watch the host→client unrecoverable-drop count and ask for an IDR when it climbs. let mut last_dropped = connector.frames_dropped(); @@ -323,7 +341,18 @@ fn pump( } match connector.next_frame(Duration::from_millis(4)) { Ok(frame) => { - let t0 = Instant::now(); + // The `received` point: AU fully reassembled, handed to us, before decode. + let received_ns = now_ns(); + // fps = AUs received per second, Mb/s = received goodput (spec: counted at the + // received point, not the decoded one). + frames_n += 1; + bytes_n += frame.data.len() as u64; + // `host+network` stage: capture → received, host-clock corrected. Clamped (0, 10 s). 
+ let hostnet = (received_ns as i128 + clock_offset as i128 - frame.pts_ns as i128) + .max(0) as u64; + if hostnet > 0 && hostnet < 10_000_000_000 { + hostnet_us.push(hostnet / 1000); + } // A D3D11VA→software demotion (see `Decoder::decode`) starts a FRESH decoder that // has none of the stream's parameter sets; under infinite GOP it would sit on // "PPS id out of range" forever. Detect the transition and force a new IDR so the @@ -336,6 +365,8 @@ fn pump( } match decoded { Ok(Some(decoded)) => { + // The `decoded` point: decoder output frame available. + let decoded_ns = now_ns(); total_frames += 1; hdr = decoded.hdr(); // The backend can demote D3D11VA → software mid-session on a hardware error. @@ -350,19 +381,17 @@ fn pump( "first frame decoded" ); } - // Latency: our wall clock expressed in the host's capture clock, - // minus the host-stamped capture pts (same math as client-rs). - let lat = (now_ns() as i128 + clock_offset as i128 - frame.pts_ns as i128) - .max(0) as u64; - if lat > 0 && lat < 10_000_000_000 { - lat_us.push(lat / 1000); - } - decode_us_sum += t0.elapsed().as_micros() as u64; - frames_n += 1; - bytes_n += frame.data.len() as u64; + // `decode` stage: received → decoded, single-clock client-local. + decode_us.push(decoded_ns.saturating_sub(received_ns) / 1000); // Newest wins: displace the oldest queued frame when the renderer lags. 
if let Err(crossbeam_channel::TrySendError::Full(item)) = - frame_tx.try_send((decoded, frame.pts_ns)) + frame_tx.try_send(( + decoded, + FrameTimes { + pts_ns: frame.pts_ns, + decoded_ns, + }, + )) { let _ = frame_rx.try_recv(); let _ = frame_tx.try_send(item); @@ -413,23 +442,23 @@ fn pump( if window_start.elapsed() >= Duration::from_secs(1) { let secs = window_start.elapsed().as_secs_f32(); - lat_us.sort_unstable(); - let p50 = lat_us.get(lat_us.len() / 2).copied().unwrap_or(0); + hostnet_us.sort_unstable(); + decode_us.sort_unstable(); + let p50 = |v: &[u64]| v.get(v.len() / 2).copied().unwrap_or(0); + let (hostnet_p50, decode_p50) = (p50(&hostnet_us), p50(&decode_us)); tracing::debug!( fps = frames_n, - lat_p50_us = p50, + hostnet_p50_us = hostnet_p50, + decode_p50_us = decode_p50, total_frames, "stream window" ); let _ = ev_tx.try_send(SessionEvent::Stats(Stats { fps: frames_n as f32 / secs, mbps: bytes_n as f32 * 8.0 / 1e6 / secs, - decode_ms: if frames_n > 0 { - decode_us_sum as f32 / frames_n as f32 / 1000.0 - } else { - 0.0 - }, - latency_ms: p50 as f32 / 1000.0, + decode_ms: decode_p50 as f32 / 1000.0, + hostnet_ms: hostnet_p50 as f32 / 1000.0, + same_host: clock_offset == 0, hardware, hdr, codec: connector.codec, @@ -439,8 +468,8 @@ fn pump( window_start = Instant::now(); frames_n = 0; bytes_n = 0; - decode_us_sum = 0; - lat_us.clear(); + hostnet_us.clear(); + decode_us.clear(); } }; diff --git a/crates/punktfunk-core/src/quic.rs b/crates/punktfunk-core/src/quic.rs index a345c7b..2c5f07e 100644 --- a/crates/punktfunk-core/src/quic.rs +++ b/crates/punktfunk-core/src/quic.rs @@ -390,7 +390,7 @@ pub struct ProbeResult { /// `client → host`, right after [`Start`]: one round of the wall-clock skew handshake. The client /// stamps `t1_ns` (its monotonic-since-epoch clock) and sends; the host echoes it in [`ClockEcho`] /// with its own receive/send stamps. 
A few rounds let the client estimate the host↔client clock -/// offset, so the per-frame `capture→reassembled` latency (the AU `pts_ns` is the host's capture +/// offset, so the per-frame `capture→received` latency (the AU `pts_ns` is the host's capture /// clock) is meaningful across machines, not just same-host. An old host ignores it (the client /// times out and assumes a shared clock). #[derive(Clone, Copy, Debug, PartialEq, Eq)] diff --git a/design/stats-unification.md b/design/stats-unification.md new file mode 100644 index 0000000..38ac20a --- /dev/null +++ b/design/stats-unification.md @@ -0,0 +1,142 @@ +# Unified streaming stats — one vocabulary across every client + +Status: spec agreed 2026-07-03; implementation tracked per client below. +User-facing companion: `docs-site/content/docs/stats.md` (overlay guide + Moonlight matrix). + +## Why + +Prospective users compare our numbers against Moonlight/Sunshine's overlay. Before this +spec the clients disagreed with each other (same concept, different name, different +measurement point, mean vs median, missing skew flags), and none of them made clear +which numbers are *sequential stages that add up* versus *overlapping absolutes*. The +Apple HUD showed three arrow-notation lines (`capture→client`, `capture→present`, +`decode→present`) that looked like a pipeline but overlapped, left the decode interval +invisible, and mixed two clock bases. Meanwhile Moonlight shows *only* disjoint +client-side segments and has **no end-to-end number at all** — so our headline +end-to-end figure, presented without context, reads "worse" against a Moonlight line +that measures a fraction of the chain. + +## The model + +Four measurement points per video frame. Every stat on every HUD is a difference of +two of these points — nothing else. + +| point | meaning | who stamps it | +|---|---|---| +| **capture** | host capture clock: the per-AU `pts_ns`. 
Native path anchors at PipeWire frame delivery (host queue age is *inside* it); GameStream uses the RTP 90 kHz clock. | host | +| **received** | AU fully reassembled (post-FEC), handed to the client, before decode | client | +| **decoded** | decoder output frame available | client | +| **displayed** | best-effort presentation instant (see per-client endpoint table) | client | + +Three **stages** tile the interval exactly (per frame, same timestamps, no gaps, no +overlap): + +- `host+network` = capture → received. Contains the whole host pipeline + (queue/capture/convert/encode/pace) **plus** wire time and reassembly; it cannot be + split client-side today (see Phase 2). +- `decode` = received → decoded. Pure client-local; no clock skew involved. +- `display` = decoded → displayed. Pacing/queue wait + render + vsync. Client-local. + +Headline: **`end-to-end`** = capture → displayed, **measured directly** (not the sum of +stage percentiles). Per frame the stages sum to it exactly; per window the p50s only +approximately sum (percentiles aren't additive) — the doc says so, the HUD shows the +equation with the directly-measured total on the left. + +### Clock rules + +- `end-to-end` and `host+network` use client `CLOCK_REALTIME` + `clock_offset_ns` + (the ClockProbe/ClockEcho skew handshake). Cross-machine valid when the offset was + measured. +- `decode` and `display` are single-clock client-local (offset irrelevant). +- When `clock_offset_ns == 0` (host didn't answer the handshake / same host), the HUD + appends **`(same-host clock)`** once, on the end-to-end line — every client, no + exceptions. Windows and Linux previously showed possibly-uncorrected numbers + silently; that was a bug in presentation. + +### Aggregation rules + +- **Window: 1 s tumbling** (drain and reset), HUD refresh 1×/s. (Moonlight uses a + 1–2 s sliding window — close enough for comparison; documented in the matrix.) 
+- **Percentiles only, never means**: end-to-end shows `p50` and `p95`; stages show + `p50`. Windows/Linux previously showed *mean* decode time next to median latencies — + banned. +- Latency samples clamped to `(0, 10 s)` as before. +- `fps` = AUs received (reassembled) per second, actual-elapsed-time denominator. +- `Mb/s` = received payload bytes × 8 / elapsed (goodput, excludes FEC overhead — + same basis as Moonlight's bitrate tracker). + +## Canonical HUD layout + +Line order and label strings are normative; platform chips vary. + +``` +1920×1080@120 · 119 fps · 38.2 Mb/s · HEVC 10-bit HDR · GPU decode +end-to-end 14.2 ms p50 · 19.8 p95 · capture→on-glass += host+network 9.8 + decode 2.1 + display 2.3 +lost 3 (0.1%) · skipped 1 · FEC 12 +``` + +- Line 1 — stream facts. `{W}×{H}@{Hz} · {fps} fps · {mbps} Mb/s` plus whatever + chips the platform already knows (codec, bit depth, HDR, GPU/CPU decode). +- Line 2 — the headline. `end-to-end {p50} ms p50 · {p95} p95 · capture→{endpoint}` + (+ ` (same-host clock)` when applicable). The endpoint suffix is honest per + platform — see the endpoint table. Never a bare "latency". +- Line 3 — the equation. `= host+network {a} + decode {b} + display {c}` (stage p50s, + ms, one decimal). A platform that can't measure a trailing stage drops the term and + the headline endpoint moves accordingly — the equation always tiles the headline + interval. +- Line 4 — counters, only rendered when any value is nonzero. + `lost` = unrecoverable network frame drops in the window (`{n} ({pct}%)`, + pct = lost/(received+lost)); `skipped` = client-side newest-wins/pacing drops; + `FEC` = shards recovered this window (proof FEC is earning its keep). 
+ +### Per-client endpoints (v1) + +| client | displayed point | headline reads | equation terms | +|---|---|---|---| +| Apple stage-2 | display-link target present instant | `capture→on-glass` | host+network + decode + display | +| Apple stage-1 (fallback presenter) | n/a (opaque `AVSampleBufferDisplayLayer`) | `capture→received` | host+network only | +| Windows | post-`Present()` on the render thread | `capture→on-glass` | all three | +| Linux GTK | paintable-set (GTK adds ~1 compositor cycle after) | `capture→displayed` | all three | +| Android | MediaCodec output released to surface | `capture→decoded` (v1) | host+network + decode | +| probe (headless log) | n/a | `capture→received` (was "reassembled") | logs p50/p95/p99/max µs, whole-session — measurement tool, exempt from the 1 s window rule but uses the canonical point names | + +Host web console (`stats_recorder`) keeps its own additive stage vocabulary +(`queue → capture → submit → encode → send`, p50/p99, 1–2 s windows) — operator +deep-dive tool, already additive/stacked, out of scope here except that its stage names +must stay consistent with the docs. The sum of the host stages is our analogue of +Sunshine's "host processing latency" (capture→send). + +## Moonlight comparison (summary — full matrix in docs-site/content/docs/stats.md) + +- Moonlight's latency lines are **disjoint client segments** (decode, queue delay, + render) plus Sunshine's host capture→send; nothing measures the wire, and there is + **no end-to-end line**. Our `end-to-end` must never be compared against any single + Moonlight line. Fair approximation: + `punktfunk end-to-end ≈ ML host processing latency + ~½ RTT + ML decoding time + + ML frame queue delay + ML rendering time`. +- Moonlight "Average network latency" = **ENet control-channel RTT** — not frame + latency; we intentionally have no equivalent line. 
+- Moonlight "Video stream … FPS" *includes inferred-lost frames* (host-rate estimate + from frame-number gaps); our `fps` counts received only — equal at ~0 loss. +- Moonlight decode/queue/render times are **means**; ours are p50s. + +## Phase 2 (specced, not in v1): split `host+network` + +Carry the host's capture→send duration per AU (host stamps it at send, e.g. a +varint-µs field in the AU header or a 0.1 ms u16 à la Sunshine's frame header). Client +then displays `host {x} + network {y}` instead of `host+network`, where +`network = (received − capture) − host_reported` — and the Moonlight matrix gains a +direct "Host processing latency" counterpart. Requires a core wire/ABI bump +(`punktfunk_frame` gains `host_latency_us`), trailing-byte back-compat like the +compositor/gamepad preference bytes. Also consider surfacing the QUIC path RTT +(quinn exposes it) as a diagnostics line, clearly labelled control-plane RTT. + +## Implementation status + +- [ ] Apple (`StreamHUDView`/`SessionModel`/`Stage2Pipeline` + `LatencyMeter` reuse) +- [ ] Windows (`app/stream.rs` HUD rows, `session.rs`/`render.rs` meters → p50/p95) +- [ ] Linux (`ui_stream.rs` OSD, `session.rs` window meters) +- [ ] Android (`stats.rs`/`decode.rs` stage split, `StatsOverlay.kt`) +- [ ] probe (rename `capture→reassembled` → `capture→received` in the log line) +- [ ] docs-site stats page + matrix; link from `moonlight.md` diff --git a/docs-site/content/docs/meta.json b/docs-site/content/docs/meta.json index ed24e57..221852f 100644 --- a/docs-site/content/docs/meta.json +++ b/docs-site/content/docs/meta.json @@ -26,6 +26,7 @@ "host-cli", "---Troubleshooting---", "troubleshooting", + "stats", "forgot-password", "---Project---", "roadmap", diff --git a/docs-site/content/docs/moonlight.md b/docs-site/content/docs/moonlight.md index 5b68523..1627d8c 100644 --- a/docs-site/content/docs/moonlight.md +++ b/docs-site/content/docs/moonlight.md @@ -52,3 +52,6 @@ it. 
Mouse, keyboard, and controllers flow back to the host. clients](/docs/clients) have a built-in speed test; with Moonlight, set the bitrate manually. - Moonlight uses the GameStream protocol, not punktfunk's native FEC/encryption extensions. On a solid LAN this is fine; on a lossy link a [native client](/docs/clients) holds up better. +- Comparing Moonlight's performance overlay with a punktfunk client's stats HUD? The numbers + measure different slices of the pipeline — see [Understanding the Stats Overlay](/docs/stats) + for a line-by-line comparison matrix before drawing conclusions. diff --git a/docs-site/content/docs/stats.md b/docs-site/content/docs/stats.md new file mode 100644 index 0000000..a2e7bfa --- /dev/null +++ b/docs-site/content/docs/stats.md @@ -0,0 +1,131 @@ +--- +title: Understanding the Stats Overlay +description: What every number in the punktfunk stats HUD means, and how to compare them fairly with Moonlight/Sunshine. +--- + +Every punktfunk client has an in-stream stats overlay. All clients use **the same +vocabulary, the same measurement points, and the same math**, so a number on your +phone means exactly what the same number means on your desktop. + +## The four measurement points + +Every latency figure is the time between two of these four points in a video frame's +life: + +1. **capture** — the host grabs the frame from the (virtual) display. Stamped on the + host's clock and carried with the frame. +2. **received** — your client has fully received and reassembled the frame from the + network (after any FEC recovery), before decoding. +3. **decoded** — the video decoder has produced the picture. +4. **displayed** — the picture is handed to the screen (as close to "photons" as the + platform lets us measure). 
+ +## Reading the overlay + +``` +1920×1080@120 · 119 fps · 38.2 Mb/s · HEVC 10-bit HDR · GPU decode +end-to-end 14.2 ms p50 · 19.8 p95 · capture→on-glass += host+network 9.8 + decode 2.1 + display 2.3 +lost 3 (0.1%) · skipped 1 · FEC 12 +``` + +- **Line 1 — the stream.** Resolution@refresh, frames received per second, and the + received video bitrate (goodput — FEC overhead not counted), plus codec details. +- **Line 2 — the headline.** `end-to-end` is the *directly measured* time from host + capture to the endpoint named at the end of the line (`capture→on-glass` here). + `p50` = the typical frame (median), `p95` = the slow tail (95% of frames are faster). + This is the one number that summarizes your stream. +- **Line 3 — where the time goes.** The three stages **tile the end-to-end interval** + — each starts where the previous one ends, so they add up to the headline: + - `host+network` — capture → received: the host's capture/encode/send pipeline + *plus* the network flight and reassembly, in one number. + - `decode` — received → decoded, on your device. + - `display` — decoded → displayed: waiting for the right screen refresh, rendering, + and vsync. + + (Stage values are per-stage medians, so they sum only *approximately* to the + headline median — percentiles aren't perfectly additive. The headline is measured + directly, never computed as a sum.) +- **Line 4 — reliability** (only shown when something is nonzero). `lost` = frames the + network dropped beyond FEC's ability to recover; `skipped` = frames your client + chose not to display because a newer one had already arrived; `FEC` = packet shards + the error correction recovered this second (loss that you *didn't* feel). + +All values refresh once per second over the last second of frames.
+ +### Clocks, and the `(same-host clock)` tag + +`end-to-end` and `host+network` span two machines, so they need the two clocks to +agree: at connect, the client runs an NTP-style handshake with the host and corrects +for the measured clock offset. If that handshake wasn't possible, the overlay appends +**`(same-host clock)`** — the numbers are then only trustworthy when client and host +run on the same machine. `decode` and `display` are single-machine measurements, so +the clock offset never affects them. + +### What each platform can measure + +Not every platform exposes a true "displayed" instant, so the headline's endpoint is +always spelled out rather than pretending: + +| client | headline | why | +|---|---|---| +| Windows, macOS/iOS (Metal presenter), Linux | `capture→on-glass` / `capture→displayed` | present instant available (GTK measures at hand-off to the compositor, which adds about one compositor cycle after it) | +| Android | `capture→decoded` | the display hand-off happens inside MediaCodec/SurfaceView where precise present timing isn't exposed | +| macOS/iOS fallback presenter | `capture→received` | the system video layer hides decode and present timing entirely | + +A shorter chain means the number is **smaller because it measures less** — check the +endpoint before comparing two devices. + +## Comparing with Moonlight / Sunshine + +Moonlight's overlay and punktfunk's measure different slices of the pipeline, and the +single biggest difference is: + +> **Moonlight has no end-to-end number.** Its overlay shows separate client-side +> segments (decode time, queue delay, render time) and — on Sunshine hosts — a +> host-side number. Nothing in Moonlight measures capture-to-glass, and nothing +> measures the network flight of video frames. punktfunk's `end-to-end` line has **no +> Moonlight counterpart** — never compare it against any single Moonlight line.
+ +To compare fairly, reconstruct an approximate end-to-end from Moonlight's lines: + +``` +Moonlight ≈ host processing latency (avg) + + ½ × average network latency + + average decoding time + + average frame queue delay + + average rendering time +``` + +…and compare *that* against punktfunk's `end-to-end`. (It's still approximate: +Moonlight's segments are averages over a slightly different window, and the ½·RTT term +stands in for a one-way frame flight that Moonlight doesn't measure.) + +### Line-by-line matrix + +| Moonlight overlay line | What it actually measures | punktfunk equivalent | Comparable? | +|---|---|---|---| +| `Video stream: WxH FPS` | Received **plus inferred-lost** frames/s (host-rate estimate from frame sequence gaps) | `fps` (line 1) | ≈ equal when loss is near zero; punktfunk counts received frames only | +| `Incoming frame rate from network` | Frames reassembled from the network per second | `fps` (line 1) | **Yes — direct** | +| `Decoding frame rate` (desktop only) | Frames leaving the decoder per second | not shown separately (equals `fps` unless the decoder is falling behind) | — | +| `Rendering frame rate` (desktop only) | Frames actually presented per second | `fps` minus `skipped` | Approximately | +| `Host processing latency min/max/avg` (Sunshine hosts) | Host capture → just-before-send, reported by Sunshine per frame | contained inside `host+network`; the host-side breakdown lives in the punktfunk web console (capture/encode/send stages) | Indirect — punktfunk's `host+network` additionally includes the network flight | +| `Frames dropped by your network connection` | Frame-sequence gaps ÷ total frames | `lost` (line 4) | **Yes — direct** | +| `Frames dropped due to network jitter` | Decoded frames the *client's pacer* chose to drop ÷ decoded frames | `skipped` (line 4) | Approximately (both are client-side pacing decisions, despite Moonlight's name) | +| `Average network latency` | The **control connection's round-trip time** 
(ENet RTT + variance) — not video frame latency | none, on purpose | **No.** An RTT is not a frame latency; punktfunk measures the actual per-frame path instead | +| `Average decoding time` | Mean time from decoder enqueue to picture out | `decode` (p50) | Yes (mean vs median; both include decoder queueing) | +| `Average frame queue delay` | Mean time a decoded frame waits for its vsync slot | inside `display` | Sum the two Moonlight lines → | +| `Average rendering time (incl. V-sync latency)` | Mean duration of the present call | inside `display` | …and compare against punktfunk's `display` | +| *(no equivalent)* | — | `end-to-end` — true capture→glass, clock-skew-corrected across machines | **punktfunk only** | +| *(no equivalent)* | — | `FEC` recovered shards (loss absorbed invisibly) | punktfunk only | + +Other differences worth knowing when squinting at both overlays side by side: + +- **Averages vs percentiles.** Moonlight's time values are means; punktfunk shows + medians (p50) with a p95 for the headline. Under jitter, a mean sits above the + median — Moonlight's numbers read slightly "worse" than an equivalent p50. +- **Windows.** Both refresh about once per second; Moonlight over a ~1–2 s sliding + window, punktfunk over the last full second. +- **Host frame rate.** Moonlight's headline FPS estimates what the *host* produced + (received + lost). punktfunk shows what your client actually received, and reports + loss separately. diff --git a/docs-site/content/docs/status.md b/docs-site/content/docs/status.md index 36b67ac..d166098 100644 --- a/docs-site/content/docs/status.md +++ b/docs-site/content/docs/status.md @@ -89,7 +89,7 @@ Notable capabilities that have landed, newest first: limit), each with its own virtual output and encoder — e.g. stream the same desktop to a laptop and a TV simultaneously. 
- **Cross-machine latency HUD + wall-clock skew handshake.** A short NTP-style handshake - aligns client and host clocks, making capture-to-reassembled latency valid across + aligns client and host clocks, making capture-to-received latency valid across machines; the Apple client surfaces a skew-corrected capture-to-receipt p50/p95 in its HUD. - **Native LAN auto-discovery.** Hosts advertise `_punktfunk._udp` over mDNS (with TXT diff --git a/scripts/bench/gpu-stream.sh b/scripts/bench/gpu-stream.sh index ec5c3c9..db867d2 100755 --- a/scripts/bench/gpu-stream.sh +++ b/scripts/bench/gpu-stream.sh @@ -8,7 +8,7 @@ # scripts/bench/gpu-stream.sh 1920x1080x120 12 --update # (re)write scripts/bench/gpu-baseline.json # # Metrics (host PUNKTFUNK_PERF + client report): encode_us_p50/p99, tx_mbps, send_dropped, and the -# client's capture→reassembled lat_p50/p95/p99_us. Lower is better for latency/encode/drops, higher +# client's capture→received lat_p50/p95/p99_us. Lower is better for latency/encode/drops, higher # for throughput. Regressions are flagged ⚠ but the script exits 0 (gate decisions stay human). set -uo pipefail