feat(clients): unified stats vocabulary across every client + Moonlight comparison docs

One stat model everywhere (design/stats-unification.md): four measurement
points (capture/received/decoded/displayed), three stages that tile the
interval exactly, and a HUD that shows the addition explicitly —

  end-to-end 14.2 ms p50 · 19.8 p95 · capture→on-glass
  = host+network 9.8 + decode 2.1 + display 2.3

replacing each client's ad-hoc mix of overlapping absolutes (the Apple HUD's
three arrow lines that looked sequential but weren't), mean-vs-median decode
times (Windows/Linux), missing same-host-clock flags (Windows/Linux), and
three different names for the same capture→received measurement (probe's
"reassembled", Apple/Android's "client", Windows/Linux's post-decode "lat").

Per client: Apple threads receivedNs through the VT decode via the frame
refcon bit pattern so the decode stage exists at all (stage-1 fallback
honestly degrades to a capture→received headline); Windows carries
FrameTimes through the existing frame channel to the render thread and adds
e2e p50/p95 post-Present; Linux stamps received at AU pop and rides
decoded_ns on DecodedFrame to the paintable-set site; Android pairs receipt
stamps with MediaCodec output buffers via the codec's pts round-trip (JNI
stats array 14→16 doubles, indexes 0-13 unchanged). fps now uniformly counts
received AUs; lost/(received+lost) per window, hidden at zero.

docs-site gains "Understanding the Stats Overlay": what each line means, why
the equation only approximately sums (percentiles), and a line-by-line
Moonlight/Sunshine matrix — including that Moonlight has no end-to-end
number and its "network latency" is an ENet control RTT, so punktfunk's
headline must not be compared against any single Moonlight line.

Verified here: linux client + probe + core check/clippy/fmt green, android
native cargo-ndk arm64 check green. Pending: Windows CI + on-glass, swift
test on the mac, on-device Android.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-03 21:01:29 +00:00
parent c7630ff5dc
commit 09a5957c6d
38 changed files with 1122 additions and 380 deletions
+78 -8
View File
@@ -9,16 +9,22 @@
use ndk::data_space::DataSpace;
use ndk::media::media_codec::{
DequeuedInputBufferResult, DequeuedOutputBufferInfoResult, MediaCodec, MediaCodecDirection,
OutputBuffer,
};
use ndk::media::media_format::MediaFormat;
use ndk::native_window::{FrameRateCompatibility, NativeWindow};
use punktfunk_core::client::NativeClient;
use punktfunk_core::error::PunktfunkError;
use punktfunk_core::session::Frame;
use std::collections::VecDeque;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
/// Cap on the pts→received-timestamp map below: MediaCodec holds only a handful of frames in
/// flight, so anything beyond this is stale (codec flushed / HUD toggled) and gets evicted.
const IN_FLIGHT_CAP: usize = 64;
/// The decode loop. Runs on the `pf-decode` thread until `shutdown` is set or the session closes.
pub fn run(
client: Arc<NativeClient>,
@@ -141,9 +147,14 @@ pub fn run(
// climbs.
let mut last_dropped = client.frames_dropped();
let mut last_kf_req: Option<Instant> = None;
// Capture→client-receipt latency uses the negotiated host-minus-client clock offset (0 if the
// host didn't answer the skew handshake — then the HUD flags it "same-host").
// Skew-corrected latency stats (spec: design/stats-unification.md) use the negotiated
// host-minus-client clock offset (0 if the host didn't answer the skew handshake — then the
// HUD flags it "(same-host clock)").
let clock_offset = client.clock_offset_ns;
// HUD stage split: receipt timestamps keyed by the pts we queue into the codec, so the decoded
// point (output-buffer dequeue — MediaCodec round-trips presentationTimeUs) can be paired back
// to its receipt for the `decode` stage. Only fed while the HUD is visible.
let mut in_flight: VecDeque<(u64, i128)> = VecDeque::new();
// The dataspace we've signalled on the Surface so far (None = default/SDR). Set reactively once
// the decoder reports an HDR stream (see `drain`); avoids re-applying every format event.
let mut applied_ds: Option<DataSpace> = None;
@@ -164,15 +175,21 @@ pub fn run(
&p[..p.len().min(6)]
);
}
// HUD stat: capture→client-receipt latency = client_now + (hostclient)
// HUD stat, `received` point: host+network = client_now + (hostclient)
// capture_pts. Gated on the HUD being visible — `enabled` first so the hidden
// steady state skips the wall-clock read and the lock entirely.
// steady state skips the wall-clock read and the lock entirely. The receipt
// stamp is also parked in `in_flight` (keyed by the pts the codec will echo on
// the output buffer) for the decoded-point pairing in `drain`.
if stats.enabled() {
let lat_ns =
now_realtime_ns() + clock_offset as i128 - frame.pts_ns as i128;
let received_ns = now_realtime_ns();
let lat_ns = received_ns + clock_offset as i128 - frame.pts_ns as i128;
let lat_us = (lat_ns > 0 && lat_ns < 10_000_000_000)
.then_some((lat_ns / 1000) as u64);
stats.note(frame.data.len(), lat_us, clock_offset != 0);
stats.note_received(frame.data.len(), lat_us, clock_offset != 0);
in_flight.push_back((frame.pts_ns / 1000, received_ns));
if in_flight.len() > IN_FLIGHT_CAP {
in_flight.pop_front(); // stale — codec never echoed it back
}
}
pending = Some(frame);
}
@@ -202,7 +219,15 @@ pub fn run(
} else {
Duration::ZERO
};
let (r, d) = drain(&codec, &window, &mut applied_ds, wait);
let (r, d) = drain(
&codec,
&window,
&mut applied_ds,
wait,
&stats,
&mut in_flight,
clock_offset,
);
rendered += r;
discarded += d;
@@ -330,11 +355,19 @@ fn feed(codec: &MediaCodec, au: &[u8], pts_us: u64) -> bool {
/// the caller's input is blocked so the loop waits on decoder progress instead of busy-spinning.
/// Returns `(rendered, discarded)`. Also reacts to `OutputFormatChanged` (which can interleave
/// between buffers — handled without losing the held buffer) to signal HDR on the Surface.
///
/// Each dequeued buffer is also the HUD's `decoded` measurement point (rendered or not — the frame
/// finished decoding either way): end-to-end = decoded + clock_offset capture pts, and the
/// `decode` stage pairs the buffer's echoed presentationTimeUs back to the receipt stamp in
/// `in_flight` (single-clock local difference, no skew involved).
fn drain(
codec: &MediaCodec,
window: &NativeWindow,
applied_ds: &mut Option<DataSpace>,
first_wait: Duration,
stats: &crate::stats::VideoStats,
in_flight: &mut VecDeque<(u64, i128)>,
clock_offset: i64,
) -> (u64, u64) {
let mut held = None; // newest ready buffer so far, presented after the loop
let mut discarded: u64 = 0;
@@ -343,6 +376,9 @@ fn drain(
match codec.dequeue_output_buffer(wait) {
Ok(DequeuedOutputBufferInfoResult::Buffer(buf)) => {
wait = Duration::ZERO; // only the first dequeue may block
if stats.enabled() {
note_decoded(stats, in_flight, clock_offset, &buf);
}
if let Some(stale) = held.replace(buf) {
// A newer frame is ready — drop the held one without rendering.
if let Err(e) = codec.release_output_buffer(stale, false) {
@@ -392,6 +428,40 @@ fn drain(
(rendered, discarded)
}
/// HUD `decoded` point for one dequeued output buffer: build the end-to-end (capture→decoded,
/// skew-corrected, clamped to (0, 10 s)) and `decode` (received→decoded, single-clock local, ≥ 0)
/// samples and hand them to [`crate::stats::VideoStats::note_decoded`]. The codec echoes the input
/// `presentationTimeUs` on the output buffer, which keys the receipt stamp in `in_flight`; entries
/// older than the echoed pts are evicted (decode order == input order here — low-latency, no
/// B-frames — so anything before it was dropped inside the codec or stamped before a flush).
fn note_decoded(
stats: &crate::stats::VideoStats,
in_flight: &mut VecDeque<(u64, i128)>,
clock_offset: i64,
buf: &OutputBuffer<'_>,
) {
let pts_us = buf.info().presentation_time_us().max(0) as u64;
let decoded_ns = now_realtime_ns();
// Pair the echoed pts back to its receipt stamp, evicting stale (older) entries as we go.
let mut received_ns = None;
while let Some(&(p, r)) = in_flight.front() {
if p > pts_us {
break; // future frame — leave it for its own output buffer
}
in_flight.pop_front();
if p == pts_us {
received_ns = Some(r);
break;
}
}
// pts_us is the truncated frame.pts_ns/1000 we queued, so ×1000 re-approximates capture time
// to < 1 µs — negligible against the ms-scale figures shown.
let e2e_ns = decoded_ns + clock_offset as i128 - pts_us as i128 * 1000;
let e2e_us = (e2e_ns > 0 && e2e_ns < 10_000_000_000).then_some((e2e_ns / 1000) as u64);
let decode_us = received_ns.map(|r| ((decoded_ns - r).max(0) / 1000) as u64);
stats.note_decoded(e2e_us, decode_us);
}
/// Map the decoder's reported output colour to a BT.2020 HDR dataspace, or `None` for SDR. The
/// integer values are the Android MediaFormat colour constants the NDK shares: COLOR_TRANSFER
/// ST2084 = 6 (PQ/HDR10), HLG = 7; COLOR_RANGE FULL = 1, LIMITED = 2 (the host encodes limited).
+16 -11
View File
@@ -72,14 +72,16 @@ pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeStopVideo(
})
}
/// `NativeBridge.nativeVideoStats(handle): DoubleArray?` — drain ~1 s of decode stats for the HUD.
/// Returns 14 doubles
/// `[fps, mbps, latP50Ms, latP95Ms, latValid, skewCorrected, width, height, refreshHz, framesDropped,
/// bitDepth, colorPrimaries, colorTransfer, chromaFormatIdc]`
/// (the two flags are 1.0/0.0; the trailing four describe the negotiated video feed — see below), or
/// `null` when no decode thread is running. Poll ~1 Hz from the UI; each call resets the measurement
/// window. Not android-gated — pure `jni` + connector reads, so it links on the host build too
/// (Kotlin only ever calls it on device).
/// `NativeBridge.nativeVideoStats(handle): DoubleArray?` — drain ~1 s of decode stats for the HUD
/// (unified stats spec, `design/stats-unification.md`). Returns 16 doubles
/// `[fps, mbps, e2eP50Ms, e2eP95Ms, latValid, skewCorrected, width, height, refreshHz, framesLost,
/// bitDepth, colorPrimaries, colorTransfer, chromaFormatIdc, hostNetP50Ms, decodeP50Ms]`
/// (the two flags are 1.0/0.0; indexes 013 match the previous 14-double layout with the latency
/// pair re-based from capture→received to the end-to-end capture→decoded headline; the two stage
/// p50s tiling it — `host+network` = capture→received, `decode` = received→decoded — are appended
/// at the end), or `null` when no decode thread is running. Poll ~1 Hz from the UI; each call
/// resets the measurement window. Not android-gated — pure `jni` + connector reads, so it links on
/// the host build too (Kotlin only ever calls it on device).
#[no_mangle]
pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeVideoStats(
env: JNIEnv,
@@ -98,11 +100,11 @@ pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeVideoStats(
let snap = h.stats.drain();
let mode = h.client.mode();
let color = h.client.color;
let buf: [f64; 14] = [
let buf: [f64; 16] = [
snap.fps,
snap.mbps,
snap.lat_p50_ms,
snap.lat_p95_ms,
snap.e2e_p50_ms,
snap.e2e_p95_ms,
if snap.lat_valid { 1.0 } else { 0.0 },
if snap.skew_corrected { 1.0 } else { 0.0 },
mode.width as f64,
@@ -117,6 +119,9 @@ pub extern "system" fn Java_io_unom_punktfunk_kit_NativeBridge_nativeVideoStats(
color.primaries as f64,
color.transfer as f64,
h.client.chroma_format as f64,
// Stage p50s tiling the end-to-end headline (appended to keep 013 index-compatible).
snap.hostnet_p50_ms,
snap.decode_p50_ms,
];
let arr = match env.new_double_array(buf.len() as jsize) {
Ok(a) => a,
+89 -40
View File
@@ -1,8 +1,11 @@
//! Live decode stats for the on-stream HUD (mirrors the Apple client's stats overlay): FPS,
//! receive throughput, and capture→client-receipt latency (p50/p95). The decode thread is the sole
//! writer (`note` per access unit); the JNI accessor `nativeVideoStats` drains a snapshot ~1 Hz and
//! resets the window. Sampling is gated on the HUD actually being visible (`set_enabled`, driven by
//! `nativeSetVideoStatsEnabled`) so the hidden steady state costs one relaxed atomic load per frame.
//! Live decode stats for the on-stream HUD, following the unified stats spec
//! (`design/stats-unification.md`): FPS, receive throughput, and the Android v1 stage split —
//! headline `end-to-end` = capture→decoded (p50/p95) tiled by `host+network` = capture→received
//! and `decode` = received→decoded (stage p50s). The decode thread is the sole writer
//! (`note_received` per access unit at receipt, `note_decoded` per decoder output buffer); the JNI
//! accessor `nativeVideoStats` drains a snapshot ~1 Hz and resets the window. Sampling is gated on
//! the HUD actually being visible (`set_enabled`, driven by `nativeSetVideoStatsEnabled`) so the
//! hidden steady state costs one relaxed atomic load per frame.
//! Pure `std` so it compiles on the host build too (the decode thread is android-only, but
//! `SessionHandle` holds the shared handle unconditionally).
@@ -13,9 +16,9 @@ use std::time::Instant;
/// Rolling per-window accumulator. Rates are computed over the actual elapsed wall-time at drain
/// (robust to poll jitter), so a poll that lands at 0.9 s or 1.1 s still reports the right FPS.
pub struct VideoStats {
/// HUD gate: `note` runs on the per-frame decode path, so while the overlay is hidden it (and
/// the caller's latency computation — see `enabled`) early-outs on this flag alone. Off until
/// Kotlin shows the HUD.
/// HUD gate: the samplers run on the per-frame decode path, so while the overlay is hidden
/// they (and the caller's latency computation — see `enabled`) early-out on this flag alone.
/// Off until Kotlin shows the HUD.
enabled: AtomicBool,
inner: Mutex<Inner>,
}
@@ -24,23 +27,42 @@ struct Inner {
window_start: Instant,
frames: u64,
bytes: u64,
/// capture→client-receipt latency samples for this window, in microseconds.
lat_us: Vec<u64>,
/// `end-to-end` = capture→decoded latency samples for this window, in microseconds
/// (skew-corrected clock base).
e2e_us: Vec<u64>,
/// `host+network` stage = capture→received samples, in microseconds (skew-corrected).
hostnet_us: Vec<u64>,
/// `decode` stage = received→decoded samples, in microseconds (client-local, single clock).
decode_us: Vec<u64>,
/// Whether the host answered the clock-skew handshake (latency is cross-machine valid).
skew_corrected: bool,
}
/// A drained, computed view of one window. `lat_valid` is false when no in-range latency sample
/// landed (then p50/p95 are 0 and the HUD hides the latency line, exactly like the Apple client).
/// A drained, computed view of one window. `lat_valid` is false when no in-range end-to-end sample
/// landed (then the latency figures are 0 and the HUD hides the latency lines, exactly like the
/// Apple client).
pub struct Snapshot {
pub fps: f64,
pub mbps: f64,
pub lat_p50_ms: f64,
pub lat_p95_ms: f64,
/// Headline `end-to-end` (capture→decoded) percentiles, ms.
pub e2e_p50_ms: f64,
pub e2e_p95_ms: f64,
/// Stage p50s (ms): `host+network` (capture→received) and `decode` (received→decoded).
pub hostnet_p50_ms: f64,
pub decode_p50_ms: f64,
pub lat_valid: bool,
pub skew_corrected: bool,
}
/// Percentile over a sorted-in-place µs sample vec, in ms. 0.0 when empty.
fn pctl_ms(sorted_us: &[u64], p: f64) -> f64 {
if sorted_us.is_empty() {
return 0.0;
}
let n = sorted_us.len();
sorted_us[((n as f64 * p) as usize).min(n - 1)] as f64 / 1000.0
}
impl VideoStats {
pub fn new() -> VideoStats {
VideoStats {
@@ -49,14 +71,16 @@ impl VideoStats {
window_start: Instant::now(),
frames: 0,
bytes: 0,
lat_us: Vec::with_capacity(256),
e2e_us: Vec::with_capacity(256),
hostnet_us: Vec::with_capacity(256),
decode_us: Vec::with_capacity(256),
skew_corrected: false,
}),
}
}
/// Whether the HUD wants samples. The decode thread checks this BEFORE building a latency
/// sample, so the per-frame wall-clock read is skipped too while hidden.
/// sample, so the per-frame wall-clock reads are skipped too while hidden.
// Read only by the android-only decode thread; unreferenced on the host build — expected.
#[cfg_attr(not(target_os = "android"), allow(dead_code))]
pub fn enabled(&self) -> bool {
@@ -75,18 +99,21 @@ impl VideoStats {
g.window_start = Instant::now();
g.frames = 0;
g.bytes = 0;
g.lat_us.clear();
g.e2e_us.clear();
g.hostnet_us.clear();
g.decode_us.clear();
}
}
/// Record one decoded access unit: its wire size and (if in range) its capture→client latency.
/// Record one received access unit: its wire size and (if in range) its capture→received
/// `host+network` stage sample. Receipt is the fps/goodput counting point per the spec.
// Driven only by the android-only decode thread; unreferenced on the host build — expected.
#[cfg_attr(not(target_os = "android"), allow(dead_code))]
pub fn note(&self, bytes: usize, lat_us: Option<u64>, skew_corrected: bool) {
pub fn note_received(&self, bytes: usize, hostnet_us: Option<u64>, skew_corrected: bool) {
if !self.enabled.load(Ordering::Relaxed) {
return; // HUD hidden — skip the lock (the caller already skipped the clock read)
}
// Poison-proof: `note` runs per-frame on the decode thread, which has no catch_unwind —
// Poison-proof: this runs per-frame on the decode thread, which has no catch_unwind —
// a panic elsewhere must not turn every later lock into a second panic (the counters
// stay consistent regardless).
let mut g = self
@@ -96,14 +123,37 @@ impl VideoStats {
g.frames += 1;
g.bytes += bytes as u64;
g.skew_corrected = skew_corrected;
if let Some(l) = lat_us {
g.lat_us.push(l);
if let Some(l) = hostnet_us {
g.hostnet_us.push(l);
}
}
/// Record one decoded output frame: its capture→decoded `end-to-end` sample and its
/// received→decoded `decode` stage sample (either may be absent — e.g. the receipt stamp for
/// this pts predates the HUD being shown).
// Driven only by the android-only decode thread; unreferenced on the host build — expected.
#[cfg_attr(not(target_os = "android"), allow(dead_code))]
pub fn note_decoded(&self, e2e_us: Option<u64>, decode_us: Option<u64>) {
if !self.enabled.load(Ordering::Relaxed) {
return; // HUD hidden — skip the lock (the caller already skipped the clock read)
}
// Poison-proof for the same reason as `note_received`.
let mut g = self
.inner
.lock()
.unwrap_or_else(std::sync::PoisonError::into_inner);
if let Some(l) = e2e_us {
g.e2e_us.push(l);
}
if let Some(l) = decode_us {
g.decode_us.push(l);
}
}
/// Compute the window's rates + latency percentiles, then reset for the next window.
pub fn drain(&self) -> Snapshot {
// Poison-proof for the same reason as `note` — a poisoned window still drains fine.
// Poison-proof for the same reason as `note_received` — a poisoned window still drains
// fine.
let mut g = self
.inner
.lock()
@@ -111,26 +161,25 @@ impl VideoStats {
let elapsed = g.window_start.elapsed().as_secs_f64().max(1e-3);
let fps = g.frames as f64 / elapsed;
let mbps = g.bytes as f64 * 8.0 / 1_000_000.0 / elapsed;
let (p50, p95, valid) = if g.lat_us.is_empty() {
(0.0, 0.0, false)
} else {
g.lat_us.sort_unstable();
let n = g.lat_us.len();
let at = |p: f64| g.lat_us[((n as f64 * p) as usize).min(n - 1)] as f64 / 1000.0;
(at(0.50), at(0.95), true)
g.e2e_us.sort_unstable();
g.hostnet_us.sort_unstable();
g.decode_us.sort_unstable();
let snap = Snapshot {
fps,
mbps,
e2e_p50_ms: pctl_ms(&g.e2e_us, 0.50),
e2e_p95_ms: pctl_ms(&g.e2e_us, 0.95),
hostnet_p50_ms: pctl_ms(&g.hostnet_us, 0.50),
decode_p50_ms: pctl_ms(&g.decode_us, 0.50),
lat_valid: !g.e2e_us.is_empty(),
skew_corrected: g.skew_corrected,
};
let skew = g.skew_corrected;
g.window_start = Instant::now();
g.frames = 0;
g.bytes = 0;
g.lat_us.clear();
Snapshot {
fps,
mbps,
lat_p50_ms: p50,
lat_p95_ms: p95,
lat_valid: valid,
skew_corrected: skew,
}
g.e2e_us.clear();
g.hostnet_us.clear();
g.decode_us.clear();
snap
}
}