feat(clients): unified stats vocabulary across every client + Moonlight comparison docs

One stat model everywhere (design/stats-unification.md): four measurement
points (capture/received/decoded/displayed), three stages that tile the
interval exactly, and a HUD that shows the addition explicitly —

  end-to-end 14.2 ms p50 · 19.8 p95 · capture→on-glass
  = host+network 9.8 + decode 2.1 + display 2.3

replacing each client's ad-hoc mix of overlapping absolutes (the Apple HUD's
three arrow lines that looked sequential but weren't), mean-vs-median decode
times (Windows/Linux), missing same-host-clock flags (Windows/Linux), and
three different names for the same capture→received measurement (probe's
"reassembled", Apple/Android's "client", Windows/Linux's post-decode "lat").

Per client: Apple threads receivedNs through the VT decode via the frame
refcon bit pattern so the decode stage exists at all (stage-1 fallback
honestly degrades to a capture→received headline); Windows carries
FrameTimes through the existing frame channel to the render thread and adds
e2e p50/p95 post-Present; Linux stamps received at AU pop and rides
decoded_ns on DecodedFrame to the paintable-set site; Android pairs receipt
stamps with MediaCodec output buffers via the codec's pts round-trip (JNI
stats array 14→16 doubles, indexes 0-13 unchanged). fps now uniformly counts
received AUs; lost/(received+lost) per window, hidden at zero.

docs-site gains "Understanding the Stats Overlay": what each line means, why
the equation only approximately sums (percentiles), and a line-by-line
Moonlight/Sunshine matrix — including that Moonlight has no end-to-end
number and its "network latency" is an ENet control RTT, so punktfunk's
headline must not be compared against any single Moonlight line.

Verified here: linux client + probe + core check/clippy/fmt green, android
native cargo-ndk arm64 check green. Pending: Windows CI + on-glass, swift
test on the mac, on-device Android.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-03 21:01:29 +00:00
parent c7630ff5dc
commit 09a5957c6d
38 changed files with 1122 additions and 380 deletions
+36 -19
View File
@@ -2,7 +2,7 @@
//! the UI thread, then handed — presenter and all — to the dedicated render thread
//! ([`crate::render`]), which presents decoded frames at stream cadence. The page itself only
//! forwards panel size/DPI changes and draws the status-chip HUD overlay (mode · decode path ·
//! HDR · fps/throughput/latency · capture hint).
//! HDR · fps/goodput · end-to-end latency + stage equation · capture hint).
use super::style::{edges, uniform};
use super::Svc;
@@ -22,8 +22,9 @@ use windows_reactor::*;
pub(crate) struct HudSample {
pub(crate) stats: Stats,
pub(crate) captured: bool,
/// `(presents/s, skipped/s, capture→presented p50 ms)` — see [`crate::render::present_stats`].
pub(crate) present: (u32, u32, f32),
/// The render thread's glass-side window (presents/s, skips, end-to-end p50/p95, display
/// stage p50) — see [`crate::render::present_stats`].
pub(crate) present: crate::render::PresentStats,
}
/// Props for the stream page: the services plus the live HUD sample that drives the overlay
@@ -171,13 +172,15 @@ fn fmt_uptime(secs: u32) -> String {
}
}
/// The streaming HUD overlay (top-right), mirroring the Apple client: a chip row (mode · codec ·
/// decode path · HDR), a stream line (decode fps / bitrate / decode time), a glass line (display
/// presents + end-to-end latency decoded vs on-glass), a session line (host · time · loss), and
/// the shortcut hints. Layered over the `SwapChainPanel` in the same grid cell.
/// The streaming HUD overlay (top-right), unified stats vocabulary (design/stats-unification.md):
/// a chip row (mode · codec · decode path · HDR), a stream line (received fps · goodput ·
/// presenter fps), the end-to-end headline (capture→on-glass p50/p95, host-clock corrected), the
/// stage equation (= host+network + decode + display, stage p50s), a session line
/// (host · time · loss/skips), and the shortcut hints. Layered over the `SwapChainPanel` in the
/// same grid cell.
fn hud_overlay(hud: &HudSample, mode: Option<Mode>, host: &str) -> Element {
let stats = &hud.stats;
let (pfps, skipped, glass_ms) = hud.present;
let present = &hud.present;
let res = mode
.map(|m| format!("{}\u{00D7}{}@{}", m.width, m.height, m.refresh_hz))
.unwrap_or_else(|| "\u{2014}".into());
@@ -193,25 +196,38 @@ fn hud_overlay(hud: &HudSample, mode: Option<Mode>, host: &str) -> Element {
if stats.hdr {
chips.push(hud_chip("HDR", Color::rgb(255, 205, 90)).into());
}
// Received fps + goodput, plus the presenter's own rate (Moonlight's "Rendering frame rate"
// analog — how often the display actually gets a new frame).
let stream_line = format!(
"{:.0} fps \u{00B7} {:.1} Mb/s \u{00B7} decode {:.1} ms",
stats.fps, stats.mbps, stats.decode_ms
"{:.0} fps \u{00B7} {:.1} Mb/s \u{00B7} display {} fps",
stats.fps, stats.mbps, present.fps
);
// End-to-end latency (host-clock corrected): capture→decoded from the pump, capture→on-glass
// from the render thread's post-Present stamp. `skipped` = newest-wins drops (expected when
// the stream outpaces the display); `lost` = unrecoverable network drops.
let glass_line = format!(
"display {pfps} fps \u{00B7} latency {:.1} ms decoded / {glass_ms:.1} ms on-glass",
stats.latency_ms
// The headline: end-to-end capture→displayed, measured directly post-Present (never the sum
// of the stage percentiles). `(same-host clock)` flags an uncorrected clock (offset == 0:
// same host, or the host skipped the skew handshake).
let mut e2e_line = format!(
"end-to-end {:.1} ms p50 \u{00B7} {:.1} p95 \u{00B7} capture\u{2192}on-glass",
present.e2e_p50_ms, present.e2e_p95_ms
);
if stats.same_host {
e2e_line.push_str(" (same-host clock)");
}
// The equation: the three stages tile the headline interval per frame; the window p50s only
// approximately sum (percentiles aren't additive).
let stage_line = format!(
"= host+network {:.1} + decode {:.1} + display {:.1}",
stats.hostnet_ms, stats.decode_ms, present.display_p50_ms
);
let mut session_bits: Vec<String> = Vec::new();
if !host.is_empty() {
session_bits.push(host.to_string());
}
// `lost` = unrecoverable network drops (session-cumulative); `skipped` = the render thread's
// newest-wins drops last window (expected when the stream outpaces the display).
session_bits.push(fmt_uptime(stats.uptime_secs));
session_bits.push(format!("{} lost", stats.dropped));
if skipped > 0 {
session_bits.push(format!("{skipped} skipped"));
if present.skipped > 0 {
session_bits.push(format!("{} skipped", present.skipped));
}
let session_line = session_bits.join(" \u{00B7} ");
let hint = if hud.captured {
@@ -228,7 +244,8 @@ fn hud_overlay(hud: &HudSample, mode: Option<Mode>, host: &str) -> Element {
vstack((
hstack(chips).spacing(6.0),
dim(&stream_line),
dim(&glass_line),
dim(&e2e_line),
dim(&stage_line),
dim(&session_line),
text_block(hint)
.font_size(11.0)
+2 -2
View File
@@ -241,8 +241,8 @@ fn run_headless_cli(args: &[String], identity: (String, String)) {
session::SessionEvent::Stats(s) => tracing::info!(
fps = format!("{:.0}", s.fps),
mbps = format!("{:.1}", s.mbps),
decode_ms = format!("{:.2}", s.decode_ms),
lat_ms = format!("{:.2}", s.latency_ms),
decode_p50_ms = format!("{:.2}", s.decode_ms),
hostnet_p50_ms = format!("{:.2}", s.hostnet_ms),
frames_seen,
"stats"
),
+78 -27
View File
@@ -10,27 +10,46 @@
//! draw (and redraws the held frame after a resize — fresh back buffers are blank).
use crate::present::Presenter;
use crate::session::FrameRx;
use crate::session::{FrameRx, FrameTimes};
use crossbeam_channel::RecvTimeoutError;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
/// The last 1-second render window, published for the HUD (one render thread at a time):
/// presents/s, frames skipped by the newest-wins drain, and the capture→presented p50 in µs.
/// presents/s, frames skipped by the newest-wins drain, the end-to-end (capture→on-glass)
/// p50/p95 and the `display` stage (decoded→displayed) p50, all stamped post-`Present()`, in µs.
/// Zeroed when a render thread starts so a new session never shows the previous one's numbers.
static PRESENT_FPS: AtomicU32 = AtomicU32::new(0);
static PRESENT_SKIPPED: AtomicU32 = AtomicU32::new(0);
static PRESENT_P50_US: AtomicU64 = AtomicU64::new(0);
static E2E_P50_US: AtomicU64 = AtomicU64::new(0);
static E2E_P95_US: AtomicU64 = AtomicU64::new(0);
static DISPLAY_P50_US: AtomicU64 = AtomicU64::new(0);
/// `(presents/s, skipped/s, capture→presented p50 ms)` of the last render window — the HUD's
/// display-side line.
pub fn present_stats() -> (u32, u32, f32) {
(
PRESENT_FPS.load(Ordering::Relaxed),
PRESENT_SKIPPED.load(Ordering::Relaxed),
PRESENT_P50_US.load(Ordering::Relaxed) as f32 / 1000.0,
)
/// The last render window's glass-side numbers (see the statics above) — the HUD's headline
/// (end-to-end) and trailing stage (display) come from here.
///
/// The render thread publishes the latencies in microseconds; [`present_stats`] converts them to
/// milliseconds. A window that collected no samples publishes 0 for the affected percentile, and
/// every value is reset to 0 when a render thread starts.
#[derive(Clone, Copy, Default, PartialEq)]
pub struct PresentStats {
/// Presents per second (includes resize redraws of a held frame). This is the raw count of
/// presents in the last window, which closes once at least 1 s has elapsed; it is not
/// normalized by the actual elapsed time.
pub fps: u32,
/// Frames dropped by the newest-wins drain this window (client-side pacing skips).
pub skipped: u32,
/// End-to-end capture→displayed p50, ms (host-clock corrected, measured directly).
pub e2e_p50_ms: f32,
/// End-to-end capture→displayed p95, ms.
pub e2e_p95_ms: f32,
/// `display` stage p50, ms: decoded → displayed, single-clock client-local.
pub display_p50_ms: f32,
}
/// Snapshot the most recently published render window for the HUD.
///
/// Each field is read with its own relaxed atomic load, so the values are not guaranteed to
/// all belong to the same window. The latency fields are stored in microseconds and returned
/// here in milliseconds.
pub fn present_stats() -> PresentStats {
    // Shared µs → ms conversion for the three latency atomics.
    let micros_to_ms = |cell: &AtomicU64| cell.load(Ordering::Relaxed) as f32 / 1000.0;

    let fps = PRESENT_FPS.load(Ordering::Relaxed);
    let skipped = PRESENT_SKIPPED.load(Ordering::Relaxed);
    let e2e_p50_ms = micros_to_ms(&E2E_P50_US);
    let e2e_p95_ms = micros_to_ms(&E2E_P95_US);
    let display_p50_ms = micros_to_ms(&DISPLAY_P50_US);

    PresentStats {
        fps,
        skipped,
        e2e_p50_ms,
        e2e_p95_ms,
        display_p50_ms,
    }
}
/// UI-thread → render-thread state. Size is packed into ONE atomic (w<<32|h) so a resize never
@@ -101,8 +120,9 @@ impl Drop for RenderThread {
struct SendPresenter(Presenter);
unsafe impl Send for SendPresenter {}
/// Spawn the render thread. `frames` carries `(frame, capture pts_ns)`; `clock_offset_ns` maps our
/// wall clock onto the host's so the logged present latency is end-to-end (same math as the pump).
/// Spawn the render thread. `frames` carries `(frame, FrameTimes)`; `clock_offset_ns` maps our
/// wall clock onto the host's so the end-to-end (capture→on-glass) number is cross-machine valid
/// (same math as the pump's host+network stage).
pub fn spawn(
presenter: Presenter,
frames: FrameRx,
@@ -147,12 +167,17 @@ fn run(presenter: SendPresenter, frames: FrameRx, shared: Arc<RenderShared>, clo
let mut applied = (0u32, 0u32, 0u32); // last (w, h, dpi) handed to the presenter
let mut presented = 0u32;
let mut dropped = 0u32;
let mut lat_us: Vec<u64> = Vec::with_capacity(256);
// 1 s tumbling windows: end-to-end (capture→displayed) and the display stage
// (decoded→displayed), sampled post-Present. Percentiles only (spec: stats-unification.md).
let mut e2e_us: Vec<u64> = Vec::with_capacity(256);
let mut display_us: Vec<u64> = Vec::with_capacity(256);
let mut window_start = Instant::now();
let mut last_dpi_poll = Instant::now();
PRESENT_FPS.store(0, Ordering::Relaxed);
PRESENT_SKIPPED.store(0, Ordering::Relaxed);
PRESENT_P50_US.store(0, Ordering::Relaxed);
E2E_P50_US.store(0, Ordering::Relaxed);
E2E_P95_US.store(0, Ordering::Relaxed);
DISPLAY_P50_US.store(0, Ordering::Relaxed);
loop {
if shared.stop.load(Ordering::SeqCst) {
@@ -198,29 +223,55 @@ fn run(presenter: SendPresenter, frames: FrameRx, shared: Arc<RenderShared>, clo
p.set_hdr_metadata(meta);
}
let pts_ns = newest.as_ref().map(|(_, pts)| *pts);
let times: Option<FrameTimes> = newest.as_ref().map(|(_, t)| *t);
p.present(newest.map(|(f, _)| f));
presented += 1;
if let Some(pts) = pts_ns {
// Capture→presented, host-clock corrected — the glass-side companion to the pump's
// capture→decoded p50.
let lat = (now_ns() as i128 + clock_offset_ns as i128 - pts as i128).max(0) as u64;
if lat > 0 && lat < 10_000_000_000 {
lat_us.push(lat / 1000);
if let Some(t) = times {
// The `displayed` point: post-Present() on this thread (the honest best-effort
// presentation instant on Windows — endpoint label `capture→on-glass`).
let displayed_ns = now_ns();
// End-to-end = capture → displayed, host-clock corrected, measured directly
// (never the sum of stage percentiles). Clamped (0, 10 s).
let e2e =
(displayed_ns as i128 + clock_offset_ns as i128 - t.pts_ns as i128).max(0) as u64;
if e2e > 0 && e2e < 10_000_000_000 {
e2e_us.push(e2e / 1000);
}
// `display` stage = decoded → displayed, single-clock client-local.
let disp = displayed_ns.saturating_sub(t.decoded_ns);
if disp < 10_000_000_000 {
display_us.push(disp / 1000);
}
}
if window_start.elapsed() >= Duration::from_secs(1) {
lat_us.sort_unstable();
let p50 = lat_us.get(lat_us.len() / 2).copied().unwrap_or(0);
tracing::debug!(presented, dropped, present_p50_us = p50, "render window");
e2e_us.sort_unstable();
display_us.sort_unstable();
let p50 = |v: &[u64]| v.get(v.len() / 2).copied().unwrap_or(0);
// p95 = sorted[min(len*95/100, len-1)] — the empty-window case falls to 0 via `get`.
let p95 = |v: &[u64]| {
v.get((v.len() * 95 / 100).min(v.len().saturating_sub(1)))
.copied()
.unwrap_or(0)
};
tracing::debug!(
presented,
dropped,
e2e_p50_us = p50(&e2e_us),
e2e_p95_us = p95(&e2e_us),
display_p50_us = p50(&display_us),
"render window"
);
PRESENT_FPS.store(presented, Ordering::Relaxed);
PRESENT_SKIPPED.store(dropped, Ordering::Relaxed);
PRESENT_P50_US.store(p50, Ordering::Relaxed);
E2E_P50_US.store(p50(&e2e_us), Ordering::Relaxed);
E2E_P95_US.store(p95(&e2e_us), Ordering::Relaxed);
DISPLAY_P50_US.store(p50(&display_us), Ordering::Relaxed);
window_start = Instant::now();
presented = 0;
dropped = 0;
lat_us.clear();
e2e_us.clear();
display_us.clear();
}
}
tracing::info!("render thread exiting");
+59 -30
View File
@@ -46,11 +46,18 @@ pub struct SessionParams {
#[derive(Clone, Copy, Default, PartialEq)]
pub struct Stats {
/// AUs received (reassembled) per second — actual-elapsed-time denominator.
pub fps: f32,
/// Received payload goodput (excludes FEC overhead).
pub mbps: f32,
/// `decode` stage p50 over the last 1 s window: received → decoded, client-local clock.
pub decode_ms: f32,
/// Median capture→decoded latency over the last window (host-clock corrected).
pub latency_ms: f32,
/// `host+network` stage p50 over the last 1 s window: capture (`pts_ns`) → received,
/// host-clock corrected via `clock_offset_ns`.
pub hostnet_ms: f32,
/// True when `clock_offset_ns == 0` (host didn't answer the skew handshake / same host) —
/// the HUD appends `(same-host clock)` to the end-to-end line.
pub same_host: bool,
/// True when decoding on the GPU (D3D11VA) vs. CPU (software).
pub hardware: bool,
/// True when the stream is BT.2020 PQ HDR10 (last decoded frame).
@@ -81,9 +88,19 @@ pub enum SessionEvent {
Stats(Stats),
}
/// Decoded frames + their host-capture `pts_ns`, session pump → render thread (crossbeam so that
/// Per-frame measurement points carried with a decoded frame to the render thread: the host
/// capture clock (`pts_ns`) and our local `decoded` stamp (wall-clock ns). Post-`Present()` the
/// render thread derives the `display` stage (displayed − decoded, single-clock) and the
/// end-to-end headline (displayed + clock_offset − pts) from them.
#[derive(Clone, Copy)]
pub struct FrameTimes {
/// Capture timestamp of the frame as stamped by the host, in ns on the host's clock; it is
/// only comparable with local stamps after adding the session's clock offset.
pub pts_ns: u64,
/// Local stamp taken by the session pump when the decoder produced this frame, in ns.
pub decoded_ns: u64,
}
/// Decoded frames + their measurement points, session pump → render thread (crossbeam so that
/// thread can block with a timeout — async-channel has no `recv_timeout`).
pub type FrameRx = crossbeam_channel::Receiver<(DecodedFrame, u64)>;
pub type FrameRx = crossbeam_channel::Receiver<(DecodedFrame, FrameTimes)>;
pub struct SessionHandle {
pub events: async_channel::Receiver<SessionEvent>,
@@ -205,7 +222,7 @@ impl AudioDec {
fn pump(
params: SessionParams,
ev_tx: async_channel::Sender<SessionEvent>,
frame_tx: crossbeam_channel::Sender<(DecodedFrame, u64)>,
frame_tx: crossbeam_channel::Sender<(DecodedFrame, FrameTimes)>,
frame_rx: FrameRx,
stop: Arc<AtomicBool>,
) {
@@ -310,8 +327,9 @@ fn pump(
let mut window_start = Instant::now();
let mut frames_n = 0u32;
let mut bytes_n = 0u64;
let mut decode_us_sum = 0u64;
let mut lat_us: Vec<u64> = Vec::with_capacity(256);
// 1 s tumbling stage windows (spec: design/stats-unification.md — percentiles, never means).
let mut hostnet_us: Vec<u64> = Vec::with_capacity(256);
let mut decode_us: Vec<u64> = Vec::with_capacity(256);
let mut pcm = vec![0f32; 5760 * channels as usize]; // scratch: max Opus frame (120 ms) × channels
// Loss recovery: watch the host→client unrecoverable-drop count and ask for an IDR when it climbs.
let mut last_dropped = connector.frames_dropped();
@@ -323,7 +341,18 @@ fn pump(
}
match connector.next_frame(Duration::from_millis(4)) {
Ok(frame) => {
let t0 = Instant::now();
// The `received` point: AU fully reassembled, handed to us, before decode.
let received_ns = now_ns();
// fps = AUs received per second, Mb/s = received goodput (spec: counted at the
// received point, not the decoded one).
frames_n += 1;
bytes_n += frame.data.len() as u64;
// `host+network` stage: capture → received, host-clock corrected. Clamped (0, 10 s).
let hostnet = (received_ns as i128 + clock_offset as i128 - frame.pts_ns as i128)
.max(0) as u64;
if hostnet > 0 && hostnet < 10_000_000_000 {
hostnet_us.push(hostnet / 1000);
}
// A D3D11VA→software demotion (see `Decoder::decode`) starts a FRESH decoder that
// has none of the stream's parameter sets; under infinite GOP it would sit on
// "PPS id out of range" forever. Detect the transition and force a new IDR so the
@@ -336,6 +365,8 @@ fn pump(
}
match decoded {
Ok(Some(decoded)) => {
// The `decoded` point: decoder output frame available.
let decoded_ns = now_ns();
total_frames += 1;
hdr = decoded.hdr();
// The backend can demote D3D11VA → software mid-session on a hardware error.
@@ -350,19 +381,17 @@ fn pump(
"first frame decoded"
);
}
// Latency: our wall clock expressed in the host's capture clock,
// minus the host-stamped capture pts (same math as client-rs).
let lat = (now_ns() as i128 + clock_offset as i128 - frame.pts_ns as i128)
.max(0) as u64;
if lat > 0 && lat < 10_000_000_000 {
lat_us.push(lat / 1000);
}
decode_us_sum += t0.elapsed().as_micros() as u64;
frames_n += 1;
bytes_n += frame.data.len() as u64;
// `decode` stage: received → decoded, single-clock client-local.
decode_us.push(decoded_ns.saturating_sub(received_ns) / 1000);
// Newest wins: displace the oldest queued frame when the renderer lags.
if let Err(crossbeam_channel::TrySendError::Full(item)) =
frame_tx.try_send((decoded, frame.pts_ns))
frame_tx.try_send((
decoded,
FrameTimes {
pts_ns: frame.pts_ns,
decoded_ns,
},
))
{
let _ = frame_rx.try_recv();
let _ = frame_tx.try_send(item);
@@ -413,23 +442,23 @@ fn pump(
if window_start.elapsed() >= Duration::from_secs(1) {
let secs = window_start.elapsed().as_secs_f32();
lat_us.sort_unstable();
let p50 = lat_us.get(lat_us.len() / 2).copied().unwrap_or(0);
hostnet_us.sort_unstable();
decode_us.sort_unstable();
let p50 = |v: &[u64]| v.get(v.len() / 2).copied().unwrap_or(0);
let (hostnet_p50, decode_p50) = (p50(&hostnet_us), p50(&decode_us));
tracing::debug!(
fps = frames_n,
lat_p50_us = p50,
hostnet_p50_us = hostnet_p50,
decode_p50_us = decode_p50,
total_frames,
"stream window"
);
let _ = ev_tx.try_send(SessionEvent::Stats(Stats {
fps: frames_n as f32 / secs,
mbps: bytes_n as f32 * 8.0 / 1e6 / secs,
decode_ms: if frames_n > 0 {
decode_us_sum as f32 / frames_n as f32 / 1000.0
} else {
0.0
},
latency_ms: p50 as f32 / 1000.0,
decode_ms: decode_p50 as f32 / 1000.0,
hostnet_ms: hostnet_p50 as f32 / 1000.0,
same_host: clock_offset == 0,
hardware,
hdr,
codec: connector.codec,
@@ -439,8 +468,8 @@ fn pump(
window_start = Instant::now();
frames_n = 0;
bytes_n = 0;
decode_us_sum = 0;
lat_us.clear();
hostnet_us.clear();
decode_us.clear();
}
};