Files
punktfunk/clients/windows/src/render.rs
T
enricobuehler 09a5957c6d feat(clients): unified stats vocabulary across every client + Moonlight comparison docs
One stat model everywhere (design/stats-unification.md): four measurement
points (capture/received/decoded/displayed), three stages that tile the
interval exactly, and a HUD that shows the addition explicitly —

  end-to-end 14.2 ms p50 · 19.8 p95 · capture→on-glass
  = host+network 9.8 + decode 2.1 + display 2.3

replacing each client's ad-hoc mix of overlapping absolutes (the Apple HUD's
three arrow lines that looked sequential but weren't), mean-vs-median decode
times (Windows/Linux), missing same-host-clock flags (Windows/Linux), and
three different names for the same capture→received measurement (probe's
"reassembled", Apple/Android's "client", Windows/Linux's post-decode "lat").

Per client: Apple threads receivedNs through the VT decode via the frame
refcon bit pattern so the decode stage exists at all (stage-1 fallback
honestly degrades to a capture→received headline); Windows carries
FrameTimes through the existing frame channel to the render thread and adds
e2e p50/p95 post-Present; Linux stamps received at AU pop and rides
decoded_ns on DecodedFrame to the paintable-set site; Android pairs receipt
stamps with MediaCodec output buffers via the codec's pts round-trip (JNI
stats array 14→16 doubles, indexes 0-13 unchanged). fps now uniformly counts
received AUs; lost/(received+lost) per window, hidden at zero.

docs-site gains "Understanding the Stats Overlay": what each line means, why
the equation only approximately sums (percentiles), and a line-by-line
Moonlight/Sunshine matrix — including that Moonlight has no end-to-end
number and its "network latency" is an ENet control RTT, so punktfunk's
headline must not be compared against any single Moonlight line.

Verified here: linux client + probe + core check/clippy/fmt green, android
native cargo-ndk arm64 check green. Pending: Windows CI + on-glass, swift
test on the mac, on-device Android.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-03 21:01:29 +00:00

279 lines
11 KiB
Rust

//! The dedicated video render thread: decoded frames flow session pump → bounded channel → here →
//! `Presenter::present`. Presenting off the XAML thread means UI jank (layout, input, dialogs)
//! never stalls video, and a filled present queue never blocks the UI thread — the two failure
//! modes of the old present-from-`on_rendering` design.
//!
//! Pacing: block on the channel (the host paces the stream), then on the swapchain's
//! frame-latency waitable (≤1 queued present — see `present.rs`), then drain to the NEWEST frame
//! so a stream faster than the display drops backlog before any GPU work. The UI thread only
//! writes panel size/DPI into [`RenderShared`] atomics; the loop applies them before the next
//! draw (and redraws the held frame after a resize — fresh back buffers are blank).
use crate::present::Presenter;
use crate::session::{FrameRx, FrameTimes};
use crossbeam_channel::RecvTimeoutError;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
// The last 1-second render window, published for the HUD (one render thread at a time):
// presents/s, frames skipped by the newest-wins drain, the end-to-end (capture→on-glass)
// p50/p95 and the `display` stage (decoded→displayed) p50, all stamped post-`Present()`, in µs.
// Zeroed when a render thread starts so a new session never shows the previous one's numbers.
//
// Each value is written with its own `Relaxed` store and read with its own `Relaxed` load, so a
// reader racing a publish may see numbers from two adjacent windows side by side.

/// Presents counted in the last published window.
static PRESENT_FPS: AtomicU32 = AtomicU32::new(0);
/// Frames discarded by the newest-wins drain in the last published window.
static PRESENT_SKIPPED: AtomicU32 = AtomicU32::new(0);
/// End-to-end (capture→displayed) p50 of the last published window, in µs.
static E2E_P50_US: AtomicU64 = AtomicU64::new(0);
/// End-to-end (capture→displayed) p95 of the last published window, in µs.
static E2E_P95_US: AtomicU64 = AtomicU64::new(0);
/// `display` stage (decoded→displayed) p50 of the last published window, in µs.
static DISPLAY_P50_US: AtomicU64 = AtomicU64::new(0);
/// The last render window's glass-side numbers, as published by the render thread — the HUD's
/// headline (end-to-end) and trailing stage (display) come from here.
///
/// Plain `Copy` data; `Debug` is derived so a snapshot can be logged or compared in assertions.
/// All fields are zero (`Default`) until a window has been published.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct PresentStats {
    /// Presents per second (includes resize redraws of a held frame).
    pub fps: u32,
    /// Frames dropped by the newest-wins drain this window (client-side pacing skips).
    pub skipped: u32,
    /// End-to-end capture→displayed p50, ms (host-clock corrected, measured directly).
    pub e2e_p50_ms: f32,
    /// End-to-end capture→displayed p95, ms.
    pub e2e_p95_ms: f32,
    /// `display` stage p50, ms: decoded → displayed, single-clock client-local.
    pub display_p50_ms: f32,
}
/// Snapshot of the most recently published render window, for the HUD.
///
/// Counters are returned as stored; the three latency values are stored in µs and converted to
/// fractional milliseconds here. Every field is an independent `Relaxed` load.
pub fn present_stats() -> PresentStats {
    // µs (as published) → ms (as displayed).
    let as_ms = |cell: &AtomicU64| cell.load(Ordering::Relaxed) as f32 / 1000.0;

    let fps = PRESENT_FPS.load(Ordering::Relaxed);
    let skipped = PRESENT_SKIPPED.load(Ordering::Relaxed);
    let e2e_p50_ms = as_ms(&E2E_P50_US);
    let e2e_p95_ms = as_ms(&E2E_P95_US);
    let display_p50_ms = as_ms(&DISPLAY_P50_US);

    PresentStats {
        fps,
        skipped,
        e2e_p50_ms,
        e2e_p95_ms,
        display_p50_ms,
    }
}
/// State the UI thread hands to the render thread. Width and height share ONE atomic
/// (`w << 32 | h`), so a reader can never observe a new width paired with an old height.
pub struct RenderShared {
    /// Packed panel size: width in the high 32 bits, height in the low 32 bits.
    size_px: AtomicU64,
    /// Latest DPI value, stored independently of the size.
    dpi: AtomicU32,
    /// Raised to ask the render loop to exit.
    stop: AtomicBool,
}

impl RenderShared {
    /// Builds the shared state with an initial size and DPI; the stop flag starts cleared.
    pub fn new(width: u32, height: u32, dpi: u32) -> Arc<Self> {
        let initial = Self {
            size_px: AtomicU64::new(pack(width, height)),
            dpi: AtomicU32::new(dpi),
            stop: AtomicBool::new(false),
        };
        Arc::new(initial)
    }

    /// Publishes a new size as a single atomic store.
    pub fn set_size(&self, width: u32, height: u32) {
        let packed = pack(width, height);
        self.size_px.store(packed, Ordering::Relaxed);
    }

    /// Publishes a new DPI value.
    pub fn set_dpi(&self, dpi: u32) {
        self.dpi.store(dpi, Ordering::Relaxed);
    }

    /// Reads `(width, height, dpi)`. Width and height come from one load and so always belong
    /// together; the DPI is a second, independent load.
    fn snapshot(&self) -> (u32, u32, u32) {
        let packed = self.size_px.load(Ordering::Relaxed);
        let dpi = self.dpi.load(Ordering::Relaxed);
        let width = (packed >> 32) as u32;
        // The truncating cast keeps exactly the low 32 bits, i.e. the height.
        let height = packed as u32;
        (width, height, dpi)
    }
}
/// Packs a width/height pair into one `u64`: width in the high 32 bits, height in the low 32.
fn pack(w: u32, h: u32) -> u64 {
    let high = u64::from(w) << 32;
    let low = u64::from(h);
    high | low
}
/// Handle owned by the stream page; stops + joins the thread on unmount (and on drop, so a
/// navigation away can't leak a presenting thread).
pub struct RenderThread {
shared: Arc<RenderShared>,
join: Option<std::thread::JoinHandle<()>>,
}
impl RenderThread {
pub fn shared(&self) -> &Arc<RenderShared> {
&self.shared
}
pub fn stop_and_join(&mut self) {
self.shared.stop.store(true, Ordering::SeqCst);
if let Some(j) = self.join.take() {
let _ = j.join();
}
}
}
/// Dropping the handle stops and joins the render thread, so the thread cannot outlive its
/// owner. If `stop_and_join` already ran, the join handle is gone and only the stop flag is
/// stored again.
impl Drop for RenderThread {
fn drop(&mut self) {
self.stop_and_join();
}
}
/// Moves the presenter (COM interfaces, `!Send` by default) onto the render thread. Sound here:
/// the shared device + immediate context are multithread-protected (see `crate::gpu`), D3D/DXGI
/// objects are apartment-agile, and after this one handoff the swapchain/RTV/context calls happen
/// on exactly the render thread — the same single-owner discipline as `SharedDevice`.
struct SendPresenter(Presenter);
// SAFETY: relies on the invariants stated in the doc comment on `SendPresenter`. Within this
// file the wrapper is built once in `spawn`, moved into the thread closure, and unwrapped once
// at the top of `run`; nothing else touches the inner `Presenter`.
// NOTE(review): the multithread protection of the shared device lives in `crate::gpu` and is
// not visible from here — re-confirm it if that module changes.
unsafe impl Send for SendPresenter {}
/// Starts the `pf-render` thread and returns the handle that stops and joins it.
///
/// `frames` carries `(frame, FrameTimes)`; `clock_offset_ns` maps our wall clock onto the
/// host's so the end-to-end (capture→on-glass) number is cross-machine valid (same math as the
/// pump's host+network stage). `shared` is kept by the returned handle and cloned for the thread.
///
/// # Panics
/// Panics if the OS refuses to create the thread.
pub fn spawn(
    presenter: Presenter,
    frames: FrameRx,
    shared: Arc<RenderShared>,
    clock_offset_ns: i64,
) -> RenderThread {
    // The presenter crosses the thread boundary exactly once, inside its `Send` wrapper.
    let payload = SendPresenter(presenter);
    let thread_shared = shared.clone();
    let builder = std::thread::Builder::new().name("pf-render".into());
    let handle = builder
        .spawn(move || run(payload, frames, thread_shared, clock_offset_ns))
        .expect("spawn render thread");
    RenderThread {
        shared,
        join: Some(handle),
    }
}
/// Wall-clock time as nanoseconds since the Unix epoch; 0 if the system clock reads earlier
/// than the epoch.
fn now_ns() -> u64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    match since_epoch {
        // `as_nanos` is a u128; the cast keeps the low 64 bits.
        Ok(elapsed) => elapsed.as_nanos() as u64,
        Err(_) => 0,
    }
}
/// Looks up the DPI of the window titled "Punktfunk". The render loop calls this about once a
/// second as belt-and-braces for a monitor move that changes DPI without a `SizeChanged` (same
/// DIP size on both screens).
///
/// Returns `None` when no such window is found (e.g. headless) or when the DPI query yields 0.
fn poll_window_dpi() -> Option<u32> {
    use windows::Win32::UI::HiDpi::GetDpiForWindow;
    use windows::Win32::UI::WindowsAndMessaging::FindWindowW;

    // SAFETY: both calls are plain Win32 queries; the only inputs are a compile-time wide
    // string literal and the handle `FindWindowW` has just returned.
    let dpi = unsafe {
        let hwnd = FindWindowW(None, windows::core::w!("Punktfunk")).ok()?;
        GetDpiForWindow(hwnd)
    };

    // A DPI of 0 is treated as "no answer".
    if dpi == 0 {
        None
    } else {
        Some(dpi)
    }
}
/// Index-based percentile over an ascending-sorted window:
/// `sorted[min(len * pct / 100, len - 1)]`, or 0 when the window is empty.
fn percentile_us(sorted: &[u64], pct: usize) -> u64 {
    let idx = (sorted.len() * pct / 100).min(sorted.len().saturating_sub(1));
    sorted.get(idx).copied().unwrap_or(0)
}

/// Closes one stats window: sorts the samples, logs the window, publishes it to the HUD statics
/// and empties both sample buffers (their capacity is kept for the next window).
fn publish_window(
    presented: u32,
    dropped: u32,
    e2e_us: &mut Vec<u64>,
    display_us: &mut Vec<u64>,
) {
    e2e_us.sort_unstable();
    display_us.sort_unstable();
    let e2e_p50 = percentile_us(e2e_us.as_slice(), 50);
    let e2e_p95 = percentile_us(e2e_us.as_slice(), 95);
    let display_p50 = percentile_us(display_us.as_slice(), 50);
    tracing::debug!(
        presented,
        dropped,
        e2e_p50_us = e2e_p50,
        e2e_p95_us = e2e_p95,
        display_p50_us = display_p50,
        "render window"
    );
    PRESENT_FPS.store(presented, Ordering::Relaxed);
    PRESENT_SKIPPED.store(dropped, Ordering::Relaxed);
    E2E_P50_US.store(e2e_p50, Ordering::Relaxed);
    E2E_P95_US.store(e2e_p95, Ordering::Relaxed);
    DISPLAY_P50_US.store(display_p50, Ordering::Relaxed);
    e2e_us.clear();
    display_us.clear();
}

/// Body of the render thread.
///
/// Each iteration: wait up to 50 ms for a frame, apply any pending size/DPI change, and — if
/// there is a new frame or the panel was resized — wait for a present slot, drain the channel
/// to the newest frame, present it and sample the latency numbers. The 1 s stats window is
/// rolled on EVERY iteration, including idle ones, so a stalled stream publishes an empty
/// window (0 fps) instead of leaving the previous window's numbers on the HUD, and a window
/// never silently stretches across a stall.
///
/// Exits when the stop flag is raised or the frame channel disconnects.
fn run(presenter: SendPresenter, frames: FrameRx, shared: Arc<RenderShared>, clock_offset_ns: i64) {
    let mut p = presenter.0;
    let mut applied = (0u32, 0u32, 0u32); // last (w, h, dpi) handed to the presenter
    let mut presented = 0u32;
    let mut dropped = 0u32;
    // 1 s tumbling windows: end-to-end (capture→displayed) and the display stage
    // (decoded→displayed), sampled post-Present. Percentiles only (spec: stats-unification.md).
    let mut e2e_us: Vec<u64> = Vec::with_capacity(256);
    let mut display_us: Vec<u64> = Vec::with_capacity(256);
    let mut window_start = Instant::now();
    let mut last_dpi_poll = Instant::now();
    // A new session must never show the previous render thread's numbers.
    PRESENT_FPS.store(0, Ordering::Relaxed);
    PRESENT_SKIPPED.store(0, Ordering::Relaxed);
    E2E_P50_US.store(0, Ordering::Relaxed);
    E2E_P95_US.store(0, Ordering::Relaxed);
    DISPLAY_P50_US.store(0, Ordering::Relaxed);
    loop {
        if shared.stop.load(Ordering::SeqCst) {
            break;
        }
        // The 50 ms timeout bounds how long a stop request, a resize or a window roll can wait
        // while no frames arrive.
        let first = match frames.recv_timeout(Duration::from_millis(50)) {
            Ok(f) => Some(f),
            Err(RecvTimeoutError::Timeout) => None,
            Err(RecvTimeoutError::Disconnected) => break,
        };
        if last_dpi_poll.elapsed() >= Duration::from_secs(1) {
            last_dpi_poll = Instant::now();
            if let Some(dpi) = poll_window_dpi() {
                shared.set_dpi(dpi);
            }
        }
        let snap = shared.snapshot();
        let resized = snap != applied && snap.0 > 0 && snap.1 > 0;
        if resized {
            p.resize(snap.0, snap.1, snap.2);
            applied = snap;
        }
        // Nothing new to show → skip the present entirely (don't burn GPU re-presenting a
        // static frame), but still fall through to the stats window below.
        if first.is_some() || resized {
            // Throttle to the compositor: with ≤1 present outstanding this returns as DWM frees
            // a slot, and frames decoded meanwhile are drained below so the newest is drawn.
            if !p.wait_present_slot(1000) {
                tracing::debug!("frame-latency waitable timed out — presenting anyway");
            }
            let mut newest = first;
            while let Ok(f) = frames.try_recv() {
                if newest.is_some() {
                    dropped += 1;
                }
                newest = Some(f);
            }
            // The session pump is the sole 0xCE consumer and stashes the latest here (rare
            // updates). Copy the value out first so the lock is released before the presenter
            // call rather than held across it.
            let hdr_meta = *crate::present::LATEST_HDR_META.lock().unwrap();
            if let Some(meta) = hdr_meta {
                p.set_hdr_metadata(meta);
            }
            let times: Option<FrameTimes> = newest.as_ref().map(|(_, t)| *t);
            // `None` here means a resize redraw of the held frame.
            p.present(newest.map(|(f, _)| f));
            presented += 1;
            if let Some(t) = times {
                // The `displayed` point: post-Present() on this thread (the honest best-effort
                // presentation instant on Windows — endpoint label `capture→on-glass`).
                let displayed_ns = now_ns();
                // End-to-end = capture → displayed, host-clock corrected, measured directly
                // (never the sum of stage percentiles). Clamped (0, 10 s).
                let e2e = (displayed_ns as i128 + clock_offset_ns as i128 - t.pts_ns as i128)
                    .max(0) as u64;
                if e2e > 0 && e2e < 10_000_000_000 {
                    e2e_us.push(e2e / 1000);
                }
                // `display` stage = decoded → displayed, single-clock client-local.
                let disp = displayed_ns.saturating_sub(t.decoded_ns);
                if disp < 10_000_000_000 {
                    display_us.push(disp / 1000);
                }
            }
        }
        if window_start.elapsed() >= Duration::from_secs(1) {
            publish_window(presented, dropped, &mut e2e_us, &mut display_us);
            window_start = Instant::now();
            presented = 0;
            dropped = 0;
        }
    }
    tracing::info!("render thread exiting");
}