09a5957c6d
One stat model everywhere (design/stats-unification.md): four measurement points (capture/received/decoded/displayed), three stages that tile the interval exactly, and a HUD that shows the addition explicitly — end-to-end 14.2 ms p50 · 19.8 p95 · capture→on-glass = host+network 9.8 + decode 2.1 + display 2.3 replacing each client's ad-hoc mix of overlapping absolutes (the Apple HUD's three arrow lines that looked sequential but weren't), mean-vs-median decode times (Windows/Linux), missing same-host-clock flags (Windows/Linux), and three different names for the same capture→received measurement (probe's "reassembled", Apple/Android's "client", Windows/Linux's post-decode "lat"). Per client: Apple threads receivedNs through the VT decode via the frame refcon bit pattern so the decode stage exists at all (stage-1 fallback honestly degrades to a capture→received headline); Windows carries FrameTimes through the existing frame channel to the render thread and adds e2e p50/p95 post-Present; Linux stamps received at AU pop and rides decoded_ns on DecodedFrame to the paintable-set site; Android pairs receipt stamps with MediaCodec output buffers via the codec's pts round-trip (JNI stats array 14→16 doubles, indexes 0-13 unchanged). fps now uniformly counts received AUs; lost/(received+lost) per window, hidden at zero. docs-site gains "Understanding the Stats Overlay": what each line means, why the equation only approximately sums (percentiles), and a line-by-line Moonlight/Sunshine matrix — including that Moonlight has no end-to-end number and its "network latency" is an ENet control RTT, so punktfunk's headline must not be compared against any single Moonlight line. Verified here: linux client + probe + core check/clippy/fmt green, android native cargo-ndk arm64 check green. Pending: Windows CI + on-glass, swift test on the mac, on-device Android. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
279 lines
11 KiB
Rust
279 lines
11 KiB
Rust
//! The dedicated video render thread: decoded frames flow session pump → bounded channel → here →
//! `Presenter::present`. Presenting off the XAML thread means UI jank (layout, input, dialogs)
//! never stalls video, and a filled present queue never blocks the UI thread — the two failure
//! modes of the old present-from-`on_rendering` design.
//!
//! Pacing: block on the channel (the host paces the stream), then on the swapchain's
//! frame-latency waitable (≤1 queued present — see `present.rs`), then drain to the NEWEST frame
//! so a stream faster than the display drops backlog before any GPU work. The UI thread only
//! writes panel size/DPI into [`RenderShared`] atomics; the loop applies them before the next
//! draw (and redraws the held frame after a resize — fresh back buffers are blank).
|
|
|
|
use crate::present::Presenter;
|
|
use crate::session::{FrameRx, FrameTimes};
|
|
use crossbeam_channel::RecvTimeoutError;
|
|
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
|
|
use std::sync::Arc;
|
|
use std::time::{Duration, Instant};
|
|
|
|
/// The last 1-second render window, published for the HUD (one render thread at a time):
|
|
/// presents/s, frames skipped by the newest-wins drain, the end-to-end (capture→on-glass)
|
|
/// p50/p95 and the `display` stage (decoded→displayed) p50, all stamped post-`Present()`, in µs.
|
|
/// Zeroed when a render thread starts so a new session never shows the previous one's numbers.
|
|
static PRESENT_FPS: AtomicU32 = AtomicU32::new(0);
|
|
static PRESENT_SKIPPED: AtomicU32 = AtomicU32::new(0);
|
|
static E2E_P50_US: AtomicU64 = AtomicU64::new(0);
|
|
static E2E_P95_US: AtomicU64 = AtomicU64::new(0);
|
|
static DISPLAY_P50_US: AtomicU64 = AtomicU64::new(0);
|
|
|
|
/// The last render window's glass-side numbers (see the statics above) — the HUD's headline
|
|
/// (end-to-end) and trailing stage (display) come from here.
|
|
#[derive(Clone, Copy, Default, PartialEq)]
|
|
pub struct PresentStats {
|
|
/// Presents per second (includes resize redraws of a held frame).
|
|
pub fps: u32,
|
|
/// Frames dropped by the newest-wins drain this window (client-side pacing skips).
|
|
pub skipped: u32,
|
|
/// End-to-end capture→displayed p50, ms (host-clock corrected, measured directly).
|
|
pub e2e_p50_ms: f32,
|
|
/// End-to-end capture→displayed p95, ms.
|
|
pub e2e_p95_ms: f32,
|
|
/// `display` stage p50, ms: decoded → displayed, single-clock client-local.
|
|
pub display_p50_ms: f32,
|
|
}
|
|
|
|
pub fn present_stats() -> PresentStats {
|
|
PresentStats {
|
|
fps: PRESENT_FPS.load(Ordering::Relaxed),
|
|
skipped: PRESENT_SKIPPED.load(Ordering::Relaxed),
|
|
e2e_p50_ms: E2E_P50_US.load(Ordering::Relaxed) as f32 / 1000.0,
|
|
e2e_p95_ms: E2E_P95_US.load(Ordering::Relaxed) as f32 / 1000.0,
|
|
display_p50_ms: DISPLAY_P50_US.load(Ordering::Relaxed) as f32 / 1000.0,
|
|
}
|
|
}
|
|
|
|
/// UI-thread → render-thread state. Size is packed into ONE atomic (w<<32|h) so a resize never
|
|
/// tears into a (new-width, old-height) pair.
|
|
pub struct RenderShared {
|
|
size_px: AtomicU64,
|
|
dpi: AtomicU32,
|
|
stop: AtomicBool,
|
|
}
|
|
|
|
impl RenderShared {
|
|
pub fn new(width: u32, height: u32, dpi: u32) -> Arc<RenderShared> {
|
|
Arc::new(RenderShared {
|
|
size_px: AtomicU64::new(pack(width, height)),
|
|
dpi: AtomicU32::new(dpi),
|
|
stop: AtomicBool::new(false),
|
|
})
|
|
}
|
|
|
|
pub fn set_size(&self, width: u32, height: u32) {
|
|
self.size_px.store(pack(width, height), Ordering::Relaxed);
|
|
}
|
|
|
|
pub fn set_dpi(&self, dpi: u32) {
|
|
self.dpi.store(dpi, Ordering::Relaxed);
|
|
}
|
|
|
|
fn snapshot(&self) -> (u32, u32, u32) {
|
|
let s = self.size_px.load(Ordering::Relaxed);
|
|
((s >> 32) as u32, s as u32, self.dpi.load(Ordering::Relaxed))
|
|
}
|
|
}
|
|
|
|
fn pack(w: u32, h: u32) -> u64 {
|
|
((w as u64) << 32) | h as u64
|
|
}
|
|
|
|
/// Handle owned by the stream page; stops + joins the thread on unmount (and on drop, so a
|
|
/// navigation away can't leak a presenting thread).
|
|
pub struct RenderThread {
|
|
shared: Arc<RenderShared>,
|
|
join: Option<std::thread::JoinHandle<()>>,
|
|
}
|
|
|
|
impl RenderThread {
|
|
pub fn shared(&self) -> &Arc<RenderShared> {
|
|
&self.shared
|
|
}
|
|
|
|
pub fn stop_and_join(&mut self) {
|
|
self.shared.stop.store(true, Ordering::SeqCst);
|
|
if let Some(j) = self.join.take() {
|
|
let _ = j.join();
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Drop for RenderThread {
|
|
fn drop(&mut self) {
|
|
self.stop_and_join();
|
|
}
|
|
}
|
|
|
|
/// Moves the presenter (COM interfaces, `!Send` by default) onto the render thread. Sound here:
|
|
/// the shared device + immediate context are multithread-protected (see `crate::gpu`), D3D/DXGI
|
|
/// objects are apartment-agile, and after this one handoff the swapchain/RTV/context calls happen
|
|
/// on exactly the render thread — the same single-owner discipline as `SharedDevice`.
|
|
struct SendPresenter(Presenter);
|
|
unsafe impl Send for SendPresenter {}
|
|
|
|
/// Spawn the render thread. `frames` carries `(frame, FrameTimes)`; `clock_offset_ns` maps our
|
|
/// wall clock onto the host's so the end-to-end (capture→on-glass) number is cross-machine valid
|
|
/// (same math as the pump's host+network stage).
|
|
pub fn spawn(
|
|
presenter: Presenter,
|
|
frames: FrameRx,
|
|
shared: Arc<RenderShared>,
|
|
clock_offset_ns: i64,
|
|
) -> RenderThread {
|
|
let boxed = SendPresenter(presenter);
|
|
let shared_w = shared.clone();
|
|
let join = std::thread::Builder::new()
|
|
.name("pf-render".into())
|
|
.spawn(move || run(boxed, frames, shared_w, clock_offset_ns))
|
|
.expect("spawn render thread");
|
|
RenderThread {
|
|
shared,
|
|
join: Some(join),
|
|
}
|
|
}
|
|
|
|
fn now_ns() -> u64 {
|
|
std::time::SystemTime::now()
|
|
.duration_since(std::time::UNIX_EPOCH)
|
|
.map(|d| d.as_nanos() as u64)
|
|
.unwrap_or(0)
|
|
}
|
|
|
|
/// The window DPI, polled ~1 Hz as belt-and-braces for a monitor move that changes DPI without a
|
|
/// `SizeChanged` (same DIP size on both screens). `None` when the window isn't up (headless).
|
|
fn poll_window_dpi() -> Option<u32> {
|
|
use windows::Win32::UI::HiDpi::GetDpiForWindow;
|
|
use windows::Win32::UI::WindowsAndMessaging::FindWindowW;
|
|
unsafe {
|
|
let hwnd = FindWindowW(None, windows::core::w!("Punktfunk")).ok()?;
|
|
match GetDpiForWindow(hwnd) {
|
|
0 => None,
|
|
d => Some(d),
|
|
}
|
|
}
|
|
}
|
|
|
|
fn run(presenter: SendPresenter, frames: FrameRx, shared: Arc<RenderShared>, clock_offset_ns: i64) {
|
|
let mut p = presenter.0;
|
|
let mut applied = (0u32, 0u32, 0u32); // last (w, h, dpi) handed to the presenter
|
|
let mut presented = 0u32;
|
|
let mut dropped = 0u32;
|
|
// 1 s tumbling windows: end-to-end (capture→displayed) and the display stage
|
|
// (decoded→displayed), sampled post-Present. Percentiles only (spec: stats-unification.md).
|
|
let mut e2e_us: Vec<u64> = Vec::with_capacity(256);
|
|
let mut display_us: Vec<u64> = Vec::with_capacity(256);
|
|
let mut window_start = Instant::now();
|
|
let mut last_dpi_poll = Instant::now();
|
|
PRESENT_FPS.store(0, Ordering::Relaxed);
|
|
PRESENT_SKIPPED.store(0, Ordering::Relaxed);
|
|
E2E_P50_US.store(0, Ordering::Relaxed);
|
|
E2E_P95_US.store(0, Ordering::Relaxed);
|
|
DISPLAY_P50_US.store(0, Ordering::Relaxed);
|
|
|
|
loop {
|
|
if shared.stop.load(Ordering::SeqCst) {
|
|
break;
|
|
}
|
|
let first = match frames.recv_timeout(Duration::from_millis(50)) {
|
|
Ok(f) => Some(f),
|
|
Err(RecvTimeoutError::Timeout) => None,
|
|
Err(RecvTimeoutError::Disconnected) => break,
|
|
};
|
|
|
|
if last_dpi_poll.elapsed() >= Duration::from_secs(1) {
|
|
last_dpi_poll = Instant::now();
|
|
if let Some(dpi) = poll_window_dpi() {
|
|
shared.set_dpi(dpi);
|
|
}
|
|
}
|
|
let snap = shared.snapshot();
|
|
let resized = snap != applied && snap.0 > 0 && snap.1 > 0;
|
|
if resized {
|
|
p.resize(snap.0, snap.1, snap.2);
|
|
applied = snap;
|
|
}
|
|
if first.is_none() && !resized {
|
|
continue; // nothing new to show — don't burn GPU re-presenting a static frame
|
|
}
|
|
|
|
// Throttle to the compositor: with ≤1 present outstanding this returns as DWM frees a
|
|
// slot, and frames decoded meanwhile are drained below so the newest is what's drawn.
|
|
if !p.wait_present_slot(1000) {
|
|
tracing::debug!("frame-latency waitable timed out — presenting anyway");
|
|
}
|
|
let mut newest = first;
|
|
while let Ok(f) = frames.try_recv() {
|
|
if newest.is_some() {
|
|
dropped += 1;
|
|
}
|
|
newest = Some(f);
|
|
}
|
|
|
|
// The session pump is the sole 0xCE consumer and stashes the latest here (rare updates).
|
|
if let Some(meta) = *crate::present::LATEST_HDR_META.lock().unwrap() {
|
|
p.set_hdr_metadata(meta);
|
|
}
|
|
|
|
let times: Option<FrameTimes> = newest.as_ref().map(|(_, t)| *t);
|
|
p.present(newest.map(|(f, _)| f));
|
|
presented += 1;
|
|
if let Some(t) = times {
|
|
// The `displayed` point: post-Present() on this thread (the honest best-effort
|
|
// presentation instant on Windows — endpoint label `capture→on-glass`).
|
|
let displayed_ns = now_ns();
|
|
// End-to-end = capture → displayed, host-clock corrected, measured directly
|
|
// (never the sum of stage percentiles). Clamped (0, 10 s).
|
|
let e2e =
|
|
(displayed_ns as i128 + clock_offset_ns as i128 - t.pts_ns as i128).max(0) as u64;
|
|
if e2e > 0 && e2e < 10_000_000_000 {
|
|
e2e_us.push(e2e / 1000);
|
|
}
|
|
// `display` stage = decoded → displayed, single-clock client-local.
|
|
let disp = displayed_ns.saturating_sub(t.decoded_ns);
|
|
if disp < 10_000_000_000 {
|
|
display_us.push(disp / 1000);
|
|
}
|
|
}
|
|
|
|
if window_start.elapsed() >= Duration::from_secs(1) {
|
|
e2e_us.sort_unstable();
|
|
display_us.sort_unstable();
|
|
let p50 = |v: &[u64]| v.get(v.len() / 2).copied().unwrap_or(0);
|
|
// p95 = sorted[min(len*95/100, len-1)] — the empty-window case falls to 0 via `get`.
|
|
let p95 = |v: &[u64]| {
|
|
v.get((v.len() * 95 / 100).min(v.len().saturating_sub(1)))
|
|
.copied()
|
|
.unwrap_or(0)
|
|
};
|
|
tracing::debug!(
|
|
presented,
|
|
dropped,
|
|
e2e_p50_us = p50(&e2e_us),
|
|
e2e_p95_us = p95(&e2e_us),
|
|
display_p50_us = p50(&display_us),
|
|
"render window"
|
|
);
|
|
PRESENT_FPS.store(presented, Ordering::Relaxed);
|
|
PRESENT_SKIPPED.store(dropped, Ordering::Relaxed);
|
|
E2E_P50_US.store(p50(&e2e_us), Ordering::Relaxed);
|
|
E2E_P95_US.store(p95(&e2e_us), Ordering::Relaxed);
|
|
DISPLAY_P50_US.store(p50(&display_us), Ordering::Relaxed);
|
|
window_start = Instant::now();
|
|
presented = 0;
|
|
dropped = 0;
|
|
e2e_us.clear();
|
|
display_us.clear();
|
|
}
|
|
}
|
|
tracing::info!("render thread exiting");
|
|
}
|