From 68c92f6874567851154db8c1026b66e093e189df Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Thu, 2 Jul 2026 13:03:04 +0000 Subject: [PATCH] feat(host/vaapi): submit-split instrumentation + async_depth knob (depth 1 stays default) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chasing the 8ms submit at 1440p on the 780M: the sampled PUNKTFUNK_PERF split (push/pull/send) shows desc+buffersrc at ~5us, hwmap-import+VPP CSC at ~0.2-0.5ms, and avcodec_send_frame owning the rest — so neither a VA-surface import cache nor CSC overlap would help. Two facts landed: (1) async_depth>=2 in libavcodec's vaapi_encode is a structural +1-frame latency (frame N's packet only materializes when N+1 queues; measured 18ms vs 8.3ms p50 at depth 1) — depth 1 stays the default, PUNKTFUNK_VAAPI_ASYNC_DEPTH exists for pixel rates beyond the ASIC's serial budget, and poll() now does a bounded in-flight wait so a deeper depth still ships the AU as soon as the ASIC finishes. (2) The residual send_frame block tracks GPU CLOCKS, not the ASIC: ~8ms/frame at a 60fps duty cycle vs ~4.4ms at 120fps pacing vs 3.5ms back-to-back (270fps CLI benchmark, even at -async_depth 1) — the clock-sag fix lands in gpuclocks. Co-Authored-By: Claude Fable 5 --- .../punktfunk-host/src/encode/linux/vaapi.rs | 77 +++++++++++++++++-- .../src/linux/{nvclocks.rs => gpuclocks.rs} | 0 2 files changed, 71 insertions(+), 6 deletions(-) rename crates/punktfunk-host/src/linux/{nvclocks.rs => gpuclocks.rs} (100%) diff --git a/crates/punktfunk-host/src/encode/linux/vaapi.rs b/crates/punktfunk-host/src/encode/linux/vaapi.rs index 8e4a255..577c4b5 100644 --- a/crates/punktfunk-host/src/encode/linux/vaapi.rs +++ b/crates/punktfunk-host/src/encode/linux/vaapi.rs @@ -215,7 +215,20 @@ unsafe fn open_vaapi_encoder_mode( (*raw).hw_frames_ctx = ffi::av_buffer_ref(frames_ref); let mut opts = Dictionary::new(); - opts.set("async_depth", "1"); // one-in/one-out — minimal encode-pipeline latency + // async_depth=1: `send_frame` blocks until THIS frame's ASIC encode completes — the lowest + // latency structure libavcodec's vaapi_encode offers. Measured on the 780M at 1440p60: depth 1 + // = 8.3 ms end-to-end p50 vs depth 2 = 18 ms, because with depth ≥ 2 frame N's packet only + // materializes once frame N+1 is queued (a structural +1-frame delay no poll can beat). The + // knob exists for pixel rates beyond the ASIC's serial budget (e.g. 1440p120+ on an iGPU), + // where depth 2 restores throughput at that one-frame cost. NOTE: the per-frame block tracks + // GPU CLOCKS — a paced 60 fps trickle lets the VCN downclock (~8 ms/frame vs ~4.4 ms hot); + // see `gpuclocks` for the session clock pin that removes the ramp tax. + let depth = std::env::var("PUNKTFUNK_VAAPI_ASYNC_DEPTH") + .ok() + .and_then(|s| s.parse::().ok()) + .filter(|d| (1..=8).contains(d)) + .unwrap_or(1); + opts.set("async_depth", &depth.to_string()); if low_power { opts.set("low_power", "1"); // VDEnc — the only encode entrypoint on modern Intel } @@ -574,6 +587,10 @@ struct DmabufInner { width: u32, height: u32, fourcc: u32, + /// Frames submitted — drives the sampled `PUNKTFUNK_PERF` breakdown of the synchronous + /// submit (import+push vs CSC pull vs encoder send), the stage that dominates AMD/Intel + /// host latency (7.9 ms p50 at 1440p on the 780M). + frames: u64, } impl DmabufInner { @@ -804,6 +821,7 @@ impl DmabufInner { width, height, fourcc: drm_fourcc, + frames: 0, }) } } @@ -815,6 +833,14 @@ impl DmabufInner { dmabuf.fourcc, self.fourcc ); + // Sampled breakdown of this synchronous submit under PUNKTFUNK_PERF: push = descriptor + // build + buffersrc (the per-frame DRM→VA import happens inside hwmap on the pull path), + // pull = buffersink (VPP CSC + any sync), send = avcodec_send_frame. One line per ~2 s. + let sample = crate::config::config().perf && self.frames % 120 == 0; + self.frames += 1; + let t0 = std::time::Instant::now(); + let t_push: std::time::Duration; + let t_pull: std::time::Duration; // SAFETY: The `ensure!` above checked `dmabuf.fourcc == self.fourcc`. // * `std::mem::zeroed::()` is sound: it is a `#[repr(C)]` POD of ints and // nested int-struct arrays (no `NonNull`/refs), for which all-zero is a valid bit pattern; @@ -883,6 +909,7 @@ impl DmabufInner { if r < 0 { bail!("av_buffersrc_add_frame failed ({r})"); } + t_push = t0.elapsed(); let mut nv12 = ffi::av_frame_alloc(); if nv12.is_null() { bail!("av_frame_alloc(nv12) failed"); @@ -892,6 +919,7 @@ impl DmabufInner { ffi::av_frame_free(&mut nv12); bail!("av_buffersink_get_frame failed ({r})"); } + t_pull = t0.elapsed() - t_push; (*nv12).pts = pts; (*nv12).pict_type = if idr { ffi::AVPictureType::AV_PICTURE_TYPE_I @@ -904,6 +932,16 @@ impl DmabufInner { bail!("avcodec_send_frame(VAAPI) failed ({r})"); } } + if sample { + let t_send = t0.elapsed() - t_push - t_pull; + tracing::info!( + push_us = t_push.as_micros() as u64, + pull_us = t_pull.as_micros() as u64, + send_us = t_send.as_micros() as u64, + "VAAPI submit split (sampled): push=desc+buffersrc pull=hwmap-import+VPP-CSC \ + send=avcodec_send_frame" + ); + } Ok(()) } } @@ -944,6 +982,10 @@ pub struct VaapiEncoder { inner: Option, frame_idx: i64, force_kf: bool, + /// Frames sent to the encoder but not yet returned as packets. Gates [`poll`](Encoder::poll)'s + /// bounded wait: with `async_depth > 1` a submitted frame's AU lands ~ASIC-time later, so poll + /// briefly waits for it (same-tick delivery) — but only when something is actually in flight. + in_flight: u32, } // Raw FFI pointers; the encoder lives on a single thread (same contract as `NvencEncoder`). @@ -997,6 +1039,7 @@ impl VaapiEncoder { inner: None, frame_idx: 0, force_kf: false, + in_flight: 0, }) } @@ -1054,7 +1097,9 @@ impl Encoder for VaapiEncoder { "VAAPI encoder received a CUDA frame — that payload is NVENC-only; \ unset PUNKTFUNK_ZEROCOPY or don't force PUNKTFUNK_ENCODER=vaapi on an NVIDIA host" ), - } + }?; + self.in_flight += 1; + Ok(()) } fn request_keyframe(&mut self) { @@ -1062,10 +1107,30 @@ impl Encoder for VaapiEncoder { } fn poll(&mut self) -> Result> { - match &mut self.inner { - Some(Inner::Cpu(c)) => poll_encoder(&mut c.enc, self.fps), - Some(Inner::Dmabuf(d)) => poll_encoder(&mut d.enc, self.fps), - None => Ok(None), + // With `async_depth > 1`, `submit` no longer waits for the ASIC — the AU for the frame we + // just sent lands ~one hardware-encode-time later. Wait for it (bounded) so it still ships + // this tick: the same blocking-retrieve model as NVENC's lock_bitstream, at the ASIC's + // real per-frame latency instead of send_frame's synchronous ~2× wait. The budget is 3/4 + // of a frame interval (capped 12 ms); on expiry return None — the AU rides the next poll. + let enc = match &mut self.inner { + Some(Inner::Cpu(c)) => &mut c.enc, + Some(Inner::Dmabuf(d)) => &mut d.enc, + None => return Ok(None), + }; + let budget = std::time::Duration::from_micros(750_000 / self.fps.max(1) as u64) + .min(std::time::Duration::from_millis(12)); + let deadline = std::time::Instant::now() + budget; + loop { + if let Some(au) = poll_encoder(enc, self.fps)? { + self.in_flight = self.in_flight.saturating_sub(1); + return Ok(Some(au)); + } + // Nothing ready: only wait when a frame is actually in flight (a drained/EOF'd + // encoder must not spin the budget), and give the ASIC ~250 µs between checks. + if self.in_flight == 0 || std::time::Instant::now() >= deadline { + return Ok(None); + } + std::thread::sleep(std::time::Duration::from_micros(250)); } } diff --git a/crates/punktfunk-host/src/linux/nvclocks.rs b/crates/punktfunk-host/src/linux/gpuclocks.rs similarity index 100% rename from crates/punktfunk-host/src/linux/nvclocks.rs rename to crates/punktfunk-host/src/linux/gpuclocks.rs