feat(host/vaapi): submit-split instrumentation + async_depth knob (depth 1 stays default)

Chasing the 8ms submit at 1440p on the 780M: the sampled PUNKTFUNK_PERF
split (push/pull/send) shows desc+buffersrc at ~5us, hwmap-import+VPP
CSC at ~0.2-0.5ms, and avcodec_send_frame owning the rest — so neither
a VA-surface import cache nor CSC overlap would help. Two facts landed:
(1) async_depth>=2 in libavcodec's vaapi_encode is a structural
+1-frame latency (frame N's packet only materializes when N+1 queues;
measured 18ms p50 at depth 2 vs 8.3ms at depth 1) — depth 1 stays the default,
PUNKTFUNK_VAAPI_ASYNC_DEPTH exists for pixel rates beyond the ASIC's
serial budget, and poll() now does a bounded in-flight wait so a deeper
depth still ships the AU as soon as the ASIC finishes. (2) The residual
send_frame block tracks GPU CLOCKS, not the ASIC: ~8ms/frame at a 60fps
duty cycle vs ~4.4ms at 120fps pacing vs 3.5ms back-to-back (270fps CLI
benchmark, even at -async_depth 1) — the clock-sag fix lands in
gpuclocks.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-07-02 13:03:04 +00:00
parent 16f72da725
commit 68c92f6874
2 changed files with 71 additions and 6 deletions
@@ -215,7 +215,20 @@ unsafe fn open_vaapi_encoder_mode(
(*raw).hw_frames_ctx = ffi::av_buffer_ref(frames_ref); (*raw).hw_frames_ctx = ffi::av_buffer_ref(frames_ref);
let mut opts = Dictionary::new(); let mut opts = Dictionary::new();
opts.set("async_depth", "1"); // one-in/one-out — minimal encode-pipeline latency // async_depth=1: `send_frame` blocks until THIS frame's ASIC encode completes — the lowest
// latency structure libavcodec's vaapi_encode offers. Measured on the 780M at 1440p60: depth 1
// = 8.3 ms end-to-end p50 vs depth 2 = 18 ms, because with depth ≥ 2 frame N's packet only
// materializes once frame N+1 is queued (a structural +1-frame delay no poll can beat). The
// knob exists for pixel rates beyond the ASIC's serial budget (e.g. 1440p120+ on an iGPU),
// where depth 2 restores throughput at that one-frame cost. NOTE: the per-frame block tracks
// GPU CLOCKS — a paced 60 fps trickle lets the VCN downclock (~8 ms/frame vs ~4.4 ms hot);
// see `gpuclocks` for the session clock pin that removes the ramp tax.
let depth = std::env::var("PUNKTFUNK_VAAPI_ASYNC_DEPTH")
.ok()
.and_then(|s| s.parse::<u32>().ok())
.filter(|d| (1..=8).contains(d))
.unwrap_or(1);
opts.set("async_depth", &depth.to_string());
if low_power { if low_power {
opts.set("low_power", "1"); // VDEnc — the only encode entrypoint on modern Intel opts.set("low_power", "1"); // VDEnc — the only encode entrypoint on modern Intel
} }
@@ -574,6 +587,10 @@ struct DmabufInner {
width: u32, width: u32,
height: u32, height: u32,
fourcc: u32, fourcc: u32,
/// Frames submitted — drives the sampled `PUNKTFUNK_PERF` breakdown of the synchronous
/// submit (import+push vs CSC pull vs encoder send), the stage that dominates AMD/Intel
/// host latency (7.9 ms p50 at 1440p on the 780M).
frames: u64,
} }
impl DmabufInner { impl DmabufInner {
@@ -804,6 +821,7 @@ impl DmabufInner {
width, width,
height, height,
fourcc: drm_fourcc, fourcc: drm_fourcc,
frames: 0,
}) })
} }
} }
@@ -815,6 +833,14 @@ impl DmabufInner {
dmabuf.fourcc, dmabuf.fourcc,
self.fourcc self.fourcc
); );
// Sampled breakdown of this synchronous submit under PUNKTFUNK_PERF: push = descriptor
// build + buffersrc (the per-frame DRM→VA import happens inside hwmap on the pull path),
// pull = buffersink (VPP CSC + any sync), send = avcodec_send_frame. One line per 120 frames
// (~2 s at 60 fps).
let sample = crate::config::config().perf && self.frames % 120 == 0;
self.frames += 1;
let t0 = std::time::Instant::now();
let t_push: std::time::Duration;
let t_pull: std::time::Duration;
// SAFETY: The `ensure!` above checked `dmabuf.fourcc == self.fourcc`. // SAFETY: The `ensure!` above checked `dmabuf.fourcc == self.fourcc`.
// * `std::mem::zeroed::<AVDRMFrameDescriptor>()` is sound: it is a `#[repr(C)]` POD of ints and // * `std::mem::zeroed::<AVDRMFrameDescriptor>()` is sound: it is a `#[repr(C)]` POD of ints and
// nested int-struct arrays (no `NonNull`/refs), for which all-zero is a valid bit pattern; // nested int-struct arrays (no `NonNull`/refs), for which all-zero is a valid bit pattern;
@@ -883,6 +909,7 @@ impl DmabufInner {
if r < 0 { if r < 0 {
bail!("av_buffersrc_add_frame failed ({r})"); bail!("av_buffersrc_add_frame failed ({r})");
} }
t_push = t0.elapsed();
let mut nv12 = ffi::av_frame_alloc(); let mut nv12 = ffi::av_frame_alloc();
if nv12.is_null() { if nv12.is_null() {
bail!("av_frame_alloc(nv12) failed"); bail!("av_frame_alloc(nv12) failed");
@@ -892,6 +919,7 @@ impl DmabufInner {
ffi::av_frame_free(&mut nv12); ffi::av_frame_free(&mut nv12);
bail!("av_buffersink_get_frame failed ({r})"); bail!("av_buffersink_get_frame failed ({r})");
} }
t_pull = t0.elapsed() - t_push;
(*nv12).pts = pts; (*nv12).pts = pts;
(*nv12).pict_type = if idr { (*nv12).pict_type = if idr {
ffi::AVPictureType::AV_PICTURE_TYPE_I ffi::AVPictureType::AV_PICTURE_TYPE_I
@@ -904,6 +932,16 @@ impl DmabufInner {
bail!("avcodec_send_frame(VAAPI) failed ({r})"); bail!("avcodec_send_frame(VAAPI) failed ({r})");
} }
} }
if sample {
let t_send = t0.elapsed() - t_push - t_pull;
tracing::info!(
push_us = t_push.as_micros() as u64,
pull_us = t_pull.as_micros() as u64,
send_us = t_send.as_micros() as u64,
"VAAPI submit split (sampled): push=desc+buffersrc pull=hwmap-import+VPP-CSC \
send=avcodec_send_frame"
);
}
Ok(()) Ok(())
} }
} }
@@ -944,6 +982,10 @@ pub struct VaapiEncoder {
inner: Option<Inner>, inner: Option<Inner>,
frame_idx: i64, frame_idx: i64,
force_kf: bool, force_kf: bool,
/// Frames sent to the encoder but not yet returned as packets. Gates [`poll`](Encoder::poll)'s
/// bounded wait: with `async_depth > 1` a submitted frame's AU lands ~ASIC-time later, so poll
/// briefly waits for it (same-tick delivery) — but only when something is actually in flight.
in_flight: u32,
} }
// Raw FFI pointers; the encoder lives on a single thread (same contract as `NvencEncoder`). // Raw FFI pointers; the encoder lives on a single thread (same contract as `NvencEncoder`).
@@ -997,6 +1039,7 @@ impl VaapiEncoder {
inner: None, inner: None,
frame_idx: 0, frame_idx: 0,
force_kf: false, force_kf: false,
in_flight: 0,
}) })
} }
@@ -1054,7 +1097,9 @@ impl Encoder for VaapiEncoder {
"VAAPI encoder received a CUDA frame — that payload is NVENC-only; \ "VAAPI encoder received a CUDA frame — that payload is NVENC-only; \
unset PUNKTFUNK_ZEROCOPY or don't force PUNKTFUNK_ENCODER=vaapi on an NVIDIA host" unset PUNKTFUNK_ZEROCOPY or don't force PUNKTFUNK_ENCODER=vaapi on an NVIDIA host"
), ),
} }?;
self.in_flight += 1;
Ok(())
} }
fn request_keyframe(&mut self) { fn request_keyframe(&mut self) {
@@ -1062,10 +1107,30 @@ impl Encoder for VaapiEncoder {
} }
fn poll(&mut self) -> Result<Option<EncodedFrame>> { fn poll(&mut self) -> Result<Option<EncodedFrame>> {
match &mut self.inner { // With `async_depth > 1`, `submit` no longer waits for the ASIC — the AU for the frame we
Some(Inner::Cpu(c)) => poll_encoder(&mut c.enc, self.fps), // just sent lands ~one hardware-encode-time later. Wait for it (bounded) so it still ships
Some(Inner::Dmabuf(d)) => poll_encoder(&mut d.enc, self.fps), // this tick: the same blocking-retrieve model as NVENC's lock_bitstream, at the ASIC's
None => Ok(None), // real per-frame latency instead of send_frame's synchronous ~2× wait. The budget is 3/4
// of a frame interval (capped 12 ms); on expiry return None — the AU rides the next poll.
let enc = match &mut self.inner {
Some(Inner::Cpu(c)) => &mut c.enc,
Some(Inner::Dmabuf(d)) => &mut d.enc,
None => return Ok(None),
};
let budget = std::time::Duration::from_micros(750_000 / self.fps.max(1) as u64)
.min(std::time::Duration::from_millis(12));
let deadline = std::time::Instant::now() + budget;
loop {
if let Some(au) = poll_encoder(enc, self.fps)? {
self.in_flight = self.in_flight.saturating_sub(1);
return Ok(Some(au));
}
// Nothing ready: only wait when a frame is actually in flight (a drained/EOF'd
// encoder must not spin the budget), and give the ASIC ~250 µs between checks.
if self.in_flight == 0 || std::time::Instant::now() >= deadline {
return Ok(None);
}
std::thread::sleep(std::time::Duration::from_micros(250));
} }
} }