feat(host/vaapi): submit-split instrumentation + async_depth knob (depth 1 stays default)

Chasing the 8 ms submit at 1440p on the 780M: the sampled PUNKTFUNK_PERF split (push/pull/send) shows desc+buffersrc at ~5 µs, hwmap-import+VPP CSC at ~0.2-0.5 ms, and avcodec_send_frame owning the rest — so neither a VA-surface import cache nor CSC overlap would help. Two facts landed:

(1) async_depth>=2 in libavcodec's vaapi_encode is a structural +1-frame latency (frame N's packet only materializes when N+1 queues; measured 18 ms vs 8.3 ms p50 at depth 1) — depth 1 stays the default, PUNKTFUNK_VAAPI_ASYNC_DEPTH exists for pixel rates beyond the ASIC's serial budget, and poll() now does a bounded in-flight wait so a deeper depth still ships the AU as soon as the ASIC finishes.

(2) The residual send_frame block tracks GPU CLOCKS, not the ASIC: ~8 ms/frame at a 60 fps duty cycle vs ~4.4 ms at 120 fps pacing vs 3.5 ms back-to-back (270 fps CLI benchmark, even at -async_depth 1) — the clock-sag fix lands in gpuclocks.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 13:03:04 +00:00
parent 12a3944156
commit fd1086074b
2 changed files with 71 additions and 6 deletions
@@ -215,7 +215,20 @@ unsafe fn open_vaapi_encoder_mode(
    (*raw).hw_frames_ctx = ffi::av_buffer_ref(frames_ref);

    let mut opts = Dictionary::new();
-    opts.set("async_depth", "1"); // one-in/one-out — minimal encode-pipeline latency
+    // async_depth=1: `send_frame` blocks until THIS frame's ASIC encode completes — the lowest
+    // latency structure libavcodec's vaapi_encode offers. Measured on the 780M at 1440p60: depth 1
+    // = 8.3 ms end-to-end p50 vs depth 2 = 18 ms, because with depth ≥ 2 frame N's packet only
+    // materializes once frame N+1 is queued (a structural +1-frame delay no poll can beat). The
+    // knob exists for pixel rates beyond the ASIC's serial budget (e.g. 1440p120+ on an iGPU),
+    // where depth 2 restores throughput at that one-frame cost. NOTE: the per-frame block tracks
+    // GPU CLOCKS — a paced 60 fps trickle lets the VCN downclock (~8 ms/frame vs ~4.4 ms hot);
+    // see `gpuclocks` for the session clock pin that removes the ramp tax.
+    let depth = std::env::var("PUNKTFUNK_VAAPI_ASYNC_DEPTH")
+        .ok()
+        .and_then(|s| s.parse::<u32>().ok())
+        .filter(|d| (1..=8).contains(d))
+        .unwrap_or(1);
+    opts.set("async_depth", &depth.to_string());
    if low_power {
        opts.set("low_power", "1"); // VDEnc — the only encode entrypoint on modern Intel
    }
@@ -574,6 +587,10 @@ struct DmabufInner {
    width: u32,
    height: u32,
    fourcc: u32,
+    /// Frames submitted — drives the sampled `PUNKTFUNK_PERF` breakdown of the synchronous
+    /// submit (descriptor push vs import+CSC pull vs encoder send), the stage that dominates
+    /// AMD/Intel host latency (7.9 ms p50 at 1440p on the 780M).
+    frames: u64,
 }

 impl DmabufInner {
@@ -804,6 +821,7 @@ impl DmabufInner {
                width,
                height,
                fourcc: drm_fourcc,
+                frames: 0,
            })
        }
    }
@@ -815,6 +833,14 @@ impl DmabufInner {
            dmabuf.fourcc,
            self.fourcc
        );
+        // Sampled split of this synchronous submit under PUNKTFUNK_PERF: push = descriptor build
+        // + buffersrc, pull = buffersink (hwmap's per-frame DRM→VA import + VPP CSC + any sync),
+        // send = avcodec_send_frame. One line per 120 submits (~2 s at 60 fps, ~1 s at 120 fps).
+        let sample = crate::config::config().perf && self.frames % 120 == 0;
+        self.frames += 1;
+        let t0 = std::time::Instant::now();
+        let t_push: std::time::Duration;
+        let t_pull: std::time::Duration;
        // SAFETY: The `ensure!` above checked `dmabuf.fourcc == self.fourcc`.
        //  * `std::mem::zeroed::<AVDRMFrameDescriptor>()` is sound: it is a `#[repr(C)]` POD of ints and
        //    nested int-struct arrays (no `NonNull`/refs), for which all-zero is a valid bit pattern;
@@ -883,6 +909,7 @@ impl DmabufInner {
            if r < 0 {
                bail!("av_buffersrc_add_frame failed ({r})");
            }
+            t_push = t0.elapsed();
            let mut nv12 = ffi::av_frame_alloc();
            if nv12.is_null() {
                bail!("av_frame_alloc(nv12) failed");
@@ -892,6 +919,7 @@ impl DmabufInner {
                ffi::av_frame_free(&mut nv12);
                bail!("av_buffersink_get_frame failed ({r})");
            }
+            t_pull = t0.elapsed() - t_push;
            (*nv12).pts = pts;
            (*nv12).pict_type = if idr {
                ffi::AVPictureType::AV_PICTURE_TYPE_I
@@ -904,6 +932,16 @@ impl DmabufInner {
                bail!("avcodec_send_frame(VAAPI) failed ({r})");
            }
        }
+        if sample {
+            let t_send = t0.elapsed() - t_push - t_pull;
+            tracing::info!(
+                push_us = t_push.as_micros() as u64,
+                pull_us = t_pull.as_micros() as u64,
+                send_us = t_send.as_micros() as u64,
+                "VAAPI submit split (sampled): push=desc+buffersrc pull=hwmap-import+VPP-CSC \
+                 send=avcodec_send_frame"
+            );
+        }
        Ok(())
    }
 }
@@ -944,6 +982,10 @@ pub struct VaapiEncoder {
    inner: Option<Inner>,
    frame_idx: i64,
    force_kf: bool,
+    /// Frames sent to the encoder but not yet returned as packets. Gates [`poll`](Encoder::poll)'s
+    /// bounded wait: with `async_depth > 1` a submitted frame's AU lands ~ASIC-time later, so poll
+    /// briefly waits for it (same-tick delivery) — but only when something is actually in flight.
+    in_flight: u32,
 }

 // Raw FFI pointers; the encoder lives on a single thread (same contract as `NvencEncoder`).
@@ -997,6 +1039,7 @@ impl VaapiEncoder {
            inner: None,
            frame_idx: 0,
            force_kf: false,
+            in_flight: 0,
        })
    }

@@ -1054,7 +1097,9 @@ impl Encoder for VaapiEncoder {
                "VAAPI encoder received a CUDA frame — that payload is NVENC-only; \
                 unset PUNKTFUNK_ZEROCOPY or don't force PUNKTFUNK_ENCODER=vaapi on an NVIDIA host"
            ),
-        }
+        }?;
+        self.in_flight += 1;
+        Ok(())
    }

    fn request_keyframe(&mut self) {
@@ -1062,10 +1107,30 @@ impl Encoder for VaapiEncoder {
    }

    fn poll(&mut self) -> Result<Option<EncodedFrame>> {
-        match &mut self.inner {
-            Some(Inner::Cpu(c)) => poll_encoder(&mut c.enc, self.fps),
-            Some(Inner::Dmabuf(d)) => poll_encoder(&mut d.enc, self.fps),
-            None => Ok(None),
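+        // With `async_depth > 1`, `submit` no longer waits for the ASIC — the AU for the frame we
+        // just sent lands ~one hardware-encode-time later. Wait for it (bounded) so it still ships
+        // this tick: the same blocking-retrieve model as NVENC's lock_bitstream, at the ASIC's
+        // real per-frame latency instead of send_frame's synchronous ~2× wait. The budget is 3/4
+        // of a frame interval (capped 12 ms); on expiry return None — the AU rides the next poll.
+        // NOTE(review): the `async_depth` comment in `open_vaapi_encoder_mode` says frame N's
+        // packet only materializes once frame N+1 is queued; if that holds, this wait cannot ship
+        // it and just sleeps out the budget whenever `in_flight > 0` — reconcile the two.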
+        let enc = match &mut self.inner {
+            Some(Inner::Cpu(c)) => &mut c.enc,
+            Some(Inner::Dmabuf(d)) => &mut d.enc,
+            None => return Ok(None),
+        };
+        let budget = std::time::Duration::from_micros(750_000 / self.fps.max(1) as u64)
+            .min(std::time::Duration::from_millis(12));
+        let deadline = std::time::Instant::now() + budget;
+        loop {
+            if let Some(au) = poll_encoder(enc, self.fps)? {
+                self.in_flight = self.in_flight.saturating_sub(1);
+                return Ok(Some(au));
+            }
+            // Nothing ready: only wait when a frame is actually in flight (a drained/EOF'd
+            // encoder must not spin the budget), and give the ASIC ~250 µs between checks.
+            if self.in_flight == 0 || std::time::Instant::now() >= deadline {
+                return Ok(None);
+            }
+            std::thread::sleep(std::time::Duration::from_micros(250));
        }
    }