feat(windows-host): IDD-push encodes native NV12/P010 (skip NVENC's SM-side CSC)
GPU-contention work (host-latency plan §5.A): the IDD-push output ring now hands NVENC native YUV instead of RGB, so NVENC skips its internal RGB→YUV colour conversion on the SM/3D engine the running game saturates. - idd_push.rs: out_ring is now NV12 (SDR, BT.709 limited) via a D3D11 VIDEO-engine BGRA→NV12 VideoConverter (keeps the CSC off the contended 3D/compute engine), or P010 (HDR, BT.2020 PQ limited) via the FP16→P010 shader (NVIDIA's VideoProcessor can't do RGB→P010). The ring drops its per-slot RTV (textures only), matching the WGC YUV ring; converters rebuild on a size/HDR flip. - nvenc.rs: NV12 input forces bit_depth=8 so an HDR→SDR toggle (or a 10-bit-negotiated client on an SDR display) re-inits the session at the matching depth — NV12 can't feed a 10-bit session (register_resource rejects it). - punktfunk1.rs: per-stage latency instrumentation under PUNKTFUNK_PERF (cap=try_latest, submit=encode_picture, wait=lock_bitstream µs p50/p99/max) to pinpoint where capture→encoded latency goes under GPU saturation.
This commit is contained in:
@@ -10,7 +10,7 @@
|
|||||||
//! [`pf_driver_proto::frame`] (which OWNS the contract, with `const` size asserts) — both sides
|
//! [`pf_driver_proto::frame`] (which OWNS the contract, with `const` size asserts) — both sides
|
||||||
//! `use` it, so drift is a compile error rather than a "must match" comment.
|
//! `use` it, so drift is a compile error rather than a "must match" comment.
|
||||||
|
|
||||||
use super::dxgi::{make_device, D3d11Frame, HdrConverter, WinCaptureTarget};
|
use super::dxgi::{make_device, D3d11Frame, HdrP010Converter, VideoConverter, WinCaptureTarget};
|
||||||
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
||||||
use anyhow::{bail, Context, Result};
|
use anyhow::{bail, Context, Result};
|
||||||
use pf_driver_proto::frame;
|
use pf_driver_proto::frame;
|
||||||
@@ -20,13 +20,12 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
|||||||
use windows::core::{w, Interface, HSTRING};
|
use windows::core::{w, Interface, HSTRING};
|
||||||
use windows::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE, LUID};
|
use windows::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE, LUID};
|
||||||
use windows::Win32::Graphics::Direct3D11::{
|
use windows::Win32::Graphics::Direct3D11::{
|
||||||
ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView,
|
ID3D11Device, ID3D11DeviceContext, ID3D11ShaderResourceView, ID3D11Texture2D,
|
||||||
ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE,
|
D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX,
|
||||||
D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX, D3D11_RESOURCE_MISC_SHARED_NTHANDLE,
|
D3D11_RESOURCE_MISC_SHARED_NTHANDLE, D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
|
||||||
D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
|
|
||||||
};
|
};
|
||||||
use windows::Win32::Graphics::Dxgi::Common::{
|
use windows::Win32::Graphics::Dxgi::Common::{
|
||||||
DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
|
DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_NV12, DXGI_FORMAT_P010,
|
||||||
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
||||||
};
|
};
|
||||||
use windows::Win32::Graphics::Dxgi::{
|
use windows::Win32::Graphics::Dxgi::{
|
||||||
@@ -205,15 +204,22 @@ pub struct IddPushCapturer {
|
|||||||
/// cleared when a fresh frame resumes. If it stays set past the recovery window, `try_consume` drops
|
/// cleared when a fresh frame resumes. If it stays set past the recovery window, `try_consume` drops
|
||||||
/// the session (recover-or-drop, no DDA).
|
/// the session (recover-or-drop, no DDA).
|
||||||
recovering_since: Option<Instant>,
|
recovering_since: Option<Instant>,
|
||||||
/// Host-owned ROTATING output ring NVENC encodes (texture + RTV per slot). Rotating it per frame is
|
/// Host-owned ROTATING output ring NVENC encodes (one YUV texture per slot). Rotating it per frame
|
||||||
/// the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
|
/// is the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
|
||||||
/// ASIC, frame N+1's convert/copy writes a DIFFERENT texture on the 3D engine — the two overlap. The
|
/// ASIC, frame N+1's convert writes a DIFFERENT texture — the two overlap. Format = `out_format()`:
|
||||||
/// HDR convert and the SDR copy both write into the current slot. Format = `out_format()` (Rgb10a2 in
|
/// NV12 (SDR, BT.709 limited) or P010 (HDR, BT.2020 PQ limited), so NVENC takes native YUV and skips
|
||||||
/// HDR, Bgra in SDR); rebuilt on a display-mode flip. Built lazily.
|
/// its internal RGB→YUV CSC on the SM/3D engine the game saturates (plan §5.A). Rebuilt on a
|
||||||
out_ring: Vec<(ID3D11Texture2D, ID3D11RenderTargetView)>,
|
/// display-mode flip. Built lazily.
|
||||||
|
out_ring: Vec<ID3D11Texture2D>,
|
||||||
out_idx: usize,
|
out_idx: usize,
|
||||||
/// FP16 scRGB → `Rgb10a2` BT.2020 PQ converter, used while the display is HDR. Built lazily.
|
/// BGRA slot → NV12 (BT.709 limited) on the dedicated D3D11 VIDEO engine, used while the display is
|
||||||
hdr_conv: Option<HdrConverter>,
|
/// SDR — keeps the colour-convert OFF the contended 3D/compute engine. Built lazily; rebuilt on a
|
||||||
|
/// size/HDR flip.
|
||||||
|
video_conv: Option<VideoConverter>,
|
||||||
|
/// FP16 scRGB slot → P010 (BT.2020 PQ limited) via two shader passes, used while the display is HDR
|
||||||
|
/// (NVIDIA's VideoProcessor can't do RGB→P010). The passes run on the 3D engine, but it still skips
|
||||||
|
/// NVENC's internal SM-side CSC. Built lazily.
|
||||||
|
hdr_p010_conv: Option<HdrP010Converter>,
|
||||||
last_seq: u64,
|
last_seq: u64,
|
||||||
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
|
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
|
||||||
status_logged: bool,
|
status_logged: bool,
|
||||||
@@ -504,7 +510,8 @@ impl IddPushCapturer {
|
|||||||
recovering_since: None,
|
recovering_since: None,
|
||||||
out_ring: Vec::new(),
|
out_ring: Vec::new(),
|
||||||
out_idx: 0,
|
out_idx: 0,
|
||||||
hdr_conv: None,
|
video_conv: None,
|
||||||
|
hdr_p010_conv: None,
|
||||||
last_seq: 0,
|
last_seq: 0,
|
||||||
last_present: None,
|
last_present: None,
|
||||||
status_logged: false,
|
status_logged: false,
|
||||||
@@ -625,16 +632,17 @@ impl IddPushCapturer {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The output texture format + the [`PixelFormat`] it presents as, driven SOLELY by the DISPLAY's
|
/// The output texture format + the [`PixelFormat`] NVENC encodes, driven SOLELY by the DISPLAY's HDR
|
||||||
/// HDR state (like the WGC path): HDR → `Rgb10a2` BT.2020 PQ → NVENC Main10, and the client
|
/// state (like the WGC path): HDR → `P010` (BT.2020 PQ 10-bit limited) → NVENC Main10, and the client
|
||||||
/// auto-detects PQ from the HEVC VUI; SDR → 8-bit `Bgra`. We do NOT gate HDR on the client's
|
/// auto-detects PQ from the HEVC VUI; SDR → `Nv12` (BT.709 8-bit limited). Both are native YUV so
|
||||||
/// advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit only when
|
/// NVENC skips its internal RGB→YUV CSC on the contended SM (plan §5.A). We do NOT gate HDR on the
|
||||||
/// its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path.
|
/// client's advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit
|
||||||
|
/// only when its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path.
|
||||||
fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) {
|
fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) {
|
||||||
if self.display_hdr {
|
if self.display_hdr {
|
||||||
(DXGI_FORMAT_R10G10B10A2_UNORM, PixelFormat::Rgb10a2)
|
(DXGI_FORMAT_P010, PixelFormat::P010)
|
||||||
} else {
|
} else {
|
||||||
(DXGI_FORMAT_B8G8R8A8_UNORM, PixelFormat::Bgra)
|
(DXGI_FORMAT_NV12, PixelFormat::Nv12)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -688,6 +696,8 @@ impl IddPushCapturer {
|
|||||||
self.generation = new_gen;
|
self.generation = new_gen;
|
||||||
self.last_seq = 0;
|
self.last_seq = 0;
|
||||||
self.out_ring.clear(); // the output format changed → rebuild lazily at the new format
|
self.out_ring.clear(); // the output format changed → rebuild lazily at the new format
|
||||||
|
self.video_conv = None; // converters are sized + HDR-specific → rebuild at the new mode
|
||||||
|
self.hdr_p010_conv = None;
|
||||||
self.out_idx = 0;
|
self.out_idx = 0;
|
||||||
self.last_present = None;
|
self.last_present = None;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -742,31 +752,35 @@ impl IddPushCapturer {
|
|||||||
Quality: 0,
|
Quality: 0,
|
||||||
},
|
},
|
||||||
Usage: D3D11_USAGE_DEFAULT,
|
Usage: D3D11_USAGE_DEFAULT,
|
||||||
BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
|
// RENDER_TARGET: the VIDEO processor (NV12) and the P010 shader passes both write here, and
|
||||||
|
// NVENC registers it as encode input — matching the WGC YUV ring.
|
||||||
|
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||||
CPUAccessFlags: 0,
|
CPUAccessFlags: 0,
|
||||||
MiscFlags: 0,
|
MiscFlags: 0,
|
||||||
};
|
};
|
||||||
for _ in 0..OUT_RING {
|
for _ in 0..OUT_RING {
|
||||||
let mut t: Option<ID3D11Texture2D> = None;
|
let mut t: Option<ID3D11Texture2D> = None;
|
||||||
let mut rtv: Option<ID3D11RenderTargetView> = None;
|
|
||||||
unsafe {
|
unsafe {
|
||||||
self.device
|
self.device
|
||||||
.CreateTexture2D(&desc, None, Some(&mut t))
|
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||||
.context("CreateTexture2D(IDD out ring)")?;
|
.context("CreateTexture2D(IDD out ring)")?;
|
||||||
let t = t.context("null out-ring texture")?;
|
self.out_ring.push(t.context("null out-ring texture")?);
|
||||||
self.device
|
|
||||||
.CreateRenderTargetView(&t, None, Some(&mut rtv))
|
|
||||||
.context("CreateRenderTargetView(IDD out ring)")?;
|
|
||||||
self.out_ring.push((t, rtv.context("null out-ring rtv")?));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build the HDR converter if not already built (HDR-display path only — an SDR display is a copy).
|
/// Build the per-mode YUV converter if not already built: a VIDEO-engine BGRA→NV12 processor on an
|
||||||
|
/// SDR display, or the FP16→P010 shader on an HDR display. Both keep NVENC's RGB→YUV CSC off the SM.
|
||||||
fn ensure_converter(&mut self) -> Result<()> {
|
fn ensure_converter(&mut self) -> Result<()> {
|
||||||
if self.hdr_conv.is_none() {
|
if self.display_hdr {
|
||||||
self.hdr_conv = Some(unsafe { HdrConverter::new(&self.device)? });
|
if self.hdr_p010_conv.is_none() {
|
||||||
|
self.hdr_p010_conv = Some(unsafe { HdrP010Converter::new(&self.device)? });
|
||||||
|
}
|
||||||
|
} else if self.video_conv.is_none() {
|
||||||
|
self.video_conv = Some(unsafe {
|
||||||
|
VideoConverter::new(&self.device, &self.context, self.width, self.height, false)?
|
||||||
|
});
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -801,16 +815,11 @@ impl IddPushCapturer {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
self.ensure_out_ring()?;
|
self.ensure_out_ring()?;
|
||||||
// Build the HDR converter BEFORE acquiring the slot so nothing between Acquire and Release can
|
// Build the converter BEFORE acquiring the slot so nothing between Acquire and Release can
|
||||||
// `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot).
|
// `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot).
|
||||||
if self.display_hdr {
|
|
||||||
self.ensure_converter()?;
|
self.ensure_converter()?;
|
||||||
}
|
|
||||||
let i = self.out_idx;
|
let i = self.out_idx;
|
||||||
let (out, out_rtv) = {
|
let out = self.out_ring[i].clone();
|
||||||
let (t, rtv) = &self.out_ring[i];
|
|
||||||
(t.clone(), rtv.clone())
|
|
||||||
};
|
|
||||||
let (_, pf) = self.out_format();
|
let (_, pf) = self.out_format();
|
||||||
|
|
||||||
// Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the
|
// Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the
|
||||||
@@ -824,16 +833,27 @@ impl IddPushCapturer {
|
|||||||
let Some(_lock) = KeyedMutexGuard::acquire(&s.mutex, 0, 8) else {
|
let Some(_lock) = KeyedMutexGuard::acquire(&s.mutex, 0, 8) else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
// SAFETY: convert/copy on the owning (encode) thread's immediate context, holding the slot lock.
|
// SAFETY: convert on the owning (encode) thread's immediate context, holding the slot lock.
|
||||||
|
// A `?` here is leak-safe: `_lock` (the KeyedMutexGuard) drops on the early return, releasing
|
||||||
|
// the slot back to the driver.
|
||||||
unsafe {
|
unsafe {
|
||||||
if self.display_hdr {
|
if self.display_hdr {
|
||||||
// Sample the FP16 slot's SRV directly (no scratch copy) → BT.2020 PQ Rgb10a2.
|
// HDR: FP16 slot SRV → P010 (BT.2020 PQ) via the shader; NVENC takes native P010.
|
||||||
if let Some(conv) = self.hdr_conv.as_ref() {
|
if let Some(conv) = self.hdr_p010_conv.as_ref() {
|
||||||
conv.convert(&self.context, &s.srv, &out_rtv, self.width, self.height);
|
conv.convert(
|
||||||
|
&self.device,
|
||||||
|
&self.context,
|
||||||
|
&s.srv,
|
||||||
|
&out,
|
||||||
|
self.width,
|
||||||
|
self.height,
|
||||||
|
)?;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// SDR: the slot is already 8-bit BGRA — one copy into the out-ring (hidden by pipelining).
|
// SDR: BGRA slot → NV12 on the VIDEO engine; NVENC takes native NV12, no SM-side CSC.
|
||||||
self.context.CopyResource(&out, &s.tex);
|
if let Some(conv) = self.video_conv.as_ref() {
|
||||||
|
conv.convert(&s.tex, &out)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// `_lock` drops here → `ReleaseSync(0)`.
|
// `_lock` drops here → `ReleaseSync(0)`.
|
||||||
@@ -861,7 +881,7 @@ impl IddPushCapturer {
|
|||||||
// OUT_RING(3) > the max pipeline_depth(2) guarantees the rotated slot is not in flight.
|
// OUT_RING(3) > the max pipeline_depth(2) guarantees the rotated slot is not in flight.
|
||||||
let (src, pf) = self.last_present.clone()?;
|
let (src, pf) = self.last_present.clone()?;
|
||||||
let i = self.out_idx;
|
let i = self.out_idx;
|
||||||
let dst = self.out_ring.get(i)?.0.clone();
|
let dst = self.out_ring.get(i)?.clone();
|
||||||
// SAFETY: GPU copy on the owning thread's immediate context; src/dst are our out-ring textures of
|
// SAFETY: GPU copy on the owning thread's immediate context; src/dst are our out-ring textures of
|
||||||
// identical format/size (src is a previous out-ring slot; dst the next).
|
// identical format/size (src is a previous out-ring slot; dst the next).
|
||||||
unsafe {
|
unsafe {
|
||||||
|
|||||||
@@ -609,7 +609,14 @@ impl Encoder for NvencD3d11Encoder {
|
|||||||
self.bit_depth = 10;
|
self.bit_depth = 10;
|
||||||
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10
|
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10
|
||||||
}
|
}
|
||||||
PixelFormat::Nv12 => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12,
|
PixelFormat::Nv12 => {
|
||||||
|
// NV12 is 8-bit 4:2:0. Force 8-bit so a transition from a prior P010 (10-bit) session
|
||||||
|
// — or a 10-bit-negotiated client on an SDR display — re-inits at the matching depth.
|
||||||
|
// Unlike ARGB (which NVENC upconverts to Main10), NV12 cannot feed a 10-bit session:
|
||||||
|
// `register_resource` rejects it as InvalidParam (the HDR→SDR-toggle stream drop).
|
||||||
|
self.bit_depth = 8;
|
||||||
|
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12
|
||||||
|
}
|
||||||
_ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
|
_ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
|
||||||
};
|
};
|
||||||
let device = frame.device.clone();
|
let device = frame.device.clone();
|
||||||
|
|||||||
@@ -2356,6 +2356,12 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
|
|||||||
// compositing), NOT an encoder problem. Logged every 2 s when `PUNKTFUNK_PERF`.
|
// compositing), NOT an encoder problem. Logged every 2 s when `PUNKTFUNK_PERF`.
|
||||||
let (mut diag_new, mut diag_repeat) = (0u64, 0u64);
|
let (mut diag_new, mut diag_repeat) = (0u64, 0u64);
|
||||||
let mut diag_at = std::time::Instant::now();
|
let mut diag_at = std::time::Instant::now();
|
||||||
|
// Per-stage latency breakdown (PUNKTFUNK_PERF): per-call µs for the GPU-bound stages so we see
|
||||||
|
// exactly where the capture→encoded latency goes — cap=try_latest (ring read + colour convert),
|
||||||
|
// submit=encode_picture launch, wait=lock_bitstream (the scheduling wait + ASIC encode, the one
|
||||||
|
// that dominates under a GPU-saturating game).
|
||||||
|
let (mut st_cap, mut st_submit, mut st_wait): (Vec<u32>, Vec<u32>, Vec<u32>) =
|
||||||
|
(Vec::new(), Vec::new(), Vec::new());
|
||||||
while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
|
while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
|
||||||
// Mid-stream session switch (the box flipped Gaming↔Desktop): rebuild the WHOLE backend in
|
// Mid-stream session switch (the box flipped Gaming↔Desktop): rebuild the WHOLE backend in
|
||||||
// place — a different compositor at the SAME client mode — keeping the Session + send thread
|
// place — a different compositor at the SAME client mode — keeping the Session + send thread
|
||||||
@@ -2462,7 +2468,12 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
|
|||||||
tracing::debug!("forcing keyframe (client decode recovery)");
|
tracing::debug!("forcing keyframe (client decode recovery)");
|
||||||
enc.request_keyframe();
|
enc.request_keyframe();
|
||||||
}
|
}
|
||||||
match capturer.try_latest() {
|
let t_cap = std::time::Instant::now();
|
||||||
|
let cap_result = capturer.try_latest();
|
||||||
|
if perf {
|
||||||
|
st_cap.push(t_cap.elapsed().as_micros() as u32);
|
||||||
|
}
|
||||||
|
match cap_result {
|
||||||
Ok(Some(f)) => {
|
Ok(Some(f)) => {
|
||||||
frame = f;
|
frame = f;
|
||||||
diag_new += 1;
|
diag_new += 1;
|
||||||
@@ -2501,6 +2512,20 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
|
|||||||
"capture diag: NEW frames from the source vs REPEATS (low new_fps at high send rate ⇒ \
|
"capture diag: NEW frames from the source vs REPEATS (low new_fps at high send rate ⇒ \
|
||||||
the source isn't producing frames, not an encode stall)"
|
the source isn't producing frames, not an encode stall)"
|
||||||
);
|
);
|
||||||
|
let wait_max = st_wait.iter().copied().max().unwrap_or(0);
|
||||||
|
tracing::info!(
|
||||||
|
cap_us_p50 = percentile(&mut st_cap, 0.50),
|
||||||
|
cap_us_p99 = percentile(&mut st_cap, 0.99),
|
||||||
|
submit_us_p50 = percentile(&mut st_submit, 0.50),
|
||||||
|
submit_us_p99 = percentile(&mut st_submit, 0.99),
|
||||||
|
wait_us_p50 = percentile(&mut st_wait, 0.50),
|
||||||
|
wait_us_p99 = percentile(&mut st_wait, 0.99),
|
||||||
|
wait_us_max = wait_max,
|
||||||
|
"stage perf (µs/call): cap=try_latest(ring+convert) submit=encode_picture wait=lock_bitstream(sched+ASIC)"
|
||||||
|
);
|
||||||
|
st_cap.clear();
|
||||||
|
st_submit.clear();
|
||||||
|
st_wait.clear();
|
||||||
diag_new = 0;
|
diag_new = 0;
|
||||||
diag_repeat = 0;
|
diag_repeat = 0;
|
||||||
diag_at = std::time::Instant::now();
|
diag_at = std::time::Instant::now();
|
||||||
@@ -2519,7 +2544,11 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
|
|||||||
// capturer hands a rotating ring of output textures, so it returns >1; other capturers default 1.
|
// capturer hands a rotating ring of output textures, so it returns >1; other capturers default 1.
|
||||||
let depth = capturer.pipeline_depth().max(1);
|
let depth = capturer.pipeline_depth().max(1);
|
||||||
let capture_ns = now_ns();
|
let capture_ns = now_ns();
|
||||||
|
let t_submit = std::time::Instant::now();
|
||||||
enc.submit(&frame).context("encoder submit")?;
|
enc.submit(&frame).context("encoder submit")?;
|
||||||
|
if perf {
|
||||||
|
st_submit.push(t_submit.elapsed().as_micros() as u32);
|
||||||
|
}
|
||||||
// This frame's pacing deadline (the next frame's due time); the send thread spreads a big frame
|
// This frame's pacing deadline (the next frame's due time); the send thread spreads a big frame
|
||||||
// up to here. Each in-flight frame carries its own (capture_ns, deadline) for when it's polled.
|
// up to here. Each in-flight frame carries its own (capture_ns, deadline) for when it's polled.
|
||||||
next += interval;
|
next += interval;
|
||||||
@@ -2530,7 +2559,12 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
|
|||||||
// the oldest submitted frame's AU — matching `inflight.pop_front()`.
|
// the oldest submitted frame's AU — matching `inflight.pop_front()`.
|
||||||
let mut send_gone = false;
|
let mut send_gone = false;
|
||||||
while inflight.len() >= depth {
|
while inflight.len() >= depth {
|
||||||
let au = match enc.poll().context("encoder poll")? {
|
let t_wait = std::time::Instant::now();
|
||||||
|
let polled = enc.poll().context("encoder poll")?;
|
||||||
|
if perf {
|
||||||
|
st_wait.push(t_wait.elapsed().as_micros() as u32);
|
||||||
|
}
|
||||||
|
let au = match polled {
|
||||||
Some(au) => au,
|
Some(au) => au,
|
||||||
None => break, // no AU ready for a submitted frame (shouldn't happen — poll blocks)
|
None => break, // no AU ready for a submitted frame (shouldn't happen — poll blocks)
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user