feat(windows-host): IDD-push encodes native NV12/P010 (skip NVENC's SM-side CSC)

GPU-contention work (host-latency plan §5.A): the IDD-push output ring now hands
NVENC native YUV instead of RGB, so NVENC skips its internal RGB→YUV colour
conversion, which would otherwise run on the SM/3D engine that the running game saturates.

- idd_push.rs: out_ring is now NV12 (SDR, BT.709 limited) via a D3D11 VIDEO-engine
  BGRA→NV12 VideoConverter (keeps the CSC off the contended 3D/compute engine), or
  P010 (HDR, BT.2020 PQ limited) via the FP16→P010 shader (NVIDIA's VideoProcessor
  can't do RGB→P010). The ring drops its per-slot RTV (textures only), matching the
  WGC YUV ring; converters rebuild on a size/HDR flip.
- nvenc.rs: NV12 input forces bit_depth=8 so an HDR→SDR toggle (or a 10-bit-
  negotiated client on an SDR display) re-inits the session at the matching depth —
  NV12 can't feed a 10-bit session (register_resource rejects it).
- punktfunk1.rs: per-stage latency instrumentation under PUNKTFUNK_PERF
  (cap=try_latest, submit=encode_picture, wait=lock_bitstream µs p50/p99/max) to
  pinpoint where capture→encoded latency goes under GPU saturation.
This commit is contained in:
2026-06-26 09:35:23 +00:00
parent 327a5fa828
commit 3514702d8c
3 changed files with 111 additions and 50 deletions
@@ -10,7 +10,7 @@
//! [`pf_driver_proto::frame`] (which OWNS the contract, with `const` size asserts) — both sides //! [`pf_driver_proto::frame`] (which OWNS the contract, with `const` size asserts) — both sides
//! `use` it, so drift is a compile error rather than a "must match" comment. //! `use` it, so drift is a compile error rather than a "must match" comment.
use super::dxgi::{make_device, D3d11Frame, HdrConverter, WinCaptureTarget}; use super::dxgi::{make_device, D3d11Frame, HdrP010Converter, VideoConverter, WinCaptureTarget};
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
use anyhow::{bail, Context, Result}; use anyhow::{bail, Context, Result};
use pf_driver_proto::frame; use pf_driver_proto::frame;
@@ -20,13 +20,12 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use windows::core::{w, Interface, HSTRING}; use windows::core::{w, Interface, HSTRING};
use windows::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE, LUID}; use windows::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE, LUID};
use windows::Win32::Graphics::Direct3D11::{ use windows::Win32::Graphics::Direct3D11::{
ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView, ID3D11Device, ID3D11DeviceContext, ID3D11ShaderResourceView, ID3D11Texture2D,
ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX,
D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX, D3D11_RESOURCE_MISC_SHARED_NTHANDLE, D3D11_RESOURCE_MISC_SHARED_NTHANDLE, D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
}; };
use windows::Win32::Graphics::Dxgi::Common::{ use windows::Win32::Graphics::Dxgi::Common::{
DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_NV12, DXGI_FORMAT_P010,
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
}; };
use windows::Win32::Graphics::Dxgi::{ use windows::Win32::Graphics::Dxgi::{
@@ -205,15 +204,22 @@ pub struct IddPushCapturer {
/// cleared when a fresh frame resumes. If it stays set past the recovery window, `try_consume` drops /// cleared when a fresh frame resumes. If it stays set past the recovery window, `try_consume` drops
/// the session (recover-or-drop, no DDA). /// the session (recover-or-drop, no DDA).
recovering_since: Option<Instant>, recovering_since: Option<Instant>,
/// Host-owned ROTATING output ring NVENC encodes (texture + RTV per slot). Rotating it per frame is /// Host-owned ROTATING output ring NVENC encodes (one YUV texture per slot). Rotating it per frame
/// the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the /// is the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
/// ASIC, frame N+1's convert/copy writes a DIFFERENT texture on the 3D engine — the two overlap. The /// ASIC, frame N+1's convert writes a DIFFERENT texture — the two overlap. Format = `out_format()`:
/// HDR convert and the SDR copy both write into the current slot. Format = `out_format()` (Rgb10a2 in /// NV12 (SDR, BT.709 limited) or P010 (HDR, BT.2020 PQ limited), so NVENC takes native YUV and skips
/// HDR, Bgra in SDR); rebuilt on a display-mode flip. Built lazily. /// its internal RGB→YUV CSC on the SM/3D engine the game saturates (plan §5.A). Rebuilt on a
out_ring: Vec<(ID3D11Texture2D, ID3D11RenderTargetView)>, /// display-mode flip. Built lazily.
out_ring: Vec<ID3D11Texture2D>,
out_idx: usize, out_idx: usize,
/// FP16 scRGB → `Rgb10a2` BT.2020 PQ converter, used while the display is HDR. Built lazily. /// BGRA slot → NV12 (BT.709 limited) on the dedicated D3D11 VIDEO engine, used while the display is
hdr_conv: Option<HdrConverter>, /// SDR — keeps the colour-convert OFF the contended 3D/compute engine. Built lazily; rebuilt on a
/// size/HDR flip.
video_conv: Option<VideoConverter>,
/// FP16 scRGB slot → P010 (BT.2020 PQ limited) via two shader passes, used while the display is HDR
/// (NVIDIA's VideoProcessor can't do RGB→P010). The passes run on the 3D engine, but it still skips
/// NVENC's internal SM-side CSC. Built lazily.
hdr_p010_conv: Option<HdrP010Converter>,
last_seq: u64, last_seq: u64,
last_present: Option<(ID3D11Texture2D, PixelFormat)>, last_present: Option<(ID3D11Texture2D, PixelFormat)>,
status_logged: bool, status_logged: bool,
@@ -504,7 +510,8 @@ impl IddPushCapturer {
recovering_since: None, recovering_since: None,
out_ring: Vec::new(), out_ring: Vec::new(),
out_idx: 0, out_idx: 0,
hdr_conv: None, video_conv: None,
hdr_p010_conv: None,
last_seq: 0, last_seq: 0,
last_present: None, last_present: None,
status_logged: false, status_logged: false,
@@ -625,16 +632,17 @@ impl IddPushCapturer {
); );
} }
/// The output texture format + the [`PixelFormat`] it presents as, driven SOLELY by the DISPLAY's /// The output texture format + the [`PixelFormat`] NVENC encodes, driven SOLELY by the DISPLAY's HDR
/// HDR state (like the WGC path): HDR → `Rgb10a2` BT.2020 PQ → NVENC Main10, and the client /// state (like the WGC path): HDR → `P010` (BT.2020 PQ 10-bit limited) → NVENC Main10, and the client
/// auto-detects PQ from the HEVC VUI; SDR → 8-bit `Bgra`. We do NOT gate HDR on the client's /// auto-detects PQ from the HEVC VUI; SDR → `Nv12` (BT.709 8-bit limited). Both are native YUV so
/// advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit only when /// NVENC skips its internal RGB→YUV CSC on the contended SM (plan §5.A). We do NOT gate HDR on the
/// its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path. /// client's advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit
/// only when its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path.
fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) { fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) {
if self.display_hdr { if self.display_hdr {
(DXGI_FORMAT_R10G10B10A2_UNORM, PixelFormat::Rgb10a2) (DXGI_FORMAT_P010, PixelFormat::P010)
} else { } else {
(DXGI_FORMAT_B8G8R8A8_UNORM, PixelFormat::Bgra) (DXGI_FORMAT_NV12, PixelFormat::Nv12)
} }
} }
@@ -688,6 +696,8 @@ impl IddPushCapturer {
self.generation = new_gen; self.generation = new_gen;
self.last_seq = 0; self.last_seq = 0;
self.out_ring.clear(); // the output format changed → rebuild lazily at the new format self.out_ring.clear(); // the output format changed → rebuild lazily at the new format
self.video_conv = None; // converters are sized + HDR-specific → rebuild at the new mode
self.hdr_p010_conv = None;
self.out_idx = 0; self.out_idx = 0;
self.last_present = None; self.last_present = None;
Ok(()) Ok(())
@@ -742,31 +752,35 @@ impl IddPushCapturer {
Quality: 0, Quality: 0,
}, },
Usage: D3D11_USAGE_DEFAULT, Usage: D3D11_USAGE_DEFAULT,
BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32, // RENDER_TARGET: the VIDEO processor (NV12) and the P010 shader passes both write here, and
// NVENC registers it as encode input — matching the WGC YUV ring.
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
CPUAccessFlags: 0, CPUAccessFlags: 0,
MiscFlags: 0, MiscFlags: 0,
}; };
for _ in 0..OUT_RING { for _ in 0..OUT_RING {
let mut t: Option<ID3D11Texture2D> = None; let mut t: Option<ID3D11Texture2D> = None;
let mut rtv: Option<ID3D11RenderTargetView> = None;
unsafe { unsafe {
self.device self.device
.CreateTexture2D(&desc, None, Some(&mut t)) .CreateTexture2D(&desc, None, Some(&mut t))
.context("CreateTexture2D(IDD out ring)")?; .context("CreateTexture2D(IDD out ring)")?;
let t = t.context("null out-ring texture")?; self.out_ring.push(t.context("null out-ring texture")?);
self.device
.CreateRenderTargetView(&t, None, Some(&mut rtv))
.context("CreateRenderTargetView(IDD out ring)")?;
self.out_ring.push((t, rtv.context("null out-ring rtv")?));
} }
} }
Ok(()) Ok(())
} }
/// Build the HDR converter if not already built (HDR-display path only — an SDR display is a copy). /// Build the per-mode YUV converter if not already built: a VIDEO-engine BGRA→NV12 processor on an
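/// SDR display, or the FP16→P010 shader on an HDR display. Either way NVENC receives native YUV and
/// skips its internal SM-side RGB→YUV CSC (the P010 shader passes themselves still run on the 3D engine).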
fn ensure_converter(&mut self) -> Result<()> { fn ensure_converter(&mut self) -> Result<()> {
if self.hdr_conv.is_none() { if self.display_hdr {
self.hdr_conv = Some(unsafe { HdrConverter::new(&self.device)? }); if self.hdr_p010_conv.is_none() {
self.hdr_p010_conv = Some(unsafe { HdrP010Converter::new(&self.device)? });
}
} else if self.video_conv.is_none() {
self.video_conv = Some(unsafe {
VideoConverter::new(&self.device, &self.context, self.width, self.height, false)?
});
} }
Ok(()) Ok(())
} }
@@ -801,16 +815,11 @@ impl IddPushCapturer {
return Ok(None); return Ok(None);
} }
self.ensure_out_ring()?; self.ensure_out_ring()?;
// Build the HDR converter BEFORE acquiring the slot so nothing between Acquire and Release can // Build the converter BEFORE acquiring the slot so nothing between Acquire and Release can
// `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot). // `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot).
if self.display_hdr {
self.ensure_converter()?; self.ensure_converter()?;
}
let i = self.out_idx; let i = self.out_idx;
let (out, out_rtv) = { let out = self.out_ring[i].clone();
let (t, rtv) = &self.out_ring[i];
(t.clone(), rtv.clone())
};
let (_, pf) = self.out_format(); let (_, pf) = self.out_format();
// Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the // Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the
@@ -824,16 +833,27 @@ impl IddPushCapturer {
let Some(_lock) = KeyedMutexGuard::acquire(&s.mutex, 0, 8) else { let Some(_lock) = KeyedMutexGuard::acquire(&s.mutex, 0, 8) else {
return Ok(None); return Ok(None);
}; };
// SAFETY: convert/copy on the owning (encode) thread's immediate context, holding the slot lock. // SAFETY: convert on the owning (encode) thread's immediate context, holding the slot lock.
// A `?` here is leak-safe: `_lock` (the KeyedMutexGuard) drops on the early return, releasing
// the slot back to the driver.
unsafe { unsafe {
if self.display_hdr { if self.display_hdr {
// Sample the FP16 slot's SRV directly (no scratch copy) → BT.2020 PQ Rgb10a2. // HDR: FP16 slot SRV → P010 (BT.2020 PQ) via the shader; NVENC takes native P010.
if let Some(conv) = self.hdr_conv.as_ref() { if let Some(conv) = self.hdr_p010_conv.as_ref() {
conv.convert(&self.context, &s.srv, &out_rtv, self.width, self.height); conv.convert(
&self.device,
&self.context,
&s.srv,
&out,
self.width,
self.height,
)?;
} }
} else { } else {
// SDR: the slot is already 8-bit BGRA — one copy into the out-ring (hidden by pipelining). // SDR: BGRA slot → NV12 on the VIDEO engine; NVENC takes native NV12, no SM-side CSC.
self.context.CopyResource(&out, &s.tex); if let Some(conv) = self.video_conv.as_ref() {
conv.convert(&s.tex, &out)?;
}
} }
} }
// `_lock` drops here → `ReleaseSync(0)`. // `_lock` drops here → `ReleaseSync(0)`.
@@ -861,7 +881,7 @@ impl IddPushCapturer {
// OUT_RING(3) > the max pipeline_depth(2) guarantees the rotated slot is not in flight. // OUT_RING(3) > the max pipeline_depth(2) guarantees the rotated slot is not in flight.
let (src, pf) = self.last_present.clone()?; let (src, pf) = self.last_present.clone()?;
let i = self.out_idx; let i = self.out_idx;
let dst = self.out_ring.get(i)?.0.clone(); let dst = self.out_ring.get(i)?.clone();
// SAFETY: GPU copy on the owning thread's immediate context; src/dst are our out-ring textures of // SAFETY: GPU copy on the owning thread's immediate context; src/dst are our out-ring textures of
// identical format/size (src is a previous out-ring slot; dst the next). // identical format/size (src is a previous out-ring slot; dst the next).
unsafe { unsafe {
@@ -609,7 +609,14 @@ impl Encoder for NvencD3d11Encoder {
self.bit_depth = 10; self.bit_depth = 10;
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10 nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10
} }
PixelFormat::Nv12 => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12, PixelFormat::Nv12 => {
// NV12 is 8-bit 4:2:0. Force 8-bit so a transition from a prior P010 (10-bit) session
// — or a 10-bit-negotiated client on an SDR display — re-inits at the matching depth.
// Unlike ARGB (which NVENC upconverts to Main10), NV12 cannot feed a 10-bit session:
// `register_resource` rejects it as InvalidParam (the HDR→SDR-toggle stream drop).
self.bit_depth = 8;
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12
}
_ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB, _ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
}; };
let device = frame.device.clone(); let device = frame.device.clone();
+36 -2
View File
@@ -2356,6 +2356,12 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
// compositing), NOT an encoder problem. Logged every 2 s when `PUNKTFUNK_PERF`. // compositing), NOT an encoder problem. Logged every 2 s when `PUNKTFUNK_PERF`.
let (mut diag_new, mut diag_repeat) = (0u64, 0u64); let (mut diag_new, mut diag_repeat) = (0u64, 0u64);
let mut diag_at = std::time::Instant::now(); let mut diag_at = std::time::Instant::now();
// Per-stage latency breakdown (PUNKTFUNK_PERF): per-call µs for the GPU-bound stages so we see
// exactly where the capture→encoded latency goes — cap=try_latest (ring read + colour convert),
// submit=encode_picture launch, wait=lock_bitstream (the scheduling wait + ASIC encode, the one
// that dominates under a GPU-saturating game).
let (mut st_cap, mut st_submit, mut st_wait): (Vec<u32>, Vec<u32>, Vec<u32>) =
(Vec::new(), Vec::new(), Vec::new());
while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline { while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
// Mid-stream session switch (the box flipped Gaming↔Desktop): rebuild the WHOLE backend in // Mid-stream session switch (the box flipped Gaming↔Desktop): rebuild the WHOLE backend in
// place — a different compositor at the SAME client mode — keeping the Session + send thread // place — a different compositor at the SAME client mode — keeping the Session + send thread
@@ -2462,7 +2468,12 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
tracing::debug!("forcing keyframe (client decode recovery)"); tracing::debug!("forcing keyframe (client decode recovery)");
enc.request_keyframe(); enc.request_keyframe();
} }
match capturer.try_latest() { let t_cap = std::time::Instant::now();
let cap_result = capturer.try_latest();
if perf {
st_cap.push(t_cap.elapsed().as_micros() as u32);
}
match cap_result {
Ok(Some(f)) => { Ok(Some(f)) => {
frame = f; frame = f;
diag_new += 1; diag_new += 1;
@@ -2501,6 +2512,20 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
"capture diag: NEW frames from the source vs REPEATS (low new_fps at high send rate ⇒ \ "capture diag: NEW frames from the source vs REPEATS (low new_fps at high send rate ⇒ \
the source isn't producing frames, not an encode stall)" the source isn't producing frames, not an encode stall)"
); );
let wait_max = st_wait.iter().copied().max().unwrap_or(0);
tracing::info!(
cap_us_p50 = percentile(&mut st_cap, 0.50),
cap_us_p99 = percentile(&mut st_cap, 0.99),
submit_us_p50 = percentile(&mut st_submit, 0.50),
submit_us_p99 = percentile(&mut st_submit, 0.99),
wait_us_p50 = percentile(&mut st_wait, 0.50),
wait_us_p99 = percentile(&mut st_wait, 0.99),
wait_us_max = wait_max,
"stage perf (µs/call): cap=try_latest(ring+convert) submit=encode_picture wait=lock_bitstream(sched+ASIC)"
);
st_cap.clear();
st_submit.clear();
st_wait.clear();
diag_new = 0; diag_new = 0;
diag_repeat = 0; diag_repeat = 0;
diag_at = std::time::Instant::now(); diag_at = std::time::Instant::now();
@@ -2519,7 +2544,11 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
// capturer hands a rotating ring of output textures, so it returns >1; other capturers default 1. // capturer hands a rotating ring of output textures, so it returns >1; other capturers default 1.
let depth = capturer.pipeline_depth().max(1); let depth = capturer.pipeline_depth().max(1);
let capture_ns = now_ns(); let capture_ns = now_ns();
let t_submit = std::time::Instant::now();
enc.submit(&frame).context("encoder submit")?; enc.submit(&frame).context("encoder submit")?;
if perf {
st_submit.push(t_submit.elapsed().as_micros() as u32);
}
// This frame's pacing deadline (the next frame's due time); the send thread spreads a big frame // This frame's pacing deadline (the next frame's due time); the send thread spreads a big frame
// up to here. Each in-flight frame carries its own (capture_ns, deadline) for when it's polled. // up to here. Each in-flight frame carries its own (capture_ns, deadline) for when it's polled.
next += interval; next += interval;
@@ -2530,7 +2559,12 @@ fn virtual_stream(ctx: SessionContext) -> Result<()> {
// the oldest submitted frame's AU — matching `inflight.pop_front()`. // the oldest submitted frame's AU — matching `inflight.pop_front()`.
let mut send_gone = false; let mut send_gone = false;
while inflight.len() >= depth { while inflight.len() >= depth {
let au = match enc.poll().context("encoder poll")? { let t_wait = std::time::Instant::now();
let polled = enc.poll().context("encoder poll")?;
if perf {
st_wait.push(t_wait.elapsed().as_micros() as u32);
}
let au = match polled {
Some(au) => au, Some(au) => au,
None => break, // no AU ready for a submitted frame (shouldn't happen — poll blocks) None => break, // no AU ready for a submitted frame (shouldn't happen — poll blocks)
}; };