feat(windows-host): IDD-push encodes native NV12/P010 (skip NVENC's SM-side CSC)

GPU-contention work (host-latency plan §5.A): the IDD-push output ring now hands
NVENC native YUV instead of RGB, so NVENC skips its internal RGB→YUV colour
conversion on the SM/3D engine the running game saturates.

- idd_push.rs: out_ring is now NV12 (SDR, BT.709 limited) via a D3D11 VIDEO-engine
  BGRA→NV12 VideoConverter (keeps the CSC off the contended 3D/compute engine), or
  P010 (HDR, BT.2020 PQ limited) via the FP16→P010 shader (NVIDIA's VideoProcessor
  can't do RGB→P010). The ring drops its per-slot RTV (textures only), matching the
  WGC YUV ring; converters rebuild on a size/HDR flip.
- nvenc.rs: NV12 input forces bit_depth=8 so an HDR→SDR toggle (or a 10-bit-
  negotiated client on an SDR display) re-inits the session at the matching depth —
  NV12 can't feed a 10-bit session (register_resource rejects it).
- punktfunk1.rs: per-stage latency instrumentation under PUNKTFUNK_PERF
  (cap=try_latest, submit=encode_picture, wait=lock_bitstream µs p50/p99/max) to
  pinpoint where capture→encoded latency goes under GPU saturation.
This commit is contained in:
2026-06-26 09:35:23 +00:00
parent 327a5fa828
commit 3514702d8c
3 changed files with 111 additions and 50 deletions
@@ -10,7 +10,7 @@
//! [`pf_driver_proto::frame`] (which OWNS the contract, with `const` size asserts) — both sides
//! `use` it, so drift is a compile error rather than a "must match" comment.
use super::dxgi::{make_device, D3d11Frame, HdrConverter, WinCaptureTarget};
use super::dxgi::{make_device, D3d11Frame, HdrP010Converter, VideoConverter, WinCaptureTarget};
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
use anyhow::{bail, Context, Result};
use pf_driver_proto::frame;
@@ -20,13 +20,12 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use windows::core::{w, Interface, HSTRING};
use windows::Win32::Foundation::{HANDLE, INVALID_HANDLE_VALUE, LUID};
use windows::Win32::Graphics::Direct3D11::{
ID3D11Device, ID3D11DeviceContext, ID3D11RenderTargetView, ID3D11ShaderResourceView,
ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE,
D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX, D3D11_RESOURCE_MISC_SHARED_NTHANDLE,
D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
ID3D11Device, ID3D11DeviceContext, ID3D11ShaderResourceView, ID3D11Texture2D,
D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_RESOURCE_MISC_SHARED_KEYEDMUTEX,
D3D11_RESOURCE_MISC_SHARED_NTHANDLE, D3D11_TEXTURE2D_DESC, D3D11_USAGE_DEFAULT,
};
use windows::Win32::Graphics::Dxgi::Common::{
DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_NV12, DXGI_FORMAT_P010,
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
};
use windows::Win32::Graphics::Dxgi::{
@@ -205,15 +204,22 @@ pub struct IddPushCapturer {
/// cleared when a fresh frame resumes. If it stays set past the recovery window, `try_consume` drops
/// the session (recover-or-drop, no DDA).
recovering_since: Option<Instant>,
/// Host-owned ROTATING output ring NVENC encodes (texture + RTV per slot). Rotating it per frame is
/// the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
/// ASIC, frame N+1's convert/copy writes a DIFFERENT texture on the 3D engine — the two overlap. The
/// HDR convert and the SDR copy both write into the current slot. Format = `out_format()` (Rgb10a2 in
/// HDR, Bgra in SDR); rebuilt on a display-mode flip. Built lazily.
out_ring: Vec<(ID3D11Texture2D, ID3D11RenderTargetView)>,
/// Host-owned ROTATING output ring NVENC encodes (one YUV texture per slot). Rotating it per frame
/// is the precondition for pipelining the encode loop: while NVENC encodes frame N's texture on the
/// ASIC, frame N+1's convert writes a DIFFERENT texture — the two overlap. Format = `out_format()`:
/// NV12 (SDR, BT.709 limited) or P010 (HDR, BT.2020 PQ limited), so NVENC takes native YUV and skips
/// its internal RGB→YUV CSC on the SM/3D engine the game saturates (plan §5.A). Rebuilt on a
/// display-mode flip. Built lazily.
out_ring: Vec<ID3D11Texture2D>,
out_idx: usize,
/// FP16 scRGB → `Rgb10a2` BT.2020 PQ converter, used while the display is HDR. Built lazily.
hdr_conv: Option<HdrConverter>,
/// BGRA slot → NV12 (BT.709 limited) on the dedicated D3D11 VIDEO engine, used while the display is
/// SDR — keeps the colour-convert OFF the contended 3D/compute engine. Built lazily; rebuilt on a
/// size/HDR flip.
video_conv: Option<VideoConverter>,
/// FP16 scRGB slot → P010 (BT.2020 PQ limited) via two shader passes, used while the display is HDR
/// (NVIDIA's VideoProcessor can't do RGB→P010). The passes run on the 3D engine, but it still skips
/// NVENC's internal SM-side CSC. Built lazily.
hdr_p010_conv: Option<HdrP010Converter>,
last_seq: u64,
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
status_logged: bool,
@@ -504,7 +510,8 @@ impl IddPushCapturer {
recovering_since: None,
out_ring: Vec::new(),
out_idx: 0,
hdr_conv: None,
video_conv: None,
hdr_p010_conv: None,
last_seq: 0,
last_present: None,
status_logged: false,
@@ -625,16 +632,17 @@ impl IddPushCapturer {
);
}
/// The output texture format + the [`PixelFormat`] it presents as, driven SOLELY by the DISPLAY's
/// HDR state (like the WGC path): HDR → `Rgb10a2` BT.2020 PQ → NVENC Main10, and the client
/// auto-detects PQ from the HEVC VUI; SDR → 8-bit `Bgra`. We do NOT gate HDR on the client's
/// advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit only when
/// its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path.
/// The output texture format + the [`PixelFormat`] NVENC encodes, driven SOLELY by the DISPLAY's HDR
/// state (like the WGC path): HDR → `P010` (BT.2020 PQ 10-bit limited) → NVENC Main10, and the client
/// auto-detects PQ from the HEVC VUI; SDR → `Nv12` (BT.709 8-bit limited). Both are native YUV so
/// NVENC skips its internal RGB→YUV CSC on the contended SM (plan §5.A). We do NOT gate HDR on the
/// client's advertised `VIDEO_CAP_10BIT` — clients under-report it (e.g. the Mac advertises 10-bit
/// only when its OWN display is HDR), yet all decode Main10 + auto-switch, exactly as on the WGC path.
fn out_format(&self) -> (DXGI_FORMAT, PixelFormat) {
if self.display_hdr {
(DXGI_FORMAT_R10G10B10A2_UNORM, PixelFormat::Rgb10a2)
(DXGI_FORMAT_P010, PixelFormat::P010)
} else {
(DXGI_FORMAT_B8G8R8A8_UNORM, PixelFormat::Bgra)
(DXGI_FORMAT_NV12, PixelFormat::Nv12)
}
}
@@ -688,6 +696,8 @@ impl IddPushCapturer {
self.generation = new_gen;
self.last_seq = 0;
self.out_ring.clear(); // the output format changed → rebuild lazily at the new format
self.video_conv = None; // converters are sized + HDR-specific → rebuild at the new mode
self.hdr_p010_conv = None;
self.out_idx = 0;
self.last_present = None;
Ok(())
@@ -742,31 +752,35 @@ impl IddPushCapturer {
Quality: 0,
},
Usage: D3D11_USAGE_DEFAULT,
BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
// RENDER_TARGET: the VIDEO processor (NV12) and the P010 shader passes both write here, and
// NVENC registers it as encode input — matching the WGC YUV ring.
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
CPUAccessFlags: 0,
MiscFlags: 0,
};
for _ in 0..OUT_RING {
let mut t: Option<ID3D11Texture2D> = None;
let mut rtv: Option<ID3D11RenderTargetView> = None;
unsafe {
self.device
.CreateTexture2D(&desc, None, Some(&mut t))
.context("CreateTexture2D(IDD out ring)")?;
let t = t.context("null out-ring texture")?;
self.device
.CreateRenderTargetView(&t, None, Some(&mut rtv))
.context("CreateRenderTargetView(IDD out ring)")?;
self.out_ring.push((t, rtv.context("null out-ring rtv")?));
self.out_ring.push(t.context("null out-ring texture")?);
}
}
Ok(())
}
/// Build the HDR converter if not already built (HDR-display path only — an SDR display is a copy).
/// Build the per-mode YUV converter if not already built: a VIDEO-engine BGRA→NV12 processor on an
/// SDR display, or the FP16→P010 shader on an HDR display. Both keep NVENC's RGB→YUV CSC off the SM.
fn ensure_converter(&mut self) -> Result<()> {
if self.hdr_conv.is_none() {
self.hdr_conv = Some(unsafe { HdrConverter::new(&self.device)? });
if self.display_hdr {
if self.hdr_p010_conv.is_none() {
self.hdr_p010_conv = Some(unsafe { HdrP010Converter::new(&self.device)? });
}
} else if self.video_conv.is_none() {
self.video_conv = Some(unsafe {
VideoConverter::new(&self.device, &self.context, self.width, self.height, false)?
});
}
Ok(())
}
@@ -801,16 +815,11 @@ impl IddPushCapturer {
return Ok(None);
}
self.ensure_out_ring()?;
// Build the HDR converter BEFORE acquiring the slot so nothing between Acquire and Release can
// Build the converter BEFORE acquiring the slot so nothing between Acquire and Release can
// `?`-return and leak the keyed-mutex lock (which would stall the driver on that slot).
if self.display_hdr {
self.ensure_converter()?;
}
self.ensure_converter()?;
let i = self.out_idx;
let (out, out_rtv) = {
let (t, rtv) = &self.out_ring[i];
(t.clone(), rtv.clone())
};
let out = self.out_ring[i].clone();
let (_, pf) = self.out_format();
// Hold the slot's keyed mutex only across the convert/copy into the host out-ring (NOT across the
@@ -824,16 +833,27 @@ impl IddPushCapturer {
let Some(_lock) = KeyedMutexGuard::acquire(&s.mutex, 0, 8) else {
return Ok(None);
};
// SAFETY: convert/copy on the owning (encode) thread's immediate context, holding the slot lock.
// SAFETY: convert on the owning (encode) thread's immediate context, holding the slot lock.
// A `?` here is leak-safe: `_lock` (the KeyedMutexGuard) drops on the early return, releasing
// the slot back to the driver.
unsafe {
if self.display_hdr {
// Sample the FP16 slot's SRV directly (no scratch copy) → BT.2020 PQ Rgb10a2.
if let Some(conv) = self.hdr_conv.as_ref() {
conv.convert(&self.context, &s.srv, &out_rtv, self.width, self.height);
// HDR: FP16 slot SRV → P010 (BT.2020 PQ) via the shader; NVENC takes native P010.
if let Some(conv) = self.hdr_p010_conv.as_ref() {
conv.convert(
&self.device,
&self.context,
&s.srv,
&out,
self.width,
self.height,
)?;
}
} else {
// SDR: the slot is already 8-bit BGRA — one copy into the out-ring (hidden by pipelining).
self.context.CopyResource(&out, &s.tex);
// SDR: BGRA slot → NV12 on the VIDEO engine; NVENC takes native NV12, no SM-side CSC.
if let Some(conv) = self.video_conv.as_ref() {
conv.convert(&s.tex, &out)?;
}
}
}
// `_lock` drops here → `ReleaseSync(0)`.
@@ -861,7 +881,7 @@ impl IddPushCapturer {
// OUT_RING(3) > the max pipeline_depth(2) guarantees the rotated slot is not in flight.
let (src, pf) = self.last_present.clone()?;
let i = self.out_idx;
let dst = self.out_ring.get(i)?.0.clone();
let dst = self.out_ring.get(i)?.clone();
// SAFETY: GPU copy on the owning thread's immediate context; src/dst are our out-ring textures of
// identical format/size (src is a previous out-ring slot; dst the next).
unsafe {