perf(host/windows): move capture→encode off the 3D engine (NV12/P010 video-processor path, zero-copy, GPU priority)
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
The Windows host capped at ~60 fps with 35-40 ms latency on a GPU-heavy game: the per-frame capture→encode path shared the 3D engine with the game and got scheduled behind it.

Rework to minimize 3D-engine work per frame:

- VideoConverter (D3D11 video processor): capture → NVENC-native NV12/P010 so NVENC skips its internal RGB→YUV (a 3D/compute step). Wired into both DDA (dxgi.rs) and WGC (wgc.rs). New PixelFormat::Nv12/P010 + NVENC YUV input.
- GPU scheduling hardening (Apollo-style): D3DKMTSetProcessSchedulingPriorityClass HIGH, absolute SetGPUThreadPriority, SetMaximumFrameLatency(1).
- WGC SDR zero-copy (hold pool frames; no CopyResource). DDA keeps a fast CopyResource to decouple its single-frame acquire/release from the async convert.
- Pipelined helper encode loop (PUNKTFUNK_ENCODE_DEPTH, default 1) + perf split (cap_wait / encode / write).

Live on the RTX 4090: hard 60 fps ceiling removed (now scene-scaling 40-200+ fps), latency much reduced. Residual cap in GPU-pinned scenes is the irreducible RGB→YUV convert (no fixed-function unit on NVIDIA — VideoProcessing engine reads 0%) waiting behind an uncapped game under WDDM context time-slicing; Linux avoids it via gamescope capping the game to the display refresh.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -17,10 +17,12 @@
|
||||
//! the DDA backend ([`super::dxgi::DuplCapturer`]) for those (see capture.rs).
|
||||
|
||||
use super::dxgi::{
|
||||
find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, WinCaptureTarget,
|
||||
find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, VideoConverter,
|
||||
WinCaptureTarget,
|
||||
};
|
||||
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use std::collections::VecDeque;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -37,8 +39,8 @@ use windows::Win32::Graphics::Direct3D11::{
|
||||
D3D11_USAGE_DEFAULT,
|
||||
};
|
||||
use windows::Win32::Graphics::Dxgi::Common::{
|
||||
DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_B8G8R8A8_UNORM,
|
||||
DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
||||
DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_R10G10B10A2_UNORM,
|
||||
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
||||
};
|
||||
use windows::Win32::Graphics::Dxgi::{IDXGIDevice, IDXGIOutput6};
|
||||
use windows::Win32::Security::{ImpersonateLoggedOnUser, RevertToSelf};
|
||||
@@ -49,6 +51,22 @@ use windows::Win32::System::WinRT::Direct3D11::{
|
||||
use windows::Win32::System::WinRT::Graphics::Capture::IGraphicsCaptureItemInterop;
|
||||
use windows::Win32::System::WinRT::{RoInitialize, RO_INIT_MULTITHREADED};
|
||||
|
||||
/// Output texture ring depth. The encode loop is pipelined (NVENC encodes earlier frames while
|
||||
/// the capturer produces the next one), so every in-flight encode needs its own live texture; the
|
||||
/// ring is sized above the maximum pipeline depth for headroom against a slow `lock_bitstream`.
|
||||
// Sized for the deep encode pipeline (`PUNKTFUNK_ENCODE_DEPTH`, default 4, clamped ≤ 6): up to DEPTH
|
||||
// frames are in flight in NVENC at once, so the HDR convert ring and the SDR held-frame set must each
|
||||
// keep DEPTH(+headroom) live textures, and the WGC pool needs spare buffers beyond what we hold.
|
||||
const OUT_RING: usize = 8;
|
||||
|
||||
/// SDR zero-copy: how many recent WGC frames to keep alive so NVENC can encode the pool texture in
|
||||
/// place (no `CopyResource`). Each in-flight encode reads a distinct frame, so this must exceed the
|
||||
/// pipeline depth; the oldest is released once `HELD_FRAMES` newer ones exist.
|
||||
const HELD_FRAMES: usize = 8;
|
||||
/// WGC frame-pool buffer count. Must exceed `HELD_FRAMES` so the compositor always has free buffers
|
||||
/// to render into while we hold frames for in-place (zero-copy) SDR encode.
|
||||
const WGC_POOL_BUFFERS: i32 = 10;
|
||||
|
||||
/// The host runs as SYSTEM (so the DDA secure-desktop path works), but WGC will NOT activate under
|
||||
/// the SYSTEM account (`CreateForMonitor` → 0x80070424). Impersonate the interactive console user
|
||||
/// for the WGC activation. Returns the user token (the caller reverts + closes it after activation)
|
||||
@@ -112,8 +130,27 @@ pub struct WgcCapturer {
|
||||
hdr_conv: Option<HdrConverter>,
|
||||
fp16_src: Option<ID3D11Texture2D>,
|
||||
fp16_srv: Option<ID3D11ShaderResourceView>,
|
||||
hdr10_out: Option<ID3D11Texture2D>,
|
||||
bgra_copy: Option<ID3D11Texture2D>,
|
||||
/// Ring of host-owned HDR output textures (R10G10B10A2; SDR is zero-copy), rotated per processed
|
||||
/// frame. A ring — not one texture — is required because the encode loop is PIPELINED: NVENC
|
||||
/// encodes frame N (in place, registered by pointer) while this capturer produces frame N+1, so
|
||||
/// N+1 must land in a DIFFERENT texture or it clobbers the in-flight encode. (`fp16_src` stays
|
||||
/// single: it's only touched within the D3D11 immediate context, whose op ordering already
|
||||
/// serializes the convert's read against the next copy's write — NVENC's async engine read is the
|
||||
/// only consumer that escapes that ordering, and it reads the ring output, never `fp16_src`.)
|
||||
out_ring: Vec<ID3D11Texture2D>,
|
||||
ring_idx: usize,
|
||||
/// Video-processor RGB→YUV converter (off the 3D engine where possible) + its NV12/P010 output
|
||||
/// ring. Preferred path: the OS-composited capture (cursor already in it) is converted DIRECTLY to
|
||||
/// NVENC's native YUV — no `CopyResource`, no cursor draw, and NVENC skips its internal RGB→YUV.
|
||||
/// `None`/error → falls back to the legacy SDR-zero-copy / HDR-shader paths.
|
||||
video_conv: Option<VideoConverter>,
|
||||
yuv_out: Vec<ID3D11Texture2D>,
|
||||
yuv_idx: usize,
|
||||
yuv_is_hdr: bool,
|
||||
vp_disabled: bool,
|
||||
/// SDR zero-copy: the recent WGC frames we hand to NVENC in place. Held so the pool doesn't
|
||||
/// recycle the texture mid-encode; the oldest is released once `HELD_FRAMES` newer ones exist.
|
||||
held: VecDeque<Direct3D11CaptureFrame>,
|
||||
/// Last presentable GPU texture + format, repeated when no new frame arrived (static desktop).
|
||||
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
|
||||
|
||||
@@ -204,10 +241,15 @@ impl WgcCapturer {
|
||||
} else {
|
||||
DirectXPixelFormat::B8G8R8A8UIntNormalized
|
||||
};
|
||||
// ≥3 buffers for 240 Hz headroom (avoid the producer waiting on a free buffer).
|
||||
let pool =
|
||||
Direct3D11CaptureFramePool::CreateFreeThreaded(&d3d_device, pixel_format, 3, size)
|
||||
.context("CreateFreeThreaded frame pool")?;
|
||||
// Extra buffers: SDR zero-copy holds the last `HELD_FRAMES` frames (encoded in place), so
|
||||
// the pool needs headroom beyond that for the producer to keep rendering at 240 Hz.
|
||||
let pool = Direct3D11CaptureFramePool::CreateFreeThreaded(
|
||||
&d3d_device,
|
||||
pixel_format,
|
||||
WGC_POOL_BUFFERS,
|
||||
size,
|
||||
)
|
||||
.context("CreateFreeThreaded frame pool")?;
|
||||
|
||||
let signal = Arc::new(WgcSignal {
|
||||
available: AtomicU64::new(0),
|
||||
@@ -278,8 +320,14 @@ impl WgcCapturer {
|
||||
hdr_conv: None,
|
||||
fp16_src: None,
|
||||
fp16_srv: None,
|
||||
hdr10_out: None,
|
||||
bgra_copy: None,
|
||||
out_ring: Vec::new(),
|
||||
ring_idx: 0,
|
||||
video_conv: None,
|
||||
yuv_out: Vec::new(),
|
||||
yuv_idx: 0,
|
||||
yuv_is_hdr: false,
|
||||
vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(),
|
||||
held: VecDeque::new(),
|
||||
last_present: None,
|
||||
_keepalive: None,
|
||||
})
|
||||
@@ -347,38 +395,112 @@ impl WgcCapturer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
unsafe fn ensure_hdr10_out(&mut self) -> Result<()> {
|
||||
if self.hdr10_out.is_none() {
|
||||
let desc = tex_desc(
|
||||
self.width,
|
||||
self.height,
|
||||
DXGI_FORMAT_R10G10B10A2_UNORM,
|
||||
D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||
);
|
||||
/// Lazily allocate the HDR output texture ring (R10G10B10A2, the convert pass's render target →
|
||||
/// NVENC input), `RENDER_TARGET`-bindable. SDR is zero-copy (encodes the WGC pool texture in
|
||||
/// place) and uses no ring.
|
||||
unsafe fn ensure_out_ring(
|
||||
&mut self,
|
||||
format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT,
|
||||
) -> Result<()> {
|
||||
if !self.out_ring.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
let desc = tex_desc(
|
||||
self.width,
|
||||
self.height,
|
||||
format,
|
||||
D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||
);
|
||||
for _ in 0..OUT_RING {
|
||||
let mut t = None;
|
||||
self.device
|
||||
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||
.context("CreateTexture2D(wgc hdr10 out)")?;
|
||||
self.hdr10_out = t;
|
||||
.context("CreateTexture2D(wgc out ring)")?;
|
||||
self.out_ring.push(t.context("wgc out ring tex")?);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
unsafe fn ensure_bgra(&mut self) -> Result<()> {
|
||||
if self.bgra_copy.is_none() {
|
||||
/// Convert `input` (the OS-composited WGC pool texture: BGRA or scRGB FP16) → NVENC's native YUV
|
||||
/// (NV12 / P010) on the video processor. Returns the YUV texture (from a ring so consecutive
|
||||
/// encodes don't collide), or `None` to fall back to the legacy RGB paths.
|
||||
unsafe fn convert_to_yuv(
|
||||
&mut self,
|
||||
input: &ID3D11Texture2D,
|
||||
hdr: bool,
|
||||
) -> Option<ID3D11Texture2D> {
|
||||
if self.vp_disabled {
|
||||
return None;
|
||||
}
|
||||
if self.video_conv.is_none() || self.yuv_out.is_empty() || self.yuv_is_hdr != hdr {
|
||||
self.video_conv = None;
|
||||
self.yuv_out.clear();
|
||||
self.yuv_idx = 0;
|
||||
let vc = match VideoConverter::new(
|
||||
&self.device,
|
||||
&self.context,
|
||||
self.width,
|
||||
self.height,
|
||||
hdr,
|
||||
) {
|
||||
Ok(vc) => vc,
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %format!("{e:#}"),
|
||||
"WGC: video processor unavailable — falling back to RGB path");
|
||||
self.vp_disabled = true;
|
||||
return None;
|
||||
}
|
||||
};
|
||||
let fmt = if hdr {
|
||||
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010
|
||||
} else {
|
||||
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12
|
||||
};
|
||||
let desc = tex_desc(
|
||||
self.width,
|
||||
self.height,
|
||||
DXGI_FORMAT_B8G8R8A8_UNORM,
|
||||
fmt,
|
||||
D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||
);
|
||||
let mut t = None;
|
||||
self.device
|
||||
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||
.context("CreateTexture2D(wgc bgra)")?;
|
||||
self.bgra_copy = t;
|
||||
for _ in 0..OUT_RING {
|
||||
let mut t = None;
|
||||
if self
|
||||
.device
|
||||
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||
.is_err()
|
||||
{
|
||||
tracing::warn!("WGC: CreateTexture2D(YUV) failed — falling back to RGB path");
|
||||
self.vp_disabled = true;
|
||||
self.yuv_out.clear();
|
||||
return None;
|
||||
}
|
||||
let Some(tex) = t else {
|
||||
self.vp_disabled = true;
|
||||
self.yuv_out.clear();
|
||||
return None;
|
||||
};
|
||||
self.yuv_out.push(tex);
|
||||
}
|
||||
self.video_conv = Some(vc);
|
||||
self.yuv_is_hdr = hdr;
|
||||
tracing::info!(
|
||||
hdr,
|
||||
"WGC: video-processor YUV path active ({})",
|
||||
if hdr { "P010" } else { "NV12" }
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
let slot = self.yuv_idx;
|
||||
self.yuv_idx = (self.yuv_idx + 1) % self.yuv_out.len();
|
||||
let out = self.yuv_out[slot].clone();
|
||||
if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) {
|
||||
tracing::warn!(error = %format!("{e:#}"),
|
||||
"WGC: VideoProcessorBlt failed — falling back to RGB path");
|
||||
self.vp_disabled = true;
|
||||
self.video_conv = None;
|
||||
self.yuv_out.clear();
|
||||
return None;
|
||||
}
|
||||
Some(out)
|
||||
}
|
||||
|
||||
fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result<CapturedFrame> {
|
||||
@@ -391,13 +513,38 @@ impl WgcCapturer {
|
||||
.GetInterface()
|
||||
.context("GetInterface ID3D11Texture2D")?;
|
||||
|
||||
// Preferred path: convert the OS-composited capture (cursor already in it) DIRECTLY to
|
||||
// NVENC's native YUV on the video processor — no CopyResource, no cursor draw, and NVENC
|
||||
// skips its internal RGB→YUV (the contended 3D step). WGC's multi-buffer pool + held set
|
||||
// means reading the pool texture directly does NOT serialize (unlike DDA's single-frame
|
||||
// model). The frame is held until the async Blt finishes.
|
||||
if let Some(yuv) = self.convert_to_yuv(&src, self.hdr) {
|
||||
let fmt = if self.hdr {
|
||||
PixelFormat::P010
|
||||
} else {
|
||||
PixelFormat::Nv12
|
||||
};
|
||||
self.last_present = Some((yuv.clone(), fmt));
|
||||
let out = self.d3d11_frame(yuv, fmt);
|
||||
self.held.push_back(frame);
|
||||
while self.held.len() > HELD_FRAMES {
|
||||
self.held.pop_front();
|
||||
}
|
||||
return Ok(out);
|
||||
}
|
||||
|
||||
// --- fallback (video processor unavailable) ---
|
||||
if self.hdr {
|
||||
// Next ring slot — the in-flight encode reads the slot we handed out last time, so
|
||||
// this capture must land in a different one (see `out_ring`).
|
||||
let slot = self.ring_idx;
|
||||
self.ring_idx = (self.ring_idx + 1) % OUT_RING;
|
||||
// FP16 (cursor already composited by the OS) → BT.2020 PQ 10-bit for NVENC.
|
||||
self.ensure_fp16_src()?;
|
||||
let fp16 = self.fp16_src.clone().context("fp16 src")?;
|
||||
self.context.CopyResource(&fp16, &src);
|
||||
self.ensure_hdr10_out()?;
|
||||
let out = self.hdr10_out.clone().context("hdr10 out")?;
|
||||
self.ensure_out_ring(DXGI_FORMAT_R10G10B10A2_UNORM)?;
|
||||
let out = self.out_ring[slot].clone();
|
||||
if self.hdr_conv.is_none() {
|
||||
self.hdr_conv = Some(HdrConverter::new(&self.device)?);
|
||||
}
|
||||
@@ -416,12 +563,19 @@ impl WgcCapturer {
|
||||
self.last_present = Some((out.clone(), PixelFormat::Rgb10a2));
|
||||
Ok(self.d3d11_frame(out, PixelFormat::Rgb10a2))
|
||||
} else {
|
||||
// SDR: copy out of the recycled pool texture (cursor already composited) and hand off.
|
||||
self.ensure_bgra()?;
|
||||
let bgra = self.bgra_copy.clone().context("bgra copy")?;
|
||||
self.context.CopyResource(&bgra, &src);
|
||||
self.last_present = Some((bgra.clone(), PixelFormat::Bgra));
|
||||
Ok(self.d3d11_frame(bgra, PixelFormat::Bgra))
|
||||
// SDR ZERO-COPY: hand NVENC the WGC pool texture DIRECTLY — no `CopyResource`. The
|
||||
// per-frame copy otherwise queues on the graphics engine behind a GPU-saturating game
|
||||
// and stalls `lock_bitstream` ~20 ms (NVENC sits idle waiting for its input). Encoding
|
||||
// the pool texture in place removes that graphics-queue dependency (Apollo's model).
|
||||
// We must keep the frame alive until its async encode finishes, so retain the last
|
||||
// `HELD_FRAMES`; the pool has spare buffers so the producer never starves.
|
||||
self.last_present = Some((src.clone(), PixelFormat::Bgra));
|
||||
let out = self.d3d11_frame(src, PixelFormat::Bgra);
|
||||
self.held.push_back(frame);
|
||||
while self.held.len() > HELD_FRAMES {
|
||||
self.held.pop_front();
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user