perf(host/windows): move capture→encode off the 3D engine (NV12/P010 video-processor path, zero-copy, GPU priority)
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
The Windows host capped at ~60 fps with 35-40 ms latency on a GPU-heavy game: the per-frame capture→encode path shared the 3D engine with the game and got scheduled behind it.

Rework to minimize 3D-engine work per frame:

- VideoConverter (D3D11 video processor): capture → NVENC-native NV12/P010 so NVENC skips its internal RGB→YUV (a 3D/compute step). Wired into both DDA (dxgi.rs) and WGC (wgc.rs). New PixelFormat::Nv12/P010 + NVENC YUV input.
- GPU scheduling hardening (Apollo-style): D3DKMTSetProcessSchedulingPriorityClass HIGH, absolute SetGPUThreadPriority, SetMaximumFrameLatency(1).
- WGC SDR zero-copy (hold pool frames; no CopyResource). DDA keeps a fast CopyResource to decouple its single-frame acquire/release from the async convert.
- Pipelined helper encode loop (PUNKTFUNK_ENCODE_DEPTH, default 1) + perf split (cap_wait / encode / write).

Live on the RTX 4090: hard 60 fps ceiling removed (now scene-scaling 40-200+), latency much reduced. Residual cap in GPU-pinned scenes is the irreducible RGB→YUV convert (no fixed-function unit on NVIDIA — VideoProcessing engine reads 0%) waiting behind an uncapped game under WDDM context time-slicing; Linux avoids it via gamescope capping the game to the display refresh.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -103,9 +103,10 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) {
|
||||
PixelFormat::Rgba => (Pixel::RGBA, false),
|
||||
PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0
|
||||
PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0
|
||||
// 10-bit HDR (R10G10B10A2) is produced only by the Windows DXGI HDR capture path; the Linux
|
||||
// capturer never emits it. Map to BGRA so the match is exhaustive — unreachable here.
|
||||
PixelFormat::Rgb10a2 => (Pixel::BGRA, false),
|
||||
// Rgb10a2 (HDR) and NV12/P010 (the Windows video-processor YUV outputs) are produced only by
|
||||
// the Windows capture/encode paths; the Linux capturer never emits them. Map to BGRA so the
|
||||
// match is exhaustive — unreachable here.
|
||||
PixelFormat::Rgb10a2 | PixelFormat::Nv12 | PixelFormat::P010 => (Pixel::BGRA, false),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,10 @@ use windows::Win32::Graphics::Direct3D11::{ID3D11Device, ID3D11Texture2D};
|
||||
use nvidia_video_codec_sdk::sys::nvEncodeAPI as nv;
|
||||
use nvidia_video_codec_sdk::ENCODE_API as API;
|
||||
|
||||
const POOL: usize = 4;
|
||||
// Output bitstream buffers = max in-flight encodes. The helper deep-pipelines (submits several frames
|
||||
// before locking the oldest) so per-frame GPU-scheduling waits OVERLAP instead of serializing under a
|
||||
// GPU-saturating game; this must be ≥ the helper's `PUNKTFUNK_ENCODE_DEPTH` (default 4, clamped ≤ 6).
|
||||
const POOL: usize = 8;
|
||||
|
||||
fn codec_guid(codec: Codec) -> nv::GUID {
|
||||
match codec {
|
||||
@@ -363,7 +366,9 @@ impl Encoder for NvencD3d11Encoder {
|
||||
// frame arrives on a different device OR at a different size than our session was built on.
|
||||
// HDR (BT.2020 PQ 10-bit) when the capturer hands us a 10-bit R10G10B10A2 frame. This can flip
|
||||
// mid-session when the user toggles HDR (which arrives as a capture device recreate anyway).
|
||||
let hdr = matches!(captured.format, PixelFormat::Rgb10a2);
|
||||
// HDR (BT.2020 PQ) when the capturer hands a 10-bit frame — either R10G10B10A2 (the legacy
|
||||
// shader path) or P010 (the video-processor path). 8-bit NV12/ARGB → SDR.
|
||||
let hdr = matches!(captured.format, PixelFormat::Rgb10a2 | PixelFormat::P010);
|
||||
let dev_raw = frame.device.as_raw();
|
||||
let size_changed =
|
||||
self.inited && (self.width != captured.width || self.height != captured.height);
|
||||
@@ -384,13 +389,22 @@ impl Encoder for NvencD3d11Encoder {
|
||||
self.width = captured.width;
|
||||
self.height = captured.height;
|
||||
self.hdr = hdr;
|
||||
if hdr {
|
||||
// 10-bit BT.2020 PQ input; force Main10 regardless of the negotiated SDR bit depth.
|
||||
self.bit_depth = 10;
|
||||
self.buffer_fmt = nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10;
|
||||
} else {
|
||||
self.buffer_fmt = nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB;
|
||||
}
|
||||
// Pick the NVENC input format from the captured pixel format. YUV (NV12/P010) is the
|
||||
// video-processor path — NVENC encodes it natively (no internal RGB→YUV, which is a hidden
|
||||
// 3D/compute step that would fight a GPU-saturating game). RGB (ARGB/ABGR10) is the legacy
|
||||
// shader path. 10-bit (P010/ABGR10) forces HEVC Main10 + the BT.2020 PQ VUI.
|
||||
self.buffer_fmt = match captured.format {
|
||||
PixelFormat::P010 => {
|
||||
self.bit_depth = 10;
|
||||
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_YUV420_10BIT
|
||||
}
|
||||
PixelFormat::Rgb10a2 => {
|
||||
self.bit_depth = 10;
|
||||
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10
|
||||
}
|
||||
PixelFormat::Nv12 => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12,
|
||||
_ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
|
||||
};
|
||||
let device = frame.device.clone();
|
||||
self.init_session(&device)?;
|
||||
self.init_device = dev_raw;
|
||||
|
||||
@@ -151,6 +151,13 @@ impl Encoder for OpenH264Encoder {
|
||||
PixelFormat::Rgb10a2 => {
|
||||
anyhow::bail!("software H.264 encoder cannot encode 10-bit HDR (Rgb10a2)")
|
||||
}
|
||||
// NV12/P010 are GPU-resident video-processor outputs for the NVENC path; the software
|
||||
// encoder never receives them (it only gets CPU RGB frames).
|
||||
PixelFormat::Nv12 | PixelFormat::P010 => {
|
||||
anyhow::bail!(
|
||||
"software encoder cannot encode YUV GPU textures (NV12/P010 → NVENC only)"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
if self.force_kf {
|
||||
|
||||
Reference in New Issue
Block a user