From 4cc57d5c39bcab15150db449ae2cb0fe7b8fa556 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Wed, 17 Jun 2026 13:08:03 +0000 Subject: [PATCH] =?UTF-8?q?perf(host/windows):=20move=20capture=E2=86=92en?= =?UTF-8?q?code=20off=20the=203D=20engine=20(NV12/P010=20video-processor?= =?UTF-8?q?=20path,=20zero-copy,=20GPU=20priority)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Windows host capped at ~60 fps with 35-40 ms latency on a GPU-heavy game: the per-frame capture→encode path shared the 3D engine with the game and got scheduled behind it. Rework to minimize 3D-engine work per frame: - VideoConverter (D3D11 video processor): capture → NVENC-native NV12/P010 so NVENC skips its internal RGB→YUV (a 3D/compute step). Wired into both DDA (dxgi.rs) and WGC (wgc.rs). New PixelFormat::Nv12/P010 + NVENC YUV input. - GPU scheduling hardening (Apollo-style): D3DKMTSetProcessSchedulingPriorityClass HIGH, absolute SetGPUThreadPriority, SetMaximumFrameLatency(1). - WGC SDR zero-copy (hold pool frames; no CopyResource). DDA keeps a fast CopyResource to decouple its single-frame acquire/release from the async convert. - Pipelined helper encode loop (PUNKTFUNK_ENCODE_DEPTH, default 1) + perf split (cap_wait / encode / write). Live on the RTX 4090: hard 60 fps ceiling removed (now scene-scaling 40-200+), latency much reduced. Residual cap in GPU-pinned scenes is the irreducible RGB→YUV convert (no fixed-function unit on NVIDIA — VideoProcessing engine reads 0%) waiting behind an uncapped game under WDDM context time-slicing; Linux avoids it via gamescope capping the game to the display refresh. 
Co-Authored-By: Claude Opus 4.8 --- crates/punktfunk-host/src/capture.rs | 7 + crates/punktfunk-host/src/capture/dxgi.rs | 393 +++++++++++++++++++++- crates/punktfunk-host/src/capture/wgc.rs | 230 ++++++++++--- crates/punktfunk-host/src/encode/linux.rs | 7 +- crates/punktfunk-host/src/encode/nvenc.rs | 32 +- crates/punktfunk-host/src/encode/sw.rs | 7 + crates/punktfunk-host/src/wgc_helper.rs | 76 ++++- crates/punktfunk-host/src/zerocopy/mod.rs | 4 +- 8 files changed, 689 insertions(+), 67 deletions(-) diff --git a/crates/punktfunk-host/src/capture.rs b/crates/punktfunk-host/src/capture.rs index fe3f9f1..46b7d89 100644 --- a/crates/punktfunk-host/src/capture.rs +++ b/crates/punktfunk-host/src/capture.rs @@ -26,6 +26,13 @@ pub enum PixelFormat { /// produces this: scRGB FP16 desktop pixels are converted to BT.2020 PQ and written here, then /// handed to NVENC as `ABGR10` for an HEVC Main10 / HDR10 encode. Rgb10a2, + /// `NV12` (DXGI `NV12`): 8-bit BT.709 limited-range YUV 4:2:0. Produced by the D3D11 **video + /// processor** (video engine, not the 3D engine) so the per-frame colour conversion doesn't fight a + /// GPU-saturating game; handed to NVENC as `NV12` (it encodes YUV natively — no internal RGB→YUV). + Nv12, + /// `P010` (DXGI `P010`): 10-bit BT.2020 PQ limited-range YUV 4:2:0. HDR analogue of [`Nv12`]: + /// video-processor output for HEVC Main10 / HDR10, handed to NVENC as `YUV420_10BIT`. 
+ P010, } impl PixelFormat { diff --git a/crates/punktfunk-host/src/capture/dxgi.rs b/crates/punktfunk-host/src/capture/dxgi.rs index 65ef733..6b6c5d5 100644 --- a/crates/punktfunk-host/src/capture/dxgi.rs +++ b/crates/punktfunk-host/src/capture/dxgi.rs @@ -37,12 +37,12 @@ use windows::Win32::Graphics::Dxgi::Common::{ DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, }; use windows::Win32::Graphics::Dxgi::{ - CreateDXGIFactory1, IDXGIAdapter1, IDXGIFactory1, IDXGIOutput1, IDXGIOutput5, - IDXGIOutputDuplication, IDXGIResource, DXGI_ERROR_ACCESS_LOST, DXGI_ERROR_DEVICE_REMOVED, - DXGI_ERROR_DEVICE_RESET, DXGI_ERROR_INVALID_CALL, DXGI_ERROR_MODE_CHANGE_IN_PROGRESS, - DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC, DXGI_OUTDUPL_FRAME_INFO, - DXGI_OUTDUPL_POINTER_SHAPE_INFO, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR, - DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR, + CreateDXGIFactory1, IDXGIAdapter1, IDXGIDevice, IDXGIDevice1, IDXGIFactory1, IDXGIOutput1, + IDXGIOutput5, IDXGIOutputDuplication, IDXGIResource, DXGI_ERROR_ACCESS_LOST, + DXGI_ERROR_DEVICE_REMOVED, DXGI_ERROR_DEVICE_RESET, DXGI_ERROR_INVALID_CALL, + DXGI_ERROR_MODE_CHANGE_IN_PROGRESS, DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC, + DXGI_OUTDUPL_FRAME_INFO, DXGI_OUTDUPL_POINTER_SHAPE_INFO, + DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR, }; use windows::Win32::System::StationsAndDesktops::{ CloseDesktop, OpenInputDesktop, SetThreadDesktop, DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS, @@ -147,10 +147,119 @@ pub(crate) unsafe fn make_device( Some(&mut context), ) .context("D3D11CreateDevice")?; - Ok(( - device.context("null D3D11 device")?, - context.context("null D3D11 context")?, - )) + let device = device.context("null D3D11 device")?; + let context = context.context("null D3D11 context")?; + + // Apollo-style GPU scheduling hardening (Sunshine display_base.cpp:599-709). 
Our capture+encode + // shares the GPU with the streamed game; when the game saturates the GPU our process is starved of + // GPU time slices, so NVENC sits near-idle yet `lock_bitstream` waits ~20 ms for our context to be + // scheduled — capping the stream (~47 fps measured at 5K@240) and stuttering. Per-frame copy/convert + // is NOT the cause (zero-copy + thread-priority alone didn't move it); the PROCESS-level GPU + // scheduling priority class is the decisive cross-process lever. Secondary: the absolute per-device + // GPU thread priority and a 1-frame latency cap. + elevate_process_gpu_priority(); + if let Ok(dxgi_dev) = device.cast::() { + // Apollo's absolute max GPU thread priority (0x4000001E); fall back to relative +7. + if dxgi_dev.SetGPUThreadPriority(0x4000_001E).is_err() + && dxgi_dev.SetGPUThreadPriority(7).is_err() + { + tracing::warn!("SetGPUThreadPriority failed (run as admin/SYSTEM for GPU priority)"); + } + } + if let Ok(dxgi1) = device.cast::() { + let _ = dxgi1.SetMaximumFrameLatency(1); + } + Ok((device, context)) +} + +/// Apollo-style GPU scheduling-priority hardening (Sunshine `display_base.cpp:599-709`). On a +/// GPU-saturated game our capture+encode process is starved of GPU time slices — NVENC sits ~idle but +/// `lock_bitstream` waits ~20 ms for our context to be scheduled. Elevating the PROCESS GPU scheduling +/// priority class (the strong cross-process lever — far more effective than `SetGPUThreadPriority` +/// alone, which we measured as no help) lets our brief encode preempt the game. Uses HIGH, NOT +/// realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC (Apollo downgrades it for exactly this). +/// Runs once per process; best-effort. `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime` +/// (default high). 
+fn elevate_process_gpu_priority() { + use std::sync::Once; + static ONCE: Once = Once::new(); + ONCE.call_once(|| unsafe { + use windows::core::{s, PCWSTR}; + use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID}; + use windows::Win32::Security::{ + AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES, + SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES, + TOKEN_PRIVILEGES, TOKEN_QUERY, + }; + use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA}; + use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken}; + + // D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4, + // REALTIME 5. + let prio: i32 = match std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS").ok().as_deref() { + Some("off") => { + tracing::info!("GPU process scheduling priority class left at default (off)"); + return; + } + Some("normal") => 2, + Some("realtime") => 5, + _ => 4, // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC) + }; + + // 1. Enable SE_INC_BASE_PRIORITY so the kernel permits the GPU priority bump. + let mut token = HANDLE::default(); + if OpenProcessToken( + GetCurrentProcess(), + TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, + &mut token, + ) + .is_ok() + { + let mut luid = LUID::default(); + if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() { + let tp = TOKEN_PRIVILEGES { + PrivilegeCount: 1, + Privileges: [LUID_AND_ATTRIBUTES { + Luid: luid, + Attributes: SE_PRIVILEGE_ENABLED, + }], + }; + if AdjustTokenPrivileges( + token, + false, + Some(&tp as *const TOKEN_PRIVILEGES), + 0, + None, + None, + ) + .is_err() + { + tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority"); + } + } + let _ = CloseHandle(token); + } + + // 2. D3DKMTSetProcessSchedulingPriorityClass via gdi32 (no stable windows-rs binding). 
+ if let Ok(gdi32) = LoadLibraryA(s!("gdi32.dll")) { + if let Some(p) = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass")) { + type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32; + let f: SetPrio = std::mem::transmute(p); + let st = f(GetCurrentProcess(), prio); + if st == 0 { + tracing::info!( + priority_class = prio, + "GPU process scheduling priority class set (2=normal 4=high 5=realtime)" + ); + } else { + tracing::warn!( + status = format!("0x{st:08X}"), + "D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)" + ); + } + } + } + }); } /// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST @@ -827,6 +936,135 @@ impl HdrConverter { } } +use windows::Win32::Graphics::Direct3D11::{ + ID3D11VideoContext1, ID3D11VideoDevice, ID3D11VideoProcessor, ID3D11VideoProcessorEnumerator, + ID3D11VideoProcessorInputView, ID3D11VideoProcessorOutputView, D3D11_TEX2D_VPIV, + D3D11_TEX2D_VPOV, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE, D3D11_VIDEO_PROCESSOR_CONTENT_DESC, + D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0, + D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0, + D3D11_VIDEO_PROCESSOR_STREAM, D3D11_VIDEO_USAGE_PLAYBACK_NORMAL, + D3D11_VPIV_DIMENSION_TEXTURE2D, D3D11_VPOV_DIMENSION_TEXTURE2D, +}; +use windows::Win32::Graphics::Dxgi::Common::{ + DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709, + DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020, DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709, + DXGI_RATIONAL, +}; + +/// D3D11 **Video Processor** colour/format converter — runs on the GPU's dedicated VIDEO engine, NOT +/// the 3D engine, so the per-frame RGB→YUV conversion does not contend with a GPU-saturating game (the +/// HDR pixel-shader path and NVENC's internal RGB→YUV both use the 3D/compute engine, which an AAA +/// title pins at ~100%). 
Output is NV12 (SDR, BT.709 studio-range) or P010 (HDR, BT.2020 PQ +/// studio-range) — NVENC's native YUV inputs, so it encodes them with no further conversion. +pub(crate) struct VideoConverter { + vdev: ID3D11VideoDevice, + vctx: ID3D11VideoContext1, + enumr: ID3D11VideoProcessorEnumerator, + vp: ID3D11VideoProcessor, +} + +impl VideoConverter { + pub(crate) unsafe fn new( + device: &ID3D11Device, + context: &ID3D11DeviceContext, + width: u32, + height: u32, + hdr: bool, + ) -> Result { + let vdev: ID3D11VideoDevice = device.cast().context("device -> ID3D11VideoDevice")?; + let vctx: ID3D11VideoContext1 = context.cast().context("context -> ID3D11VideoContext1")?; + let rate = DXGI_RATIONAL { + Numerator: 240, + Denominator: 1, + }; + let desc = D3D11_VIDEO_PROCESSOR_CONTENT_DESC { + InputFrameFormat: D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE, + InputFrameRate: rate, + InputWidth: width, + InputHeight: height, + OutputFrameRate: rate, + OutputWidth: width, + OutputHeight: height, + Usage: D3D11_VIDEO_USAGE_PLAYBACK_NORMAL, + }; + let enumr = vdev + .CreateVideoProcessorEnumerator(&desc) + .context("CreateVideoProcessorEnumerator")?; + let vp = vdev + .CreateVideoProcessor(&enumr, 0) + .context("CreateVideoProcessor")?; + + // Full-range RGB in → studio-range YUV out. HDR: scRGB linear (G10) → BT.2020 PQ (G2084). + // SDR: sRGB (G22) → BT.709 (G22). + let (in_cs, out_cs) = if hdr { + ( + DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, + DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020, + ) + } else { + ( + DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709, + DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709, + ) + }; + vctx.VideoProcessorSetStreamColorSpace1(&vp, 0, in_cs); + vctx.VideoProcessorSetOutputColorSpace1(&vp, out_cs); + // One frame in, one frame out — no interpolation/auto-processing. 
+ vctx.VideoProcessorSetStreamFrameFormat(&vp, 0, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE); + + Ok(Self { + vdev, + vctx, + enumr, + vp, + }) + } + + /// Convert `input` (BGRA or scRGB FP16) → `output` (NV12 or P010) on the video engine. Views are + /// created per call so the input can vary per frame; the input view is released after the Blt. + pub(crate) unsafe fn convert( + &self, + input: &ID3D11Texture2D, + output: &ID3D11Texture2D, + ) -> Result<()> { + let in_desc = D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC { + FourCC: 0, + ViewDimension: D3D11_VPIV_DIMENSION_TEXTURE2D, + Anonymous: D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0 { + Texture2D: D3D11_TEX2D_VPIV { + MipSlice: 0, + ArraySlice: 0, + }, + }, + }; + let mut in_view: Option<ID3D11VideoProcessorInputView> = None; + self.vdev + .CreateVideoProcessorInputView(input, &self.enumr, &in_desc, Some(&mut in_view)) + .context("CreateVideoProcessorInputView")?; + + let out_desc = D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC { + ViewDimension: D3D11_VPOV_DIMENSION_TEXTURE2D, + Anonymous: D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0 { + Texture2D: D3D11_TEX2D_VPOV { MipSlice: 0 }, + }, + }; + let mut out_view: Option<ID3D11VideoProcessorOutputView> = None; + self.vdev + .CreateVideoProcessorOutputView(output, &self.enumr, &out_desc, Some(&mut out_view)) + .context("CreateVideoProcessorOutputView")?; + let out_view = out_view.context("null output view")?; + + let mut streams = [D3D11_VIDEO_PROCESSOR_STREAM { + Enable: true.into(), + pInputSurface: std::mem::ManuallyDrop::new(in_view), + ..Default::default() + }]; + let blt = self.vctx.VideoProcessorBlt(&self.vp, &out_view, 0, &streams); + std::mem::ManuallyDrop::drop(&mut streams[0].pInputSurface); // else leaked every frame + blt.context("VideoProcessorBlt") + } +} + /// Convert a DXGI pointer shape (color / masked-color / monochrome) into top-down BGRA. fn convert_pointer_shape(buf: &[u8], si: &DXGI_OUTDUPL_POINTER_SHAPE_INFO) -> Option { let w = si.Width as usize; @@ -1055,6 +1293,17 @@ pub struct DuplCapturer { hdr10_out: Option, /// scRGB→PQ conversion pass; rebuilt on device recreate. 
hdr_conv: Option, + /// Video-processor RGB→YUV converter (runs on the VIDEO engine, not the 3D engine) + its NV12 + /// (SDR) / P010 (HDR) output texture. This is the zero-3D path: the per-frame colour conversion and + /// NVENC's RGB→YUV both move off the 3D engine so capture+encode don't fight a GPU-saturating game. + /// Lazily built for the current size+HDR; rebuilt on change. `None`/error → falls back to the + /// legacy RGB path. Disabled with `PUNKTFUNK_NO_VIDEO_PROCESSOR=1`. + video_conv: Option, + yuv_out: Option, + /// HDR-ness the current `video_conv`/`yuv_out` were built for, so an HDR toggle rebuilds them. + yuv_is_hdr: bool, + /// Latched off after a VideoConverter failure so we don't retry it every frame (fall back to RGB). + vp_disabled: bool, /// Last time a duplication rebuild was attempted, to throttle retries during an outage (e.g. a /// secure-desktop dwell where the output is gone) so we don't block the encode loop or hammer /// DuplicateOutput — between attempts the last good frame is repeated. `None` = never attempted. @@ -1306,6 +1555,10 @@ impl DuplCapturer { fp16_srv: None, hdr10_out: None, hdr_conv: None, + video_conv: None, + yuv_out: None, + yuv_is_hdr: false, + vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(), last_rebuild: None, last_recover: None, ever_got_frame: false, @@ -1375,6 +1628,85 @@ impl DuplCapturer { Ok(()) } + /// Convert `input` (BGRA for SDR, scRGB FP16 for HDR) to NVENC's native YUV (NV12 / P010) via the + /// D3D11 **video processor** (video engine) — keeping the per-frame colour conversion AND NVENC's + /// RGB→YUV off the 3D engine so capture+encode don't fight a GPU-saturating game. Returns the YUV + /// texture, or `None` to fall back to the legacy RGB path (processor disabled/unavailable). Lazily + /// builds + caches the processor + output texture for the current size + HDR-ness. 
+ unsafe fn convert_to_yuv( + &mut self, + input: &ID3D11Texture2D, + hdr: bool, + ) -> Option { + if self.vp_disabled { + return None; + } + if self.video_conv.is_none() || self.yuv_out.is_none() || self.yuv_is_hdr != hdr { + self.video_conv = None; + self.yuv_out = None; + let vc = match VideoConverter::new( + &self.device, + &self.context, + self.width, + self.height, + hdr, + ) { + Ok(vc) => vc, + Err(e) => { + tracing::warn!(error = %format!("{e:#}"), + "video processor unavailable — falling back to RGB encode path"); + self.vp_disabled = true; + return None; + } + }; + let fmt = if hdr { + windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010 + } else { + windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12 + }; + let desc = D3D11_TEXTURE2D_DESC { + Width: self.width, + Height: self.height, + MipLevels: 1, + ArraySize: 1, + Format: fmt, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32, + CPUAccessFlags: 0, + MiscFlags: 0, + }; + let mut t: Option = None; + if let Err(e) = self.device.CreateTexture2D(&desc, None, Some(&mut t)) { + tracing::warn!(error = %format!("{e:?}"), + "CreateTexture2D(YUV out) failed — falling back to RGB encode path"); + self.vp_disabled = true; + return None; + } + self.video_conv = Some(vc); + self.yuv_out = t; + self.yuv_is_hdr = hdr; + tracing::info!( + hdr, + "video-processor YUV path active ({} on the video engine, 0% 3D)", + if hdr { "P010" } else { "NV12" } + ); + } + let out = self.yuv_out.clone()?; + if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) { + tracing::warn!(error = %format!("{e:#}"), + "VideoProcessorBlt failed — falling back to RGB encode path"); + self.vp_disabled = true; + self.video_conv = None; + self.yuv_out = None; + return None; + } + Some(out) + } + /// FP16 (`R16G16B16A16_FLOAT`) copy of the HDR duplication surface (RT for the cursor composite + /// SRV for the converter). 
Reallocated when absent (device/size change drops it). unsafe fn ensure_fp16_src(&mut self) -> Result<()> { @@ -1718,6 +2050,9 @@ impl DuplCapturer { self.fp16_srv = None; self.hdr10_out = None; self.hdr_conv = None; + // Video processor + its YUV output belonged to the old device / size / HDR-ness — rebuild lazily. + self.video_conv = None; + self.yuv_out = None; self.first_frame = true; // Capture the CURRENT desktop frame as `last_present` (instead of seeding black). The secure // (lock/login/UAC) desktop is STATIC, so DDA only emits a frame on change — if we seeded black @@ -1982,6 +2317,22 @@ impl DuplCapturer { let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); self.holding_frame = false; self.composite_cursor_gpu(&src, true)?; // onto the FP16 surface (HDR: decode + nits scale) + // Video-engine path: scRGB FP16 → BT.2020 PQ P010 on the VIDEO engine (no 3D shader, and + // NVENC encodes P010 natively). Fall back to the HdrConverter pixel shader (3D) only if the + // video processor is unavailable. + if let Some(p010) = self.convert_to_yuv(&src, true) { + self.last_present = Some((p010.clone(), PixelFormat::P010)); + return Ok(CapturedFrame { + width: self.width, + height: self.height, + pts_ns: now_ns(), + format: PixelFormat::P010, + payload: FramePayload::D3d11(D3d11Frame { + texture: p010, + device: self.device.clone(), + }), + }); + } self.ensure_hdr10_out()?; let out = self.hdr10_out.clone().context("hdr10 out texture")?; if self.hdr_conv.is_none() { @@ -2014,12 +2365,34 @@ impl DuplCapturer { if self.gpu_mode { // Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication // surface into a reused owned texture, release the duplication frame, hand off the texture. 
+ // NOTE: do NOT convert the duplication surface directly on the video processor to skip this + // copy — the VP colour-convert (3D/compute on NVIDIA) holds the DDA surface until it + // completes, blocking ReleaseFrame/AcquireNextFrame and SERIALIZING capture+convert (~60 fps, + // encode_us 15-20 ms measured). The fast same-format CopyResource decouples them: it releases + // the DDA frame immediately so the convert runs independently (40-200 fps). Worth ~5% 3D. self.ensure_gpu_copy()?; let gpu = self.gpu_copy.clone().context("gpu copy texture")?; self.context.CopyResource(&gpu, &tex); let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame()); self.holding_frame = false; self.composite_cursor_gpu(&gpu, false)?; + // Prefer the video-engine YUV path (BGRA → NV12 on the video engine) so the colour + // conversion AND NVENC's encode stay OFF the 3D engine — the only way to keep up when a + // game pins the 3D engine at ~100%. Fall back to handing NVENC the BGRA texture (it then + // does RGB→YUV internally on the 3D/compute engine). + if let Some(nv12) = self.convert_to_yuv(&gpu, false) { + self.last_present = Some((nv12.clone(), PixelFormat::Nv12)); + return Ok(CapturedFrame { + width: self.width, + height: self.height, + pts_ns: now_ns(), + format: PixelFormat::Nv12, + payload: FramePayload::D3d11(D3d11Frame { + texture: nv12, + device: self.device.clone(), + }), + }); + } self.last_present = Some((gpu.clone(), PixelFormat::Bgra)); return Ok(CapturedFrame { width: self.width, diff --git a/crates/punktfunk-host/src/capture/wgc.rs b/crates/punktfunk-host/src/capture/wgc.rs index 6d4c888..7a144e0 100644 --- a/crates/punktfunk-host/src/capture/wgc.rs +++ b/crates/punktfunk-host/src/capture/wgc.rs @@ -17,10 +17,12 @@ //! the DDA backend ([`super::dxgi::DuplCapturer`]) for those (see capture.rs). 
use super::dxgi::{ - find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, WinCaptureTarget, + find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, VideoConverter, + WinCaptureTarget, }; use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; use anyhow::{bail, Context, Result}; +use std::collections::VecDeque; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex}; use std::time::{Duration, Instant}; @@ -37,8 +39,8 @@ use windows::Win32::Graphics::Direct3D11::{ D3D11_USAGE_DEFAULT, }; use windows::Win32::Graphics::Dxgi::Common::{ - DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_B8G8R8A8_UNORM, - DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, + DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_R10G10B10A2_UNORM, + DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, }; use windows::Win32::Graphics::Dxgi::{IDXGIDevice, IDXGIOutput6}; use windows::Win32::Security::{ImpersonateLoggedOnUser, RevertToSelf}; @@ -49,6 +51,22 @@ use windows::Win32::System::WinRT::Direct3D11::{ use windows::Win32::System::WinRT::Graphics::Capture::IGraphicsCaptureItemInterop; use windows::Win32::System::WinRT::{RoInitialize, RO_INIT_MULTITHREADED}; +/// Output texture ring depth. The encode loop is pipelined: NVENC encodes earlier frames in place +/// while the capturer produces the next, so every in-flight encode needs its own live texture. The +/// ring must therefore exceed the pipeline depth, or a new frame clobbers one still being read. +// Sized for the deep encode pipeline (`PUNKTFUNK_ENCODE_DEPTH`, default 4, clamped ≤ 6): up to DEPTH +// frames are in flight in NVENC at once, so the HDR convert ring and the SDR held-frame set must each +// keep DEPTH(+headroom) live textures, and the WGC pool needs spare buffers beyond what we hold. +const OUT_RING: usize = 8; + +/// SDR zero-copy: how many recent WGC frames to keep alive so NVENC can encode the pool texture in +/// place (no `CopyResource`). 
Each in-flight encode reads a distinct frame, so this must exceed the +/// pipeline depth; the oldest is released once `HELD_FRAMES` newer ones exist. +const HELD_FRAMES: usize = 8; +/// WGC frame-pool buffer count. Must exceed `HELD_FRAMES` so the compositor always has free buffers +/// to render into while we hold frames for in-place (zero-copy) SDR encode. +const WGC_POOL_BUFFERS: i32 = 10; + /// The host runs as SYSTEM (so the DDA secure-desktop path works), but WGC will NOT activate under /// the SYSTEM account (`CreateForMonitor` → 0x80070424). Impersonate the interactive console user /// for the WGC activation. Returns the user token (the caller reverts + closes it after activation) @@ -112,8 +130,27 @@ pub struct WgcCapturer { hdr_conv: Option, fp16_src: Option, fp16_srv: Option, - hdr10_out: Option, - bgra_copy: Option, + /// Ring of host-owned output textures (BGRA for SDR, R10G10B10A2 for HDR), rotated per processed + /// frame. A ring — not one texture — is required because the encode loop is PIPELINED: NVENC + /// encodes frame N (in place, registered by pointer) while this capturer produces frame N+1, so + /// N+1 must land in a DIFFERENT texture or it clobbers the in-flight encode. (`fp16_src` stays + /// single: it's only touched within the D3D11 immediate context, whose op ordering already + /// serializes the convert's read against the next copy's write — NVENC's async engine read is the + /// only consumer that escapes that ordering, and it reads the ring output, never `fp16_src`.) + out_ring: Vec, + ring_idx: usize, + /// Video-processor RGB→YUV converter (off the 3D engine where possible) + its NV12/P010 output + /// ring. Preferred path: the OS-composited capture (cursor already in it) is converted DIRECTLY to + /// NVENC's native YUV — no `CopyResource`, no cursor draw, and NVENC skips its internal RGB→YUV. + /// `None`/error → falls back to the legacy SDR-zero-copy / HDR-shader paths. 
+ video_conv: Option, + yuv_out: Vec, + yuv_idx: usize, + yuv_is_hdr: bool, + vp_disabled: bool, + /// SDR zero-copy: the recent WGC frames we hand to NVENC in place. Held so the pool doesn't + /// recycle the texture mid-encode; the oldest is released once `HELD_FRAMES` newer ones exist. + held: VecDeque, /// Last presentable GPU texture + format, repeated when no new frame arrived (static desktop). last_present: Option<(ID3D11Texture2D, PixelFormat)>, @@ -204,10 +241,15 @@ impl WgcCapturer { } else { DirectXPixelFormat::B8G8R8A8UIntNormalized }; - // ≥3 buffers for 240 Hz headroom (avoid the producer waiting on a free buffer). - let pool = - Direct3D11CaptureFramePool::CreateFreeThreaded(&d3d_device, pixel_format, 3, size) - .context("CreateFreeThreaded frame pool")?; + // Extra buffers: SDR zero-copy holds the last `HELD_FRAMES` frames (encoded in place), so + // the pool needs headroom beyond that for the producer to keep rendering at 240 Hz. + let pool = Direct3D11CaptureFramePool::CreateFreeThreaded( + &d3d_device, + pixel_format, + WGC_POOL_BUFFERS, + size, + ) + .context("CreateFreeThreaded frame pool")?; let signal = Arc::new(WgcSignal { available: AtomicU64::new(0), @@ -278,8 +320,14 @@ impl WgcCapturer { hdr_conv: None, fp16_src: None, fp16_srv: None, - hdr10_out: None, - bgra_copy: None, + out_ring: Vec::new(), + ring_idx: 0, + video_conv: None, + yuv_out: Vec::new(), + yuv_idx: 0, + yuv_is_hdr: false, + vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(), + held: VecDeque::new(), last_present: None, _keepalive: None, }) @@ -347,38 +395,112 @@ impl WgcCapturer { Ok(()) } - unsafe fn ensure_hdr10_out(&mut self) -> Result<()> { - if self.hdr10_out.is_none() { - let desc = tex_desc( - self.width, - self.height, - DXGI_FORMAT_R10G10B10A2_UNORM, - D3D11_BIND_RENDER_TARGET.0 as u32, - ); + /// Lazily allocate the HDR output texture ring (R10G10B10A2, the convert pass's render target → + /// NVENC input), `RENDER_TARGET`-bindable. 
SDR is zero-copy (encodes the WGC pool texture in + /// place) and uses no ring. + unsafe fn ensure_out_ring( + &mut self, + format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT, + ) -> Result<()> { + if !self.out_ring.is_empty() { + return Ok(()); + } + let desc = tex_desc( + self.width, + self.height, + format, + D3D11_BIND_RENDER_TARGET.0 as u32, + ); + for _ in 0..OUT_RING { let mut t = None; self.device .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(wgc hdr10 out)")?; - self.hdr10_out = t; + .context("CreateTexture2D(wgc out ring)")?; + self.out_ring.push(t.context("wgc out ring tex")?); } Ok(()) } - unsafe fn ensure_bgra(&mut self) -> Result<()> { - if self.bgra_copy.is_none() { + /// Convert `input` (the OS-composited WGC pool texture: BGRA or scRGB FP16) → NVENC's native YUV + /// (NV12 / P010) on the video processor. Returns the YUV texture (from a ring so consecutive + /// encodes don't collide), or `None` to fall back to the legacy RGB paths. + unsafe fn convert_to_yuv( + &mut self, + input: &ID3D11Texture2D, + hdr: bool, + ) -> Option { + if self.vp_disabled { + return None; + } + if self.video_conv.is_none() || self.yuv_out.is_empty() || self.yuv_is_hdr != hdr { + self.video_conv = None; + self.yuv_out.clear(); + self.yuv_idx = 0; + let vc = match VideoConverter::new( + &self.device, + &self.context, + self.width, + self.height, + hdr, + ) { + Ok(vc) => vc, + Err(e) => { + tracing::warn!(error = %format!("{e:#}"), + "WGC: video processor unavailable — falling back to RGB path"); + self.vp_disabled = true; + return None; + } + }; + let fmt = if hdr { + windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010 + } else { + windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12 + }; let desc = tex_desc( self.width, self.height, - DXGI_FORMAT_B8G8R8A8_UNORM, + fmt, D3D11_BIND_RENDER_TARGET.0 as u32, ); - let mut t = None; - self.device - .CreateTexture2D(&desc, None, Some(&mut t)) - .context("CreateTexture2D(wgc bgra)")?; - 
self.bgra_copy = t; + for _ in 0..OUT_RING { + let mut t = None; + if self + .device + .CreateTexture2D(&desc, None, Some(&mut t)) + .is_err() + { + tracing::warn!("WGC: CreateTexture2D(YUV) failed — falling back to RGB path"); + self.vp_disabled = true; + self.yuv_out.clear(); + return None; + } + let Some(tex) = t else { + self.vp_disabled = true; + self.yuv_out.clear(); + return None; + }; + self.yuv_out.push(tex); + } + self.video_conv = Some(vc); + self.yuv_is_hdr = hdr; + tracing::info!( + hdr, + "WGC: video-processor YUV path active ({})", + if hdr { "P010" } else { "NV12" } + ); } - Ok(()) + let slot = self.yuv_idx; + self.yuv_idx = (self.yuv_idx + 1) % self.yuv_out.len(); + let out = self.yuv_out[slot].clone(); + if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) { + tracing::warn!(error = %format!("{e:#}"), + "WGC: VideoProcessorBlt failed — falling back to RGB path"); + self.vp_disabled = true; + self.video_conv = None; + self.yuv_out.clear(); + return None; + } + Some(out) } fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result { @@ -391,13 +513,38 @@ impl WgcCapturer { .GetInterface() .context("GetInterface ID3D11Texture2D")?; + // Preferred path: convert the OS-composited capture (cursor already in it) DIRECTLY to + // NVENC's native YUV on the video processor — no CopyResource, no cursor draw, and NVENC + // skips its internal RGB→YUV (the contended 3D step). WGC's multi-buffer pool + held set + // means reading the pool texture directly does NOT serialize (unlike DDA's single-frame + // model). The frame is held until the async Blt finishes. 
+ if let Some(yuv) = self.convert_to_yuv(&src, self.hdr) { + let fmt = if self.hdr { + PixelFormat::P010 + } else { + PixelFormat::Nv12 + }; + self.last_present = Some((yuv.clone(), fmt)); + let out = self.d3d11_frame(yuv, fmt); + self.held.push_back(frame); + while self.held.len() > HELD_FRAMES { + self.held.pop_front(); + } + return Ok(out); + } + + // --- fallback (video processor unavailable) --- if self.hdr { + // Next ring slot — the in-flight encode reads the slot we handed out last time, so + // this capture must land in a different one (see `out_ring`). + let slot = self.ring_idx; + self.ring_idx = (self.ring_idx + 1) % OUT_RING; // FP16 (cursor already composited by the OS) → BT.2020 PQ 10-bit for NVENC. self.ensure_fp16_src()?; let fp16 = self.fp16_src.clone().context("fp16 src")?; self.context.CopyResource(&fp16, &src); - self.ensure_hdr10_out()?; - let out = self.hdr10_out.clone().context("hdr10 out")?; + self.ensure_out_ring(DXGI_FORMAT_R10G10B10A2_UNORM)?; + let out = self.out_ring[slot].clone(); if self.hdr_conv.is_none() { self.hdr_conv = Some(HdrConverter::new(&self.device)?); } @@ -416,12 +563,19 @@ impl WgcCapturer { self.last_present = Some((out.clone(), PixelFormat::Rgb10a2)); Ok(self.d3d11_frame(out, PixelFormat::Rgb10a2)) } else { - // SDR: copy out of the recycled pool texture (cursor already composited) and hand off. - self.ensure_bgra()?; - let bgra = self.bgra_copy.clone().context("bgra copy")?; - self.context.CopyResource(&bgra, &src); - self.last_present = Some((bgra.clone(), PixelFormat::Bgra)); - Ok(self.d3d11_frame(bgra, PixelFormat::Bgra)) + // SDR ZERO-COPY: hand NVENC the WGC pool texture DIRECTLY — no `CopyResource`. The + // per-frame copy otherwise queues on the graphics engine behind a GPU-saturating game + // and stalls `lock_bitstream` ~20 ms (NVENC sits idle waiting for its input). Encoding + // the pool texture in place removes that graphics-queue dependency (Apollo's model). 
+ // We must keep the frame alive until its async encode finishes, so retain the last + // `HELD_FRAMES`; the pool has spare buffers so the producer never starves. + self.last_present = Some((src.clone(), PixelFormat::Bgra)); + let out = self.d3d11_frame(src, PixelFormat::Bgra); + self.held.push_back(frame); + while self.held.len() > HELD_FRAMES { + self.held.pop_front(); + } + Ok(out) } } } diff --git a/crates/punktfunk-host/src/encode/linux.rs b/crates/punktfunk-host/src/encode/linux.rs index 3ec6876..a0bc40d 100644 --- a/crates/punktfunk-host/src/encode/linux.rs +++ b/crates/punktfunk-host/src/encode/linux.rs @@ -103,9 +103,10 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) { PixelFormat::Rgba => (Pixel::RGBA, false), PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0 PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0 - // 10-bit HDR (R10G10B10A2) is produced only by the Windows DXGI HDR capture path; the Linux - // capturer never emits it. Map to BGRA so the match is exhaustive — unreachable here. - PixelFormat::Rgb10a2 => (Pixel::BGRA, false), + // Rgb10a2 (HDR) and NV12/P010 (the Windows video-processor YUV outputs) are produced only by + // the Windows capture/encode paths; the Linux capturer never emits them. Map to BGRA so the + // match is exhaustive — unreachable here. + PixelFormat::Rgb10a2 | PixelFormat::Nv12 | PixelFormat::P010 => (Pixel::BGRA, false), } } diff --git a/crates/punktfunk-host/src/encode/nvenc.rs b/crates/punktfunk-host/src/encode/nvenc.rs index f7f01d4..462bdc3 100644 --- a/crates/punktfunk-host/src/encode/nvenc.rs +++ b/crates/punktfunk-host/src/encode/nvenc.rs @@ -25,7 +25,10 @@ use windows::Win32::Graphics::Direct3D11::{ID3D11Device, ID3D11Texture2D}; use nvidia_video_codec_sdk::sys::nvEncodeAPI as nv; use nvidia_video_codec_sdk::ENCODE_API as API; -const POOL: usize = 4; +// Output bitstream buffers = max in-flight encodes. 
The helper deep-pipelines (submits several frames +// before locking the oldest) so per-frame GPU-scheduling waits OVERLAP instead of serializing under a +// GPU-saturating game; this must be ≥ the helper's `PUNKTFUNK_ENCODE_DEPTH` (default 1, accepted range 1..=6). +const POOL: usize = 8; fn codec_guid(codec: Codec) -> nv::GUID { match codec { @@ -363,7 +366,9 @@ impl Encoder for NvencD3d11Encoder { // frame arrives on a different device OR at a different size than our session was built on. // HDR (BT.2020 PQ 10-bit) when the capturer hands us a 10-bit R10G10B10A2 frame. This can flip // mid-session when the user toggles HDR (which arrives as a capture device recreate anyway). - let hdr = matches!(captured.format, PixelFormat::Rgb10a2); + // HDR (BT.2020 PQ) when the capturer hands a 10-bit frame — either R10G10B10A2 (the legacy + // shader path) or P010 (the video-processor path). 8-bit NV12/ARGB → SDR. + let hdr = matches!(captured.format, PixelFormat::Rgb10a2 | PixelFormat::P010); let dev_raw = frame.device.as_raw(); let size_changed = self.inited && (self.width != captured.width || self.height != captured.height); @@ -384,13 +389,22 @@ impl Encoder for NvencD3d11Encoder { self.width = captured.width; self.height = captured.height; self.hdr = hdr; - if hdr { - // 10-bit BT.2020 PQ input; force Main10 regardless of the negotiated SDR bit depth. - self.bit_depth = 10; - self.buffer_fmt = nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10; - } else { - self.buffer_fmt = nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB; - } + // Pick the NVENC input format from the captured pixel format. YUV (NV12/P010) is the + // video-processor path — NVENC encodes it natively (no internal RGB→YUV, which is a hidden + // 3D/compute step that would fight a GPU-saturating game). RGB (ARGB/ABGR10) is the legacy + // shader path. 10-bit (P010/ABGR10) forces HEVC Main10 + the BT.2020 PQ VUI.
+ self.buffer_fmt = match captured.format { + PixelFormat::P010 => { + self.bit_depth = 10; + nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_YUV420_10BIT + } + PixelFormat::Rgb10a2 => { + self.bit_depth = 10; + nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10 + } + PixelFormat::Nv12 => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12, + _ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB, + }; let device = frame.device.clone(); self.init_session(&device)?; self.init_device = dev_raw; diff --git a/crates/punktfunk-host/src/encode/sw.rs b/crates/punktfunk-host/src/encode/sw.rs index f628089..7eaed2b 100644 --- a/crates/punktfunk-host/src/encode/sw.rs +++ b/crates/punktfunk-host/src/encode/sw.rs @@ -151,6 +151,13 @@ impl Encoder for OpenH264Encoder { PixelFormat::Rgb10a2 => { anyhow::bail!("software H.264 encoder cannot encode 10-bit HDR (Rgb10a2)") } + // NV12/P010 are GPU-resident video-processor outputs for the NVENC path; the software + // encoder never receives them (it only gets CPU RGB frames). + PixelFormat::Nv12 | PixelFormat::P010 => { + anyhow::bail!( + "software encoder cannot encode YUV GPU textures (NV12/P010 → NVENC only)" + ) + } } if self.force_kf { diff --git a/crates/punktfunk-host/src/wgc_helper.rs b/crates/punktfunk-host/src/wgc_helper.rs index d580e89..864cd58 100644 --- a/crates/punktfunk-host/src/wgc_helper.rs +++ b/crates/punktfunk-host/src/wgc_helper.rs @@ -101,19 +101,85 @@ pub fn run(opts: HelperOptions) -> Result<()> { let stdout = std::io::stdout(); let mut out = stdout.lock(); - let mut frame = first; + // Encode pipeline depth. The loop keeps DEPTH frames in flight so per-frame GPU-scheduling waits + // can overlap. NOTE: depth > 1 was measured to REGRESS under a GPU-saturating game — the encodes + // serialize on the contended GPU anyway, so a deeper queue just stacks latency (≈ depth × frame + // time) without raising throughput. 
Default 1 (the validated-best); `PUNKTFUNK_ENCODE_DEPTH` (1..=6) + // can raise it if a future workload is genuinely encode-throughput-bound rather than scheduling-bound. + let depth: usize = std::env::var("PUNKTFUNK_ENCODE_DEPTH") + .ok() + .and_then(|s| s.trim().parse::<usize>().ok()) + .filter(|&d| (1..=6).contains(&d)) + .unwrap_or(1); + tracing::info!(depth, "WGC helper: encode pipeline depth"); + + let perf = std::env::var_os("PUNKTFUNK_PERF").is_some(); + let mut frames = 0u64; + let mut cap_wait_ns = 0u64; + let mut encode_ns = 0u64; // time blocked in lock_bitstream (the oldest in-flight encode) + let mut write_ns = 0u64; // time blocked writing the AU to the stdout pipe (relay backpressure) + let mut window = std::time::Instant::now(); + + // Prime: submit `depth` frames before the first poll so NVENC has that many encodes in flight. + // We don't hold the `CapturedFrame`s past `submit`: NVENC keeps its own registered texture clone + // and the capturer's ring/held-set own the canonical refs (sized for `depth`), so the in-flight + // inputs stay valid after our clones drop. + enc.submit(&first).context("first encoder submit")?; + drop(first); + for _ in 1..depth { + let f = cap.next_frame().context("WGC prime frame")?; + enc.submit(&f).context("prime encoder submit")?; + } loop { if kf.swap(false, Ordering::Relaxed) { enc.request_keyframe(); } - enc.submit(&frame).context("encoder submit")?; - while let Some(au) = enc.poll().context("encoder poll")? { - if write_au(&mut out, &au).is_err() { + // Pop + forward the OLDEST in-flight frame (FIFO). With `depth` outstanding it has had + // depth-1 frames' worth of GPU slots to finish, so this rarely blocks under load.
+ let p0 = std::time::Instant::now(); + let polled = enc.poll().context("encoder poll")?; + if perf { + encode_ns += p0.elapsed().as_nanos() as u64; + } + if let Some(au) = polled { + let w0 = std::time::Instant::now(); + let wrote = write_au(&mut out, &au); + if perf { + write_ns += w0.elapsed().as_nanos() as u64; + } + if wrote.is_err() { tracing::info!("WGC helper: stdout closed (host gone) — exiting"); return Ok(()); } } - frame = cap.next_frame().context("WGC next frame")?; + // Refill: capture + submit to keep `depth` frames in flight. + let t0 = std::time::Instant::now(); + let next = cap.next_frame().context("WGC next frame")?; + if perf { + cap_wait_ns += t0.elapsed().as_nanos() as u64; + } + enc.submit(&next).context("encoder submit")?; + + if perf { + frames += 1; + let since = window.elapsed(); + if since.as_secs() >= 2 { + let secs = since.as_secs_f64(); + let per = |ns: u64| format!("{:.2}", ns as f64 / frames as f64 / 1e6); + tracing::info!( + fps = format!("{:.1}", frames as f64 / secs), + cap_wait_ms = per(cap_wait_ns), + encode_ms = per(encode_ns), + write_ms = per(write_ns), + "WGC helper perf (depth-pipelined; encode_ms=lock_bitstream on the oldest)" + ); + frames = 0; + cap_wait_ns = 0; + encode_ns = 0; + write_ns = 0; + window = std::time::Instant::now(); + } + } } } diff --git a/crates/punktfunk-host/src/zerocopy/mod.rs b/crates/punktfunk-host/src/zerocopy/mod.rs index 538bd61..40311ed 100644 --- a/crates/punktfunk-host/src/zerocopy/mod.rs +++ b/crates/punktfunk-host/src/zerocopy/mod.rs @@ -36,8 +36,8 @@ pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option { Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888 Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888 // 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path. - // Rgb10a2 is the Windows HDR capture format — never produced by the Linux capturer. 
- Rgb | Bgr | Rgb10a2 => return None, + // Rgb10a2/Nv12/P010 are the Windows HDR / video-processor formats — never produced on Linux. + Rgb | Bgr | Rgb10a2 | Nv12 | P010 => return None, }) }