perf(host/windows): move capture→encode off the 3D engine (NV12/P010 video-processor path, zero-copy, GPU priority)
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
The Windows host capped at ~60 fps with 35-40 ms latency on a GPU-heavy game: the per-frame capture→encode path shared the 3D engine with the game and got scheduled behind it. Rework to minimize 3D-engine work per frame:

- VideoConverter (D3D11 video processor): capture → NVENC-native NV12/P010 so NVENC skips its internal RGB→YUV (a 3D/compute step). Wired into both DDA (dxgi.rs) and WGC (wgc.rs). New PixelFormat::Nv12/P010 + NVENC YUV input.
- GPU scheduling hardening (Apollo-style): D3DKMTSetProcessSchedulingPriorityClass HIGH, absolute SetGPUThreadPriority, SetMaximumFrameLatency(1).
- WGC SDR zero-copy (hold pool frames; no CopyResource). DDA keeps a fast CopyResource to decouple its single-frame acquire/release from the async convert.
- Pipelined helper encode loop (PUNKTFUNK_ENCODE_DEPTH, default 1) + perf split (cap_wait / encode / write).

Live on the RTX 4090: hard 60 fps ceiling removed (now scene-scaling 40-200+), latency much reduced. Residual cap in GPU-pinned scenes is the irreducible RGB→YUV convert (no fixed-function unit on NVIDIA — VideoProcessing engine reads 0%) waiting behind an uncapped game under WDDM context time-slicing; Linux avoids it via gamescope capping the game to the display refresh.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -26,6 +26,13 @@ pub enum PixelFormat {
|
|||||||
/// produces this: scRGB FP16 desktop pixels are converted to BT.2020 PQ and written here, then
|
/// produces this: scRGB FP16 desktop pixels are converted to BT.2020 PQ and written here, then
|
||||||
/// handed to NVENC as `ABGR10` for an HEVC Main10 / HDR10 encode.
|
/// handed to NVENC as `ABGR10` for an HEVC Main10 / HDR10 encode.
|
||||||
Rgb10a2,
|
Rgb10a2,
|
||||||
|
/// `NV12` (DXGI `NV12`): 8-bit BT.709 limited-range YUV 4:2:0. Produced by the D3D11 **video
|
||||||
|
/// processor** (video engine, not the 3D engine) so the per-frame colour conversion doesn't fight a
|
||||||
|
/// GPU-saturating game; handed to NVENC as `NV12` (it encodes YUV natively — no internal RGB→YUV).
|
||||||
|
Nv12,
|
||||||
|
/// `P010` (DXGI `P010`): 10-bit BT.2020 PQ limited-range YUV 4:2:0. HDR analogue of [`Nv12`]:
|
||||||
|
/// video-processor output for HEVC Main10 / HDR10, handed to NVENC as `YUV420_10BIT`.
|
||||||
|
P010,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PixelFormat {
|
impl PixelFormat {
|
||||||
|
|||||||
@@ -37,12 +37,12 @@ use windows::Win32::Graphics::Dxgi::Common::{
|
|||||||
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
||||||
};
|
};
|
||||||
use windows::Win32::Graphics::Dxgi::{
|
use windows::Win32::Graphics::Dxgi::{
|
||||||
CreateDXGIFactory1, IDXGIAdapter1, IDXGIFactory1, IDXGIOutput1, IDXGIOutput5,
|
CreateDXGIFactory1, IDXGIAdapter1, IDXGIDevice, IDXGIDevice1, IDXGIFactory1, IDXGIOutput1,
|
||||||
IDXGIOutputDuplication, IDXGIResource, DXGI_ERROR_ACCESS_LOST, DXGI_ERROR_DEVICE_REMOVED,
|
IDXGIOutput5, IDXGIOutputDuplication, IDXGIResource, DXGI_ERROR_ACCESS_LOST,
|
||||||
DXGI_ERROR_DEVICE_RESET, DXGI_ERROR_INVALID_CALL, DXGI_ERROR_MODE_CHANGE_IN_PROGRESS,
|
DXGI_ERROR_DEVICE_REMOVED, DXGI_ERROR_DEVICE_RESET, DXGI_ERROR_INVALID_CALL,
|
||||||
DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC, DXGI_OUTDUPL_FRAME_INFO,
|
DXGI_ERROR_MODE_CHANGE_IN_PROGRESS, DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC,
|
||||||
DXGI_OUTDUPL_POINTER_SHAPE_INFO, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR,
|
DXGI_OUTDUPL_FRAME_INFO, DXGI_OUTDUPL_POINTER_SHAPE_INFO,
|
||||||
DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR,
|
DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR,
|
||||||
};
|
};
|
||||||
use windows::Win32::System::StationsAndDesktops::{
|
use windows::Win32::System::StationsAndDesktops::{
|
||||||
CloseDesktop, OpenInputDesktop, SetThreadDesktop, DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS,
|
CloseDesktop, OpenInputDesktop, SetThreadDesktop, DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS,
|
||||||
@@ -147,10 +147,119 @@ pub(crate) unsafe fn make_device(
|
|||||||
Some(&mut context),
|
Some(&mut context),
|
||||||
)
|
)
|
||||||
.context("D3D11CreateDevice")?;
|
.context("D3D11CreateDevice")?;
|
||||||
Ok((
|
let device = device.context("null D3D11 device")?;
|
||||||
device.context("null D3D11 device")?,
|
let context = context.context("null D3D11 context")?;
|
||||||
context.context("null D3D11 context")?,
|
|
||||||
))
|
// Apollo-style GPU scheduling hardening (Sunshine display_base.cpp:599-709). Our capture+encode
|
||||||
|
// shares the GPU with the streamed game; when the game saturates the GPU our process is starved of
|
||||||
|
// GPU time slices, so NVENC sits near-idle yet `lock_bitstream` waits ~20 ms for our context to be
|
||||||
|
// scheduled — capping the stream (~47 fps measured at 5K@240) and stuttering. Per-frame copy/convert
|
||||||
|
// is NOT the cause (zero-copy + thread-priority alone didn't move it); the PROCESS-level GPU
|
||||||
|
// scheduling priority class is the decisive cross-process lever. Secondary: the absolute per-device
|
||||||
|
// GPU thread priority and a 1-frame latency cap.
|
||||||
|
elevate_process_gpu_priority();
|
||||||
|
if let Ok(dxgi_dev) = device.cast::<IDXGIDevice>() {
|
||||||
|
// Apollo's absolute max GPU thread priority (0x4000001E); fall back to relative +7.
|
||||||
|
if dxgi_dev.SetGPUThreadPriority(0x4000_001E).is_err()
|
||||||
|
&& dxgi_dev.SetGPUThreadPriority(7).is_err()
|
||||||
|
{
|
||||||
|
tracing::warn!("SetGPUThreadPriority failed (run as admin/SYSTEM for GPU priority)");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Ok(dxgi1) = device.cast::<IDXGIDevice1>() {
|
||||||
|
let _ = dxgi1.SetMaximumFrameLatency(1);
|
||||||
|
}
|
||||||
|
Ok((device, context))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apollo-style GPU scheduling-priority hardening (Sunshine `display_base.cpp:599-709`). On a
|
||||||
|
/// GPU-saturated game our capture+encode process is starved of GPU time slices — NVENC sits ~idle but
|
||||||
|
/// `lock_bitstream` waits ~20 ms for our context to be scheduled. Elevating the PROCESS GPU scheduling
|
||||||
|
/// priority class (the strong cross-process lever — far more effective than `SetGPUThreadPriority`
|
||||||
|
/// alone, which we measured as no help) lets our brief encode preempt the game. Uses HIGH, NOT
|
||||||
|
/// realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC (Apollo downgrades it for exactly this).
|
||||||
|
/// Runs once per process; best-effort. `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime`
|
||||||
|
/// (default high).
|
||||||
|
fn elevate_process_gpu_priority() {
|
||||||
|
use std::sync::Once;
|
||||||
|
static ONCE: Once = Once::new();
|
||||||
|
ONCE.call_once(|| unsafe {
|
||||||
|
use windows::core::{s, PCWSTR};
|
||||||
|
use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID};
|
||||||
|
use windows::Win32::Security::{
|
||||||
|
AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES,
|
||||||
|
SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES,
|
||||||
|
TOKEN_PRIVILEGES, TOKEN_QUERY,
|
||||||
|
};
|
||||||
|
use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
|
||||||
|
use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};
|
||||||
|
|
||||||
|
// D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4,
|
||||||
|
// REALTIME 5.
|
||||||
|
let prio: i32 = match std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS").ok().as_deref() {
|
||||||
|
Some("off") => {
|
||||||
|
tracing::info!("GPU process scheduling priority class left at default (off)");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Some("normal") => 2,
|
||||||
|
Some("realtime") => 5,
|
||||||
|
_ => 4, // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC)
|
||||||
|
};
|
||||||
|
|
||||||
|
// 1. Enable SE_INC_BASE_PRIORITY so the kernel permits the GPU priority bump.
|
||||||
|
let mut token = HANDLE::default();
|
||||||
|
if OpenProcessToken(
|
||||||
|
GetCurrentProcess(),
|
||||||
|
TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
|
||||||
|
&mut token,
|
||||||
|
)
|
||||||
|
.is_ok()
|
||||||
|
{
|
||||||
|
let mut luid = LUID::default();
|
||||||
|
if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() {
|
||||||
|
let tp = TOKEN_PRIVILEGES {
|
||||||
|
PrivilegeCount: 1,
|
||||||
|
Privileges: [LUID_AND_ATTRIBUTES {
|
||||||
|
Luid: luid,
|
||||||
|
Attributes: SE_PRIVILEGE_ENABLED,
|
||||||
|
}],
|
||||||
|
};
|
||||||
|
if AdjustTokenPrivileges(
|
||||||
|
token,
|
||||||
|
false,
|
||||||
|
Some(&tp as *const TOKEN_PRIVILEGES),
|
||||||
|
0,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let _ = CloseHandle(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. D3DKMTSetProcessSchedulingPriorityClass via gdi32 (no stable windows-rs binding).
|
||||||
|
if let Ok(gdi32) = LoadLibraryA(s!("gdi32.dll")) {
|
||||||
|
if let Some(p) = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass")) {
|
||||||
|
type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32;
|
||||||
|
let f: SetPrio = std::mem::transmute(p);
|
||||||
|
let st = f(GetCurrentProcess(), prio);
|
||||||
|
if st == 0 {
|
||||||
|
tracing::info!(
|
||||||
|
priority_class = prio,
|
||||||
|
"GPU process scheduling priority class set (2=normal 4=high 5=realtime)"
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
tracing::warn!(
|
||||||
|
status = format!("0x{st:08X}"),
|
||||||
|
"D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST
|
/// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST
|
||||||
@@ -827,6 +936,135 @@ impl HdrConverter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
use windows::Win32::Graphics::Direct3D11::{
|
||||||
|
ID3D11VideoContext1, ID3D11VideoDevice, ID3D11VideoProcessor, ID3D11VideoProcessorEnumerator,
|
||||||
|
ID3D11VideoProcessorInputView, ID3D11VideoProcessorOutputView, D3D11_TEX2D_VPIV,
|
||||||
|
D3D11_TEX2D_VPOV, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE, D3D11_VIDEO_PROCESSOR_CONTENT_DESC,
|
||||||
|
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0,
|
||||||
|
D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0,
|
||||||
|
D3D11_VIDEO_PROCESSOR_STREAM, D3D11_VIDEO_USAGE_PLAYBACK_NORMAL,
|
||||||
|
D3D11_VPIV_DIMENSION_TEXTURE2D, D3D11_VPOV_DIMENSION_TEXTURE2D,
|
||||||
|
};
|
||||||
|
use windows::Win32::Graphics::Dxgi::Common::{
|
||||||
|
DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
|
||||||
|
DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020, DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709,
|
||||||
|
DXGI_RATIONAL,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// D3D11 **Video Processor** colour/format converter — runs on the GPU's dedicated VIDEO engine, NOT
|
||||||
|
/// the 3D engine, so the per-frame RGB→YUV conversion does not contend with a GPU-saturating game (the
|
||||||
|
/// HDR pixel-shader path and NVENC's internal RGB→YUV both use the 3D/compute engine, which an AAA
|
||||||
|
/// title pins at ~100%). Output is NV12 (SDR, BT.709 studio-range) or P010 (HDR, BT.2020 PQ
|
||||||
|
/// studio-range) — NVENC's native YUV inputs, so it encodes them with no further conversion.
|
||||||
|
pub(crate) struct VideoConverter {
|
||||||
|
vdev: ID3D11VideoDevice,
|
||||||
|
vctx: ID3D11VideoContext1,
|
||||||
|
enumr: ID3D11VideoProcessorEnumerator,
|
||||||
|
vp: ID3D11VideoProcessor,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl VideoConverter {
|
||||||
|
pub(crate) unsafe fn new(
|
||||||
|
device: &ID3D11Device,
|
||||||
|
context: &ID3D11DeviceContext,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
hdr: bool,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let vdev: ID3D11VideoDevice = device.cast().context("device -> ID3D11VideoDevice")?;
|
||||||
|
let vctx: ID3D11VideoContext1 = context.cast().context("context -> ID3D11VideoContext1")?;
|
||||||
|
let rate = DXGI_RATIONAL {
|
||||||
|
Numerator: 240,
|
||||||
|
Denominator: 1,
|
||||||
|
};
|
||||||
|
let desc = D3D11_VIDEO_PROCESSOR_CONTENT_DESC {
|
||||||
|
InputFrameFormat: D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE,
|
||||||
|
InputFrameRate: rate,
|
||||||
|
InputWidth: width,
|
||||||
|
InputHeight: height,
|
||||||
|
OutputFrameRate: rate,
|
||||||
|
OutputWidth: width,
|
||||||
|
OutputHeight: height,
|
||||||
|
Usage: D3D11_VIDEO_USAGE_PLAYBACK_NORMAL,
|
||||||
|
};
|
||||||
|
let enumr = vdev
|
||||||
|
.CreateVideoProcessorEnumerator(&desc)
|
||||||
|
.context("CreateVideoProcessorEnumerator")?;
|
||||||
|
let vp = vdev
|
||||||
|
.CreateVideoProcessor(&enumr, 0)
|
||||||
|
.context("CreateVideoProcessor")?;
|
||||||
|
|
||||||
|
// Full-range RGB in → studio-range YUV out. HDR: scRGB linear (G10) → BT.2020 PQ (G2084).
|
||||||
|
// SDR: sRGB (G22) → BT.709 (G22).
|
||||||
|
let (in_cs, out_cs) = if hdr {
|
||||||
|
(
|
||||||
|
DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709,
|
||||||
|
DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
(
|
||||||
|
DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
|
||||||
|
DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
vctx.VideoProcessorSetStreamColorSpace1(&vp, 0, in_cs);
|
||||||
|
vctx.VideoProcessorSetOutputColorSpace1(&vp, out_cs);
|
||||||
|
// One frame in, one frame out — no interpolation/auto-processing.
|
||||||
|
vctx.VideoProcessorSetStreamFrameFormat(&vp, 0, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE);
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
vdev,
|
||||||
|
vctx,
|
||||||
|
enumr,
|
||||||
|
vp,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert `input` (BGRA or scRGB FP16) → `output` (NV12 or P010) on the video engine. Views are
|
||||||
|
/// created per call (cheap relative to the Blt) so the input texture can vary frame to frame.
|
||||||
|
pub(crate) unsafe fn convert(
|
||||||
|
&self,
|
||||||
|
input: &ID3D11Texture2D,
|
||||||
|
output: &ID3D11Texture2D,
|
||||||
|
) -> Result<()> {
|
||||||
|
let in_desc = D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC {
|
||||||
|
FourCC: 0,
|
||||||
|
ViewDimension: D3D11_VPIV_DIMENSION_TEXTURE2D,
|
||||||
|
Anonymous: D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0 {
|
||||||
|
Texture2D: D3D11_TEX2D_VPIV {
|
||||||
|
MipSlice: 0,
|
||||||
|
ArraySlice: 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
let mut in_view: Option<ID3D11VideoProcessorInputView> = None;
|
||||||
|
self.vdev
|
||||||
|
.CreateVideoProcessorInputView(input, &self.enumr, &in_desc, Some(&mut in_view))
|
||||||
|
.context("CreateVideoProcessorInputView")?;
|
||||||
|
|
||||||
|
let out_desc = D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC {
|
||||||
|
ViewDimension: D3D11_VPOV_DIMENSION_TEXTURE2D,
|
||||||
|
Anonymous: D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0 {
|
||||||
|
Texture2D: D3D11_TEX2D_VPOV { MipSlice: 0 },
|
||||||
|
},
|
||||||
|
};
|
||||||
|
let mut out_view: Option<ID3D11VideoProcessorOutputView> = None;
|
||||||
|
self.vdev
|
||||||
|
.CreateVideoProcessorOutputView(output, &self.enumr, &out_desc, Some(&mut out_view))
|
||||||
|
.context("CreateVideoProcessorOutputView")?;
|
||||||
|
let out_view = out_view.context("null output view")?;
|
||||||
|
|
||||||
|
let stream = D3D11_VIDEO_PROCESSOR_STREAM {
|
||||||
|
Enable: true.into(),
|
||||||
|
pInputSurface: std::mem::ManuallyDrop::new(in_view),
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
self.vctx
|
||||||
|
.VideoProcessorBlt(&self.vp, &out_view, 0, &[stream])
|
||||||
|
.context("VideoProcessorBlt")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Convert a DXGI pointer shape (color / masked-color / monochrome) into top-down BGRA.
|
/// Convert a DXGI pointer shape (color / masked-color / monochrome) into top-down BGRA.
|
||||||
fn convert_pointer_shape(buf: &[u8], si: &DXGI_OUTDUPL_POINTER_SHAPE_INFO) -> Option<CursorShape> {
|
fn convert_pointer_shape(buf: &[u8], si: &DXGI_OUTDUPL_POINTER_SHAPE_INFO) -> Option<CursorShape> {
|
||||||
let w = si.Width as usize;
|
let w = si.Width as usize;
|
||||||
@@ -1055,6 +1293,17 @@ pub struct DuplCapturer {
|
|||||||
hdr10_out: Option<ID3D11Texture2D>,
|
hdr10_out: Option<ID3D11Texture2D>,
|
||||||
/// scRGB→PQ conversion pass; rebuilt on device recreate.
|
/// scRGB→PQ conversion pass; rebuilt on device recreate.
|
||||||
hdr_conv: Option<HdrConverter>,
|
hdr_conv: Option<HdrConverter>,
|
||||||
|
/// Video-processor RGB→YUV converter (runs on the VIDEO engine, not the 3D engine) + its NV12
|
||||||
|
/// (SDR) / P010 (HDR) output texture. This is the zero-3D path: the per-frame colour conversion and
|
||||||
|
/// NVENC's RGB→YUV both move off the 3D engine so capture+encode don't fight a GPU-saturating game.
|
||||||
|
/// Lazily built for the current size+HDR; rebuilt on change. `None`/error → falls back to the
|
||||||
|
/// legacy RGB path. Disabled with `PUNKTFUNK_NO_VIDEO_PROCESSOR=1`.
|
||||||
|
video_conv: Option<VideoConverter>,
|
||||||
|
yuv_out: Option<ID3D11Texture2D>,
|
||||||
|
/// HDR-ness the current `video_conv`/`yuv_out` were built for, so an HDR toggle rebuilds them.
|
||||||
|
yuv_is_hdr: bool,
|
||||||
|
/// Latched off after a VideoConverter failure so we don't retry it every frame (fall back to RGB).
|
||||||
|
vp_disabled: bool,
|
||||||
/// Last time a duplication rebuild was attempted, to throttle retries during an outage (e.g. a
|
/// Last time a duplication rebuild was attempted, to throttle retries during an outage (e.g. a
|
||||||
/// secure-desktop dwell where the output is gone) so we don't block the encode loop or hammer
|
/// secure-desktop dwell where the output is gone) so we don't block the encode loop or hammer
|
||||||
/// DuplicateOutput — between attempts the last good frame is repeated. `None` = never attempted.
|
/// DuplicateOutput — between attempts the last good frame is repeated. `None` = never attempted.
|
||||||
@@ -1306,6 +1555,10 @@ impl DuplCapturer {
|
|||||||
fp16_srv: None,
|
fp16_srv: None,
|
||||||
hdr10_out: None,
|
hdr10_out: None,
|
||||||
hdr_conv: None,
|
hdr_conv: None,
|
||||||
|
video_conv: None,
|
||||||
|
yuv_out: None,
|
||||||
|
yuv_is_hdr: false,
|
||||||
|
vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(),
|
||||||
last_rebuild: None,
|
last_rebuild: None,
|
||||||
last_recover: None,
|
last_recover: None,
|
||||||
ever_got_frame: false,
|
ever_got_frame: false,
|
||||||
@@ -1375,6 +1628,85 @@ impl DuplCapturer {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert `input` (BGRA for SDR, scRGB FP16 for HDR) to NVENC's native YUV (NV12 / P010) via the
|
||||||
|
/// D3D11 **video processor** (video engine) — keeping the per-frame colour conversion AND NVENC's
|
||||||
|
/// RGB→YUV off the 3D engine so capture+encode don't fight a GPU-saturating game. Returns the YUV
|
||||||
|
/// texture, or `None` to fall back to the legacy RGB path (processor disabled/unavailable). Lazily
|
||||||
|
/// builds + caches the processor + output texture for the current size + HDR-ness.
|
||||||
|
unsafe fn convert_to_yuv(
|
||||||
|
&mut self,
|
||||||
|
input: &ID3D11Texture2D,
|
||||||
|
hdr: bool,
|
||||||
|
) -> Option<ID3D11Texture2D> {
|
||||||
|
if self.vp_disabled {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if self.video_conv.is_none() || self.yuv_out.is_none() || self.yuv_is_hdr != hdr {
|
||||||
|
self.video_conv = None;
|
||||||
|
self.yuv_out = None;
|
||||||
|
let vc = match VideoConverter::new(
|
||||||
|
&self.device,
|
||||||
|
&self.context,
|
||||||
|
self.width,
|
||||||
|
self.height,
|
||||||
|
hdr,
|
||||||
|
) {
|
||||||
|
Ok(vc) => vc,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(error = %format!("{e:#}"),
|
||||||
|
"video processor unavailable — falling back to RGB encode path");
|
||||||
|
self.vp_disabled = true;
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let fmt = if hdr {
|
||||||
|
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010
|
||||||
|
} else {
|
||||||
|
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12
|
||||||
|
};
|
||||||
|
let desc = D3D11_TEXTURE2D_DESC {
|
||||||
|
Width: self.width,
|
||||||
|
Height: self.height,
|
||||||
|
MipLevels: 1,
|
||||||
|
ArraySize: 1,
|
||||||
|
Format: fmt,
|
||||||
|
SampleDesc: DXGI_SAMPLE_DESC {
|
||||||
|
Count: 1,
|
||||||
|
Quality: 0,
|
||||||
|
},
|
||||||
|
Usage: D3D11_USAGE_DEFAULT,
|
||||||
|
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||||
|
CPUAccessFlags: 0,
|
||||||
|
MiscFlags: 0,
|
||||||
|
};
|
||||||
|
let mut t: Option<ID3D11Texture2D> = None;
|
||||||
|
if let Err(e) = self.device.CreateTexture2D(&desc, None, Some(&mut t)) {
|
||||||
|
tracing::warn!(error = %format!("{e:?}"),
|
||||||
|
"CreateTexture2D(YUV out) failed — falling back to RGB encode path");
|
||||||
|
self.vp_disabled = true;
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
self.video_conv = Some(vc);
|
||||||
|
self.yuv_out = t;
|
||||||
|
self.yuv_is_hdr = hdr;
|
||||||
|
tracing::info!(
|
||||||
|
hdr,
|
||||||
|
"video-processor YUV path active ({} on the video engine, 0% 3D)",
|
||||||
|
if hdr { "P010" } else { "NV12" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let out = self.yuv_out.clone()?;
|
||||||
|
if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) {
|
||||||
|
tracing::warn!(error = %format!("{e:#}"),
|
||||||
|
"VideoProcessorBlt failed — falling back to RGB encode path");
|
||||||
|
self.vp_disabled = true;
|
||||||
|
self.video_conv = None;
|
||||||
|
self.yuv_out = None;
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(out)
|
||||||
|
}
|
||||||
|
|
||||||
/// FP16 (`R16G16B16A16_FLOAT`) copy of the HDR duplication surface (RT for the cursor composite +
|
/// FP16 (`R16G16B16A16_FLOAT`) copy of the HDR duplication surface (RT for the cursor composite +
|
||||||
/// SRV for the converter). Reallocated when absent (device/size change drops it).
|
/// SRV for the converter). Reallocated when absent (device/size change drops it).
|
||||||
unsafe fn ensure_fp16_src(&mut self) -> Result<()> {
|
unsafe fn ensure_fp16_src(&mut self) -> Result<()> {
|
||||||
@@ -1718,6 +2050,9 @@ impl DuplCapturer {
|
|||||||
self.fp16_srv = None;
|
self.fp16_srv = None;
|
||||||
self.hdr10_out = None;
|
self.hdr10_out = None;
|
||||||
self.hdr_conv = None;
|
self.hdr_conv = None;
|
||||||
|
// Video processor + its YUV output belonged to the old device / size / HDR-ness — rebuild lazily.
|
||||||
|
self.video_conv = None;
|
||||||
|
self.yuv_out = None;
|
||||||
self.first_frame = true;
|
self.first_frame = true;
|
||||||
// Capture the CURRENT desktop frame as `last_present` (instead of seeding black). The secure
|
// Capture the CURRENT desktop frame as `last_present` (instead of seeding black). The secure
|
||||||
// (lock/login/UAC) desktop is STATIC, so DDA only emits a frame on change — if we seeded black
|
// (lock/login/UAC) desktop is STATIC, so DDA only emits a frame on change — if we seeded black
|
||||||
@@ -1982,6 +2317,22 @@ impl DuplCapturer {
|
|||||||
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
|
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
|
||||||
self.holding_frame = false;
|
self.holding_frame = false;
|
||||||
self.composite_cursor_gpu(&src, true)?; // onto the FP16 surface (HDR: decode + nits scale)
|
self.composite_cursor_gpu(&src, true)?; // onto the FP16 surface (HDR: decode + nits scale)
|
||||||
|
// Video-engine path: scRGB FP16 → BT.2020 PQ P010 on the VIDEO engine (no 3D shader, and
|
||||||
|
// NVENC encodes P010 natively). Fall back to the HdrConverter pixel shader (3D) only if the
|
||||||
|
// video processor is unavailable.
|
||||||
|
if let Some(p010) = self.convert_to_yuv(&src, true) {
|
||||||
|
self.last_present = Some((p010.clone(), PixelFormat::P010));
|
||||||
|
return Ok(CapturedFrame {
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
pts_ns: now_ns(),
|
||||||
|
format: PixelFormat::P010,
|
||||||
|
payload: FramePayload::D3d11(D3d11Frame {
|
||||||
|
texture: p010,
|
||||||
|
device: self.device.clone(),
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
}
|
||||||
self.ensure_hdr10_out()?;
|
self.ensure_hdr10_out()?;
|
||||||
let out = self.hdr10_out.clone().context("hdr10 out texture")?;
|
let out = self.hdr10_out.clone().context("hdr10 out texture")?;
|
||||||
if self.hdr_conv.is_none() {
|
if self.hdr_conv.is_none() {
|
||||||
@@ -2014,12 +2365,34 @@ impl DuplCapturer {
|
|||||||
if self.gpu_mode {
|
if self.gpu_mode {
|
||||||
// Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication
|
// Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication
|
||||||
// surface into a reused owned texture, release the duplication frame, hand off the texture.
|
// surface into a reused owned texture, release the duplication frame, hand off the texture.
|
||||||
|
// NOTE: do NOT convert the duplication surface directly on the video processor to skip this
|
||||||
|
// copy — the VP colour-convert (3D/compute on NVIDIA) holds the DDA surface until it
|
||||||
|
// completes, blocking ReleaseFrame/AcquireNextFrame and SERIALIZING capture+convert (~60 fps,
|
||||||
|
// encode_us 15-20 ms measured). The fast same-format CopyResource decouples them: it releases
|
||||||
|
// the DDA frame immediately so the convert runs independently (40-200 fps). Worth ~5% 3D.
|
||||||
self.ensure_gpu_copy()?;
|
self.ensure_gpu_copy()?;
|
||||||
let gpu = self.gpu_copy.clone().context("gpu copy texture")?;
|
let gpu = self.gpu_copy.clone().context("gpu copy texture")?;
|
||||||
self.context.CopyResource(&gpu, &tex);
|
self.context.CopyResource(&gpu, &tex);
|
||||||
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
|
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
|
||||||
self.holding_frame = false;
|
self.holding_frame = false;
|
||||||
self.composite_cursor_gpu(&gpu, false)?;
|
self.composite_cursor_gpu(&gpu, false)?;
|
||||||
|
// Prefer the video-engine YUV path (BGRA → NV12 on the video engine) so the colour
|
||||||
|
// conversion AND NVENC's encode stay OFF the 3D engine — the only way to keep up when a
|
||||||
|
// game pins the 3D engine at ~100%. Fall back to handing NVENC the BGRA texture (it then
|
||||||
|
// does RGB→YUV internally on the 3D/compute engine).
|
||||||
|
if let Some(nv12) = self.convert_to_yuv(&gpu, false) {
|
||||||
|
self.last_present = Some((nv12.clone(), PixelFormat::Nv12));
|
||||||
|
return Ok(CapturedFrame {
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
pts_ns: now_ns(),
|
||||||
|
format: PixelFormat::Nv12,
|
||||||
|
payload: FramePayload::D3d11(D3d11Frame {
|
||||||
|
texture: nv12,
|
||||||
|
device: self.device.clone(),
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
}
|
||||||
self.last_present = Some((gpu.clone(), PixelFormat::Bgra));
|
self.last_present = Some((gpu.clone(), PixelFormat::Bgra));
|
||||||
return Ok(CapturedFrame {
|
return Ok(CapturedFrame {
|
||||||
width: self.width,
|
width: self.width,
|
||||||
|
|||||||
@@ -17,10 +17,12 @@
|
|||||||
//! the DDA backend ([`super::dxgi::DuplCapturer`]) for those (see capture.rs).
|
//! the DDA backend ([`super::dxgi::DuplCapturer`]) for those (see capture.rs).
|
||||||
|
|
||||||
use super::dxgi::{
|
use super::dxgi::{
|
||||||
find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, WinCaptureTarget,
|
find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, VideoConverter,
|
||||||
|
WinCaptureTarget,
|
||||||
};
|
};
|
||||||
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
||||||
use anyhow::{bail, Context, Result};
|
use anyhow::{bail, Context, Result};
|
||||||
|
use std::collections::VecDeque;
|
||||||
use std::sync::atomic::{AtomicU64, Ordering};
|
use std::sync::atomic::{AtomicU64, Ordering};
|
||||||
use std::sync::{Arc, Condvar, Mutex};
|
use std::sync::{Arc, Condvar, Mutex};
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
@@ -37,8 +39,8 @@ use windows::Win32::Graphics::Direct3D11::{
|
|||||||
D3D11_USAGE_DEFAULT,
|
D3D11_USAGE_DEFAULT,
|
||||||
};
|
};
|
||||||
use windows::Win32::Graphics::Dxgi::Common::{
|
use windows::Win32::Graphics::Dxgi::Common::{
|
||||||
DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_B8G8R8A8_UNORM,
|
DXGI_COLOR_SPACE_RGB_FULL_G2084_NONE_P2020, DXGI_FORMAT_R10G10B10A2_UNORM,
|
||||||
DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC,
|
||||||
};
|
};
|
||||||
use windows::Win32::Graphics::Dxgi::{IDXGIDevice, IDXGIOutput6};
|
use windows::Win32::Graphics::Dxgi::{IDXGIDevice, IDXGIOutput6};
|
||||||
use windows::Win32::Security::{ImpersonateLoggedOnUser, RevertToSelf};
|
use windows::Win32::Security::{ImpersonateLoggedOnUser, RevertToSelf};
|
||||||
@@ -49,6 +51,22 @@ use windows::Win32::System::WinRT::Direct3D11::{
|
|||||||
use windows::Win32::System::WinRT::Graphics::Capture::IGraphicsCaptureItemInterop;
|
use windows::Win32::System::WinRT::Graphics::Capture::IGraphicsCaptureItemInterop;
|
||||||
use windows::Win32::System::WinRT::{RoInitialize, RO_INIT_MULTITHREADED};
|
use windows::Win32::System::WinRT::{RoInitialize, RO_INIT_MULTITHREADED};
|
||||||
|
|
||||||
|
/// Output texture ring depth. The encode loop pipelines one frame deep (NVENC encodes frame N while
|
||||||
|
/// the capturer produces N+1), so two live textures suffice; three gives headroom against a slow
|
||||||
|
/// `lock_bitstream`. The ring actually holds `OUT_RING` (8) textures; the WGC frame pool is sized separately by `WGC_POOL_BUFFERS` (10).
|
||||||
|
// Sized for the deep encode pipeline (`PUNKTFUNK_ENCODE_DEPTH`, default 1, accepted range 1..=6): up to DEPTH
|
||||||
|
// frames are in flight in NVENC at once, so the HDR convert ring and the SDR held-frame set must each
|
||||||
|
// keep DEPTH(+headroom) live textures, and the WGC pool needs spare buffers beyond what we hold.
|
||||||
|
const OUT_RING: usize = 8;
|
||||||
|
|
||||||
|
/// SDR zero-copy: how many recent WGC frames to keep alive so NVENC can encode the pool texture in
|
||||||
|
/// place (no `CopyResource`). Each in-flight encode reads a distinct frame, so this must exceed the
|
||||||
|
/// pipeline depth; the oldest is released once `HELD_FRAMES` newer ones exist.
|
||||||
|
const HELD_FRAMES: usize = 8;
|
||||||
|
/// WGC frame-pool buffer count. Must exceed `HELD_FRAMES` so the compositor always has free buffers
|
||||||
|
/// to render into while we hold frames for in-place (zero-copy) SDR encode.
|
||||||
|
const WGC_POOL_BUFFERS: i32 = 10;
|
||||||
|
|
||||||
/// The host runs as SYSTEM (so the DDA secure-desktop path works), but WGC will NOT activate under
|
/// The host runs as SYSTEM (so the DDA secure-desktop path works), but WGC will NOT activate under
|
||||||
/// the SYSTEM account (`CreateForMonitor` → 0x80070424). Impersonate the interactive console user
|
/// the SYSTEM account (`CreateForMonitor` → 0x80070424). Impersonate the interactive console user
|
||||||
/// for the WGC activation. Returns the user token (the caller reverts + closes it after activation)
|
/// for the WGC activation. Returns the user token (the caller reverts + closes it after activation)
|
||||||
@@ -112,8 +130,27 @@ pub struct WgcCapturer {
|
|||||||
hdr_conv: Option<HdrConverter>,
|
hdr_conv: Option<HdrConverter>,
|
||||||
fp16_src: Option<ID3D11Texture2D>,
|
fp16_src: Option<ID3D11Texture2D>,
|
||||||
fp16_srv: Option<ID3D11ShaderResourceView>,
|
fp16_srv: Option<ID3D11ShaderResourceView>,
|
||||||
hdr10_out: Option<ID3D11Texture2D>,
|
/// Ring of host-owned output textures (R10G10B10A2, HDR shader fallback only; SDR encodes the WGC pool texture in place), rotated per processed
|
||||||
bgra_copy: Option<ID3D11Texture2D>,
|
/// frame. A ring — not one texture — is required because the encode loop is PIPELINED: NVENC
|
||||||
|
/// encodes frame N (in place, registered by pointer) while this capturer produces frame N+1, so
|
||||||
|
/// N+1 must land in a DIFFERENT texture or it clobbers the in-flight encode. (`fp16_src` stays
|
||||||
|
/// single: it's only touched within the D3D11 immediate context, whose op ordering already
|
||||||
|
/// serializes the convert's read against the next copy's write — NVENC's async engine read is the
|
||||||
|
/// only consumer that escapes that ordering, and it reads the ring output, never `fp16_src`.)
|
||||||
|
out_ring: Vec<ID3D11Texture2D>,
|
||||||
|
ring_idx: usize,
|
||||||
|
/// Video-processor RGB→YUV converter (off the 3D engine where possible) + its NV12/P010 output
|
||||||
|
/// ring. Preferred path: the OS-composited capture (cursor already in it) is converted DIRECTLY to
|
||||||
|
/// NVENC's native YUV — no `CopyResource`, no cursor draw, and NVENC skips its internal RGB→YUV.
|
||||||
|
/// `None`/error → falls back to the legacy SDR-zero-copy / HDR-shader paths.
|
||||||
|
video_conv: Option<VideoConverter>,
|
||||||
|
yuv_out: Vec<ID3D11Texture2D>,
|
||||||
|
yuv_idx: usize,
|
||||||
|
yuv_is_hdr: bool,
|
||||||
|
vp_disabled: bool,
|
||||||
|
/// SDR zero-copy: the recent WGC frames we hand to NVENC in place. Held so the pool doesn't
|
||||||
|
/// recycle the texture mid-encode; the oldest is released once `HELD_FRAMES` newer ones exist.
|
||||||
|
held: VecDeque<Direct3D11CaptureFrame>,
|
||||||
/// Last presentable GPU texture + format, repeated when no new frame arrived (static desktop).
|
/// Last presentable GPU texture + format, repeated when no new frame arrived (static desktop).
|
||||||
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
|
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
|
||||||
|
|
||||||
@@ -204,10 +241,15 @@ impl WgcCapturer {
|
|||||||
} else {
|
} else {
|
||||||
DirectXPixelFormat::B8G8R8A8UIntNormalized
|
DirectXPixelFormat::B8G8R8A8UIntNormalized
|
||||||
};
|
};
|
||||||
// ≥3 buffers for 240 Hz headroom (avoid the producer waiting on a free buffer).
|
// Extra buffers: SDR zero-copy holds the last `HELD_FRAMES` frames (encoded in place), so
|
||||||
let pool =
|
// the pool needs headroom beyond that for the producer to keep rendering at 240 Hz.
|
||||||
Direct3D11CaptureFramePool::CreateFreeThreaded(&d3d_device, pixel_format, 3, size)
|
let pool = Direct3D11CaptureFramePool::CreateFreeThreaded(
|
||||||
.context("CreateFreeThreaded frame pool")?;
|
&d3d_device,
|
||||||
|
pixel_format,
|
||||||
|
WGC_POOL_BUFFERS,
|
||||||
|
size,
|
||||||
|
)
|
||||||
|
.context("CreateFreeThreaded frame pool")?;
|
||||||
|
|
||||||
let signal = Arc::new(WgcSignal {
|
let signal = Arc::new(WgcSignal {
|
||||||
available: AtomicU64::new(0),
|
available: AtomicU64::new(0),
|
||||||
@@ -278,8 +320,14 @@ impl WgcCapturer {
|
|||||||
hdr_conv: None,
|
hdr_conv: None,
|
||||||
fp16_src: None,
|
fp16_src: None,
|
||||||
fp16_srv: None,
|
fp16_srv: None,
|
||||||
hdr10_out: None,
|
out_ring: Vec::new(),
|
||||||
bgra_copy: None,
|
ring_idx: 0,
|
||||||
|
video_conv: None,
|
||||||
|
yuv_out: Vec::new(),
|
||||||
|
yuv_idx: 0,
|
||||||
|
yuv_is_hdr: false,
|
||||||
|
vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(),
|
||||||
|
held: VecDeque::new(),
|
||||||
last_present: None,
|
last_present: None,
|
||||||
_keepalive: None,
|
_keepalive: None,
|
||||||
})
|
})
|
||||||
@@ -347,38 +395,112 @@ impl WgcCapturer {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe fn ensure_hdr10_out(&mut self) -> Result<()> {
|
/// Lazily allocate the HDR output texture ring (R10G10B10A2, the convert pass's render target →
|
||||||
if self.hdr10_out.is_none() {
|
/// NVENC input), `RENDER_TARGET`-bindable. SDR is zero-copy (encodes the WGC pool texture in
|
||||||
let desc = tex_desc(
|
/// place) and uses no ring.
|
||||||
self.width,
|
unsafe fn ensure_out_ring(
|
||||||
self.height,
|
&mut self,
|
||||||
DXGI_FORMAT_R10G10B10A2_UNORM,
|
format: windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT,
|
||||||
D3D11_BIND_RENDER_TARGET.0 as u32,
|
) -> Result<()> {
|
||||||
);
|
if !self.out_ring.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let desc = tex_desc(
|
||||||
|
self.width,
|
||||||
|
self.height,
|
||||||
|
format,
|
||||||
|
D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||||
|
);
|
||||||
|
for _ in 0..OUT_RING {
|
||||||
let mut t = None;
|
let mut t = None;
|
||||||
self.device
|
self.device
|
||||||
.CreateTexture2D(&desc, None, Some(&mut t))
|
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||||
.context("CreateTexture2D(wgc hdr10 out)")?;
|
.context("CreateTexture2D(wgc out ring)")?;
|
||||||
self.hdr10_out = t;
|
self.out_ring.push(t.context("wgc out ring tex")?);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe fn ensure_bgra(&mut self) -> Result<()> {
|
/// Convert `input` (the OS-composited WGC pool texture: BGRA or scRGB FP16) → NVENC's native YUV
|
||||||
if self.bgra_copy.is_none() {
|
/// (NV12 / P010) on the video processor. Returns the YUV texture (from a ring so consecutive
|
||||||
|
/// encodes don't collide), or `None` to fall back to the legacy RGB paths.
|
||||||
|
unsafe fn convert_to_yuv(
|
||||||
|
&mut self,
|
||||||
|
input: &ID3D11Texture2D,
|
||||||
|
hdr: bool,
|
||||||
|
) -> Option<ID3D11Texture2D> {
|
||||||
|
if self.vp_disabled {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if self.video_conv.is_none() || self.yuv_out.is_empty() || self.yuv_is_hdr != hdr {
|
||||||
|
self.video_conv = None;
|
||||||
|
self.yuv_out.clear();
|
||||||
|
self.yuv_idx = 0;
|
||||||
|
let vc = match VideoConverter::new(
|
||||||
|
&self.device,
|
||||||
|
&self.context,
|
||||||
|
self.width,
|
||||||
|
self.height,
|
||||||
|
hdr,
|
||||||
|
) {
|
||||||
|
Ok(vc) => vc,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(error = %format!("{e:#}"),
|
||||||
|
"WGC: video processor unavailable — falling back to RGB path");
|
||||||
|
self.vp_disabled = true;
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let fmt = if hdr {
|
||||||
|
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010
|
||||||
|
} else {
|
||||||
|
windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12
|
||||||
|
};
|
||||||
let desc = tex_desc(
|
let desc = tex_desc(
|
||||||
self.width,
|
self.width,
|
||||||
self.height,
|
self.height,
|
||||||
DXGI_FORMAT_B8G8R8A8_UNORM,
|
fmt,
|
||||||
D3D11_BIND_RENDER_TARGET.0 as u32,
|
D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||||
);
|
);
|
||||||
let mut t = None;
|
for _ in 0..OUT_RING {
|
||||||
self.device
|
let mut t = None;
|
||||||
.CreateTexture2D(&desc, None, Some(&mut t))
|
if self
|
||||||
.context("CreateTexture2D(wgc bgra)")?;
|
.device
|
||||||
self.bgra_copy = t;
|
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
tracing::warn!("WGC: CreateTexture2D(YUV) failed — falling back to RGB path");
|
||||||
|
self.vp_disabled = true;
|
||||||
|
self.yuv_out.clear();
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let Some(tex) = t else {
|
||||||
|
self.vp_disabled = true;
|
||||||
|
self.yuv_out.clear();
|
||||||
|
return None;
|
||||||
|
};
|
||||||
|
self.yuv_out.push(tex);
|
||||||
|
}
|
||||||
|
self.video_conv = Some(vc);
|
||||||
|
self.yuv_is_hdr = hdr;
|
||||||
|
tracing::info!(
|
||||||
|
hdr,
|
||||||
|
"WGC: video-processor YUV path active ({})",
|
||||||
|
if hdr { "P010" } else { "NV12" }
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Ok(())
|
let slot = self.yuv_idx;
|
||||||
|
self.yuv_idx = (self.yuv_idx + 1) % self.yuv_out.len();
|
||||||
|
let out = self.yuv_out[slot].clone();
|
||||||
|
if let Err(e) = self.video_conv.as_ref()?.convert(input, &out) {
|
||||||
|
tracing::warn!(error = %format!("{e:#}"),
|
||||||
|
"WGC: VideoProcessorBlt failed — falling back to RGB path");
|
||||||
|
self.vp_disabled = true;
|
||||||
|
self.video_conv = None;
|
||||||
|
self.yuv_out.clear();
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(out)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result<CapturedFrame> {
|
fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result<CapturedFrame> {
|
||||||
@@ -391,13 +513,38 @@ impl WgcCapturer {
|
|||||||
.GetInterface()
|
.GetInterface()
|
||||||
.context("GetInterface ID3D11Texture2D")?;
|
.context("GetInterface ID3D11Texture2D")?;
|
||||||
|
|
||||||
|
// Preferred path: convert the OS-composited capture (cursor already in it) DIRECTLY to
|
||||||
|
// NVENC's native YUV on the video processor — no CopyResource, no cursor draw, and NVENC
|
||||||
|
// skips its internal RGB→YUV (the contended 3D step). WGC's multi-buffer pool + held set
|
||||||
|
// means reading the pool texture directly does NOT serialize (unlike DDA's single-frame
|
||||||
|
// model). The frame is kept in `held` (the last `HELD_FRAMES` frames) so the pool cannot recycle it under the async Blt.
|
||||||
|
if let Some(yuv) = self.convert_to_yuv(&src, self.hdr) {
|
||||||
|
let fmt = if self.hdr {
|
||||||
|
PixelFormat::P010
|
||||||
|
} else {
|
||||||
|
PixelFormat::Nv12
|
||||||
|
};
|
||||||
|
self.last_present = Some((yuv.clone(), fmt));
|
||||||
|
let out = self.d3d11_frame(yuv, fmt);
|
||||||
|
self.held.push_back(frame);
|
||||||
|
while self.held.len() > HELD_FRAMES {
|
||||||
|
self.held.pop_front();
|
||||||
|
}
|
||||||
|
return Ok(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- fallback (video processor unavailable) ---
|
||||||
if self.hdr {
|
if self.hdr {
|
||||||
|
// Next ring slot — the in-flight encode reads the slot we handed out last time, so
|
||||||
|
// this capture must land in a different one (see `out_ring`).
|
||||||
|
let slot = self.ring_idx;
|
||||||
|
self.ring_idx = (self.ring_idx + 1) % OUT_RING;
|
||||||
// FP16 (cursor already composited by the OS) → BT.2020 PQ 10-bit for NVENC.
|
// FP16 (cursor already composited by the OS) → BT.2020 PQ 10-bit for NVENC.
|
||||||
self.ensure_fp16_src()?;
|
self.ensure_fp16_src()?;
|
||||||
let fp16 = self.fp16_src.clone().context("fp16 src")?;
|
let fp16 = self.fp16_src.clone().context("fp16 src")?;
|
||||||
self.context.CopyResource(&fp16, &src);
|
self.context.CopyResource(&fp16, &src);
|
||||||
self.ensure_hdr10_out()?;
|
self.ensure_out_ring(DXGI_FORMAT_R10G10B10A2_UNORM)?;
|
||||||
let out = self.hdr10_out.clone().context("hdr10 out")?;
|
let out = self.out_ring[slot].clone();
|
||||||
if self.hdr_conv.is_none() {
|
if self.hdr_conv.is_none() {
|
||||||
self.hdr_conv = Some(HdrConverter::new(&self.device)?);
|
self.hdr_conv = Some(HdrConverter::new(&self.device)?);
|
||||||
}
|
}
|
||||||
@@ -416,12 +563,19 @@ impl WgcCapturer {
|
|||||||
self.last_present = Some((out.clone(), PixelFormat::Rgb10a2));
|
self.last_present = Some((out.clone(), PixelFormat::Rgb10a2));
|
||||||
Ok(self.d3d11_frame(out, PixelFormat::Rgb10a2))
|
Ok(self.d3d11_frame(out, PixelFormat::Rgb10a2))
|
||||||
} else {
|
} else {
|
||||||
// SDR: copy out of the recycled pool texture (cursor already composited) and hand off.
|
// SDR ZERO-COPY: hand NVENC the WGC pool texture DIRECTLY — no `CopyResource`. The
|
||||||
self.ensure_bgra()?;
|
// per-frame copy otherwise queues on the graphics engine behind a GPU-saturating game
|
||||||
let bgra = self.bgra_copy.clone().context("bgra copy")?;
|
// and stalls `lock_bitstream` ~20 ms (NVENC sits idle waiting for its input). Encoding
|
||||||
self.context.CopyResource(&bgra, &src);
|
// the pool texture in place removes that graphics-queue dependency (Apollo's model).
|
||||||
self.last_present = Some((bgra.clone(), PixelFormat::Bgra));
|
// We must keep the frame alive until its async encode finishes, so retain the last
|
||||||
Ok(self.d3d11_frame(bgra, PixelFormat::Bgra))
|
// `HELD_FRAMES`; the pool has spare buffers so the producer never starves.
|
||||||
|
self.last_present = Some((src.clone(), PixelFormat::Bgra));
|
||||||
|
let out = self.d3d11_frame(src, PixelFormat::Bgra);
|
||||||
|
self.held.push_back(frame);
|
||||||
|
while self.held.len() > HELD_FRAMES {
|
||||||
|
self.held.pop_front();
|
||||||
|
}
|
||||||
|
Ok(out)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -103,9 +103,10 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) {
|
|||||||
PixelFormat::Rgba => (Pixel::RGBA, false),
|
PixelFormat::Rgba => (Pixel::RGBA, false),
|
||||||
PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0
|
PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0
|
||||||
PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0
|
PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0
|
||||||
// 10-bit HDR (R10G10B10A2) is produced only by the Windows DXGI HDR capture path; the Linux
|
// Rgb10a2 (HDR) and NV12/P010 (the Windows video-processor YUV outputs) are produced only by
|
||||||
// capturer never emits it. Map to BGRA so the match is exhaustive — unreachable here.
|
// the Windows capture/encode paths; the Linux capturer never emits them. Map to BGRA so the
|
||||||
PixelFormat::Rgb10a2 => (Pixel::BGRA, false),
|
// match is exhaustive — unreachable here.
|
||||||
|
PixelFormat::Rgb10a2 | PixelFormat::Nv12 | PixelFormat::P010 => (Pixel::BGRA, false),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,7 +25,10 @@ use windows::Win32::Graphics::Direct3D11::{ID3D11Device, ID3D11Texture2D};
|
|||||||
use nvidia_video_codec_sdk::sys::nvEncodeAPI as nv;
|
use nvidia_video_codec_sdk::sys::nvEncodeAPI as nv;
|
||||||
use nvidia_video_codec_sdk::ENCODE_API as API;
|
use nvidia_video_codec_sdk::ENCODE_API as API;
|
||||||
|
|
||||||
const POOL: usize = 4;
|
// Output bitstream buffers = max in-flight encodes. The helper deep-pipelines (submits several frames
|
||||||
|
// before locking the oldest) so per-frame GPU-scheduling waits OVERLAP instead of serializing under a
|
||||||
|
// GPU-saturating game; this must be ≥ the helper's `PUNKTFUNK_ENCODE_DEPTH` (default 1, accepted range 1..=6).
|
||||||
|
const POOL: usize = 8;
|
||||||
|
|
||||||
fn codec_guid(codec: Codec) -> nv::GUID {
|
fn codec_guid(codec: Codec) -> nv::GUID {
|
||||||
match codec {
|
match codec {
|
||||||
@@ -363,7 +366,9 @@ impl Encoder for NvencD3d11Encoder {
|
|||||||
// frame arrives on a different device OR at a different size than our session was built on.
|
// frame arrives on a different device OR at a different size than our session was built on.
|
||||||
// HDR (BT.2020 PQ 10-bit) when the capturer hands us a 10-bit R10G10B10A2 frame. This can flip
|
// HDR (BT.2020 PQ 10-bit) when the capturer hands us a 10-bit R10G10B10A2 frame. This can flip
|
||||||
// mid-session when the user toggles HDR (which arrives as a capture device recreate anyway).
|
// mid-session when the user toggles HDR (which arrives as a capture device recreate anyway).
|
||||||
let hdr = matches!(captured.format, PixelFormat::Rgb10a2);
|
// HDR (BT.2020 PQ) when the capturer hands a 10-bit frame — either R10G10B10A2 (the legacy
|
||||||
|
// shader path) or P010 (the video-processor path). 8-bit NV12/ARGB → SDR.
|
||||||
|
let hdr = matches!(captured.format, PixelFormat::Rgb10a2 | PixelFormat::P010);
|
||||||
let dev_raw = frame.device.as_raw();
|
let dev_raw = frame.device.as_raw();
|
||||||
let size_changed =
|
let size_changed =
|
||||||
self.inited && (self.width != captured.width || self.height != captured.height);
|
self.inited && (self.width != captured.width || self.height != captured.height);
|
||||||
@@ -384,13 +389,22 @@ impl Encoder for NvencD3d11Encoder {
|
|||||||
self.width = captured.width;
|
self.width = captured.width;
|
||||||
self.height = captured.height;
|
self.height = captured.height;
|
||||||
self.hdr = hdr;
|
self.hdr = hdr;
|
||||||
if hdr {
|
// Pick the NVENC input format from the captured pixel format. YUV (NV12/P010) is the
|
||||||
// 10-bit BT.2020 PQ input; force Main10 regardless of the negotiated SDR bit depth.
|
// video-processor path — NVENC encodes it natively (no internal RGB→YUV, which is a hidden
|
||||||
self.bit_depth = 10;
|
// 3D/compute step that would fight a GPU-saturating game). RGB (ARGB/ABGR10) is the legacy
|
||||||
self.buffer_fmt = nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10;
|
// shader path. 10-bit (P010/ABGR10) forces HEVC Main10 + the BT.2020 PQ VUI.
|
||||||
} else {
|
self.buffer_fmt = match captured.format {
|
||||||
self.buffer_fmt = nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB;
|
PixelFormat::P010 => {
|
||||||
}
|
self.bit_depth = 10;
|
||||||
|
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_YUV420_10BIT
|
||||||
|
}
|
||||||
|
PixelFormat::Rgb10a2 => {
|
||||||
|
self.bit_depth = 10;
|
||||||
|
nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ABGR10
|
||||||
|
}
|
||||||
|
PixelFormat::Nv12 => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_NV12,
|
||||||
|
_ => nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
|
||||||
|
};
|
||||||
let device = frame.device.clone();
|
let device = frame.device.clone();
|
||||||
self.init_session(&device)?;
|
self.init_session(&device)?;
|
||||||
self.init_device = dev_raw;
|
self.init_device = dev_raw;
|
||||||
|
|||||||
@@ -151,6 +151,13 @@ impl Encoder for OpenH264Encoder {
|
|||||||
PixelFormat::Rgb10a2 => {
|
PixelFormat::Rgb10a2 => {
|
||||||
anyhow::bail!("software H.264 encoder cannot encode 10-bit HDR (Rgb10a2)")
|
anyhow::bail!("software H.264 encoder cannot encode 10-bit HDR (Rgb10a2)")
|
||||||
}
|
}
|
||||||
|
// NV12/P010 are GPU-resident video-processor outputs for the NVENC path; the software
|
||||||
|
// encoder never receives them (it only gets CPU RGB frames).
|
||||||
|
PixelFormat::Nv12 | PixelFormat::P010 => {
|
||||||
|
anyhow::bail!(
|
||||||
|
"software encoder cannot encode YUV GPU textures (NV12/P010 → NVENC only)"
|
||||||
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.force_kf {
|
if self.force_kf {
|
||||||
|
|||||||
@@ -101,19 +101,85 @@ pub fn run(opts: HelperOptions) -> Result<()> {
|
|||||||
let stdout = std::io::stdout();
|
let stdout = std::io::stdout();
|
||||||
let mut out = stdout.lock();
|
let mut out = stdout.lock();
|
||||||
|
|
||||||
let mut frame = first;
|
// Encode pipeline depth. The loop keeps DEPTH frames in flight so per-frame GPU-scheduling waits
|
||||||
|
// can overlap. NOTE: depth > 1 was measured to REGRESS under a GPU-saturating game — the encodes
|
||||||
|
// serialize on the contended GPU anyway, so a deeper queue just stacks latency (≈ depth × frame
|
||||||
|
// time) without raising throughput. Default 1 (the validated-best); `PUNKTFUNK_ENCODE_DEPTH` (1..=6)
|
||||||
|
// can raise it if a future workload is genuinely encode-throughput-bound rather than scheduling-bound.
|
||||||
|
let depth: usize = std::env::var("PUNKTFUNK_ENCODE_DEPTH")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.trim().parse::<usize>().ok())
|
||||||
|
.filter(|&d| (1..=6).contains(&d))
|
||||||
|
.unwrap_or(1);
|
||||||
|
tracing::info!(depth, "WGC helper: encode pipeline depth");
|
||||||
|
|
||||||
|
let perf = std::env::var_os("PUNKTFUNK_PERF").is_some();
|
||||||
|
let mut frames = 0u64;
|
||||||
|
let mut cap_wait_ns = 0u64;
|
||||||
|
let mut encode_ns = 0u64; // time blocked in lock_bitstream (the oldest in-flight encode)
|
||||||
|
let mut write_ns = 0u64; // time blocked writing the AU to the stdout pipe (relay backpressure)
|
||||||
|
let mut window = std::time::Instant::now();
|
||||||
|
|
||||||
|
// Prime: submit `depth` frames before the first poll so NVENC has that many encodes in flight.
|
||||||
|
// We don't hold the `CapturedFrame`s past `submit`: NVENC keeps its own registered texture clone
|
||||||
|
// and the capturer's ring/held-set own the canonical refs (sized for `depth`), so the in-flight
|
||||||
|
// inputs stay valid after our clones drop.
|
||||||
|
enc.submit(&first).context("first encoder submit")?;
|
||||||
|
drop(first);
|
||||||
|
for _ in 1..depth {
|
||||||
|
let f = cap.next_frame().context("WGC prime frame")?;
|
||||||
|
enc.submit(&f).context("prime encoder submit")?;
|
||||||
|
}
|
||||||
loop {
|
loop {
|
||||||
if kf.swap(false, Ordering::Relaxed) {
|
if kf.swap(false, Ordering::Relaxed) {
|
||||||
enc.request_keyframe();
|
enc.request_keyframe();
|
||||||
}
|
}
|
||||||
enc.submit(&frame).context("encoder submit")?;
|
// Pop + forward the OLDEST in-flight frame (FIFO). With `depth` outstanding it has had
|
||||||
while let Some(au) = enc.poll().context("encoder poll")? {
|
// depth-1 frames' worth of GPU slots to finish, so this rarely blocks under load.
|
||||||
if write_au(&mut out, &au).is_err() {
|
let p0 = std::time::Instant::now();
|
||||||
|
let polled = enc.poll().context("encoder poll")?;
|
||||||
|
if perf {
|
||||||
|
encode_ns += p0.elapsed().as_nanos() as u64;
|
||||||
|
}
|
||||||
|
if let Some(au) = polled {
|
||||||
|
let w0 = std::time::Instant::now();
|
||||||
|
let wrote = write_au(&mut out, &au);
|
||||||
|
if perf {
|
||||||
|
write_ns += w0.elapsed().as_nanos() as u64;
|
||||||
|
}
|
||||||
|
if wrote.is_err() {
|
||||||
tracing::info!("WGC helper: stdout closed (host gone) — exiting");
|
tracing::info!("WGC helper: stdout closed (host gone) — exiting");
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
frame = cap.next_frame().context("WGC next frame")?;
|
// Refill: capture + submit to keep `depth` frames in flight.
|
||||||
|
let t0 = std::time::Instant::now();
|
||||||
|
let next = cap.next_frame().context("WGC next frame")?;
|
||||||
|
if perf {
|
||||||
|
cap_wait_ns += t0.elapsed().as_nanos() as u64;
|
||||||
|
}
|
||||||
|
enc.submit(&next).context("encoder submit")?;
|
||||||
|
|
||||||
|
if perf {
|
||||||
|
frames += 1;
|
||||||
|
let since = window.elapsed();
|
||||||
|
if since.as_secs() >= 2 {
|
||||||
|
let secs = since.as_secs_f64();
|
||||||
|
let per = |ns: u64| format!("{:.2}", ns as f64 / frames as f64 / 1e6);
|
||||||
|
tracing::info!(
|
||||||
|
fps = format!("{:.1}", frames as f64 / secs),
|
||||||
|
cap_wait_ms = per(cap_wait_ns),
|
||||||
|
encode_ms = per(encode_ns),
|
||||||
|
write_ms = per(write_ns),
|
||||||
|
"WGC helper perf (depth-pipelined; encode_ms=lock_bitstream on the oldest)"
|
||||||
|
);
|
||||||
|
frames = 0;
|
||||||
|
cap_wait_ns = 0;
|
||||||
|
encode_ns = 0;
|
||||||
|
write_ns = 0;
|
||||||
|
window = std::time::Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option<u32> {
|
|||||||
Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888
|
Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888
|
||||||
Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888
|
Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888
|
||||||
// 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
|
// 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
|
||||||
// Rgb10a2 is the Windows HDR capture format — never produced by the Linux capturer.
|
// Rgb10a2/Nv12/P010 are the Windows HDR / video-processor formats — never produced on Linux.
|
||||||
Rgb | Bgr | Rgb10a2 => return None,
|
Rgb | Bgr | Rgb10a2 | Nv12 | P010 => return None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user