Files
punktfunk/crates/punktfunk-host/src/capture/windows/dxgi.rs
T
enricobuehler bee1f0416d chore(licensing): LGPL FFmpeg swap, third-party notices, attribution hygiene
The MIT OR Apache-2.0 SOURCE license is clean (audit found no copied copyleft); the
gaps were all binary-distribution (Layer-2). This makes the shipped artifacts honest:

- Windows host + client: bundled FFmpeg BtbN gpl-shared -> lgpl-shared (AMF/QSV/decode
  unaffected; the GPL-only x264/x265 were never used), and ship the FFmpeg LGPL notice
  + license text in the installer + MSIX (licenses/).
- THIRD-PARTY-NOTICES.txt generated + bundled into installer/MSIX/deb/rpm. Offline
  generator (scripts/gen-third-party-notices.{py,sh}) + cargo-about config (about.toml/
  .hbs) with a permissive-only accepted-license allow-list as a copyleft regression gate.
- Reword the win32u GPU-preference hook comments to reflect independent reimplementation
  (no Apollo/Sunshine GPL-3.0 source copied).
- README dual-license + inbound=outbound contributor clause + non-affiliation trademark
  disclaimer; new CONTRIBUTING.md.
- LICENSE files into the standalone driver + vk-layer workspaces; deb copyright holder
  aligned to "unom and the punktfunk contributors".

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-29 06:20:38 +00:00

3395 lines
161 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! DXGI Desktop Duplication capture (Windows) — the analogue of the PipeWire portal capturer.
//! Creates a D3D11 device on the SudoVDA adapter (by LUID), finds the matching output (by GDI
//! name), duplicates it, and on each `AcquireNextFrame` copies the desktop image into a CPU-readable
//! staging texture → tightly-packed BGRA (the GPU-less path that feeds the software encoder). A
//! future zero-copy path returns `FramePayload::D3d11` for NVENC.
//!
//! Validates only with a real GPU + an *activated* SudoVDA monitor (`DuplicateOutput` needs a live
//! WDDM output). Compiles on the GPU-less VM; the pure helpers are unit-tested there.
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
use anyhow::{anyhow, bail, Context, Result};
use std::ffi::c_void;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use windows::core::{s, Interface, PCSTR};
use windows::Win32::Foundation::{HMODULE, LUID};
use windows::Win32::Graphics::Direct3D::Fxc::D3DCompile;
use windows::Win32::Graphics::Direct3D::{
ID3DBlob, D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0, D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST,
D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP,
};
use windows::Win32::Graphics::Direct3D11::{
D3D11CreateDevice, ID3D11BlendState, ID3D11Buffer, ID3D11Device, ID3D11DeviceContext,
ID3D11PixelShader, ID3D11RenderTargetView, ID3D11SamplerState, ID3D11ShaderResourceView,
ID3D11Texture2D, ID3D11VertexShader, D3D11_BIND_CONSTANT_BUFFER, D3D11_BIND_FLAG,
D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_BLEND_DESC,
D3D11_BLEND_INV_DEST_COLOR, D3D11_BLEND_INV_SRC_ALPHA, D3D11_BLEND_ONE, D3D11_BLEND_OP_ADD,
D3D11_BLEND_SRC_ALPHA, D3D11_BUFFER_DESC, D3D11_COLOR_WRITE_ENABLE_ALL, D3D11_COMPARISON_NEVER,
D3D11_CPU_ACCESS_READ, D3D11_CPU_ACCESS_WRITE, D3D11_CREATE_DEVICE_BGRA_SUPPORT,
D3D11_FILTER_MIN_MAG_MIP_POINT, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ,
D3D11_MAP_WRITE_DISCARD, D3D11_RENDER_TARGET_BLEND_DESC, D3D11_RENDER_TARGET_VIEW_DESC,
D3D11_RENDER_TARGET_VIEW_DESC_0, D3D11_RTV_DIMENSION_TEXTURE2D, D3D11_SAMPLER_DESC,
D3D11_SDK_VERSION, D3D11_SUBRESOURCE_DATA, D3D11_TEX2D_RTV, D3D11_TEXTURE2D_DESC,
D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, D3D11_USAGE_DYNAMIC, D3D11_USAGE_STAGING,
D3D11_VIEWPORT,
};
use windows::Win32::Graphics::Dxgi::Common::{
DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_P010, DXGI_FORMAT_R10G10B10A2_UNORM,
DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16_UNORM, DXGI_FORMAT_R16_UNORM,
DXGI_SAMPLE_DESC,
};
use windows::Win32::Graphics::Dxgi::{
CreateDXGIFactory1, IDXGIAdapter1, IDXGIDevice, IDXGIDevice1, IDXGIFactory1, IDXGIOutput1,
IDXGIOutput5, IDXGIOutput6, IDXGIOutputDuplication, IDXGIResource, DXGI_ERROR_ACCESS_LOST,
DXGI_ERROR_DEVICE_REMOVED, DXGI_ERROR_DEVICE_RESET, DXGI_ERROR_INVALID_CALL,
DXGI_ERROR_MODE_CHANGE_IN_PROGRESS, DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC,
DXGI_OUTDUPL_FRAME_INFO, DXGI_OUTDUPL_POINTER_SHAPE_INFO,
DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR,
};
use windows::Win32::System::StationsAndDesktops::{
CloseDesktop, OpenInputDesktop, SetThreadDesktop, DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS,
};
use windows::Win32::UI::WindowsAndMessaging::SetCursorPos;
/// The Windows capture identity carried out of the SudoVDA backend in
/// [`crate::vdisplay::VirtualOutput`]: which adapter + which GDI output to duplicate.
///
/// Plain data: it derives `Clone` and `Debug`, so it can be duplicated and logged freely.
#[derive(Clone, Debug)]
pub struct WinCaptureTarget {
/// Packed DXGI adapter LUID (`(HighPart << 32) | (LowPart & 0xffff_ffff)`) — the same layout
/// [`pack_luid`] produces.
pub adapter_luid: i64,
/// The output's GDI device name, e.g. `\\.\DISPLAY3`. Can CHANGE across a secure-desktop switch,
/// so treat it as a snapshot, not a stable key.
pub gdi_name: String,
/// Stable SudoVDA target id — re-resolved to the current GDI name on every recovery.
pub target_id: u32,
}
/// A GPU-resident captured texture (future NVENC-D3D11 zero-copy path).
///
/// Marked `Send` (see the proof on the `unsafe impl` below) so a frame can be handed from the
/// capture thread to the encode thread; it is deliberately NOT `Sync`.
pub struct D3d11Frame {
/// The captured image as a D3D11 texture.
pub texture: ID3D11Texture2D,
/// The D3D11 device that travels with the texture — presumably the device that created
/// `texture`; confirm at the construction site (not visible in this part of the file).
pub device: ID3D11Device,
}
// SAFETY: `D3d11Frame` owns an `ID3D11Texture2D` + `ID3D11Device`, which are COM interface pointers.
// D3D11 devices/resources use thread-safe (interlocked) COM reference counting, and the device is
// created free-threaded (`make_device` passes no `D3D11_CREATE_DEVICE_SINGLETHREADED`), so handing
// ownership of the frame to another thread — the capture→encode handoff — and releasing it there is
// sound. The value is moved, never aliased (no `Sync`), so there is no concurrent use of the
// single-threaded immediate context.
unsafe impl Send for D3d11Frame {}
/// Pack a DXGI adapter `LUID` into one `i64`: `HighPart` occupies the upper 32 bits and
/// `LowPart` the lower 32 bits. The low half is masked to 32 bits so it can never bleed into
/// the high half. This is the layout stored in [`WinCaptureTarget::adapter_luid`].
pub fn pack_luid(luid: LUID) -> i64 {
    let upper = (luid.HighPart as i64) << 32;
    let lower = (luid.LowPart as i64) & 0xffff_ffff;
    upper | lower
}
/// Does a fixed-size UTF-16 GDI device name (e.g. `DXGI_OUTPUT_DESC::DeviceName`) equal `target`?
///
/// The buffer is read as a C wide string: the name ends at the FIRST NUL, or at the end of the
/// slice when there is no NUL at all. Whatever follows the terminator is ignored, so a buffer
/// that is not cleanly NUL-padded (stale data after the terminator) still compares correctly —
/// trimming only *trailing* NULs would make such a name never match. Invalid UTF-16 is decoded
/// lossily (U+FFFD), exactly as before.
fn gdi_name_matches(name16: &[u16], target: &str) -> bool {
    let end = name16
        .iter()
        .position(|&unit| unit == 0)
        .unwrap_or(name16.len());
    String::from_utf16_lossy(&name16[..end]) == target
}
/// Strip the per-row padding from a BGRA surface.
///
/// `src` holds `h` rows spaced `pitch` bytes apart, of which only the first `w * 4` bytes of each
/// row are pixels (`pitch` >= `w * 4`). Returns those pixel bytes back to back — a tightly-packed
/// `w * 4 * h` buffer.
///
/// # Panics
/// Panics if `src` is too short to contain `w * 4` bytes at any of the `h` row offsets.
fn depad_bgra(src: &[u8], pitch: usize, w: usize, h: usize) -> Vec<u8> {
    let row_bytes = w * 4;
    let mut packed = Vec::with_capacity(row_bytes * h);
    for line in 0..h {
        let start = line * pitch;
        packed.extend_from_slice(&src[start..start + row_bytes]);
    }
    packed
}
/// Locate the live `IDXGIOutput1` whose GDI device name is `gdi_name`, scanning every output of
/// every adapter (the SudoVDA monitor is enumerated under the rendering GPU). Used to recover
/// after ACCESS_LOST, where a cached handle may be stale. Returns the adapter the output was found
/// under together with the output itself; the first match wins.
///
/// # Errors
/// Fails if the DXGI factory cannot be created, if `GetDesc` fails on an enumerated output, if
/// the matching output cannot be cast to `IDXGIOutput1`, or if no output carries that name.
///
/// # Safety
/// Calls raw DXGI COM methods. No further caller precondition is visible in this function.
pub(crate) unsafe fn find_output(gdi_name: &str) -> Result<(IDXGIAdapter1, IDXGIOutput1)> {
    let factory: IDXGIFactory1 = CreateDXGIFactory1().context("CreateDXGIFactory1")?;
    // Both enumerations stop at the first index the API rejects.
    for adapter in (0u32..).map_while(|idx| factory.EnumAdapters1(idx).ok()) {
        for output in (0u32..).map_while(|idx| adapter.EnumOutputs(idx).ok()) {
            let desc = output.GetDesc()?;
            if !gdi_name_matches(&desc.DeviceName, gdi_name) {
                continue;
            }
            // Diagnostic: which ADAPTER hosts this output, and at what LUID? A LUID that BOUNCES
            // across an ACCESS_LOST storm means the output is being reparented between adapters
            // (the multi-GPU/IDD case Apollo's win32u hook + SET_RENDER_ADAPTER fix). A STABLE
            // LUID points elsewhere (e.g. HDR independent-flip DDA can't capture).
            if let Ok(adapter_desc) = adapter.GetDesc1() {
                let adapter_name = String::from_utf16_lossy(&adapter_desc.Description);
                tracing::info!(
                    output = gdi_name,
                    adapter = adapter_name.trim_end_matches('\u{0}'),
                    luid = format!(
                        "{:08x}:{:08x}",
                        adapter_desc.AdapterLuid.HighPart, adapter_desc.AdapterLuid.LowPart
                    ),
                    "find_output: output resolved under adapter"
                );
            }
            return Ok((adapter.clone(), output.cast::<IDXGIOutput1>()?));
        }
    }
    bail!("no DXGI output named {gdi_name} (gone after ACCESS_LOST?)")
}
/// Read the source display's static HDR mastering metadata via `IDXGIOutput6::GetDesc1` (for a
/// desktop capture the monitor IS the "mastering display", exactly as Sunshine/Apollo treat it).
///
/// `GetDesc1` supplies the colour primaries, white point and min/max mastering luminance, but no
/// content light level, so MaxCLL and MaxFALL are passed as `0` (unknown — the display tone-maps
/// from the mastering luminance). Logs the luminance values it read.
///
/// Returns `None` if the output cannot be cast to `IDXGIOutput6` or `GetDesc1` fails.
unsafe fn read_output_hdr_meta(output: &IDXGIOutput1) -> Option<punktfunk_core::quic::HdrMeta> {
    let Ok(output6) = output.cast::<IDXGIOutput6>() else {
        return None;
    };
    let Ok(desc) = output6.GetDesc1() else {
        return None;
    };

    let red = (desc.RedPrimary[0], desc.RedPrimary[1]);
    let green = (desc.GreenPrimary[0], desc.GreenPrimary[1]);
    let blue = (desc.BluePrimary[0], desc.BluePrimary[1]);
    let white = (desc.WhitePoint[0], desc.WhitePoint[1]);
    let meta = crate::hdr::hdr_meta_from_display(
        red,
        green,
        blue,
        white,
        desc.MaxLuminance,
        desc.MinLuminance,
        0, // MaxCLL: GetDesc1 has no content light level (Apollo zeroes it)
        0, // MaxFALL
    );

    tracing::info!(
        max_nits = desc.MaxLuminance,
        min_nits = desc.MinLuminance,
        max_full_frame_nits = desc.MaxFullFrameLuminance,
        "read source display HDR mastering metadata (GetDesc1)"
    );
    Some(meta)
}
/// Create a fresh D3D11 device + immediate context on a specific adapter (driver type UNKNOWN,
/// because the adapter is given explicitly), then apply the GPU-scheduling hardening.
///
/// Used at open and on every ACCESS_LOST: a device created on one desktop cannot sustain a
/// duplication on a *different* desktop (perpetual ACCESS_LOST), so the secure-desktop switch
/// needs a device made while the thread is attached to that desktop.
///
/// # Errors
/// Fails if `D3D11CreateDevice` fails or hands back a null device/context. The priority and
/// latency tweaks afterwards are best-effort and never fail the call.
pub(crate) unsafe fn make_device(
    adapter: &IDXGIAdapter1,
) -> Result<(ID3D11Device, ID3D11DeviceContext)> {
    let mut device_slot: Option<ID3D11Device> = None;
    let mut context_slot: Option<ID3D11DeviceContext> = None;
    D3D11CreateDevice(
        adapter,
        D3D_DRIVER_TYPE_UNKNOWN,
        HMODULE::default(),
        D3D11_CREATE_DEVICE_BGRA_SUPPORT,
        Some(&[D3D_FEATURE_LEVEL_11_0]),
        D3D11_SDK_VERSION,
        Some(&mut device_slot),
        None,
        Some(&mut context_slot),
    )
    .context("D3D11CreateDevice")?;
    let device = device_slot.context("null D3D11 device")?;
    let context = context_slot.context("null D3D11 context")?;

    // GPU scheduling hardening — the same approach Sunshine/Apollo use, reimplemented here via
    // the documented D3DKMT/DXGI APIs (no GPL source copied). Capture+encode shares the GPU with
    // the streamed game; when the game saturates the GPU this process is starved of GPU time
    // slices, so NVENC sits near-idle yet `lock_bitstream` waits ~20 ms for our context to be
    // scheduled — capping the stream (~47 fps measured at 5K@240) and stuttering. Per-frame
    // copy/convert is NOT the cause (zero-copy + thread-priority alone didn't move it); the
    // PROCESS-level GPU scheduling priority class is the decisive cross-process lever.
    // Secondary: the absolute per-device GPU thread priority and a 1-frame latency cap.
    elevate_process_gpu_priority();

    if let Ok(dxgi_device) = device.cast::<IDXGIDevice>() {
        // Try the absolute max GPU thread priority first (0x4000001E; the same value
        // Sunshine/Apollo use); only if that is refused, fall back to relative +7.
        let absolute = dxgi_device.SetGPUThreadPriority(0x4000_001E);
        if absolute.is_err() {
            let relative = dxgi_device.SetGPUThreadPriority(7);
            if relative.is_err() {
                tracing::warn!(
                    "SetGPUThreadPriority failed (run as admin/SYSTEM for GPU priority)"
                );
            }
        }
    }

    if let Ok(dxgi_device1) = device.cast::<IDXGIDevice1>() {
        // Best-effort: a failure to cap the frame latency is ignored.
        let _ = dxgi_device1.SetMaximumFrameLatency(1);
    }

    Ok((device, context))
}
/// Resolve the configured GPU scheduling-priority class from `PUNKTFUNK_GPU_PRIORITY_CLASS`
/// (`off|normal|high|realtime`, default high). `None` = leave it at the OS default (the `off`
/// opt-out).
///
/// The value is matched ignoring ASCII case and surrounding whitespace, so `OFF` or ` off `
/// really opts out instead of silently falling through to HIGH. An unset, non-Unicode, empty or
/// unrecognised value selects the default (HIGH).
///
/// D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4,
/// REALTIME 5.
fn configured_gpu_priority_class() -> Option<i32> {
    const NORMAL: i32 = 2;
    const HIGH: i32 = 4; // safe on NVIDIA+HAGS (realtime can freeze NVENC)
    const REALTIME: i32 = 5;

    let raw = std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS").ok();
    let Some(value) = raw.as_deref().map(str::trim) else {
        return Some(HIGH);
    };
    if value.eq_ignore_ascii_case("off") {
        None
    } else if value.eq_ignore_ascii_case("normal") {
        Some(NORMAL)
    } else if value.eq_ignore_ascii_case("realtime") {
        Some(REALTIME)
    } else {
        // "high" and anything unrecognised.
        Some(HIGH)
    }
}
/// Enable SE_INC_BASE_PRIORITY on the CURRENT process token (best-effort) — the kernel gates the
/// HIGH/REALTIME GPU scheduling-priority bump on it. Held by SYSTEM/Administrators; a UAC-FILTERED
/// token (what `CreateProcessAsUserW` hands the WGC helper) does NOT have it, which is why the
/// helper can't elevate itself and the SYSTEM host stamps the class onto it cross-process instead
/// (see [`set_child_gpu_priority_class`]).
///
/// `AdjustTokenPrivileges` reports SUCCESS even when the token does not hold the requested
/// privilege; that case is only visible as `ERROR_NOT_ALL_ASSIGNED` in the thread's last-error
/// value. Both outcomes are checked here, so the filtered-token case is logged instead of passing
/// as a silent "success". Failing to open the token or look up the privilege stays silent.
///
/// # Safety
/// Takes no arguments and builds every FFI argument locally; it only touches the calling
/// process's own token, which it opens and closes itself.
unsafe fn enable_inc_base_priority() {
    use windows::core::PCWSTR;
    use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID};
    use windows::Win32::Security::{
        AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES,
        SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES, TOKEN_PRIVILEGES,
        TOKEN_QUERY,
    };
    use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};

    /// Win32 `ERROR_NOT_ALL_ASSIGNED`: the call succeeded but the token lacks the privilege.
    const ERROR_NOT_ALL_ASSIGNED: i32 = 1300;

    let mut token = HANDLE::default();
    if OpenProcessToken(
        GetCurrentProcess(),
        TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
        &mut token,
    )
    .is_err()
    {
        return;
    }

    let mut luid = LUID::default();
    if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() {
        let tp = TOKEN_PRIVILEGES {
            PrivilegeCount: 1,
            Privileges: [LUID_AND_ATTRIBUTES {
                Luid: luid,
                Attributes: SE_PRIVILEGE_ENABLED,
            }],
        };
        let adjusted = AdjustTokenPrivileges(
            token,
            false,
            Some(&tp as *const TOKEN_PRIVILEGES),
            0,
            None,
            None,
        );
        // Capture the last-error value IMMEDIATELY, before any other call (logging included)
        // can overwrite it.
        let last_error = std::io::Error::last_os_error().raw_os_error();
        if adjusted.is_err() {
            tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority");
        } else if last_error == Some(ERROR_NOT_ALL_ASSIGNED) {
            tracing::warn!(
                "SE_INC_BASE_PRIORITY is not held by this process token (filtered token?) — \
                 HIGH/REALTIME GPU priority will not apply"
            );
        }
    }
    let _ = CloseHandle(token);
}
/// Call `gdi32!D3DKMTSetProcessSchedulingPriorityClass(process, prio)` (no stable windows-rs binding —
/// loaded by name). Returns the NTSTATUS (0 = success) or `None` if the export can't be resolved. The
/// CALLING process must hold SE_INC_BASE_PRIORITY ([`enable_inc_base_priority`]) for HIGH/REALTIME; the
/// kernel checks the caller's privilege whether the target is self or a child we created.
///
/// `None` is returned both when `gdi32.dll` cannot be loaded and when the export is missing.
///
/// # Safety
/// `process` must be a valid process handle (the current-process pseudo-handle qualifies). `prio`
/// is passed to the export unchecked.
unsafe fn d3dkmt_set_scheduling_priority_class(
process: windows::Win32::Foundation::HANDLE,
prio: i32,
) -> Option<i32> {
use windows::core::s;
use windows::Win32::Foundation::HANDLE;
use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
// Resolve the export at run time; either failure short-circuits to `None`.
let gdi32 = LoadLibraryA(s!("gdi32.dll")).ok()?;
let p = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass"))?;
// `GetProcAddress` yields an untyped function pointer; reinterpret it with the export's signature.
// NOTE(review): the (HANDLE, i32) -> NTSTATUS shape is taken from this function's own doc
// comment — confirm against the D3DKMT headers.
type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32;
let f: SetPrio = std::mem::transmute(p);
Some(f(process, prio))
}
/// GPU scheduling-priority hardening — the same approach as Sunshine/Apollo, independently
/// implemented via the documented D3DKMT APIs (no GPL source copied).
///
/// On a GPU-saturated game the capture+encode process is starved of GPU time slices — NVENC sits
/// ~idle but `lock_bitstream` waits ~20 ms for our context to be scheduled. Raising the PROCESS
/// GPU scheduling priority class (the strong cross-process lever — far more effective than
/// `SetGPUThreadPriority` alone, which was measured as no help) lets the brief encode preempt the
/// game. The default is HIGH, NOT realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC
/// (Apollo downgrades it for exactly this).
///
/// Runs once per process and is best-effort: every outcome is only logged.
/// `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime` (default high). NOTE: in the
/// SYSTEM-host + user-session-helper deployment this self-set NO-OPs in the helper (filtered
/// token), so the host also sets it on the helper via [`set_child_gpu_priority_class`].
fn elevate_process_gpu_priority() {
    use std::sync::Once;
    static APPLIED: Once = Once::new();
    // SAFETY: the closure calls two of this module's `unsafe fn`s — `enable_inc_base_priority`
    // (adjusts the current-process token; it has no caller precondition and builds all its FFI
    // args locally) and `d3dkmt_set_scheduling_priority_class` (loads gdi32 by name and calls the
    // export). The latter requires `process` to be a valid process handle; `GetCurrentProcess()`
    // returns the current-process pseudo-handle, which is always valid and needs no close. Runs
    // once via `Once::call_once`; no raw pointers are dereferenced here.
    APPLIED.call_once(|| unsafe {
        use windows::Win32::System::Threading::GetCurrentProcess;
        let class = match configured_gpu_priority_class() {
            Some(class) => class,
            None => {
                tracing::info!("GPU process scheduling priority class left at default (off)");
                return;
            }
        };
        enable_inc_base_priority();
        let outcome = d3dkmt_set_scheduling_priority_class(GetCurrentProcess(), class);
        if let Some(status) = outcome {
            if status == 0 {
                tracing::info!(
                    priority_class = class,
                    "GPU process scheduling priority class set (2=normal 4=high 5=realtime)"
                );
            } else {
                tracing::warn!(
                    status = format!("0x{status:08X}"),
                    "D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)"
                );
            }
        } else {
            tracing::warn!("D3DKMTSetProcessSchedulingPriorityClass export not found");
        }
    });
}
/// Set the GPU scheduling-priority class of ANOTHER process we created — the WGC capture+encode
/// helper in the interactive user session.
///
/// The helper is spawned with the user's UAC-FILTERED token, which lacks SE_INC_BASE_PRIORITY, so
/// its own [`elevate_process_gpu_priority`] silently no-ops and NVENC gets starved under a
/// GPU-saturating game (the "240→40 fps in-game collapse"). The SYSTEM host DOES hold the
/// privilege, so it stamps the class onto the child's process handle right after spawn — the
/// process-level class applies to GPU contexts the child creates afterwards. Best-effort; the
/// outcome is only logged. `PUNKTFUNK_GPU_PRIORITY_CLASS=off` disables it (same knob as the self
/// path), in which case nothing is done and nothing is logged.
///
/// # Safety
/// `process` must be a valid handle to a process we own with at least PROCESS_SET_INFORMATION
/// access (the just-created helper, `PROCESS_INFORMATION::hProcess`).
pub(crate) unsafe fn set_child_gpu_priority_class(process: windows::Win32::Foundation::HANDLE) {
    let class = match configured_gpu_priority_class() {
        Some(class) => class,
        None => return,
    };
    // The SYSTEM host holds SE_INC_BASE_PRIORITY; the helper does not.
    enable_inc_base_priority();
    let outcome = d3dkmt_set_scheduling_priority_class(process, class);
    if let Some(status) = outcome {
        if status == 0 {
            tracing::info!(
                priority_class = class,
                "WGC helper GPU scheduling priority class set cross-process from the SYSTEM host \
                 (2=normal 4=high 5=realtime)"
            );
        } else {
            tracing::warn!(
                status = format!("0x{status:08X}"),
                "cross-process D3DKMTSetProcessSchedulingPriorityClass on the WGC helper failed"
            );
        }
    } else {
        tracing::warn!(
            "D3DKMTSetProcessSchedulingPriorityClass export not found — WGC helper has no GPU priority"
        );
    }
}
/// Rebuild the whole capture chain for `gdi_name`: re-find the output, create a fresh device on
/// the adapter it now sits under, and duplicate it. Used by the ACCESS_LOST recovery to re-open
/// the capture on the current (possibly secure) input desktop.
///
/// Returns `(device, context, output, duplication)`.
///
/// # Errors
/// Propagates a failure of any of the three steps; a duplication failure carries the context
/// `re-DuplicateOutput after ACCESS_LOST`.
unsafe fn reopen_duplication(
    gdi_name: &str,
    want_hdr: bool,
) -> Result<(
    ID3D11Device,
    ID3D11DeviceContext,
    IDXGIOutput1,
    IDXGIOutputDuplication,
)> {
    let (adapter, output) = find_output(gdi_name)?;
    let (device, context) = make_device(&adapter)?;
    duplicate_output(&output, &device, want_hdr)
        .context("re-DuplicateOutput after ACCESS_LOST")
        .map(|duplication| (device, context, output, duplication))
}
/// Create the output duplication. Prefer `IDXGIOutput5::DuplicateOutput1` with an explicit
/// encoder-format list — Apollo's path. It hands back the desktop's real scanout format (HDR FP16
/// or SDR BGRA8) and is far more robust to overlay/format changes than legacy `DuplicateOutput`
/// (which always tone-maps to 8-bit BGRA — the source of much of the ACCESS_LOST churn). Requires
/// the process be per-monitor-v2 DPI aware (set at startup in [`install_gpu_pref_hook`]). Falls
/// back to legacy `DuplicateOutput` if Output5 is unavailable or every `DuplicateOutput1` attempt
/// fails.
///
/// Tuning knobs (read on every call):
/// - `PUNKTFUNK_DUP_RETRY_MS` — wait between `DuplicateOutput1` attempts, default 200 ms.
/// - `PUNKTFUNK_DUP_RETRY_N` — number of attempts, default 5 for HDR and 1 for SDR; values below
///   1 are raised to 1.
unsafe fn duplicate_output(
    output: &IDXGIOutput1,
    device: &ID3D11Device,
    want_hdr: bool,
) -> Result<IDXGIOutputDuplication> {
    /// Read an unsigned tuning knob from the environment; `None` when unset or not a number.
    fn env_knob(key: &str) -> Option<u64> {
        std::env::var(key).ok()?.parse().ok()
    }

    if let Ok(output5) = output.cast::<IDXGIOutput5>() {
        // HDR session: ask for FP16 FIRST so DuplicateOutput1 returns the desktop's real scRGB
        // HDR surface → the `hdr_fp16` path converts it to BT.2020 PQ 10-bit for NVENC Main10.
        // SDR session: ask for BGRA8 only — listing FP16 first would make DXGI hand back FP16
        // even on an SDR desktop, wrongly tripping the HDR path. (HDR DDA is used for the secure
        // desktop, where the SudoVDA may be in HDR and legacy DuplicateOutput — the SDR-era API —
        // can't capture FP16.)
        let formats: &[DXGI_FORMAT] = if want_hdr {
            &[DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_B8G8R8A8_UNORM]
        } else {
            &[DXGI_FORMAT_B8G8R8A8_UNORM]
        };

        // Why retry at all: the caller releases the OLD duplication (self.dupl = None) right
        // before calling us, and the kernel-side teardown of that duplication is ASYNC — the
        // first DuplicateOutput1 can race it and return E_ACCESSDENIED ("output still
        // duplicated") although our only reference is gone. Apollo waits 200 ms between
        // attempts for the same reason; short (ms) waits aren't enough.
        let retry_wait = Duration::from_millis(env_knob("PUNKTFUNK_DUP_RETRY_MS").unwrap_or(200));

        // Why the attempt default differs: on the secure desktop DuplicateOutput1 ALWAYS refuses
        // (only LOGON_UI may use it), so retrying an SDR capture there just blocks the capture
        // thread, and on the normal desktop release-before-reduplicate + gentle recovery already
        // keep the legacy dup stable → SDR gets a single attempt. HDR genuinely NEEDS
        // DuplicateOutput1 (legacy DuplicateOutput can't capture an FP16/HDR desktop — it returns
        // E_INVALIDARG), so it gets several attempts rather than bailing to a useless fallback.
        let default_attempts = if want_hdr { 5 } else { 1 };
        let attempts = env_knob("PUNKTFUNK_DUP_RETRY_N")
            .unwrap_or(default_attempts)
            .max(1);

        let mut last_err = None;
        for attempt in 0..attempts {
            let failure = match output5.DuplicateOutput1(device, 0, formats) {
                Ok(duplication) => {
                    if attempt > 0 {
                        tracing::debug!(
                            attempt,
                            "DuplicateOutput1 succeeded on retry (rode out old-dup teardown race)"
                        );
                    }
                    return Ok(duplication);
                }
                Err(failure) => failure,
            };
            last_err = Some(failure);
            // No point sleeping after the final attempt.
            if attempt + 1 < attempts {
                std::thread::sleep(retry_wait);
            }
        }

        if let Some(e) = last_err {
            // Expected on the secure (Winlogon) desktop (DuplicateOutput1 is LOGON_UI-only) and
            // fires once per gentle recovery there — log only every 64th occurrence so a lock
            // dwell doesn't flood the log. The legacy fallback below takes over.
            static FALLBACKS: AtomicU64 = AtomicU64::new(0);
            if FALLBACKS.fetch_add(1, Ordering::Relaxed) % 64 == 0 {
                tracing::debug!(
                    error = %format!("{e:?}"),
                    "DuplicateOutput1 unavailable — using legacy DuplicateOutput (expected on the secure desktop)"
                );
            }
        }
    }
    output.DuplicateOutput(device).context("DuplicateOutput")
}
/// Re-sync the calling (capture) thread to the CURRENT input desktop. MUST be called on EVERY recovery
/// — symmetrically for ENTERING and LEAVING the Winlogon (secure: lock/login/UAC) desktop. Gating it on
/// is_secure_desktop() (the old bug) re-attached only on the way IN, so on the way OUT the capture
/// thread stayed stuck on the gone Winlogon desktop and every rebuild failed → no frames → client
/// timeout → "display disconnected". Apollo calls its equivalent (syncThreadDesktop) before every
/// duplicate. Opening the secure desktop requires SYSTEM (the host relaunches itself as SYSTEM).
/// Matches Apollo by closing the handle right after SetThreadDesktop — the thread keeps the desktop via
/// an internal reference, so this does NOT leak even when called on every recovery.
///
/// Best-effort: a failure of either call is logged as a warning and otherwise ignored.
unsafe fn attach_input_desktop() {
match OpenInputDesktop(
DESKTOP_CONTROL_FLAGS(0),
false,
DESKTOP_ACCESS_FLAGS(0x1000_0000), // GENERIC_ALL
) {
Ok(desk) => {
if let Err(e) = SetThreadDesktop(desk) {
tracing::warn!(error = %format!("{e:?}"), "attach_input_desktop: SetThreadDesktop FAILED");
}
// Closed whether or not SetThreadDesktop succeeded, so the handle never leaks.
let _ = CloseDesktop(desk);
}
Err(e) => {
tracing::warn!(error = %format!("{e:?}"), "attach_input_desktop: OpenInputDesktop FAILED")
}
}
}
/// Park the cursor on a duplicated output. A blank virtual display emits NO Desktop Duplication
/// frames until something changes; a pointer move IS a DDA "change", so this kicks the very first
/// `AcquireNextFrame` loose — and lands the cursor on the display the client is viewing. Two moves
/// to distinct points guarantee an actual move even if the cursor already sat at the center.
///
/// Does nothing if the output's description cannot be read; `SetCursorPos` failures are ignored.
pub(crate) unsafe fn nudge_cursor_onto(output: &IDXGIOutput1) {
if let Ok(od) = output.GetDesc() {
let r = od.DesktopCoordinates;
// First hop: 8 px inside the output's top-left corner.
let _ = SetCursorPos(r.left + 8, r.top + 8);
// Second hop: the centre of the output, where the cursor is left.
let _ = SetCursorPos((r.left + r.right) / 2, (r.top + r.bottom) / 2);
}
}
/// How many times DXGI has actually called our hooked `NtGdiDdDDIGetCachedHybridQueryValue`. If this
/// stays 0 while DDA churns with ACCESS_LOST, the hook is NOT on DXGI's GPU-preference path on this
/// build (so reparenting can't be the cause — look at composition/independent-flip instead). >0 with
/// continuing churn means the hook fires but reparenting isn't the trigger here.
static HYBRID_HOOK_HITS: AtomicU64 = AtomicU64::new(0);
/// Current value of the [`HYBRID_HOOK_HITS`] diagnostic counter (relaxed load — it is a statistic,
/// not a synchronization point).
pub(crate) fn hybrid_hook_hits() -> u64 {
HYBRID_HOOK_HITS.load(Ordering::Relaxed)
}
// kernel32 — declared directly so we don't pull the whole Win32_System_Diagnostics_Debug feature for
// one call. FlushInstructionCache serializes the i-cache after the inline patch: the patch is written
// on the main thread but DXGI runs the hooked export from the encode/worker thread (possibly a
// different core), so the "same-thread, no flush needed" assumption was wrong.
#[link(name = "kernel32")]
extern "system" {
// Flush the instruction cache of process `h` for the `size` bytes starting at `base`.
fn FlushInstructionCache(h: *mut c_void, base: *const c_void, size: usize) -> i32;
// Handle for the calling process, as an untyped pointer (this raw declaration is separate from
// the windows-rs `GetCurrentProcess` some functions in this file import locally).
fn GetCurrentProcess() -> *mut c_void;
// Takes a mask of the `ES_*` flags below. NOTE(review): no call site is visible in this part of
// the file — presumably used to keep the system/display awake while streaming; confirm.
fn SetThreadExecutionState(es_flags: u32) -> u32;
}
// Flag bits for `SetThreadExecutionState`; the names mirror the Win32 `ES_*` constants.
const ES_CONTINUOUS: u32 = 0x8000_0000;
const ES_SYSTEM_REQUIRED: u32 = 0x0000_0001;
const ES_DISPLAY_REQUIRED: u32 = 0x0000_0002;
/// Replacement for `win32u.dll!NtGdiDdDDIGetCachedHybridQueryValue`: always report
/// `D3DKMT_GPU_PREFERENCE_STATE_UNSPECIFIED` (3). The original function is fully replaced (never
/// called), so no trampoline is needed. (Independent reimplementation of the same technique
/// Apollo uses: Apollo installs its hook via the MinHook library; this is an original inline
/// byte-patch and copies no Apollo/GPL source.)
///
/// Every call bumps [`HYBRID_HOOK_HITS`], including calls rejected for a null pointer.
///
/// Returns `STATUS_SUCCESS` (0) after writing the preference, or `STATUS_INVALID_PARAMETER` when
/// `gpu_preference` is null.
///
/// # Safety
/// A non-null `gpu_preference` must point to writable memory for one `u32`.
unsafe extern "system" fn hybrid_query_hook(gpu_preference: *mut u32) -> i32 {
    const STATUS_SUCCESS: i32 = 0;
    const STATUS_INVALID_PARAMETER: i32 = 0xC000_000Du32 as i32;
    const GPU_PREFERENCE_STATE_UNSPECIFIED: u32 = 3;

    HYBRID_HOOK_HITS.fetch_add(1, Ordering::Relaxed);
    if !gpu_preference.is_null() {
        gpu_preference.write(GPU_PREFERENCE_STATE_UNSPECIFIED);
        return STATUS_SUCCESS;
    }
    STATUS_INVALID_PARAMETER
}
/// The win32u GPU-preference hook (the same technique Apollo applies, reimplemented here from the
/// documented DDI — no GPL source copied). On a HYBRID-GPU box DXGI resolves a GPU preference
/// (registry + power settings + the hybrid-adapter DDI) and REPARENTS outputs onto the chosen render
/// GPU — which constantly invalidates Desktop Duplication (DXGI_ERROR_ACCESS_LOST 0x887A0026, the
/// freeze/churn observed on the RTX 4090 + AMD iGPU box; `SET_RENDER_ADAPTER` is ignored there). Faking
/// a cached preference of UNSPECIFIED makes DXGI skip the resolution, so the output is NOT reparented
/// and DDA stays stable on one adapter (this is what makes Apollo's DDA work on this hardware).
/// Installed once, before the first DXGI factory/enumeration; lasts the process lifetime (like Apollo).
///
/// Before patching, this also requests per-monitor-v2 DPI awareness for the process (needed by
/// `IDXGIOutput5::DuplicateOutput1`) and logs the effective awareness. Every failure path only
/// logs and returns — the function never panics or reports an error to the caller.
///
/// NOTE(review): the patch bytes are x86-64 machine code and nothing in this function gates on
/// `target_arch` — confirm the crate is only ever built for x86_64 Windows.
pub(crate) fn install_gpu_pref_hook() {
use std::sync::Once;
static HOOK: Once = Once::new();
// SAFETY: this one-time hook install only touches a region it has just validated.
// `LoadLibraryA("win32u.dll")` + `GetProcAddress("NtGdiDdDDIGetCachedHybridQueryValue")` yield the
// live base of the real exported function, so `target` is a valid executable code pointer to at
// least the 12 bytes the patch overwrites (an x64 prologue). The two
// `ptr::copy_nonoverlapping`s each move exactly 12 bytes between the 12-byte stack arrays
// (`patch`/`readback`) and `target`, which `VirtualProtect(target, 12, PAGE_EXECUTE_READWRITE, …)`
// has just made writable (and is restored to `old` after) — source and dest never overlap (stack
// vs. loaded module image), so every access stays in mapped, in-bounds memory.
// `FlushInstructionCache` gets the current-process pseudo-handle + that same range. The DPI calls
// take by-value context handles / fill the live local `&mut old`/`&mut restore` for the duration of
// each synchronous call. Runs once via `Once::call_once`, before any DXGI use.
HOOK.call_once(|| unsafe {
use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
use windows::Win32::System::Memory::{
VirtualProtect, PAGE_EXECUTE_READWRITE, PAGE_PROTECTION_FLAGS,
};
use windows::Win32::UI::HiDpi::{
GetAwarenessFromDpiAwarenessContext, GetThreadDpiAwarenessContext,
SetProcessDpiAwarenessContext, DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2,
};
// Per-monitor-v2 DPI awareness — REQUIRED for IDXGIOutput5::DuplicateOutput1 (without it the
// call returns E_ACCESSDENIED forever, forcing the legacy DuplicateOutput path). Matches
// Apollo's startup. SetProcessDpiAwarenessContext fails with E_ACCESS_DENIED if awareness was
// already set (manifest / earlier call) — log the outcome AND the effective awareness so a
// 100% DuplicateOutput1 E_ACCESSDENIED is diagnosable instead of silent.
match SetProcessDpiAwarenessContext(DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2) {
Ok(()) => tracing::info!("DPI awareness set: PER_MONITOR_AWARE_V2"),
Err(e) => tracing::warn!(error = %format!("{e:?}"),
"SetProcessDpiAwarenessContext failed (already set?) — DuplicateOutput1 may E_ACCESSDENIED"),
}
// 0=UNAWARE 1=SYSTEM 2=PER_MONITOR(_V2). DuplicateOutput1 needs 2.
let awareness = GetAwarenessFromDpiAwarenessContext(GetThreadDpiAwarenessContext()).0;
tracing::info!(awareness, "effective DPI awareness (need 2=PER_MONITOR for DuplicateOutput1)");
// Resolve the export to patch; either lookup failing means "no hook", not an error.
let Ok(lib) = LoadLibraryA(s!("win32u.dll")) else {
tracing::warn!("GPU-pref hook: win32u.dll not loadable — skipping (DDA may churn on hybrid GPUs)");
return;
};
let Some(target) = GetProcAddress(lib, s!("NtGdiDdDDIGetCachedHybridQueryValue")) else {
tracing::warn!("GPU-pref hook: NtGdiDdDDIGetCachedHybridQueryValue not exported — skipping");
return;
};
let target = target as usize as *mut u8;
// x64 absolute jump to our replacement: `mov rax, imm64 ; jmp rax` (12 bytes). We never call the
// original, so no trampoline/relocation (hence no detour crate / C length-disassembler dep).
let hook = hybrid_query_hook as *const () as usize;
let mut patch = [0u8; 12];
patch[0] = 0x48;
patch[1] = 0xB8; // mov rax, imm64
patch[2..10].copy_from_slice(&hook.to_le_bytes());
patch[10] = 0xFF;
patch[11] = 0xE0; // jmp rax
// Make the 12 target bytes writable, remembering the previous protection in `old`.
let mut old = PAGE_PROTECTION_FLAGS(0);
if VirtualProtect(target as *const c_void, 12, PAGE_EXECUTE_READWRITE, &mut old).is_err() {
tracing::warn!("GPU-pref hook: VirtualProtect failed — skipping");
return;
}
std::ptr::copy_nonoverlapping(patch.as_ptr(), target, 12);
// Put the original protection back; a failure here is ignored (the patch is already written).
let mut restore = PAGE_PROTECTION_FLAGS(0);
let _ = VirtualProtect(target as *const c_void, 12, old, &mut restore);
// Serialize the i-cache: the patch is written here (main thread) but DXGI calls the export from
// the capture/encode worker thread — possibly a different core with a stale i-cache, in which
// case it would keep running the ORIGINAL function and DXGI would still reparent. (Apollo's
// MinHook does this flush internally; our hand-rolled patch must do it explicitly.)
let _ = FlushInstructionCache(GetCurrentProcess(), target as *const c_void, 12);
// VERIFY the patch actually landed (CFG/hotpatch/short-stub could silently reject it). Read it
// back; an error! (not a cheery "installed") makes a dead hook obvious in the logs.
let mut readback = [0u8; 12];
std::ptr::copy_nonoverlapping(target, readback.as_mut_ptr(), 12);
if readback == patch {
tracing::info!(
"GPU-pref hook installed + verified (win32u hybrid-query -> UNSPECIFIED): reparenting disabled"
);
} else {
tracing::error!(
want = %format!("{patch:02x?}"), got = %format!("{readback:02x?}"),
"GPU-pref hook patch did NOT land — hook is DEAD (DXGI will still reparent → ACCESS_LOST churn)"
);
}
});
}
// DXGI Desktop Duplication deliberately EXCLUDES the hardware cursor from the captured surface (the
// OS composites it separately). We capture the cursor shape/position from the frame info and blend it
// back in — on the GPU for the zero-copy path (a CPU readback would stall the 240 fps pipeline).
/// Cursor overlay vertex shader (HLSL, entry point `main`).
///
/// Generates the corners of one rectangle purely from `SV_VertexID` (no vertex buffer): vertex ids
/// 0..3 map to `uv` = (0,0), (1,0), (0,1), (1,1), and the output position interpolates between the
/// rect corners stored in constant buffer `b0` as `r = (x0, y0, x1, y1)`. The same `uv` is passed on
/// to the pixel shader as the cursor-texture coordinate. [`CursorCompositor::draw_layer`] fills `b0`
/// and issues the matching 4-vertex triangle-strip draw.
const CURSOR_VS: &str = r"
cbuffer Rect : register(b0) { float4 r; };
struct VOut { float4 pos : SV_POSITION; float2 uv : TEXCOORD0; };
VOut main(uint vid : SV_VertexID) {
float2 uv = float2((vid == 1 || vid == 3) ? 1.0 : 0.0, (vid >= 2) ? 1.0 : 0.0);
VOut o;
o.pos = float4(lerp(r.x, r.z, uv.x), lerp(r.y, r.w, uv.y), 0.0, 1.0);
o.uv = uv;
return o;
}
";
/// Cursor overlay pixel shader (HLSL, entry point `main`).
///
/// Samples the cursor texture at `uv` and returns it with its alpha untouched. The colour channels
/// are optionally sRGB→linear decoded (when `decode > 0.5`) and then multiplied by `white_mul`.
/// Both parameters live in constant buffer `b0` directly after the `float4` rect that the vertex
/// shader reads, so the 32-byte buffer written by [`CursorCompositor::draw_layer`] serves both
/// stages. The actual compositing onto the frame is done by the bound blend state, not here.
const CURSOR_PS: &str = r"
Texture2D tx : register(t0);
SamplerState sm : register(s0);
// b0 is shared with the VS: float4 rect, then the HDR cursor params. For SDR white_mul=1 / decode=0
// so this is a no-op (returns the raw sampled BGRA, blended in the display's native sRGB space). For
// HDR the cursor is composited onto a LINEAR scRGB FP16 surface where 1.0 = 80 nits, so we sRGB→
// linear decode (correct alpha blending + no dark edge fringe) and scale to HDR graphics white
// (~203 nits → white_mul = 203/80) so the cursor isn't ~2.5x too dim vs the HDR desktop.
cbuffer C : register(b0) { float4 rect; float white_mul; float decode; float2 pad; };
float3 srgb_to_linear(float3 c) {
return c <= 0.04045 ? c / 12.92 : pow((c + 0.055) / 1.055, 2.4);
}
float4 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
float4 s = tx.Sample(sm, uv);
float3 rgb = s.rgb;
if (decode > 0.5) { rgb = srgb_to_linear(rgb); }
rgb *= white_mul;
return float4(rgb, s.a);
}
";
/// Compile one HLSL entry point with `D3DCompile` and return the resulting bytecode.
///
/// * `src` — the complete HLSL source text (no include handler is supplied, so it must be
///   self-contained).
/// * `entry` — name of the entry-point function, e.g. `s!("main")`.
/// * `target` — shader profile, e.g. `s!("vs_5_0")` / `s!("ps_5_0")`.
///
/// # Errors
/// Returns an error carrying both the failing `HRESULT` and the compiler's diagnostic text (when
/// the compiler produced any), or an error if compilation reported success but yielded no
/// bytecode.
///
/// # Safety
/// `entry` and `target` must point to valid NUL-terminated strings for the duration of the call.
unsafe fn compile_shader(src: &str, entry: PCSTR, target: PCSTR) -> Result<Vec<u8>> {
    let mut blob: Option<ID3DBlob> = None;
    let mut errs: Option<ID3DBlob> = None;
    let outcome = D3DCompile(
        src.as_ptr() as *const c_void,
        src.len(),
        PCSTR::null(),
        None,
        None,
        entry,
        target,
        0,
        0,
        &mut blob,
        Some(&mut errs),
    );
    if let Err(e) = outcome {
        // The diagnostics blob is a C string: drop the trailing NUL/newline so it logs cleanly.
        let msg = errs
            .as_ref()
            .map(|log| {
                let p = log.GetBufferPointer() as *const u8;
                let len = log.GetBufferSize();
                if p.is_null() || len == 0 {
                    return String::new();
                }
                String::from_utf8_lossy(std::slice::from_raw_parts(p, len))
                    .trim_end_matches('\0')
                    .trim_end()
                    .to_string()
            })
            .unwrap_or_default();
        // Always surface the HRESULT: the compiler log can be absent (e.g. bad arguments), and
        // without the code the failure would be undiagnosable.
        if msg.is_empty() {
            bail!("D3DCompile failed: {e}");
        }
        bail!("D3DCompile failed: {e}: {msg}");
    }
    let blob = blob.context("no shader blob")?;
    let p = blob.GetBufferPointer() as *const u8;
    let len = blob.GetBufferSize();
    // `slice::from_raw_parts` requires a non-null pointer even for an empty slice.
    if p.is_null() || len == 0 {
        bail!("D3DCompile returned an empty shader blob");
    }
    Ok(std::slice::from_raw_parts(p, len).to_vec())
}
/// A DXGI cursor shape decomposed into up to two BGRA layers. A single shape can require BOTH a
/// normal alpha-blended layer AND a screen-inverting (XOR) layer at once — e.g. a masked-color text
/// I-beam (opaque pixels + invert pixels) or a monochrome cursor mixing opaque and invert pixels.
/// Each layer is composited with its own blend; a single image + single blend (the old approach)
/// renders such mixed shapes wrong (wrong color, or a black box where the screen should invert).
///
/// Both layers share the same `w` × `h` dimensions and are uploaded by
/// [`CursorCompositor::set_shapes`] as tightly packed 4-byte-per-pixel BGRA (row pitch `w * 4`).
#[derive(Clone, Default)]
struct CursorShape {
/// Width of both layers, in pixels.
w: u32,
/// Height of both layers, in pixels.
h: u32,
/// Layer composited with src-over alpha (transparent where a==0). `None` if it has no pixels.
alpha: Option<Vec<u8>>,
/// Layer composited with the inversion blend (white opaque → invert the screen underneath).
/// `None` if it has no pixels.
xor: Option<Vec<u8>>,
}
/// GPU cursor overlay: a tiny shader pipeline that blends the cursor texture(s) onto the captured
/// frame. Tied to one D3D11 device; rebuilt when the capturer recreates its device on a desktop switch.
struct CursorCompositor {
/// Vertex shader compiled from [`CURSOR_VS`] (quad corners generated from the vertex id).
vs: ID3D11VertexShader,
/// Pixel shader compiled from [`CURSOR_PS`] (texture sample + optional HDR decode/scale).
ps: ID3D11PixelShader,
/// 32-byte dynamic constant buffer bound to slot 0 of both stages: rect + `white_mul`/`decode`.
cbuf: ID3D11Buffer,
/// Standard src-over alpha blend used for the normal cursor layer.
blend: ID3D11BlendState,
/// Inversion blend for masked-color (XOR) cursors like the text I-beam: result = white*(1-dest),
/// i.e. it inverts the screen under the cursor so it's visible on any background.
blend_invert: ID3D11BlendState,
/// Point-filtered, clamp-addressed sampler for the cursor texture.
sampler: ID3D11SamplerState,
/// Alpha-blended layer (normal cursor pixels). srv + width + height.
tex_alpha: Option<(ID3D11ShaderResourceView, u32, u32)>,
/// Inversion-blended layer (screen-inverting pixels: masked-color I-beam bar, monochrome invert).
tex_xor: Option<(ID3D11ShaderResourceView, u32, u32)>,
}
impl CursorCompositor {
unsafe fn new(device: &ID3D11Device) -> Result<Self> {
let vsb = compile_shader(CURSOR_VS, s!("main"), s!("vs_5_0"))?;
let psb = compile_shader(CURSOR_PS, s!("main"), s!("ps_5_0"))?;
let mut vs = None;
device.CreateVertexShader(&vsb, None, Some(&mut vs))?;
let mut ps = None;
device.CreatePixelShader(&psb, None, Some(&mut ps))?;
let cbd = D3D11_BUFFER_DESC {
ByteWidth: 32, // float4 rect + (white_mul, decode, pad, pad) for the HDR cursor PS
Usage: D3D11_USAGE_DYNAMIC,
BindFlags: D3D11_BIND_CONSTANT_BUFFER.0 as u32,
CPUAccessFlags: D3D11_CPU_ACCESS_WRITE.0 as u32,
..Default::default()
};
let mut cbuf = None;
device.CreateBuffer(&cbd, None, Some(&mut cbuf))?;
let mut bd = D3D11_BLEND_DESC::default();
bd.RenderTarget[0] = D3D11_RENDER_TARGET_BLEND_DESC {
BlendEnable: true.into(),
SrcBlend: D3D11_BLEND_SRC_ALPHA,
DestBlend: D3D11_BLEND_INV_SRC_ALPHA,
BlendOp: D3D11_BLEND_OP_ADD,
SrcBlendAlpha: D3D11_BLEND_ONE,
DestBlendAlpha: D3D11_BLEND_INV_SRC_ALPHA,
BlendOpAlpha: D3D11_BLEND_OP_ADD,
RenderTargetWriteMask: D3D11_COLOR_WRITE_ENABLE_ALL.0 as u8,
};
let mut blend = None;
device.CreateBlendState(&bd, Some(&mut blend))?;
// Inversion blend: result.rgb = src*(1-dest) + dest*(1-src.a). A white opaque cursor pixel
// (src=1,a=1) -> 1-dest (inverted); a transparent pixel (src=0,a=0) -> dest (unchanged).
let mut bdi = D3D11_BLEND_DESC::default();
bdi.RenderTarget[0] = D3D11_RENDER_TARGET_BLEND_DESC {
BlendEnable: true.into(),
SrcBlend: D3D11_BLEND_INV_DEST_COLOR,
DestBlend: D3D11_BLEND_INV_SRC_ALPHA,
BlendOp: D3D11_BLEND_OP_ADD,
SrcBlendAlpha: D3D11_BLEND_ONE,
DestBlendAlpha: D3D11_BLEND_INV_SRC_ALPHA,
BlendOpAlpha: D3D11_BLEND_OP_ADD,
RenderTargetWriteMask: D3D11_COLOR_WRITE_ENABLE_ALL.0 as u8,
};
let mut blend_invert = None;
device.CreateBlendState(&bdi, Some(&mut blend_invert))?;
let sd = D3D11_SAMPLER_DESC {
Filter: D3D11_FILTER_MIN_MAG_MIP_POINT,
AddressU: D3D11_TEXTURE_ADDRESS_CLAMP,
AddressV: D3D11_TEXTURE_ADDRESS_CLAMP,
AddressW: D3D11_TEXTURE_ADDRESS_CLAMP,
ComparisonFunc: D3D11_COMPARISON_NEVER,
MaxLOD: f32::MAX,
..Default::default()
};
let mut sampler = None;
device.CreateSamplerState(&sd, Some(&mut sampler))?;
Ok(Self {
vs: vs.context("vs")?,
ps: ps.context("ps")?,
cbuf: cbuf.context("cbuf")?,
blend: blend.context("blend")?,
blend_invert: blend_invert.context("blend_invert")?,
sampler: sampler.context("sampler")?,
tex_alpha: None,
tex_xor: None,
})
}
/// Upload one BGRA layer as an immutable shader-resource texture and return its SRV.
unsafe fn upload_layer(
device: &ID3D11Device,
bgra: &[u8],
w: u32,
h: u32,
) -> Result<ID3D11ShaderResourceView> {
let desc = D3D11_TEXTURE2D_DESC {
Width: w,
Height: h,
MipLevels: 1,
ArraySize: 1,
Format: DXGI_FORMAT_B8G8R8A8_UNORM,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
Usage: D3D11_USAGE_DEFAULT,
BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32,
..Default::default()
};
let init = D3D11_SUBRESOURCE_DATA {
pSysMem: bgra.as_ptr() as *const c_void,
SysMemPitch: w * 4,
SysMemSlicePitch: 0,
};
let mut tex: Option<ID3D11Texture2D> = None;
device.CreateTexture2D(&desc, Some(&init), Some(&mut tex))?;
let tex = tex.context("cursor tex")?;
let mut srv = None;
device.CreateShaderResourceView(&tex, None, Some(&mut srv))?;
srv.context("cursor srv")
}
/// (Re)upload the decomposed cursor layers; either layer may be absent (→ that pass is skipped).
unsafe fn set_shapes(&mut self, device: &ID3D11Device, shape: &CursorShape) -> Result<()> {
self.tex_alpha = match &shape.alpha {
Some(b) => Some((
Self::upload_layer(device, b, shape.w, shape.h)?,
shape.w,
shape.h,
)),
None => None,
};
self.tex_xor = match &shape.xor {
Some(b) => Some((
Self::upload_layer(device, b, shape.w, shape.h)?,
shape.w,
shape.h,
)),
None => None,
};
Ok(())
}
/// Blend ONE cursor layer onto `rtv` (a render-target view of the captured frame) at frame pixel
/// (cx,cy). `invert` selects the inversion blend (screen-inverting pixels); otherwise normal
/// src-over alpha. A shape with both an alpha and an XOR layer is drawn by calling this twice.
#[allow(clippy::too_many_arguments)]
unsafe fn draw_layer(
&self,
ctx: &ID3D11DeviceContext,
rtv: &ID3D11RenderTargetView,
fw: u32,
fh: u32,
cx: i32,
cy: i32,
srv: &ID3D11ShaderResourceView,
cw: u32,
ch: u32,
invert: bool,
// HDR (decode=true): sRGB→linear decode + scale the cursor to `white_mul` × 80 nits, so a
// white cursor hits HDR graphics white (~203 nits) not 80. SDR passes white_mul=1.0,
// decode=false → the PS returns the raw sample (blended in the display's native sRGB space).
// The inversion (masked-color / I-beam) blend operates on the framebuffer reference, so the
// caller passes white_mul=1.0/decode=false for the XOR layer even in HDR.
white_mul: f32,
decode: bool,
) {
let x0 = (cx as f32 / fw as f32) * 2.0 - 1.0;
let x1 = ((cx + cw as i32) as f32 / fw as f32) * 2.0 - 1.0;
let y0 = 1.0 - (cy as f32 / fh as f32) * 2.0;
let y1 = 1.0 - ((cy + ch as i32) as f32 / fh as f32) * 2.0;
let (mul, dec) = if invert {
(1.0_f32, 0.0_f32)
} else {
(white_mul, if decode { 1.0 } else { 0.0 })
};
// cbuf layout: [rect.x, rect.y, rect.z, rect.w, white_mul, decode, pad, pad] (32 bytes).
let cb = [x0, y0, x1, y1, mul, dec, 0.0, 0.0];
let mut mapped = D3D11_MAPPED_SUBRESOURCE::default();
if ctx
.Map(&self.cbuf, 0, D3D11_MAP_WRITE_DISCARD, 0, Some(&mut mapped))
.is_ok()
{
std::ptr::copy_nonoverlapping(cb.as_ptr(), mapped.pData as *mut f32, cb.len());
ctx.Unmap(&self.cbuf, 0);
}
let vp = D3D11_VIEWPORT {
TopLeftX: 0.0,
TopLeftY: 0.0,
Width: fw as f32,
Height: fh as f32,
MinDepth: 0.0,
MaxDepth: 1.0,
};
ctx.RSSetViewports(Some(&[vp]));
ctx.OMSetRenderTargets(Some(&[Some(rtv.clone())]), None);
let blend = if invert {
&self.blend_invert
} else {
&self.blend
};
ctx.OMSetBlendState(blend, Some(&[0.0; 4]), 0xffff_ffff);
ctx.VSSetShader(&self.vs, None);
ctx.PSSetShader(&self.ps, None);
ctx.VSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())]));
ctx.PSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())])); // white_mul/decode for the PS
ctx.PSSetShaderResources(0, Some(&[Some(srv.clone())]));
ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())]));
ctx.IASetInputLayout(None);
ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP);
ctx.Draw(4, 0);
// Unbind the render target so the next frame's CopyResource into this texture is unobstructed.
ctx.OMSetRenderTargets(Some(&[None]), None);
}
}
/// Fullscreen-triangle vertex shader for the HDR conversion pass (3 verts, no input layout).
///
/// Vertex ids 0, 1, 2 produce `uv` = (0,0), (2,0), (0,2), i.e. clip-space positions (-1,1), (3,1),
/// (-1,-3): one oversized triangle that covers the whole viewport, with `uv` running 0..1 across
/// the visible part. Shared by [`HdrConverter`] and [`HdrP010Converter`].
const HDR_VS: &str = r"
struct VOut { float4 pos : SV_POSITION; float2 uv : TEXCOORD0; };
VOut main(uint vid : SV_VertexID) {
float2 uv = float2((vid << 1) & 2, vid & 2);
VOut o;
o.pos = float4(uv * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
o.uv = uv;
return o;
}
";
/// HDR conversion pixel shader: scRGB FP16 desktop (linear, Rec.709 primaries, 1.0 = 80 nits) →
/// BT.2020 primaries → SMPTE ST 2084 (PQ) → written to a 10-bit R10G10B10A2 target for NVENC
/// (HEVC Main10 / HDR10). This is the standard Windows-HDR capture conversion (matches OBS/Sunshine).
///
/// Per pixel: negative scRGB components are clamped to 0, the value is scaled to nits (×80),
/// multiplied by the Rec.709→Rec.2020 matrix, normalized to 10000 nits and PQ-encoded (the PQ
/// input is saturated to [0,1]). Alpha is always written as 1.0.
const HDR_PS: &str = r"
Texture2D<float4> tx : register(t0);
SamplerState sm : register(s0);
// Rec.709 → Rec.2020 primaries (linear). Column-major rows as written, used with mul(M, v).
static const float3x3 BT709_TO_BT2020 = {
0.627403914, 0.329283038, 0.043313048,
0.069097292, 0.919540405, 0.011362303,
0.016391439, 0.088013308, 0.895595253
};
float3 pq_oetf(float3 L) {
// L normalized so 1.0 = 10000 nits. ST 2084.
const float m1 = 0.1593017578125;
const float m2 = 78.84375;
const float c1 = 0.8359375;
const float c2 = 18.8515625;
const float c3 = 18.6875;
float3 Lp = pow(saturate(L), m1);
return pow((c1 + c2 * Lp) / (1.0 + c3 * Lp), m2);
}
float4 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
float3 scrgb = max(tx.Sample(sm, uv).rgb, 0.0); // scRGB can be negative (wide gamut); clamp
float3 nits = scrgb * 80.0; // scRGB 1.0 = 80 nits → absolute luminance
float3 lin2020 = mul(BT709_TO_BT2020, nits); // primaries conversion (linear)
float3 pq = pq_oetf(lin2020 / 10000.0); // normalize to 10k nits, encode PQ
return float4(pq, 1.0);
}
";
/// scRGB FP16 → BT.2020 PQ 10-bit conversion pass. One per capture device (rebuilt on device
/// recreate, like [`CursorCompositor`]). A single fullscreen draw samples the FP16 source SRV and
/// writes PQ-encoded BT.2020 to the bound R10G10B10A2 render target.
pub(crate) struct HdrConverter {
/// Fullscreen-triangle vertex shader compiled from [`HDR_VS`].
vs: ID3D11VertexShader,
/// Conversion pixel shader compiled from [`HDR_PS`].
ps: ID3D11PixelShader,
/// Point-filtered, clamp-addressed sampler used to read the source texture.
sampler: ID3D11SamplerState,
}
impl HdrConverter {
    /// Compile the HDR conversion shaders and create the point/clamp sampler on `device`.
    ///
    /// # Errors
    /// Fails if either shader does not compile or a D3D11 object cannot be created.
    ///
    /// # Safety
    /// Issues raw D3D11 calls on `device`; the caller must pass a usable device.
    pub(crate) unsafe fn new(device: &ID3D11Device) -> Result<Self> {
        let vertex_bytecode = compile_shader(HDR_VS, s!("main"), s!("vs_5_0"))?;
        let pixel_bytecode = compile_shader(HDR_PS, s!("main"), s!("ps_5_0"))?;

        let mut vertex_shader = None;
        device.CreateVertexShader(&vertex_bytecode, None, Some(&mut vertex_shader))?;
        let mut pixel_shader = None;
        device.CreatePixelShader(&pixel_bytecode, None, Some(&mut pixel_shader))?;

        // 1:1 fullscreen pass: point filtering, clamp on every axis.
        let sampler_desc = D3D11_SAMPLER_DESC {
            Filter: D3D11_FILTER_MIN_MAG_MIP_POINT,
            AddressU: D3D11_TEXTURE_ADDRESS_CLAMP,
            AddressV: D3D11_TEXTURE_ADDRESS_CLAMP,
            AddressW: D3D11_TEXTURE_ADDRESS_CLAMP,
            ComparisonFunc: D3D11_COMPARISON_NEVER,
            MaxLOD: f32::MAX,
            ..Default::default()
        };
        let mut sampler_state = None;
        device.CreateSamplerState(&sampler_desc, Some(&mut sampler_state))?;

        let vs = vertex_shader.context("hdr vs")?;
        let ps = pixel_shader.context("hdr ps")?;
        let sampler = sampler_state.context("hdr sampler")?;
        Ok(Self { vs, ps, sampler })
    }

    /// Convert `src_srv` (FP16 scRGB) into `dst_rtv` (R10G10B10A2 PQ BT.2020). Opaque pass, no blend.
    ///
    /// `w`/`h` give the viewport size in pixels. The render target and the source SRV are unbound
    /// again before returning.
    ///
    /// # Safety
    /// Issues raw D3D11 calls on `ctx`; both views must belong to the same device as `self`.
    pub(crate) unsafe fn convert(
        &self,
        ctx: &ID3D11DeviceContext,
        src_srv: &ID3D11ShaderResourceView,
        dst_rtv: &ID3D11RenderTargetView,
        w: u32,
        h: u32,
    ) {
        let full_frame = D3D11_VIEWPORT {
            TopLeftX: 0.0,
            TopLeftY: 0.0,
            Width: w as f32,
            Height: h as f32,
            MinDepth: 0.0,
            MaxDepth: 1.0,
        };
        ctx.RSSetViewports(Some(&[full_frame]));

        let target = [Some(dst_rtv.clone())];
        ctx.OMSetRenderTargets(Some(&target), None);
        ctx.OMSetBlendState(None, None, 0xffff_ffff); // opaque overwrite

        ctx.VSSetShader(&self.vs, None);
        ctx.PSSetShader(&self.ps, None);
        let source = [Some(src_srv.clone())];
        ctx.PSSetShaderResources(0, Some(&source));
        let samplers = [Some(self.sampler.clone())];
        ctx.PSSetSamplers(0, Some(&samplers));

        // Vertices are synthesised from SV_VertexID: no input layout, one 3-vertex triangle.
        ctx.IASetInputLayout(None);
        ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
        ctx.Draw(3, 0);

        // Unbind so the next frame can CopyResource into the source and re-RTV the destination.
        ctx.OMSetRenderTargets(Some(&[None]), None);
        ctx.PSSetShaderResources(0, Some(&[None]));
    }
}
/// Whether `PUNKTFUNK_HDR_SHADER_P010` is truthy (`1`/`true`/`yes`/`on`, ASCII case-insensitive,
/// surrounding whitespace ignored). When set, the WGC HDR path
/// emits P010 (BT.2020 PQ, 10-bit limited range) DIRECTLY from a shader pass ([`HdrP010Converter`])
/// instead of tone-mapping to R10G10B10A2 and letting NVENC do the RGB→YUV CSC on the contended SM.
/// Default OFF → the current HDR path (R10→NVENC + the VideoProcessor attempt) is byte-for-byte
/// unchanged.
///
/// Returns `false` when the variable is unset, not valid Unicode, or holds any other value.
pub(crate) fn hdr_shader_p010_enabled() -> bool {
    // Accept any letter case: `TRUE` / `On` silently leaving the feature disabled is a trap.
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    std::env::var("PUNKTFUNK_HDR_SHADER_P010")
        .map(|raw| {
            let value = raw.trim();
            TRUTHY.iter().any(|t| value.eq_ignore_ascii_case(t))
        })
        .unwrap_or(false)
}
/// Shared HLSL prelude for the two P010 pixel shaders ([`HDR_P010_Y_PS`] and [`HDR_P010_UV_PS`]).
/// It is not compiled on its own: [`HdrP010Converter::new`] substitutes it for the
/// `#include_common` marker in each pixel-shader source before compiling.
///
/// Declares the source texture/sampler and provides: the Rec.709→Rec.2020 matrix, the ST 2084 PQ
/// encode, `scrgb_to_pq2020` (scRGB FP16 desktop — linear, Rec.709 primaries, 1.0 = 80 nits —
/// → BT.2020 PQ RGB), the BT.2020 non-constant-luminance limited-range quantizers
/// (`studio_y_code` → [64, 940], `studio_cbcr_code` → [64, 960]) and `code10_to_unorm`, which maps
/// a 10-bit code to the UNORM value whose stored 16-bit sample carries that code in its high 10
/// bits. The colour pipeline (scRGB→nits→BT.2020-linear→PQ) is IDENTICAL to [`HDR_PS`]; only the
/// final RGB→YCbCr + studio-range quantization differs.
///
/// NOTE(review): the codes returned by `studio_y_code` / `studio_cbcr_code` are not rounded to
/// integers before `code10_to_unorm` scales them by 64, so the stored sample's low 6 bits can be
/// non-zero — confirm the consumer only reads the high 10 bits.
const HDR_P010_COMMON: &str = r"
Texture2D<float4> tx : register(t0);
SamplerState sm : register(s0);
// Rec.709 → Rec.2020 primaries (linear). Same matrix as the R10 HdrConverter (mul(M, v)).
static const float3x3 BT709_TO_BT2020 = {
0.627403914, 0.329283038, 0.043313048,
0.069097292, 0.919540405, 0.011362303,
0.016391439, 0.088013308, 0.895595253
};
float3 pq_oetf(float3 L) {
// L normalized so 1.0 = 10000 nits. ST 2084. (Identical to HdrConverter.)
const float m1 = 0.1593017578125;
const float m2 = 78.84375;
const float c1 = 0.8359375;
const float c2 = 18.8515625;
const float c3 = 18.6875;
float3 Lp = pow(saturate(L), m1);
return pow((c1 + c2 * Lp) / (1.0 + c3 * Lp), m2);
}
// scRGB FP16 sample -> PQ-encoded BT.2020 RGB in [0,1] (the SAME pixels the R10 path would store,
// before quantization). Used by both the luma and chroma passes so they agree bit-for-bit with the
// existing HdrConverter colour math + the Rust reference.
float3 scrgb_to_pq2020(float2 uv) {
float3 scrgb = max(tx.Sample(sm, uv).rgb, 0.0); // scRGB can be negative (wide gamut); clamp
float3 nits = scrgb * 80.0; // scRGB 1.0 = 80 nits
float3 lin2020 = mul(BT709_TO_BT2020, nits); // primaries conversion (linear)
return pq_oetf(lin2020 / 10000.0); // normalize to 10k nits, encode PQ -> [0,1]
}
// BT.2020 non-constant-luminance, on the PQ-encoded (gamma) RGB. Kr/Kg/Kb per Rec.2020.
static const float KR = 0.2627;
static const float KG = 0.6780;
static const float KB = 0.0593;
// 10-bit studio (limited) range codes. Y' -> [64, 940]; Cb/Cr -> [64, 960] (512 ± 448).
float studio_y_code(float3 rgb_pq) {
float y = KR * rgb_pq.r + KG * rgb_pq.g + KB * rgb_pq.b; // [0,1]
float code = 64.0 + 876.0 * y; // [64, 940]
return clamp(code, 64.0, 940.0);
}
float2 studio_cbcr_code(float3 rgb_pq) {
float y = KR * rgb_pq.r + KG * rgb_pq.g + KB * rgb_pq.b;
float cb = (rgb_pq.b - y) / 1.8814; // ~[-0.5, 0.5]
float cr = (rgb_pq.r - y) / 1.4746;
float cbc = 512.0 + 896.0 * cb; // [64, 960]
float crc = 512.0 + 896.0 * cr;
return float2(clamp(cbc, 64.0, 960.0), clamp(crc, 64.0, 960.0));
}
// P010 stores the 10-bit code in the HIGH 10 bits of each 16-bit sample (code10 << 6). As an
// R16_UNORM / R16G16_UNORM render target the UNORM float that maps to that stored u16 is
// code10*64 / 65535.0. (Verified in hdr_p010_selftest against the readback.)
float code10_to_unorm(float code10) { return (code10 * 64.0) / 65535.0; }
";
/// P010 LUMA pass PS — full-res, writes Y to plane 0 (R16_UNORM RTV).
///
/// Template source: the `#include_common` marker is replaced with [`HDR_P010_COMMON`] by
/// [`HdrP010Converter::new`] before compilation. For each pixel it converts the source sample to
/// BT.2020 PQ RGB, forms the limited-range 10-bit Y code and returns it as the single UNORM channel.
const HDR_P010_Y_PS: &str = r"
#include_common
float main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
float3 pq = scrgb_to_pq2020(uv);
float yc = studio_y_code(pq);
return code10_to_unorm(yc);
}
";
/// P010 CHROMA pass PS — half-res, writes interleaved (Cb,Cr) to plane 1 (R16G16_UNORM RTV). Averages
/// the 2x2 scRGB source footprint of this chroma sample (box filter) IN scRGB-linear space before the
/// PQ encode, then forms Cb/Cr from the averaged-then-PQ-encoded RGB. `inv_src` = (1/srcW, 1/srcH).
///
/// Template source: the `#include_common` marker is replaced with [`HDR_P010_COMMON`] by
/// [`HdrP010Converter::new`] before compilation. `inv_src` is read from constant buffer `b0`,
/// which [`HdrP010Converter::convert`] fills with the reciprocal of the full luma width/height.
/// Each of the four taps is clamped to non-negative before averaging.
const HDR_P010_UV_PS: &str = r"
#include_common
cbuffer C : register(b0) { float2 inv_src; float2 pad; };
float2 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
// `uv` is the chroma-sample centre in [0,1]; the 4 co-sited luma texels sit at uv ± half a luma
// texel in each axis. Average their scRGB (linear) values, then run the SAME PQ/CSC as the Y pass.
float2 h = inv_src * 0.5;
float3 a = max(tx.Sample(sm, uv + float2(-h.x, -h.y)).rgb, 0.0);
float3 b = max(tx.Sample(sm, uv + float2( h.x, -h.y)).rgb, 0.0);
float3 c = max(tx.Sample(sm, uv + float2(-h.x, h.y)).rgb, 0.0);
float3 d = max(tx.Sample(sm, uv + float2( h.x, h.y)).rgb, 0.0);
float3 scrgb = (a + b + c + d) * 0.25;
float3 nits = scrgb * 80.0;
float3 lin2020 = mul(BT709_TO_BT2020, nits);
float3 pq = pq_oetf(lin2020 / 10000.0);
float2 cc = studio_cbcr_code(pq);
return float2(code10_to_unorm(cc.x), code10_to_unorm(cc.y));
}
";
/// scRGB FP16 → **P010** (BT.2020 PQ, 10-bit limited/studio range) conversion, in OUR OWN shader (two
/// passes: full-res luma + half-res chroma). NVIDIA's D3D11 VideoProcessor cannot do RGB→P010 (renders
/// green), so we quantize to studio-range 10-bit YUV directly and feed NVENC native P010 — skipping
/// NVENC's internal RGB→YUV CSC (which runs on the contended SM). One per capture device (rebuilt on
/// device recreate, like [`HdrConverter`]).
///
/// Plane writes use per-plane render-target views of the single P010 texture: an `R16_UNORM` RTV
/// selects plane 0 (luma, full WxH), an `R16G16_UNORM` RTV selects plane 1 (chroma, W/2 x H/2). This
/// planar-RTV mechanism needs a D3D11.3+ runtime + driver support; [`HdrP010Converter::convert`]
/// surfaces a clear error if `CreateRenderTargetView` rejects the plane format so the caller can fall
/// back to the existing R10 path.
pub(crate) struct HdrP010Converter {
/// Fullscreen-triangle vertex shader compiled from [`HDR_VS`]; shared by both passes.
vs: ID3D11VertexShader,
/// Luma pixel shader ([`HDR_P010_Y_PS`] with the common prelude inlined).
ps_y: ID3D11PixelShader,
/// Chroma pixel shader ([`HDR_P010_UV_PS`] with the common prelude inlined).
ps_uv: ID3D11PixelShader,
/// Point-filtered, clamp-addressed sampler used to read the FP16 source.
sampler: ID3D11SamplerState,
/// Constant buffer for the chroma pass (inv_src texel size). 16 bytes.
cbuf: ID3D11Buffer,
}
impl HdrP010Converter {
pub(crate) unsafe fn new(device: &ID3D11Device) -> Result<Self> {
// Inline the shared HLSL (D3DCompile has no include handler wired here). The two PS sources
// carry a `#include_common` marker we substitute before compiling.
let y_src = HDR_P010_Y_PS.replace("#include_common", HDR_P010_COMMON);
let uv_src = HDR_P010_UV_PS.replace("#include_common", HDR_P010_COMMON);
let vsb = compile_shader(HDR_VS, s!("main"), s!("vs_5_0"))?;
let yb = compile_shader(&y_src, s!("main"), s!("ps_5_0"))?;
let uvb = compile_shader(&uv_src, s!("main"), s!("ps_5_0"))?;
let mut vs = None;
device.CreateVertexShader(&vsb, None, Some(&mut vs))?;
let mut ps_y = None;
device.CreatePixelShader(&yb, None, Some(&mut ps_y))?;
let mut ps_uv = None;
device.CreatePixelShader(&uvb, None, Some(&mut ps_uv))?;
let sd = D3D11_SAMPLER_DESC {
// POINT: the Y pass samples a single texel centre exactly, and the UV pass does its OWN
// 2x2 box average via 4 explicit taps at texel centres (offset half a texel). Point
// sampling keeps each tap exact; the averaging is in the shader, not the sampler.
Filter: D3D11_FILTER_MIN_MAG_MIP_POINT,
AddressU: D3D11_TEXTURE_ADDRESS_CLAMP,
AddressV: D3D11_TEXTURE_ADDRESS_CLAMP,
AddressW: D3D11_TEXTURE_ADDRESS_CLAMP,
ComparisonFunc: D3D11_COMPARISON_NEVER,
MaxLOD: f32::MAX,
..Default::default()
};
let mut sampler = None;
device.CreateSamplerState(&sd, Some(&mut sampler))?;
let cbd = D3D11_BUFFER_DESC {
ByteWidth: 16, // float2 inv_src + float2 pad
Usage: D3D11_USAGE_DYNAMIC,
BindFlags: D3D11_BIND_CONSTANT_BUFFER.0 as u32,
CPUAccessFlags: D3D11_CPU_ACCESS_WRITE.0 as u32,
..Default::default()
};
let mut cbuf = None;
device.CreateBuffer(&cbd, None, Some(&mut cbuf))?;
Ok(Self {
vs: vs.context("p010 vs")?,
ps_y: ps_y.context("p010 y ps")?,
ps_uv: ps_uv.context("p010 uv ps")?,
sampler: sampler.context("p010 sampler")?,
cbuf: cbuf.context("p010 cbuf")?,
})
}
/// Create a per-plane RTV of the P010 texture `dst` with the given single-plane `format`
/// (`R16_UNORM` for plane 0 luma, `R16G16_UNORM` for plane 1 chroma). The plane is selected by the
/// view format (planar-RTV semantics); MipSlice 0.
unsafe fn plane_rtv(
device: &ID3D11Device,
dst: &ID3D11Texture2D,
format: DXGI_FORMAT,
) -> Result<ID3D11RenderTargetView> {
let desc = D3D11_RENDER_TARGET_VIEW_DESC {
Format: format,
ViewDimension: D3D11_RTV_DIMENSION_TEXTURE2D,
Anonymous: D3D11_RENDER_TARGET_VIEW_DESC_0 {
Texture2D: D3D11_TEX2D_RTV { MipSlice: 0 },
},
};
let mut rtv: Option<ID3D11RenderTargetView> = None;
device
.CreateRenderTargetView(
dst,
Some(&desc as *const D3D11_RENDER_TARGET_VIEW_DESC),
Some(&mut rtv),
)
.with_context(|| {
format!("CreateRenderTargetView(P010 plane, format={format:?}) — driver may not support planar RTVs")
})?;
rtv.context("p010 plane rtv null")
}
/// Convert `src_srv` (FP16 scRGB, WxH) into `dst` (a `DXGI_FORMAT_P010` texture with
/// `BIND_RENDER_TARGET`). Two opaque passes: full-res luma → plane 0, half-res chroma → plane 1.
/// `w`/`h` are the full luma dimensions (must be even). Returns `Err` if a plane RTV can't be
/// created (driver) so the caller can fall back to the R10 path.
pub(crate) unsafe fn convert(
&self,
device: &ID3D11Device,
ctx: &ID3D11DeviceContext,
src_srv: &ID3D11ShaderResourceView,
dst: &ID3D11Texture2D,
w: u32,
h: u32,
) -> Result<()> {
let y_rtv = Self::plane_rtv(device, dst, DXGI_FORMAT_R16_UNORM)?;
let uv_rtv = Self::plane_rtv(device, dst, DXGI_FORMAT_R16G16_UNORM)?;
// Update the chroma constant buffer (inverse source texel size).
let cb: [f32; 4] = [1.0 / w as f32, 1.0 / h as f32, 0.0, 0.0];
let mut mapped = D3D11_MAPPED_SUBRESOURCE::default();
if ctx
.Map(&self.cbuf, 0, D3D11_MAP_WRITE_DISCARD, 0, Some(&mut mapped))
.is_ok()
{
std::ptr::copy_nonoverlapping(cb.as_ptr(), mapped.pData as *mut f32, cb.len());
ctx.Unmap(&self.cbuf, 0);
}
// Shared pipeline state.
ctx.OMSetBlendState(None, None, 0xffff_ffff); // opaque overwrite
ctx.VSSetShader(&self.vs, None);
ctx.PSSetShaderResources(0, Some(&[Some(src_srv.clone())]));
ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())]));
ctx.IASetInputLayout(None);
ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
// --- LUMA pass: full-res, plane 0 ---
let vp_y = D3D11_VIEWPORT {
TopLeftX: 0.0,
TopLeftY: 0.0,
Width: w as f32,
Height: h as f32,
MinDepth: 0.0,
MaxDepth: 1.0,
};
ctx.RSSetViewports(Some(&[vp_y]));
ctx.OMSetRenderTargets(Some(&[Some(y_rtv.clone())]), None);
ctx.PSSetShader(&self.ps_y, None);
ctx.Draw(3, 0);
ctx.OMSetRenderTargets(Some(&[None]), None);
// --- CHROMA pass: half-res, plane 1 ---
let vp_uv = D3D11_VIEWPORT {
TopLeftX: 0.0,
TopLeftY: 0.0,
Width: (w / 2) as f32,
Height: (h / 2) as f32,
MinDepth: 0.0,
MaxDepth: 1.0,
};
ctx.RSSetViewports(Some(&[vp_uv]));
ctx.OMSetRenderTargets(Some(&[Some(uv_rtv.clone())]), None);
ctx.PSSetShader(&self.ps_uv, None);
ctx.PSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())]));
ctx.Draw(3, 0);
// Unbind for the next frame's re-RTV / NVENC read.
ctx.OMSetRenderTargets(Some(&[None]), None);
ctx.PSSetShaderResources(0, Some(&[None]));
Ok(())
}
}
/// Double-precision model of the P010 shader colour math in [`HDR_P010_COMMON`], used by
/// [`hdr_p010_selftest`] as the ground truth.
///
/// Takes one scRGB pixel (`r`, `g`, `b`: linear, Rec.709 primaries, 1.0 = 80 nits, values above 1
/// are HDR, negatives are clamped to 0) and returns the unrounded 10-bit studio-range codes
/// `(Y, Cb, Cr)` the shader should emit for a flat block of that colour: Y in [64, 940], Cb/Cr in
/// [64, 960].
#[cfg(target_os = "windows")]
fn p010_reference(r: f64, g: f64, b: f64) -> (f64, f64, f64) {
    /// SMPTE ST 2084 (PQ) encode; `l` is luminance normalized so 1.0 = 10000 nits.
    fn pq_oetf(l: f64) -> f64 {
        const M1: f64 = 0.1593017578125;
        const M2: f64 = 78.84375;
        const C1: f64 = 0.8359375;
        const C2: f64 = 18.8515625;
        const C3: f64 = 18.6875;
        let lp = l.clamp(0.0, 1.0).powf(M1);
        ((C1 + C2 * lp) / (1.0 + C3 * lp)).powf(M2)
    }

    // Rec.709 -> Rec.2020 primaries, one output channel per row (applied as M * v).
    const BT709_TO_BT2020: [[f64; 3]; 3] = [
        [0.627403914, 0.329283038, 0.043313048],
        [0.069097292, 0.919540405, 0.011362303],
        [0.016391439, 0.088013308, 0.895595253],
    ];
    // BT.2020 non-constant-luminance luma weights.
    const KR: f64 = 0.2627;
    const KG: f64 = 0.6780;
    const KB: f64 = 0.0593;

    // scRGB -> absolute nits (negatives clamped away first).
    let nits = [r.max(0.0) * 80.0, g.max(0.0) * 80.0, b.max(0.0) * 80.0];
    // One BT.2020 channel: matrix row · nits, normalized to 10k nits, then PQ-encoded.
    let encode = |row: &[f64; 3]| -> f64 {
        let linear = row[0] * nits[0] + row[1] * nits[1] + row[2] * nits[2];
        pq_oetf(linear / 10000.0)
    };
    let pq_r = encode(&BT709_TO_BT2020[0]);
    let pq_g = encode(&BT709_TO_BT2020[1]);
    let pq_b = encode(&BT709_TO_BT2020[2]);

    // Luma and colour differences on the PQ-encoded signal.
    let luma = KR * pq_r + KG * pq_g + KB * pq_b;
    let diff_b = (pq_b - luma) / 1.8814;
    let diff_r = (pq_r - luma) / 1.4746;

    // Limited ("studio") 10-bit quantization ranges.
    (
        (64.0 + 876.0 * luma).clamp(64.0, 940.0),
        (512.0 + 896.0 * diff_b).clamp(64.0, 960.0),
        (512.0 + 896.0 * diff_r).clamp(64.0, 960.0),
    )
}
/// Colour self-test for [`HdrP010Converter`] (the `hdr-p010-selftest` subcommand): create a hardware
/// D3D11 device, upload a known scRGB FP16 pattern, run the P010 shader passes, read the Y (plane 0)
/// and UV (plane 1) planes back from a staging copy, and compare against the [`p010_reference`] f64
/// math. The ONLY validation we have without green-screening a live HDR stream. PASS if max abs error
/// Y ≤ 4 codes, U/V ≤ 5 codes (rounding + chroma averaging). Prints a per-colour table + PASS/FAIL.
#[cfg(target_os = "windows")]
pub fn hdr_p010_selftest() -> Result<()> {
use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE;
use windows::Win32::Graphics::Dxgi::IDXGIAdapter;
// 64x64, even dims. A 4x4 grid of 16x16 flat scRGB blocks (each 2x2 chroma footprint uniform →
// exact chroma comparison) covering pure R/G/B/white/black/gray at plausible HDR nit levels, plus
// a couple of bright (>1.0 scRGB) colours, then the rest is a gradient (compared on Y only).
const W: u32 = 64;
const H: u32 = 64;
const BLK: u32 = 16;
// (name, r, g, b) scRGB linear (1.0 = 80 nits). Mix of SDR-ish and HDR (>1.0) values.
let named: [(&str, f32, f32, f32); 8] = [
("red1.0", 1.0, 0.0, 0.0),
("green0.5", 0.0, 0.5, 0.0),
("blue4.0", 0.0, 0.0, 4.0),
("white1.0", 1.0, 1.0, 1.0),
("black", 0.0, 0.0, 0.0),
("gray0.5", 0.5, 0.5, 0.5),
("white4.0", 4.0, 4.0, 4.0),
("amber2.0", 2.0, 1.0, 0.0),
];
let grid_cols = W / BLK; // 4
let pixel_rgb = |x: u32, y: u32| -> (f32, f32, f32, bool) {
let idx = ((y / BLK) * grid_cols + (x / BLK)) as usize;
if idx < named.len() {
let (_, r, g, b) = named[idx];
(r, g, b, true)
} else {
// Gradient (distinct per pixel; Y-only compare), within HDR scRGB range.
let r = (x as f32 / W as f32) * 3.0;
let g = (y as f32 / H as f32) * 3.0;
let b = ((x + y) as f32 / (W + H) as f32) * 3.0;
(r, g, b, false)
}
};
// Build the scRGB FP16 (R16G16B16A16_FLOAT) source as f16 bits.
let mut fp16 = vec![0u16; (W * H * 4) as usize];
let mut flat = vec![false; (W * H) as usize];
for y in 0..H {
for x in 0..W {
let (r, g, b, is_flat) = pixel_rgb(x, y);
let i = ((y * W + x) * 4) as usize;
fp16[i] = f32_to_f16(r);
fp16[i + 1] = f32_to_f16(g);
fp16[i + 2] = f32_to_f16(b);
fp16[i + 3] = f32_to_f16(1.0);
flat[(y * W + x) as usize] = is_flat;
}
}
// SAFETY: this self-test creates its own D3D11 device + immediate context (`D3D11CreateDevice`,
// both checked non-null) and uses ONLY that device for the rest of the block: every
// `CreateTexture2D`/`CreateShaderResourceView`/`HdrP010Converter::{new,convert}`/`CopyResource`/
// `Map` is invoked on that device or its context, so all resources share one device and run on this
// single thread. The source texture's `D3D11_SUBRESOURCE_DATA` points at `fp16`, a live
// `Vec<u16>` of `W*H*4` samples with `SysMemPitch = W*8`, matching the W×H R16G16B16A16 texture;
// `fp16` outlives the synchronous `CreateTexture2D` that reads it. The mapped-pointer reads are
// proven individually at the `read_u16` closure below.
unsafe {
// Hardware D3D11 device (no adapter pin — the default GPU is fine for the self-test).
let mut device: Option<ID3D11Device> = None;
let mut context: Option<ID3D11DeviceContext> = None;
D3D11CreateDevice(
None::<&IDXGIAdapter>,
D3D_DRIVER_TYPE_HARDWARE,
HMODULE::default(),
D3D11_CREATE_DEVICE_BGRA_SUPPORT,
Some(&[D3D_FEATURE_LEVEL_11_0]),
D3D11_SDK_VERSION,
Some(&mut device),
None,
Some(&mut context),
)
.context("D3D11CreateDevice(hardware) for hdr-p010-selftest")?;
let device = device.context("null device")?;
let context = context.context("null context")?;
// Source FP16 texture (initialized) + SRV.
let src_desc = D3D11_TEXTURE2D_DESC {
Width: W,
Height: H,
MipLevels: 1,
ArraySize: 1,
Format: DXGI_FORMAT_R16G16B16A16_FLOAT,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
Usage: D3D11_USAGE_DEFAULT,
BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32,
..Default::default()
};
let init = D3D11_SUBRESOURCE_DATA {
pSysMem: fp16.as_ptr() as *const c_void,
SysMemPitch: W * 8, // 4 channels * 2 bytes
SysMemSlicePitch: 0,
};
let mut src_tex: Option<ID3D11Texture2D> = None;
device
.CreateTexture2D(&src_desc, Some(&init), Some(&mut src_tex))
.context("CreateTexture2D(fp16 src)")?;
let src_tex = src_tex.context("null src tex")?;
let mut src_srv: Option<ID3D11ShaderResourceView> = None;
device
.CreateShaderResourceView(&src_tex, None, Some(&mut src_srv))
.context("CreateShaderResourceView(fp16 src)")?;
let src_srv = src_srv.context("null src srv")?;
// P010 destination texture (render-target bindable).
let p010_desc = D3D11_TEXTURE2D_DESC {
Width: W,
Height: H,
MipLevels: 1,
ArraySize: 1,
Format: DXGI_FORMAT_P010,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
Usage: D3D11_USAGE_DEFAULT,
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
..Default::default()
};
let mut p010: Option<ID3D11Texture2D> = None;
device
.CreateTexture2D(&p010_desc, None, Some(&mut p010))
.context("CreateTexture2D(P010 dst)")?;
let p010 = p010.context("null p010 tex")?;
let conv = HdrP010Converter::new(&device)?;
conv.convert(&device, &context, &src_srv, &p010, W, H)?;
// Staging copy of the whole P010 texture (both planes), MAP_READ.
let stage_desc = D3D11_TEXTURE2D_DESC {
Width: W,
Height: H,
MipLevels: 1,
ArraySize: 1,
Format: DXGI_FORMAT_P010,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
Usage: D3D11_USAGE_STAGING,
BindFlags: 0,
CPUAccessFlags: D3D11_CPU_ACCESS_READ.0 as u32,
..Default::default()
};
let mut staging: Option<ID3D11Texture2D> = None;
device
.CreateTexture2D(&stage_desc, None, Some(&mut staging))
.context("CreateTexture2D(P010 staging)")?;
let staging = staging.context("null staging")?;
context.CopyResource(&staging, &p010);
let mut map = D3D11_MAPPED_SUBRESOURCE::default();
context
.Map(&staging, 0, D3D11_MAP_READ, 0, Some(&mut map))
.context("Map(P010 staging)")?;
let row_pitch = map.RowPitch as usize; // bytes per luma row (in 16-bit samples: /2)
let base = map.pData as *const u8;
// DIAGNOSTIC (the uncertain layout spot — verify on the box if chroma is wrong): the mapped
// P010 plane offsets. Plane 0 (luma): H rows of W u16. Plane 1 (chroma): H/2 rows of W/2
// *interleaved* (Cb,Cr) u16 pairs. P010 packs plane 1 after plane 0 at the SAME row pitch; the
// chroma plane begins at byte offset RowPitch * (luma height). For a STAGING texture that
// height is the created H (no inter-plane alignment). DepthPitch (total mapped size) lets us
// sanity-check: it should be ~ RowPitch * H * 3/2. If chroma reads garbage on the box, print
// these and adjust `chroma_base` (e.g. an aligned luma height).
tracing::info!(
row_pitch,
depth_pitch = map.DepthPitch,
expected_chroma_base = row_pitch * H as usize,
expected_total = row_pitch * H as usize * 3 / 2,
"hdr-p010-selftest: mapped P010 layout (verify chroma plane offset here if chroma is wrong)"
);
// Plane 0 (luma): H rows of W u16. Plane 1 (chroma): H/2 rows of W/2 *interleaved* (Cb,Cr)
// u16 pairs, i.e. W u16 per chroma row. P010 packs plane 1 immediately after plane 0 at the
// SAME row pitch; per spec the chroma plane begins at an allocation offset of
// RowPitch * Height (luma rows). We read it from there. (DepthPitch is the full surface size;
// not all drivers report the chroma offset, so RowPitch*Height is the portable choice.)
let read_u16 = |byte_off: usize| -> u16 {
// SAFETY: `base` is the mapped staging pointer; all offsets are within the P010 surface
// (luma H*RowPitch + chroma (H/2)*RowPitch ≤ DepthPitch). Already in the fn's unsafe scope.
let p = base.add(byte_off) as *const u16;
p.read_unaligned()
};
// Luma codes: stored u16 in the high 10 bits -> code10 = stored >> 6.
let mut y_codes = vec![0u16; (W * H) as usize];
for y in 0..H {
for x in 0..W {
let off = (y as usize) * row_pitch + (x as usize) * 2;
y_codes[(y * W + x) as usize] = read_u16(off) >> 6;
}
}
let cw = W / 2;
let ch = H / 2;
let chroma_base = row_pitch * H as usize; // plane 1 offset
let mut cb_codes = vec![0u16; (cw * ch) as usize];
let mut cr_codes = vec![0u16; (cw * ch) as usize];
for cy in 0..ch {
for cx in 0..cw {
// Interleaved (Cb, Cr) per chroma sample → 2 u16 = 4 bytes per sample.
let off = chroma_base + (cy as usize) * row_pitch + (cx as usize) * 4;
cb_codes[(cy * cw + cx) as usize] = read_u16(off) >> 6;
cr_codes[(cy * cw + cx) as usize] = read_u16(off + 2) >> 6;
}
}
context.Unmap(&staging, 0);
// Compare Y over every pixel.
let mut max_y_err = 0.0f64;
for y in 0..H {
for x in 0..W {
let (r, g, b, _) = pixel_rgb(x, y);
let (ry, _, _) = p010_reference(r as f64, g as f64, b as f64);
let got = y_codes[(y * W + x) as usize] as f64;
max_y_err = max_y_err.max((got - ry).abs());
}
}
// Compare Cb/Cr over flat blocks only (uniform 2x2 footprint → exact reference).
let mut max_u_err = 0.0f64;
let mut max_v_err = 0.0f64;
for cy in 0..ch {
for cx in 0..cw {
let (sx, sy) = (cx * 2, cy * 2);
let all_flat =
(0..2).all(|dy| (0..2).all(|dx| flat[((sy + dy) * W + (sx + dx)) as usize]));
if !all_flat {
continue;
}
let (r, g, b, _) = pixel_rgb(sx, sy);
let (_, rcb, rcr) = p010_reference(r as f64, g as f64, b as f64);
let gu = cb_codes[(cy * cw + cx) as usize] as f64;
let gv = cr_codes[(cy * cw + cx) as usize] as f64;
max_u_err = max_u_err.max((gu - rcb).abs());
max_v_err = max_v_err.max((gv - rcr).abs());
}
}
// Per-colour table.
println!("HDR P010 self-test ({W}x{H}, BT.2020 PQ, 10-bit limited range)");
println!(
" {:<10} {:>14} {:>14} {:>14}",
"color", "Y exp/got", "Cb exp/got", "Cr exp/got"
);
for (idx, (name, r, g, b)) in named.iter().enumerate() {
let bx = (idx as u32 % grid_cols) * BLK + BLK / 2;
let by = (idx as u32 / grid_cols) * BLK + BLK / 2;
let (ey, ecb, ecr) = p010_reference(*r as f64, *g as f64, *b as f64);
let gy = y_codes[(by * W + bx) as usize] as f64;
let (ccx, ccy) = (bx / 2, by / 2);
let gu = cb_codes[(ccy * cw + ccx) as usize] as f64;
let gv = cr_codes[(ccy * cw + ccx) as usize] as f64;
println!(
" {:<10} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0}",
name, ey, gy, ecb, gu, ecr, gv
);
}
println!(
" max abs error: Y={max_y_err:.2} (≤4) Cb={max_u_err:.2} (≤5) Cr={max_v_err:.2} (≤5)"
);
if max_y_err <= 4.0 && max_u_err <= 5.0 && max_v_err <= 5.0 {
println!("PASS");
Ok(())
} else {
println!("FAIL");
bail!(
"HDR P010 self-test FAILED (Y={max_y_err:.2} Cb={max_u_err:.2} Cr={max_v_err:.2})"
);
}
}
}
/// Minimal f32 → IEEE-754 half (f16) bit pattern, for uploading the FP16 scRGB self-test pattern. Not
/// on any hot path; handles normals, subnormals, and the 1.0/0.0 constants we feed. (round-to-nearest)
#[cfg(target_os = "windows")]
fn f32_to_f16(v: f32) -> u16 {
    // Bit-level conversion: split the f32 into sign / biased exponent / 23-bit mantissa, re-bias the
    // exponent for half precision (bias 15) and round on the first discarded mantissa bit
    // (round-half-up in magnitude).
    let bits = v.to_bits();
    let sign = ((bits >> 16) & 0x8000) as u16;
    let raw_exp = (bits >> 23) & 0xff;
    let mant = bits & 0x007f_ffff;
    if raw_exp == 0xff {
        // Inf stays Inf; NaN stays a quiet NaN instead of silently collapsing to Inf.
        return if mant == 0 {
            sign | 0x7c00
        } else {
            sign | 0x7e00
        };
    }
    let exp = raw_exp as i32 - 127 + 15;
    if exp <= 0 {
        // Subnormal / zero in half precision.
        if exp < -10 {
            return sign; // too small → ±0
        }
        let mant = mant | 0x0080_0000; // implicit 1
        let shift = (14 - exp) as u32;
        let half_mant = (mant >> shift) as u16;
        // Round to nearest. A carry out of the 10-bit mantissa lands in exponent bit 0, which is
        // exactly the smallest normal number, so plain addition is correct here.
        let round = ((mant >> (shift - 1)) & 1) as u16;
        sign | (half_mant + round)
    } else if exp >= 0x1f {
        sign | 0x7c00 // finite but beyond the half range → Inf
    } else {
        // Assemble exponent+mantissa FIRST, then ADD the rounding bit: a mantissa of 0x3ff that
        // rounds up must carry into the exponent field. OR-ing the rounded mantissa into the
        // exponent drops that carry whenever exponent bit 10 is already set (e.g. 1.9999 → 1.0).
        // The sum tops out at 0x7bff + 1 = 0x7c00 (Inf), so it never reaches the sign bit.
        let half = ((exp as u16) << 10) | (mant >> 13) as u16;
        let round = ((mant >> 12) & 1) as u16;
        sign | (half + round)
    }
}
use windows::Win32::Graphics::Direct3D11::{
ID3D11VideoContext1, ID3D11VideoDevice, ID3D11VideoProcessor, ID3D11VideoProcessorEnumerator,
ID3D11VideoProcessorInputView, ID3D11VideoProcessorOutputView, D3D11_TEX2D_VPIV,
D3D11_TEX2D_VPOV, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE, D3D11_VIDEO_PROCESSOR_CONTENT_DESC,
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0,
D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0,
D3D11_VIDEO_PROCESSOR_STREAM, D3D11_VIDEO_USAGE_PLAYBACK_NORMAL,
D3D11_VPIV_DIMENSION_TEXTURE2D, D3D11_VPOV_DIMENSION_TEXTURE2D,
};
use windows::Win32::Graphics::Dxgi::Common::{
DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020, DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709,
DXGI_RATIONAL,
};
/// D3D11 **Video Processor** colour/format converter — runs on the GPU's dedicated VIDEO engine, NOT
/// the 3D engine, so the per-frame RGB→YUV conversion does not contend with a GPU-saturating game (the
/// HDR pixel-shader path and NVENC's internal RGB→YUV both use the 3D/compute engine, which an AAA
/// title pins at ~100%). Output is NV12 (SDR, BT.709 studio-range) or P010 (HDR, BT.2020 PQ
/// studio-range) — NVENC's native YUV inputs, so it encodes them with no further conversion.
pub(crate) struct VideoConverter {
/// Video-device interface cast from the D3D11 device; used to create the per-call input/output views.
vdev: ID3D11VideoDevice,
/// Video-context interface cast from the D3D11 context; sets the colour spaces and issues the Blt.
vctx: ID3D11VideoContext1,
/// Enumerator created for the fixed width×height content description; required when creating views.
enumr: ID3D11VideoProcessorEnumerator,
/// The video processor created from `enumr` (index 0) that performs the conversion.
vp: ID3D11VideoProcessor,
}
impl VideoConverter {
    /// Build a video-processor converter for `width`×`height` progressive frames.
    ///
    /// `hdr` selects the colour-space pair programmed into the processor: scRGB linear (G10) →
    /// BT.2020 PQ studio-range YCbCr when `true`, sRGB (G22) → BT.709 studio-range YCbCr when
    /// `false`. Input and output share the same size and a nominal 240/1 frame rate.
    ///
    /// # Errors
    /// Fails when `device`/`context` cannot be cast to the video interfaces, or when the
    /// enumerator or the video processor cannot be created.
    ///
    /// # Safety
    /// Calls D3D11 COM methods directly. The caller must pass a live `device` together with a
    /// `context` that belongs to it, and must not use that context concurrently from another thread.
    pub(crate) unsafe fn new(
        device: &ID3D11Device,
        context: &ID3D11DeviceContext,
        width: u32,
        height: u32,
        hdr: bool,
    ) -> Result<Self> {
        let vdev: ID3D11VideoDevice = device.cast().context("device -> ID3D11VideoDevice")?;
        let vctx: ID3D11VideoContext1 = context.cast().context("context -> ID3D11VideoContext1")?;
        let rate = DXGI_RATIONAL {
            Numerator: 240,
            Denominator: 1,
        };
        let desc = D3D11_VIDEO_PROCESSOR_CONTENT_DESC {
            InputFrameFormat: D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE,
            InputFrameRate: rate,
            InputWidth: width,
            InputHeight: height,
            OutputFrameRate: rate,
            OutputWidth: width,
            OutputHeight: height,
            Usage: D3D11_VIDEO_USAGE_PLAYBACK_NORMAL,
        };
        let enumr = vdev
            .CreateVideoProcessorEnumerator(&desc)
            .context("CreateVideoProcessorEnumerator")?;
        let vp = vdev
            .CreateVideoProcessor(&enumr, 0)
            .context("CreateVideoProcessor")?;
        // Full-range RGB in → studio-range YUV out. HDR: scRGB linear (G10) → BT.2020 PQ (G2084).
        // SDR: sRGB (G22) → BT.709 (G22).
        let (in_cs, out_cs) = if hdr {
            (
                DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709,
                DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020,
            )
        } else {
            (
                DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
                DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709,
            )
        };
        vctx.VideoProcessorSetStreamColorSpace1(&vp, 0, in_cs);
        vctx.VideoProcessorSetOutputColorSpace1(&vp, out_cs);
        // One frame in, one frame out — no interpolation/auto-processing.
        vctx.VideoProcessorSetStreamFrameFormat(&vp, 0, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE);
        Ok(Self {
            vdev,
            vctx,
            enumr,
            vp,
        })
    }

    /// Convert `input` (BGRA or scRGB FP16) → `output` (NV12 or P010) on the video engine. Views are
    /// created per call (cheap relative to the Blt) so the input texture can vary frame to frame.
    ///
    /// # Errors
    /// Fails when either view cannot be created (or comes back null) or when the Blt itself fails.
    ///
    /// # Safety
    /// Calls D3D11 COM methods directly. `input` and `output` must be live textures created on the
    /// device this converter was built from, and the context must not be in use on another thread.
    pub(crate) unsafe fn convert(
        &self,
        input: &ID3D11Texture2D,
        output: &ID3D11Texture2D,
    ) -> Result<()> {
        let in_desc = D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC {
            FourCC: 0,
            ViewDimension: D3D11_VPIV_DIMENSION_TEXTURE2D,
            Anonymous: D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0 {
                Texture2D: D3D11_TEX2D_VPIV {
                    MipSlice: 0,
                    ArraySlice: 0,
                },
            },
        };
        let mut in_view: Option<ID3D11VideoProcessorInputView> = None;
        self.vdev
            .CreateVideoProcessorInputView(input, &self.enumr, &in_desc, Some(&mut in_view))
            .context("CreateVideoProcessorInputView")?;
        // Checked like the output view below, so a null view is an error rather than a silent
        // Blt with no input surface.
        let in_view = in_view.context("null input view")?;
        let out_desc = D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC {
            ViewDimension: D3D11_VPOV_DIMENSION_TEXTURE2D,
            Anonymous: D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0 {
                Texture2D: D3D11_TEX2D_VPOV { MipSlice: 0 },
            },
        };
        let mut out_view: Option<ID3D11VideoProcessorOutputView> = None;
        self.vdev
            .CreateVideoProcessorOutputView(output, &self.enumr, &out_desc, Some(&mut out_view))
            .context("CreateVideoProcessorOutputView")?;
        let out_view = out_view.context("null output view")?;
        // The stream struct stores the input view inside `ManuallyDrop`, so nothing releases it
        // automatically. Keep the array in a local so the view can be released after the Blt.
        let mut streams = [D3D11_VIDEO_PROCESSOR_STREAM {
            Enable: true.into(),
            pInputSurface: std::mem::ManuallyDrop::new(Some(in_view)),
            ..Default::default()
        }];
        let blt = self
            .vctx
            .VideoProcessorBlt(&self.vp, &out_view, 0, &streams);
        // SAFETY: `pInputSurface` was initialised just above, is dropped exactly once here, and
        // `streams` is not read again. Without this the per-call input view (and the reference it
        // holds) would leak on every frame.
        std::mem::ManuallyDrop::drop(&mut streams[0].pInputSurface);
        blt.context("VideoProcessorBlt")
    }
}
/// Convert a DXGI pointer shape (color / masked-color / monochrome) into top-down BGRA.
///
/// `buf` is the raw shape buffer and `si` its description (`Width`, `Height`, `Pitch`, `Type`).
/// The result carries up to two `w`×`h` BGRA layers: `alpha` (pixels drawn opaquely / alpha-blended)
/// and `xor` (pixels that invert the screen, stored as opaque white). A layer is `None` when no pixel
/// landed in it (the plain color type always yields only `alpha`).
///
/// Returns `None` when the shape is empty (zero width, height or pitch) or when `buf` / `si.Pitch`
/// cannot hold the rows `si` describes — a pitch shorter than one row would otherwise index past
/// the end of `buf` and panic.
fn convert_pointer_shape(buf: &[u8], si: &DXGI_OUTDUPL_POINTER_SHAPE_INFO) -> Option<CursorShape> {
    let w = si.Width as usize;
    let pitch = si.Pitch as usize;
    if w == 0 || pitch == 0 {
        return None;
    }
    // Type is a u32 (newtype constants compared via .0).
    if si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR.0 as u32 {
        // Straight 32bpp BGRA with a real alpha channel → one alpha-blended layer, no XOR layer.
        let h = si.Height as usize;
        let row_bytes = w * 4;
        if h == 0 || pitch < row_bytes || buf.len() < pitch * h {
            return None;
        }
        // Strip the row padding: copy the first `row_bytes` of each pitch-sized source row.
        let mut alpha = vec![0u8; row_bytes * h];
        for (dst_row, src_row) in alpha
            .chunks_exact_mut(row_bytes)
            .zip(buf.chunks_exact(pitch))
        {
            dst_row.copy_from_slice(&src_row[..row_bytes]);
        }
        Some(CursorShape {
            w: w as u32,
            h: h as u32,
            alpha: Some(alpha),
            xor: None,
        })
    } else if si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR.0 as u32 {
        // 32bpp where the alpha byte is a MASK selector (0x00 or 0xFF), not an alpha. A single shape
        // can mix opaque and screen-inverting pixels (the text I-beam: opaque hot-spot dot + an
        // inverting bar), so we split it into BOTH layers:
        //   mask 0x00            -> opaque RGB                 → ALPHA layer
        //   mask 0xFF, RGB != 0  -> invert the screen (white)  → XOR layer
        //   mask 0xFF, RGB == 0  -> XOR with black = no-op     → transparent in both
        let h = si.Height as usize;
        let row_bytes = w * 4;
        if h == 0 || pitch < row_bytes || buf.len() < pitch * h {
            return None;
        }
        let mut alpha = vec![0u8; row_bytes * h];
        let mut xor = vec![0u8; row_bytes * h];
        let (mut any_alpha, mut any_xor) = (false, false);
        for (y, src_row) in buf.chunks_exact(pitch).take(h).enumerate() {
            for (x, px) in src_row[..row_bytes].chunks_exact(4).enumerate() {
                let d = (y * w + x) * 4;
                let (b, g, r, mask) = (px[0], px[1], px[2], px[3]);
                if mask == 0 {
                    alpha[d..d + 4].copy_from_slice(&[b, g, r, 255]);
                    any_alpha = true;
                } else if b != 0 || g != 0 || r != 0 {
                    // inverting pixel → white opaque; the inversion blend turns this into 1-dest
                    xor[d..d + 4].fill(255);
                    any_xor = true;
                }
            }
        }
        Some(CursorShape {
            w: w as u32,
            h: h as u32,
            alpha: any_alpha.then_some(alpha),
            xor: any_xor.then_some(xor),
        })
    } else {
        // Monochrome: top half = AND mask, bottom half = XOR mask, 1 bpp. Per-pixel (AND,XOR):
        //   (0,0) opaque black      → ALPHA layer
        //   (0,1) opaque white      → ALPHA layer
        //   (1,0) transparent       → neither layer
        //   (1,1) invert the screen → XOR layer (white opaque)
        let h = (si.Height / 2) as usize;
        // One bit per pixel, so a row needs ceil(w / 8) bytes.
        let row_bytes = w.div_ceil(8);
        if h == 0 || pitch < row_bytes || buf.len() < pitch * h * 2 {
            return None;
        }
        // Bits are packed MSB-first within each byte.
        let bit = |row: usize, x: usize| (buf[row * pitch + x / 8] >> (7 - (x % 8))) & 1;
        let mut alpha = vec![0u8; w * h * 4];
        let mut xor = vec![0u8; w * h * 4];
        let (mut any_alpha, mut any_xor) = (false, false);
        for y in 0..h {
            for x in 0..w {
                let d = (y * w + x) * 4;
                match (bit(y, x), bit(y + h, x)) {
                    (0, 0) => {
                        // opaque black: BGR already 0, just mark opaque
                        alpha[d + 3] = 255;
                        any_alpha = true;
                    }
                    (0, 1) => {
                        alpha[d..d + 4].fill(255);
                        any_alpha = true;
                    }
                    (1, 0) => {} // transparent
                    _ => {
                        // (1,1) invert screen → white opaque into the XOR layer
                        xor[d..d + 4].fill(255);
                        any_xor = true;
                    }
                }
            }
        }
        Some(CursorShape {
            w: w as u32,
            h: h as u32,
            alpha: any_alpha.then_some(alpha),
            xor: any_xor.then_some(xor),
        })
    }
}
/// CPU src-over alpha blend of a BGRA cursor into a BGRA frame buffer (software-encode path). When
/// `invert` is set (masked-color / XOR cursor), a covered pixel inverts the frame instead (true XOR).
///
/// * `frame` / `fw` / `fh` — destination BGRA buffer and its size in pixels (tightly packed rows).
/// * `cur` / `cw` / `ch` — cursor BGRA buffer and its size in pixels (tightly packed rows).
/// * `cx` / `cy` — frame position of the cursor's top-left pixel; may be negative or off-frame.
///
/// Cursor pixels with alpha 0 are skipped; the frame's own alpha byte is never written. The cursor
/// rectangle is clipped to the frame, and rows that fall outside either buffer (a buffer shorter
/// than its declared size) are skipped instead of panicking.
#[allow(clippy::too_many_arguments)] // two (buffer, width, height) triples + position + mode
fn blend_cursor_cpu(
    frame: &mut [u8],
    fw: u32,
    fh: u32,
    cur: &[u8],
    cw: u32,
    ch: u32,
    cx: i32,
    cy: i32,
    invert: bool,
) {
    let (fw, fh, cw, ch) = (fw as usize, fh as usize, cw as usize, ch as usize);
    // Clip in i64 so `position + size` cannot overflow, then index in usize so large frames do
    // not overflow 32-bit offset arithmetic.
    let (cx, cy) = (i64::from(cx), i64::from(cy));
    let x0 = cx.max(0);
    let y0 = cy.max(0);
    let x1 = (cx + cw as i64).min(fw as i64);
    let y1 = (cy + ch as i64).min(fh as i64);
    if x0 >= x1 || y0 >= y1 {
        return; // cursor entirely off-frame (or an empty cursor/frame)
    }
    let span_bytes = (x1 - x0) as usize * 4; // visible pixels per row, in bytes
    let src_x = (x0 - cx) as usize; // first visible cursor column
    for fy in y0..y1 {
        let src_y = (fy - cy) as usize;
        let src_off = (src_y * cw + src_x) * 4;
        let dst_off = (fy as usize * fw + x0 as usize) * 4;
        let (src_row, dst_row) = match (
            cur.get(src_off..src_off + span_bytes),
            frame.get_mut(dst_off..dst_off + span_bytes),
        ) {
            (Some(src_row), Some(dst_row)) => (src_row, dst_row),
            // Offsets only grow with `fy`, so every later row is out of range as well.
            _ => break,
        };
        for (src, dst) in src_row.chunks_exact(4).zip(dst_row.chunks_exact_mut(4)) {
            let a = u32::from(src[3]);
            if a == 0 {
                continue;
            }
            if invert {
                for c in &mut dst[..3] {
                    *c = 255 - *c;
                }
            } else {
                for (c, &sc) in dst[..3].iter_mut().zip(&src[..3]) {
                    *c = ((u32::from(sc) * a + u32::from(*c) * (255 - a)) / 255) as u8;
                }
            }
        }
    }
}
/// DXGI Desktop Duplication capturer state: the D3D11 device/context and duplicated output, the
/// per-path resources (CPU staging, GPU copy, HDR conversion, video-processor YUV), and the recovery
/// and cursor bookkeeping described on the individual fields.
pub struct DuplCapturer {
/// D3D11 device created in `open` on the adapter that exposes the captured output.
device: ID3D11Device,
/// Immediate context returned together with `device`.
context: ID3D11DeviceContext,
/// The DXGI output whose GDI device name matched the capture target.
output: IDXGIOutput1,
/// The output duplication. `Option` so recovery can RELEASE it (set `None`) BEFORE re-duplicating:
/// DXGI permits only ONE `IDXGIOutputDuplication` per output, and a stale one (incl. an ACCESS_LOST
/// one) keeps holding the output, so a re-`DuplicateOutput1` returns E_ACCESSDENIED and legacy
/// `DuplicateOutput` returns a BORN-LOST dup — the storm. Apollo releases before re-duplicating; so
/// do we now. `None` only transiently during recovery (acquire routes None → recovery).
dupl: Option<IDXGIOutputDuplication>,
/// The output's GDI name — re-resolved on ACCESS_LOST (a mode change can stale the cached handle).
gdi_name: String,
/// Stable SudoVDA target id, used to re-resolve `gdi_name` during recovery.
target_id: u32,
/// Duplication mode width in pixels, read from the duplication's `ModeDesc` in `open`.
width: u32,
/// Duplication mode height in pixels, read from the duplication's `ModeDesc` in `open`.
height: u32,
/// Refresh rate in Hz: the caller's preferred rate when non-zero, else derived from the mode's
/// refresh rational (60 when its denominator is zero, and never below 1).
refresh_hz: u32,
/// CPU-readable BGRA staging texture, created lazily by `ensure_staging`.
staging: Option<ID3D11Texture2D>,
/// NOTE(review): presumably true while an acquired duplication frame has not been released yet —
/// confirm against the acquire/release path (not visible in this section).
holding_frame: bool,
/// NOTE(review): initialised to `false` in `open`; its readers/writers are not visible here.
active: AtomicBool,
/// Capture timeout in ms: `PUNKTFUNK_CAPTURE_TIMEOUT_MS` when set and parseable, otherwise
/// `max(2000 / refresh_hz, 100)`.
timeout_ms: u32,
/// The first AcquireNextFrame after a (re)DuplicateOutput gets a generous timeout — the initial
/// desktop snapshot of a large surface can take longer than the per-frame budget.
first_frame: bool,
/// Debug counter, starts at 0 (presumably acquire timeouts — confirm).
dbg_timeouts: u32,
/// Debug counter, starts at 0 (presumably ACCESS_LOST events — confirm).
dbg_lost: u32,
/// Debug counter, starts at 0 (presumably seeded black frames — confirm).
dbg_black_seeds: u32,
/// NOTE(review): presumably the last CPU BGRA frame, kept so it can be repeated — confirm.
last: Option<Vec<u8>>,
/// GPU-output mode (zero-copy → NVENC): produce `FramePayload::D3d11` instead of CPU BGRA.
/// Selected by `PUNKTFUNK_ENCODER=nvenc` so the capturer's output matches the encoder's input.
gpu_mode: bool,
/// Reused owned texture the duplication frame is copied into for the D3D11 path (the duplication
/// surface is transient and released each frame).
gpu_copy: Option<ID3D11Texture2D>,
/// The most recently produced presentable GPU texture + its pixel format, repeated by
/// `next_frame` when AcquireNextFrame reports no change (static desktop) or during a rebuild.
/// Format-tagged because the SDR path presents BGRA `gpu_copy` while the HDR path presents the
/// 10-bit `hdr10_out` — the encoder needs the right format on every frame.
last_present: Option<(ID3D11Texture2D, PixelFormat)>,
/// Whether this capturer should request an HDR (FP16) duplication — `DuplicateOutput1` with FP16
/// first, retried (legacy DuplicateOutput can't capture HDR). Set for the secure-desktop DDA leg
/// when the SudoVDA is in HDR; threaded into every (re)duplication incl. ACCESS_LOST recovery.
want_hdr: bool,
/// Full-chroma 4:4:4 session: deliver packed RGB (`Bgra` SDR / `Rgb10a2` HDR) and SKIP the
/// video-engine RGB→YUV (NV12/P010) conversion — NVENC reconstructs 4:4:4 only from a full-chroma
/// source, so we hand it the RGB texture and it CSCs to YUV444 at encode (chroma_format_idc=3).
chroma_444: bool,
/// HDR (scRGB FP16) capture state. Set when the duplication surface is `R16G16B16A16_FLOAT`
/// (the desktop has HDR on). The frame can't be `CopyResource`d into a BGRA target, so the HDR
/// path copies it into an FP16 SRV texture, composites the cursor, then runs [`HdrConverter`] to
/// produce a BT.2020 PQ 10-bit (`R10G10B10A2`) frame for NVENC. Toggling HDR fires ACCESS_LOST →
/// `recreate_dupl` re-detects the format, so this tracks the *current* duplication.
hdr_fp16: bool,
/// The source display's static HDR mastering metadata (ST.2086 + content light level), read from
/// `IDXGIOutput6::GetDesc1` whenever the duplication is HDR (`hdr_fp16`). The stream loop forwards
/// it to the encoder (in-band SEI) and the client (0xCE). `None` when SDR or the read failed.
hdr_meta: Option<punktfunk_core::quic::HdrMeta>,
/// FP16 copy of the duplication surface (RT|SRV): the cursor composites onto it and the converter
/// samples it. Reallocated on device/size change.
fp16_src: Option<ID3D11Texture2D>,
/// Shader-resource view, presumably over `fp16_src` (its creation is not visible here — confirm).
fp16_srv: Option<ID3D11ShaderResourceView>,
/// 10-bit `R10G10B10A2` PQ output of the HDR conversion — the texture handed to NVENC.
hdr10_out: Option<ID3D11Texture2D>,
/// scRGB→PQ conversion pass; rebuilt on device recreate.
hdr_conv: Option<HdrConverter>,
/// Video-processor RGB→YUV converter (runs on the VIDEO engine, not the 3D engine) + its NV12
/// (SDR) / P010 (HDR) output texture. This is the zero-3D path: the per-frame colour conversion and
/// NVENC's RGB→YUV both move off the 3D engine so capture+encode don't fight a GPU-saturating game.
/// Lazily built for the current size+HDR; rebuilt on change. `None`/error → falls back to the
/// legacy RGB path. Disabled with `PUNKTFUNK_NO_VIDEO_PROCESSOR=1`.
video_conv: Option<VideoConverter>,
/// The NV12 (SDR) / P010 (HDR) output texture that belongs to `video_conv`.
yuv_out: Option<ID3D11Texture2D>,
/// HDR-ness the current `video_conv`/`yuv_out` were built for, so an HDR toggle rebuilds them.
yuv_is_hdr: bool,
/// Latched off after a VideoConverter failure so we don't retry it every frame (fall back to RGB).
vp_disabled: bool,
/// Last time a duplication rebuild was attempted, to throttle retries during an outage (e.g. a
/// secure-desktop dwell where the output is gone) so we don't block the encode loop or hammer
/// DuplicateOutput — between attempts the last good frame is repeated. `None` = never attempted.
last_rebuild: Option<Instant>,
/// Throttle for ALL ACCESS_LOST recovery attempts (cheap re-duplicate + full rebuild). A
/// constantly-invalidated duplication (HDR overlay/MPO churn) would otherwise spin recovery and
/// starve the encode thread; cap attempts to ~one per 5 ms and repeat the last frame between them.
last_recover: Option<Instant>,
/// True once at least one real frame has been produced. After that, a frame drought (e.g. a long
/// secure-desktop dwell with nothing rendering to the virtual output) must never fatally end the
/// session — `next_frame` keeps repeating the last/seeded frame instead of erroring on its
/// deadline. The deadline stays fatal only *before* the first frame (a genuine startup misconfig).
ever_got_frame: bool,
/// Consecutive rebuilds that produced a BORN-LOST duplication (created OK, but its first
/// AcquireNextFrame instantly returned ACCESS_LOST). On the NORMAL desktop this is the hybrid
/// reparent/flip storm — once it persists, `acquire` returns Err so the punktfunk1 loop cold-rebuilds the
/// whole pipeline (new device/output) instead of spinning on a dead dup forever (the bug where the
/// stream froze on the last frame). Reset to 0 by any real frame. NOT armed on the secure
/// (Winlogon) desktop, where a long static dwell is legitimate and must never end the session.
consecutive_born_lost: u32,
/// GPU cursor overlay (rebuilt on device recreate). `None` until the first composite.
cursor: Option<CursorCompositor>,
/// Last cursor shape, decomposed into alpha + XOR layers (kept device-independent so it survives
/// a device recreate).
cursor_shape: Option<CursorShape>,
/// Last known cursor position, initialised to `(0, 0)`. NOTE(review): coordinate space (hot-spot
/// adjusted or not) is not visible in this section — confirm.
cursor_pos: (i32, i32),
/// Last known cursor visibility, initialised to `false`.
cursor_visible: bool,
/// Cursor shape changed → re-upload to the GPU texture(s) before the next composite.
cursor_dirty: bool,
/// Debug counter for the cursor path, starts at 0.
dbg_cursor: u64,
/// Opaque value handed to `open` and held (never read) for as long as the capturer lives.
_keepalive: Box<dyn Send>,
}
// SAFETY: `DuplCapturer` holds D3D11 device/context/duplication COM pointers plus plain data. The
// device is created free-threaded (`make_device` sets no `D3D11_CREATE_DEVICE_SINGLETHREADED`) and
// COM reference counting is interlocked, so moving ownership of the whole capturer to another thread
// is sound. It is used by exactly one thread (the encode thread) at a time — moved to it once, never
// shared (no `Sync`) — so the single-threaded immediate context is never touched concurrently.
unsafe impl Send for DuplCapturer {}
impl DuplCapturer {
/// Open a duplication capturer for `target`.
///
/// Steps, in order: install the GPU-preference hook, switch this thread to per-monitor-v2 DPI
/// awareness, request display/system-required execution state, find the DXGI output whose GDI
/// device name matches `target.gdi_name` (scanning all adapters, retrying every 100 ms for up to
/// ~2 s), create a D3D11 device on that output's adapter, attach to the input desktop, duplicate
/// the output, nudge the cursor onto it, and read the mode size / refresh / format.
///
/// * `preferred` — only the third element (refresh Hz) is read here; a missing or zero value falls
///   back to the duplication mode's refresh rate.
/// * `keepalive` — stored untouched in the capturer for its lifetime.
/// * `gpu` — selects GPU-resident output (`gpu_mode`) instead of CPU staging.
/// * `want_hdr` — passed to `duplicate_output` and remembered for later re-duplication.
/// * `chroma_444` — stored; see the field documentation.
///
/// # Errors
/// Fails when the DXGI factory cannot be created, an adapter/output description cannot be read, no
/// adapter exposes the requested output before the deadline (the error lists the seen topology),
/// the D3D11 device cannot be created, or the output cannot be duplicated.
pub fn open(
target: WinCaptureTarget,
preferred: Option<(u32, u32, u32)>,
keepalive: Box<dyn Send>,
// Whether the (already-resolved) encode backend wants GPU-resident frames — passed IN (Goal-1
// stage 5) so the capturer never re-derives the encode backend itself.
gpu: bool,
want_hdr: bool,
// 4:4:4 session → deliver RGB, skip the NV12/P010 video-engine conversion (see the field doc).
chroma_444: bool,
) -> Result<Self> {
// SAFETY: runs on the capture thread that will own this `DuplCapturer`. `install_gpu_pref_hook()`
// and the DPI-context calls take by-value handles / no args and touch only thread/process state;
// `SetThreadExecutionState` takes a flags bitmask by value. `CreateDXGIFactory1` yields a live
// `IDXGIFactory1`, and every subsequent COM method (`EnumAdapters1`/`EnumOutputs`/`GetDesc1`/
// `GetDesc`/`cast`) is called on that factory or on an adapter/output it returned — each obtained
// through a checked `while let Ok(..)`/`?` — all from this one thread. No raw pointers are
// dereferenced; the borrowed strings/locals outlive each synchronous call.
unsafe {
// Stop DXGI hybrid-GPU output reparenting BEFORE we create the factory / enumerate outputs
// (the cause of the 0x887A0026 ACCESS_LOST churn on this hybrid box: RTX 4090 + AMD iGPU).
install_gpu_pref_hook();
// Force PER-MONITOR-AWARE-V2 on THIS (capture) thread. IDXGIOutput5::DuplicateOutput1
// REQUIRES V2 — without it the call returns E_ACCESSDENIED forever (the 4370x failures
// measured live), forcing the legacy DuplicateOutput fallback which yields a BORN-LOST
// duplication on this box → the ACCESS_LOST storm. SetProcessDpiAwarenessContext failed at
// startup ("already set" — a manifest/runtime locked the process to a LOWER awareness, and
// GetAwarenessFromDpiAwarenessContext can't tell V1 from V2: it reports 2 for both). The
// per-THREAD override works regardless of the process default, so DuplicateOutput1 can
// succeed (the working dup Apollo gets). Must run on the capture thread before any DXGI use.
{
use windows::Win32::UI::HiDpi::{
AreDpiAwarenessContextsEqual, GetThreadDpiAwarenessContext,
SetThreadDpiAwarenessContext, DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2,
};
let prev = SetThreadDpiAwarenessContext(DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2);
let is_v2 = AreDpiAwarenessContextsEqual(
GetThreadDpiAwarenessContext(),
DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2,
)
.as_bool();
tracing::info!(
set_ok = !prev.0.is_null(),
thread_is_v2 = is_v2,
"capture thread DPI awareness -> PER_MONITOR_AWARE_V2 (required for DuplicateOutput1)"
);
}
// Keep the IDD (SudoVDA) virtual display awake for the capture lifetime: an idle indirect
// display can be power-gated, which invalidates the duplication (a contributor to the
// "freezes randomly while streaming" loss). Restored to ES_CONTINUOUS on Drop. (Apollo does
// this too.) Must run on the capture thread (this one owns the capturer).
// NOTE(review): every `?`/`bail!` below returns before a `DuplCapturer` exists, so presumably
// nothing restores the execution state when `open` fails — confirm whether that matters.
SetThreadExecutionState(ES_CONTINUOUS | ES_DISPLAY_REQUIRED | ES_SYSTEM_REQUIRED);
let factory: IDXGIFactory1 = CreateDXGIFactory1().context("CreateDXGIFactory1")?;
// 1) Find the output (monitor) whose GDI DeviceName matches, across ALL adapters. On a
// real-GPU box the SudoVDA virtual monitor's DXGI output is enumerated under the GPU that
// *renders* it (the discrete/integrated GPU), NOT under the SudoVDA "adapter" LUID that
// SudoVDA reports — so we can't restrict the search to `target.adapter_luid`. The output
// also appears a beat after the display is created, so settle-retry for up to ~2 s.
// NOTE(review): `target.adapter_luid` is currently unused (explicitly discarded below): the scan
// takes the first matching output in enumeration order and implements no adapter tie-break.
let _ = target.adapter_luid;
let deadline = Instant::now() + Duration::from_millis(2000);
let (adapter, output): (IDXGIAdapter1, IDXGIOutput1) = loop {
let mut hit = None;
let mut i = 0u32;
while let Ok(a) = factory.EnumAdapters1(i) {
let ad = a.GetDesc1()?;
let aname = String::from_utf16_lossy(&ad.Description);
let aname = aname.trim_end_matches('\u{0}');
let mut j = 0u32;
while let Ok(o) = a.EnumOutputs(j) {
let od = o.GetDesc()?;
let oname = String::from_utf16_lossy(&od.DeviceName);
let oname = oname.trim_end_matches('\u{0}').to_string();
tracing::debug!(
adapter = aname,
luid = format!("{:#x}", pack_luid(ad.AdapterLuid)),
output = oname,
want = target.gdi_name,
"DXGI output seen"
);
if gdi_name_matches(&od.DeviceName, &target.gdi_name) {
tracing::info!(
adapter = aname,
luid = format!("{:#x}", pack_luid(ad.AdapterLuid)),
output = oname,
"capturing the SudoVDA output on this adapter"
);
hit = Some((a.clone(), o.cast::<IDXGIOutput1>()?));
break;
}
j += 1;
}
if hit.is_some() {
break;
}
i += 1;
}
if let Some(h) = hit {
break h;
}
// Deadline passed without a match: re-enumerate once more purely to build a readable
// adapter/output topology for the error message.
if Instant::now() >= deadline {
let mut topo = Vec::new();
let mut i = 0u32;
while let Ok(a) = factory.EnumAdapters1(i) {
let ad = a.GetDesc1()?;
let an = String::from_utf16_lossy(&ad.Description);
let mut outs = Vec::new();
let mut j = 0u32;
while let Ok(o) = a.EnumOutputs(j) {
let od = o.GetDesc()?;
outs.push(
String::from_utf16_lossy(&od.DeviceName)
.trim_end_matches('\u{0}')
.to_string(),
);
j += 1;
}
topo.push(format!(
"{} [{:#x}]: {:?}",
an.trim_end_matches('\u{0}'),
pack_luid(ad.AdapterLuid),
outs
));
i += 1;
}
bail!(
"no DXGI adapter exposes output {} (topology: {})",
target.gdi_name,
topo.join(" | ")
);
}
std::thread::sleep(Duration::from_millis(100));
};
// 2) D3D11 device ON the adapter that exposes the output (driver_type MUST be UNKNOWN with
// an explicit adapter). NVENC binds to this same device for zero-copy encode.
let mut device: Option<ID3D11Device> = None;
let mut context: Option<ID3D11DeviceContext> = None;
D3D11CreateDevice(
&adapter,
D3D_DRIVER_TYPE_UNKNOWN,
HMODULE::default(),
D3D11_CREATE_DEVICE_BGRA_SUPPORT,
Some(&[D3D_FEATURE_LEVEL_11_0]),
D3D11_SDK_VERSION,
Some(&mut device),
None,
Some(&mut context),
)
.context("D3D11CreateDevice")?;
let device = device.context("null D3D11 device")?;
let context = context.context("null D3D11 context")?;
// 3) duplicate the output. Attach to the current input desktop first (as SYSTEM this can
// be the Winlogon secure desktop) so a session that starts at the lock/login screen works.
// The virtual display is kept the sole desktop via the CCD isolation the pf-vdisplay backend
// applies at monitor creation (registry-persisted), so the secure desktop has nowhere to render
// but the output we capture — no per-open re-isolation needed.
attach_input_desktop();
let dupl = duplicate_output(&output, &device, want_hdr)
.context("DuplicateOutput (already duplicated by another app?)")?;
// Did DXGI actually call our win32u GPU-pref hook during factory/device/dupl creation? hits==0
// here means the hook is NOT on DXGI's reparenting path on this build → reparenting can't be
// the churn cause (look at independent-flip/composition instead). Diagnostic only.
tracing::debug!(
hook_hits = hybrid_hook_hits(),
"win32u GPU-pref hook call count after open"
);
// Kick the first frame loose: a blank virtual display is otherwise change-less.
nudge_cursor_onto(&output);
let dd: DXGI_OUTDUPL_DESC = dupl.GetDesc();
let (width, height) = (dd.ModeDesc.Width, dd.ModeDesc.Height);
// Caller-preferred refresh wins when non-zero; otherwise use the mode's rational, falling back
// to 60 when the denominator is zero and clamping to at least 1.
let refresh_hz = preferred
.map(|(_, _, hz)| hz)
.filter(|&hz| hz > 0)
.unwrap_or_else(|| {
let r = dd.ModeDesc.RefreshRate;
r.Numerator
.checked_div(r.Denominator)
.map_or(60, |hz| hz.max(1))
});
// Env override if present and parseable; default is two frame intervals, floored at 100 ms.
let timeout_ms = std::env::var("PUNKTFUNK_CAPTURE_TIMEOUT_MS")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or((2000 / refresh_hz.max(1)).max(100));
// Produce GPU-resident D3D11 frames (zero-copy NVENC, or the NV12/P010 the AMF/QSV backends
// read back / import) whenever the encode backend is a GPU one — so the capturer's output
// format matches the encoder's input. Only the software (GPU-less) path takes CPU staging.
// The decision is resolved ONCE per session and passed in (Goal-1 stage 5), instead of this
// capturer re-calling `encode::windows_resolved_backend()` — the back-reference that let
// capture and encode disagree (plan §2.3/§5).
let gpu_mode = gpu;
// Read the source display's HDR mastering metadata while we still hold `output` (it is
// moved into the struct below). Only meaningful for an HDR (FP16) duplication.
let is_hdr_init = dd.ModeDesc.Format == DXGI_FORMAT_R16G16B16A16_FLOAT;
let hdr_meta_init = if is_hdr_init {
read_output_hdr_meta(&output)
} else {
None
};
tracing::info!(
"DXGI duplication: {}x{}@{} on {} ({}) dxgi_format={} (87=BGRA8 24=R10G10B10A2 10=R16G16B16A16_FLOAT)",
width,
height,
refresh_hz,
target.gdi_name,
if gpu_mode {
"D3D11 zero-copy"
} else {
"CPU staging"
},
dd.ModeDesc.Format.0,
);
// All lazily-built resources start empty; they are created on first use by the capture paths.
Ok(Self {
device,
context,
output,
dupl: Some(dupl),
target_id: target.target_id,
gdi_name: target.gdi_name,
width,
height,
refresh_hz,
staging: None,
holding_frame: false,
active: AtomicBool::new(false),
timeout_ms,
first_frame: true,
dbg_timeouts: 0,
dbg_lost: 0,
dbg_black_seeds: 0,
last: None,
gpu_mode,
gpu_copy: None,
last_present: None,
want_hdr,
chroma_444,
hdr_fp16: is_hdr_init,
hdr_meta: hdr_meta_init,
fp16_src: None,
fp16_srv: None,
hdr10_out: None,
hdr_conv: None,
video_conv: None,
yuv_out: None,
yuv_is_hdr: false,
vp_disabled: std::env::var_os("PUNKTFUNK_NO_VIDEO_PROCESSOR").is_some(),
last_rebuild: None,
last_recover: None,
ever_got_frame: false,
consecutive_born_lost: 0,
cursor: None,
cursor_shape: None,
cursor_pos: (0, 0),
cursor_visible: false,
cursor_dirty: false,
dbg_cursor: 0,
_keepalive: keepalive,
})
}
}
/// Lazily create the CPU-readable BGRA staging texture for the software (CPU) capture path.
///
/// Does nothing when `self.staging` is already populated. Otherwise allocates a
/// `D3D11_USAGE_STAGING` texture at the current `width`×`height` with CPU read access and
/// caches it. On failure the cache is left empty and the error is returned.
unsafe fn ensure_staging(&mut self) -> Result<()> {
    if self.staging.is_none() {
        let single_sample = DXGI_SAMPLE_DESC {
            Count: 1,
            Quality: 0,
        };
        let staging_desc = D3D11_TEXTURE2D_DESC {
            Width: self.width,
            Height: self.height,
            MipLevels: 1,
            ArraySize: 1,
            Format: DXGI_FORMAT_B8G8R8A8_UNORM,
            SampleDesc: single_sample,
            Usage: D3D11_USAGE_STAGING,
            // Staging textures are never bound to the pipeline.
            BindFlags: D3D11_BIND_FLAG(0).0 as u32,
            CPUAccessFlags: D3D11_CPU_ACCESS_READ.0 as u32,
            MiscFlags: 0,
        };
        let mut created: Option<ID3D11Texture2D> = None;
        let outcome = self
            .device
            .CreateTexture2D(&staging_desc, None, Some(&mut created));
        outcome.context("CreateTexture2D(staging)")?;
        self.staging = created;
    }
    Ok(())
}
/// Lazily create the owned BGRA texture that the duplication surface is copied into on the
/// GPU (D3D11) path.
///
/// Does nothing when `self.gpu_copy` is already populated. The texture is `D3D11_USAGE_DEFAULT`
/// with a render-target bind flag at the current `width`×`height`. On failure the cache is left
/// empty and the error is returned.
unsafe fn ensure_gpu_copy(&mut self) -> Result<()> {
    if self.gpu_copy.is_none() {
        let single_sample = DXGI_SAMPLE_DESC {
            Count: 1,
            Quality: 0,
        };
        let copy_desc = D3D11_TEXTURE2D_DESC {
            Width: self.width,
            Height: self.height,
            MipLevels: 1,
            ArraySize: 1,
            Format: DXGI_FORMAT_B8G8R8A8_UNORM,
            SampleDesc: single_sample,
            Usage: D3D11_USAGE_DEFAULT,
            BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
            CPUAccessFlags: 0,
            MiscFlags: 0,
        };
        let mut created: Option<ID3D11Texture2D> = None;
        let outcome = self
            .device
            .CreateTexture2D(&copy_desc, None, Some(&mut created));
        outcome.context("CreateTexture2D(gpu copy)")?;
        self.gpu_copy = created;
    }
    Ok(())
}
/// Run `input` (BGRA for SDR, scRGB FP16 for HDR) through the D3D11 **video processor** to get
/// the encoder's native YUV layout: NV12 when `hdr` is false, P010 when it is true. Doing the
/// colour conversion there keeps it (and NVENC's RGB→YUV) off the 3D engine, so capture+encode
/// don't compete with a GPU-saturating game.
///
/// The converter and its output texture are built on first use and cached; they are rebuilt
/// whenever either is missing or the cached HDR-ness differs from `hdr`.
///
/// Returns the YUV texture, or `None` when the caller should fall back to the legacy RGB path.
/// Any failure (converter construction, output-texture creation, or the conversion itself)
/// logs a warning and sets `vp_disabled`, so later calls return `None` immediately.
unsafe fn convert_to_yuv(
    &mut self,
    input: &ID3D11Texture2D,
    hdr: bool,
) -> Option<ID3D11Texture2D> {
    if self.vp_disabled {
        return None;
    }

    let cache_valid =
        self.video_conv.is_some() && self.yuv_out.is_some() && self.yuv_is_hdr == hdr;
    if !cache_valid {
        // Drop whatever half-built state exists before rebuilding.
        self.video_conv = None;
        self.yuv_out = None;

        let built = VideoConverter::new(
            &self.device,
            &self.context,
            self.width,
            self.height,
            hdr,
        );
        let converter = match built {
            Err(e) => {
                tracing::warn!(error = %format!("{e:#}"),
                    "video processor unavailable — falling back to RGB encode path");
                self.vp_disabled = true;
                return None;
            }
            Ok(converter) => converter,
        };

        let yuv_format = match hdr {
            true => windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010,
            false => windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_NV12,
        };
        let yuv_desc = D3D11_TEXTURE2D_DESC {
            Width: self.width,
            Height: self.height,
            MipLevels: 1,
            ArraySize: 1,
            Format: yuv_format,
            SampleDesc: DXGI_SAMPLE_DESC {
                Count: 1,
                Quality: 0,
            },
            Usage: D3D11_USAGE_DEFAULT,
            BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
            CPUAccessFlags: 0,
            MiscFlags: 0,
        };
        let mut yuv_tex: Option<ID3D11Texture2D> = None;
        let created = self
            .device
            .CreateTexture2D(&yuv_desc, None, Some(&mut yuv_tex));
        if let Err(e) = created {
            tracing::warn!(error = %format!("{e:?}"),
                "CreateTexture2D(YUV out) failed — falling back to RGB encode path");
            self.vp_disabled = true;
            return None;
        }

        // Commit the cache only once both pieces exist.
        self.video_conv = Some(converter);
        self.yuv_out = yuv_tex;
        self.yuv_is_hdr = hdr;
        tracing::info!(
            hdr,
            "video-processor YUV path active ({} on the video engine, 0% 3D)",
            if hdr { "P010" } else { "NV12" }
        );
    }

    let out = self.yuv_out.clone()?;
    let blt = self.video_conv.as_ref()?.convert(input, &out);
    match blt {
        Ok(_) => Some(out),
        Err(e) => {
            tracing::warn!(error = %format!("{e:#}"),
                "VideoProcessorBlt failed — falling back to RGB encode path");
            self.vp_disabled = true;
            self.video_conv = None;
            self.yuv_out = None;
            None
        }
    }
}
/// FP16 (`R16G16B16A16_FLOAT`) copy of the HDR duplication surface (RT for the cursor composite +
/// SRV for the converter). Reallocated when absent (device/size change drops it).
///
/// On success both `fp16_src` and `fp16_srv` are populated together. On any failure neither
/// field is touched, so the next call retries from scratch.
///
/// # Errors
/// Returns an error (with context naming the failing call) when the texture or its
/// shader-resource view cannot be created, or when D3D reports success but hands back no object.
unsafe fn ensure_fp16_src(&mut self) -> Result<()> {
    if self.fp16_src.is_some() {
        return Ok(());
    }
    let desc = D3D11_TEXTURE2D_DESC {
        Width: self.width,
        Height: self.height,
        MipLevels: 1,
        ArraySize: 1,
        Format: DXGI_FORMAT_R16G16B16A16_FLOAT,
        SampleDesc: DXGI_SAMPLE_DESC {
            Count: 1,
            Quality: 0,
        },
        Usage: D3D11_USAGE_DEFAULT,
        // Render target for the cursor composite, shader resource for the HDR converter.
        BindFlags: (D3D11_BIND_RENDER_TARGET.0 | D3D11_BIND_SHADER_RESOURCE.0) as u32,
        CPUAccessFlags: 0,
        MiscFlags: 0,
    };
    let mut t: Option<ID3D11Texture2D> = None;
    self.device
        .CreateTexture2D(&desc, None, Some(&mut t))
        .context("CreateTexture2D(fp16 src)")?;
    let t = t.context("fp16 src tex")?;
    let mut srv = None;
    // Attach context like every other D3D call in the `ensure_*` helpers, so a failure here is
    // attributable instead of surfacing as a bare HRESULT.
    self.device
        .CreateShaderResourceView(&t, None, Some(&mut srv))
        .context("CreateShaderResourceView(fp16 src)")?;
    // Publish the pair only after both exist, so the fields never disagree.
    self.fp16_srv = Some(srv.context("fp16 srv")?);
    self.fp16_src = Some(t);
    Ok(())
}
/// Lazily create the 10-bit `R10G10B10A2_UNORM` texture that receives the PQ output of the HDR
/// conversion — the texture NVENC encodes.
///
/// Does nothing when `self.hdr10_out` is already populated. On failure the cache is left empty
/// and the error is returned.
unsafe fn ensure_hdr10_out(&mut self) -> Result<()> {
    if self.hdr10_out.is_none() {
        let single_sample = DXGI_SAMPLE_DESC {
            Count: 1,
            Quality: 0,
        };
        let hdr10_desc = D3D11_TEXTURE2D_DESC {
            Width: self.width,
            Height: self.height,
            MipLevels: 1,
            ArraySize: 1,
            Format: DXGI_FORMAT_R10G10B10A2_UNORM,
            SampleDesc: single_sample,
            Usage: D3D11_USAGE_DEFAULT,
            BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
            CPUAccessFlags: 0,
            MiscFlags: 0,
        };
        let mut created: Option<ID3D11Texture2D> = None;
        let outcome = self
            .device
            .CreateTexture2D(&hdr10_desc, None, Some(&mut created));
        outcome.context("CreateTexture2D(hdr10 out)")?;
        self.hdr10_out = created;
    }
    Ok(())
}
/// Make sure a presentable GPU texture exists on the *current* device, clear it to opaque
/// black, and store it as `last_present`.
///
/// Used after a desktop-switch recovery so `next_frame` always has a D3D11 frame to repeat even
/// while the (secure) desktop renders nothing to the virtual output — this keeps the session
/// alive across a lock/login/UAC transition instead of dropping it. In HDR mode the 10-bit
/// output is seeded (black = PQ 0); otherwise the BGRA copy. One-shot: the next real frame
/// overwrites the texture in place.
unsafe fn seed_black_gpu_frame(&mut self) -> Result<()> {
    // Instrumentation: a BLACK seed means there is no real desktop frame to show — if the client
    // streams black, this is why. Counted and logged on a throttle (every 32nd seed) so a
    // real-lock repro shows the mode without flooding the log.
    self.dbg_black_seeds += 1;
    if self.dbg_black_seeds % 32 == 1 {
        tracing::warn!(
            black_seeds = self.dbg_black_seeds,
            "DDA: seeding BLACK frame — no real desktop frame available (secure desktop idle/born-lost?)"
        );
    }

    // Select the texture to seed, the pixel format to report, and the message used should the
    // render-target view come back null.
    let (target, format, null_rtv_msg) = if self.hdr_fp16 {
        self.ensure_hdr10_out()?;
        let tex = self.hdr10_out.clone().context("hdr10 out texture")?;
        (tex, PixelFormat::Rgb10a2, "null RTV (hdr seed)")
    } else {
        self.ensure_gpu_copy()?;
        let tex = self.gpu_copy.clone().context("gpu copy texture")?;
        (tex, PixelFormat::Bgra, "null RTV (sdr seed)")
    };

    let mut view: Option<ID3D11RenderTargetView> = None;
    self.device
        .CreateRenderTargetView(&target, None, Some(&mut view))?;
    let view = view.context(null_rtv_msg)?;
    self.context
        .ClearRenderTargetView(&view, &[0.0, 0.0, 0.0, 1.0]);
    self.last_present = Some((target, format));
    Ok(())
}
/// Refresh the cached cursor state from a frame's `DXGI_OUTDUPL_FRAME_INFO` (the HW cursor is
/// NOT part of the frame image).
///
/// Position and visibility are updated only when the info carries a mouse update
/// (`LastMouseUpdateTime != 0`). When the info announces a pointer-shape buffer, the shape is
/// fetched from the duplication and converted; on success it replaces `cursor_shape` and sets
/// `cursor_dirty`. A failed fetch or an unconvertible shape leaves the previous shape in place.
unsafe fn update_cursor(&mut self, info: &DXGI_OUTDUPL_FRAME_INFO) {
    if info.LastMouseUpdateTime != 0 {
        let (x, y) = (
            info.PointerPosition.Position.x,
            info.PointerPosition.Position.y,
        );
        self.cursor_pos = (x, y);
        self.cursor_visible = info.PointerPosition.Visible.as_bool();
    }

    let shape_bytes = info.PointerShapeBufferSize;
    if shape_bytes == 0 {
        return; // no new shape delivered with this frame
    }

    let mut raw = vec![0u8; shape_bytes as usize];
    let mut required = 0u32;
    let mut shape_info = DXGI_OUTDUPL_POINTER_SHAPE_INFO::default();
    let fetched = match self.dupl.as_ref() {
        Some(dupl) => dupl
            .GetFramePointerShape(
                shape_bytes,
                raw.as_mut_ptr() as *mut c_void,
                &mut required,
                &mut shape_info,
            )
            .is_ok(),
        None => false,
    };
    if !fetched {
        return;
    }

    let Some(shape) = convert_pointer_shape(&raw, &shape_info) else {
        return;
    };
    tracing::info!(
        shape_type = shape_info.Type,
        size = format!("{}x{}", shape.w, shape.h),
        alpha = shape.alpha.is_some(),
        xor = shape.xor.is_some(),
        "cursor shape captured"
    );
    self.cursor_shape = Some(shape);
    self.cursor_dirty = true;
}
/// Draw the cached cursor onto the GPU frame texture `gpu` (zero-copy path).
///
/// `hdr` says the target is the linear scRGB FP16 surface (HDR path): the alpha layer is then
/// sRGB→linear decoded and scaled to HDR graphics white (`PUNKTFUNK_HDR_CURSOR_NITS`, default
/// 203, per BT.2408) so it isn't ~2.5× too dim. For SDR the raw cursor is composited in the
/// display's native sRGB space.
///
/// Returns `Ok(())` without drawing when the cursor is hidden or no shape has been captured.
/// The `CursorCompositor` is created on demand and the shape is re-uploaded whenever
/// `cursor_dirty` is set.
unsafe fn composite_cursor_gpu(&mut self, gpu: &ID3D11Texture2D, hdr: bool) -> Result<()> {
    // Throttled diagnostics: one line every 240 calls.
    self.dbg_cursor += 1;
    if self.dbg_cursor % 240 == 1 {
        tracing::debug!(
            visible = self.cursor_visible,
            pos = format!("{:?}", self.cursor_pos),
            shape = self
                .cursor_shape
                .as_ref()
                .map(|s| format!("{}x{}", s.w, s.h)),
            "cursor state"
        );
    }

    let drawable = self.cursor_visible && self.cursor_shape.is_some();
    if !drawable {
        return Ok(());
    }

    if self.cursor.is_none() {
        let fresh = CursorCompositor::new(&self.device)?;
        self.cursor = Some(fresh);
        // fresh device → must (re)upload the shape texture
        self.cursor_dirty = true;
    }
    if self.cursor_dirty {
        if let Some(shape) = &self.cursor_shape {
            self.cursor
                .as_mut()
                .unwrap()
                .set_shapes(&self.device, shape)?;
        }
        self.cursor_dirty = false;
    }

    let mut target_view: Option<ID3D11RenderTargetView> = None;
    self.device
        .CreateRenderTargetView(gpu, None, Some(&mut target_view))?;
    let target_view = target_view.context("cursor rtv")?;

    // HDR graphics-white target in nits → scRGB multiplier (scRGB 1.0 = 80 nits). Default 203
    // (BT.2408); PUNKTFUNK_HDR_CURSOR_NITS overrides without a rebuild. SDR → 1.0, no decode.
    let white_mul = if !hdr {
        1.0
    } else {
        let configured = std::env::var("PUNKTFUNK_HDR_CURSOR_NITS")
            .ok()
            .and_then(|s| s.parse::<f32>().ok())
            .filter(|n| n.is_finite() && *n > 0.0);
        configured.unwrap_or(203.0) / 80.0
    };

    let (cx, cy) = self.cursor_pos;
    let (frame_w, frame_h) = (self.width, self.height);
    let comp = self.cursor.as_ref().unwrap();

    // Alpha-blended layer (normal cursor pixels); HDR brightness scale applies here, and the
    // sRGB→linear decode is enabled only on the HDR (linear FP16) target.
    if let Some((srv, cw, ch)) = comp.tex_alpha.as_ref() {
        comp.draw_layer(
            &self.context,
            &target_view,
            frame_w,
            frame_h,
            cx,
            cy,
            srv,
            *cw,
            *ch,
            false,
            white_mul,
            hdr,
        );
    }
    // Inversion layer (masked-color I-beam bar / monochrome invert): operates on the framebuffer
    // reference, so it is never HDR-scaled or sRGB-decoded.
    if let Some((srv, cw, ch)) = comp.tex_xor.as_ref() {
        comp.draw_layer(
            &self.context,
            &target_view,
            frame_w,
            frame_h,
            cx,
            cy,
            srv,
            *cw,
            *ch,
            true,
            1.0,
            false,
        );
    }
    Ok(())
}
/// CHEAP recovery for the ACCESS_LOST *churn*: re-`DuplicateOutput` on the EXISTING device +
/// output. No new device/factory, so the encoder is NOT re-initialized and no black is seeded —
/// the existing `gpu_copy`/HDR textures/`last_present` are kept and frames resume immediately. This
/// is the right recovery for the HDR overlay-flip churn (the duplication is invalidated but the
/// output is still live). Returns false when the output can't be re-duplicated (desktop switch /
/// output gone) so the caller falls back to the full [`recreate_dupl`]. Probes the new duplication
/// (like recreate_dupl) so a born-lost one is rejected rather than adopted.
unsafe fn try_reduplicate(&mut self) -> bool {
if self.holding_frame {
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
self.holding_frame = false;
}
// RELEASE the old duplication FIRST (drop it → frees the output) before re-duplicating. DXGI
// allows one duplication per output; leaving the stale one alive is exactly why DuplicateOutput1
// returned E_ACCESSDENIED and the legacy fallback produced a born-lost dup.
self.dupl = None;
let dupl = match duplicate_output(&self.output, &self.device, self.want_hdr) {
Ok(d) => d,
Err(_) => return false,
};
// Adopt first (SAME device → existing gpu_copy/HDR textures/last_present stay valid), then probe
// + CAPTURE the frame: a born-lost duplication returns ACCESS_LOST immediately; alive-but-idle
// waits the full 16ms. On a real frame we present it (so a static desktop keeps a real
// last_present instead of the discarded one); idle keeps the existing last_present.
self.dupl = Some(dupl);
let mut info = DXGI_OUTDUPL_FRAME_INFO::default();
let mut res: Option<IDXGIResource> = None;
match self
.dupl
.as_ref()
.unwrap()
.AcquireNextFrame(16, &mut info, &mut res)
{
Ok(()) => {
self.update_cursor(&info);
if let Some(r) = res {
let _ = self.present_acquired(r);
}
}
Err(e) if e.code() == DXGI_ERROR_WAIT_TIMEOUT => {}
Err(_) => return false, // born-lost on the same output → need the full rebuild
}
true
}
/// ONE rebuild attempt — deliberately non-blocking. ACCESS_LOST fires on desktop switches
/// (normal ↔ Winlogon secure: lock/login/UAC) and on the mode change we issue at create. We
/// re-attach to the now-current input desktop and recreate the D3D11 device + duplication on it
/// (a device made on the previous desktop can't sustain a duplication on the new one). CRUCIAL:
/// no internal multi-second retry loop — during a secure-desktop dwell the SudoVDA output is
/// *gone* (`no DXGI output named …`), and a blocking retry here would starve the encode/send
/// loop of frames for seconds, so the client times out and disconnects (the bug this fixes).
/// Instead a single attempt returns immediately; the caller ([`acquire`]) repeats the last good
/// frame and retries on a throttle, so the session survives an arbitrarily long secure visit.
///
/// On success every device-bound resource (staging texture, GPU copy, cursor compositor, HDR
/// textures/converter, video processor, `last_present`) has been dropped so it is lazily
/// recreated on the new device, and `last_present` is re-established from a real frame or, in
/// GPU mode, a black seed.
///
/// # Errors
/// Returns the error from `reopen_duplication` when the device/duplication can't be recreated;
/// in that case `self.dupl` is left `None` and the previous device and its resources are kept.
unsafe fn recreate_dupl(&mut self) -> Result<()> {
    if self.holding_frame {
        let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
        self.holding_frame = false;
    }
    // The SudoVDA output's GDI name can CHANGE across a secure-desktop topology rebuild —
    // re-resolve from the STABLE target id so we find it under its current name.
    if let Some(n) = crate::win_display::resolve_gdi_name(self.target_id) {
        self.gdi_name = n;
    }
    // Re-sync the capture thread to the CURRENT input desktop on EVERY rebuild — symmetric for
    // ENTERING and LEAVING the secure (Winlogon) desktop. This is the fix for "UAC/lock appears
    // fine but breaks the instant you click out of it": leaving secure used to skip this (it was
    // gated on is_secure_desktop()), stranding the thread on the gone Winlogon desktop. Cheap +
    // leak-free (attach_input_desktop closes its handle). Apollo (syncThreadDesktop) does the same.
    // We do NOT re-isolate the display on recovery: the CCD isolation from create_monitor is
    // registry-persisted, and a CCD topology mutation here would itself invalidate the freshly-rebuilt
    // duplication → a self-feeding ACCESS_LOST storm (200 rebuilds/session observed before this).
    attach_input_desktop();
    // RELEASE the old duplication FIRST (frees the output). reopen_duplication creates a NEW device
    // and re-DuplicateOutputs the output; if the stale duplication is still alive it holds the output
    // and the new one is born-lost / E_ACCESSDENIED. (On reopen failure self.dupl stays None and
    // acquire's None-guard re-drives recovery.)
    self.dupl = None;
    let (dev, ctx, out, dupl) = reopen_duplication(&self.gdi_name, self.want_hdr)?; // Err → caller repeats + retries
    // (The born-lost guard is the capture-acquire at the end: we adopt, then grab the current
    // frame; ACCESS_LOST there means born-lost, and we seed black + let the throttled caller retry.)
    // A desktop switch can come back at a different size (e.g. the user session applies its own
    // resolution on login). Adopt it: update the dimensions so the dropped textures reallocate at
    // the new size. NVENC re-inits at the new size when it sees the frame.
    let dd: DXGI_OUTDUPL_DESC = dupl.GetDesc();
    let (nw, nh) = (dd.ModeDesc.Width, dd.ModeDesc.Height);
    tracing::info!(
        dxgi_format = dd.ModeDesc.Format.0,
        "DXGI duplication rebuilt (format: 87=BGRA8 24=R10G10B10A2 10=R16G16B16A16_FLOAT)"
    );
    if nw != self.width || nh != self.height {
        tracing::info!(
            old = format!("{}x{}", self.width, self.height),
            new = format!("{nw}x{nh}"),
            "DXGI duplication size changed across switch"
        );
        self.width = nw;
        self.height = nh;
    }
    self.device = dev;
    self.context = ctx;
    self.output = out;
    self.dupl = Some(dupl);
    // Everything below was created on the OLD device and must not be handed to the new
    // device's context. That includes the CPU-path staging texture: it has to go on EVERY
    // rebuild, not only on a size change, or a same-size rebuild would CopyResource/Map a
    // foreign-device texture.
    self.staging = None;
    self.gpu_copy = None; // stale: belonged to the old device
    self.cursor = None; // shaders/textures belonged to the old device; rebuilt on demand
    self.last_present = None; // belonged to the old device; reseeded below
    // Re-detect HDR and drop the HDR textures/converter (old device). Toggling HDR on or
    // off is exactly this path: the duplication comes back as FP16 (HDR) or BGRA8.
    self.hdr_fp16 = dd.ModeDesc.Format == DXGI_FORMAT_R16G16B16A16_FLOAT;
    // Re-read the source mastering metadata for the (possibly new) HDR output, or clear it on SDR.
    self.hdr_meta = if self.hdr_fp16 {
        read_output_hdr_meta(&self.output)
    } else {
        None
    };
    self.fp16_src = None;
    self.fp16_srv = None;
    self.hdr10_out = None;
    self.hdr_conv = None;
    // Video processor + its YUV output belonged to the old device / size / HDR-ness — rebuild lazily.
    self.video_conv = None;
    self.yuv_out = None;
    self.first_frame = true;
    // Capture the CURRENT desktop frame as `last_present` (instead of seeding black). The secure
    // (lock/login/UAC) desktop is STATIC, so DDA only emits a frame on change — if we seeded black
    // we'd stream black until the user pressed a key (the reported bug). A freshly-created
    // duplication's first AcquireNextFrame returns the full current desktop; grab it and present it,
    // so the client shows the real (frozen-until-it-changes) secure desktop. Born-lost (ACCESS_LOST
    // here) or no-initial-frame (timeout) → seed black as a fallback and let the throttled caller
    // retry — a brief black flash during the unsettled switch, then real content.
    nudge_cursor_onto(&self.output); // kick a change so a static desktop yields its first frame
    let mut info = DXGI_OUTDUPL_FRAME_INFO::default();
    let mut res: Option<IDXGIResource> = None;
    let captured = match self
        .dupl
        .as_ref()
        .unwrap() // set to `Some` just above
        .AcquireNextFrame(120, &mut info, &mut res)
    {
        Ok(()) => {
            // The duplication now holds a frame. Flag it up front so it is released on the next
            // acquire even if no resource came back or presenting bails out early.
            self.holding_frame = true;
            self.update_cursor(&info);
            match res {
                Some(r) => match self.present_acquired(r) {
                    Ok(_) => {
                        self.first_frame = false;
                        tracing::info!("DXGI recovery: captured real secure-desktop frame");
                        true
                    }
                    Err(e) => {
                        tracing::warn!(error = %format!("{e:#}"), "recovery: present_acquired failed");
                        false
                    }
                },
                None => false,
            }
        }
        Err(e) => {
            tracing::warn!(
                code = format!("{:#x}", e.code().0),
                "DXGI recovery: no initial frame (born-lost/idle) — seeding black, will retry"
            );
            false
        }
    };
    if !captured && self.gpu_mode {
        if let Err(e) = self.seed_black_gpu_frame() {
            tracing::warn!(error = %format!("{e:#}"), "seed black frame after recovery failed");
        }
    }
    // Track the born-lost storm: a rebuild that grabbed a real frame clears it; one that did not
    // advances it. `acquire` reads this counter for its throttled born-lost warning.
    if captured {
        self.consecutive_born_lost = 0;
    } else {
        self.consecutive_born_lost = self.consecutive_born_lost.saturating_add(1);
    }
    Ok(())
}
/// Acquire one frame: `Some` on a fresh image, `None` on timeout (no change → caller reuses last).
///
/// `Ok(None)` is also returned while a display mode change is in progress, while a lost
/// duplication is being recovered (the recovery attempts themselves are throttled), and after a
/// mid-stream size change has triggered a rebuild.
///
/// # Errors
/// Returns `Err` for any `AcquireNextFrame` failure that is not handled below (timeout, mode
/// change in progress, ACCESS_LOST, INVALID_CALL, DEVICE_REMOVED, DEVICE_RESET), when a
/// successful acquire yields no resource, and when `present_acquired` fails.
unsafe fn acquire(&mut self) -> Result<Option<CapturedFrame>> {
// Release the frame still held from an earlier call before acquiring the next one.
if self.holding_frame {
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
self.holding_frame = false;
}
let mut info = DXGI_OUTDUPL_FRAME_INFO::default();
let mut res: Option<IDXGIResource> = None;
// Wait up to 2000 ms while `first_frame` is set; otherwise use the configured `timeout_ms`.
let timeout = if self.first_frame {
2000
} else {
self.timeout_ms
};
// If a prior recovery released the old duplication but couldn't create a new one yet (output
// gone during a secure dwell, etc.), self.dupl is None — synthesize ACCESS_LOST so we flow into
// the recovery path below instead of panicking.
let acq = match self.dupl.as_ref() {
Some(d) => d.AcquireNextFrame(timeout, &mut info, &mut res),
None => Err(windows::core::Error::from_hresult(DXGI_ERROR_ACCESS_LOST)),
};
match acq {
Ok(()) => {
if self.first_frame {
tracing::info!(w = self.width, h = self.height, "DXGI first frame acquired");
self.first_frame = false;
}
self.consecutive_born_lost = 0; // a real frame breaks the born-lost storm
self.update_cursor(&info);
}
Err(e) if e.code() == DXGI_ERROR_WAIT_TIMEOUT => {
self.dbg_timeouts += 1;
if self.dbg_timeouts % 40 == 1 {
// A static desktop produces no DDA frames, so timeouts are NORMAL idle, not an error.
tracing::debug!(
timeouts = self.dbg_timeouts,
first_frame = self.first_frame,
"DXGI AcquireNextFrame timeout (no desktop change yet)"
);
}
return Ok(None);
}
// MODE_CHANGE_IN_PROGRESS (0x887A0025) is TRANSIENT by design ("the call may succeed at a
// later attempt") — the display topology is mid-settle (e.g. just after the IDD's mode is
// applied). Do NOT recover/rebuild: a rebuild re-issues create()→set_active_mode, re-touching
// the topology and PERPETUATING the change (the storm we measured). Just repeat the last frame
// and wait it out, like a timeout. Throttled log so a genuinely stuck change stays visible.
Err(e) if e.code() == DXGI_ERROR_MODE_CHANGE_IN_PROGRESS => {
// NOTE: shares the `dbg_timeouts` counter with the plain-timeout arm above.
self.dbg_timeouts += 1;
if self.dbg_timeouts % 120 == 1 {
tracing::warn!(
"DXGI mode change in progress (0x887A0025) — waiting for topology to settle"
);
}
return Ok(None);
}
// Recoverable losses, ALL handled by rebuilding the duplication (device + re-DuplicateOutput):
// ACCESS_LOST — desktop switch (normal <-> Winlogon secure: lock/login/UAC) or mode change
// INVALID_CALL — the secure->user-desktop switch (post-login) leaves the duplication in a
// state where AcquireNextFrame returns 0x887A0001; recreating recovers it.
// Previously fatal -> the stream dropped the instant the user logged in.
// DEVICE_REMOVED/RESET — GPU TDR / driver reset.
Err(e)
if e.code() == DXGI_ERROR_ACCESS_LOST
|| e.code() == DXGI_ERROR_INVALID_CALL
|| e.code() == DXGI_ERROR_DEVICE_REMOVED
|| e.code() == DXGI_ERROR_DEVICE_RESET =>
{
self.dbg_lost += 1;
// TIERED recovery. The HDR path produces a constant ACCESS_LOST *churn*: the
// duplication keeps getting invalidated (overlay/MPO flips that HDR makes aggressive)
// but the OUTPUT stays valid — a probe passes, the dup lives briefly, dies, repeats.
// For that, the cheap fix is a fresh DuplicateOutput on the SAME device+output: no new
// device/factory → NO encoder re-init, NO black seed → frames stay near-continuous
// (this is what makes HDR animations smooth). Only a genuine output loss (secure-desktop
// switch, where DISPLAY10 is gone) or a dead device needs the full rebuild — and THAT
// is throttled so a long secure dwell doesn't hammer DuplicateOutput / starve the
// client (between attempts we repeat the last frame).
// A removed/reset device skips the cheap re-duplicate (which reuses the same device).
let device_dead =
e.code() == DXGI_ERROR_DEVICE_REMOVED || e.code() == DXGI_ERROR_DEVICE_RESET;
if self.dbg_lost % 64 == 1 {
tracing::warn!(
lost = self.dbg_lost,
code = format!("{:#x}", e.code().0),
"DXGI capture lost — recovering (cheap re-duplicate, full rebuild if output gone)"
);
}
// GENTLE recovery. On the secure (Winlogon) desktop the duplication dies on EVERY
// independent-flip; a tight re-duplicate loop tears the duplication down + brings it up
// hundreds of times/sec — that release/recreate cycle is the real kernel stress (and it
// stalls the send thread long enough that the client times out → "display disconnected").
// So instead of fighting it: cap recovery HARD and just repeat the last frame in between
// (no busy-spin, no per-flip teardown). The session stays alive across a secure dwell; the
// lock/UAC screen is frozen/laggy, then capture resumes cleanly when the desktop returns.
// Tunable: PUNKTFUNK_RECOVER_MS (cheap re-duplicate cadence, default 250) and
// PUNKTFUNK_REBUILD_MS (heavy new-device rebuild cadence, default 1500).
let recover_ms = std::env::var("PUNKTFUNK_RECOVER_MS")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(250u64);
let now = Instant::now();
if self
.last_recover
.is_some_and(|t| now.duration_since(t) < Duration::from_millis(recover_ms))
{
return Ok(None); // repeat the last frame; do NOT tear down/recreate yet
}
self.last_recover = Some(now);
if !device_dead && self.try_reduplicate() {
// Cheap recovery succeeded (same device, no teardown of the device/monitor).
self.first_frame = true;
return Ok(None);
}
// Heavy full rebuild (new device) — the costliest teardown/recreate, so throttle it the
// hardest. Only when the cheap re-duplicate keeps failing (genuine output/device loss).
let rebuild_ms = std::env::var("PUNKTFUNK_REBUILD_MS")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(1500u64);
let now = Instant::now();
let due = self
.last_rebuild
.is_none_or(|t| now.duration_since(t) >= Duration::from_millis(rebuild_ms));
if due {
// Stamp the attempt before running it, so a failed rebuild is throttled as well.
self.last_rebuild = Some(now);
if self.recreate_dupl().is_ok() {
self.first_frame = true;
}
}
// Born-lost rebuilds (created OK, instant ACCESS_LOST) used to escalate to a full pipeline
// cold-rebuild here — but that re-issued vd.create()→set_active_mode (an audible PnP
// add/remove chime + a fresh topology mode change), which never converged and amplified
// the storm. With the topology fix (set_active_mode no longer promotes the IDD to PRIMARY
// by default) the born-lost storm is gone at its source; if one ever recurs, just keep
// repeating the last frame in-process — never tear the IDD down mid-session (Apollo never
// does). Throttled visibility only.
if self.consecutive_born_lost > 0 && self.consecutive_born_lost % 40 == 1 {
tracing::warn!(
consecutive = self.consecutive_born_lost,
"DDA born-lost rebuilds — repeating last frame in-process (no teardown)"
);
}
return Ok(None);
}
// Anything else is not treated as recoverable here and is propagated to the caller.
Err(e) => return Err(e).context("AcquireNextFrame"),
}
let res = res.context("AcquireNextFrame: null resource")?;
// Detect a mode/format change on the hot path. The desktop can flip HDR<->SDR (FP16<->BGRA —
// e.g. the SudoVDA output dropping out of HDR for the secure desktop) or change resolution
// WITHOUT raising ACCESS_LOST; `hdr_fp16`/`width`/`height` would then be stale and
// `present_acquired` would CopyResource into a mismatched-format/size target — corruption, or
// the secure-desktop "works once, then HDR breaks" bug. Re-read the acquired texture's desc
// every frame (Apollo does this) and rebuild on a real change instead of presenting a
// mismatched frame. Throttled like the ACCESS_LOST path so a flapping toggle can't hammer
// DuplicateOutput.
if let Ok(tex) = res.cast::<ID3D11Texture2D>() {
let mut d = D3D11_TEXTURE2D_DESC::default();
tex.GetDesc(&mut d);
// Only a real SIZE change is reliably detectable here. Format/HDR is NOT: legacy
// DuplicateOutput always hands back an 8-bit BGRA surface regardless of the output's FP16
// scanout mode, so comparing the acquired-texture format against `hdr_fp16` (derived from
// the OUTDUPL ModeDesc) self-fires every frame → a rebuild storm. A genuine resolution
// change is caught here; a real HDR↔SDR toggle arrives as ACCESS_LOST → recreate_dupl
// re-detects it. (Genuine FP16 capture is a separate change: DuplicateOutput1.)
if d.Width != self.width || d.Height != self.height {
tracing::info!(
old = format!("{}x{}", self.width, self.height),
new = format!("{}x{}", d.Width, d.Height),
"DXGI capture size changed mid-stream — rebuilding"
);
// Release the mismatched frame right away; `holding_frame` was never set for it.
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
let now = Instant::now();
// Shares the `last_rebuild` timestamp with the loss path, but with a fixed 250 ms interval.
let due = self
.last_rebuild
.is_none_or(|t| now.duration_since(t) >= Duration::from_millis(250));
if due {
self.last_rebuild = Some(now);
if self.recreate_dupl().is_ok() {
self.first_frame = true;
}
}
return Ok(None);
}
}
Ok(Some(self.present_acquired(res)?))
}
/// Turn a freshly-acquired duplication resource into a `CapturedFrame` and record it as
/// `last_present`. Factored out of [`acquire`] so the recovery path ([`recreate_dupl`]) can grab
/// the CURRENT desktop frame instead of seeding black: the secure (lock/login/UAC) desktop is
/// static, so DDA emits no change-frame to replace a black seed — the cause of the black-screen-
/// until-you-press-a-key bug. The caller has already `AcquireNextFrame`d; this releases it.
unsafe fn present_acquired(&mut self, res: IDXGIResource) -> Result<CapturedFrame> {
self.holding_frame = true;
let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?;
if self.gpu_mode && self.hdr_fp16 {
// HDR zero-copy path: the duplication surface is scRGB FP16 (R16G16B16A16_FLOAT) — it can't
// be CopyResource'd into a BGRA target (that was the freeze + cursor-trail bug). Copy it into
// an FP16 SRV texture (same format → valid), composite the cursor onto it (the cursor lands
// at ~SDR-white brightness, then goes through the PQ curve correctly), then convert scRGB →
// BT.2020 PQ 10-bit into hdr10_out and hand THAT to NVENC (HEVC Main10 / HDR10).
self.ensure_fp16_src()?;
let src = self.fp16_src.clone().context("fp16 src texture")?;
self.context.CopyResource(&src, &tex);
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
self.holding_frame = false;
self.composite_cursor_gpu(&src, true)?; // onto the FP16 surface (HDR: decode + nits scale)
// Video-engine path: scRGB FP16 → BT.2020 PQ P010 on the VIDEO engine (no 3D shader, and
// NVENC encodes P010 natively). Fall back to the HdrConverter pixel shader (3D) only if the
// video processor is unavailable.
if let Some(p010) = (!self.chroma_444)
.then(|| self.convert_to_yuv(&src, true))
.flatten()
{
self.last_present = Some((p010.clone(), PixelFormat::P010));
return Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::P010,
payload: FramePayload::D3d11(D3d11Frame {
texture: p010,
device: self.device.clone(),
}),
});
}
self.ensure_hdr10_out()?;
let out = self.hdr10_out.clone().context("hdr10 out texture")?;
if self.hdr_conv.is_none() {
self.hdr_conv = Some(HdrConverter::new(&self.device)?);
}
let srv = self.fp16_srv.clone().context("fp16 srv")?;
let mut rtv: Option<ID3D11RenderTargetView> = None;
self.device
.CreateRenderTargetView(&out, None, Some(&mut rtv))?;
let rtv = rtv.context("hdr10 rtv")?;
self.hdr_conv.as_ref().unwrap().convert(
&self.context,
&srv,
&rtv,
self.width,
self.height,
);
self.last_present = Some((out.clone(), PixelFormat::Rgb10a2));
return Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Rgb10a2,
payload: FramePayload::D3d11(D3d11Frame {
texture: out,
device: self.device.clone(),
}),
});
}
if self.gpu_mode {
// Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication
// surface into a reused owned texture, release the duplication frame, hand off the texture.
// NOTE: do NOT convert the duplication surface directly on the video processor to skip this
// copy — the VP colour-convert (3D/compute on NVIDIA) holds the DDA surface until it
// completes, blocking ReleaseFrame/AcquireNextFrame and SERIALIZING capture+convert (~60 fps,
// encode_us 15-20 ms measured). The fast same-format CopyResource decouples them: it releases
// the DDA frame immediately so the convert runs independently (40-200 fps). Worth ~5% 3D.
self.ensure_gpu_copy()?;
let gpu = self.gpu_copy.clone().context("gpu copy texture")?;
self.context.CopyResource(&gpu, &tex);
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
self.holding_frame = false;
self.composite_cursor_gpu(&gpu, false)?;
// Prefer the video-engine YUV path (BGRA → NV12 on the video engine) so the colour
// conversion AND NVENC's encode stay OFF the 3D engine — the only way to keep up when a
// game pins the 3D engine at ~100%. Fall back to handing NVENC the BGRA texture (it then
// does RGB→YUV internally on the 3D/compute engine).
if let Some(nv12) = (!self.chroma_444)
.then(|| self.convert_to_yuv(&gpu, false))
.flatten()
{
self.last_present = Some((nv12.clone(), PixelFormat::Nv12));
return Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Nv12,
payload: FramePayload::D3d11(D3d11Frame {
texture: nv12,
device: self.device.clone(),
}),
});
}
self.last_present = Some((gpu.clone(), PixelFormat::Bgra));
return Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Bgra,
payload: FramePayload::D3d11(D3d11Frame {
texture: gpu,
device: self.device.clone(),
}),
});
}
self.ensure_staging()?;
let staging = self.staging.clone().context("staging texture")?;
self.context.CopyResource(&staging, &tex);
let mut map = D3D11_MAPPED_SUBRESOURCE::default();
self.context
.Map(&staging, 0, D3D11_MAP_READ, 0, Some(&mut map))
.context("Map staging")?;
let (w, h) = (self.width as usize, self.height as usize);
let pitch = map.RowPitch as usize;
let src = std::slice::from_raw_parts(map.pData as *const u8, pitch * h);
let mut tight = depad_bgra(src, pitch, w, h);
self.context.Unmap(&staging, 0);
let _ = self.dupl.as_ref().map(|d| d.ReleaseFrame());
self.holding_frame = false;
if self.cursor_visible {
if let Some(shape) = &self.cursor_shape {
let (cx, cy) = self.cursor_pos;
if let Some(bgra) = &shape.alpha {
blend_cursor_cpu(
&mut tight,
self.width,
self.height,
bgra,
shape.w,
shape.h,
cx,
cy,
false,
);
}
if let Some(bgra) = &shape.xor {
blend_cursor_cpu(
&mut tight,
self.width,
self.height,
bgra,
shape.w,
shape.h,
cx,
cy,
true,
);
}
}
}
self.last = Some(tight.clone());
Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Bgra,
payload: FramePayload::Cpu(tight),
})
}
}
/// Current wall-clock time as nanoseconds since the Unix epoch.
///
/// Yields `0` when the system clock reads earlier than the epoch (the only way
/// `duration_since(UNIX_EPOCH)` fails). The `u128` nanosecond count is narrowed to `u64` with a
/// plain cast.
fn now_ns() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_nanos() as u64,
        Err(_) => 0,
    }
}
impl Capturer for DuplCapturer {
    /// HDR metadata for the stream, reported only while the duplication is actually HDR (FP16).
    ///
    /// Returns `None` whenever `hdr_fp16` is unset, regardless of what is stored in `hdr_meta`.
    fn hdr_meta(&self) -> Option<punktfunk_core::quic::HdrMeta> {
        self.hdr_meta.filter(|_| self.hdr_fp16)
    }

    /// Produces the next frame to encode: a freshly acquired one if available, otherwise a
    /// repeat of the last frame that was presented.
    ///
    /// # Errors
    ///
    /// Propagates any error from `acquire`. Also fails if no frame at all (fresh or repeatable)
    /// shows up within 20 s and the capturer has never delivered a frame.
    fn next_frame(&mut self) -> Result<CapturedFrame> {
        // Generous on purpose: a secure-desktop switch can take several seconds to settle
        // (re-resolve + recreate the duplication up to 12 s). A few seconds of frozen last frame
        // beats dropping the stream.
        const FRAME_WAIT: Duration = Duration::from_secs(20);
        let mut deadline = Instant::now() + FRAME_WAIT;
        loop {
            // SAFETY: `acquire` is an `unsafe fn` because it drives the D3D11 immediate context
            // and the output duplication, both of which must only be touched from the capturer's
            // owning thread. `DuplCapturer` is `Send` but not `Sync`, so a single (encode) thread
            // owns it for its whole life and `next_frame` runs there; `&mut self` makes the access
            // exclusive for the duration of the call.
            let fresh = unsafe { self.acquire() }?;
            if let Some(frame) = fresh {
                self.ever_got_frame = true;
                return Ok(frame);
            }

            // Nothing new. In GPU mode, re-send the last presented GPU texture (SDR BGRA or HDR
            // 10-bit) so the encoder stays on a matching format through a static desktop or a
            // mid-rebuild gap.
            let gpu_repeat = self.last_present.as_ref().filter(|_| self.gpu_mode);
            if let Some((texture, pixel_format)) = gpu_repeat {
                return Ok(CapturedFrame {
                    width: self.width,
                    height: self.height,
                    pts_ns: now_ns(),
                    format: *pixel_format,
                    payload: FramePayload::D3d11(D3d11Frame {
                        texture: texture.clone(),
                        device: self.device.clone(),
                    }),
                });
            }

            // Otherwise repeat the last CPU-side BGRA buffer, if one exists.
            if let Some(pixels) = self.last.as_ref() {
                return Ok(CapturedFrame {
                    width: self.width,
                    height: self.height,
                    pts_ns: now_ns(),
                    format: PixelFormat::Bgra,
                    payload: FramePayload::Cpu(pixels.clone()),
                });
            }

            // No fresh frame and nothing to repeat: keep polling until the deadline passes.
            if Instant::now() <= deadline {
                continue;
            }
            // The deadline is fatal only before the first frame — a genuine "monitor never lit
            // up". Once we have streamed at least once, a frame drought (long secure-desktop
            // dwell, slow rebuild) just means no NEW frame yet.
            if !self.ever_got_frame {
                return Err(anyhow!(
                    "no DXGI frame within 20s (SudoVDA monitor not activated by a WDDM GPU?)"
                ));
            }
            // Re-arm the deadline and keep the session alive.
            deadline = Instant::now() + FRAME_WAIT;
        }
    }

    /// Single non-repeating poll: forwards whatever `acquire` reports (`Ok(None)` when there is
    /// no new frame) without falling back to a previous frame.
    fn try_latest(&mut self) -> Result<Option<CapturedFrame>> {
        // SAFETY: same contract as in `next_frame` — `acquire` has to run on the capturer's one
        // owning thread, which is where `try_latest` is called (`DuplCapturer` is `Send`, not
        // `Sync`), and `&mut self` guarantees exclusive access.
        unsafe { self.acquire() }
    }

    /// Records the requested active state in the shared `active` flag (relaxed atomic store).
    fn set_active(&self, active: bool) {
        self.active.store(active, Ordering::Relaxed);
    }
}
impl Drop for DuplCapturer {
    /// Hands back any duplication frame still held, then clears the execution state that was
    /// taken at open().
    fn drop(&mut self) {
        if self.holding_frame {
            if let Some(dupl) = self.dupl.as_ref() {
                // SAFETY: `dupl` is the live `IDXGIOutputDuplication` this capturer created and
                // owns, and `ReleaseFrame` is a valid COM method on it. `holding_frame` records
                // that a frame was acquired and not yet released, so this release is balanced.
                // Drop runs on the thread that owns the capturer (its sole owner, as the type is
                // `!Sync`), and the borrow of the duplication outlives this synchronous call.
                // The result is deliberately ignored: nothing useful can be done with it here.
                let _ = unsafe { dupl.ReleaseFrame() };
            }
        }

        // Give up the display/system-required execution state requested at open().
        // SAFETY: `SetThreadExecutionState` is a Win32 FFI call that takes a flag bitmask by
        // value (`ES_CONTINUOUS` alone clears the display/system-required state); it borrows no
        // Rust memory and may be called from any thread.
        unsafe {
            SetThreadExecutionState(ES_CONTINUOUS);
        }
        // `_keepalive` drops after this body runs, REMOVEing the SudoVDA monitor.
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `pack_luid` puts `HighPart` in the upper 32 bits and `LowPart` in the lower 32 bits.
    #[test]
    fn pack_luid_roundtrip() {
        let luid = LUID {
            LowPart: 0x1234_5678,
            HighPart: 0x0000_0009,
        };
        let expected = (0x9i64 << 32) | 0x1234_5678;
        assert_eq!(pack_luid(luid), expected);
    }

    /// A NUL-padded UTF-16 device name matches exactly its own GDI name and no other.
    #[test]
    fn gdi_name_match() {
        let device = r"\\.\DISPLAY3";
        let mut wide = [0u16; 32];
        for (slot, unit) in wide.iter_mut().zip(device.encode_utf16()) {
            *slot = unit;
        }
        assert!(gdi_name_matches(&wide, device));
        assert!(!gdi_name_matches(&wide, r"\\.\DISPLAY1"));
    }

    /// Row padding introduced by the pitch is stripped, leaving tightly packed pixel rows.
    #[test]
    fn depad_removes_row_padding() {
        // 2x2 BGRA, pitch = 12 (row=8 + 4 pad bytes).
        let pitch = 12;
        let mut src = vec![0u8; pitch * 2];
        for (y, row) in src.chunks_exact_mut(pitch).enumerate() {
            // Only the 8 payload bytes of each row get a value; the padding stays zero.
            for (x, byte) in row[..8].iter_mut().enumerate() {
                *byte = (y * 8 + x) as u8;
            }
        }
        let out = depad_bgra(&src, pitch, 2, 2);
        assert_eq!(out.len(), 16);
        let expected: Vec<u8> = (0..16).collect();
        assert_eq!(&out[0..8], &expected[0..8]);
        assert_eq!(&out[8..16], &expected[8..16]);
    }
}