6c2942ee45
android / android (push) Has been cancelled
apple / screenshots (push) Has been cancelled
apple / swift (push) Has been cancelled
ci / web (push) Has been cancelled
ci / docs-site (push) Has been cancelled
ci / bench (push) Has been cancelled
deb / build-publish (push) Has been cancelled
ci / rust (push) Has been cancelled
decky / build-publish (push) Has been cancelled
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Has been cancelled
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Has been cancelled
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Has been cancelled
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Has been cancelled
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Has been cancelled
docker / deploy-docs (push) Has been cancelled
windows-host / package (push) Failing after 11s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Has been cancelled
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Has been cancelled
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1133 lines
53 KiB
Rust
1133 lines
53 KiB
Rust
//! Shared Windows GPU primitives — D3D11 device creation, GPU scheduling priority hooks,
|
||
//! HLSL shader compilation, HDR FP16→P010 conversion ([`HdrP010Converter`]), video-engine
|
||
//! colour conversion ([`VideoConverter`]), and the IDD-push capture identity
|
||
//! ([`WinCaptureTarget`], [`pack_luid`]). Consumed by [`super::idd_push`].
|
||
//! DXGI Desktop Duplication has been removed; this module contains no capturer.
|
||
|
||
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
|
||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||
|
||
use anyhow::{bail, Context, Result};
|
||
use std::ffi::c_void;
|
||
use std::sync::atomic::{AtomicU64, Ordering};
|
||
use windows::core::{s, Interface, PCSTR};
|
||
use windows::Win32::Foundation::{HMODULE, LUID};
|
||
use windows::Win32::Graphics::Direct3D::Fxc::D3DCompile;
|
||
use windows::Win32::Graphics::Direct3D::{
|
||
ID3DBlob, D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0, D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST,
|
||
};
|
||
use windows::Win32::Graphics::Direct3D11::{
|
||
D3D11CreateDevice, ID3D11Buffer, ID3D11Device, ID3D11DeviceContext, ID3D11PixelShader,
|
||
ID3D11RenderTargetView, ID3D11SamplerState, ID3D11ShaderResourceView, ID3D11Texture2D,
|
||
ID3D11VertexShader, D3D11_BIND_CONSTANT_BUFFER, D3D11_BIND_RENDER_TARGET,
|
||
D3D11_BIND_SHADER_RESOURCE, D3D11_BUFFER_DESC, D3D11_COMPARISON_NEVER, D3D11_CPU_ACCESS_READ,
|
||
D3D11_CPU_ACCESS_WRITE, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_FILTER_MIN_MAG_MIP_POINT,
|
||
D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_MAP_WRITE_DISCARD,
|
||
D3D11_RENDER_TARGET_VIEW_DESC, D3D11_RENDER_TARGET_VIEW_DESC_0, D3D11_RTV_DIMENSION_TEXTURE2D,
|
||
D3D11_SAMPLER_DESC, D3D11_SDK_VERSION, D3D11_SUBRESOURCE_DATA, D3D11_TEX2D_RTV,
|
||
D3D11_TEXTURE2D_DESC, D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, D3D11_USAGE_DYNAMIC,
|
||
D3D11_USAGE_STAGING, D3D11_VIEWPORT,
|
||
};
|
||
use windows::Win32::Graphics::Dxgi::Common::{
|
||
DXGI_FORMAT, DXGI_FORMAT_P010, DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16_UNORM,
|
||
DXGI_FORMAT_R16_UNORM, DXGI_SAMPLE_DESC,
|
||
};
|
||
use windows::Win32::Graphics::Dxgi::{IDXGIAdapter1, IDXGIDevice, IDXGIDevice1};
|
||
|
||
/// Identity of the output captured by the IDD-push path.
///
/// Combines the adapter (as a packed LUID, the same layout [`pack_luid`] produces) with the
/// output's current GDI device name and its stable SudoVDA target id. `Debug`/`PartialEq`/`Eq`
/// are derived so targets can be logged and compared directly.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct WinCaptureTarget {
    /// Packed DXGI adapter LUID (`(HighPart << 32) | (LowPart & 0xffff_ffff)`).
    pub adapter_luid: i64,
    /// The output's GDI device name, e.g. `\\.\DISPLAY3`. Can CHANGE across a secure-desktop switch.
    pub gdi_name: String,
    /// Stable SudoVDA target id — re-resolved to the current GDI name on every recovery.
    pub target_id: u32,
}
|
||
|
||
/// A GPU-resident captured texture (future NVENC-D3D11 zero-copy path).
///
/// Both fields are COM interface pointers; the type is made `Send` (but not `Sync`) below so a
/// frame can be moved from the capture thread to the encode thread.
pub struct D3d11Frame {
    /// The texture holding the captured frame.
    pub texture: ID3D11Texture2D,
    /// D3D11 device carried alongside `texture` (presumably the device that owns it — confirm
    /// against the code that constructs frames).
    pub device: ID3D11Device,
}
// SAFETY: `D3d11Frame` owns an `ID3D11Texture2D` + `ID3D11Device`, which are COM interface pointers.
// D3D11 devices/resources use thread-safe (interlocked) COM reference counting, and the device is
// created free-threaded (`make_device` passes no `D3D11_CREATE_DEVICE_SINGLETHREADED`), so handing
// ownership of the frame to another thread — the capture→encode handoff — and releasing it there is
// sound. The value is moved, never aliased (no `Sync`), so there is no concurrent use of the
// single-threaded immediate context.
unsafe impl Send for D3d11Frame {}
|
||
|
||
pub fn pack_luid(luid: LUID) -> i64 {
|
||
((luid.HighPart as i64) << 32) | (luid.LowPart as i64 & 0xffff_ffff)
|
||
}
|
||
|
||
/// Create a fresh D3D11 device + context on a specific adapter (driver_type UNKNOWN with an explicit
|
||
/// adapter). Used at open and on every ACCESS_LOST: a device created on one desktop cannot sustain a
|
||
/// duplication on a *different* desktop (perpetual ACCESS_LOST), so the secure-desktop switch needs a
|
||
/// device made while the thread is attached to that desktop.
|
||
pub(crate) unsafe fn make_device(
|
||
adapter: &IDXGIAdapter1,
|
||
) -> Result<(ID3D11Device, ID3D11DeviceContext)> {
|
||
let mut device: Option<ID3D11Device> = None;
|
||
let mut context: Option<ID3D11DeviceContext> = None;
|
||
D3D11CreateDevice(
|
||
adapter,
|
||
D3D_DRIVER_TYPE_UNKNOWN,
|
||
HMODULE::default(),
|
||
D3D11_CREATE_DEVICE_BGRA_SUPPORT,
|
||
Some(&[D3D_FEATURE_LEVEL_11_0]),
|
||
D3D11_SDK_VERSION,
|
||
Some(&mut device),
|
||
None,
|
||
Some(&mut context),
|
||
)
|
||
.context("D3D11CreateDevice")?;
|
||
let device = device.context("null D3D11 device")?;
|
||
let context = context.context("null D3D11 context")?;
|
||
|
||
// GPU scheduling hardening — the same approach Sunshine/Apollo use, reimplemented here via the
|
||
// documented D3DKMT/DXGI APIs (no GPL source copied). Our capture+encode
|
||
// shares the GPU with the streamed game; when the game saturates the GPU our process is starved of
|
||
// GPU time slices, so NVENC sits near-idle yet `lock_bitstream` waits ~20 ms for our context to be
|
||
// scheduled — capping the stream (~47 fps measured at 5K@240) and stuttering. Per-frame copy/convert
|
||
// is NOT the cause (zero-copy + thread-priority alone didn't move it); the PROCESS-level GPU
|
||
// scheduling priority class is the decisive cross-process lever. Secondary: the absolute per-device
|
||
// GPU thread priority and a 1-frame latency cap.
|
||
elevate_process_gpu_priority();
|
||
if let Ok(dxgi_dev) = device.cast::<IDXGIDevice>() {
|
||
// The absolute max GPU thread priority (0x4000001E; the same value Sunshine/Apollo use); fall back to relative +7.
|
||
if dxgi_dev.SetGPUThreadPriority(0x4000_001E).is_err()
|
||
&& dxgi_dev.SetGPUThreadPriority(7).is_err()
|
||
{
|
||
tracing::warn!("SetGPUThreadPriority failed (run as admin/SYSTEM for GPU priority)");
|
||
}
|
||
}
|
||
if let Ok(dxgi1) = device.cast::<IDXGIDevice1>() {
|
||
let _ = dxgi1.SetMaximumFrameLatency(1);
|
||
}
|
||
Ok((device, context))
|
||
}
|
||
|
||
/// Resolve the configured GPU scheduling-priority class from `PUNKTFUNK_GPU_PRIORITY_CLASS`
/// (`off|normal|high|realtime`, default high). `None` = leave it at the OS default (the `off` opt-out).
/// D3DKMT_SCHEDULINGPRIORITYCLASS: IDLE 0, BELOW_NORMAL 1, NORMAL 2, ABOVE_NORMAL 3, HIGH 4, REALTIME 5.
///
/// The value is trimmed and compared ASCII case-insensitively, so `OFF`, `Off` or `off ` are
/// honoured as the opt-out instead of silently falling through to the HIGH default. An unset,
/// non-Unicode or unrecognised value yields HIGH (4).
fn configured_gpu_priority_class() -> Option<i32> {
    let value = std::env::var("PUNKTFUNK_GPU_PRIORITY_CLASS")
        .ok()
        .map(|raw| raw.trim().to_ascii_lowercase());
    match value.as_deref() {
        Some("off") => None,
        Some("normal") => Some(2),
        Some("realtime") => Some(5),
        _ => Some(4), // HIGH — safe on NVIDIA+HAGS (realtime can freeze NVENC)
    }
}
|
||
|
||
/// Enable SE_INC_BASE_PRIORITY on the CURRENT process token (best-effort) — the kernel gates the
|
||
/// HIGH/REALTIME GPU scheduling-priority bump on it. Held by SYSTEM/Administrators; a UAC-FILTERED
|
||
/// token does NOT have it, which is why `elevate_process_gpu_priority` may silently no-op in a
|
||
/// restricted service context.
|
||
unsafe fn enable_inc_base_priority() {
|
||
use windows::core::PCWSTR;
|
||
use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID};
|
||
use windows::Win32::Security::{
|
||
AdjustTokenPrivileges, LookupPrivilegeValueW, LUID_AND_ATTRIBUTES,
|
||
SE_INC_BASE_PRIORITY_NAME, SE_PRIVILEGE_ENABLED, TOKEN_ADJUST_PRIVILEGES, TOKEN_PRIVILEGES,
|
||
TOKEN_QUERY,
|
||
};
|
||
use windows::Win32::System::Threading::{GetCurrentProcess, OpenProcessToken};
|
||
let mut token = HANDLE::default();
|
||
if OpenProcessToken(
|
||
GetCurrentProcess(),
|
||
TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY,
|
||
&mut token,
|
||
)
|
||
.is_ok()
|
||
{
|
||
let mut luid = LUID::default();
|
||
if LookupPrivilegeValueW(PCWSTR::null(), SE_INC_BASE_PRIORITY_NAME, &mut luid).is_ok() {
|
||
let tp = TOKEN_PRIVILEGES {
|
||
PrivilegeCount: 1,
|
||
Privileges: [LUID_AND_ATTRIBUTES {
|
||
Luid: luid,
|
||
Attributes: SE_PRIVILEGE_ENABLED,
|
||
}],
|
||
};
|
||
if AdjustTokenPrivileges(
|
||
token,
|
||
false,
|
||
Some(&tp as *const TOKEN_PRIVILEGES),
|
||
0,
|
||
None,
|
||
None,
|
||
)
|
||
.is_err()
|
||
{
|
||
tracing::warn!("could not enable SE_INC_BASE_PRIORITY for GPU priority");
|
||
}
|
||
}
|
||
let _ = CloseHandle(token);
|
||
}
|
||
}
|
||
|
||
/// Call `gdi32!D3DKMTSetProcessSchedulingPriorityClass(process, prio)` (no stable windows-rs binding —
|
||
/// loaded by name). Returns the NTSTATUS (0 = success) or `None` if the export can't be resolved. The
|
||
/// CALLING process must hold SE_INC_BASE_PRIORITY ([`enable_inc_base_priority`]) for HIGH/REALTIME; the
|
||
/// kernel checks the caller's privilege whether the target is self or a child we created.
|
||
unsafe fn d3dkmt_set_scheduling_priority_class(
|
||
process: windows::Win32::Foundation::HANDLE,
|
||
prio: i32,
|
||
) -> Option<i32> {
|
||
use windows::core::s;
|
||
use windows::Win32::Foundation::HANDLE;
|
||
use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
|
||
let gdi32 = LoadLibraryA(s!("gdi32.dll")).ok()?;
|
||
let p = GetProcAddress(gdi32, s!("D3DKMTSetProcessSchedulingPriorityClass"))?;
|
||
type SetPrio = unsafe extern "system" fn(HANDLE, i32) -> i32;
|
||
let f: SetPrio = std::mem::transmute(p);
|
||
Some(f(process, prio))
|
||
}
|
||
|
||
/// GPU scheduling-priority hardening — the same approach as Sunshine/Apollo, independently
|
||
/// implemented via the documented D3DKMT APIs (no GPL source copied). On a
|
||
/// GPU-saturated game our capture+encode process is starved of GPU time slices — NVENC sits ~idle but
|
||
/// `lock_bitstream` waits ~20 ms for our context to be scheduled. Elevating the PROCESS GPU scheduling
|
||
/// priority class (the strong cross-process lever — far more effective than `SetGPUThreadPriority`
|
||
/// alone, which we measured as no help) lets our brief encode preempt the game. Uses HIGH, NOT
|
||
/// realtime: realtime on NVIDIA + HAGS can freeze/crash NVENC (Apollo downgrades it for exactly this).
|
||
/// Runs once per process; best-effort. `PUNKTFUNK_GPU_PRIORITY_CLASS = off|normal|high|realtime`
|
||
/// (default high). Best-effort: silently no-ops under a UAC-filtered token (the process will not
|
||
/// hold SE_INC_BASE_PRIORITY, so the D3DKMT call is a no-op).
|
||
fn elevate_process_gpu_priority() {
|
||
use std::sync::Once;
|
||
static ONCE: Once = Once::new();
|
||
// SAFETY: the closure calls two of this module's `unsafe fn`s — `enable_inc_base_priority`
|
||
// (adjusts the current-process token; it has no caller precondition and builds all its FFI args
|
||
// locally) and `d3dkmt_set_scheduling_priority_class` (loads gdi32 by name and calls the export).
|
||
// The latter requires `process` to be a valid process handle; `GetCurrentProcess()` returns the
|
||
// current-process pseudo-handle, which is always valid and needs no close. Runs once via
|
||
// `Once::call_once`; no raw pointers are dereferenced here.
|
||
ONCE.call_once(|| unsafe {
|
||
use windows::Win32::System::Threading::GetCurrentProcess;
|
||
let Some(prio) = configured_gpu_priority_class() else {
|
||
tracing::info!("GPU process scheduling priority class left at default (off)");
|
||
return;
|
||
};
|
||
enable_inc_base_priority();
|
||
match d3dkmt_set_scheduling_priority_class(GetCurrentProcess(), prio) {
|
||
Some(0) => tracing::info!(
|
||
priority_class = prio,
|
||
"GPU process scheduling priority class set (2=normal 4=high 5=realtime)"
|
||
),
|
||
Some(st) => tracing::warn!(
|
||
status = format!("0x{st:08X}"),
|
||
"D3DKMTSetProcessSchedulingPriorityClass failed (run as admin/SYSTEM for GPU priority)"
|
||
),
|
||
None => tracing::warn!("D3DKMTSetProcessSchedulingPriorityClass export not found"),
|
||
}
|
||
});
|
||
}
|
||
|
||
/// How many times DXGI has actually called our hooked `NtGdiDdDDIGetCachedHybridQueryValue`. If this
/// stays 0 while DDA churns with ACCESS_LOST, the hook is NOT on DXGI's GPU-preference path on this
/// build (so reparenting can't be the cause — look at composition/independent-flip instead). >0 with
/// continuing churn means the hook fires but reparenting isn't the trigger here.
///
/// Incremented (relaxed ordering) on every entry to `hybrid_query_hook`, including calls that are
/// rejected for a null argument; no reader is visible in this part of the file.
static HYBRID_HOOK_HITS: AtomicU64 = AtomicU64::new(0);
|
||
|
||
// kernel32 — declared directly so we don't pull the whole Win32_System_Diagnostics_Debug feature for
// one call. FlushInstructionCache serializes the i-cache after the inline patch: the patch is written
// on the main thread but DXGI runs the hooked export from the encode/worker thread (possibly a
// different core), so the "same-thread, no flush needed" assumption was wrong.
//
// These raw declarations use `*mut c_void` for the process handle instead of the windows-rs
// `HANDLE` wrapper. Functions in this file that need the windows-rs `GetCurrentProcess` import it
// locally, which shadows this declaration inside those functions.
#[link(name = "kernel32")]
extern "system" {
    // Flush the instruction cache of process `h` for `size` bytes starting at `base`; the return
    // value is a Win32 BOOL (non-zero on success).
    fn FlushInstructionCache(h: *mut c_void, base: *const c_void, size: usize) -> i32;
    // Return the pseudo-handle for the current process.
    fn GetCurrentProcess() -> *mut c_void;
}
|
||
/// Replacement for `win32u.dll!NtGdiDdDDIGetCachedHybridQueryValue`: always report
|
||
/// `D3DKMT_GPU_PREFERENCE_STATE_UNSPECIFIED` (3). We fully replace the function (never call the
|
||
/// original), so no trampoline is needed. (Independent reimplementation of the same technique Apollo
|
||
/// uses: Apollo installs its hook via the MinHook library; this is an original inline byte-patch and
|
||
/// copies no Apollo/GPL source.)
|
||
unsafe extern "system" fn hybrid_query_hook(gpu_preference: *mut u32) -> i32 {
|
||
HYBRID_HOOK_HITS.fetch_add(1, Ordering::Relaxed);
|
||
if gpu_preference.is_null() {
|
||
return 0xC000_000Du32 as i32; // STATUS_INVALID_PARAMETER
|
||
}
|
||
*gpu_preference = 3; // D3DKMT_GPU_PREFERENCE_STATE_UNSPECIFIED
|
||
0 // STATUS_SUCCESS
|
||
}
|
||
|
||
/// The win32u GPU-preference hook (the same technique Apollo applies, reimplemented here from the
/// documented DDI — no GPL source copied). On a HYBRID-GPU box DXGI resolves a GPU preference
/// (registry + power settings + the hybrid-adapter DDI) and REPARENTS outputs onto the chosen render
/// GPU — which constantly invalidates Desktop Duplication (DXGI_ERROR_ACCESS_LOST 0x887A0026, the
/// freeze/churn observed on the RTX 4090 + AMD iGPU box; `SET_RENDER_ADAPTER` is ignored there). Faking
/// a cached preference of UNSPECIFIED makes DXGI skip the resolution, so the output is NOT reparented
/// and DDA stays stable on one adapter (this is what makes Apollo's DDA work on this hardware).
/// Installed once, before the first DXGI factory/enumeration; lasts the process lifetime (like Apollo).
///
/// What the function does, in order (all inside a `Once`, so only the first call has any effect):
/// 1. Requests per-monitor-v2 DPI awareness for the process and logs the effective awareness.
/// 2. Resolves `win32u.dll!NtGdiDdDDIGetCachedHybridQueryValue`; if either the library or the
///    export is missing it logs a warning and returns without patching.
/// 3. Overwrites the first 12 bytes of the export with `mov rax, <hybrid_query_hook>; jmp rax`,
///    restores the page protection, flushes the instruction cache and reads the bytes back to
///    verify the patch.
///
/// Nothing is returned; every failure is reported through `tracing` only.
///
/// NOTE(review): the patch bytes are x86-64 machine code and no `cfg(target_arch)` guard is
/// visible on this function — confirm the crate is only built for x86_64 Windows.
pub(crate) fn install_gpu_pref_hook() {
    use std::sync::Once;
    static HOOK: Once = Once::new();
    // SAFETY: this one-time hook install only touches a region it has just validated.
    // `LoadLibraryA("win32u.dll")` + `GetProcAddress("NtGdiDdDDIGetCachedHybridQueryValue")` yield the
    // live base of the real exported function, so `target` is a valid executable code pointer to at
    // least the 12 bytes the patch overwrites (an x64 prologue). The two
    // `ptr::copy_nonoverlapping`s each move exactly 12 bytes between the 12-byte stack arrays
    // (`patch`/`readback`) and `target`, which `VirtualProtect(target, 12, PAGE_EXECUTE_READWRITE, …)`
    // has just made writable (and is restored to `old` after) — source and dest never overlap (stack
    // vs. loaded module image), so every access stays in mapped, in-bounds memory.
    // `FlushInstructionCache` gets the current-process pseudo-handle + that same range. The DPI calls
    // take by-value context handles / fill the live local `&mut old`/`&mut restore` for the duration of
    // each synchronous call. Runs once via `Once::call_once`, before any DXGI use.
    HOOK.call_once(|| unsafe {
        use windows::Win32::System::LibraryLoader::{GetProcAddress, LoadLibraryA};
        use windows::Win32::System::Memory::{
            VirtualProtect, PAGE_EXECUTE_READWRITE, PAGE_PROTECTION_FLAGS,
        };
        use windows::Win32::UI::HiDpi::{
            GetAwarenessFromDpiAwarenessContext, GetThreadDpiAwarenessContext,
            SetProcessDpiAwarenessContext, DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2,
        };
        // Per-monitor-v2 DPI awareness — REQUIRED for IDXGIOutput5::DuplicateOutput1 (without it the
        // call returns E_ACCESSDENIED forever, forcing the legacy DuplicateOutput path). Matches
        // Apollo's startup. SetProcessDpiAwarenessContext fails with E_ACCESS_DENIED if awareness was
        // already set (manifest / earlier call) — log the outcome AND the effective awareness so a
        // 100% DuplicateOutput1 E_ACCESSDENIED is diagnosable instead of silent.
        match SetProcessDpiAwarenessContext(DPI_AWARENESS_CONTEXT_PER_MONITOR_AWARE_V2) {
            Ok(()) => tracing::info!("DPI awareness set: PER_MONITOR_AWARE_V2"),
            Err(e) => tracing::warn!(error = %format!("{e:?}"),
                "SetProcessDpiAwarenessContext failed (already set?) — DuplicateOutput1 may E_ACCESSDENIED"),
        }
        // 0=UNAWARE 1=SYSTEM 2=PER_MONITOR(_V2). DuplicateOutput1 needs 2.
        let awareness = GetAwarenessFromDpiAwarenessContext(GetThreadDpiAwarenessContext()).0;
        tracing::info!(awareness, "effective DPI awareness (need 2=PER_MONITOR for DuplicateOutput1)");
        // Resolve the export to patch; both lookups are skip-on-failure (the hook is optional).
        let Ok(lib) = LoadLibraryA(s!("win32u.dll")) else {
            tracing::warn!("GPU-pref hook: win32u.dll not loadable — skipping (DDA may churn on hybrid GPUs)");
            return;
        };
        let Some(target) = GetProcAddress(lib, s!("NtGdiDdDDIGetCachedHybridQueryValue")) else {
            tracing::warn!("GPU-pref hook: NtGdiDdDDIGetCachedHybridQueryValue not exported — skipping");
            return;
        };
        // From here on the export is treated as raw bytes, not as a callable function pointer.
        let target = target as usize as *mut u8;
        // x64 absolute jump to our replacement: `mov rax, imm64 ; jmp rax` (12 bytes). We never call the
        // original, so no trampoline/relocation (hence no detour crate / C length-disassembler dep).
        let hook = hybrid_query_hook as *const () as usize;
        let mut patch = [0u8; 12];
        patch[0] = 0x48;
        patch[1] = 0xB8; // mov rax, imm64
        // Little-endian address of the replacement; `copy_from_slice` requires `usize` to be
        // exactly 8 bytes here and panics otherwise.
        patch[2..10].copy_from_slice(&hook.to_le_bytes());
        patch[10] = 0xFF;
        patch[11] = 0xE0; // jmp rax
        // Make the 12 target bytes writable, remembering the previous protection in `old`.
        let mut old = PAGE_PROTECTION_FLAGS(0);
        if VirtualProtect(target as *const c_void, 12, PAGE_EXECUTE_READWRITE, &mut old).is_err() {
            tracing::warn!("GPU-pref hook: VirtualProtect failed — skipping");
            return;
        }
        std::ptr::copy_nonoverlapping(patch.as_ptr(), target, 12);
        // Put the original protection back; a failure here is ignored (the patch is already
        // written and the readback below still verifies it).
        let mut restore = PAGE_PROTECTION_FLAGS(0);
        let _ = VirtualProtect(target as *const c_void, 12, old, &mut restore);
        // Serialize the i-cache: the patch is written here (main thread) but DXGI calls the export from
        // the capture/encode worker thread — possibly a different core with a stale i-cache, in which
        // case it would keep running the ORIGINAL function and DXGI would still reparent. (Apollo's
        // MinHook does this flush internally; our hand-rolled patch must do it explicitly.)
        let _ = FlushInstructionCache(GetCurrentProcess(), target as *const c_void, 12);
        // VERIFY the patch actually landed (CFG/hotpatch/short-stub could silently reject it). Read it
        // back; an error! (not a cheery "installed") makes a dead hook obvious in the logs.
        let mut readback = [0u8; 12];
        std::ptr::copy_nonoverlapping(target, readback.as_mut_ptr(), 12);
        if readback == patch {
            tracing::info!(
                "GPU-pref hook installed + verified (win32u hybrid-query -> UNSPECIFIED): reparenting disabled"
            );
        } else {
            tracing::error!(
                want = %format!("{patch:02x?}"), got = %format!("{readback:02x?}"),
                "GPU-pref hook patch did NOT land — hook is DEAD (DXGI will still reparent → ACCESS_LOST churn)"
            );
        }
    });
}
|
||
|
||
unsafe fn compile_shader(src: &str, entry: PCSTR, target: PCSTR) -> Result<Vec<u8>> {
|
||
let mut blob: Option<ID3DBlob> = None;
|
||
let mut errs: Option<ID3DBlob> = None;
|
||
let r = D3DCompile(
|
||
src.as_ptr() as *const c_void,
|
||
src.len(),
|
||
PCSTR::null(),
|
||
None,
|
||
None,
|
||
entry,
|
||
target,
|
||
0,
|
||
0,
|
||
&mut blob,
|
||
Some(&mut errs),
|
||
);
|
||
if r.is_err() {
|
||
let msg = errs
|
||
.as_ref()
|
||
.map(|e| {
|
||
let p = e.GetBufferPointer() as *const u8;
|
||
String::from_utf8_lossy(std::slice::from_raw_parts(p, e.GetBufferSize()))
|
||
.to_string()
|
||
})
|
||
.unwrap_or_default();
|
||
bail!("D3DCompile failed: {msg}");
|
||
}
|
||
let blob = blob.context("no shader blob")?;
|
||
let p = blob.GetBufferPointer() as *const u8;
|
||
Ok(std::slice::from_raw_parts(p, blob.GetBufferSize()).to_vec())
|
||
}
|
||
|
||
/// Fullscreen-triangle vertex shader for the HDR conversion pass (3 verts, no input layout).
///
/// The position and UV are derived from `SV_VertexID` alone: vertex ids 0, 1, 2 give UVs
/// (0,0), (2,0), (0,2), mapped to clip space as `uv * (2, -2) + (-1, 1)`.
///
/// The literal holds only HLSL — stray viewer-gutter lines (`|` / `||`) inside it would be
/// passed to `D3DCompile` as source text.
const HDR_VS: &str = r"
struct VOut { float4 pos : SV_POSITION; float2 uv : TEXCOORD0; };
VOut main(uint vid : SV_VertexID) {
    float2 uv = float2((vid << 1) & 2, vid & 2);
    VOut o;
    o.pos = float4(uv * float2(2.0, -2.0) + float2(-1.0, 1.0), 0.0, 1.0);
    o.uv = uv;
    return o;
}
";
|
||
|
||
/// Shared HLSL prelude for both P010 pixel shaders ([`HDR_P010_Y_PS`], [`HDR_P010_UV_PS`]); it is
/// substituted for their `#include_common` marker before compilation.
///
/// Colour pipeline: scRGB FP16 desktop (linear, Rec.709 primaries, 1.0 = 80 nits) → BT.2020
/// linear → PQ (ST 2084) → BT.2020 non-constant-luminance limited-range Y′CbCr, written as 10-bit
/// codes in the high 10 bits of 16-bit UNORM render-target samples. The colour pipeline
/// (scRGB→nits→BT.2020-linear→PQ) is IDENTICAL to the R10 HDR path; only the final RGB→YCbCr +
/// studio-range quantization differs.
///
/// Declares: the source texture/sampler (`t0`/`s0`), `BT709_TO_BT2020`, `pq_oetf`,
/// `scrgb_to_pq2020`, the Rec.2020 luma coefficients, `studio_y_code`, `studio_cbcr_code` and
/// `code10_to_unorm`.
///
/// The literal holds only HLSL — stray viewer-gutter lines (`|` / `||`) inside it would be
/// passed to `D3DCompile` as source text.
const HDR_P010_COMMON: &str = r"
Texture2D<float4> tx : register(t0);
SamplerState sm : register(s0);
// Rec.709 → Rec.2020 primaries (linear). Same matrix as the R10 HdrConverter (mul(M, v)).
static const float3x3 BT709_TO_BT2020 = {
    0.627403914, 0.329283038, 0.043313048,
    0.069097292, 0.919540405, 0.011362303,
    0.016391439, 0.088013308, 0.895595253
};
float3 pq_oetf(float3 L) {
    // L normalized so 1.0 = 10000 nits. ST 2084. (Identical to HdrConverter.)
    const float m1 = 0.1593017578125;
    const float m2 = 78.84375;
    const float c1 = 0.8359375;
    const float c2 = 18.8515625;
    const float c3 = 18.6875;
    float3 Lp = pow(saturate(L), m1);
    return pow((c1 + c2 * Lp) / (1.0 + c3 * Lp), m2);
}
// scRGB FP16 sample -> PQ-encoded BT.2020 RGB in [0,1] (the SAME pixels the R10 path would store,
// before quantization). Used by both the luma and chroma passes so they agree bit-for-bit with the
// existing HdrConverter colour math + the Rust reference.
float3 scrgb_to_pq2020(float2 uv) {
    float3 scrgb = max(tx.Sample(sm, uv).rgb, 0.0); // scRGB can be negative (wide gamut); clamp
    float3 nits = scrgb * 80.0; // scRGB 1.0 = 80 nits
    float3 lin2020 = mul(BT709_TO_BT2020, nits); // primaries conversion (linear)
    return pq_oetf(lin2020 / 10000.0); // normalize to 10k nits, encode PQ -> [0,1]
}
// BT.2020 non-constant-luminance, on the PQ-encoded (gamma) RGB. Kr/Kg/Kb per Rec.2020.
static const float KR = 0.2627;
static const float KG = 0.6780;
static const float KB = 0.0593;
// 10-bit studio (limited) range codes. Y' -> [64, 940]; Cb/Cr -> [64, 960] (512 ± 448).
float studio_y_code(float3 rgb_pq) {
    float y = KR * rgb_pq.r + KG * rgb_pq.g + KB * rgb_pq.b; // [0,1]
    float code = 64.0 + 876.0 * y; // [64, 940]
    return clamp(code, 64.0, 940.0);
}
float2 studio_cbcr_code(float3 rgb_pq) {
    float y = KR * rgb_pq.r + KG * rgb_pq.g + KB * rgb_pq.b;
    float cb = (rgb_pq.b - y) / 1.8814; // ~[-0.5, 0.5]
    float cr = (rgb_pq.r - y) / 1.4746;
    float cbc = 512.0 + 896.0 * cb; // [64, 960]
    float crc = 512.0 + 896.0 * cr;
    return float2(clamp(cbc, 64.0, 960.0), clamp(crc, 64.0, 960.0));
}
// P010 stores the 10-bit code in the HIGH 10 bits of each 16-bit sample (code10 << 6). As an
// R16_UNORM / R16G16_UNORM render target the UNORM float that maps to that stored u16 is
// code10*64 / 65535.0. (Verified in hdr_p010_selftest against the readback.)
float code10_to_unorm(float code10) { return (code10 * 64.0) / 65535.0; }
";
|
||
|
||
/// P010 LUMA pass PS — full-res, writes Y′ to plane 0 (R16_UNORM RTV).
///
/// `#include_common` is a placeholder that must be replaced with [`HDR_P010_COMMON`] before the
/// source is compiled. The shader samples the source once at `uv`, converts it with
/// `scrgb_to_pq2020`, and outputs the studio-range luma code as a UNORM value.
///
/// The literal holds only HLSL — stray viewer-gutter lines (`|` / `||`) inside it would be
/// passed to `D3DCompile` as source text.
const HDR_P010_Y_PS: &str = r"
#include_common
float main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
    float3 pq = scrgb_to_pq2020(uv);
    float yc = studio_y_code(pq);
    return code10_to_unorm(yc);
}
";
|
||
|
||
/// P010 CHROMA pass PS — half-res, writes interleaved (Cb,Cr) to plane 1 (R16G16_UNORM RTV). Averages
/// the 2x2 scRGB source footprint of this chroma sample (box filter) IN scRGB-linear space before the
/// PQ encode, then forms Cb/Cr from the averaged-then-PQ-encoded RGB. `inv_src` = (1/srcW, 1/srcH).
///
/// `#include_common` is a placeholder that must be replaced with [`HDR_P010_COMMON`] before the
/// source is compiled. The shader reads `inv_src` from constant buffer `b0` (16 bytes: `float2
/// inv_src` + `float2 pad`).
///
/// The literal holds only HLSL — stray viewer-gutter lines (`|` / `||`) inside it would be
/// passed to `D3DCompile` as source text.
const HDR_P010_UV_PS: &str = r"
#include_common
cbuffer C : register(b0) { float2 inv_src; float2 pad; };
float2 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
    // `uv` is the chroma-sample centre in [0,1]; the 4 co-sited luma texels sit at uv ± half a luma
    // texel in each axis. Average their scRGB (linear) values, then run the SAME PQ/CSC as the Y pass.
    float2 h = inv_src * 0.5;
    float3 a = max(tx.Sample(sm, uv + float2(-h.x, -h.y)).rgb, 0.0);
    float3 b = max(tx.Sample(sm, uv + float2( h.x, -h.y)).rgb, 0.0);
    float3 c = max(tx.Sample(sm, uv + float2(-h.x, h.y)).rgb, 0.0);
    float3 d = max(tx.Sample(sm, uv + float2( h.x, h.y)).rgb, 0.0);
    float3 scrgb = (a + b + c + d) * 0.25;
    float3 nits = scrgb * 80.0;
    float3 lin2020 = mul(BT709_TO_BT2020, nits);
    float3 pq = pq_oetf(lin2020 / 10000.0);
    float2 cc = studio_cbcr_code(pq);
    return float2(code10_to_unorm(cc.x), code10_to_unorm(cc.y));
}
";
|
||
|
||
/// scRGB FP16 → **P010** (BT.2020 PQ, 10-bit limited/studio range) conversion, in OUR OWN shader (two
/// passes: full-res luma + half-res chroma). NVIDIA's D3D11 VideoProcessor cannot do RGB→P010 (renders
/// green), so we quantize to studio-range 10-bit YUV directly and feed NVENC native P010 — skipping
/// NVENC's internal RGB→YUV CSC (which runs on the contended SM). One per capture device (rebuilt on
/// device recreate).
///
/// Plane writes use per-plane render-target views of the single P010 texture: an `R16_UNORM` RTV
/// selects plane 0 (luma, full WxH), an `R16G16_UNORM` RTV selects plane 1 (chroma, W/2 x H/2). This
/// planar-RTV mechanism needs a D3D11.3+ runtime + driver support; [`HdrP010Converter::convert`]
/// surfaces a clear error if `CreateRenderTargetView` rejects the plane format so the caller can fall
/// back to the existing R10 path.
pub(crate) struct HdrP010Converter {
    /// Fullscreen-triangle vertex shader shared by both passes (compiled from `HDR_VS`).
    vs: ID3D11VertexShader,
    /// Pixel shader for the full-resolution luma pass.
    ps_y: ID3D11PixelShader,
    /// Pixel shader for the half-resolution chroma pass.
    ps_uv: ID3D11PixelShader,
    /// Point-filtering, clamp-addressed sampler bound at slot 0 for both passes.
    sampler: ID3D11SamplerState,
    /// Constant buffer for the chroma pass (inv_src texel size). 16 bytes.
    cbuf: ID3D11Buffer,
}
|
||
|
||
impl HdrP010Converter {
|
||
pub(crate) unsafe fn new(device: &ID3D11Device) -> Result<Self> {
|
||
// Inline the shared HLSL (D3DCompile has no include handler wired here). The two PS sources
|
||
// carry a `#include_common` marker we substitute before compiling.
|
||
let y_src = HDR_P010_Y_PS.replace("#include_common", HDR_P010_COMMON);
|
||
let uv_src = HDR_P010_UV_PS.replace("#include_common", HDR_P010_COMMON);
|
||
let vsb = compile_shader(HDR_VS, s!("main"), s!("vs_5_0"))?;
|
||
let yb = compile_shader(&y_src, s!("main"), s!("ps_5_0"))?;
|
||
let uvb = compile_shader(&uv_src, s!("main"), s!("ps_5_0"))?;
|
||
let mut vs = None;
|
||
device.CreateVertexShader(&vsb, None, Some(&mut vs))?;
|
||
let mut ps_y = None;
|
||
device.CreatePixelShader(&yb, None, Some(&mut ps_y))?;
|
||
let mut ps_uv = None;
|
||
device.CreatePixelShader(&uvb, None, Some(&mut ps_uv))?;
|
||
let sd = D3D11_SAMPLER_DESC {
|
||
// POINT: the Y pass samples a single texel centre exactly, and the UV pass does its OWN
|
||
// 2x2 box average via 4 explicit taps at texel centres (offset half a texel). Point
|
||
// sampling keeps each tap exact; the averaging is in the shader, not the sampler.
|
||
Filter: D3D11_FILTER_MIN_MAG_MIP_POINT,
|
||
AddressU: D3D11_TEXTURE_ADDRESS_CLAMP,
|
||
AddressV: D3D11_TEXTURE_ADDRESS_CLAMP,
|
||
AddressW: D3D11_TEXTURE_ADDRESS_CLAMP,
|
||
ComparisonFunc: D3D11_COMPARISON_NEVER,
|
||
MaxLOD: f32::MAX,
|
||
..Default::default()
|
||
};
|
||
let mut sampler = None;
|
||
device.CreateSamplerState(&sd, Some(&mut sampler))?;
|
||
let cbd = D3D11_BUFFER_DESC {
|
||
ByteWidth: 16, // float2 inv_src + float2 pad
|
||
Usage: D3D11_USAGE_DYNAMIC,
|
||
BindFlags: D3D11_BIND_CONSTANT_BUFFER.0 as u32,
|
||
CPUAccessFlags: D3D11_CPU_ACCESS_WRITE.0 as u32,
|
||
..Default::default()
|
||
};
|
||
let mut cbuf = None;
|
||
device.CreateBuffer(&cbd, None, Some(&mut cbuf))?;
|
||
Ok(Self {
|
||
vs: vs.context("p010 vs")?,
|
||
ps_y: ps_y.context("p010 y ps")?,
|
||
ps_uv: ps_uv.context("p010 uv ps")?,
|
||
sampler: sampler.context("p010 sampler")?,
|
||
cbuf: cbuf.context("p010 cbuf")?,
|
||
})
|
||
}
|
||
|
||
/// Create a per-plane RTV of the P010 texture `dst` with the given single-plane `format`
|
||
/// (`R16_UNORM` for plane 0 luma, `R16G16_UNORM` for plane 1 chroma). The plane is selected by the
|
||
/// view format (planar-RTV semantics); MipSlice 0.
|
||
unsafe fn plane_rtv(
|
||
device: &ID3D11Device,
|
||
dst: &ID3D11Texture2D,
|
||
format: DXGI_FORMAT,
|
||
) -> Result<ID3D11RenderTargetView> {
|
||
let desc = D3D11_RENDER_TARGET_VIEW_DESC {
|
||
Format: format,
|
||
ViewDimension: D3D11_RTV_DIMENSION_TEXTURE2D,
|
||
Anonymous: D3D11_RENDER_TARGET_VIEW_DESC_0 {
|
||
Texture2D: D3D11_TEX2D_RTV { MipSlice: 0 },
|
||
},
|
||
};
|
||
let mut rtv: Option<ID3D11RenderTargetView> = None;
|
||
device
|
||
.CreateRenderTargetView(
|
||
dst,
|
||
Some(&desc as *const D3D11_RENDER_TARGET_VIEW_DESC),
|
||
Some(&mut rtv),
|
||
)
|
||
.with_context(|| {
|
||
format!("CreateRenderTargetView(P010 plane, format={format:?}) — driver may not support planar RTVs")
|
||
})?;
|
||
rtv.context("p010 plane rtv null")
|
||
}
|
||
|
||
/// Convert `src_srv` (FP16 scRGB, WxH) into `dst` (a `DXGI_FORMAT_P010` texture with
|
||
/// `BIND_RENDER_TARGET`). Two opaque passes: full-res luma → plane 0, half-res chroma → plane 1.
|
||
/// `w`/`h` are the full luma dimensions (must be even). Returns `Err` if a plane RTV can't be
|
||
/// created (driver) so the caller can fall back to the R10 path.
|
||
pub(crate) unsafe fn convert(
|
||
&self,
|
||
device: &ID3D11Device,
|
||
ctx: &ID3D11DeviceContext,
|
||
src_srv: &ID3D11ShaderResourceView,
|
||
dst: &ID3D11Texture2D,
|
||
w: u32,
|
||
h: u32,
|
||
) -> Result<()> {
|
||
let y_rtv = Self::plane_rtv(device, dst, DXGI_FORMAT_R16_UNORM)?;
|
||
let uv_rtv = Self::plane_rtv(device, dst, DXGI_FORMAT_R16G16_UNORM)?;
|
||
|
||
// Update the chroma constant buffer (inverse source texel size).
|
||
let cb: [f32; 4] = [1.0 / w as f32, 1.0 / h as f32, 0.0, 0.0];
|
||
let mut mapped = D3D11_MAPPED_SUBRESOURCE::default();
|
||
if ctx
|
||
.Map(&self.cbuf, 0, D3D11_MAP_WRITE_DISCARD, 0, Some(&mut mapped))
|
||
.is_ok()
|
||
{
|
||
std::ptr::copy_nonoverlapping(cb.as_ptr(), mapped.pData as *mut f32, cb.len());
|
||
ctx.Unmap(&self.cbuf, 0);
|
||
}
|
||
|
||
// Shared pipeline state.
|
||
ctx.OMSetBlendState(None, None, 0xffff_ffff); // opaque overwrite
|
||
ctx.VSSetShader(&self.vs, None);
|
||
ctx.PSSetShaderResources(0, Some(&[Some(src_srv.clone())]));
|
||
ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())]));
|
||
ctx.IASetInputLayout(None);
|
||
ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
|
||
|
||
// --- LUMA pass: full-res, plane 0 ---
|
||
let vp_y = D3D11_VIEWPORT {
|
||
TopLeftX: 0.0,
|
||
TopLeftY: 0.0,
|
||
Width: w as f32,
|
||
Height: h as f32,
|
||
MinDepth: 0.0,
|
||
MaxDepth: 1.0,
|
||
};
|
||
ctx.RSSetViewports(Some(&[vp_y]));
|
||
ctx.OMSetRenderTargets(Some(&[Some(y_rtv.clone())]), None);
|
||
ctx.PSSetShader(&self.ps_y, None);
|
||
ctx.Draw(3, 0);
|
||
ctx.OMSetRenderTargets(Some(&[None]), None);
|
||
|
||
// --- CHROMA pass: half-res, plane 1 ---
|
||
let vp_uv = D3D11_VIEWPORT {
|
||
TopLeftX: 0.0,
|
||
TopLeftY: 0.0,
|
||
Width: (w / 2) as f32,
|
||
Height: (h / 2) as f32,
|
||
MinDepth: 0.0,
|
||
MaxDepth: 1.0,
|
||
};
|
||
ctx.RSSetViewports(Some(&[vp_uv]));
|
||
ctx.OMSetRenderTargets(Some(&[Some(uv_rtv.clone())]), None);
|
||
ctx.PSSetShader(&self.ps_uv, None);
|
||
ctx.PSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())]));
|
||
ctx.Draw(3, 0);
|
||
|
||
// Unbind for the next frame's re-RTV / NVENC read.
|
||
ctx.OMSetRenderTargets(Some(&[None]), None);
|
||
ctx.PSSetShaderResources(0, Some(&[None]));
|
||
Ok(())
|
||
}
|
||
}
|
||
|
||
/// CPU-side (f64) reference for the P010 colour math, the counterpart of the HLSL in
/// [`HDR_P010_COMMON`] that [`hdr_p010_selftest`] checks the GPU output against.
///
/// `r`/`g`/`b` are one scRGB pixel: linear light, Rec.709 primaries, 1.0 = 80 nits, values above
/// 1.0 allowed for HDR, negatives clamped to 0. The result is the 10-bit studio-range
/// `(Y, Cb, Cr)` code triple expected for a flat (constant-colour) block; the codes are returned
/// unrounded.
#[cfg(target_os = "windows")]
fn p010_reference(r: f64, g: f64, b: f64) -> (f64, f64, f64) {
    /// Luminance of scRGB 1.0, in nits.
    const SCRGB_UNIT_NITS: f64 = 80.0;
    /// Luminance that PQ signal value 1.0 stands for, in nits.
    const PQ_PEAK_NITS: f64 = 10000.0;
    /// Rec.709 → BT.2020 primaries, row-major (`mul(M, v)` convention).
    const TO_BT2020: [[f64; 3]; 3] = [
        [0.627403914, 0.329283038, 0.043313048],
        [0.069097292, 0.919540405, 0.011362303],
        [0.016391439, 0.088013308, 0.895595253],
    ];
    /// BT.2020 luma weights (non-constant luminance).
    const KR: f64 = 0.2627;
    const KG: f64 = 0.6780;
    const KB: f64 = 0.0593;

    /// PQ encode of a normalised luminance; the input is clamped to `[0, 1]` first.
    fn pq_oetf(l: f64) -> f64 {
        const M1: f64 = 0.1593017578125;
        const M2: f64 = 78.84375;
        const C1: f64 = 0.8359375;
        const C2: f64 = 18.8515625;
        const C3: f64 = 18.6875;
        let powered = l.clamp(0.0, 1.0).powf(M1);
        ((C1 + C2 * powered) / (1.0 + C3 * powered)).powf(M2)
    }

    // scRGB → nits (negative components dropped).
    let nits = [r, g, b].map(|channel| channel.max(0.0) * SCRGB_UNIT_NITS);

    // Nits → BT.2020 linear → PQ signal, one matrix row per output primary.
    let [pr, pg, pb] = TO_BT2020.map(|row| {
        let linear = row[0] * nits[0] + row[1] * nits[1] + row[2] * nits[2];
        pq_oetf(linear / PQ_PEAK_NITS)
    });

    // Non-constant-luminance Y'CbCr from the PQ-encoded primaries.
    let luma = KR * pr + KG * pg + KB * pb;
    let blue_diff = (pb - luma) / 1.8814;
    let red_diff = (pr - luma) / 1.4746;

    // Quantise to limited-range 10-bit codes: luma 64..=940, chroma 64..=960 centred on 512.
    (
        (64.0 + 876.0 * luma).clamp(64.0, 940.0),
        (512.0 + 896.0 * blue_diff).clamp(64.0, 960.0),
        (512.0 + 896.0 * red_diff).clamp(64.0, 960.0),
    )
}
|
||
|
||
/// Colour self-test for [`HdrP010Converter`] (the `hdr-p010-selftest` subcommand).
///
/// Creates a hardware D3D11 device, uploads a known scRGB FP16 pattern, runs the P010 shader
/// passes, copies the result to a staging texture, reads the Y (plane 0) and UV (plane 1) planes
/// back, and compares them with the [`p010_reference`] f64 math. This is the only validation
/// available without inspecting a live HDR stream.
///
/// Prints a per-colour table followed by `PASS` or `FAIL`. Passes when the maximum absolute error
/// is at most 4 codes for Y and 5 codes for Cb/Cr (rounding + chroma averaging).
///
/// # Errors
/// Any failing D3D11 call, or error bounds being exceeded.
#[cfg(target_os = "windows")]
pub fn hdr_p010_selftest() -> Result<()> {
    use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE;
    use windows::Win32::Graphics::Dxgi::IDXGIAdapter;

    // 64x64 pattern (even dims) split into a 4x4 grid of 16x16 cells. The first cells hold flat
    // named colours, so every 2x2 chroma footprint inside them is uniform and chroma can be
    // compared exactly; the remaining cells hold a gradient that is compared on Y only.
    const W: u32 = 64;
    const H: u32 = 64;
    const BLK: u32 = 16;
    // (name, r, g, b) in linear scRGB (1.0 = 80 nits): SDR-ish and HDR (>1.0) values.
    let named: [(&str, f32, f32, f32); 8] = [
        ("red1.0", 1.0, 0.0, 0.0),
        ("green0.5", 0.0, 0.5, 0.0),
        ("blue4.0", 0.0, 0.0, 4.0),
        ("white1.0", 1.0, 1.0, 1.0),
        ("black", 0.0, 0.0, 0.0),
        ("gray0.5", 0.5, 0.5, 0.5),
        ("white4.0", 4.0, 4.0, 4.0),
        ("amber2.0", 2.0, 1.0, 0.0),
    ];

    let grid_cols = W / BLK; // 4
    // Colour of pattern pixel (x, y); the flag is true for pixels inside a flat named cell.
    let pixel_rgb = |x: u32, y: u32| -> (f32, f32, f32, bool) {
        let cell = ((y / BLK) * grid_cols + (x / BLK)) as usize;
        match named.get(cell) {
            Some(&(_, r, g, b)) => (r, g, b, true),
            // Gradient: distinct per pixel, within HDR scRGB range.
            None => (
                (x as f32 / W as f32) * 3.0,
                (y as f32 / H as f32) * 3.0,
                ((x + y) as f32 / (W + H) as f32) * 3.0,
                false,
            ),
        }
    };

    // Source pixels as f16 bit patterns (R16G16B16A16_FLOAT, alpha = 1.0), row-major, plus a
    // per-pixel "belongs to a flat cell" mask.
    let mut fp16: Vec<u16> = Vec::with_capacity((W * H * 4) as usize);
    let mut flat: Vec<bool> = Vec::with_capacity((W * H) as usize);
    for y in 0..H {
        for x in 0..W {
            let (r, g, b, is_flat) = pixel_rgb(x, y);
            fp16.extend([r, g, b, 1.0].map(f32_to_f16));
            flat.push(is_flat);
        }
    }

    let cw = W / 2;
    let ch = H / 2;

    // SAFETY: the block creates its own D3D11 device + immediate context via `D3D11CreateDevice`
    // (both checked non-null) and touches nothing else: every `CreateTexture2D`,
    // `CreateShaderResourceView`, `HdrP010Converter::{new,convert}`, `CopyResource` and `Map` goes
    // through that device or its context, so all resources share one device and everything runs on
    // this single thread. The source texture's `D3D11_SUBRESOURCE_DATA` points at `fp16`, a live
    // `Vec<u16>` holding `W*H*4` samples with `SysMemPitch = W*8`, matching the W×H R16G16B16A16
    // texture, and `fp16` outlives the synchronous `CreateTexture2D` that reads it. The
    // mapped-pointer reads are justified at the `code_at` closure below.
    let (y_codes, cb_codes, cr_codes) = unsafe {
        // Hardware D3D11 device (no adapter pin — the default GPU is fine for the self-test).
        let mut device: Option<ID3D11Device> = None;
        let mut context: Option<ID3D11DeviceContext> = None;
        D3D11CreateDevice(
            None::<&IDXGIAdapter>,
            D3D_DRIVER_TYPE_HARDWARE,
            HMODULE::default(),
            D3D11_CREATE_DEVICE_BGRA_SUPPORT,
            Some(&[D3D_FEATURE_LEVEL_11_0]),
            D3D11_SDK_VERSION,
            Some(&mut device),
            None,
            Some(&mut context),
        )
        .context("D3D11CreateDevice(hardware) for hdr-p010-selftest")?;
        let device = device.context("null device")?;
        let context = context.context("null context")?;

        // Fields shared by all three W×H single-mip, single-sample textures below.
        let base_desc = D3D11_TEXTURE2D_DESC {
            Width: W,
            Height: H,
            MipLevels: 1,
            ArraySize: 1,
            SampleDesc: DXGI_SAMPLE_DESC {
                Count: 1,
                Quality: 0,
            },
            ..Default::default()
        };

        // Source FP16 texture (initialized from `fp16`) + SRV.
        let src_desc = D3D11_TEXTURE2D_DESC {
            Format: DXGI_FORMAT_R16G16B16A16_FLOAT,
            Usage: D3D11_USAGE_DEFAULT,
            BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32,
            ..base_desc
        };
        let init = D3D11_SUBRESOURCE_DATA {
            pSysMem: fp16.as_ptr() as *const c_void,
            SysMemPitch: W * 8, // 4 channels * 2 bytes
            SysMemSlicePitch: 0,
        };
        let mut src_tex: Option<ID3D11Texture2D> = None;
        device
            .CreateTexture2D(&src_desc, Some(&init), Some(&mut src_tex))
            .context("CreateTexture2D(fp16 src)")?;
        let src_tex = src_tex.context("null src tex")?;
        let mut src_srv: Option<ID3D11ShaderResourceView> = None;
        device
            .CreateShaderResourceView(&src_tex, None, Some(&mut src_srv))
            .context("CreateShaderResourceView(fp16 src)")?;
        let src_srv = src_srv.context("null src srv")?;

        // P010 destination texture (render-target bindable).
        let p010_desc = D3D11_TEXTURE2D_DESC {
            Format: DXGI_FORMAT_P010,
            Usage: D3D11_USAGE_DEFAULT,
            BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
            ..base_desc
        };
        let mut p010: Option<ID3D11Texture2D> = None;
        device
            .CreateTexture2D(&p010_desc, None, Some(&mut p010))
            .context("CreateTexture2D(P010 dst)")?;
        let p010 = p010.context("null p010 tex")?;

        // Run the converter under test.
        let conv = HdrP010Converter::new(&device)?;
        conv.convert(&device, &context, &src_srv, &p010, W, H)?;

        // CPU-readable staging copy of the whole P010 texture (both planes).
        let stage_desc = D3D11_TEXTURE2D_DESC {
            Format: DXGI_FORMAT_P010,
            Usage: D3D11_USAGE_STAGING,
            BindFlags: 0,
            CPUAccessFlags: D3D11_CPU_ACCESS_READ.0 as u32,
            ..base_desc
        };
        let mut staging: Option<ID3D11Texture2D> = None;
        device
            .CreateTexture2D(&stage_desc, None, Some(&mut staging))
            .context("CreateTexture2D(P010 staging)")?;
        let staging = staging.context("null staging")?;
        context.CopyResource(&staging, &p010);

        let mut map = D3D11_MAPPED_SUBRESOURCE::default();
        context
            .Map(&staging, 0, D3D11_MAP_READ, 0, Some(&mut map))
            .context("Map(P010 staging)")?;
        let row_pitch = map.RowPitch as usize; // bytes per row, shared by both planes
        let base = map.pData as *const u8;

        // DIAGNOSTIC (the uncertain layout spot — check on real hardware if chroma is wrong).
        // Assumed layout: plane 0 (luma) is H rows of W u16; plane 1 (chroma) is H/2 rows of W/2
        // interleaved (Cb,Cr) u16 pairs, stored after plane 0 at the SAME row pitch, i.e. starting
        // at byte offset RowPitch * H (for a STAGING texture the luma height is taken to be the
        // created H, with no inter-plane alignment). DepthPitch should then be about
        // RowPitch * H * 3/2. If chroma reads back as garbage, compare these numbers and adjust
        // the chroma row offset used below (e.g. to an aligned luma height). Not all drivers
        // report the chroma offset, so RowPitch * H is used as the portable choice.
        tracing::info!(
            row_pitch,
            depth_pitch = map.DepthPitch,
            expected_chroma_base = row_pitch * H as usize,
            expected_total = row_pitch * H as usize * 3 / 2,
            "hdr-p010-selftest: mapped P010 layout (verify chroma plane offset here if chroma is wrong)"
        );

        // 10-bit code stored at (`row`, `byte_col`) of the mapped surface. P010 keeps the code in
        // the high 10 bits of each u16, hence the `>> 6`. Chroma rows are addressed as `H + cy`.
        let code_at = |row: usize, byte_col: usize| -> u16 {
            // SAFETY: `base` is the mapped staging pointer; all offsets are within the P010 surface
            // (luma H*RowPitch + chroma (H/2)*RowPitch ≤ DepthPitch). Already in the fn's unsafe scope.
            base.add(row * row_pitch + byte_col)
                .cast::<u16>()
                .read_unaligned()
                >> 6
        };

        // Luma plane: one u16 (2 bytes) per pixel, row-major.
        let mut luma: Vec<u16> = Vec::with_capacity((W * H) as usize);
        for row in 0..H as usize {
            for col in 0..W as usize {
                luma.push(code_at(row, col * 2));
            }
        }

        // Chroma plane: interleaved (Cb, Cr) per sample → 2 u16 = 4 bytes per sample.
        let mut blue_diff: Vec<u16> = Vec::with_capacity((cw * ch) as usize);
        let mut red_diff: Vec<u16> = Vec::with_capacity((cw * ch) as usize);
        for crow in 0..ch as usize {
            for ccol in 0..cw as usize {
                let plane_row = H as usize + crow;
                blue_diff.push(code_at(plane_row, ccol * 4));
                red_diff.push(code_at(plane_row, ccol * 4 + 2));
            }
        }
        context.Unmap(&staging, 0);

        (luma, blue_diff, red_diff)
    };

    // Y is compared over every pixel.
    let mut max_y_err = 0.0f64;
    for (i, &code) in y_codes.iter().enumerate() {
        let (x, y) = (i as u32 % W, i as u32 / W);
        let (r, g, b, _) = pixel_rgb(x, y);
        let (want_y, _, _) = p010_reference(r as f64, g as f64, b as f64);
        max_y_err = max_y_err.max((code as f64 - want_y).abs());
    }

    // Cb/Cr are compared only where the whole 2x2 luma footprint is flat (exact reference).
    let mut max_u_err = 0.0f64;
    let mut max_v_err = 0.0f64;
    let footprint = [(0u32, 0u32), (1, 0), (0, 1), (1, 1)];
    for cy in 0..ch {
        for cx in 0..cw {
            let (sx, sy) = (cx * 2, cy * 2);
            let uniform = footprint
                .iter()
                .all(|&(dx, dy)| flat[((sy + dy) * W + (sx + dx)) as usize]);
            if uniform {
                let (r, g, b, _) = pixel_rgb(sx, sy);
                let (_, want_cb, want_cr) = p010_reference(r as f64, g as f64, b as f64);
                let ci = (cy * cw + cx) as usize;
                max_u_err = max_u_err.max((cb_codes[ci] as f64 - want_cb).abs());
                max_v_err = max_v_err.max((cr_codes[ci] as f64 - want_cr).abs());
            }
        }
    }

    // Per-colour table, sampled at the centre of each named cell.
    println!("HDR P010 self-test ({W}x{H}, BT.2020 PQ, 10-bit limited range)");
    println!(
        " {:<10} {:>14} {:>14} {:>14}",
        "color", "Y exp/got", "Cb exp/got", "Cr exp/got"
    );
    for (idx, (name, r, g, b)) in named.iter().enumerate() {
        let bx = (idx as u32 % grid_cols) * BLK + BLK / 2;
        let by = (idx as u32 / grid_cols) * BLK + BLK / 2;
        let (ey, ecb, ecr) = p010_reference(*r as f64, *g as f64, *b as f64);
        let chroma_idx = ((by / 2) * cw + bx / 2) as usize;
        let gy = y_codes[(by * W + bx) as usize] as f64;
        let gu = cb_codes[chroma_idx] as f64;
        let gv = cr_codes[chroma_idx] as f64;
        println!(
            " {:<10} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0}",
            name, ey, gy, ecb, gu, ecr, gv
        );
    }
    println!(
        " max abs error: Y={max_y_err:.2} (≤4) Cb={max_u_err:.2} (≤5) Cr={max_v_err:.2} (≤5)"
    );

    let within_bounds = max_y_err <= 4.0 && max_u_err <= 5.0 && max_v_err <= 5.0;
    if !within_bounds {
        println!("FAIL");
        bail!(
            "HDR P010 self-test FAILED (Y={max_y_err:.2} Cb={max_u_err:.2} Cr={max_v_err:.2})"
        );
    }
    println!("PASS");
    Ok(())
}
|
||
|
||
/// Convert an `f32` to the bit pattern of an IEEE-754 binary16 (half-precision) value.
///
/// Used to upload the FP16 scRGB self-test pattern; not on any hot path. Rounds to nearest, ties
/// to even. Handles every input class:
/// - normals and half-precision subnormals, with the rounding carry propagated into the exponent
///   (so a value just below a power of two rounds up to that power of two, and the largest
///   finite half rounds up to infinity);
/// - magnitudes too small for a half subnormal → signed zero;
/// - magnitudes too large for a half → signed infinity;
/// - infinities → signed infinity, NaN → a quiet NaN (sign preserved).
#[cfg(target_os = "windows")]
fn f32_to_f16(v: f32) -> u16 {
    let bits = v.to_bits();
    let sign = ((bits >> 16) & 0x8000) as u16;
    let exp_f32 = ((bits >> 23) & 0xff) as i32;
    let mant = bits & 0x007f_ffff;

    // f32 Inf/NaN: keep the class instead of collapsing NaN into infinity.
    if exp_f32 == 0xff {
        return if mant == 0 { sign | 0x7c00 } else { sign | 0x7e00 };
    }

    // Re-bias the exponent from f32 (127) to f16 (15).
    let exp = exp_f32 - 127 + 15;

    if exp >= 0x1f {
        // Finite but beyond the half range.
        return sign | 0x7c00;
    }

    if exp <= 0 {
        // Subnormal / zero in half precision.
        if exp < -10 {
            return sign; // below half of the smallest subnormal → ±0
        }
        let mant = mant | 0x0080_0000; // make the implicit leading 1 explicit
        let shift = (14 - exp) as u32; // 14..=24
        let kept = mant >> shift;
        let dropped = mant & ((1u32 << shift) - 1);
        let halfway = 1u32 << (shift - 1);
        let round_up = dropped > halfway || (dropped == halfway && kept & 1 == 1);
        // A carry out of the 10 mantissa bits correctly yields the smallest normal (0x0400).
        return sign | (kept + u32::from(round_up)) as u16;
    }

    // Normal number. Exponent and mantissa are combined into one integer BEFORE rounding so that a
    // mantissa carry increments the exponent; OR-ing the rounded mantissa into the exponent field
    // would lose the carry whenever exponent bit 10 is already set.
    let unrounded = ((exp as u32) << 10) | (mant >> 13);
    let dropped = mant & 0x1fff;
    let round_up = dropped > 0x1000 || (dropped == 0x1000 && unrounded & 1 == 1);
    sign | (unrounded + u32::from(round_up)) as u16
}
|
||
|
||
use windows::Win32::Graphics::Direct3D11::{
|
||
ID3D11VideoContext1, ID3D11VideoDevice, ID3D11VideoProcessor, ID3D11VideoProcessorEnumerator,
|
||
ID3D11VideoProcessorInputView, ID3D11VideoProcessorOutputView, D3D11_TEX2D_VPIV,
|
||
D3D11_TEX2D_VPOV, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE, D3D11_VIDEO_PROCESSOR_CONTENT_DESC,
|
||
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0,
|
||
D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC, D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0,
|
||
D3D11_VIDEO_PROCESSOR_STREAM, D3D11_VIDEO_USAGE_PLAYBACK_NORMAL,
|
||
D3D11_VPIV_DIMENSION_TEXTURE2D, D3D11_VPOV_DIMENSION_TEXTURE2D,
|
||
};
|
||
use windows::Win32::Graphics::Dxgi::Common::{
|
||
DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709, DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
|
||
DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020, DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709,
|
||
DXGI_RATIONAL,
|
||
};
|
||
|
||
/// D3D11 **Video Processor** colour/format converter — runs on the GPU's dedicated VIDEO engine, NOT
|
||
/// the 3D engine, so the per-frame RGB→YUV conversion does not contend with a GPU-saturating game (the
|
||
/// HDR pixel-shader path and NVENC's internal RGB→YUV both use the 3D/compute engine, which an AAA
|
||
/// title pins at ~100%). Output is NV12 (SDR, BT.709 studio-range) or P010 (HDR, BT.2020 PQ
|
||
/// studio-range) — NVENC's native YUV inputs, so it encodes them with no further conversion.
|
||
pub(crate) struct VideoConverter {
|
||
vdev: ID3D11VideoDevice,
|
||
vctx: ID3D11VideoContext1,
|
||
enumr: ID3D11VideoProcessorEnumerator,
|
||
vp: ID3D11VideoProcessor,
|
||
}
|
||
|
||
impl VideoConverter {
|
||
pub(crate) unsafe fn new(
|
||
device: &ID3D11Device,
|
||
context: &ID3D11DeviceContext,
|
||
width: u32,
|
||
height: u32,
|
||
hdr: bool,
|
||
) -> Result<Self> {
|
||
let vdev: ID3D11VideoDevice = device.cast().context("device -> ID3D11VideoDevice")?;
|
||
let vctx: ID3D11VideoContext1 = context.cast().context("context -> ID3D11VideoContext1")?;
|
||
let rate = DXGI_RATIONAL {
|
||
Numerator: 240,
|
||
Denominator: 1,
|
||
};
|
||
let desc = D3D11_VIDEO_PROCESSOR_CONTENT_DESC {
|
||
InputFrameFormat: D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE,
|
||
InputFrameRate: rate,
|
||
InputWidth: width,
|
||
InputHeight: height,
|
||
OutputFrameRate: rate,
|
||
OutputWidth: width,
|
||
OutputHeight: height,
|
||
Usage: D3D11_VIDEO_USAGE_PLAYBACK_NORMAL,
|
||
};
|
||
let enumr = vdev
|
||
.CreateVideoProcessorEnumerator(&desc)
|
||
.context("CreateVideoProcessorEnumerator")?;
|
||
let vp = vdev
|
||
.CreateVideoProcessor(&enumr, 0)
|
||
.context("CreateVideoProcessor")?;
|
||
|
||
// Full-range RGB in → studio-range YUV out. HDR: scRGB linear (G10) → BT.2020 PQ (G2084).
|
||
// SDR: sRGB (G22) → BT.709 (G22).
|
||
let (in_cs, out_cs) = if hdr {
|
||
(
|
||
DXGI_COLOR_SPACE_RGB_FULL_G10_NONE_P709,
|
||
DXGI_COLOR_SPACE_YCBCR_STUDIO_G2084_LEFT_P2020,
|
||
)
|
||
} else {
|
||
(
|
||
DXGI_COLOR_SPACE_RGB_FULL_G22_NONE_P709,
|
||
DXGI_COLOR_SPACE_YCBCR_STUDIO_G22_LEFT_P709,
|
||
)
|
||
};
|
||
vctx.VideoProcessorSetStreamColorSpace1(&vp, 0, in_cs);
|
||
vctx.VideoProcessorSetOutputColorSpace1(&vp, out_cs);
|
||
// One frame in, one frame out — no interpolation/auto-processing.
|
||
vctx.VideoProcessorSetStreamFrameFormat(&vp, 0, D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE);
|
||
|
||
Ok(Self {
|
||
vdev,
|
||
vctx,
|
||
enumr,
|
||
vp,
|
||
})
|
||
}
|
||
|
||
/// Convert `input` (BGRA or scRGB FP16) → `output` (NV12 or P010) on the video engine. Views are
|
||
/// created per call (cheap relative to the Blt) so the input texture can vary frame to frame.
|
||
pub(crate) unsafe fn convert(
|
||
&self,
|
||
input: &ID3D11Texture2D,
|
||
output: &ID3D11Texture2D,
|
||
) -> Result<()> {
|
||
let in_desc = D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC {
|
||
FourCC: 0,
|
||
ViewDimension: D3D11_VPIV_DIMENSION_TEXTURE2D,
|
||
Anonymous: D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC_0 {
|
||
Texture2D: D3D11_TEX2D_VPIV {
|
||
MipSlice: 0,
|
||
ArraySlice: 0,
|
||
},
|
||
},
|
||
};
|
||
let mut in_view: Option<ID3D11VideoProcessorInputView> = None;
|
||
self.vdev
|
||
.CreateVideoProcessorInputView(input, &self.enumr, &in_desc, Some(&mut in_view))
|
||
.context("CreateVideoProcessorInputView")?;
|
||
|
||
let out_desc = D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC {
|
||
ViewDimension: D3D11_VPOV_DIMENSION_TEXTURE2D,
|
||
Anonymous: D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC_0 {
|
||
Texture2D: D3D11_TEX2D_VPOV { MipSlice: 0 },
|
||
},
|
||
};
|
||
let mut out_view: Option<ID3D11VideoProcessorOutputView> = None;
|
||
self.vdev
|
||
.CreateVideoProcessorOutputView(output, &self.enumr, &out_desc, Some(&mut out_view))
|
||
.context("CreateVideoProcessorOutputView")?;
|
||
let out_view = out_view.context("null output view")?;
|
||
|
||
let stream = D3D11_VIDEO_PROCESSOR_STREAM {
|
||
Enable: true.into(),
|
||
pInputSurface: std::mem::ManuallyDrop::new(in_view),
|
||
..Default::default()
|
||
};
|
||
self.vctx
|
||
.VideoProcessorBlt(&self.vp, &out_view, 0, &[stream])
|
||
.context("VideoProcessorBlt")
|
||
}
|
||
}
|