diff --git a/crates/punktfunk-host/src/capture/dxgi.rs b/crates/punktfunk-host/src/capture/dxgi.rs index 88a06a9..5383841 100644 --- a/crates/punktfunk-host/src/capture/dxgi.rs +++ b/crates/punktfunk-host/src/capture/dxgi.rs @@ -28,13 +28,16 @@ use windows::Win32::Graphics::Direct3D11::{ D3D11_BLEND_SRC_ALPHA, D3D11_BUFFER_DESC, D3D11_COLOR_WRITE_ENABLE_ALL, D3D11_COMPARISON_NEVER, D3D11_CPU_ACCESS_READ, D3D11_CPU_ACCESS_WRITE, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_FILTER_MIN_MAG_MIP_POINT, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, - D3D11_MAP_WRITE_DISCARD, D3D11_RENDER_TARGET_BLEND_DESC, D3D11_SAMPLER_DESC, D3D11_SDK_VERSION, - D3D11_SUBRESOURCE_DATA, D3D11_TEXTURE2D_DESC, D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, - D3D11_USAGE_DYNAMIC, D3D11_USAGE_STAGING, D3D11_VIEWPORT, + D3D11_MAP_WRITE_DISCARD, D3D11_RENDER_TARGET_BLEND_DESC, D3D11_RENDER_TARGET_VIEW_DESC, + D3D11_RENDER_TARGET_VIEW_DESC_0, D3D11_RTV_DIMENSION_TEXTURE2D, D3D11_SAMPLER_DESC, + D3D11_SDK_VERSION, D3D11_SUBRESOURCE_DATA, D3D11_TEX2D_RTV, D3D11_TEXTURE2D_DESC, + D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, D3D11_USAGE_DYNAMIC, D3D11_USAGE_STAGING, + D3D11_VIEWPORT, }; use windows::Win32::Graphics::Dxgi::Common::{ - DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM, - DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_SAMPLE_DESC, + DXGI_FORMAT, DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_FORMAT_P010, DXGI_FORMAT_R10G10B10A2_UNORM, + DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16_UNORM, DXGI_FORMAT_R16_UNORM, + DXGI_SAMPLE_DESC, }; use windows::Win32::Graphics::Dxgi::{ CreateDXGIFactory1, IDXGIAdapter1, IDXGIDevice, IDXGIDevice1, IDXGIFactory1, IDXGIOutput1, @@ -936,6 +939,618 @@ impl HdrConverter { } } +/// Whether `PUNKTFUNK_HDR_SHADER_P010` is truthy (`1`/`true`/`yes`/`on`). 
/// Whether `PUNKTFUNK_HDR_SHADER_P010` is truthy: `1`, `true`, `yes` or `on`, compared
/// ASCII case-insensitively after trimming surrounding whitespace (so `TRUE` / `On` work too).
/// Unset, non-UTF-8 or any other value reads as `false`.
///
/// When set, the WGC HDR path emits P010 (BT.2020 PQ, 10-bit limited range) DIRECTLY from a shader
/// pass ([`HdrP010Converter`]) instead of tone-mapping to R10G10B10A2 and letting NVENC do the
/// RGB→YUV CSC on the contended SM. Default OFF → the current HDR path (R10→NVENC + the
/// VideoProcessor attempt) is byte-for-byte unchanged.
pub(crate) fn hdr_shader_p010_enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    std::env::var("PUNKTFUNK_HDR_SHADER_P010")
        .map(|v| {
            let v = v.trim();
            TRUTHY.iter().any(|t| v.eq_ignore_ascii_case(t))
        })
        .unwrap_or(false)
}

/// HLSL shared by both P010 pixel shaders ([`HDR_P010_Y_PS`], [`HDR_P010_UV_PS`]); it is textually
/// substituted for their `#include_common` marker before compilation. Declares the source
/// texture/sampler and the colour pipeline scRGB FP16 (linear, Rec.709 primaries, 1.0 = 80 nits) →
/// nits → BT.2020 linear → PQ, plus the BT.2020 non-constant-luminance RGB→Y′/Cb/Cr step, the 10-bit
/// studio-range quantization, and the code→UNORM mapping for P010's "code in the high 10 bits"
/// sample layout. Per the comments in the HLSL, the colour pipeline is meant to be IDENTICAL to
/// [`HDR_PS`]; only the final RGB→YUV + studio-range quantization differs.
const HDR_P010_COMMON: &str = r"
Texture2D tx : register(t0);
SamplerState sm : register(s0);
// Rec.709 → Rec.2020 primaries (linear). Same matrix as the R10 HdrConverter (mul(M, v)).
static const float3x3 BT709_TO_BT2020 = {
    0.627403914, 0.329283038, 0.043313048,
    0.069097292, 0.919540405, 0.011362303,
    0.016391439, 0.088013308, 0.895595253
};
float3 pq_oetf(float3 L) {
    // L normalized so 1.0 = 10000 nits. ST 2084. (Identical to HdrConverter.)
    const float m1 = 0.1593017578125;
    const float m2 = 78.84375;
    const float c1 = 0.8359375;
    const float c2 = 18.8515625;
    const float c3 = 18.6875;
    float3 Lp = pow(saturate(L), m1);
    return pow((c1 + c2 * Lp) / (1.0 + c3 * Lp), m2);
}
// scRGB FP16 sample -> PQ-encoded BT.2020 RGB in [0,1] (the SAME pixels the R10 path would store,
// before quantization). Used by both the luma and chroma passes so they agree bit-for-bit with the
// existing HdrConverter colour math + the Rust reference.
float3 scrgb_to_pq2020(float2 uv) {
    float3 scrgb = max(tx.Sample(sm, uv).rgb, 0.0); // scRGB can be negative (wide gamut); clamp
    float3 nits = scrgb * 80.0; // scRGB 1.0 = 80 nits
    float3 lin2020 = mul(BT709_TO_BT2020, nits); // primaries conversion (linear)
    return pq_oetf(lin2020 / 10000.0); // normalize to 10k nits, encode PQ -> [0,1]
}
// BT.2020 non-constant-luminance, on the PQ-encoded (gamma) RGB. Kr/Kg/Kb per Rec.2020.
static const float KR = 0.2627;
static const float KG = 0.6780;
static const float KB = 0.0593;
// 10-bit studio (limited) range codes. Y' -> [64, 940]; Cb/Cr -> [64, 960] (512 ± 448).
float studio_y_code(float3 rgb_pq) {
    float y = KR * rgb_pq.r + KG * rgb_pq.g + KB * rgb_pq.b; // [0,1]
    float code = 64.0 + 876.0 * y; // [64, 940]
    return clamp(code, 64.0, 940.0);
}
float2 studio_cbcr_code(float3 rgb_pq) {
    float y = KR * rgb_pq.r + KG * rgb_pq.g + KB * rgb_pq.b;
    float cb = (rgb_pq.b - y) / 1.8814; // ~[-0.5, 0.5]
    float cr = (rgb_pq.r - y) / 1.4746;
    float cbc = 512.0 + 896.0 * cb; // [64, 960]
    float crc = 512.0 + 896.0 * cr;
    return float2(clamp(cbc, 64.0, 960.0), clamp(crc, 64.0, 960.0));
}
// P010 stores the 10-bit code in the HIGH 10 bits of each 16-bit sample (code10 << 6). As an
// R16_UNORM / R16G16_UNORM render target the UNORM float that maps to that stored u16 is
// code10*64 / 65535.0. (Verified in hdr_p010_selftest against the readback.)
float code10_to_unorm(float code10) { return (code10 * 64.0) / 65535.0; }
";

/// P010 LUMA pass PS — full-res, writes Y′ to plane 0 (R16_UNORM RTV): runs the shared
/// scRGB → BT.2020 PQ pipeline from [`HDR_P010_COMMON`] on one sample, forms the limited-range Y′
/// code and returns it as the UNORM value whose stored u16 carries that code in its high 10 bits.
const HDR_P010_Y_PS: &str = r"
#include_common
float main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
    float3 pq = scrgb_to_pq2020(uv);
    float yc = studio_y_code(pq);
    return code10_to_unorm(yc);
}
";
/// P010 CHROMA pass PS — half-res, writes interleaved (Cb,Cr) to plane 1 (R16G16_UNORM RTV). Averages
/// the 2x2 scRGB source footprint of this chroma sample (box filter) IN scRGB-linear space before the
/// PQ encode, then forms Cb/Cr from the averaged-then-PQ-encoded RGB. `inv_src` = (1/srcW, 1/srcH).
///
/// Like the luma shader, the `#include_common` marker is replaced with the shared HLSL before
/// compilation. The constant buffer at `b0` is 16 bytes: `inv_src` followed by two padding floats.
const HDR_P010_UV_PS: &str = r"
#include_common
cbuffer C : register(b0) { float2 inv_src; float2 pad; };
float2 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET {
    // `uv` is the chroma-sample centre in [0,1]; the 4 co-sited luma texels sit at uv ± half a luma
    // texel in each axis. Average their scRGB (linear) values, then run the SAME PQ/CSC as the Y pass.
    float2 h = inv_src * 0.5;
    float3 a = max(tx.Sample(sm, uv + float2(-h.x, -h.y)).rgb, 0.0);
    float3 b = max(tx.Sample(sm, uv + float2( h.x, -h.y)).rgb, 0.0);
    float3 c = max(tx.Sample(sm, uv + float2(-h.x, h.y)).rgb, 0.0);
    float3 d = max(tx.Sample(sm, uv + float2( h.x, h.y)).rgb, 0.0);
    float3 scrgb = (a + b + c + d) * 0.25;
    float3 nits = scrgb * 80.0;
    float3 lin2020 = mul(BT709_TO_BT2020, nits);
    float3 pq = pq_oetf(lin2020 / 10000.0);
    float2 cc = studio_cbcr_code(pq);
    return float2(code10_to_unorm(cc.x), code10_to_unorm(cc.y));
}
";
This +/// planar-RTV mechanism needs a D3D11.3+ runtime + driver support; [`HdrP010Converter::convert`] +/// surfaces a clear error if `CreateRenderTargetView` rejects the plane format so the caller can fall +/// back to the existing R10 path. +pub(crate) struct HdrP010Converter { + vs: ID3D11VertexShader, + ps_y: ID3D11PixelShader, + ps_uv: ID3D11PixelShader, + sampler: ID3D11SamplerState, + /// Constant buffer for the chroma pass (inv_src texel size). 16 bytes. + cbuf: ID3D11Buffer, +} + +impl HdrP010Converter { + pub(crate) unsafe fn new(device: &ID3D11Device) -> Result { + // Inline the shared HLSL (D3DCompile has no include handler wired here). The two PS sources + // carry a `#include_common` marker we substitute before compiling. + let y_src = HDR_P010_Y_PS.replace("#include_common", HDR_P010_COMMON); + let uv_src = HDR_P010_UV_PS.replace("#include_common", HDR_P010_COMMON); + let vsb = compile_shader(HDR_VS, s!("main"), s!("vs_5_0"))?; + let yb = compile_shader(&y_src, s!("main"), s!("ps_5_0"))?; + let uvb = compile_shader(&uv_src, s!("main"), s!("ps_5_0"))?; + let mut vs = None; + device.CreateVertexShader(&vsb, None, Some(&mut vs))?; + let mut ps_y = None; + device.CreatePixelShader(&yb, None, Some(&mut ps_y))?; + let mut ps_uv = None; + device.CreatePixelShader(&uvb, None, Some(&mut ps_uv))?; + let sd = D3D11_SAMPLER_DESC { + // POINT: the Y pass samples a single texel centre exactly, and the UV pass does its OWN + // 2x2 box average via 4 explicit taps at texel centres (offset half a texel). Point + // sampling keeps each tap exact; the averaging is in the shader, not the sampler. 
+ Filter: D3D11_FILTER_MIN_MAG_MIP_POINT, + AddressU: D3D11_TEXTURE_ADDRESS_CLAMP, + AddressV: D3D11_TEXTURE_ADDRESS_CLAMP, + AddressW: D3D11_TEXTURE_ADDRESS_CLAMP, + ComparisonFunc: D3D11_COMPARISON_NEVER, + MaxLOD: f32::MAX, + ..Default::default() + }; + let mut sampler = None; + device.CreateSamplerState(&sd, Some(&mut sampler))?; + let cbd = D3D11_BUFFER_DESC { + ByteWidth: 16, // float2 inv_src + float2 pad + Usage: D3D11_USAGE_DYNAMIC, + BindFlags: D3D11_BIND_CONSTANT_BUFFER.0 as u32, + CPUAccessFlags: D3D11_CPU_ACCESS_WRITE.0 as u32, + ..Default::default() + }; + let mut cbuf = None; + device.CreateBuffer(&cbd, None, Some(&mut cbuf))?; + Ok(Self { + vs: vs.context("p010 vs")?, + ps_y: ps_y.context("p010 y ps")?, + ps_uv: ps_uv.context("p010 uv ps")?, + sampler: sampler.context("p010 sampler")?, + cbuf: cbuf.context("p010 cbuf")?, + }) + } + + /// Create a per-plane RTV of the P010 texture `dst` with the given single-plane `format` + /// (`R16_UNORM` for plane 0 luma, `R16G16_UNORM` for plane 1 chroma). The plane is selected by the + /// view format (planar-RTV semantics); MipSlice 0. + unsafe fn plane_rtv( + device: &ID3D11Device, + dst: &ID3D11Texture2D, + format: DXGI_FORMAT, + ) -> Result { + let desc = D3D11_RENDER_TARGET_VIEW_DESC { + Format: format, + ViewDimension: D3D11_RTV_DIMENSION_TEXTURE2D, + Anonymous: D3D11_RENDER_TARGET_VIEW_DESC_0 { + Texture2D: D3D11_TEX2D_RTV { MipSlice: 0 }, + }, + }; + let mut rtv: Option = None; + device + .CreateRenderTargetView( + dst, + Some(&desc as *const D3D11_RENDER_TARGET_VIEW_DESC), + Some(&mut rtv), + ) + .with_context(|| { + format!("CreateRenderTargetView(P010 plane, format={format:?}) — driver may not support planar RTVs") + })?; + rtv.context("p010 plane rtv null") + } + + /// Convert `src_srv` (FP16 scRGB, WxH) into `dst` (a `DXGI_FORMAT_P010` texture with + /// `BIND_RENDER_TARGET`). Two opaque passes: full-res luma → plane 0, half-res chroma → plane 1. 
+ /// `w`/`h` are the full luma dimensions (must be even). Returns `Err` if a plane RTV can't be + /// created (driver) so the caller can fall back to the R10 path. + pub(crate) unsafe fn convert( + &self, + device: &ID3D11Device, + ctx: &ID3D11DeviceContext, + src_srv: &ID3D11ShaderResourceView, + dst: &ID3D11Texture2D, + w: u32, + h: u32, + ) -> Result<()> { + let y_rtv = Self::plane_rtv(device, dst, DXGI_FORMAT_R16_UNORM)?; + let uv_rtv = Self::plane_rtv(device, dst, DXGI_FORMAT_R16G16_UNORM)?; + + // Update the chroma constant buffer (inverse source texel size). + let cb: [f32; 4] = [1.0 / w as f32, 1.0 / h as f32, 0.0, 0.0]; + let mut mapped = D3D11_MAPPED_SUBRESOURCE::default(); + if ctx + .Map(&self.cbuf, 0, D3D11_MAP_WRITE_DISCARD, 0, Some(&mut mapped)) + .is_ok() + { + std::ptr::copy_nonoverlapping(cb.as_ptr(), mapped.pData as *mut f32, cb.len()); + ctx.Unmap(&self.cbuf, 0); + } + + // Shared pipeline state. + ctx.OMSetBlendState(None, None, 0xffff_ffff); // opaque overwrite + ctx.VSSetShader(&self.vs, None); + ctx.PSSetShaderResources(0, Some(&[Some(src_srv.clone())])); + ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())])); + ctx.IASetInputLayout(None); + ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + + // --- LUMA pass: full-res, plane 0 --- + let vp_y = D3D11_VIEWPORT { + TopLeftX: 0.0, + TopLeftY: 0.0, + Width: w as f32, + Height: h as f32, + MinDepth: 0.0, + MaxDepth: 1.0, + }; + ctx.RSSetViewports(Some(&[vp_y])); + ctx.OMSetRenderTargets(Some(&[Some(y_rtv.clone())]), None); + ctx.PSSetShader(&self.ps_y, None); + ctx.Draw(3, 0); + ctx.OMSetRenderTargets(Some(&[None]), None); + + // --- CHROMA pass: half-res, plane 1 --- + let vp_uv = D3D11_VIEWPORT { + TopLeftX: 0.0, + TopLeftY: 0.0, + Width: (w / 2) as f32, + Height: (h / 2) as f32, + MinDepth: 0.0, + MaxDepth: 1.0, + }; + ctx.RSSetViewports(Some(&[vp_uv])); + ctx.OMSetRenderTargets(Some(&[Some(uv_rtv.clone())]), None); + ctx.PSSetShader(&self.ps_uv, None); + 
ctx.PSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())])); + ctx.Draw(3, 0); + + // Unbind for the next frame's re-RTV / NVENC read. + ctx.OMSetRenderTargets(Some(&[None]), None); + ctx.PSSetShaderResources(0, Some(&[None])); + Ok(()) + } +} + +/// f64 reference for the P010 colour math — the EXACT analogue of the HLSL in [`HDR_P010_COMMON`]. +/// Input is one scRGB pixel (linear, Rec.709 primaries, 1.0 = 80 nits, may be >1 for HDR). Output is +/// the 10-bit studio-range (Y, Cb, Cr) codes the shader should produce for a flat (constant) block. +/// Used by [`hdr_p010_selftest`]. +#[cfg(target_os = "windows")] +fn p010_reference(r: f64, g: f64, b: f64) -> (f64, f64, f64) { + fn pq_oetf(l: f64) -> f64 { + let l = l.clamp(0.0, 1.0); + let m1 = 0.1593017578125; + let m2 = 78.84375; + let c1 = 0.8359375; + let c2 = 18.8515625; + let c3 = 18.6875; + let lp = l.powf(m1); + ((c1 + c2 * lp) / (1.0 + c3 * lp)).powf(m2) + } + // scRGB -> nits -> BT.2020 linear (row-major matrix, mul(M, v)). + let (r, g, b) = (r.max(0.0) * 80.0, g.max(0.0) * 80.0, b.max(0.0) * 80.0); + let m = [ + [0.627403914, 0.329283038, 0.043313048], + [0.069097292, 0.919540405, 0.011362303], + [0.016391439, 0.088013308, 0.895595253], + ]; + let lr = m[0][0] * r + m[0][1] * g + m[0][2] * b; + let lg = m[1][0] * r + m[1][1] * g + m[1][2] * b; + let lb = m[2][0] * r + m[2][1] * g + m[2][2] * b; + // PQ encode (normalize to 10k nits). + let pr = pq_oetf(lr / 10000.0); + let pg = pq_oetf(lg / 10000.0); + let pb = pq_oetf(lb / 10000.0); + // BT.2020 non-constant-luminance, limited 10-bit. 
+ let (kr, kg, kb) = (0.2627, 0.6780, 0.0593); + let y = kr * pr + kg * pg + kb * pb; + let cb = (pb - y) / 1.8814; + let cr = (pr - y) / 1.4746; + let yc = (64.0 + 876.0 * y).clamp(64.0, 940.0); + let cbc = (512.0 + 896.0 * cb).clamp(64.0, 960.0); + let crc = (512.0 + 896.0 * cr).clamp(64.0, 960.0); + (yc, cbc, crc) +} + +/// Colour self-test for [`HdrP010Converter`] (the `hdr-p010-selftest` subcommand): create a hardware +/// D3D11 device, upload a known scRGB FP16 pattern, run the P010 shader passes, read the Y (plane 0) +/// and UV (plane 1) planes back from a staging copy, and compare against the [`p010_reference`] f64 +/// math. The ONLY validation we have without green-screening a live HDR stream. PASS if max abs error +/// Y ≤ 4 codes, U/V ≤ 5 codes (rounding + chroma averaging). Prints a per-colour table + PASS/FAIL. +#[cfg(target_os = "windows")] +pub fn hdr_p010_selftest() -> Result<()> { + use windows::Win32::Graphics::Direct3D::D3D_DRIVER_TYPE_HARDWARE; + use windows::Win32::Graphics::Dxgi::IDXGIAdapter; + + // 64x64, even dims. A 4x4 grid of 16x16 flat scRGB blocks (each 2x2 chroma footprint uniform → + // exact chroma comparison) covering pure R/G/B/white/black/gray at plausible HDR nit levels, plus + // a couple of bright (>1.0 scRGB) colours, then the rest is a gradient (compared on Y only). + const W: u32 = 64; + const H: u32 = 64; + const BLK: u32 = 16; + // (name, r, g, b) scRGB linear (1.0 = 80 nits). Mix of SDR-ish and HDR (>1.0) values. 
+ let named: [(&str, f32, f32, f32); 8] = [ + ("red1.0", 1.0, 0.0, 0.0), + ("green0.5", 0.0, 0.5, 0.0), + ("blue4.0", 0.0, 0.0, 4.0), + ("white1.0", 1.0, 1.0, 1.0), + ("black", 0.0, 0.0, 0.0), + ("gray0.5", 0.5, 0.5, 0.5), + ("white4.0", 4.0, 4.0, 4.0), + ("amber2.0", 2.0, 1.0, 0.0), + ]; + + let grid_cols = W / BLK; // 4 + let pixel_rgb = |x: u32, y: u32| -> (f32, f32, f32, bool) { + let idx = ((y / BLK) * grid_cols + (x / BLK)) as usize; + if idx < named.len() { + let (_, r, g, b) = named[idx]; + (r, g, b, true) + } else { + // Gradient (distinct per pixel; Y-only compare), within HDR scRGB range. + let r = (x as f32 / W as f32) * 3.0; + let g = (y as f32 / H as f32) * 3.0; + let b = ((x + y) as f32 / (W + H) as f32) * 3.0; + (r, g, b, false) + } + }; + + // Build the scRGB FP16 (R16G16B16A16_FLOAT) source as f16 bits. + let mut fp16 = vec![0u16; (W * H * 4) as usize]; + let mut flat = vec![false; (W * H) as usize]; + for y in 0..H { + for x in 0..W { + let (r, g, b, is_flat) = pixel_rgb(x, y); + let i = ((y * W + x) * 4) as usize; + fp16[i] = f32_to_f16(r); + fp16[i + 1] = f32_to_f16(g); + fp16[i + 2] = f32_to_f16(b); + fp16[i + 3] = f32_to_f16(1.0); + flat[(y * W + x) as usize] = is_flat; + } + } + + unsafe { + // Hardware D3D11 device (no adapter pin — the default GPU is fine for the self-test). + let mut device: Option = None; + let mut context: Option = None; + D3D11CreateDevice( + None::<&IDXGIAdapter>, + D3D_DRIVER_TYPE_HARDWARE, + HMODULE::default(), + D3D11_CREATE_DEVICE_BGRA_SUPPORT, + Some(&[D3D_FEATURE_LEVEL_11_0]), + D3D11_SDK_VERSION, + Some(&mut device), + None, + Some(&mut context), + ) + .context("D3D11CreateDevice(hardware) for hdr-p010-selftest")?; + let device = device.context("null device")?; + let context = context.context("null context")?; + + // Source FP16 texture (initialized) + SRV. 
+ let src_desc = D3D11_TEXTURE2D_DESC { + Width: W, + Height: H, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_R16G16B16A16_FLOAT, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, + ..Default::default() + }; + let init = D3D11_SUBRESOURCE_DATA { + pSysMem: fp16.as_ptr() as *const c_void, + SysMemPitch: W * 8, // 4 channels * 2 bytes + SysMemSlicePitch: 0, + }; + let mut src_tex: Option = None; + device + .CreateTexture2D(&src_desc, Some(&init), Some(&mut src_tex)) + .context("CreateTexture2D(fp16 src)")?; + let src_tex = src_tex.context("null src tex")?; + let mut src_srv: Option = None; + device + .CreateShaderResourceView(&src_tex, None, Some(&mut src_srv)) + .context("CreateShaderResourceView(fp16 src)")?; + let src_srv = src_srv.context("null src srv")?; + + // P010 destination texture (render-target bindable). + let p010_desc = D3D11_TEXTURE2D_DESC { + Width: W, + Height: H, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_P010, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32, + ..Default::default() + }; + let mut p010: Option = None; + device + .CreateTexture2D(&p010_desc, None, Some(&mut p010)) + .context("CreateTexture2D(P010 dst)")?; + let p010 = p010.context("null p010 tex")?; + + let conv = HdrP010Converter::new(&device)?; + conv.convert(&device, &context, &src_srv, &p010, W, H)?; + + // Staging copy of the whole P010 texture (both planes), MAP_READ. 
+ let stage_desc = D3D11_TEXTURE2D_DESC { + Width: W, + Height: H, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_P010, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_STAGING, + BindFlags: 0, + CPUAccessFlags: D3D11_CPU_ACCESS_READ.0 as u32, + ..Default::default() + }; + let mut staging: Option = None; + device + .CreateTexture2D(&stage_desc, None, Some(&mut staging)) + .context("CreateTexture2D(P010 staging)")?; + let staging = staging.context("null staging")?; + context.CopyResource(&staging, &p010); + + let mut map = D3D11_MAPPED_SUBRESOURCE::default(); + context + .Map(&staging, 0, D3D11_MAP_READ, 0, Some(&mut map)) + .context("Map(P010 staging)")?; + let row_pitch = map.RowPitch as usize; // bytes per luma row (in 16-bit samples: /2) + let base = map.pData as *const u8; + // DIAGNOSTIC (the uncertain layout spot — verify on the box if chroma is wrong): the mapped + // P010 plane offsets. Plane 0 (luma): H rows of W u16. Plane 1 (chroma): H/2 rows of W/2 + // *interleaved* (Cb,Cr) u16 pairs. P010 packs plane 1 after plane 0 at the SAME row pitch; the + // chroma plane begins at byte offset RowPitch * (luma height). For a STAGING texture that + // height is the created H (no inter-plane alignment). DepthPitch (total mapped size) lets us + // sanity-check: it should be ~ RowPitch * H * 3/2. If chroma reads garbage on the box, print + // these and adjust `chroma_base` (e.g. an aligned luma height). + tracing::info!( + row_pitch, + depth_pitch = map.DepthPitch, + expected_chroma_base = row_pitch * H as usize, + expected_total = row_pitch * H as usize * 3 / 2, + "hdr-p010-selftest: mapped P010 layout (verify chroma plane offset here if chroma is wrong)" + ); + // Plane 0 (luma): H rows of W u16. Plane 1 (chroma): H/2 rows of W/2 *interleaved* (Cb,Cr) + // u16 pairs, i.e. W u16 per chroma row. 
P010 packs plane 1 immediately after plane 0 at the + // SAME row pitch; per spec the chroma plane begins at an allocation offset of + // RowPitch * Height (luma rows). We read it from there. (DepthPitch is the full surface size; + // not all drivers report the chroma offset, so RowPitch*Height is the portable choice.) + let read_u16 = |byte_off: usize| -> u16 { + // SAFETY: `base` is the mapped staging pointer; all offsets are within the P010 surface + // (luma H*RowPitch + chroma (H/2)*RowPitch ≤ DepthPitch). Already in the fn's unsafe scope. + let p = base.add(byte_off) as *const u16; + p.read_unaligned() + }; + // Luma codes: stored u16 in the high 10 bits -> code10 = stored >> 6. + let mut y_codes = vec![0u16; (W * H) as usize]; + for y in 0..H { + for x in 0..W { + let off = (y as usize) * row_pitch + (x as usize) * 2; + y_codes[(y * W + x) as usize] = read_u16(off) >> 6; + } + } + let cw = W / 2; + let ch = H / 2; + let chroma_base = row_pitch * H as usize; // plane 1 offset + let mut cb_codes = vec![0u16; (cw * ch) as usize]; + let mut cr_codes = vec![0u16; (cw * ch) as usize]; + for cy in 0..ch { + for cx in 0..cw { + // Interleaved (Cb, Cr) per chroma sample → 2 u16 = 4 bytes per sample. + let off = chroma_base + (cy as usize) * row_pitch + (cx as usize) * 4; + cb_codes[(cy * cw + cx) as usize] = read_u16(off) >> 6; + cr_codes[(cy * cw + cx) as usize] = read_u16(off + 2) >> 6; + } + } + context.Unmap(&staging, 0); + + // Compare Y over every pixel. + let mut max_y_err = 0.0f64; + for y in 0..H { + for x in 0..W { + let (r, g, b, _) = pixel_rgb(x, y); + let (ry, _, _) = p010_reference(r as f64, g as f64, b as f64); + let got = y_codes[(y * W + x) as usize] as f64; + max_y_err = max_y_err.max((got - ry).abs()); + } + } + // Compare Cb/Cr over flat blocks only (uniform 2x2 footprint → exact reference). 
+ let mut max_u_err = 0.0f64; + let mut max_v_err = 0.0f64; + for cy in 0..ch { + for cx in 0..cw { + let (sx, sy) = (cx * 2, cy * 2); + let all_flat = + (0..2).all(|dy| (0..2).all(|dx| flat[((sy + dy) * W + (sx + dx)) as usize])); + if !all_flat { + continue; + } + let (r, g, b, _) = pixel_rgb(sx, sy); + let (_, rcb, rcr) = p010_reference(r as f64, g as f64, b as f64); + let gu = cb_codes[(cy * cw + cx) as usize] as f64; + let gv = cr_codes[(cy * cw + cx) as usize] as f64; + max_u_err = max_u_err.max((gu - rcb).abs()); + max_v_err = max_v_err.max((gv - rcr).abs()); + } + } + + // Per-colour table. + println!("HDR P010 self-test ({W}x{H}, BT.2020 PQ, 10-bit limited range)"); + println!( + " {:<10} {:>14} {:>14} {:>14}", + "color", "Y exp/got", "Cb exp/got", "Cr exp/got" + ); + for (idx, (name, r, g, b)) in named.iter().enumerate() { + let bx = (idx as u32 % grid_cols) * BLK + BLK / 2; + let by = (idx as u32 / grid_cols) * BLK + BLK / 2; + let (ey, ecb, ecr) = p010_reference(*r as f64, *g as f64, *b as f64); + let gy = y_codes[(by * W + bx) as usize] as f64; + let (ccx, ccy) = (bx / 2, by / 2); + let gu = cb_codes[(ccy * cw + ccx) as usize] as f64; + let gv = cr_codes[(ccy * cw + ccx) as usize] as f64; + println!( + " {:<10} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0}", + name, ey, gy, ecb, gu, ecr, gv + ); + } + println!( + " max abs error: Y={max_y_err:.2} (≤4) Cb={max_u_err:.2} (≤5) Cr={max_v_err:.2} (≤5)" + ); + + if max_y_err <= 4.0 && max_u_err <= 5.0 && max_v_err <= 5.0 { + println!("PASS"); + Ok(()) + } else { + println!("FAIL"); + bail!( + "HDR P010 self-test FAILED (Y={max_y_err:.2} Cb={max_u_err:.2} Cr={max_v_err:.2})" + ); + } + } +} + +/// Minimal f32 → IEEE-754 half (f16) bit pattern, for uploading the FP16 scRGB self-test pattern. Not +/// on any hot path; handles normals, subnormals, and the 1.0/0.0 constants we feed. 
/// Minimal f32 → IEEE-754 half (f16) bit pattern, for uploading the FP16 scRGB self-test pattern. Not
/// on any hot path. Handles zeros, subnormals, normals, overflow (→ ±Inf), Inf and NaN.
///
/// Rounding is to nearest with ties rounded away from zero (the first discarded mantissa bit is
/// added back). A carry out of the 10-bit mantissa propagates into the exponent, so e.g. the largest
/// f32 below 2.0 becomes 2.0, and values that round past the largest finite half become ±Inf.
///
/// Also compiled under `test` so this pure bit manipulation can be unit-tested on any host.
#[cfg(any(target_os = "windows", test))]
fn f32_to_f16(v: f32) -> u16 {
    let bits = v.to_bits();
    let sign = ((bits >> 16) & 0x8000) as u16;
    let raw_exp = (bits >> 23) & 0xff;
    let mant = bits & 0x007f_ffff;

    if raw_exp == 0xff {
        // Inf stays Inf; NaN stays a (quiet) NaN instead of silently turning into Inf.
        return if mant == 0 { sign | 0x7c00 } else { sign | 0x7e00 };
    }

    // Re-bias the exponent from f32 (127) to f16 (15).
    let exp = raw_exp as i32 - 127 + 15;
    if exp >= 0x1f {
        return sign | 0x7c00; // too large for half → ±Inf
    }
    if exp <= 0 {
        // Subnormal / zero in half precision.
        if exp < -10 {
            return sign; // too small → ±0
        }
        let mant = mant | 0x0080_0000; // implicit 1
        let shift = (14 - exp) as u32; // 14..=24
        let half_mant = (mant >> shift) as u16;
        let round = ((mant >> (shift - 1)) & 1) as u16;
        // Exponent bits are zero here, so a carry to 0x400 correctly yields the smallest normal.
        return sign | (half_mant + round);
    }

    let half_exp = (exp as u16) << 10;
    let half_mant = (mant >> 13) as u16;
    let round = ((mant >> 12) & 1) as u16;
    // ADD, not OR: when `half_mant + round` carries to 0x400 it must bump the exponent. OR-ing
    // loses that carry whenever the exponent's low bit is already set. The sum is at most 0x7c00
    // (exp 30, full mantissa, round up = Inf), so it never reaches the sign bit.
    sign | (half_exp + half_mant + round)
}
use super::dxgi::{ - find_output, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, VideoConverter, - WinCaptureTarget, + find_output, hdr_shader_p010_enabled, make_device, nudge_cursor_onto, D3d11Frame, HdrConverter, + HdrP010Converter, VideoConverter, WinCaptureTarget, }; use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; use anyhow::{bail, Context, Result}; @@ -130,6 +130,15 @@ pub struct WgcCapturer { hdr_conv: Option, fp16_src: Option, fp16_srv: Option, + /// `PUNKTFUNK_HDR_SHADER_P010` path: emit P010 (BT.2020 PQ 10-bit limited range) DIRECTLY from our + /// own shader (`HdrP010Converter`) so NVENC takes native P010 and skips its SM-side RGB→YUV CSC. + /// Gated by [`hdr_shader_p010_enabled`] AND `self.hdr`; `None`/empty when off → the existing R10 + + /// VideoProcessor paths run unchanged. `p010_disabled` latches a runtime failure (e.g. a driver + /// that rejects the planar plane RTV) so we fall back to the R10 path and stop retrying. + hdr_p010_conv: Option, + p010_out: Vec, + p010_idx: usize, + p010_disabled: bool, /// Ring of host-owned output textures (BGRA for SDR, R10G10B10A2 for HDR), rotated per processed /// frame. A ring — not one texture — is required because the encode loop is PIPELINED: NVENC /// encodes frame N (in place, registered by pointer) while this capturer produces frame N+1, so @@ -320,6 +329,10 @@ impl WgcCapturer { hdr_conv: None, fp16_src: None, fp16_srv: None, + hdr_p010_conv: None, + p010_out: Vec::new(), + p010_idx: 0, + p010_disabled: false, out_ring: Vec::new(), ring_idx: 0, video_conv: None, @@ -503,6 +516,49 @@ impl WgcCapturer { Some(out) } + /// `PUNKTFUNK_HDR_SHADER_P010` path: convert the OS-composited FP16 scRGB capture DIRECTLY to a + /// host-owned P010 texture (BT.2020 PQ, 10-bit limited range) via [`HdrP010Converter`] — two + /// shader passes writing the P010 planes. NVENC then takes native P010 and skips its internal + /// RGB→YUV CSC. 
Returns the next ring slot's P010 texture, or `Err` if the converter / a planar + /// plane RTV fails (the caller latches `p010_disabled` and falls back to the R10 path). + unsafe fn hdr_to_p010(&mut self, src: &ID3D11Texture2D) -> Result { + let slot = self.p010_idx; + // Lazily allocate the FP16 source (shared with the R10 path) + the P010 output ring. + self.ensure_fp16_src()?; + let fp16 = self.fp16_src.clone().context("fp16 src")?; + self.context.CopyResource(&fp16, src); + if self.p010_out.is_empty() { + let desc = tex_desc( + self.width, + self.height, + windows::Win32::Graphics::Dxgi::Common::DXGI_FORMAT_P010, + D3D11_BIND_RENDER_TARGET.0 as u32, + ); + for _ in 0..OUT_RING { + let mut t = None; + self.device + .CreateTexture2D(&desc, None, Some(&mut t)) + .context("CreateTexture2D(wgc p010 ring)")?; + self.p010_out.push(t.context("wgc p010 ring tex")?); + } + } + self.p010_idx = (self.p010_idx + 1) % self.p010_out.len(); + let out = self.p010_out[slot].clone(); + if self.hdr_p010_conv.is_none() { + self.hdr_p010_conv = Some(HdrP010Converter::new(&self.device)?); + } + let srv = self.fp16_srv.clone().context("fp16 srv")?; + self.hdr_p010_conv.as_ref().unwrap().convert( + &self.device, + &self.context, + &srv, + &out, + self.width, + self.height, + )?; + Ok(out) + } + fn process_frame(&mut self, frame: Direct3D11CaptureFrame) -> Result { unsafe { let surface = frame.Surface().context("frame Surface")?; @@ -513,11 +569,40 @@ impl WgcCapturer { .GetInterface() .context("GetInterface ID3D11Texture2D")?; + // GATED P010-shader path (`PUNKTFUNK_HDR_SHADER_P010`): for HDR, emit P010 (BT.2020 PQ + // 10-bit limited range) DIRECTLY from our shader so NVENC takes native P010 and skips its + // SM-side RGB→YUV CSC. Runs BEFORE the R10 + VideoProcessor path. A converter/plane-RTV + // failure latches `p010_disabled` → we fall through to the unchanged R10 path for the rest + // of the session. 
Default OFF → none of this executes and behaviour is byte-for-byte as + // today. + if self.hdr && !self.p010_disabled && hdr_shader_p010_enabled() { + match self.hdr_to_p010(&src) { + Ok(p010) => { + // The P010 output is host-owned (the ring), and the FP16 CopyResource read + // `src` synchronously on the immediate context before the shader passes — so we + // do NOT need to hold `frame` past here (unlike the SDR/R10 in-place paths). + // Dropping it returns the pool buffer to WGC immediately. + drop(frame); + self.last_present = Some((p010.clone(), PixelFormat::P010)); + return Ok(self.d3d11_frame(p010, PixelFormat::P010)); + } + Err(e) => { + tracing::warn!(error = %format!("{e:#}"), + "WGC: HDR P010 shader path failed — disabling it, falling back to R10"); + self.p010_disabled = true; + self.hdr_p010_conv = None; + self.p010_out.clear(); + } + } + } + // Preferred path: convert the OS-composited capture (cursor already in it) DIRECTLY to // NVENC's native YUV on the video processor — no CopyResource, no cursor draw, and NVENC // skips its internal RGB→YUV (the contended 3D step). WGC's multi-buffer pool + held set // means reading the pool texture directly does NOT serialize (unlike DDA's single-frame - // model). The frame is held until the async Blt finishes. + // model). The frame is held until the async Blt finishes. (HDR: the video processor can't + // ingest FP16 scRGB, so the Blt fails and we fall back to the R10 path below; the + // `PUNKTFUNK_HDR_SHADER_P010` branch above is the off-the-SM HDR path.) if let Some(yuv) = self.convert_to_yuv(&src, self.hdr) { let fmt = if self.hdr { PixelFormat::P010 diff --git a/crates/punktfunk-host/src/main.rs b/crates/punktfunk-host/src/main.rs index 4b48bad..717d463 100644 --- a/crates/punktfunk-host/src/main.rs +++ b/crates/punktfunk-host/src/main.rs @@ -130,6 +130,13 @@ fn real_main() -> Result<()> { // `PUNKTFUNK_NV12` convert is colour-correct. Prints PASS/FAIL + max Y/U/V error. 
#[cfg(target_os = "linux")] Some("nv12-selftest") => zerocopy::nv12_selftest(), + // HDR P010 colour self-test (Windows; no display/capture needed): upload a known scRGB FP16 + // pattern, run the `HdrP010Converter` shader → P010 on the GPU, read the Y/UV planes back, and + // compare against an f64 BT.2020-PQ limited-range reference. Validates the + // `PUNKTFUNK_HDR_SHADER_P010` colour math without green-screening a live HDR stream. Prints + // PASS/FAIL + max Y/Cb/Cr error. + #[cfg(target_os = "windows")] + Some("hdr-p010-selftest") => crate::capture::dxgi::hdr_p010_selftest(), // Compositor readiness probe: exit 0 iff the (detected or PUNKTFUNK_COMPOSITOR-forced) // compositor is up and able to create a virtual output *now*. A session-bringup // script polls this to gate on real readiness instead of a blind `sleep`. @@ -551,6 +558,9 @@ NOTES: \x20 punktfunk-host service install register an auto-start SYSTEM service + firewall rules\n\ \x20 punktfunk-host service uninstall remove the service + firewall rules\n\ \x20 punktfunk-host service start|stop|status\n\ - \x20 config: %ProgramData%\\punktfunk\\host.env" + \x20 config: %ProgramData%\\punktfunk\\host.env\n\ + \nWINDOWS DIAGNOSTICS:\n\ + \x20 punktfunk-host hdr-p010-selftest GPU colour check for the PUNKTFUNK_HDR_SHADER_P010 path\n\ + \x20 (scRGB FP16 -> P010 BT.2020 PQ shader vs an f64 reference)" ); }