diff --git a/crates/punktfunk-host/src/capture/linux.rs b/crates/punktfunk-host/src/capture/linux.rs index 9b6b103..7820025 100644 --- a/crates/punktfunk-host/src/capture/linux.rs +++ b/crates/punktfunk-host/src/capture/linux.rs @@ -466,6 +466,9 @@ mod pipewire { negotiated: Arc, /// Present when zero-copy is enabled: imports a dmabuf → CUDA device buffer. importer: Option, + /// `PUNKTFUNK_NV12`: on the tiled EGL/GL zero-copy path, convert to NV12 on the GPU and feed + /// NVENC native YUV (Tier 2A). Off ⇒ the BGRx path is unchanged. + nv12: bool, /// Rate-limit counter for the latest-frame-only diagnostic log (see `.process`). dbg_log_n: u64, } @@ -780,8 +783,17 @@ mod pipewire { // sample LINEAR). let modifier = (ud.modifier != 0).then_some(ud.modifier); if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) { - let imported = if modifier.is_some() { - importer.import(&plane, w as u32, h as u32, fourcc, modifier) + // NV12 convert (Tier 2A) only on the tiled EGL/GL path (`modifier.is_some()`): + // produce native YUV so NVENC skips its internal RGB→YUV CSC. The LINEAR/Vulkan + // (gamescope) path stays RGB — its convert isn't wired here. When NV12 is + // produced the frame's format is reported as `Nv12` so the encoder opens native. 
+ let nv12 = ud.nv12 && modifier.is_some(); + let imported = if let Some(m) = modifier { + if nv12 { + importer.import_nv12(&plane, w as u32, h as u32, fourcc, Some(m)) + } else { + importer.import(&plane, w as u32, h as u32, fourcc, Some(m)) + } } else { importer.import_linear(&plane, w as u32, h as u32) }; @@ -794,6 +806,7 @@ mod pipewire { w, h, modifier = ud.modifier, + nv12, "zero-copy: dmabuf imported to CUDA (no CPU copy)" ); } @@ -805,7 +818,7 @@ mod pipewire { width: w as u32, height: h as u32, pts_ns, - format: fmt, + format: if nv12 { PixelFormat::Nv12 } else { fmt }, payload: FramePayload::Cuda(devbuf), }); return; @@ -978,6 +991,12 @@ mod pipewire { "zero-copy: advertising EGL-importable dmabuf modifiers" ); } + if want_dmabuf && crate::zerocopy::nv12_enabled() { + tracing::info!( + "PUNKTFUNK_NV12: tiled dmabufs convert to NV12 (BT.709 limited) on the GPU — NVENC \ + fed native YUV (no internal RGB→YUV CSC)" + ); + } let data = UserData { info: VideoInfoRaw::default(), @@ -987,6 +1006,7 @@ mod pipewire { active, negotiated, importer, + nv12: crate::zerocopy::nv12_enabled(), dbg_log_n: 0, }; diff --git a/crates/punktfunk-host/src/encode/linux.rs b/crates/punktfunk-host/src/encode/linux.rs index a0bc40d..fd8a7e3 100644 --- a/crates/punktfunk-host/src/encode/linux.rs +++ b/crates/punktfunk-host/src/encode/linux.rs @@ -103,10 +103,14 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) { PixelFormat::Rgba => (Pixel::RGBA, false), PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0 PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0 - // Rgb10a2 (HDR) and NV12/P010 (the Windows video-processor YUV outputs) are produced only by - // the Windows capture/encode paths; the Linux capturer never emits them. Map to BGRA so the - // match is exhaustive — unreachable here. - PixelFormat::Rgb10a2 | PixelFormat::Nv12 | PixelFormat::P010 => (Pixel::BGRA, false), + // NV12 is native YUV: NVENC encodes it with NO internal RGB→YUV CSC (the Tier 2A win). 
On + // Linux it's produced by the GPU convert on the zero-copy tiled path (`PUNKTFUNK_NV12`); on + // Windows by the D3D11 video processor. + PixelFormat::Nv12 => (Pixel::NV12, false), + // Rgb10a2 (HDR) and P010 (the Windows 10-bit video-processor output) are produced only by + // the Windows paths; the Linux capturer never emits them. Map to BGRA so the match is + // exhaustive — unreachable here. + PixelFormat::Rgb10a2 | PixelFormat::P010 => (Pixel::BGRA, false), } } @@ -204,6 +208,21 @@ impl NvencEncoder { (*video.as_mut_ptr()).gop_size = -1; } + // NV12 path: we did the RGB→YUV conversion ourselves as BT.709 *limited* range, so signal + // that in the bitstream VUI (colorspace/range/primaries/transfer) — otherwise the client + // decoder assumes a default and the picture comes out washed-out / wrong-contrast. The + // RGB-input paths leave these unset (NVENC's internal CSC writes its own VUI). Matches the + // Windows NV12 path's BT.709 limited-range signalling. + if matches!(format, PixelFormat::Nv12) { + unsafe { + let raw = video.as_mut_ptr(); + (*raw).colorspace = ffi::AVColorSpace::AVCOL_SPC_BT709; + (*raw).color_range = ffi::AVColorRange::AVCOL_RANGE_MPEG; // limited/studio + (*raw).color_primaries = ffi::AVColorPrimaries::AVCOL_PRI_BT709; + (*raw).color_trc = ffi::AVColorTransferCharacteristic::AVCOL_TRC_BT709; + } + } + // For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA // hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context // *before* open (NVENC derives the device from `hw_frames_ctx`). 
@@ -419,9 +438,20 @@ impl NvencEncoder { ffi::av_frame_free(&mut f); bail!("av_hwframe_get_buffer(CUDA) failed ({r})"); } - let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr; - let dst_pitch = (*f).linesize[0] as usize; - if let Err(e) = crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch) { + // NV12 surfaces are two-plane (Y in data[0], interleaved UV in data[1]); the RGB + // surfaces are single-plane. Copy the matching layout into NVENC's pooled surface. + let copy_res = if buf.is_nv12() { + let y_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr; + let y_pitch = (*f).linesize[0] as usize; + let uv_ptr = (*f).data[1] as crate::zerocopy::cuda::CUdeviceptr; + let uv_pitch = (*f).linesize[1] as usize; + crate::zerocopy::cuda::copy_nv12_to_device(buf, y_ptr, y_pitch, uv_ptr, uv_pitch) + } else { + let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr; + let dst_pitch = (*f).linesize[0] as usize; + crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch) + }; + if let Err(e) = copy_res { ffi::av_frame_free(&mut f); return Err(e).context("copy imported buffer into NVENC surface"); } diff --git a/crates/punktfunk-host/src/main.rs b/crates/punktfunk-host/src/main.rs index 800d66c..4b48bad 100644 --- a/crates/punktfunk-host/src/main.rs +++ b/crates/punktfunk-host/src/main.rs @@ -125,6 +125,11 @@ fn real_main() -> Result<()> { // Zero-copy FFI/GPU probe: init the EGL importer + CUDA context (no capture needed). #[cfg(target_os = "linux")] Some("zerocopy-probe") => zerocopy::probe(), + // NV12 colour self-test (no display/capture needed): convert a known RGBA pattern to NV12 + // on the GPU and compare against a BT.709 limited-range reference. Validates the Tier 2A + // `PUNKTFUNK_NV12` convert is colour-correct. Prints PASS/FAIL + max Y/U/V error. 
+ #[cfg(target_os = "linux")] + Some("nv12-selftest") => zerocopy::nv12_selftest(), // Compositor readiness probe: exit 0 iff the (detected or PUNKTFUNK_COMPOSITOR-forced) // compositor is up and able to create a virtual output *now*. A session-bringup // script polls this to gate on real readiness instead of a blind `sleep`. diff --git a/crates/punktfunk-host/src/zerocopy/cuda.rs b/crates/punktfunk-host/src/zerocopy/cuda.rs index 52465c1..4aed847 100644 --- a/crates/punktfunk-host/src/zerocopy/cuda.rs +++ b/crates/punktfunk-host/src/zerocopy/cuda.rs @@ -159,6 +159,31 @@ fn ck(r: CUresult, what: &str) -> Result<()> { } } +/// Copy a pitched device plane `(src_ptr, src_pitch)` down to a tightly-packed host buffer of +/// `width_bytes`×`height` (no row padding). Synchronous on the priority stream. Used by the NV12 +/// self-test to read planes back for the colour comparison; not on the hot path. +pub fn read_plane_to_host( + src_ptr: CUdeviceptr, + src_pitch: usize, + width_bytes: usize, + height: usize, +) -> Result<Vec<u8>> { + let mut host = vec![0u8; width_bytes * height]; + let copy = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_DEVICE, + srcDevice: src_ptr, + srcPitch: src_pitch, + dstMemoryType: 1, // CU_MEMORYTYPE_HOST + dstHost: host.as_mut_ptr() as *mut c_void, + dstPitch: width_bytes, + WidthInBytes: width_bytes, + Height: height, + ..Default::default() + }; + unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->host)")? }; + Ok(host) +} + /// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live /// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread. 
#[derive(Clone, Copy)] @@ -265,11 +290,52 @@ fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> { Ok((ptr, pitch)) } +/// Allocate the two pitched planes of an NV12 surface (8-bit BT.709 4:2:0): a `width`-byte Y plane +/// (W×H, 1 byte/px) and an interleaved chroma plane (W/2 × H/2 samples, 2 bytes/sample → W bytes +/// wide). Each plane is its own `cuMemAllocPitch_v2` allocation with its own driver-chosen pitch, so +/// callers must use the per-plane pitch. Returns `((y_ptr, y_pitch), (uv_ptr, uv_pitch))`. +fn alloc_pitched_nv12( + width: u32, + height: u32, +) -> Result<((CUdeviceptr, usize), (CUdeviceptr, usize))> { + let mut y_ptr: CUdeviceptr = 0; + let mut y_pitch: usize = 0; + let mut uv_ptr: CUdeviceptr = 0; + let mut uv_pitch: usize = 0; + unsafe { + ck( + cuMemAllocPitch_v2( + &mut y_ptr, + &mut y_pitch, + width as usize, + height as usize, + 16, + ), + "cuMemAllocPitch_v2(Y)", + )?; + // Chroma is W/2 samples wide at 2 bytes each = W bytes; H/2 rows. + ck( + cuMemAllocPitch_v2( + &mut uv_ptr, + &mut uv_pitch, + (width as usize / 2) * 2, + (height as usize / 2).max(1), + 16, + ), + "cuMemAllocPitch_v2(UV)", + )?; + } + Ok(((y_ptr, y_pitch), (uv_ptr, uv_pitch))) +} + /// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the /// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and -/// returns its allocation here. Bulk-freed when the last reference drops. +/// returns its allocation here. Bulk-freed when the last reference drops. For NV12 each free entry +/// is the Y plane *and* its paired UV plane (allocated/recycled/freed together). struct PoolInner { free: Vec<CUdeviceptr>, + /// NV12 only: the UV plane paired with each Y plane in `free` (same index, same length). 
+ free_uv: Vec, } impl Drop for PoolInner { @@ -281,6 +347,9 @@ impl Drop for PoolInner { for &p in &self.free { let _ = cuMemFree_v2(p); } + for &p in &self.free_uv { + let _ = cuMemFree_v2(p); + } } } } @@ -294,6 +363,8 @@ pub struct BufferPool { width: u32, height: u32, pitch: usize, + /// NV12 pools carry a second (chroma) pitch; `Some` ⇒ buffers from this pool have a UV plane. + uv_pitch: Option, } impl BufferPool { @@ -302,10 +373,30 @@ impl BufferPool { pub fn new(width: u32, height: u32) -> Result { let (ptr, pitch) = alloc_pitched(width, height)?; Ok(BufferPool { - inner: Arc::new(Mutex::new(PoolInner { free: vec![ptr] })), + inner: Arc::new(Mutex::new(PoolInner { + free: vec![ptr], + free_uv: Vec::new(), + })), width, height, pitch, + uv_pitch: None, + }) + } + + /// Create a pool of NV12 two-plane surfaces (Y + interleaved UV) for `width`x`height`. Allocates + /// one pair up front to learn the driver's per-plane pitches (constant for a given width). + pub fn new_nv12(width: u32, height: u32) -> Result { + let ((y_ptr, y_pitch), (uv_ptr, uv_pitch)) = alloc_pitched_nv12(width, height)?; + Ok(BufferPool { + inner: Arc::new(Mutex::new(PoolInner { + free: vec![y_ptr], + free_uv: vec![uv_ptr], + })), + width, + height, + pitch: y_pitch, + uv_pitch: Some(uv_pitch), }) } @@ -318,8 +409,31 @@ impl BufferPool { } /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this - /// pool when dropped (after the consumer has synchronized, so the GPU is done with it). + /// pool when dropped (after the consumer has synchronized, so the GPU is done with it). For an + /// NV12 pool the returned buffer carries both the Y and the paired UV plane. pub fn get(&self) -> Result { + if let Some(uv_pitch) = self.uv_pitch { + let reuse = { + let mut g = self.inner.lock().unwrap(); + g.free.pop().map(|y| (y, g.free_uv.pop())) + }; + let (ptr, uv_ptr) = match reuse { + // Y and UV are pushed/popped together, so a popped Y always has its UV. 
+ Some((y, Some(uv))) => (y, uv), + _ => { + let ((y, _), (uv, _)) = alloc_pitched_nv12(self.width, self.height)?; + (y, uv) + } + }; + return Ok(DeviceBuffer { + ptr, + pitch: self.pitch, + width: self.width, + height: self.height, + uv: Some((uv_ptr, uv_pitch)), + pool: Some(self.inner.clone()), + }); + } let reuse = self.inner.lock().unwrap().free.pop(); let ptr = match reuse { Some(p) => p, @@ -330,6 +444,7 @@ impl BufferPool { pitch: self.pitch, width: self.width, height: self.height, + uv: None, pool: Some(self.inner.clone()), }) } @@ -343,6 +458,9 @@ pub struct DeviceBuffer { pub pitch: usize, pub width: u32, pub height: u32, + /// NV12 only: the interleaved chroma plane `(ptr, pitch)` paired with the Y plane in [`ptr`]. + /// `None` for the default 4-byte RGB/BGRx path. When `Some`, [`ptr`] is the Y plane (1 byte/px). + pub uv: Option<(CUdeviceptr, usize)>, pool: Option>>, } @@ -355,9 +473,29 @@ impl DeviceBuffer { pitch, width, height, + uv: None, pool: None, }) } + + /// Allocate a standalone (un-pooled) NV12 two-plane buffer. Prefer [`BufferPool::new_nv12`] on + /// the hot path; used by the self-test. + pub fn alloc_nv12(width: u32, height: u32) -> Result { + let ((y_ptr, y_pitch), (uv_ptr, uv_pitch)) = alloc_pitched_nv12(width, height)?; + Ok(DeviceBuffer { + ptr: y_ptr, + pitch: y_pitch, + width, + height, + uv: Some((uv_ptr, uv_pitch)), + pool: None, + }) + } + + /// True if this buffer carries an NV12 chroma plane. + pub fn is_nv12(&self) -> bool { + self.uv.is_some() + } } impl Drop for DeviceBuffer { @@ -366,8 +504,13 @@ impl Drop for DeviceBuffer { return; } if let Some(pool) = &self.pool { - // Recycle (the consumer synchronized before dropping, so the GPU is done with it). - pool.lock().unwrap().free.push(self.ptr); + // Recycle (the consumer synchronized before dropping, so the GPU is done with it). Y and + // its paired UV go back together so `get` can repair them as a unit. 
+ let mut g = pool.lock().unwrap(); + g.free.push(self.ptr); + if let Some((uv_ptr, _)) = self.uv { + g.free_uv.push(uv_ptr); + } } else { // The buffer may be freed on the encode thread; cuMemFree needs a current context. unsafe { @@ -375,6 +518,9 @@ impl Drop for DeviceBuffer { let _ = cuCtxSetCurrent(c.0); } let _ = cuMemFree_v2(self.ptr); + if let Some((uv_ptr, _)) = self.uv { + let _ = cuMemFree_v2(uv_ptr); + } } } } @@ -440,6 +586,62 @@ impl RegisteredTexture { res } } + + /// Map this texture for the frame and copy its array into the device plane `(dst_ptr, + /// dst_pitch)`, taking `width_bytes`×`height` bytes (the GL internal format dictates + /// `width_bytes`: `width*1` for an `R8` luma target, `(width/2)*2` for an `RG8` chroma target). + /// Synchronized on our priority stream before unmap (so the source dmabuf is safe to recycle). + /// Always unmaps, even on copy error. + fn copy_mapped_plane( + &mut self, + dst_ptr: CUdeviceptr, + dst_pitch: usize, + width_bytes: usize, + height: usize, + ) -> Result<()> { + unsafe { + ck( + cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()), + "cuGraphicsMapResources", + )?; + let mut array: CUarray = std::ptr::null_mut(); + if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 { + let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); + bail!("cuGraphicsSubResourceGetMappedArray failed"); + } + let copy = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_ARRAY, + srcArray: array, + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: dst_ptr, + dstPitch: dst_pitch, + WidthInBytes: width_bytes, + Height: height, + ..Default::default() + }; + let res = copy_blocking(&copy, "cuMemcpy2DAsync_v2(plane)"); + let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); + res + } + } +} + +/// Copy the two NV12 convert targets (registered `R8` luma + `RG8` chroma GL textures) into `dst`'s +/// Y and UV planes. 
`dst` must be an NV12 buffer (`dst.uv` set). The luma plane is `width`×`height` +/// bytes; the chroma plane is `(width/2)·2` bytes wide × `height/2` rows. Both copies sync on our +/// priority stream before returning, so the dmabuf is safe to recycle once this returns. +pub fn copy_mapped_nv12( + y_tex: &mut RegisteredTexture, + uv_tex: &mut RegisteredTexture, + dst: &DeviceBuffer, +) -> Result<()> { + let (uv_ptr, uv_pitch) = dst + .uv + .ok_or_else(|| anyhow::anyhow!("copy_mapped_nv12 on a non-NV12 buffer"))?; + let w = dst.width as usize; + let h = dst.height as usize; + y_tex.copy_mapped_plane(dst.ptr, dst.pitch, w, h)?; + uv_tex.copy_mapped_plane(uv_ptr, uv_pitch, (w / 2) * 2, h / 2) } /// Copy a pitched device buffer into another device region (device→device), e.g. our imported @@ -464,6 +666,50 @@ pub fn copy_device_to_device( unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->dev)") } } +/// Copy our imported NV12 [`DeviceBuffer`] (Y + UV planes) into NVENC's two-plane CUDA surface +/// `(y_dst, y_pitch)` / `(uv_dst, uv_pitch)` (`av_hwframe_get_buffer`'s `data[0]`/`data[1]` + +/// `linesize[0]`/`linesize[1]`). The Y plane is `width`×`height` bytes; the chroma plane is +/// `(width/2)·2` bytes × `height/2` rows. The caller must have the shared context current. 
+pub fn copy_nv12_to_device( + src: &DeviceBuffer, + y_dst: CUdeviceptr, + y_pitch: usize, + uv_dst: CUdeviceptr, + uv_pitch: usize, +) -> Result<()> { + let (src_uv_ptr, src_uv_pitch) = src + .uv + .ok_or_else(|| anyhow::anyhow!("copy_nv12_to_device on a non-NV12 buffer"))?; + let w = src.width as usize; + let h = src.height as usize; + let y = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_DEVICE, + srcDevice: src.ptr, + srcPitch: src.pitch, + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: y_dst, + dstPitch: y_pitch, + WidthInBytes: w, // 1 byte/px luma + Height: h, + ..Default::default() + }; + let uv = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_DEVICE, + srcDevice: src_uv_ptr, + srcPitch: src_uv_pitch, + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: uv_dst, + dstPitch: uv_pitch, + WidthInBytes: (w / 2) * 2, // 2 bytes/sample interleaved U,V + Height: h / 2, + ..Default::default() + }; + unsafe { + copy_blocking(&y, "cuMemcpy2DAsync_v2(nv12 Y dev->dev)")?; + copy_blocking(&uv, "cuMemcpy2DAsync_v2(nv12 UV dev->dev)") + } +} + impl Drop for RegisteredTexture { fn drop(&mut self) { if !self.resource.is_null() { diff --git a/crates/punktfunk-host/src/zerocopy/egl.rs b/crates/punktfunk-host/src/zerocopy/egl.rs index bf7eaf8..eee3acd 100644 --- a/crates/punktfunk-host/src/zerocopy/egl.rs +++ b/crates/punktfunk-host/src/zerocopy/egl.rs @@ -34,6 +34,13 @@ const GL_TEXTURE_MAG_FILTER: u32 = 0x2800; const GL_LINEAR: c_int = 0x2601; const GL_NEAREST: c_int = 0x2600; const GL_RGBA8: u32 = 0x8058; +// Single/dual-channel 8-bit formats for the NV12 convert targets: R8 luma (full-res), +// RG8 interleaved chroma (half-res). The `_RED`/`_RG` enums are the matching client formats. +const GL_R8: u32 = 0x8229; +const GL_RG8: u32 = 0x822B; +// Client pixel format/type for texture uploads (self-test only): RGBA bytes. 
+const GL_RGBA: u32 = 0x1908; +const GL_UNSIGNED_BYTE: u32 = 0x1401; const GL_FRAMEBUFFER: u32 = 0x8D40; const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0; const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5; @@ -54,6 +61,7 @@ extern "C" { fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int); fn glGetError() -> u32; fn glGenFramebuffers(n: c_int, framebuffers: *mut u32); + fn glDeleteFramebuffers(n: c_int, framebuffers: *const u32); fn glBindFramebuffer(target: u32, framebuffer: u32); fn glFramebufferTexture2D( target: u32, @@ -65,6 +73,7 @@ extern "C" { fn glCheckFramebufferStatus(target: u32) -> u32; fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int); fn glGenVertexArrays(n: c_int, arrays: *mut u32); + fn glDeleteVertexArrays(n: c_int, arrays: *const u32); fn glBindVertexArray(array: u32); fn glDrawArrays(mode: u32, first: c_int, count: c_int); fn glActiveTexture(texture: u32); @@ -81,6 +90,18 @@ extern "C" { fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int); fn glGetUniformLocation(program: u32, name: *const i8) -> c_int; fn glUniform1i(location: c_int, v0: c_int); + fn glDeleteProgram(program: u32); + fn glTexSubImage2D( + target: u32, + level: c_int, + xoffset: c_int, + yoffset: c_int, + width: c_int, + height: c_int, + format: u32, + type_: u32, + pixels: *const c_void, + ); } #[link(name = "gbm")] @@ -97,6 +118,17 @@ type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void); const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n"; const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n"; +// NV12 BT.709 LIMITED-range convert from full-range RGB in [0,1]. 
Two passes share `VERT_SRC` and +// the same source texture (the de-tiled dmabuf): +// Y pass → GL_R8 luma, full-res: Y = (16 + 219·(0.2126R+0.7152G+0.0722B))/255 +// UV pass → GL_RG8 chroma, half-res (GL_LINEAR averages the 2×2 footprint): +// U = (128 + 224·(-0.1146R-0.3854G+0.5000B))/255 → R channel +// V = (128 + 224·( 0.5000R-0.4542G-0.0458B))/255 → G channel +// RG8's (R=U, G=V) byte order matches NV12's interleaved [U,V]. All outputs clamped to [0,1]. +// Matches the Windows VideoConverter (BT.709, limited/studio range) so the two hosts look identical. +const FRAG_Y_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float Y=(16.0+219.0*(0.2126*c.r+0.7152*c.g+0.0722*c.b))/255.0;o_color=vec4(clamp(Y,0.0,1.0),0.0,0.0,1.0);}\n"; +const FRAG_UV_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float U=(128.0+224.0*(-0.1146*c.r-0.3854*c.g+0.5000*c.b))/255.0;float V=(128.0+224.0*(0.5000*c.r-0.4542*c.g-0.0458*c.b))/255.0;o_color=vec4(clamp(U,0.0,1.0),clamp(V,0.0,1.0),0.0,1.0);}\n"; + unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result { let sh = glCreateShader(kind); ensure!(sh != 0, "glCreateShader failed"); @@ -113,9 +145,11 @@ unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result { Ok(sh) } -unsafe fn compile_program() -> Result { +/// Compile+link the fullscreen-triangle program with fragment source `frag` and bind its `image` +/// sampler to texture unit 0. 
+unsafe fn compile_program_with(frag: &[u8]) -> Result { let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?; - let fs = compile_shader(GL_FRAGMENT_SHADER, FRAG_SRC)?; + let fs = compile_shader(GL_FRAGMENT_SHADER, frag)?; let prog = glCreateProgram(); glAttachShader(prog, vs); glAttachShader(prog, fs); @@ -134,6 +168,10 @@ unsafe fn compile_program() -> Result { Ok(prog) } +unsafe fn compile_program() -> Result { + compile_program_with(FRAG_SRC) +} + /// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture. struct GlBlit { program: u32, @@ -230,6 +268,165 @@ impl GlBlit { } } +/// Per-size GL machinery to convert a dmabuf EGLImage into an NV12 (BT.709 limited-range) pair — +/// the [`GlBlit`] analogue for the `PUNKTFUNK_NV12` path. Two passes share `src_tex`: a full-res Y +/// pass into a CUDA-registrable `GL_R8` texture and a half-res UV pass into a `GL_RG8` texture. +/// Feeding NVENC native NV12 deletes its internal RGB→YUV CSC (which otherwise runs on the SM that a +/// saturating game pins at 100%); the convert here replaces the BGRx swizzle [`GlBlit`] did, at ~the +/// same 3D cost. +struct Nv12Blit { + y_program: u32, + uv_program: u32, + vao: u32, + y_fbo: u32, + uv_fbo: u32, + /// CUDA-registrable luma target (immutable `GL_R8`, W×H). + y_tex: u32, + /// CUDA-registrable chroma target (immutable `GL_RG8`, W/2 × H/2). + uv_tex: u32, + /// Source texture re-targeted to each frame's EGLImage. `GL_LINEAR` so the UV pass averages 2×2. + src_tex: u32, + width: u32, + height: u32, + y_registered: cuda::RegisteredTexture, + uv_registered: cuda::RegisteredTexture, + /// Recycled NV12 device buffers (two-plane) handed to the encoder. + pool: cuda::BufferPool, + /// Self-test only: whether `src_tex` has had immutable RGBA8 storage allocated for the upload + /// path (the live path retargets `src_tex` via EGLImage instead, never allocating storage). 
+ test_src_storage: bool, +} + +impl Nv12Blit { + unsafe fn new(width: u32, height: u32) -> Result { + ensure!( + width % 2 == 0 && height % 2 == 0, + "NV12 convert needs even dimensions (got {width}x{height})" + ); + let y_program = compile_program_with(FRAG_Y_SRC)?; + let uv_program = compile_program_with(FRAG_UV_SRC)?; + let mut vao = 0u32; + glGenVertexArrays(1, &mut vao); + let mut fbos = [0u32; 2]; + glGenFramebuffers(2, fbos.as_mut_ptr()); + let (y_fbo, uv_fbo) = (fbos[0], fbos[1]); + + // Luma target: GL_R8 at full resolution. + let mut y_tex = 0u32; + glGenTextures(1, &mut y_tex); + glBindTexture(GL_TEXTURE_2D, y_tex); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, width as c_int, height as c_int); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + // Chroma target: GL_RG8 at half resolution (R=U, G=V). + let mut uv_tex = 0u32; + glGenTextures(1, &mut uv_tex); + glBindTexture(GL_TEXTURE_2D, uv_tex); + glTexStorage2D( + GL_TEXTURE_2D, + 1, + GL_RG8, + (width / 2) as c_int, + (height / 2) as c_int, + ); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + // Source: GL_LINEAR so the half-res UV pass averages the 2×2 chroma footprint. 
+ let mut src_tex = 0u32; + glGenTextures(1, &mut src_tex); + glBindTexture(GL_TEXTURE_2D, src_tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glBindTexture(GL_TEXTURE_2D, 0); + + for (fbo, tex) in [(y_fbo, y_tex), (uv_fbo, uv_tex)] { + glBindFramebuffer(GL_FRAMEBUFFER, fbo); + glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex, 0); + let status = glCheckFramebufferStatus(GL_FRAMEBUFFER); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + ensure!( + status == GL_FRAMEBUFFER_COMPLETE, + "NV12 blit FBO incomplete ({status:#x}) — GL_R8/GL_RG8 not renderable?" + ); + } + // Register both convert targets with CUDA once (per-resolution), + the NV12 two-plane pool. + let y_registered = cuda::RegisteredTexture::register_gl(y_tex)?; + let uv_registered = cuda::RegisteredTexture::register_gl(uv_tex)?; + let pool = cuda::BufferPool::new_nv12(width, height)?; + Ok(Nv12Blit { + y_program, + uv_program, + vao, + y_fbo, + uv_fbo, + y_tex, + uv_tex, + src_tex, + width, + height, + y_registered, + uv_registered, + pool, + test_src_storage: false, + }) + } + + /// Bind `image` to the source texture and run both convert passes into `y_tex`/`uv_tex`. + /// + /// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`. + unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> { + glBindTexture(GL_TEXTURE_2D, self.src_tex); + let _ = glGetError(); + egl_image_target(GL_TEXTURE_2D, image); + let e = glGetError(); + glBindTexture(GL_TEXTURE_2D, 0); + ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})"); + self.run_passes() + } + + /// Run the two convert passes from whatever is currently in `src_tex` (caller populated it). + /// Shared by [`run`](Self::run) (EGLImage source) and the self-test (uploaded RGBA source). + /// + /// # Safety: the GL context is current on this thread. 
+ unsafe fn run_passes(&self) -> Result<()> { + glActiveTexture(GL_TEXTURE0); + glBindVertexArray(self.vao); + // Y pass: full-res into the R8 target. + glBindFramebuffer(GL_FRAMEBUFFER, self.y_fbo); + glViewport(0, 0, self.width as c_int, self.height as c_int); + glUseProgram(self.y_program); + glBindTexture(GL_TEXTURE_2D, self.src_tex); + glDrawArrays(GL_TRIANGLES, 0, 3); + // UV pass: half-res into the RG8 target (GL_LINEAR averages the 2×2). + glBindFramebuffer(GL_FRAMEBUFFER, self.uv_fbo); + glViewport(0, 0, (self.width / 2) as c_int, (self.height / 2) as c_int); + glUseProgram(self.uv_program); + glBindTexture(GL_TEXTURE_2D, self.src_tex); + glDrawArrays(GL_TRIANGLES, 0, 3); + + glBindVertexArray(0); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + glFlush(); // submit GL work before CUDA maps the textures + Ok(()) + } +} + +impl Drop for Nv12Blit { + fn drop(&mut self) { + unsafe { + glDeleteTextures(1, &self.y_tex); + glDeleteTextures(1, &self.uv_tex); + glDeleteTextures(1, &self.src_tex); + glDeleteFramebuffers(2, [self.y_fbo, self.uv_fbo].as_ptr()); + glDeleteVertexArrays(1, &self.vao); + glDeleteProgram(self.y_program); + glDeleteProgram(self.uv_program); + } + } +} + /// One dmabuf plane as delivered by PipeWire (single-plane for BGRx). #[derive(Clone, Copy, Debug)] pub struct DmabufPlane { @@ -252,6 +449,8 @@ pub struct EglImporter { egl_image_target: EglImageTargetFn, /// Lazily-created GL blit machinery (recreated if the frame size changes). blit: Option, + /// Lazily-created NV12 convert machinery (`PUNKTFUNK_NV12` path; recreated on size change). + nv12_blit: Option, /// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA), /// created lazily on the first LINEAR frame, + the destination pool. 
vk: Option, @@ -355,6 +554,7 @@ impl EglImporter { _gl_ctx: gl_ctx, egl_image_target, blit: None, + nv12_blit: None, vk: None, linear_pool: None, gbm, @@ -448,6 +648,33 @@ impl EglImporter { height: u32, fourcc: u32, modifier: Option, + ) -> Result { + self.import_inner(plane, width, height, fourcc, modifier, false) + } + + /// Like [`import`](Self::import), but de-tiles **and converts** the dmabuf to NV12 (BT.709 + /// limited range) on the GPU — the `PUNKTFUNK_NV12` path — so NVENC can encode native YUV with + /// no internal RGB→YUV CSC. The returned [`DeviceBuffer`] carries both NV12 planes + /// (`DeviceBuffer::is_nv12`). Only the tiled EGL/GL path supports this (LINEAR/Vulkan stays RGB). + pub fn import_nv12( + &mut self, + plane: &DmabufPlane, + width: u32, + height: u32, + fourcc: u32, + modifier: Option, + ) -> Result { + self.import_inner(plane, width, height, fourcc, modifier, true) + } + + fn import_inner( + &mut self, + plane: &DmabufPlane, + width: u32, + height: u32, + fourcc: u32, + modifier: Option, + nv12: bool, ) -> Result { let mut attrs: Vec = vec![ egl::WIDTH as egl::Attrib, @@ -484,10 +711,14 @@ impl EglImporter { ) .context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?; - // EGLImage → (sampled by a shader) → GL_RGBA8 texture → register *that* with CUDA → map - // → array → copy out. Registering the EGLImage texture directly fails (its layout isn't a - // CUDA-registrable format); the RGBA8 render target is. - let result = self.blit_and_copy(image.as_ptr(), width, height); + // EGLImage → (sampled by a shader) → GL_RGBA8 texture (or NV12 R8+RG8 pair) → register + // *that* with CUDA → map → array → copy out. Registering the EGLImage texture directly + // fails (its layout isn't a CUDA-registrable format); the render targets are. 
+ let result = if nv12 { + self.blit_and_copy_nv12(image.as_ptr(), width, height) + } else { + self.blit_and_copy(image.as_ptr(), width, height) + }; let _ = self.egl.destroy_image(self.display, image); result } @@ -514,6 +745,80 @@ impl EglImporter { blit.registered.copy_mapped_to(&dst)?; Ok(dst) } + + /// Convert the dmabuf `image` to NV12 (Y in an R8 texture, UV in an RG8 texture) and copy both + /// planes into a pooled NV12 [`DeviceBuffer`]. (Re)creates the per-size convert machinery as + /// needed. The `PUNKTFUNK_NV12` analogue of [`blit_and_copy`]. + fn blit_and_copy_nv12( + &mut self, + image: *mut c_void, + width: u32, + height: u32, + ) -> Result { + cuda::make_current()?; + if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) { + self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? }); + } + let egl_image_target = self.egl_image_target; + let blit = self.nv12_blit.as_mut().unwrap(); + // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage. + unsafe { blit.run(egl_image_target, image)? }; + let dst = blit.pool.get()?; + cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?; + Ok(dst) + } + + /// Self-test entry: upload a packed `width`×`height` RGBA8 host pattern into a GL texture, run + /// the NV12 convert passes on the GPU, and copy both planes into a pooled NV12 [`DeviceBuffer`]. + /// Exercises the exact shaders + CUDA copy the live path uses, but sourced from an uploaded + /// texture instead of a dmabuf EGLImage (no compositor needed). `rgba` is tightly packed, 4 B/px. 
+ pub fn convert_rgba_for_test( + &mut self, + rgba: &[u8], + width: u32, + height: u32, + ) -> Result { + anyhow::ensure!( + rgba.len() == width as usize * height as usize * 4, + "test RGBA buffer {} bytes != {}x{}x4", + rgba.len(), + width, + height + ); + cuda::make_current()?; + if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) { + self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? }); + } + let blit = self.nv12_blit.as_mut().unwrap(); + unsafe { + // Upload the host RGBA into `src_tex` (an immutable GL_RGBA8 backing must exist first; + // the live path never allocates it — it retargets `src_tex` via EGLImage instead). + glBindTexture(GL_TEXTURE_2D, blit.src_tex); + if !blit.test_src_storage { + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int); + blit.test_src_storage = true; + } + let _ = glGetError(); + glTexSubImage2D( + GL_TEXTURE_2D, + 0, + 0, + 0, + width as c_int, + height as c_int, + GL_RGBA, + GL_UNSIGNED_BYTE, + rgba.as_ptr() as *const c_void, + ); + let e = glGetError(); + glBindTexture(GL_TEXTURE_2D, 0); + ensure!(e == 0, "glTexSubImage2D(test source) failed ({e:#x})"); + blit.run_passes()?; + } + let dst = blit.pool.get()?; + cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?; + Ok(dst) + } } impl Drop for EglImporter { diff --git a/crates/punktfunk-host/src/zerocopy/mod.rs b/crates/punktfunk-host/src/zerocopy/mod.rs index 40311ed..3cf4e98 100644 --- a/crates/punktfunk-host/src/zerocopy/mod.rs +++ b/crates/punktfunk-host/src/zerocopy/mod.rs @@ -14,13 +14,26 @@ pub mod vulkan; pub use cuda::DeviceBuffer; pub use egl::{DmabufPlane, EglImporter}; -/// Whether the zero-copy path is opted in (`PUNKTFUNK_ZEROCOPY` truthy). -pub fn enabled() -> bool { - std::env::var("PUNKTFUNK_ZEROCOPY") +/// Whether a `PUNKTFUNK_*` flag is truthy (`1`/`true`/`yes`/`on`). 
+fn flag(name: &str) -> bool { + std::env::var(name) .map(|v| matches!(v.trim(), "1" | "true" | "yes" | "on")) .unwrap_or(false) } +/// Whether the zero-copy path is opted in (`PUNKTFUNK_ZEROCOPY` truthy). +pub fn enabled() -> bool { + flag("PUNKTFUNK_ZEROCOPY") +} + +/// Whether the NV12 convert path is opted in (`PUNKTFUNK_NV12` truthy). When set AND the zero-copy +/// tiled-GL path is active, the capturer produces native NV12 (BT.709 limited range) on the GPU and +/// feeds NVENC YUV directly — deleting NVENC's internal RGB→YUV CSC (Tier 2A). Off by default: the +/// existing RGB/BGRx path is then 100% unchanged. +pub fn nv12_enabled() -> bool { + flag("PUNKTFUNK_NV12") +} + /// DRM FourCC for a packed 32-bit format name (little-endian, e.g. `b"XR24"`). const fn fourcc(c: &[u8; 4]) -> u32 { (c[0] as u32) | ((c[1] as u32) << 8) | ((c[2] as u32) << 16) | ((c[3] as u32) << 24) @@ -49,3 +62,152 @@ pub fn probe() -> anyhow::Result<()> { tracing::info!(cuda_ctx = ?ctx, "zero-copy probe OK — EGL display + CUDA context initialized"); Ok(()) } + +/// Reference BT.709 LIMITED-range conversion of one full-range RGB pixel (`u8`) to (Y, U, V) in +/// `f64`, matching the GPU shaders in [`egl`]. Y in [16,235], U/V in [16,240]. +fn bt709_limited(r: u8, g: u8, b: u8) -> (f64, f64, f64) { + let (r, g, b) = (r as f64 / 255.0, g as f64 / 255.0, b as f64 / 255.0); + let y = 16.0 + 219.0 * (0.2126 * r + 0.7152 * g + 0.0722 * b); + let u = 128.0 + 224.0 * (-0.1146 * r - 0.3854 * g + 0.5000 * b); + let v = 128.0 + 224.0 * (0.5000 * r - 0.4542 * g - 0.0458 * b); + (y, u, v) +} + +/// NV12 colour self-test (the `nv12-selftest` subcommand): stand up the EGL/GL + CUDA stack, upload +/// a known synthetic RGBA pattern, run the real NV12 convert shaders on the GPU, read the Y and UV +/// planes back, and compare against a Rust BT.709 limited-range reference. 
Validates colour +/// correctness on the GPU **without a display** (the project's green-screen bugs came from exactly +/// this kind of plane/layout error). PASS if max abs error Y ≤ 2, U/V ≤ 3. +pub fn nv12_selftest() -> anyhow::Result<()> { + use anyhow::bail; + + // 64x64, even dims. A 4x4 grid of 16x16 flat-colour blocks (so each 2x2 chroma footprint is + // uniform → exact chroma comparison) covering the primaries + gray/black/white, then the rest + // is a diagonal gradient (every pixel changes — a Y-channel stress that also exercises the + // chroma averaging; the gradient blocks are compared on Y only). + const W: u32 = 64; + const H: u32 = 64; + const BLK: u32 = 16; + // (name, r, g, b) for the labelled blocks in row-major grid order; the rest fall to gradient. + let named: [(&str, u8, u8, u8); 8] = [ + ("red", 255, 0, 0), + ("green", 0, 255, 0), + ("blue", 0, 0, 255), + ("white", 255, 255, 255), + ("black", 0, 0, 0), + ("gray128", 128, 128, 128), + ("yellow", 255, 255, 0), + ("cyan", 0, 255, 255), + ]; + + // Build the RGBA pattern + a parallel record of each pixel's (r,g,b) and whether it sits in a + // flat block (chroma-comparable) or the gradient (Y-only). + let mut rgba = vec![0u8; (W * H * 4) as usize]; + let mut flat = vec![false; (W * H) as usize]; + let grid_cols = W / BLK; // 4 + let pixel_rgb = |x: u32, y: u32| -> (u8, u8, u8, bool) { + let bx = x / BLK; + let by = y / BLK; + let idx = (by * grid_cols + bx) as usize; + if idx < named.len() { + let (_, r, g, b) = named[idx]; + (r, g, b, true) + } else { + // Diagonal gradient — distinct per pixel. + let r = ((x * 4) & 0xff) as u8; + let g = ((y * 4) & 0xff) as u8; + let b = (((x + y) * 2) & 0xff) as u8; + (r, g, b, false) + } + }; + for y in 0..H { + for x in 0..W { + let (r, g, b, is_flat) = pixel_rgb(x, y); + let i = ((y * W + x) * 4) as usize; + rgba[i] = r; + rgba[i + 1] = g; + rgba[i + 2] = b; + rgba[i + 3] = 255; + flat[(y * W + x) as usize] = is_flat; + } + } + + // GPU convert. 
+ let mut importer = EglImporter::new()?; + let nv12 = importer.convert_rgba_for_test(&rgba, W, H)?; + let (uv_ptr, uv_pitch) = nv12 + .uv + .ok_or_else(|| anyhow::anyhow!("self-test buffer is not NV12"))?; + // Read both planes back to host (tightly packed). + let y_host = cuda::read_plane_to_host(nv12.ptr, nv12.pitch, W as usize, H as usize)?; + let uv_host = cuda::read_plane_to_host(uv_ptr, uv_pitch, (W as usize / 2) * 2, H as usize / 2)?; + + // Compare Y over every pixel. + let mut max_y_err = 0.0f64; + for y in 0..H { + for x in 0..W { + let (r, g, b, _) = pixel_rgb(x, y); + let (ref_y, _, _) = bt709_limited(r, g, b); + let got = y_host[(y * W + x) as usize] as f64; + max_y_err = max_y_err.max((got - ref_y).abs()); + } + } + + // Compare U/V over flat blocks only (each 2x2 footprint is a single colour → exact reference). + // Chroma is W/2 × H/2 samples, interleaved [U,V] per sample. + let cw = W / 2; + let ch = H / 2; + let mut max_u_err = 0.0f64; + let mut max_v_err = 0.0f64; + for cy in 0..ch { + for cx in 0..cw { + // The 2x2 source footprint of this chroma sample. + let (sx, sy) = (cx * 2, cy * 2); + // Only compare where all 4 source pixels are flat (uniform colour). + let all_flat = + (0..2).all(|dy| (0..2).all(|dx| flat[((sy + dy) * W + (sx + dx)) as usize])); + if !all_flat { + continue; + } + let (r, g, b, _) = pixel_rgb(sx, sy); + let (_, ref_u, ref_v) = bt709_limited(r, g, b); + let base = ((cy * cw + cx) * 2) as usize; + let got_u = uv_host[base] as f64; + let got_v = uv_host[base + 1] as f64; + max_u_err = max_u_err.max((got_u - ref_u).abs()); + max_v_err = max_v_err.max((got_v - ref_v).abs()); + } + } + + // Per-primary actual-vs-expected (block centre for chroma). 
+ println!("NV12 self-test ({W}x{H}, BT.709 limited range)"); + println!( + " {:<8} {:>14} {:>14} {:>14}", + "color", "Y exp/got", "U exp/got", "V exp/got" + ); + for (idx, (name, r, g, b)) in named.iter().enumerate() { + let bx = (idx as u32 % grid_cols) * BLK + BLK / 2; + let by = (idx as u32 / grid_cols) * BLK + BLK / 2; + let (ey, eu, ev) = bt709_limited(*r, *g, *b); + let gy = y_host[(by * W + bx) as usize] as f64; + let (ccx, ccy) = (bx / 2, by / 2); + let cbase = ((ccy * cw + ccx) * 2) as usize; + let gu = uv_host[cbase] as f64; + let gv = uv_host[cbase + 1] as f64; + println!( + " {:<8} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0}", + name, ey, gy, eu, gu, ev, gv + ); + } + println!( + " max abs error: Y={max_y_err:.2} (≤2) U={max_u_err:.2} (≤3) V={max_v_err:.2} (≤3)" + ); + + if max_y_err <= 2.0 && max_u_err <= 3.0 && max_v_err <= 3.0 { + println!("PASS"); + Ok(()) + } else { + println!("FAIL"); + bail!("NV12 self-test FAILED (Y={max_y_err:.2} U={max_u_err:.2} V={max_v_err:.2})"); + } +}