perf(host/linux): NV12 GPU convert — feed NVENC native YUV, off the contended SM (Tier 2A)
apple / swift (push) Successful in 54s
windows-host / package (push) Failing after 2m18s
ci / web (push) Successful in 32s
ci / rust (push) Failing after 5m2s
decky / build-publish (push) Successful in 11s
android / android (push) Failing after 49s
ci / docs-site (push) Successful in 35s
ci / bench (push) Failing after 3m15s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 3m49s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 15s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Failing after 40s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Failing after 28s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 3s
docker / deploy-docs (push) Has been skipped
deb / build-publish (push) Successful in 5m54s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 11s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 1m36s
apple / swift (push) Successful in 54s
windows-host / package (push) Failing after 2m18s
ci / web (push) Successful in 32s
ci / rust (push) Failing after 5m2s
decky / build-publish (push) Successful in 11s
android / android (push) Failing after 49s
ci / docs-site (push) Successful in 35s
ci / bench (push) Failing after 3m15s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 3m49s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 15s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Failing after 40s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Failing after 28s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 3s
docker / deploy-docs (push) Has been skipped
deb / build-publish (push) Successful in 5m54s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 11s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 1m36s
The Linux zero-copy tiled-GL path can now produce NV12 (BT.709 limited range) on the GPU and feed NVENC native YUV, deleting NVENC's internal RGB->YUV CSC — which runs on the SM/3D-compute engine a saturating game pins at 100% (the game-vs-encode contention headache). Windows already does this via the D3D11 video processor; this closes the Linux gap. See docs/host-latency-plan.md §2A.

Gated behind PUNKTFUNK_NV12 (default OFF → the RGB/BGRx path is byte-for-byte unchanged; zero regression). Only the tiled EGL/GL path converts; the LINEAR/Vulkan-bridge (gamescope) path stays RGB.

- zerocopy/egl.rs: Nv12Blit — BT.709 limited Y pass (R8, full-res) + UV pass (RG8, half-res, GL_LINEAR 2x2 average); both CUDA-registered; import_nv12.
- zerocopy/cuda.rs: two-plane DeviceBuffer (Y W*H@1B + interleaved UV (W/2)*2 x H/2), paired Y+UV pool, copy_mapped_nv12 + copy_nv12_to_device, on the per-thread priority stream (dmabuf-recycle sync preserved).
- encode/linux.rs: nvenc_input(Nv12)->NV12; submit_cuda copies two planes into NVENC's surface; VUI signalled BT.709 limited (colorspace/range/primaries/trc).
- capture/linux.rs: gate (PUNKTFUNK_NV12 && tiled), report format Nv12.
- main.rs + zerocopy/mod.rs: `nv12-selftest` subcommand.

Validated on RTX 5070 Ti two ways: (1) nv12-selftest — synthetic RGBA->NV12 round-trip vs a BT.709 reference, max abs error Y=0.56/U=0.33/V=0.26 LSB; (2) live capture->NV12->NVENC->decode of animated red content matches the RGB path's colour (avg RGB 230,18,18 vs 231,18,20). build/clippy/fmt green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -466,6 +466,9 @@ mod pipewire {
|
|||||||
negotiated: Arc<AtomicBool>,
|
negotiated: Arc<AtomicBool>,
|
||||||
/// Present when zero-copy is enabled: imports a dmabuf → CUDA device buffer.
|
/// Present when zero-copy is enabled: imports a dmabuf → CUDA device buffer.
|
||||||
importer: Option<crate::zerocopy::EglImporter>,
|
importer: Option<crate::zerocopy::EglImporter>,
|
||||||
|
/// `PUNKTFUNK_NV12`: on the tiled EGL/GL zero-copy path, convert to NV12 on the GPU and feed
|
||||||
|
/// NVENC native YUV (Tier 2A). Off ⇒ the BGRx path is unchanged.
|
||||||
|
nv12: bool,
|
||||||
/// Rate-limit counter for the latest-frame-only diagnostic log (see `.process`).
|
/// Rate-limit counter for the latest-frame-only diagnostic log (see `.process`).
|
||||||
dbg_log_n: u64,
|
dbg_log_n: u64,
|
||||||
}
|
}
|
||||||
@@ -780,8 +783,17 @@ mod pipewire {
|
|||||||
// sample LINEAR).
|
// sample LINEAR).
|
||||||
let modifier = (ud.modifier != 0).then_some(ud.modifier);
|
let modifier = (ud.modifier != 0).then_some(ud.modifier);
|
||||||
if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) {
|
if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) {
|
||||||
let imported = if modifier.is_some() {
|
// NV12 convert (Tier 2A) only on the tiled EGL/GL path (`modifier.is_some()`):
|
||||||
importer.import(&plane, w as u32, h as u32, fourcc, modifier)
|
// produce native YUV so NVENC skips its internal RGB→YUV CSC. The LINEAR/Vulkan
|
||||||
|
// (gamescope) path stays RGB — its convert isn't wired here. When NV12 is
|
||||||
|
// produced the frame's format is reported as `Nv12` so the encoder opens native.
|
||||||
|
let nv12 = ud.nv12 && modifier.is_some();
|
||||||
|
let imported = if let Some(m) = modifier {
|
||||||
|
if nv12 {
|
||||||
|
importer.import_nv12(&plane, w as u32, h as u32, fourcc, Some(m))
|
||||||
|
} else {
|
||||||
|
importer.import(&plane, w as u32, h as u32, fourcc, Some(m))
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
importer.import_linear(&plane, w as u32, h as u32)
|
importer.import_linear(&plane, w as u32, h as u32)
|
||||||
};
|
};
|
||||||
@@ -794,6 +806,7 @@ mod pipewire {
|
|||||||
w,
|
w,
|
||||||
h,
|
h,
|
||||||
modifier = ud.modifier,
|
modifier = ud.modifier,
|
||||||
|
nv12,
|
||||||
"zero-copy: dmabuf imported to CUDA (no CPU copy)"
|
"zero-copy: dmabuf imported to CUDA (no CPU copy)"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -805,7 +818,7 @@ mod pipewire {
|
|||||||
width: w as u32,
|
width: w as u32,
|
||||||
height: h as u32,
|
height: h as u32,
|
||||||
pts_ns,
|
pts_ns,
|
||||||
format: fmt,
|
format: if nv12 { PixelFormat::Nv12 } else { fmt },
|
||||||
payload: FramePayload::Cuda(devbuf),
|
payload: FramePayload::Cuda(devbuf),
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
@@ -978,6 +991,12 @@ mod pipewire {
|
|||||||
"zero-copy: advertising EGL-importable dmabuf modifiers"
|
"zero-copy: advertising EGL-importable dmabuf modifiers"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if want_dmabuf && crate::zerocopy::nv12_enabled() {
|
||||||
|
tracing::info!(
|
||||||
|
"PUNKTFUNK_NV12: tiled dmabufs convert to NV12 (BT.709 limited) on the GPU — NVENC \
|
||||||
|
fed native YUV (no internal RGB→YUV CSC)"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let data = UserData {
|
let data = UserData {
|
||||||
info: VideoInfoRaw::default(),
|
info: VideoInfoRaw::default(),
|
||||||
@@ -987,6 +1006,7 @@ mod pipewire {
|
|||||||
active,
|
active,
|
||||||
negotiated,
|
negotiated,
|
||||||
importer,
|
importer,
|
||||||
|
nv12: crate::zerocopy::nv12_enabled(),
|
||||||
dbg_log_n: 0,
|
dbg_log_n: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -103,10 +103,14 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) {
|
|||||||
PixelFormat::Rgba => (Pixel::RGBA, false),
|
PixelFormat::Rgba => (Pixel::RGBA, false),
|
||||||
PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0
|
PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0
|
||||||
PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0
|
PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0
|
||||||
// Rgb10a2 (HDR) and NV12/P010 (the Windows video-processor YUV outputs) are produced only by
|
// NV12 is native YUV: NVENC encodes it with NO internal RGB→YUV CSC (the Tier 2A win). On
|
||||||
// the Windows capture/encode paths; the Linux capturer never emits them. Map to BGRA so the
|
// Linux it's produced by the GPU convert on the zero-copy tiled path (`PUNKTFUNK_NV12`); on
|
||||||
// match is exhaustive — unreachable here.
|
// Windows by the D3D11 video processor.
|
||||||
PixelFormat::Rgb10a2 | PixelFormat::Nv12 | PixelFormat::P010 => (Pixel::BGRA, false),
|
PixelFormat::Nv12 => (Pixel::NV12, false),
|
||||||
|
// Rgb10a2 (HDR) and P010 (the Windows 10-bit video-processor output) are produced only by
|
||||||
|
// the Windows paths; the Linux capturer never emits them. Map to BGRA so the match is
|
||||||
|
// exhaustive — unreachable here.
|
||||||
|
PixelFormat::Rgb10a2 | PixelFormat::P010 => (Pixel::BGRA, false),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -204,6 +208,21 @@ impl NvencEncoder {
|
|||||||
(*video.as_mut_ptr()).gop_size = -1;
|
(*video.as_mut_ptr()).gop_size = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NV12 path: we did the RGB→YUV conversion ourselves as BT.709 *limited* range, so signal
|
||||||
|
// that in the bitstream VUI (colorspace/range/primaries/transfer) — otherwise the client
|
||||||
|
// decoder assumes a default and the picture comes out washed-out / wrong-contrast. The
|
||||||
|
// RGB-input paths leave these unset (NVENC's internal CSC writes its own VUI). Matches the
|
||||||
|
// Windows NV12 path's BT.709 limited-range signalling.
|
||||||
|
if matches!(format, PixelFormat::Nv12) {
|
||||||
|
unsafe {
|
||||||
|
let raw = video.as_mut_ptr();
|
||||||
|
(*raw).colorspace = ffi::AVColorSpace::AVCOL_SPC_BT709;
|
||||||
|
(*raw).color_range = ffi::AVColorRange::AVCOL_RANGE_MPEG; // limited/studio
|
||||||
|
(*raw).color_primaries = ffi::AVColorPrimaries::AVCOL_PRI_BT709;
|
||||||
|
(*raw).color_trc = ffi::AVColorTransferCharacteristic::AVCOL_TRC_BT709;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA
|
// For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA
|
||||||
// hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context
|
// hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context
|
||||||
// *before* open (NVENC derives the device from `hw_frames_ctx`).
|
// *before* open (NVENC derives the device from `hw_frames_ctx`).
|
||||||
@@ -419,9 +438,20 @@ impl NvencEncoder {
|
|||||||
ffi::av_frame_free(&mut f);
|
ffi::av_frame_free(&mut f);
|
||||||
bail!("av_hwframe_get_buffer(CUDA) failed ({r})");
|
bail!("av_hwframe_get_buffer(CUDA) failed ({r})");
|
||||||
}
|
}
|
||||||
|
// NV12 surfaces are two-plane (Y in data[0], interleaved UV in data[1]); the RGB
|
||||||
|
// surfaces are single-plane. Copy the matching layout into NVENC's pooled surface.
|
||||||
|
let copy_res = if buf.is_nv12() {
|
||||||
|
let y_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr;
|
||||||
|
let y_pitch = (*f).linesize[0] as usize;
|
||||||
|
let uv_ptr = (*f).data[1] as crate::zerocopy::cuda::CUdeviceptr;
|
||||||
|
let uv_pitch = (*f).linesize[1] as usize;
|
||||||
|
crate::zerocopy::cuda::copy_nv12_to_device(buf, y_ptr, y_pitch, uv_ptr, uv_pitch)
|
||||||
|
} else {
|
||||||
let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr;
|
let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr;
|
||||||
let dst_pitch = (*f).linesize[0] as usize;
|
let dst_pitch = (*f).linesize[0] as usize;
|
||||||
if let Err(e) = crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch) {
|
crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch)
|
||||||
|
};
|
||||||
|
if let Err(e) = copy_res {
|
||||||
ffi::av_frame_free(&mut f);
|
ffi::av_frame_free(&mut f);
|
||||||
return Err(e).context("copy imported buffer into NVENC surface");
|
return Err(e).context("copy imported buffer into NVENC surface");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -125,6 +125,11 @@ fn real_main() -> Result<()> {
|
|||||||
// Zero-copy FFI/GPU probe: init the EGL importer + CUDA context (no capture needed).
|
// Zero-copy FFI/GPU probe: init the EGL importer + CUDA context (no capture needed).
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
Some("zerocopy-probe") => zerocopy::probe(),
|
Some("zerocopy-probe") => zerocopy::probe(),
|
||||||
|
// NV12 colour self-test (no display/capture needed): convert a known RGBA pattern to NV12
|
||||||
|
// on the GPU and compare against a BT.709 limited-range reference. Validates the Tier 2A
|
||||||
|
// `PUNKTFUNK_NV12` convert is colour-correct. Prints PASS/FAIL + max Y/U/V error.
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
Some("nv12-selftest") => zerocopy::nv12_selftest(),
|
||||||
// Compositor readiness probe: exit 0 iff the (detected or PUNKTFUNK_COMPOSITOR-forced)
|
// Compositor readiness probe: exit 0 iff the (detected or PUNKTFUNK_COMPOSITOR-forced)
|
||||||
// compositor is up and able to create a virtual output *now*. A session-bringup
|
// compositor is up and able to create a virtual output *now*. A session-bringup
|
||||||
// script polls this to gate on real readiness instead of a blind `sleep`.
|
// script polls this to gate on real readiness instead of a blind `sleep`.
|
||||||
|
|||||||
@@ -159,6 +159,31 @@ fn ck(r: CUresult, what: &str) -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Copy a pitched device plane `(src_ptr, src_pitch)` down to a tightly-packed host buffer of
|
||||||
|
/// `width_bytes`×`height` (no row padding). Synchronous on the priority stream. Used by the NV12
|
||||||
|
/// self-test to read planes back for the colour comparison; not on the hot path.
|
||||||
|
pub fn read_plane_to_host(
|
||||||
|
src_ptr: CUdeviceptr,
|
||||||
|
src_pitch: usize,
|
||||||
|
width_bytes: usize,
|
||||||
|
height: usize,
|
||||||
|
) -> Result<Vec<u8>> {
|
||||||
|
let mut host = vec![0u8; width_bytes * height];
|
||||||
|
let copy = CUDA_MEMCPY2D {
|
||||||
|
srcMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
srcDevice: src_ptr,
|
||||||
|
srcPitch: src_pitch,
|
||||||
|
dstMemoryType: 1, // CU_MEMORYTYPE_HOST
|
||||||
|
dstHost: host.as_mut_ptr() as *mut c_void,
|
||||||
|
dstPitch: width_bytes,
|
||||||
|
WidthInBytes: width_bytes,
|
||||||
|
Height: height,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->host)")? };
|
||||||
|
Ok(host)
|
||||||
|
}
|
||||||
|
|
||||||
/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
|
/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
|
||||||
/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
|
/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
|
||||||
#[derive(Clone, Copy)]
|
#[derive(Clone, Copy)]
|
||||||
@@ -265,11 +290,52 @@ fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
|
|||||||
Ok((ptr, pitch))
|
Ok((ptr, pitch))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Allocate the two pitched planes of an NV12 surface (8-bit BT.709 4:2:0): a `width`-byte Y plane
|
||||||
|
/// (W×H, 1 byte/px) and an interleaved chroma plane (W/2 × H/2 samples, 2 bytes/sample → W bytes
|
||||||
|
/// wide). Each plane is allocated separately and carries its own driver-chosen pitch; the copies are pitched per plane, so the encoder's two-plane
|
||||||
|
/// surface and ours line up. Returns `((y_ptr, y_pitch), (uv_ptr, uv_pitch))`.
|
||||||
|
fn alloc_pitched_nv12(
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
) -> Result<((CUdeviceptr, usize), (CUdeviceptr, usize))> {
|
||||||
|
let mut y_ptr: CUdeviceptr = 0;
|
||||||
|
let mut y_pitch: usize = 0;
|
||||||
|
let mut uv_ptr: CUdeviceptr = 0;
|
||||||
|
let mut uv_pitch: usize = 0;
|
||||||
|
unsafe {
|
||||||
|
ck(
|
||||||
|
cuMemAllocPitch_v2(
|
||||||
|
&mut y_ptr,
|
||||||
|
&mut y_pitch,
|
||||||
|
width as usize,
|
||||||
|
height as usize,
|
||||||
|
16,
|
||||||
|
),
|
||||||
|
"cuMemAllocPitch_v2(Y)",
|
||||||
|
)?;
|
||||||
|
// Chroma is W/2 samples wide at 2 bytes each = W bytes; H/2 rows.
|
||||||
|
ck(
|
||||||
|
cuMemAllocPitch_v2(
|
||||||
|
&mut uv_ptr,
|
||||||
|
&mut uv_pitch,
|
||||||
|
(width as usize / 2) * 2,
|
||||||
|
(height as usize / 2).max(1),
|
||||||
|
16,
|
||||||
|
),
|
||||||
|
"cuMemAllocPitch_v2(UV)",
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
Ok(((y_ptr, y_pitch), (uv_ptr, uv_pitch)))
|
||||||
|
}
|
||||||
|
|
||||||
/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the
|
/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the
|
||||||
/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and
|
/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and
|
||||||
/// returns its allocation here. Bulk-freed when the last reference drops.
|
/// returns its allocation here. Bulk-freed when the last reference drops. For NV12 each free entry
|
||||||
|
/// is the Y plane *and* its paired UV plane (allocated/recycled/freed together).
|
||||||
struct PoolInner {
|
struct PoolInner {
|
||||||
free: Vec<CUdeviceptr>,
|
free: Vec<CUdeviceptr>,
|
||||||
|
/// NV12 only: the UV plane paired with each Y plane in `free` (same index, same length).
|
||||||
|
free_uv: Vec<CUdeviceptr>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for PoolInner {
|
impl Drop for PoolInner {
|
||||||
@@ -281,6 +347,9 @@ impl Drop for PoolInner {
|
|||||||
for &p in &self.free {
|
for &p in &self.free {
|
||||||
let _ = cuMemFree_v2(p);
|
let _ = cuMemFree_v2(p);
|
||||||
}
|
}
|
||||||
|
for &p in &self.free_uv {
|
||||||
|
let _ = cuMemFree_v2(p);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -294,6 +363,8 @@ pub struct BufferPool {
|
|||||||
width: u32,
|
width: u32,
|
||||||
height: u32,
|
height: u32,
|
||||||
pitch: usize,
|
pitch: usize,
|
||||||
|
/// NV12 pools carry a second (chroma) pitch; `Some` ⇒ buffers from this pool have a UV plane.
|
||||||
|
uv_pitch: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BufferPool {
|
impl BufferPool {
|
||||||
@@ -302,10 +373,30 @@ impl BufferPool {
|
|||||||
pub fn new(width: u32, height: u32) -> Result<BufferPool> {
|
pub fn new(width: u32, height: u32) -> Result<BufferPool> {
|
||||||
let (ptr, pitch) = alloc_pitched(width, height)?;
|
let (ptr, pitch) = alloc_pitched(width, height)?;
|
||||||
Ok(BufferPool {
|
Ok(BufferPool {
|
||||||
inner: Arc::new(Mutex::new(PoolInner { free: vec![ptr] })),
|
inner: Arc::new(Mutex::new(PoolInner {
|
||||||
|
free: vec![ptr],
|
||||||
|
free_uv: Vec::new(),
|
||||||
|
})),
|
||||||
width,
|
width,
|
||||||
height,
|
height,
|
||||||
pitch,
|
pitch,
|
||||||
|
uv_pitch: None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a pool of NV12 two-plane surfaces (Y + interleaved UV) for `width`x`height`. Allocates
|
||||||
|
/// one pair up front to learn the driver's per-plane pitches (constant for a given width).
|
||||||
|
pub fn new_nv12(width: u32, height: u32) -> Result<BufferPool> {
|
||||||
|
let ((y_ptr, y_pitch), (uv_ptr, uv_pitch)) = alloc_pitched_nv12(width, height)?;
|
||||||
|
Ok(BufferPool {
|
||||||
|
inner: Arc::new(Mutex::new(PoolInner {
|
||||||
|
free: vec![y_ptr],
|
||||||
|
free_uv: vec![uv_ptr],
|
||||||
|
})),
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
pitch: y_pitch,
|
||||||
|
uv_pitch: Some(uv_pitch),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -318,8 +409,31 @@ impl BufferPool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
|
/// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
|
||||||
/// pool when dropped (after the consumer has synchronized, so the GPU is done with it).
|
/// pool when dropped (after the consumer has synchronized, so the GPU is done with it). For an
|
||||||
|
/// NV12 pool the returned buffer carries both the Y and the paired UV plane.
|
||||||
pub fn get(&self) -> Result<DeviceBuffer> {
|
pub fn get(&self) -> Result<DeviceBuffer> {
|
||||||
|
if let Some(uv_pitch) = self.uv_pitch {
|
||||||
|
let reuse = {
|
||||||
|
let mut g = self.inner.lock().unwrap();
|
||||||
|
g.free.pop().map(|y| (y, g.free_uv.pop()))
|
||||||
|
};
|
||||||
|
let (ptr, uv_ptr) = match reuse {
|
||||||
|
// Y and UV are pushed/popped together, so a popped Y always has its UV.
|
||||||
|
Some((y, Some(uv))) => (y, uv),
|
||||||
|
_ => {
|
||||||
|
let ((y, _), (uv, _)) = alloc_pitched_nv12(self.width, self.height)?;
|
||||||
|
(y, uv)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
return Ok(DeviceBuffer {
|
||||||
|
ptr,
|
||||||
|
pitch: self.pitch,
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
uv: Some((uv_ptr, uv_pitch)),
|
||||||
|
pool: Some(self.inner.clone()),
|
||||||
|
});
|
||||||
|
}
|
||||||
let reuse = self.inner.lock().unwrap().free.pop();
|
let reuse = self.inner.lock().unwrap().free.pop();
|
||||||
let ptr = match reuse {
|
let ptr = match reuse {
|
||||||
Some(p) => p,
|
Some(p) => p,
|
||||||
@@ -330,6 +444,7 @@ impl BufferPool {
|
|||||||
pitch: self.pitch,
|
pitch: self.pitch,
|
||||||
width: self.width,
|
width: self.width,
|
||||||
height: self.height,
|
height: self.height,
|
||||||
|
uv: None,
|
||||||
pool: Some(self.inner.clone()),
|
pool: Some(self.inner.clone()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -343,6 +458,9 @@ pub struct DeviceBuffer {
|
|||||||
pub pitch: usize,
|
pub pitch: usize,
|
||||||
pub width: u32,
|
pub width: u32,
|
||||||
pub height: u32,
|
pub height: u32,
|
||||||
|
/// NV12 only: the interleaved chroma plane `(ptr, pitch)` paired with the Y plane in [`ptr`].
|
||||||
|
/// `None` for the default 4-byte RGB/BGRx path. When `Some`, [`ptr`] is the Y plane (1 byte/px).
|
||||||
|
pub uv: Option<(CUdeviceptr, usize)>,
|
||||||
pool: Option<Arc<Mutex<PoolInner>>>,
|
pool: Option<Arc<Mutex<PoolInner>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -355,9 +473,29 @@ impl DeviceBuffer {
|
|||||||
pitch,
|
pitch,
|
||||||
width,
|
width,
|
||||||
height,
|
height,
|
||||||
|
uv: None,
|
||||||
pool: None,
|
pool: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Allocate a standalone (un-pooled) NV12 two-plane buffer. Prefer [`BufferPool::new_nv12`] on
|
||||||
|
/// the hot path; used by the self-test.
|
||||||
|
pub fn alloc_nv12(width: u32, height: u32) -> Result<DeviceBuffer> {
|
||||||
|
let ((y_ptr, y_pitch), (uv_ptr, uv_pitch)) = alloc_pitched_nv12(width, height)?;
|
||||||
|
Ok(DeviceBuffer {
|
||||||
|
ptr: y_ptr,
|
||||||
|
pitch: y_pitch,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
uv: Some((uv_ptr, uv_pitch)),
|
||||||
|
pool: None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// True if this buffer carries an NV12 chroma plane.
|
||||||
|
pub fn is_nv12(&self) -> bool {
|
||||||
|
self.uv.is_some()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for DeviceBuffer {
|
impl Drop for DeviceBuffer {
|
||||||
@@ -366,8 +504,13 @@ impl Drop for DeviceBuffer {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if let Some(pool) = &self.pool {
|
if let Some(pool) = &self.pool {
|
||||||
// Recycle (the consumer synchronized before dropping, so the GPU is done with it).
|
// Recycle (the consumer synchronized before dropping, so the GPU is done with it). Y and
|
||||||
pool.lock().unwrap().free.push(self.ptr);
|
// its paired UV go back together so `get` can pop them as a pair.
|
||||||
|
let mut g = pool.lock().unwrap();
|
||||||
|
g.free.push(self.ptr);
|
||||||
|
if let Some((uv_ptr, _)) = self.uv {
|
||||||
|
g.free_uv.push(uv_ptr);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// The buffer may be freed on the encode thread; cuMemFree needs a current context.
|
// The buffer may be freed on the encode thread; cuMemFree needs a current context.
|
||||||
unsafe {
|
unsafe {
|
||||||
@@ -375,6 +518,9 @@ impl Drop for DeviceBuffer {
|
|||||||
let _ = cuCtxSetCurrent(c.0);
|
let _ = cuCtxSetCurrent(c.0);
|
||||||
}
|
}
|
||||||
let _ = cuMemFree_v2(self.ptr);
|
let _ = cuMemFree_v2(self.ptr);
|
||||||
|
if let Some((uv_ptr, _)) = self.uv {
|
||||||
|
let _ = cuMemFree_v2(uv_ptr);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -440,6 +586,62 @@ impl RegisteredTexture {
|
|||||||
res
|
res
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Map this texture for the frame and copy its array into the device plane `(dst_ptr,
|
||||||
|
/// dst_pitch)`, taking `width_bytes`×`height` bytes (the GL internal format dictates
|
||||||
|
/// `width_bytes`: `width*1` for an `R8` luma target, `(width/2)*2` for an `RG8` chroma target).
|
||||||
|
/// Synchronized on our priority stream before unmap (so the source dmabuf is safe to recycle).
|
||||||
|
/// Always unmaps, even on copy error.
|
||||||
|
fn copy_mapped_plane(
|
||||||
|
&mut self,
|
||||||
|
dst_ptr: CUdeviceptr,
|
||||||
|
dst_pitch: usize,
|
||||||
|
width_bytes: usize,
|
||||||
|
height: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
unsafe {
|
||||||
|
ck(
|
||||||
|
cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
|
||||||
|
"cuGraphicsMapResources",
|
||||||
|
)?;
|
||||||
|
let mut array: CUarray = std::ptr::null_mut();
|
||||||
|
if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
|
||||||
|
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
||||||
|
bail!("cuGraphicsSubResourceGetMappedArray failed");
|
||||||
|
}
|
||||||
|
let copy = CUDA_MEMCPY2D {
|
||||||
|
srcMemoryType: CU_MEMORYTYPE_ARRAY,
|
||||||
|
srcArray: array,
|
||||||
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
dstDevice: dst_ptr,
|
||||||
|
dstPitch: dst_pitch,
|
||||||
|
WidthInBytes: width_bytes,
|
||||||
|
Height: height,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let res = copy_blocking(&copy, "cuMemcpy2DAsync_v2(plane)");
|
||||||
|
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
||||||
|
res
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy the two NV12 convert targets (registered `R8` luma + `RG8` chroma GL textures) into `dst`'s
|
||||||
|
/// Y and UV planes. `dst` must be an NV12 buffer (`dst.uv` set). The luma plane is `width`×`height`
|
||||||
|
/// bytes; the chroma plane is `(width/2)·2` bytes wide × `height/2` rows. Both copies sync on our
|
||||||
|
/// priority stream before returning, so the dmabuf is safe to recycle once this returns.
|
||||||
|
pub fn copy_mapped_nv12(
|
||||||
|
y_tex: &mut RegisteredTexture,
|
||||||
|
uv_tex: &mut RegisteredTexture,
|
||||||
|
dst: &DeviceBuffer,
|
||||||
|
) -> Result<()> {
|
||||||
|
let (uv_ptr, uv_pitch) = dst
|
||||||
|
.uv
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("copy_mapped_nv12 on a non-NV12 buffer"))?;
|
||||||
|
let w = dst.width as usize;
|
||||||
|
let h = dst.height as usize;
|
||||||
|
y_tex.copy_mapped_plane(dst.ptr, dst.pitch, w, h)?;
|
||||||
|
uv_tex.copy_mapped_plane(uv_ptr, uv_pitch, (w / 2) * 2, h / 2)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy a pitched device buffer into another device region (device→device), e.g. our imported
|
/// Copy a pitched device buffer into another device region (device→device), e.g. our imported
|
||||||
@@ -464,6 +666,50 @@ pub fn copy_device_to_device(
|
|||||||
unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->dev)") }
|
unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->dev)") }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Copy our imported NV12 [`DeviceBuffer`] (Y + UV planes) into NVENC's two-plane CUDA surface
|
||||||
|
/// `(y_dst, y_pitch)` / `(uv_dst, uv_pitch)` (`av_hwframe_get_buffer`'s `data[0]`/`data[1]` +
|
||||||
|
/// `linesize[0]`/`linesize[1]`). The Y plane is `width`×`height` bytes; the chroma plane is
|
||||||
|
/// `(width/2)·2` bytes × `height/2` rows. The caller must have the shared context current.
|
||||||
|
pub fn copy_nv12_to_device(
|
||||||
|
src: &DeviceBuffer,
|
||||||
|
y_dst: CUdeviceptr,
|
||||||
|
y_pitch: usize,
|
||||||
|
uv_dst: CUdeviceptr,
|
||||||
|
uv_pitch: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
let (src_uv_ptr, src_uv_pitch) = src
|
||||||
|
.uv
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("copy_nv12_to_device on a non-NV12 buffer"))?;
|
||||||
|
let w = src.width as usize;
|
||||||
|
let h = src.height as usize;
|
||||||
|
let y = CUDA_MEMCPY2D {
|
||||||
|
srcMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
srcDevice: src.ptr,
|
||||||
|
srcPitch: src.pitch,
|
||||||
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
dstDevice: y_dst,
|
||||||
|
dstPitch: y_pitch,
|
||||||
|
WidthInBytes: w, // 1 byte/px luma
|
||||||
|
Height: h,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let uv = CUDA_MEMCPY2D {
|
||||||
|
srcMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
srcDevice: src_uv_ptr,
|
||||||
|
srcPitch: src_uv_pitch,
|
||||||
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
dstDevice: uv_dst,
|
||||||
|
dstPitch: uv_pitch,
|
||||||
|
WidthInBytes: (w / 2) * 2, // 2 bytes/sample interleaved U,V
|
||||||
|
Height: h / 2,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
unsafe {
|
||||||
|
copy_blocking(&y, "cuMemcpy2DAsync_v2(nv12 Y dev->dev)")?;
|
||||||
|
copy_blocking(&uv, "cuMemcpy2DAsync_v2(nv12 UV dev->dev)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Drop for RegisteredTexture {
|
impl Drop for RegisteredTexture {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if !self.resource.is_null() {
|
if !self.resource.is_null() {
|
||||||
|
|||||||
@@ -34,6 +34,13 @@ const GL_TEXTURE_MAG_FILTER: u32 = 0x2800;
|
|||||||
const GL_LINEAR: c_int = 0x2601;
|
const GL_LINEAR: c_int = 0x2601;
|
||||||
const GL_NEAREST: c_int = 0x2600;
|
const GL_NEAREST: c_int = 0x2600;
|
||||||
const GL_RGBA8: u32 = 0x8058;
|
const GL_RGBA8: u32 = 0x8058;
|
||||||
|
// Single/dual-channel 8-bit formats for the NV12 convert targets: R8 luma (full-res),
|
||||||
|
// RG8 interleaved chroma (half-res). The `_RED`/`_RG` enums are the matching client formats.
|
||||||
|
const GL_R8: u32 = 0x8229;
|
||||||
|
const GL_RG8: u32 = 0x822B;
|
||||||
|
// Client pixel format/type for texture uploads (self-test only): RGBA bytes.
|
||||||
|
const GL_RGBA: u32 = 0x1908;
|
||||||
|
const GL_UNSIGNED_BYTE: u32 = 0x1401;
|
||||||
const GL_FRAMEBUFFER: u32 = 0x8D40;
|
const GL_FRAMEBUFFER: u32 = 0x8D40;
|
||||||
const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0;
|
const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0;
|
||||||
const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5;
|
const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5;
|
||||||
@@ -54,6 +61,7 @@ extern "C" {
|
|||||||
fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int);
|
fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int);
|
||||||
fn glGetError() -> u32;
|
fn glGetError() -> u32;
|
||||||
fn glGenFramebuffers(n: c_int, framebuffers: *mut u32);
|
fn glGenFramebuffers(n: c_int, framebuffers: *mut u32);
|
||||||
|
fn glDeleteFramebuffers(n: c_int, framebuffers: *const u32);
|
||||||
fn glBindFramebuffer(target: u32, framebuffer: u32);
|
fn glBindFramebuffer(target: u32, framebuffer: u32);
|
||||||
fn glFramebufferTexture2D(
|
fn glFramebufferTexture2D(
|
||||||
target: u32,
|
target: u32,
|
||||||
@@ -65,6 +73,7 @@ extern "C" {
|
|||||||
fn glCheckFramebufferStatus(target: u32) -> u32;
|
fn glCheckFramebufferStatus(target: u32) -> u32;
|
||||||
fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int);
|
fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int);
|
||||||
fn glGenVertexArrays(n: c_int, arrays: *mut u32);
|
fn glGenVertexArrays(n: c_int, arrays: *mut u32);
|
||||||
|
fn glDeleteVertexArrays(n: c_int, arrays: *const u32);
|
||||||
fn glBindVertexArray(array: u32);
|
fn glBindVertexArray(array: u32);
|
||||||
fn glDrawArrays(mode: u32, first: c_int, count: c_int);
|
fn glDrawArrays(mode: u32, first: c_int, count: c_int);
|
||||||
fn glActiveTexture(texture: u32);
|
fn glActiveTexture(texture: u32);
|
||||||
@@ -81,6 +90,18 @@ extern "C" {
|
|||||||
fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int);
|
fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int);
|
||||||
fn glGetUniformLocation(program: u32, name: *const i8) -> c_int;
|
fn glGetUniformLocation(program: u32, name: *const i8) -> c_int;
|
||||||
fn glUniform1i(location: c_int, v0: c_int);
|
fn glUniform1i(location: c_int, v0: c_int);
|
||||||
|
fn glDeleteProgram(program: u32);
|
||||||
|
fn glTexSubImage2D(
|
||||||
|
target: u32,
|
||||||
|
level: c_int,
|
||||||
|
xoffset: c_int,
|
||||||
|
yoffset: c_int,
|
||||||
|
width: c_int,
|
||||||
|
height: c_int,
|
||||||
|
format: u32,
|
||||||
|
type_: u32,
|
||||||
|
pixels: *const c_void,
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[link(name = "gbm")]
|
#[link(name = "gbm")]
|
||||||
@@ -97,6 +118,17 @@ type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void);
|
|||||||
const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n";
|
const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n";
|
||||||
const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n";
|
const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n";
|
||||||
|
|
||||||
|
// NV12 BT.709 LIMITED-range convert from full-range RGB in [0,1]. Two passes share `VERT_SRC` and
|
||||||
|
// the same source texture (the de-tiled dmabuf):
|
||||||
|
// Y pass → GL_R8 luma, full-res: Y = (16 + 219·(0.2126R+0.7152G+0.0722B))/255
|
||||||
|
// UV pass → GL_RG8 chroma, half-res (GL_LINEAR averages the 2×2 footprint):
|
||||||
|
// U = (128 + 224·(-0.1146R-0.3854G+0.5000B))/255 → R channel
|
||||||
|
// V = (128 + 224·( 0.5000R-0.4542G-0.0458B))/255 → G channel
|
||||||
|
// RG8's (R=U, G=V) byte order matches NV12's interleaved [U,V]. All outputs clamped to [0,1].
|
||||||
|
// Matches the Windows VideoConverter (BT.709, limited/studio range) so the two hosts look identical.
|
||||||
|
const FRAG_Y_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float Y=(16.0+219.0*(0.2126*c.r+0.7152*c.g+0.0722*c.b))/255.0;o_color=vec4(clamp(Y,0.0,1.0),0.0,0.0,1.0);}\n";
|
||||||
|
const FRAG_UV_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float U=(128.0+224.0*(-0.1146*c.r-0.3854*c.g+0.5000*c.b))/255.0;float V=(128.0+224.0*(0.5000*c.r-0.4542*c.g-0.0458*c.b))/255.0;o_color=vec4(clamp(U,0.0,1.0),clamp(V,0.0,1.0),0.0,1.0);}\n";
|
||||||
|
|
||||||
unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
|
unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
|
||||||
let sh = glCreateShader(kind);
|
let sh = glCreateShader(kind);
|
||||||
ensure!(sh != 0, "glCreateShader failed");
|
ensure!(sh != 0, "glCreateShader failed");
|
||||||
@@ -113,9 +145,11 @@ unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
|
|||||||
Ok(sh)
|
Ok(sh)
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe fn compile_program() -> Result<u32> {
|
/// Compile+link the fullscreen-triangle program with fragment source `frag` and bind its `image`
|
||||||
|
/// sampler to texture unit 0.
|
||||||
|
unsafe fn compile_program_with(frag: &[u8]) -> Result<u32> {
|
||||||
let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?;
|
let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?;
|
||||||
let fs = compile_shader(GL_FRAGMENT_SHADER, FRAG_SRC)?;
|
let fs = compile_shader(GL_FRAGMENT_SHADER, frag)?;
|
||||||
let prog = glCreateProgram();
|
let prog = glCreateProgram();
|
||||||
glAttachShader(prog, vs);
|
glAttachShader(prog, vs);
|
||||||
glAttachShader(prog, fs);
|
glAttachShader(prog, fs);
|
||||||
@@ -134,6 +168,10 @@ unsafe fn compile_program() -> Result<u32> {
|
|||||||
Ok(prog)
|
Ok(prog)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsafe fn compile_program() -> Result<u32> {
|
||||||
|
compile_program_with(FRAG_SRC)
|
||||||
|
}
|
||||||
|
|
||||||
/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture.
|
/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture.
|
||||||
struct GlBlit {
|
struct GlBlit {
|
||||||
program: u32,
|
program: u32,
|
||||||
@@ -230,6 +268,165 @@ impl GlBlit {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Per-size GL machinery to convert a dmabuf EGLImage into an NV12 (BT.709 limited-range) pair —
|
||||||
|
/// the [`GlBlit`] analogue for the `PUNKTFUNK_NV12` path. Two passes share `src_tex`: a full-res Y
|
||||||
|
/// pass into a CUDA-registrable `GL_R8` texture and a half-res UV pass into a `GL_RG8` texture.
|
||||||
|
/// Feeding NVENC native NV12 deletes its internal RGB→YUV CSC (which otherwise runs on the SM that a
|
||||||
|
/// saturating game pins at 100%); the convert here replaces the BGRx swizzle [`GlBlit`] did, at ~the
|
||||||
|
/// same 3D cost.
|
||||||
|
struct Nv12Blit {
|
||||||
|
y_program: u32,
|
||||||
|
uv_program: u32,
|
||||||
|
vao: u32,
|
||||||
|
y_fbo: u32,
|
||||||
|
uv_fbo: u32,
|
||||||
|
/// CUDA-registrable luma target (immutable `GL_R8`, W×H).
|
||||||
|
y_tex: u32,
|
||||||
|
/// CUDA-registrable chroma target (immutable `GL_RG8`, W/2 × H/2).
|
||||||
|
uv_tex: u32,
|
||||||
|
/// Source texture re-targeted to each frame's EGLImage. `GL_LINEAR` so the UV pass averages 2×2.
|
||||||
|
src_tex: u32,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
y_registered: cuda::RegisteredTexture,
|
||||||
|
uv_registered: cuda::RegisteredTexture,
|
||||||
|
/// Recycled NV12 device buffers (two-plane) handed to the encoder.
|
||||||
|
pool: cuda::BufferPool,
|
||||||
|
/// Self-test only: whether `src_tex` has had immutable RGBA8 storage allocated for the upload
|
||||||
|
/// path (the live path retargets `src_tex` via EGLImage instead, never allocating storage).
|
||||||
|
test_src_storage: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Nv12Blit {
|
||||||
|
unsafe fn new(width: u32, height: u32) -> Result<Nv12Blit> {
|
||||||
|
ensure!(
|
||||||
|
width % 2 == 0 && height % 2 == 0,
|
||||||
|
"NV12 convert needs even dimensions (got {width}x{height})"
|
||||||
|
);
|
||||||
|
let y_program = compile_program_with(FRAG_Y_SRC)?;
|
||||||
|
let uv_program = compile_program_with(FRAG_UV_SRC)?;
|
||||||
|
let mut vao = 0u32;
|
||||||
|
glGenVertexArrays(1, &mut vao);
|
||||||
|
let mut fbos = [0u32; 2];
|
||||||
|
glGenFramebuffers(2, fbos.as_mut_ptr());
|
||||||
|
let (y_fbo, uv_fbo) = (fbos[0], fbos[1]);
|
||||||
|
|
||||||
|
// Luma target: GL_R8 at full resolution.
|
||||||
|
let mut y_tex = 0u32;
|
||||||
|
glGenTextures(1, &mut y_tex);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, y_tex);
|
||||||
|
glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, width as c_int, height as c_int);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||||
|
|
||||||
|
// Chroma target: GL_RG8 at half resolution (R=U, G=V).
|
||||||
|
let mut uv_tex = 0u32;
|
||||||
|
glGenTextures(1, &mut uv_tex);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, uv_tex);
|
||||||
|
glTexStorage2D(
|
||||||
|
GL_TEXTURE_2D,
|
||||||
|
1,
|
||||||
|
GL_RG8,
|
||||||
|
(width / 2) as c_int,
|
||||||
|
(height / 2) as c_int,
|
||||||
|
);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||||
|
|
||||||
|
// Source: GL_LINEAR so the half-res UV pass averages the 2×2 chroma footprint.
|
||||||
|
let mut src_tex = 0u32;
|
||||||
|
glGenTextures(1, &mut src_tex);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, src_tex);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, 0);
|
||||||
|
|
||||||
|
for (fbo, tex) in [(y_fbo, y_tex), (uv_fbo, uv_tex)] {
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
|
||||||
|
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex, 0);
|
||||||
|
let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||||
|
ensure!(
|
||||||
|
status == GL_FRAMEBUFFER_COMPLETE,
|
||||||
|
"NV12 blit FBO incomplete ({status:#x}) — GL_R8/GL_RG8 not renderable?"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
// Register both convert targets with CUDA once (per-resolution), + the NV12 two-plane pool.
|
||||||
|
let y_registered = cuda::RegisteredTexture::register_gl(y_tex)?;
|
||||||
|
let uv_registered = cuda::RegisteredTexture::register_gl(uv_tex)?;
|
||||||
|
let pool = cuda::BufferPool::new_nv12(width, height)?;
|
||||||
|
Ok(Nv12Blit {
|
||||||
|
y_program,
|
||||||
|
uv_program,
|
||||||
|
vao,
|
||||||
|
y_fbo,
|
||||||
|
uv_fbo,
|
||||||
|
y_tex,
|
||||||
|
uv_tex,
|
||||||
|
src_tex,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
y_registered,
|
||||||
|
uv_registered,
|
||||||
|
pool,
|
||||||
|
test_src_storage: false,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bind `image` to the source texture and run both convert passes into `y_tex`/`uv_tex`.
|
||||||
|
///
|
||||||
|
/// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
|
||||||
|
unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
|
||||||
|
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||||
|
let _ = glGetError();
|
||||||
|
egl_image_target(GL_TEXTURE_2D, image);
|
||||||
|
let e = glGetError();
|
||||||
|
glBindTexture(GL_TEXTURE_2D, 0);
|
||||||
|
ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");
|
||||||
|
self.run_passes()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run the two convert passes from whatever is currently in `src_tex` (caller populated it).
|
||||||
|
/// Shared by [`run`](Self::run) (EGLImage source) and the self-test (uploaded RGBA source).
|
||||||
|
///
|
||||||
|
/// # Safety: the GL context is current on this thread.
|
||||||
|
unsafe fn run_passes(&self) -> Result<()> {
|
||||||
|
glActiveTexture(GL_TEXTURE0);
|
||||||
|
glBindVertexArray(self.vao);
|
||||||
|
// Y pass: full-res into the R8 target.
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, self.y_fbo);
|
||||||
|
glViewport(0, 0, self.width as c_int, self.height as c_int);
|
||||||
|
glUseProgram(self.y_program);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||||
|
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||||
|
// UV pass: half-res into the RG8 target (GL_LINEAR averages the 2×2).
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, self.uv_fbo);
|
||||||
|
glViewport(0, 0, (self.width / 2) as c_int, (self.height / 2) as c_int);
|
||||||
|
glUseProgram(self.uv_program);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||||
|
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||||
|
|
||||||
|
glBindVertexArray(0);
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||||
|
glFlush(); // submit GL work before CUDA maps the textures
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for Nv12Blit {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
unsafe {
|
||||||
|
glDeleteTextures(1, &self.y_tex);
|
||||||
|
glDeleteTextures(1, &self.uv_tex);
|
||||||
|
glDeleteTextures(1, &self.src_tex);
|
||||||
|
glDeleteFramebuffers(2, [self.y_fbo, self.uv_fbo].as_ptr());
|
||||||
|
glDeleteVertexArrays(1, &self.vao);
|
||||||
|
glDeleteProgram(self.y_program);
|
||||||
|
glDeleteProgram(self.uv_program);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
|
/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
|
||||||
#[derive(Clone, Copy, Debug)]
|
#[derive(Clone, Copy, Debug)]
|
||||||
pub struct DmabufPlane {
|
pub struct DmabufPlane {
|
||||||
@@ -252,6 +449,8 @@ pub struct EglImporter {
|
|||||||
egl_image_target: EglImageTargetFn,
|
egl_image_target: EglImageTargetFn,
|
||||||
/// Lazily-created GL blit machinery (recreated if the frame size changes).
|
/// Lazily-created GL blit machinery (recreated if the frame size changes).
|
||||||
blit: Option<GlBlit>,
|
blit: Option<GlBlit>,
|
||||||
|
/// Lazily-created NV12 convert machinery (`PUNKTFUNK_NV12` path; recreated on size change).
|
||||||
|
nv12_blit: Option<Nv12Blit>,
|
||||||
/// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA),
|
/// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA),
|
||||||
/// created lazily on the first LINEAR frame, + the destination pool.
|
/// created lazily on the first LINEAR frame, + the destination pool.
|
||||||
vk: Option<super::vulkan::VkBridge>,
|
vk: Option<super::vulkan::VkBridge>,
|
||||||
@@ -355,6 +554,7 @@ impl EglImporter {
|
|||||||
_gl_ctx: gl_ctx,
|
_gl_ctx: gl_ctx,
|
||||||
egl_image_target,
|
egl_image_target,
|
||||||
blit: None,
|
blit: None,
|
||||||
|
nv12_blit: None,
|
||||||
vk: None,
|
vk: None,
|
||||||
linear_pool: None,
|
linear_pool: None,
|
||||||
gbm,
|
gbm,
|
||||||
@@ -448,6 +648,33 @@ impl EglImporter {
|
|||||||
height: u32,
|
height: u32,
|
||||||
fourcc: u32,
|
fourcc: u32,
|
||||||
modifier: Option<u64>,
|
modifier: Option<u64>,
|
||||||
|
) -> Result<DeviceBuffer> {
|
||||||
|
self.import_inner(plane, width, height, fourcc, modifier, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Like [`import`](Self::import), but de-tiles **and converts** the dmabuf to NV12 (BT.709
|
||||||
|
/// limited range) on the GPU — the `PUNKTFUNK_NV12` path — so NVENC can encode native YUV with
|
||||||
|
/// no internal RGB→YUV CSC. The returned [`DeviceBuffer`] carries both NV12 planes
|
||||||
|
/// (`DeviceBuffer::is_nv12`). Only the tiled EGL/GL path supports this (LINEAR/Vulkan stays RGB).
|
||||||
|
pub fn import_nv12(
|
||||||
|
&mut self,
|
||||||
|
plane: &DmabufPlane,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
fourcc: u32,
|
||||||
|
modifier: Option<u64>,
|
||||||
|
) -> Result<DeviceBuffer> {
|
||||||
|
self.import_inner(plane, width, height, fourcc, modifier, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn import_inner(
|
||||||
|
&mut self,
|
||||||
|
plane: &DmabufPlane,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
fourcc: u32,
|
||||||
|
modifier: Option<u64>,
|
||||||
|
nv12: bool,
|
||||||
) -> Result<DeviceBuffer> {
|
) -> Result<DeviceBuffer> {
|
||||||
let mut attrs: Vec<egl::Attrib> = vec![
|
let mut attrs: Vec<egl::Attrib> = vec![
|
||||||
egl::WIDTH as egl::Attrib,
|
egl::WIDTH as egl::Attrib,
|
||||||
@@ -484,10 +711,14 @@ impl EglImporter {
|
|||||||
)
|
)
|
||||||
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
||||||
|
|
||||||
// EGLImage → (sampled by a shader) → GL_RGBA8 texture → register *that* with CUDA → map
|
// EGLImage → (sampled by a shader) → GL_RGBA8 texture (or NV12 R8+RG8 pair) → register
|
||||||
// → array → copy out. Registering the EGLImage texture directly fails (its layout isn't a
|
// *that* with CUDA → map → array → copy out. Registering the EGLImage texture directly
|
||||||
// CUDA-registrable format); the RGBA8 render target is.
|
// fails (its layout isn't a CUDA-registrable format); the render targets are.
|
||||||
let result = self.blit_and_copy(image.as_ptr(), width, height);
|
let result = if nv12 {
|
||||||
|
self.blit_and_copy_nv12(image.as_ptr(), width, height)
|
||||||
|
} else {
|
||||||
|
self.blit_and_copy(image.as_ptr(), width, height)
|
||||||
|
};
|
||||||
let _ = self.egl.destroy_image(self.display, image);
|
let _ = self.egl.destroy_image(self.display, image);
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
@@ -514,6 +745,80 @@ impl EglImporter {
|
|||||||
blit.registered.copy_mapped_to(&dst)?;
|
blit.registered.copy_mapped_to(&dst)?;
|
||||||
Ok(dst)
|
Ok(dst)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert the dmabuf `image` to NV12 (Y in an R8 texture, UV in an RG8 texture) and copy both
|
||||||
|
/// planes into a pooled NV12 [`DeviceBuffer`]. (Re)creates the per-size convert machinery as
|
||||||
|
/// needed. The `PUNKTFUNK_NV12` analogue of [`blit_and_copy`].
|
||||||
|
fn blit_and_copy_nv12(
|
||||||
|
&mut self,
|
||||||
|
image: *mut c_void,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
) -> Result<DeviceBuffer> {
|
||||||
|
cuda::make_current()?;
|
||||||
|
if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||||
|
self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? });
|
||||||
|
}
|
||||||
|
let egl_image_target = self.egl_image_target;
|
||||||
|
let blit = self.nv12_blit.as_mut().unwrap();
|
||||||
|
// SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
|
||||||
|
unsafe { blit.run(egl_image_target, image)? };
|
||||||
|
let dst = blit.pool.get()?;
|
||||||
|
cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?;
|
||||||
|
Ok(dst)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Self-test entry: upload a packed `width`×`height` RGBA8 host pattern into a GL texture, run
|
||||||
|
/// the NV12 convert passes on the GPU, and copy both planes into a pooled NV12 [`DeviceBuffer`].
|
||||||
|
/// Exercises the exact shaders + CUDA copy the live path uses, but sourced from an uploaded
|
||||||
|
/// texture instead of a dmabuf EGLImage (no compositor needed). `rgba` is tightly packed, 4 B/px.
|
||||||
|
pub fn convert_rgba_for_test(
|
||||||
|
&mut self,
|
||||||
|
rgba: &[u8],
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
) -> Result<DeviceBuffer> {
|
||||||
|
anyhow::ensure!(
|
||||||
|
rgba.len() == width as usize * height as usize * 4,
|
||||||
|
"test RGBA buffer {} bytes != {}x{}x4",
|
||||||
|
rgba.len(),
|
||||||
|
width,
|
||||||
|
height
|
||||||
|
);
|
||||||
|
cuda::make_current()?;
|
||||||
|
if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||||
|
self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? });
|
||||||
|
}
|
||||||
|
let blit = self.nv12_blit.as_mut().unwrap();
|
||||||
|
unsafe {
|
||||||
|
// Upload the host RGBA into `src_tex` (an immutable GL_RGBA8 backing must exist first;
|
||||||
|
// the live path never allocates it — it retargets `src_tex` via EGLImage instead).
|
||||||
|
glBindTexture(GL_TEXTURE_2D, blit.src_tex);
|
||||||
|
if !blit.test_src_storage {
|
||||||
|
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
|
||||||
|
blit.test_src_storage = true;
|
||||||
|
}
|
||||||
|
let _ = glGetError();
|
||||||
|
glTexSubImage2D(
|
||||||
|
GL_TEXTURE_2D,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
width as c_int,
|
||||||
|
height as c_int,
|
||||||
|
GL_RGBA,
|
||||||
|
GL_UNSIGNED_BYTE,
|
||||||
|
rgba.as_ptr() as *const c_void,
|
||||||
|
);
|
||||||
|
let e = glGetError();
|
||||||
|
glBindTexture(GL_TEXTURE_2D, 0);
|
||||||
|
ensure!(e == 0, "glTexSubImage2D(test source) failed ({e:#x})");
|
||||||
|
blit.run_passes()?;
|
||||||
|
}
|
||||||
|
let dst = blit.pool.get()?;
|
||||||
|
cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?;
|
||||||
|
Ok(dst)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for EglImporter {
|
impl Drop for EglImporter {
|
||||||
|
|||||||
@@ -14,13 +14,26 @@ pub mod vulkan;
|
|||||||
pub use cuda::DeviceBuffer;
|
pub use cuda::DeviceBuffer;
|
||||||
pub use egl::{DmabufPlane, EglImporter};
|
pub use egl::{DmabufPlane, EglImporter};
|
||||||
|
|
||||||
/// Whether a `PUNKTFUNK_*` flag is truthy (`1`/`true`/`yes`/`on`).
///
/// The value is trimmed before comparison and matched exactly (lower-case only). An unset or
/// non-Unicode variable counts as not set.
fn flag(name: &str) -> bool {
    match std::env::var(name) {
        Ok(value) => matches!(value.trim(), "1" | "true" | "yes" | "on"),
        Err(_) => false,
    }
}
|
||||||
|
|
||||||
|
/// Whether the zero-copy path is opted in (`PUNKTFUNK_ZEROCOPY` truthy).
|
||||||
|
pub fn enabled() -> bool {
|
||||||
|
flag("PUNKTFUNK_ZEROCOPY")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether the NV12 convert path is opted in (`PUNKTFUNK_NV12` truthy). When set AND the zero-copy
|
||||||
|
/// tiled-GL path is active, the capturer produces native NV12 (BT.709 limited range) on the GPU and
|
||||||
|
/// feeds NVENC YUV directly — deleting NVENC's internal RGB→YUV CSC (Tier 2A). Off by default: the
|
||||||
|
/// existing RGB/BGRx path is then 100% unchanged.
|
||||||
|
pub fn nv12_enabled() -> bool {
|
||||||
|
flag("PUNKTFUNK_NV12")
|
||||||
|
}
|
||||||
|
|
||||||
/// DRM FourCC for a packed 32-bit format name (little-endian, e.g. `b"XR24"`).
|
/// DRM FourCC for a packed 32-bit format name (little-endian, e.g. `b"XR24"`).
|
||||||
const fn fourcc(c: &[u8; 4]) -> u32 {
|
const fn fourcc(c: &[u8; 4]) -> u32 {
|
||||||
(c[0] as u32) | ((c[1] as u32) << 8) | ((c[2] as u32) << 16) | ((c[3] as u32) << 24)
|
(c[0] as u32) | ((c[1] as u32) << 8) | ((c[2] as u32) << 16) | ((c[3] as u32) << 24)
|
||||||
@@ -49,3 +62,152 @@ pub fn probe() -> anyhow::Result<()> {
|
|||||||
tracing::info!(cuda_ctx = ?ctx, "zero-copy probe OK — EGL display + CUDA context initialized");
|
tracing::info!(cuda_ctx = ?ctx, "zero-copy probe OK — EGL display + CUDA context initialized");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// CPU reference for the GPU shaders in [`egl`]: convert one full-range 8-bit RGB pixel to
/// BT.709 LIMITED-range `(Y, U, V)`, returned unrounded as `f64` on the 0–255 code scale
/// (Y in [16,235], U/V in [16,240]).
fn bt709_limited(r: u8, g: u8, b: u8) -> (f64, f64, f64) {
    // Normalise each channel to [0, 1] before applying the matrix.
    let unit = |channel: u8| channel as f64 / 255.0;
    let (red, green, blue) = (unit(r), unit(g), unit(b));

    let luma = 0.2126 * red + 0.7152 * green + 0.0722 * blue;
    let cb = -0.1146 * red - 0.3854 * green + 0.5000 * blue;
    let cr = 0.5000 * red - 0.4542 * green - 0.0458 * blue;

    // Limited range: luma spans 219 codes above 16, chroma spans 224 codes centred on 128.
    (16.0 + 219.0 * luma, 128.0 + 224.0 * cb, 128.0 + 224.0 * cr)
}
|
||||||
|
|
||||||
|
/// NV12 colour self-test (the `nv12-selftest` subcommand): stand up the EGL/GL + CUDA stack, upload
|
||||||
|
/// a known synthetic RGBA pattern, run the real NV12 convert shaders on the GPU, read the Y and UV
|
||||||
|
/// planes back, and compare against a Rust BT.709 limited-range reference. Validates colour
|
||||||
|
/// correctness on the GPU **without a display** (the project's green-screen bugs came from exactly
|
||||||
|
/// this kind of plane/layout error). PASS if max abs error Y ≤ 2, U/V ≤ 3.
|
||||||
|
pub fn nv12_selftest() -> anyhow::Result<()> {
|
||||||
|
use anyhow::bail;
|
||||||
|
|
||||||
|
// 64x64, even dims. A 4x4 grid of 16x16 flat-colour blocks (so each 2x2 chroma footprint is
|
||||||
|
// uniform → exact chroma comparison) covering the primaries + gray/black/white, then the rest
|
||||||
|
// is a diagonal gradient (every pixel changes — a Y-channel stress that also exercises the
|
||||||
|
// chroma averaging; the gradient blocks are compared on Y only).
|
||||||
|
const W: u32 = 64;
|
||||||
|
const H: u32 = 64;
|
||||||
|
const BLK: u32 = 16;
|
||||||
|
// (name, r, g, b) for the labelled blocks in row-major grid order; the rest fall to gradient.
|
||||||
|
let named: [(&str, u8, u8, u8); 8] = [
|
||||||
|
("red", 255, 0, 0),
|
||||||
|
("green", 0, 255, 0),
|
||||||
|
("blue", 0, 0, 255),
|
||||||
|
("white", 255, 255, 255),
|
||||||
|
("black", 0, 0, 0),
|
||||||
|
("gray128", 128, 128, 128),
|
||||||
|
("yellow", 255, 255, 0),
|
||||||
|
("cyan", 0, 255, 255),
|
||||||
|
];
|
||||||
|
|
||||||
|
// Build the RGBA pattern + a parallel record of each pixel's (r,g,b) and whether it sits in a
|
||||||
|
// flat block (chroma-comparable) or the gradient (Y-only).
|
||||||
|
let mut rgba = vec![0u8; (W * H * 4) as usize];
|
||||||
|
let mut flat = vec![false; (W * H) as usize];
|
||||||
|
let grid_cols = W / BLK; // 4
|
||||||
|
let pixel_rgb = |x: u32, y: u32| -> (u8, u8, u8, bool) {
|
||||||
|
let bx = x / BLK;
|
||||||
|
let by = y / BLK;
|
||||||
|
let idx = (by * grid_cols + bx) as usize;
|
||||||
|
if idx < named.len() {
|
||||||
|
let (_, r, g, b) = named[idx];
|
||||||
|
(r, g, b, true)
|
||||||
|
} else {
|
||||||
|
// Diagonal gradient — distinct per pixel.
|
||||||
|
let r = ((x * 4) & 0xff) as u8;
|
||||||
|
let g = ((y * 4) & 0xff) as u8;
|
||||||
|
let b = (((x + y) * 2) & 0xff) as u8;
|
||||||
|
(r, g, b, false)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for y in 0..H {
|
||||||
|
for x in 0..W {
|
||||||
|
let (r, g, b, is_flat) = pixel_rgb(x, y);
|
||||||
|
let i = ((y * W + x) * 4) as usize;
|
||||||
|
rgba[i] = r;
|
||||||
|
rgba[i + 1] = g;
|
||||||
|
rgba[i + 2] = b;
|
||||||
|
rgba[i + 3] = 255;
|
||||||
|
flat[(y * W + x) as usize] = is_flat;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GPU convert.
|
||||||
|
let mut importer = EglImporter::new()?;
|
||||||
|
let nv12 = importer.convert_rgba_for_test(&rgba, W, H)?;
|
||||||
|
let (uv_ptr, uv_pitch) = nv12
|
||||||
|
.uv
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("self-test buffer is not NV12"))?;
|
||||||
|
// Read both planes back to host (tightly packed).
|
||||||
|
let y_host = cuda::read_plane_to_host(nv12.ptr, nv12.pitch, W as usize, H as usize)?;
|
||||||
|
let uv_host = cuda::read_plane_to_host(uv_ptr, uv_pitch, (W as usize / 2) * 2, H as usize / 2)?;
|
||||||
|
|
||||||
|
// Compare Y over every pixel.
|
||||||
|
let mut max_y_err = 0.0f64;
|
||||||
|
for y in 0..H {
|
||||||
|
for x in 0..W {
|
||||||
|
let (r, g, b, _) = pixel_rgb(x, y);
|
||||||
|
let (ref_y, _, _) = bt709_limited(r, g, b);
|
||||||
|
let got = y_host[(y * W + x) as usize] as f64;
|
||||||
|
max_y_err = max_y_err.max((got - ref_y).abs());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare U/V over flat blocks only (each 2x2 footprint is a single colour → exact reference).
|
||||||
|
// Chroma is W/2 × H/2 samples, interleaved [U,V] per sample.
|
||||||
|
let cw = W / 2;
|
||||||
|
let ch = H / 2;
|
||||||
|
let mut max_u_err = 0.0f64;
|
||||||
|
let mut max_v_err = 0.0f64;
|
||||||
|
for cy in 0..ch {
|
||||||
|
for cx in 0..cw {
|
||||||
|
// The 2x2 source footprint of this chroma sample.
|
||||||
|
let (sx, sy) = (cx * 2, cy * 2);
|
||||||
|
// Only compare where all 4 source pixels are flat (uniform colour).
|
||||||
|
let all_flat =
|
||||||
|
(0..2).all(|dy| (0..2).all(|dx| flat[((sy + dy) * W + (sx + dx)) as usize]));
|
||||||
|
if !all_flat {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let (r, g, b, _) = pixel_rgb(sx, sy);
|
||||||
|
let (_, ref_u, ref_v) = bt709_limited(r, g, b);
|
||||||
|
let base = ((cy * cw + cx) * 2) as usize;
|
||||||
|
let got_u = uv_host[base] as f64;
|
||||||
|
let got_v = uv_host[base + 1] as f64;
|
||||||
|
max_u_err = max_u_err.max((got_u - ref_u).abs());
|
||||||
|
max_v_err = max_v_err.max((got_v - ref_v).abs());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-primary actual-vs-expected (block centre for chroma).
|
||||||
|
println!("NV12 self-test ({W}x{H}, BT.709 limited range)");
|
||||||
|
println!(
|
||||||
|
" {:<8} {:>14} {:>14} {:>14}",
|
||||||
|
"color", "Y exp/got", "U exp/got", "V exp/got"
|
||||||
|
);
|
||||||
|
for (idx, (name, r, g, b)) in named.iter().enumerate() {
|
||||||
|
let bx = (idx as u32 % grid_cols) * BLK + BLK / 2;
|
||||||
|
let by = (idx as u32 / grid_cols) * BLK + BLK / 2;
|
||||||
|
let (ey, eu, ev) = bt709_limited(*r, *g, *b);
|
||||||
|
let gy = y_host[(by * W + bx) as usize] as f64;
|
||||||
|
let (ccx, ccy) = (bx / 2, by / 2);
|
||||||
|
let cbase = ((ccy * cw + ccx) * 2) as usize;
|
||||||
|
let gu = uv_host[cbase] as f64;
|
||||||
|
let gv = uv_host[cbase + 1] as f64;
|
||||||
|
println!(
|
||||||
|
" {:<8} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0}",
|
||||||
|
name, ey, gy, eu, gu, ev, gv
|
||||||
|
);
|
||||||
|
}
|
||||||
|
println!(
|
||||||
|
" max abs error: Y={max_y_err:.2} (≤2) U={max_u_err:.2} (≤3) V={max_v_err:.2} (≤3)"
|
||||||
|
);
|
||||||
|
|
||||||
|
if max_y_err <= 2.0 && max_u_err <= 3.0 && max_v_err <= 3.0 {
|
||||||
|
println!("PASS");
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
println!("FAIL");
|
||||||
|
bail!("NV12 self-test FAILED (Y={max_y_err:.2} U={max_u_err:.2} V={max_v_err:.2})");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user