From aa9148500872e4fe4b77a8bc6291b135ebe4627b Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Tue, 9 Jun 2026 16:28:29 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20M2=20=E2=80=94=20complete=20zero-copy?= =?UTF-8?q?=20dmabuf=E2=86=92NVENC=20capture=20path=20(EGL/GL=E2=86=92CUDA?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PipeWire dmabuf now reaches NVENC with no CPU touch. Verified live against headless KWin: a tiled BGRx dmabuf is imported and encoded to a pixel-correct H.265 stream (decoded frame matches the captured desktop — no tiling artifacts, no colour swap). The CPU-copy path stays the default and the runtime fallback. Capture side (zerocopy::egl): desktop NVIDIA can't register a dmabuf EGLImage with CUDA directly (cuGraphicsEGLRegisterImage is Tegra-only; cuGraphicsGLRegisterImage rejects EGLImage-backed textures), so we follow OBS/Sunshine — bind the EGLImage to a GL texture, render it through a fullscreen-triangle shader into an immutable GL_RGBA8 texture (de-tiling + .bgra swizzle to the BGRx the encoder wants), then register that texture with CUDA and copy it device-to-device into an owned buffer so the dmabuf returns to the compositor immediately. Encode side (encode/linux::submit_cuda): take a *pooled* CUDA surface via av_hwframe_get_buffer and device→device-copy our imported buffer into it, instead of wrapping our own pointer in a bare AVFrame. A bare frame is rejected with EINVAL (NVENC ignores frames with null buf[0]; the encode path's av_frame_ref needs a refcounted buffer), and a fresh device pointer every frame would thrash NVENC's bounded resource-registration cache — the pool recycles a small set. Also: gate FFmpeg AV_LOG_DEBUG behind LUMEN_FFMPEG_DEBUG for diagnosing hw-frame rejects, and refresh the now-accurate module docs. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/lumen-host/src/capture/linux.rs | 2 +- crates/lumen-host/src/encode/linux.rs | 39 ++- crates/lumen-host/src/zerocopy/cuda.rs | 172 ++++++------ crates/lumen-host/src/zerocopy/egl.rs | 369 +++++++++++++++++++++---- 4 files changed, 431 insertions(+), 151 deletions(-) diff --git a/crates/lumen-host/src/capture/linux.rs b/crates/lumen-host/src/capture/linux.rs index 22660ee..dd40af3 100644 --- a/crates/lumen-host/src/capture/linux.rs +++ b/crates/lumen-host/src/capture/linux.rs @@ -547,7 +547,7 @@ mod pipewire { // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall // through to the shm de-pad copy below. - if let (Some(importer), Some(fmt)) = (ud.importer.as_ref(), ud.format) { + if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) { if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { let plane = crate::zerocopy::DmabufPlane { fd: datas[0].fd(), diff --git a/crates/lumen-host/src/encode/linux.rs b/crates/lumen-host/src/encode/linux.rs index 8701a50..454ed4e 100644 --- a/crates/lumen-host/src/encode/linux.rs +++ b/crates/lumen-host/src/encode/linux.rs @@ -139,6 +139,9 @@ impl NvencEncoder { cuda: bool, ) -> Result { ffmpeg::init().context("ffmpeg init")?; + if std::env::var_os("LUMEN_FFMPEG_DEBUG").is_some() { + unsafe { ffi::av_log_set_level(48) }; // AV_LOG_DEBUG — surface NVENC hw-frame rejects + } let name = codec.nvenc_name(); let av_codec = encoder::find_by_name(name) .ok_or_else(|| anyhow!("{name} not built into libavcodec"))?; @@ -316,9 +319,16 @@ impl NvencEncoder { Ok(()) } - /// Zero-copy path: wrap the imported CUDA device buffer in an `AV_PIX_FMT_CUDA` frame and - /// send it straight to NVENC (no CPU touch). `buf.ptr` aliases device memory we own, so - /// `buf[0]` is left null (ffmpeg must not free it); the frame shell is freed after send. 
+ /// Zero-copy path: hand the imported CUDA device buffer to NVENC with no CPU touch. + /// + /// We take a *pooled* surface from the CUDA hwframes context (`av_hwframe_get_buffer`) and + /// device→device-copy our imported buffer into it, rather than wrapping our own pointer in a + /// bare frame. Two reasons: (1) NVENC's `nvenc_send_frame` ignores frames whose `buf[0]` is + /// null and the generic encode path's `av_frame_ref` needs a refcounted buffer — a bare + /// frame is rejected with `EINVAL`; (2) NVENC caches CUDA-resource *registrations* keyed by + /// device pointer with a bounded table, so a fresh pointer every frame would thrash/overflow + /// it — the pool recycles a small set of pointers. The extra copy is device-local (~8 MB at + /// 1080p, sub-millisecond on the GPU) and keeps the host fully off the pixel path. fn submit_cuda( &mut self, buf: &crate::zerocopy::DeviceBuffer, @@ -330,17 +340,28 @@ impl NvencEncoder { .as_ref() .context("CUDA hw context missing (encoder opened in CPU mode)")? .frames_ref; + // The device→device copy below uses our shared context directly; make it current on the + // encode thread (ffmpeg pushes its own around the pool alloc, so order is fine). + crate::zerocopy::cuda::make_current().context("CUDA context current (encode thread)")?; unsafe { let mut f = ffi::av_frame_alloc(); if f.is_null() { bail!("av_frame_alloc failed"); } - (*f).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA as c_int; - (*f).width = self.width as c_int; - (*f).height = self.height as c_int; - (*f).hw_frames_ctx = ffi::av_buffer_ref(frames_ref); - (*f).data[0] = buf.ptr as *mut u8; - (*f).linesize[0] = buf.pitch as c_int; + // Pooled CUDA surface: sets format, width/height, data[0]/linesize[0], buf[0] and + // hw_frames_ctx. Reused across frames (the pool recycles), keeping NVENC's + // registration cache warm. 
+ let r = ffi::av_hwframe_get_buffer(frames_ref, f, 0); + if r < 0 { + ffi::av_frame_free(&mut f); + bail!("av_hwframe_get_buffer(CUDA) failed ({r})"); + } + let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr; + let dst_pitch = (*f).linesize[0] as usize; + if let Err(e) = crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch) { + ffi::av_frame_free(&mut f); + return Err(e).context("copy imported buffer into NVENC surface"); + } (*f).pts = pts; (*f).pict_type = if idr { ffi::AVPictureType::AV_PICTURE_TYPE_I diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs index 31206fa..13da31e 100644 --- a/crates/lumen-host/src/zerocopy/cuda.rs +++ b/crates/lumen-host/src/zerocopy/cuda.rs @@ -1,8 +1,9 @@ -//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the EGL-interop -//! driver calls (`cuGraphicsEGLRegisterImage` / `cuGraphicsResourceGetMappedEglFrame`) nor -//! `CUeglFrame`, so we hand-roll exactly what we need and link `libcuda.so.1` (the driver -//! library — NOT `libcudart`). Symbol names verified against `cust_raw` + `cudaEGL.h`: the -//! context/mem ops use the `_v2` ABI suffix; the graphics/EGL-interop ops are unsuffixed. +//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the GL-interop +//! driver calls we need (`cuGraphicsGLRegisterImage` & co.), so we hand-roll exactly those and +//! link `libcuda.so.1` (the driver library — NOT `libcudart`). Symbol names verified against +//! `cust_raw` + `cudaGL.h`: the context/mem ops use the `_v2` ABI suffix; the graphics-interop +//! ops are unsuffixed. (We use GL interop, not EGL interop: `cuGraphicsEGLRegisterImage` is +//! Tegra-only on the desktop driver — see [`super::egl`].) //! //! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture //! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use. 
@@ -25,26 +26,6 @@ pub type CUarray = *mut c_void; pub const CU_MEMORYTYPE_DEVICE: c_uint = 2; pub const CU_MEMORYTYPE_ARRAY: c_uint = 3; -/// `CUeglFrameType`: ARRAY=0, PITCH=1. -pub const CU_EGL_FRAME_TYPE_ARRAY: c_uint = 0; -pub const CU_EGL_FRAME_TYPE_PITCH: c_uint = 1; - -/// `CUeglFrame` — exact layout from `cudaEGL.h`. `frame` is a union of `CUarray pArray[3]` and -/// `void* pPitch[3]`; both are three pointers, so `[*mut c_void; 3]` models it. -#[repr(C)] -pub struct CUeglFrame { - pub frame: [*mut c_void; 3], - pub width: c_uint, - pub height: c_uint, - pub depth: c_uint, - pub pitch: c_uint, - pub planeCount: c_uint, - pub numChannels: c_uint, - pub frameType: c_uint, - pub eglColorFormat: c_uint, - pub cuFormat: c_uint, -} - /// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing. #[repr(C)] #[derive(Default)] @@ -67,13 +48,6 @@ pub struct CUDA_MEMCPY2D { pub Height: usize, } -impl Default for CUeglFrame { - fn default() -> Self { - // SAFETY: all fields are integers or pointers; zero is a valid bit pattern. - unsafe { std::mem::zeroed() } - } -} - #[link(name = "cuda")] extern "C" { fn cuInit(flags: c_uint) -> CUresult; @@ -91,15 +65,28 @@ extern "C" { fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult; fn cuCtxSynchronize() -> CUresult; - fn cuGraphicsEGLRegisterImage( + // GL interop (cudaGL.h) — these symbols have NO `_v2` suffix. `cuGraphicsEGLRegisterImage` + // is Tegra-only on the desktop driver, so we go EGLImage → GL texture → register the texture. 
+ fn cuGraphicsGLRegisterImage( resource: *mut CUgraphicsResource, - image: *mut c_void, // EGLImage - flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_NONE = 0 + texture: c_uint, // GLuint + target: c_uint, // GL_TEXTURE_2D = 0x0DE1 + flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01 ) -> CUresult; - fn cuGraphicsResourceGetMappedEglFrame( - egl_frame: *mut CUeglFrame, + fn cuGraphicsMapResources( + count: c_uint, + resources: *mut CUgraphicsResource, + stream: *mut c_void, + ) -> CUresult; + fn cuGraphicsUnmapResources( + count: c_uint, + resources: *mut CUgraphicsResource, + stream: *mut c_void, + ) -> CUresult; + fn cuGraphicsSubResourceGetMappedArray( + array: *mut CUarray, resource: CUgraphicsResource, - index: c_uint, + array_index: c_uint, mip_level: c_uint, ) -> CUresult; fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult; @@ -197,60 +184,61 @@ impl Drop for DeviceBuffer { } } -/// A live EGL→CUDA registration. The mapped device memory aliases the dmabuf, so we copy out of -/// it immediately and then unregister (the EGL image is destroyed by the caller). -pub struct MappedImage { +/// A live GL-texture→CUDA registration (mapped). The CUDA array aliases the texture/dmabuf, so +/// we copy out of it immediately; unmap + unregister happen on drop. +pub struct MappedTexture { resource: CUgraphicsResource, - /// `frameType` (ARRAY vs PITCH) determines how to copy out. - frame: CUeglFrame, + array: CUarray, } -impl MappedImage { - /// Register an `EGLImage` with CUDA and map it to a `CUeglFrame`. +impl MappedTexture { + /// Register a `GL_TEXTURE_2D` texture with CUDA, map it, and get its array. The desktop + /// NVIDIA driver only supports CUDA interop through GL textures (not dmabuf EGLImages + /// directly), so the EGLImage is first bound to a GL texture by the caller. /// /// # Safety - /// `image` must be a valid `EGLImage`; the shared context must be current on this thread. 
- pub unsafe fn register(image: *mut c_void) -> Result { - // CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY (0x01): we only read the surface (encode from it). + /// The GL context and the shared CUDA context must both be current on this thread, and + /// `texture` must be a valid `GL_TEXTURE_2D` bound to the source image. + pub unsafe fn register_gl(texture: u32) -> Result { + const GL_TEXTURE_2D: c_uint = 0x0DE1; + const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01; let mut resource: CUgraphicsResource = std::ptr::null_mut(); ck( - cuGraphicsEGLRegisterImage(&mut resource, image, 0x01), - "cuGraphicsEGLRegisterImage", + cuGraphicsGLRegisterImage( + &mut resource, + texture, + GL_TEXTURE_2D, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY, + ), + "cuGraphicsGLRegisterImage", )?; - let mut frame = CUeglFrame::default(); - let r = cuGraphicsResourceGetMappedEglFrame(&mut frame, resource, 0, 0); - if r != 0 { + if cuGraphicsMapResources(1, &mut resource, std::ptr::null_mut()) != 0 { let _ = cuGraphicsUnregisterResource(resource); - bail!("cuGraphicsResourceGetMappedEglFrame error {r}"); + bail!("cuGraphicsMapResources failed"); } - Ok(MappedImage { resource, frame }) + let mut array: CUarray = std::ptr::null_mut(); + if cuGraphicsSubResourceGetMappedArray(&mut array, resource, 0, 0) != 0 { + let _ = cuGraphicsUnmapResources(1, &mut resource, std::ptr::null_mut()); + let _ = cuGraphicsUnregisterResource(resource); + bail!("cuGraphicsSubResourceGetMappedArray failed"); + } + Ok(MappedTexture { resource, array }) } - /// Device-to-device copy of this mapped frame into `dst` (de-tiling if the source is a tiled - /// CUarray). After this returns the dmabuf is no longer needed. + /// Copy the mapped array into `dst` (array → pitched device memory). The array is the GL + /// blit's already-linear RGBA8 output, so this is a straight copy. After it returns the + /// source dmabuf is no longer needed. 
pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> { - let width_bytes = (self.frame.width as usize).min(dst.width as usize) * 4; - let height = (self.frame.height as usize).min(dst.height as usize); - let mut copy = CUDA_MEMCPY2D { + let copy = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_ARRAY, + srcArray: self.array, dstMemoryType: CU_MEMORYTYPE_DEVICE, dstDevice: dst.ptr, dstPitch: dst.pitch, - WidthInBytes: width_bytes, - Height: height, + WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx) + Height: dst.height as usize, ..Default::default() }; - match self.frame.frameType { - CU_EGL_FRAME_TYPE_PITCH => { - copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; - copy.srcDevice = self.frame.frame[0] as CUdeviceptr; - copy.srcPitch = self.frame.pitch as usize; - } - CU_EGL_FRAME_TYPE_ARRAY => { - copy.srcMemoryType = CU_MEMORYTYPE_ARRAY; - copy.srcArray = self.frame.frame[0] as CUarray; - } - other => bail!("unexpected CUeglFrame frameType {other}"), - } unsafe { ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?; // The copy must complete before the dmabuf is requeued / reused. @@ -258,19 +246,39 @@ impl MappedImage { } Ok(()) } - - pub fn color_format(&self) -> c_uint { - self.frame.eglColorFormat - } - pub fn frame_kind(&self) -> c_uint { - self.frame.frameType - } } -impl Drop for MappedImage { +/// Copy a pitched device buffer into another device region (device→device), e.g. our imported +/// [`DeviceBuffer`] into a pooled CUDA surface NVENC owns. Both are 4-byte (BGRx) pixels. +/// The caller must have the shared context current on this thread (see [`make_current`]).
+pub fn copy_device_to_device( + src: &DeviceBuffer, + dst_ptr: CUdeviceptr, + dst_pitch: usize, +) -> Result<()> { + let copy = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_DEVICE, + srcDevice: src.ptr, + srcPitch: src.pitch, + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: dst_ptr, + dstPitch: dst_pitch, + WidthInBytes: src.width as usize * 4, + Height: src.height as usize, + ..Default::default() + }; + unsafe { + ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2(dev->dev)")?; + ck(cuCtxSynchronize(), "cuCtxSynchronize")?; + } + Ok(()) +} + +impl Drop for MappedTexture { fn drop(&mut self) { if !self.resource.is_null() { unsafe { + let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); let _ = cuGraphicsUnregisterResource(self.resource); } } diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs index 1a94991..d6f5814 100644 --- a/crates/lumen-host/src/zerocopy/egl.rs +++ b/crates/lumen-host/src/zerocopy/egl.rs @@ -1,26 +1,26 @@ -//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA EGL device and -//! import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`. The DRM format -//! **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without the modifier -//! yields a corrupt image or `EGL_BAD_MATCH`). The image is handed to CUDA -//! (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an owned buffer so the -//! dmabuf can be returned to the compositor immediately. +//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (GBM platform on +//! the render node) and import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`. +//! The DRM format **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without +//! the modifier yields a corrupt image or `EGL_BAD_MATCH`). //! -//! NOTE (WIP): the negotiation + EGL import are verified end-to-end against KWin (a tiled -//!
dmabuf reaches `eglCreateImage` successfully), but `cuGraphicsEGLRegisterImage` currently -//! returns `CUDA_ERROR_INVALID_VALUE` on the dmabuf-imported `EGLImage`. The likely fix is to -//! bind the `EGLImage` to a GL texture (`glEGLImageTargetTexture2DOES`) and register *that* via -//! `cuGraphicsGLRegisterImage` (OBS/Sunshine's path), which needs a GL context. +//! Desktop NVIDIA can't register a dmabuf `EGLImage` with CUDA directly — `cuGraphicsEGLRegisterImage` +//! is Tegra-only and `cuGraphicsGLRegisterImage` rejects EGLImage-backed textures (their internal +//! format is opaque). So we follow OBS/Sunshine: bind the `EGLImage` to a GL texture +//! (`glEGLImageTargetTexture2DOES`), render it through a fullscreen-triangle shader into a plain +//! immutable `GL_RGBA8` texture (de-tiling and swizzling to the BGRx the encoder wants), then +//! register *that* texture with CUDA ([`MappedTexture`]) and copy it device-to-device into an +//! owned [`DeviceBuffer`] so the dmabuf can be returned to the compositor immediately. #![allow(non_upper_case_globals)] -use super::cuda::{self, DeviceBuffer, MappedImage}; -use anyhow::{ensure, Context as _, Result}; +use super::cuda::{self, DeviceBuffer, MappedTexture}; +use anyhow::{bail, ensure, Context as _, Result}; use khronos_egl as egl; -use std::os::raw::c_void; +use std::os::raw::{c_int, c_void}; // EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl). 
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270; -const EGL_PLATFORM_DEVICE_EXT: egl::Enum = 0x313F; +const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7; const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271; const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272; const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273; @@ -28,6 +28,197 @@ const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274; const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443; const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444; +const GL_TEXTURE_2D: u32 = 0x0DE1; +const GL_TEXTURE_MIN_FILTER: u32 = 0x2801; +const GL_TEXTURE_MAG_FILTER: u32 = 0x2800; +const GL_LINEAR: c_int = 0x2601; +const GL_NEAREST: c_int = 0x2600; +const GL_RGBA8: u32 = 0x8058; +const GL_FRAMEBUFFER: u32 = 0x8D40; +const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0; +const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5; +const GL_TEXTURE0: u32 = 0x84C0; +const GL_TRIANGLES: u32 = 0x0004; +const GL_VERTEX_SHADER: u32 = 0x8B31; +const GL_FRAGMENT_SHADER: u32 = 0x8B30; +const GL_COMPILE_STATUS: u32 = 0x8B81; +const GL_LINK_STATUS: u32 = 0x8B82; + +// libglvnd's libGL dispatches these to the NVIDIA driver based on the current EGL/GL context. 
+#[link(name = "GL")] +extern "C" { + fn glGenTextures(n: c_int, textures: *mut u32); + fn glBindTexture(target: u32, texture: u32); + fn glTexParameteri(target: u32, pname: u32, param: c_int); + fn glDeleteTextures(n: c_int, textures: *const u32); + fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int); + fn glGetError() -> u32; + fn glGenFramebuffers(n: c_int, framebuffers: *mut u32); + fn glBindFramebuffer(target: u32, framebuffer: u32); + fn glFramebufferTexture2D( + target: u32, + attachment: u32, + textarget: u32, + texture: u32, + level: c_int, + ); + fn glCheckFramebufferStatus(target: u32) -> u32; + fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int); + fn glGenVertexArrays(n: c_int, arrays: *mut u32); + fn glBindVertexArray(array: u32); + fn glDrawArrays(mode: u32, first: c_int, count: c_int); + fn glActiveTexture(texture: u32); + fn glUseProgram(program: u32); + fn glFlush(); + fn glCreateShader(shader_type: u32) -> u32; + fn glShaderSource(shader: u32, count: c_int, string: *const *const i8, length: *const c_int); + fn glCompileShader(shader: u32); + fn glGetShaderiv(shader: u32, pname: u32, params: *mut c_int); + fn glDeleteShader(shader: u32); + fn glCreateProgram() -> u32; + fn glAttachShader(program: u32, shader: u32); + fn glLinkProgram(program: u32); + fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int); + fn glGetUniformLocation(program: u32, name: *const i8) -> c_int; + fn glUniform1i(location: c_int, v0: c_int); +} + +#[link(name = "gbm")] +extern "C" { + fn gbm_create_device(fd: c_int) -> *mut c_void; + fn gbm_device_destroy(device: *mut c_void); +} + +/// `glEGLImageTargetTexture2DOES(target, EGLImage)` — loaded via `eglGetProcAddress`. 
+type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void); + +// Fullscreen-triangle blit: sample the dmabuf EGLImage texture and write it (swizzled to BGRA, +// to match the BGRx the encoder expects) into a normal GL_RGBA8 texture that CUDA *can* register. +const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n"; +const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n"; + +unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> { + let sh = glCreateShader(kind); + ensure!(sh != 0, "glCreateShader failed"); + let ptr = src.as_ptr() as *const i8; + let len = src.len() as c_int; + glShaderSource(sh, 1, &ptr, &len); + glCompileShader(sh); + let mut ok: c_int = 0; + glGetShaderiv(sh, GL_COMPILE_STATUS, &mut ok); + if ok == 0 { + glDeleteShader(sh); + bail!("GL shader compile failed"); + } + Ok(sh) +} + +unsafe fn compile_program() -> Result<u32> { + let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?; + let fs = compile_shader(GL_FRAGMENT_SHADER, FRAG_SRC)?; + let prog = glCreateProgram(); + glAttachShader(prog, vs); + glAttachShader(prog, fs); + glLinkProgram(prog); + glDeleteShader(vs); + glDeleteShader(fs); + let mut ok: c_int = 0; + glGetProgramiv(prog, GL_LINK_STATUS, &mut ok); + ensure!(ok != 0, "GL program link failed"); + glUseProgram(prog); + let loc = glGetUniformLocation(prog, c"image".as_ptr()); + if loc >= 0 { + glUniform1i(loc, 0); // sampler -> texture unit 0 + } + glUseProgram(0); + Ok(prog) +} + +/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture. +struct GlBlit { + program: u32, + vao: u32, + fbo: u32, + /// CUDA-registrable destination (immutable GL_RGBA8). + dst_tex: u32, + /// Source texture re-targeted to each frame's EGLImage.
+ src_tex: u32, + width: u32, + height: u32, +} + +impl GlBlit { + unsafe fn new(width: u32, height: u32) -> Result { + let program = compile_program()?; + let mut vao = 0u32; + glGenVertexArrays(1, &mut vao); // core profile needs a bound VAO for glDrawArrays + let mut fbo = 0u32; + glGenFramebuffers(1, &mut fbo); + + let mut dst_tex = 0u32; + glGenTextures(1, &mut dst_tex); + glBindTexture(GL_TEXTURE_2D, dst_tex); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + + let mut src_tex = 0u32; + glGenTextures(1, &mut src_tex); + glBindTexture(GL_TEXTURE_2D, src_tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + glBindTexture(GL_TEXTURE_2D, 0); + + glBindFramebuffer(GL_FRAMEBUFFER, fbo); + glFramebufferTexture2D( + GL_FRAMEBUFFER, + GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, + dst_tex, + 0, + ); + let status = glCheckFramebufferStatus(GL_FRAMEBUFFER); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + ensure!( + status == GL_FRAMEBUFFER_COMPLETE, + "blit FBO incomplete ({status:#x})" + ); + Ok(GlBlit { + program, + vao, + fbo, + dst_tex, + src_tex, + width, + height, + }) + } + + /// Bind `image` to the source texture and render it into `dst_tex`. + /// + /// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`. 
+ unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> { + glBindTexture(GL_TEXTURE_2D, self.src_tex); + let _ = glGetError(); + egl_image_target(GL_TEXTURE_2D, image); + let e = glGetError(); + glBindTexture(GL_TEXTURE_2D, 0); + ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})"); + + glBindFramebuffer(GL_FRAMEBUFFER, self.fbo); + glViewport(0, 0, self.width as c_int, self.height as c_int); + glUseProgram(self.program); + glActiveTexture(GL_TEXTURE0); + glBindTexture(GL_TEXTURE_2D, self.src_tex); + glBindVertexArray(self.vao); + glDrawArrays(GL_TRIANGLES, 0, 3); + glBindVertexArray(0); + glBindFramebuffer(GL_FRAMEBUFFER, 0); + glFlush(); // submit GL work before CUDA maps the texture + Ok(()) + } +} + /// One dmabuf plane as delivered by PipeWire (single-plane for BGRx). #[derive(Clone, Copy, Debug)] pub struct DmabufPlane { @@ -38,12 +229,20 @@ pub struct DmabufPlane { type Egl = egl::DynamicInstance; -/// Headless EGLDisplay (NVIDIA device platform) used to import dmabufs. Lives on the capture -/// thread. The device platform — not GBM — is what NVIDIA's CUDA-EGL interop registers against. +/// Headless EGLDisplay (NVIDIA device platform) + a surfaceless desktop-GL context used to +/// import dmabufs and bridge them to CUDA via a GL texture. Lives on the capture thread (the GL +/// context is made current there once). pub struct EglImporter { egl: Egl, display: egl::Display, no_ctx: egl::Context, + /// Surfaceless GL context (current on the capture thread) for the EGLImage→texture bind. + _gl_ctx: egl::Context, + egl_image_target: EglImageTargetFn, + /// Lazily-created GL blit machinery (recreated if the frame size changes). + blit: Option, + gbm: *mut c_void, + render_fd: c_int, } // The EGL handles are confined to the capture thread; the struct is moved there once. @@ -53,43 +252,28 @@ impl EglImporter { /// Open a headless EGLDisplay on the NVIDIA EGL device. 
Also forces the shared CUDA context /// to exist (so a later `import` only touches the hot path). pub fn new() -> Result { + // GBM platform on the NVIDIA render node: this ties the EGLDisplay (and its GL contexts) + // to the same DRM device CUDA-GL interop associates with, which the EGL device platform + // did not (cuGraphicsGLRegisterImage rejected device-platform GL textures). + let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap(); + let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) }; + ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM"); + let gbm = unsafe { gbm_create_device(render_fd) }; + if gbm.is_null() { + unsafe { libc::close(render_fd) }; + anyhow::bail!("gbm_create_device failed"); + } + let egl: Egl = unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?; - - // Enumerate EGL devices and use the first (the NVIDIA GPU on a single-GPU box). - type QueryDevicesFn = unsafe extern "system" fn( - max_devices: i32, - devices: *mut *mut c_void, - num_devices: *mut i32, - ) -> u32; - let query_devices: QueryDevicesFn = unsafe { - std::mem::transmute( - egl.get_proc_address("eglQueryDevicesEXT") - .context("eglQueryDevicesEXT unavailable")?, - ) - }; - let device = unsafe { - let mut count: i32 = 0; - ensure!( - query_devices(0, std::ptr::null_mut(), &mut count) != 0 && count > 0, - "no EGL devices found" - ); - let mut devices = vec![std::ptr::null_mut::(); count as usize]; - ensure!( - query_devices(count, devices.as_mut_ptr(), &mut count) != 0, - "eglQueryDevicesEXT enumeration failed" - ); - devices[0] - }; - let display = unsafe { egl.get_platform_display( - EGL_PLATFORM_DEVICE_EXT, - device as egl::NativeDisplayType, + EGL_PLATFORM_GBM_KHR, + gbm as egl::NativeDisplayType, &[egl::ATTRIB_NONE], ) } - .context("eglGetPlatformDisplay(DEVICE) on the NVIDIA EGL device")?; + .context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?; 
egl.initialize(display).context("eglInitialize")?; let exts = egl @@ -106,17 +290,58 @@ impl EglImporter { "EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)" ); + // A surfaceless desktop-GL context so we can bind the dmabuf EGLImage to a GL texture + // (cuGraphicsEGLRegisterImage is Tegra-only; desktop CUDA interop goes through GL). + egl.bind_api(egl::OPENGL_API) + .context("eglBindAPI(OpenGL)")?; + // The default EGL_SURFACE_TYPE in eglChooseConfig is WINDOW_BIT, which a headless device + // display has none of — request a pbuffer-capable config (we run surfaceless anyway). + let config = egl + .choose_first_config( + display, + &[ + egl::SURFACE_TYPE, + egl::PBUFFER_BIT, + egl::RENDERABLE_TYPE, + egl::OPENGL_BIT, + egl::NONE, + ], + ) + .context("eglChooseConfig")? + .context("no EGL config for OpenGL")?; + let gl_ctx = egl + .create_context( + display, + config, + None, + &[egl::CONTEXT_CLIENT_VERSION, 3, egl::NONE], + ) + .context("eglCreateContext(OpenGL)")?; + egl.make_current(display, None, None, Some(gl_ctx)) + .context("eglMakeCurrent surfaceless (needs EGL_KHR_surfaceless_context)")?; + let egl_image_target: EglImageTargetFn = unsafe { + std::mem::transmute( + egl.get_proc_address("glEGLImageTargetTexture2DOES") + .context("glEGLImageTargetTexture2DOES unavailable")?, + ) + }; + // Create the shared CUDA context up front so import() is pure hot path. 
cuda::context().context("create CUDA context")?; let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) }; tracing::info!( - "zero-copy EGL importer ready (EGL device platform, dma_buf_import + modifiers)" + "zero-copy EGL importer ready (GBM platform + GL texture interop, dma_buf_import + modifiers)" ); Ok(EglImporter { egl, display, no_ctx, + _gl_ctx: gl_ctx, + egl_image_target, + blit: None, + gbm, + render_fd, }) } @@ -175,7 +400,7 @@ impl EglImporter { /// negotiated, or `None` to import with the buffer's implicit modifier (base /// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers). pub fn import( - &self, + &mut self, plane: &DmabufPlane, width: u32, height: u32, @@ -217,17 +442,43 @@ impl EglImporter { ) .context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?; - // CUDA: register + map + copy out, then drop the registration and the EGL image. - let result = (|| -> Result { - cuda::make_current()?; - // SAFETY: `image` is a valid EGLImage we just created; context is current. - let mapped = unsafe { MappedImage::register(image.as_ptr()) }?; - let dst = DeviceBuffer::alloc(width, height)?; - mapped.copy_to(&dst)?; - Ok(dst) - })(); - + // EGLImage → (sampled by a shader) → GL_RGBA8 texture → register *that* with CUDA → map + // → array → copy out. Registering the EGLImage texture directly fails (its layout isn't a + // CUDA-registrable format); the RGBA8 render target is. + let result = self.blit_and_copy(image.as_ptr(), width, height); let _ = self.egl.destroy_image(self.display, image); result } + + /// Render the dmabuf `image` into the registrable RGBA8 texture and copy it to an owned CUDA + /// buffer. (Re)creates the per-size GL blit machinery as needed. 
+ fn blit_and_copy( + &mut self, + image: *mut c_void, + width: u32, + height: u32, + ) -> Result<DeviceBuffer> { + cuda::make_current()?; + if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) { + self.blit = Some(unsafe { GlBlit::new(width, height)? }); + } + let blit = self.blit.as_ref().unwrap(); + // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage. + unsafe { blit.run(self.egl_image_target, image)? }; + let mapped = unsafe { MappedTexture::register_gl(blit.dst_tex)? }; + let dst = DeviceBuffer::alloc(width, height)?; + mapped.copy_to(&dst)?; + Ok(dst) + } +} + +impl Drop for EglImporter { + fn drop(&mut self) { + if !self.gbm.is_null() { + unsafe { gbm_device_destroy(self.gbm) }; + } + if self.render_fd >= 0 { + unsafe { libc::close(self.render_fd) }; + } + } }