From 751789f9324a244a43f3d85edbcb399bf26e2d67 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Tue, 9 Jun 2026 22:42:06 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20M2=20=E2=80=94=20LINEAR-dmabuf=20CUDA?= =?UTF-8?q?=20import=20attempt=20+=20graceful=20zero-copy=20fallback=20(ga?= =?UTF-8?q?mescope)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gamescope only offers LINEAR dmabufs, which the EGL/GL interop path can't handle (NVIDIA's EGL lists no LINEAR modifier for sampling). Attempt a direct CUDA external-memory import (cuImportExternalMemory OPAQUE_FD, cached per buffer fd, one DtoD copy per frame into the pooled buffer): the FFI + plumbing are in place, and LINEAR(0) is now advertised alongside the tiled EGL modifiers (tiled first, so KWin still prefers it — regression-tested). Empirically the 595 desktop driver rejects raw dmabuf fds as OPAQUE_FD (CUDA_ERROR_UNKNOWN), matching the documented limitation — true LINEAR GPU import needs a Vulkan interop bridge (import dmabuf via VK_EXT_external_memory_dma_buf, GPU-copy into an exportable allocation, hand that to CUDA), noted as future work. So the importer now degrades instead of dying: on GPU-import failure it logs once, disables itself, and falls through to the CPU mmap path. Validated: gamescope + LUMEN_ZEROCOPY=1 runs full-rate (122.9 fps @720p120, valid HEVC) via the fallback; KWin keeps real zero-copy. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/lumen-host/src/capture/linux.rs | 44 +++++--- crates/lumen-host/src/zerocopy/cuda.rs | 142 +++++++++++++++++++++++++ crates/lumen-host/src/zerocopy/egl.rs | 34 ++++++ 3 files changed, 207 insertions(+), 13 deletions(-) diff --git a/crates/lumen-host/src/capture/linux.rs b/crates/lumen-host/src/capture/linux.rs index f76adea..894ac97 100644 --- a/crates/lumen-host/src/capture/linux.rs +++ b/crates/lumen-host/src/capture/linux.rs @@ -608,12 +608,17 @@ mod pipewire { } else { None }; - // Modifiers our EGL stack can import for BGRx (the layout KWin gives); if none, we can't - // negotiate dmabuf and fall back to the shm path. - let modifiers = importer + // Modifiers our import stack handles for BGRx: the EGL-importable (tiled) set, plus + // LINEAR (0) — NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer) + // import via CUDA external memory instead. Tiled stays first so allocators that can do + // both (KWin) prefer it. If none, we can't negotiate dmabuf → shm path. + let mut modifiers = importer .as_ref() .map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap())) .unwrap_or_default(); + if importer.is_some() && !modifiers.contains(&0) { + modifiers.push(0); // DRM_FORMAT_MOD_LINEAR + } let want_dmabuf = importer.is_some() && !modifiers.is_empty(); if zerocopy && !want_dmabuf { tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path"); @@ -714,6 +719,7 @@ mod pipewire { // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall // through to the shm de-pad copy below. 
+ let mut gpu_import_broken = false; if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) { if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { let plane = crate::zerocopy::DmabufPlane { @@ -721,11 +727,17 @@ mod pipewire { offset: datas[0].chunk().offset(), stride: datas[0].chunk().stride().max(0) as u32, }; - // 0 (unset/LINEAR) → import with the implicit modifier; a real tiled - // modifier (if the producer reported one) → import it explicitly. + // Tiled modifier → EGL/GL de-tile import; LINEAR (0/unset, e.g. + // gamescope) → direct CUDA external-memory import (NVIDIA EGL can't + // sample LINEAR). let modifier = (ud.modifier != 0).then_some(ud.modifier); if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) { - match importer.import(&plane, w as u32, h as u32, fourcc, modifier) { + let imported = if modifier.is_some() { + importer.import(&plane, w as u32, h as u32, fourcc, modifier) + } else { + importer.import_linear(&plane, w as u32, h as u32) + }; + match imported { Ok(devbuf) => { static ONCE: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(true); @@ -744,20 +756,26 @@ mod pipewire { format: fmt, payload: FramePayload::Cuda(devbuf), }); + return; } Err(e) => { - static ONCE: std::sync::atomic::AtomicBool = - std::sync::atomic::AtomicBool::new(true); - if ONCE.swap(false, Ordering::Relaxed) { - tracing::warn!(error = %format!("{e:#}"), - "dmabuf import failed — frames dropped (consider unsetting LUMEN_ZEROCOPY)"); - } + // GPU import unavailable for this buffer kind (e.g. the + // driver rejects LINEAR external-memory import). Disable + // the importer and fall through to the CPU mmap path — + // degraded, not dead. 
+ tracing::warn!(error = %format!("{e:#}"), + "dmabuf GPU import failed — falling back to the CPU copy path"); + gpu_import_broken = true; } } + } else { + return; // format has no DRM fourcc mapping — skip the frame } - return; } } + if gpu_import_broken { + ud.importer = None; + } let d = &mut datas[0]; // CPU path may also receive LINEAR dmabufs (gamescope offers only those once its diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs index 5867dfe..c4edcb0 100644 --- a/crates/lumen-host/src/zerocopy/cuda.rs +++ b/crates/lumen-host/src/zerocopy/cuda.rs @@ -21,6 +21,7 @@ pub type CUstream = *mut c_void; // opaque CUstream_st* pub type CUdeviceptr = u64; pub type CUgraphicsResource = *mut c_void; pub type CUarray = *mut c_void; +pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st* /// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4. pub const CU_MEMORYTYPE_DEVICE: c_uint = 2; @@ -48,6 +49,34 @@ pub struct CUDA_MEMCPY2D { pub Height: usize, } +/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose +/// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type +/// only the first 4 bytes (the `int fd`) are read. +#[repr(C)] +#[derive(Default)] +pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC { + pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1 + _pad: u32, + pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject } + pub size: u64, + pub flags: c_uint, + reserved: [c_uint; 16], + _pad2: u32, +} + +/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout). 
+#[repr(C)] +#[derive(Default)] +pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC { + pub offset: u64, + pub size: u64, + pub flags: c_uint, + reserved: [c_uint; 16], + _pad: u32, +} + +pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1; + #[link(name = "cuda")] extern "C" { fn cuInit(flags: c_uint) -> CUresult; @@ -90,6 +119,19 @@ extern "C" { mip_level: c_uint, ) -> CUresult; fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult; + + // External memory (cuda.h, no `_v2` suffix) — imports a (Vulkan-exported) dmabuf fd as + // device memory. Used for LINEAR dmabufs (gamescope), which EGL/GL interop can't sample. + fn cuImportExternalMemory( + ext_mem_out: *mut CUexternalMemory, + mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC, + ) -> CUresult; + fn cuExternalMemoryGetMappedBuffer( + dev_ptr: *mut CUdeviceptr, + ext_mem: CUexternalMemory, + buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC, + ) -> CUresult; + fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult; } #[inline] @@ -197,6 +239,14 @@ impl BufferPool { }) } + pub fn width(&self) -> u32 { + self.width + } + + pub fn height(&self) -> u32 { + self.height + } + /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this /// pool when dropped (after the consumer has synchronized, so the GPU is done with it). pub fn get(&self) -> Result { @@ -359,3 +409,95 @@ impl Drop for RegisteredTexture { } } } + +/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR +/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed. +/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop. +pub struct ExternalDmabuf { + ext: CUexternalMemory, + pub ptr: CUdeviceptr, + pub size: u64, +} + +// Raw driver handles; used from the single capture thread but moved with the importer. 
+unsafe impl Send for ExternalDmabuf {} + +impl ExternalDmabuf { + /// Import `fd` (NOT consumed — an internal `dup` is handed to the driver, which owns it + /// from then on) and map its full `size` bytes to a device pointer. The shared context + /// must be current. + pub fn import(fd: i32, size: u64) -> Result<Self> { + let dup = unsafe { libc::dup(fd) }; + if dup < 0 { + bail!("dup(dmabuf fd) failed"); + } + let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC { + type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + size, + ..Default::default() + }; + desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes) + let mut ext: CUexternalMemory = std::ptr::null_mut(); + let r = unsafe { cuImportExternalMemory(&mut ext, &desc) }; + if r != 0 { + unsafe { libc::close(dup) }; // import failed → the driver did not take the fd + bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?"); + } + let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC { + offset: 0, + size, + ..Default::default() + }; + let mut ptr: CUdeviceptr = 0; + let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) }; + if r != 0 { + unsafe { + let _ = cuDestroyExternalMemory(ext); + } + bail!("cuExternalMemoryGetMappedBuffer failed ({r})"); + } + Ok(ExternalDmabuf { ext, ptr, size }) + } +} + +impl Drop for ExternalDmabuf { + fn drop(&mut self) { + unsafe { + if let Some(c) = CONTEXT.get() { + let _ = cuCtxSetCurrent(c.0); + } + if self.ptr != 0 { + let _ = cuMemFree_v2(self.ptr); // mapped buffers are freed like device memory + } + if !self.ext.is_null() { + let _ = cuDestroyExternalMemory(self.ext); + } + } + } +} + +/// Copy a pitched span starting at `src_ptr` (e.g. an [`ExternalDmabuf`] mapping at the chunk +/// offset) into `dst`. The shared context must be current on this thread. 
+pub fn copy_pitched_to_buffer( + src_ptr: CUdeviceptr, + src_pitch: usize, + dst: &DeviceBuffer, +) -> Result<()> { + let copy = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_DEVICE, + srcDevice: src_ptr, + srcPitch: src_pitch, + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: dst.ptr, + dstPitch: dst.pitch, + WidthInBytes: dst.width as usize * 4, + Height: dst.height as usize, + ..Default::default() + }; + unsafe { + ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2(ext->dev)")?; + // The copy must finish before the dmabuf is requeued to the producer. + ck(cuCtxSynchronize(), "cuCtxSynchronize")?; + } + Ok(()) +} diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs index 23c424d..45fbcef 100644 --- a/crates/lumen-host/src/zerocopy/egl.rs +++ b/crates/lumen-host/src/zerocopy/egl.rs @@ -252,6 +252,10 @@ pub struct EglImporter { egl_image_target: EglImageTargetFn, /// Lazily-created GL blit machinery (recreated if the frame size changes). blit: Option, + /// LINEAR-dmabuf path (gamescope): CUDA external-memory imports cached per buffer fd (the + /// producer's buffer pool keeps fds stable for the stream's life) + the destination pool. + linear: std::collections::HashMap<i32, cuda::ExternalDmabuf>, + linear_pool: Option<cuda::BufferPool>, gbm: *mut c_void, render_fd: c_int, } @@ -351,11 +355,41 @@ impl EglImporter { _gl_ctx: gl_ctx, egl_image_target, blit: None, + linear: std::collections::HashMap::new(), + linear_pool: None, gbm, render_fd, }) } + /// Import a LINEAR dmabuf via CUDA external memory (no EGL/GL involved — NVIDIA's EGL can't + /// sample LINEAR, but the bytes are directly addressable once imported). The import is + /// cached per fd; per frame this is one device→device copy into a pooled buffer. 
+ pub fn import_linear( + &mut self, + plane: &DmabufPlane, + width: u32, + height: u32, + ) -> Result<cuda::DeviceBuffer> { + cuda::make_current()?; + if self.linear_pool.as_ref().map(|p| (p.width(), p.height())) != Some((width, height)) { + self.linear_pool = Some(cuda::BufferPool::new(width, height)?); + } + let fd = plane.fd; + let ext = match self.linear.entry(fd) { + std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), + std::collections::hash_map::Entry::Vacant(e) => { + // Size from the fd itself (the chunk's size field is unreliable for dmabufs). + let size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) }; + anyhow::ensure!(size > 0, "lseek(dmabuf) failed"); + e.insert(cuda::ExternalDmabuf::import(fd, size as u64)?) + } + }; + let dst = self.linear_pool.as_ref().unwrap().get()?; + cuda::copy_pitched_to_buffer(ext.ptr + plane.offset as u64, plane.stride as usize, &dst)?; + Ok(dst) + } + /// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via /// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates /// a dmabuf in a layout we can import. Empty on failure (caller falls back).