From 751789f9324a244a43f3d85edbcb399bf26e2d67 Mon Sep 17 00:00:00 2001
From: enricobuehler <buehler@unom.io>
Date: Tue, 9 Jun 2026 22:42:06 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20M2=20=E2=80=94=20LINEAR-dmabuf=20CUDA?=
 =?UTF-8?q?=20import=20attempt=20+=20graceful=20zero-copy=20fallback=20(ga?=
 =?UTF-8?q?mescope)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gamescope only offers LINEAR dmabufs, which the EGL/GL interop path can't handle (NVIDIA's
EGL lists no LINEAR modifier for sampling). Attempt a direct CUDA external-memory import
(cuImportExternalMemory OPAQUE_FD, cached per buffer fd, one DtoD copy per frame into the
pooled buffer): the FFI + plumbing are in place, and LINEAR(0) is now advertised alongside
the tiled EGL modifiers (tiled first, so KWin still prefers it — regression-tested).

Empirically the 595 desktop driver rejects raw dmabuf fds as OPAQUE_FD (CUDA_ERROR_UNKNOWN),
matching the documented limitation — true LINEAR GPU import needs a Vulkan interop bridge
(import dmabuf via VK_EXT_external_memory_dma_buf, GPU-copy into an exportable allocation,
hand that to CUDA), noted as future work. So the importer now degrades instead of dying:
on GPU-import failure it logs once, disables itself, and falls through to the CPU mmap path.
Validated: gamescope + LUMEN_ZEROCOPY=1 runs full-rate (122.9 fps @720p120, valid HEVC) via
the fallback; KWin keeps real zero-copy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/lumen-host/src/capture/linux.rs |  44 +++++---
 crates/lumen-host/src/zerocopy/cuda.rs | 142 +++++++++++++++++++++++++
 crates/lumen-host/src/zerocopy/egl.rs  |  34 ++++++
 3 files changed, 207 insertions(+), 13 deletions(-)

diff --git a/crates/lumen-host/src/capture/linux.rs b/crates/lumen-host/src/capture/linux.rs
index f76adea..894ac97 100644
--- a/crates/lumen-host/src/capture/linux.rs
+++ b/crates/lumen-host/src/capture/linux.rs
@@ -608,12 +608,17 @@ mod pipewire {
         } else {
             None
         };
-        // Modifiers our EGL stack can import for BGRx (the layout KWin gives); if none, we can't
-        // negotiate dmabuf and fall back to the shm path.
-        let modifiers = importer
+        // Modifiers we advertise for BGRx: the EGL-importable (tiled) set, plus LINEAR (0) —
+        // NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer) are tried via
+        // CUDA external memory (CPU path if that fails). Tiled stays first so allocators that can
+        // do both (KWin) prefer it. Without an importer the list is empty → no dmabuf, shm path.
+        let mut modifiers = importer
             .as_ref()
             .map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap()))
             .unwrap_or_default();
+        if importer.is_some() && !modifiers.contains(&0) {
+            modifiers.push(0); // DRM_FORMAT_MOD_LINEAR
+        }
         let want_dmabuf = importer.is_some() && !modifiers.is_empty();
         if zerocopy && !want_dmabuf {
             tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path");
@@ -714,6 +719,7 @@ mod pipewire {
                 // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it
                 // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall
                 // through to the shm de-pad copy below.
+                let mut gpu_import_broken = false;
                 if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) {
                     if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf {
                         let plane = crate::zerocopy::DmabufPlane {
@@ -721,11 +727,17 @@ mod pipewire {
                             offset: datas[0].chunk().offset(),
                             stride: datas[0].chunk().stride().max(0) as u32,
                         };
-                        // 0 (unset/LINEAR) → import with the implicit modifier; a real tiled
-                        // modifier (if the producer reported one) → import it explicitly.
+                        // Tiled modifier → EGL/GL de-tile import; LINEAR (0/unset, e.g.
+                        // gamescope) → direct CUDA external-memory import (NVIDIA EGL can't
+                        // sample LINEAR).
                         let modifier = (ud.modifier != 0).then_some(ud.modifier);
                         if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) {
-                            match importer.import(&plane, w as u32, h as u32, fourcc, modifier) {
+                            let imported = if modifier.is_some() {
+                                importer.import(&plane, w as u32, h as u32, fourcc, modifier)
+                            } else {
+                                importer.import_linear(&plane, w as u32, h as u32)
+                            };
+                            match imported {
                                 Ok(devbuf) => {
                                     static ONCE: std::sync::atomic::AtomicBool =
                                         std::sync::atomic::AtomicBool::new(true);
@@ -744,20 +756,26 @@ mod pipewire {
                                         format: fmt,
                                         payload: FramePayload::Cuda(devbuf),
                                     });
+                                    return;
                                 }
                                 Err(e) => {
-                                    static ONCE: std::sync::atomic::AtomicBool =
-                                        std::sync::atomic::AtomicBool::new(true);
-                                    if ONCE.swap(false, Ordering::Relaxed) {
-                                        tracing::warn!(error = %format!("{e:#}"),
-                                            "dmabuf import failed — frames dropped (consider unsetting LUMEN_ZEROCOPY)");
-                                    }
+                                    // GPU import unavailable for this buffer kind (e.g. the
+                                    // driver rejects LINEAR external-memory import). Disable
+                                    // the importer and fall through to the CPU mmap path —
+                                    // degraded, not dead.
+                                    tracing::warn!(error = %format!("{e:#}"),
+                                        "dmabuf GPU import failed — falling back to the CPU copy path");
+                                    gpu_import_broken = true;
                                 }
                             }
+                        } else {
+                            return; // format has no DRM fourcc mapping — skip the frame
                         }
-                        return;
                     }
                 }
+                if gpu_import_broken {
+                    ud.importer = None;
+                }
 
                 let d = &mut datas[0];
                 // CPU path may also receive LINEAR dmabufs (gamescope offers only those once its
diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs
index 5867dfe..c4edcb0 100644
--- a/crates/lumen-host/src/zerocopy/cuda.rs
+++ b/crates/lumen-host/src/zerocopy/cuda.rs
@@ -21,6 +21,7 @@ pub type CUstream = *mut c_void; // opaque CUstream_st*
 pub type CUdeviceptr = u64;
 pub type CUgraphicsResource = *mut c_void;
 pub type CUarray = *mut c_void;
+pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*
 
 /// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
 pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
@@ -48,6 +49,34 @@ pub struct CUDA_MEMCPY2D {
     pub Height: usize,
 }
 
+/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose
+/// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type
+/// only the first 4 bytes (the `int fd`) are read.
+#[repr(C)]
+#[derive(Default)]
+pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
+    pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1
+    _pad: u32, // explicit padding: puts the 8-byte-aligned `handle` union at offset 8
+    pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject }
+    pub size: u64, // size in bytes of the memory object being imported
+    pub flags: c_uint, // left 0 (the `Default`) by the OPAQUE_FD import in this file
+    reserved: [c_uint; 16], // reserved by cuda.h; stays zeroed via `Default`
+    _pad2: u32, // explicit tail padding: 100 bytes of fields rounded up to 104 (align 8)
+}
+
+/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout).
+#[repr(C)]
+#[derive(Default)]
+pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
+    pub offset: u64, // byte offset into the external memory object where the mapping starts
+    pub size: u64, // number of bytes to map
+    pub flags: c_uint, // left 0 (the `Default`) by the caller in this file
+    reserved: [c_uint; 16], // reserved by cuda.h; stays zeroed via `Default`
+    _pad: u32, // explicit tail padding: 84 bytes of fields rounded up to 88 (align 8)
+}
+
+pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1;
+
 #[link(name = "cuda")]
 extern "C" {
     fn cuInit(flags: c_uint) -> CUresult;
@@ -90,6 +119,19 @@ extern "C" {
         mip_level: c_uint,
     ) -> CUresult;
     fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
+
+    // External memory (cuda.h, no `_v2` suffix) — imports an fd (OPAQUE_FD: the kind Vulkan
+    // exports) as device memory. Tried for LINEAR dmabufs (gamescope), which EGL/GL can't sample.
+    fn cuImportExternalMemory(
+        ext_mem_out: *mut CUexternalMemory,
+        mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
+    ) -> CUresult;
+    fn cuExternalMemoryGetMappedBuffer(
+        dev_ptr: *mut CUdeviceptr,
+        ext_mem: CUexternalMemory,
+        buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
+    ) -> CUresult;
+    fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult;
 }
 
 #[inline]
@@ -197,6 +239,14 @@ impl BufferPool {
         })
     }
 
+    pub fn width(&self) -> u32 {
+        self.width // stored frame width; `import_linear` compares it to detect a size change
+    }
+
+    pub fn height(&self) -> u32 {
+        self.height // stored frame height; `import_linear` compares it to detect a size change
+    }
+
     /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
     /// pool when dropped (after the consumer has synchronized, so the GPU is done with it).
     pub fn get(&self) -> Result<DeviceBuffer> {
@@ -359,3 +409,95 @@ impl Drop for RegisteredTexture {
         }
     }
 }
+
+/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
+/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
+/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
+pub struct ExternalDmabuf {
+    ext: CUexternalMemory, // handle from `cuImportExternalMemory`; destroyed in `Drop`
+    pub ptr: CUdeviceptr, // device address of the mapping from `cuExternalMemoryGetMappedBuffer`
+    pub size: u64, // bytes imported and mapped (the whole buffer, from offset 0)
+}
+
+// SAFETY: only raw driver handles; used from the single capture thread, moved with the importer.
+unsafe impl Send for ExternalDmabuf {}
+
+impl ExternalDmabuf {
+    /// Import `fd` (NOT consumed — an internal close-on-exec duplicate is handed to the driver,
+    /// which owns it from then on) and map its full `size` bytes to a device pointer. Fails if
+    /// the fd can't be duplicated or the driver rejects it. The shared context must be current.
+    pub fn import(fd: i32, size: u64) -> Result<ExternalDmabuf> {
+        let dup = unsafe { libc::fcntl(fd, libc::F_DUPFD_CLOEXEC, 0) }; // `dup` would clear CLOEXEC
+        if dup < 0 {
+            bail!("dup(dmabuf fd) failed: {}", std::io::Error::last_os_error());
+        }
+        let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
+            type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
+            size,
+            ..Default::default()
+        };
+        desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes)
+        let mut ext: CUexternalMemory = std::ptr::null_mut();
+        let r = unsafe { cuImportExternalMemory(&mut ext, &desc) };
+        if r != 0 {
+            unsafe { libc::close(dup) }; // import failed → the driver did not take the fd
+            bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?");
+        }
+        let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
+            offset: 0,
+            size,
+            ..Default::default()
+        };
+        let mut ptr: CUdeviceptr = 0;
+        let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) };
+        if r != 0 {
+            unsafe {
+                let _ = cuDestroyExternalMemory(ext);
+            }
+            bail!("cuExternalMemoryGetMappedBuffer failed ({r})");
+        }
+        Ok(ExternalDmabuf { ext, ptr, size })
+    }
+}
+
+impl Drop for ExternalDmabuf {
+    fn drop(&mut self) {
+        unsafe {
+            if let Some(c) = CONTEXT.get() {
+                let _ = cuCtxSetCurrent(c.0); // best-effort: make the shared context current first
+            }
+            if self.ptr != 0 {
+                let _ = cuMemFree_v2(self.ptr); // mapped buffers are freed like device memory
+            }
+            if !self.ext.is_null() {
+                let _ = cuDestroyExternalMemory(self.ext); // only after the mapping is freed
+            }
+        }
+    }
+}
+
+/// Copy a pitched span starting at `src_ptr` (e.g. an [`ExternalDmabuf`] mapping at the chunk
+/// offset) into `dst`. The shared context must be current on this thread.
+pub fn copy_pitched_to_buffer(
+    src_ptr: CUdeviceptr,
+    src_pitch: usize,
+    dst: &DeviceBuffer,
+) -> Result<()> {
+    let copy = CUDA_MEMCPY2D {
+        srcMemoryType: CU_MEMORYTYPE_DEVICE,
+        srcDevice: src_ptr,
+        srcPitch: src_pitch, // bytes between the starts of consecutive source rows
+        dstMemoryType: CU_MEMORYTYPE_DEVICE,
+        dstDevice: dst.ptr,
+        dstPitch: dst.pitch,
+        WidthInBytes: dst.width as usize * 4, // assumes 4 bytes per pixel (e.g. BGRx)
+        Height: dst.height as usize, // rows copied; the extent comes from `dst`, not the source
+        ..Default::default()
+    };
+    unsafe {
+        ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2(ext->dev)")?;
+        // The copy must finish before the dmabuf is requeued to the producer.
+        ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
+    }
+    Ok(())
+}
diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs
index 23c424d..45fbcef 100644
--- a/crates/lumen-host/src/zerocopy/egl.rs
+++ b/crates/lumen-host/src/zerocopy/egl.rs
@@ -252,6 +252,10 @@ pub struct EglImporter {
     egl_image_target: EglImageTargetFn,
     /// Lazily-created GL blit machinery (recreated if the frame size changes).
     blit: Option<GlBlit>,
+    /// LINEAR-dmabuf path (gamescope): CUDA external-memory imports cached per buffer fd (the
+    /// producer's buffer pool keeps fds stable for the stream's life) + the destination pool.
+    linear: std::collections::HashMap<i32, cuda::ExternalDmabuf>,
+    linear_pool: Option<cuda::BufferPool>,
     gbm: *mut c_void,
     render_fd: c_int,
 }
@@ -351,11 +355,41 @@ impl EglImporter {
             _gl_ctx: gl_ctx,
             egl_image_target,
             blit: None,
+            linear: std::collections::HashMap::new(),
+            linear_pool: None,
             gbm,
             render_fd,
         })
     }
 
+    /// Import a LINEAR dmabuf via CUDA external memory (no EGL/GL involved — NVIDIA's EGL can't
+    /// sample LINEAR). The import is cached per fd and sized from the fd via `lseek` (the chunk
+    /// size is unreliable); per frame: one bounds-checked device→device copy into a pooled buffer.
+    pub fn import_linear(
+        &mut self,
+        plane: &DmabufPlane,
+        width: u32,
+        height: u32,
+    ) -> Result<DeviceBuffer> {
+        cuda::make_current()?;
+        if self.linear_pool.as_ref().map(|p| (p.width(), p.height())) != Some((width, height)) {
+            self.linear_pool = Some(cuda::BufferPool::new(width, height)?);
+        }
+        let ext = match self.linear.entry(plane.fd) {
+            std::collections::hash_map::Entry::Occupied(e) => e.into_mut(),
+            std::collections::hash_map::Entry::Vacant(e) => {
+                let size = unsafe { libc::lseek(plane.fd, 0, libc::SEEK_END) };
+                anyhow::ensure!(size > 0, "lseek(dmabuf) failed");
+                e.insert(cuda::ExternalDmabuf::import(plane.fd, size as u64)?)
+            }
+        };
+        let end = plane.offset as u64 + plane.stride as u64 * height as u64; // copy's read extent
+        anyhow::ensure!(end <= ext.size, "dmabuf plane exceeds the imported buffer");
+        let dst = self.linear_pool.as_ref().unwrap().get()?;
+        cuda::copy_pitched_to_buffer(ext.ptr + plane.offset as u64, plane.stride as usize, &dst)?;
+        Ok(dst)
+    }
+
     /// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via
     /// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates
     /// a dmabuf in a layout we can import. Empty on failure (caller falls back).