feat: M2 — LINEAR-dmabuf CUDA import attempt + graceful zero-copy fallback (gamescope)

gamescope only offers LINEAR dmabufs, which the EGL/GL interop path can't handle (NVIDIA's EGL lists no LINEAR modifier for sampling). Attempt a direct CUDA external-memory import (cuImportExternalMemory OPAQUE_FD, cached per buffer fd, one DtoD copy per frame into the pooled buffer): the FFI + plumbing are in place, and LINEAR(0) is now advertised alongside the tiled EGL modifiers (tiled first, so KWin still prefers it — regression-tested). Empirically the 595 desktop driver rejects raw dmabuf fds as OPAQUE_FD (CUDA_ERROR_UNKNOWN), matching the documented limitation — true LINEAR GPU import needs a Vulkan interop bridge (import dmabuf via VK_EXT_external_memory_dma_buf, GPU-copy into an exportable allocation, hand that to CUDA), noted as future work. So the importer now degrades instead of dying: on GPU-import failure it logs once, disables itself, and falls through to the CPU mmap path. Validated: gamescope + LUMEN_ZEROCOPY=1 runs full-rate (122.9 fps @720p120, valid HEVC) via the fallback; KWin keeps real zero-copy. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 22:42:06 +00:00
parent 7f3897e0d3
commit 751789f932
3 changed files with 207 additions and 13 deletions
@@ -608,12 +608,17 @@ mod pipewire {
        } else {
            None
        };
-        // Modifiers our EGL stack can import for BGRx (the layout KWin gives); if none, we can't
+        // Modifiers our import stack handles for BGRx: the EGL-importable (tiled) set, plus
-        // negotiate dmabuf and fall back to the shm path.
+        // LINEAR (0) — NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer)
-        let modifiers = importer
+        // import via CUDA external memory instead. Tiled stays first so allocators that can do
        // both (KWin) prefer it. If none, we can't negotiate dmabuf → shm path.
        let mut modifiers = importer
            .as_ref()
            .map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap()))
            .unwrap_or_default();
        if importer.is_some() && !modifiers.contains(&0) {
            modifiers.push(0); // DRM_FORMAT_MOD_LINEAR
        }
        let want_dmabuf = importer.is_some() && !modifiers.is_empty();
        if zerocopy && !want_dmabuf {
            tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path");
@@ -714,6 +719,7 @@ mod pipewire {
                // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it
                // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall
                // through to the shm de-pad copy below.
                let mut gpu_import_broken = false;
                if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) {
                    if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf {
                        let plane = crate::zerocopy::DmabufPlane {
@@ -721,11 +727,17 @@ mod pipewire {
                            offset: datas[0].chunk().offset(),
                            stride: datas[0].chunk().stride().max(0) as u32,
                        };
-                        // 0 (unset/LINEAR) → import with the implicit modifier; a real tiled
+                        // Tiled modifier → EGL/GL de-tile import; LINEAR (0/unset, e.g.
-                        // modifier (if the producer reported one) → import it explicitly.
+                        // gamescope) → direct CUDA external-memory import (NVIDIA EGL can't
                        // sample LINEAR).
                        let modifier = (ud.modifier != 0).then_some(ud.modifier);
                        if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) {
-                            match importer.import(&plane, w as u32, h as u32, fourcc, modifier) {
+                            let imported = if modifier.is_some() {
                                importer.import(&plane, w as u32, h as u32, fourcc, modifier)
                            } else {
                                importer.import_linear(&plane, w as u32, h as u32)
                            };
                            match imported {
                                Ok(devbuf) => {
                                    static ONCE: std::sync::atomic::AtomicBool =
                                        std::sync::atomic::AtomicBool::new(true);
@@ -744,20 +756,26 @@ mod pipewire {
                                        format: fmt,
                                        payload: FramePayload::Cuda(devbuf),
                                    });
                                    return;
                                }
                                Err(e) => {
-                                    static ONCE: std::sync::atomic::AtomicBool =
+                                    // GPU import unavailable for this buffer kind (e.g. the
-                                        std::sync::atomic::AtomicBool::new(true);
+                                    // driver rejects LINEAR external-memory import). Disable
-                                    if ONCE.swap(false, Ordering::Relaxed) {
+                                    // the importer and fall through to the CPU mmap path —
-                                        tracing::warn!(error = %format!("{e:#}"),
+                                    // degraded, not dead.
-                                            "dmabuf import failed — frames dropped (consider unsetting LUMEN_ZEROCOPY)");
+                                    tracing::warn!(error = %format!("{e:#}"),
-                                    }
+                                        "dmabuf GPU import failed — falling back to the CPU copy path");
                                    gpu_import_broken = true;
                                }
                            }
                        } else {
                            return; // format has no DRM fourcc mapping — skip the frame
                        }
                        return;
                    }
                }
                if gpu_import_broken {
                    ud.importer = None;
                }
                let d = &mut datas[0];
                // CPU path may also receive LINEAR dmabufs (gamescope offers only those once its
@@ -21,6 +21,7 @@ pub type CUstream = *mut c_void; // opaque CUstream_st*
 pub type CUdeviceptr = u64;
 pub type CUgraphicsResource = *mut c_void;
 pub type CUarray = *mut c_void;
 pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*
 /// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
 pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
@@ -48,6 +49,34 @@ pub struct CUDA_MEMCPY2D {
    pub Height: usize,
 }
 /// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose
 /// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type
 /// only the first 4 bytes (the `int fd`) are read.
 ///
 /// The struct is passed by pointer to the driver, so its size and field offsets must match
 /// the C definition exactly; the explicit padding fields spell out what the C compiler
 /// inserts implicitly, and the assertion below pins the total size.
 #[repr(C)]
 #[derive(Default)]
 pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
    /// Handle kind discriminating the `handle` union.
    pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1
    /// Padding that brings the 8-byte-aligned `handle` union to offset 8.
    _pad: u32,
    /// Raw storage for the handle union.
    pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject }
    /// Size of the memory object in bytes.
    pub size: u64,
    /// Import flags; zero when left at `Default`.
    pub flags: c_uint,
    /// Reserved by the driver API; left zeroed by `Default`.
    reserved: [c_uint; 16],
    /// Tail padding up to the struct's 8-byte alignment.
    _pad2: u32,
 }
 // Compile-time layout guard: 4 + 4 + 16 + 8 + 4 + 64 + 4 bytes. A field edit that changes
 // the size fails the build here instead of corrupting the driver call at run time.
 const _: () = assert!(std::mem::size_of::<CUDA_EXTERNAL_MEMORY_HANDLE_DESC>() == 104);
 /// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout).
 ///
 /// Describes the byte range of an imported external-memory object to map to a device
 /// pointer. Passed by pointer to the driver, so the layout must match the C definition; the
 /// assertion below pins the total size.
 #[repr(C)]
 #[derive(Default)]
 pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
    /// Byte offset into the memory object where the mapping starts.
    pub offset: u64,
    /// Length of the mapping in bytes.
    pub size: u64,
    /// Mapping flags; zero when left at `Default`.
    pub flags: c_uint,
    /// Reserved by the driver API; left zeroed by `Default`.
    reserved: [c_uint; 16],
    /// Tail padding up to the struct's 8-byte alignment.
    _pad: u32,
 }
 // Compile-time layout guard: 8 + 8 + 4 + 64 + 4 bytes. A field edit that changes the size
 // fails the build here instead of corrupting the driver call at run time.
 const _: () = assert!(std::mem::size_of::<CUDA_EXTERNAL_MEMORY_BUFFER_DESC>() == 88);
 /// Handle-type tag stored in `CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type_` when the handle union
 /// carries a file descriptor (the `int fd` member).
 pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1;
 #[link(name = "cuda")]
 extern "C" {
    fn cuInit(flags: c_uint) -> CUresult;
@@ -90,6 +119,19 @@ extern "C" {
        mip_level: c_uint,
    ) -> CUresult;
    fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
    // External memory (cuda.h, no `_v2` suffix) — imports a (Vulkan-exported) dmabuf fd as
    // device memory. Used for LINEAR dmabufs (gamescope), which EGL/GL interop can't sample.
    fn cuImportExternalMemory(
        ext_mem_out: *mut CUexternalMemory,
        mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
    ) -> CUresult;
    fn cuExternalMemoryGetMappedBuffer(
        dev_ptr: *mut CUdeviceptr,
        ext_mem: CUexternalMemory,
        buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
    ) -> CUresult;
    fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult;
 }
 #[inline]
@@ -197,6 +239,14 @@ impl BufferPool {
        })
    }
    /// Returns the pool's `width` field.
    /// NOTE(review): presumably the frame width in pixels given to `BufferPool::new` — confirm.
    pub fn width(&self) -> u32 {
        self.width
    }
    /// Returns the pool's `height` field.
    /// NOTE(review): presumably the frame height in rows given to `BufferPool::new` — confirm.
    pub fn height(&self) -> u32 {
        self.height
    }
    /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
    /// pool when dropped (after the consumer has synchronized, so the GPU is done with it).
    pub fn get(&self) -> Result<DeviceBuffer> {
@@ -359,3 +409,95 @@ impl Drop for RegisteredTexture {
        }
    }
 }
 /// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
 /// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
 /// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
 pub struct ExternalDmabuf {
    /// External-memory handle returned by `cuImportExternalMemory`; destroyed in `Drop`.
    ext: CUexternalMemory,
    /// Device pointer returned by `cuExternalMemoryGetMappedBuffer` for the mapping that
    /// starts at offset 0 of the import; freed in `Drop`.
    pub ptr: CUdeviceptr,
    /// Byte size the import and the mapping at `ptr` were created with.
    pub size: u64,
 }
 // Raw driver handles; used from the single capture thread but moved with the importer.
 // NOTE(review): soundness of `Send` relies on these handles not being bound to the thread
 // that created them; `Drop` re-binds the shared context before releasing them — confirm.
 unsafe impl Send for ExternalDmabuf {}
 impl ExternalDmabuf {
    /// Import `fd` (NOT consumed — an internal `dup` is handed to the driver, which owns it
    /// from then on) and map its full `size` bytes to a device pointer. The shared context
    /// must be current.
    pub fn import(fd: i32, size: u64) -> Result<ExternalDmabuf> {
        let dup = unsafe { libc::dup(fd) };
        if dup < 0 {
            bail!("dup(dmabuf fd) failed");
        }
        let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
            type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
            size,
            ..Default::default()
        };
        desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes)
        let mut ext: CUexternalMemory = std::ptr::null_mut();
        let r = unsafe { cuImportExternalMemory(&mut ext, &desc) };
        if r != 0 {
            unsafe { libc::close(dup) }; // import failed → the driver did not take the fd
            bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?");
        }
        let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
            offset: 0,
            size,
            ..Default::default()
        };
        let mut ptr: CUdeviceptr = 0;
        let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) };
        if r != 0 {
            unsafe {
                let _ = cuDestroyExternalMemory(ext);
            }
            bail!("cuExternalMemoryGetMappedBuffer failed ({r})");
        }
        Ok(ExternalDmabuf { ext, ptr, size })
    }
 }
 impl Drop for ExternalDmabuf {
    /// Releases the mapping first and the external-memory handle second. Every driver call
    /// is best-effort: a destructor has no way to report failure, so results are discarded.
    fn drop(&mut self) {
        let (mapping, handle) = (self.ptr, self.ext);
        unsafe {
            // Bind the shared context (when one has been created) before touching the
            // driver objects.
            if let Some(shared) = CONTEXT.get() {
                let _ = cuCtxSetCurrent(shared.0);
            }
            // mapped buffers are freed like device memory
            if mapping != 0 {
                let _ = cuMemFree_v2(mapping);
            }
            // The import itself goes last, once its mapping has been freed.
            if !handle.is_null() {
                let _ = cuDestroyExternalMemory(handle);
            }
        }
    }
 }
 /// Device→device 2D copy of a pitched image that begins at `src_ptr` (for instance an
 /// [`ExternalDmabuf`] mapping advanced by the chunk offset) into `dst`.
 ///
 /// The copied rectangle is taken from `dst`: `dst.height` rows of `dst.width * 4` bytes,
 /// read with a row pitch of `src_pitch` and written with `dst.pitch`. The call blocks until
 /// the context is idle. The shared context must be current on this thread.
 pub fn copy_pitched_to_buffer(
    src_ptr: CUdeviceptr,
    src_pitch: usize,
    dst: &DeviceBuffer,
 ) -> Result<()> {
    // Four bytes per pixel in every copied row.
    let row_bytes = dst.width as usize * 4;
    let rows = dst.height as usize;
    let params = CUDA_MEMCPY2D {
        WidthInBytes: row_bytes,
        Height: rows,
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src_ptr,
        srcPitch: src_pitch,
        dstMemoryType: CU_MEMORYTYPE_DEVICE,
        dstDevice: dst.ptr,
        dstPitch: dst.pitch,
        ..Default::default()
    };
    unsafe {
        ck(cuMemcpy2D_v2(&params), "cuMemcpy2D_v2(ext->dev)")?;
    }
    // Wait for the copy: the dmabuf goes back to the producer as soon as we return, so it
    // must no longer be read by then.
    unsafe {
        ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
    }
    Ok(())
 }
@@ -252,6 +252,10 @@ pub struct EglImporter {
    egl_image_target: EglImageTargetFn,
    /// Lazily-created GL blit machinery (recreated if the frame size changes).
    blit: Option<GlBlit>,
    /// LINEAR-dmabuf path (gamescope): CUDA external-memory imports cached per buffer fd (the
    /// producer's buffer pool keeps fds stable for the stream's life) + the destination pool.
    linear: std::collections::HashMap<i32, cuda::ExternalDmabuf>,
    linear_pool: Option<cuda::BufferPool>,
    gbm: *mut c_void,
    render_fd: c_int,
 }
@@ -351,11 +355,41 @@ impl EglImporter {
            _gl_ctx: gl_ctx,
            egl_image_target,
            blit: None,
            linear: std::collections::HashMap::new(),
            linear_pool: None,
            gbm,
            render_fd,
        })
    }
    /// Import a LINEAR dmabuf via CUDA external memory (no EGL/GL involved — NVIDIA's EGL can't
    /// sample LINEAR, but the bytes are directly addressable once imported). The import is
    /// cached per fd; per frame this is one device→device copy into a pooled buffer.
    ///
    /// When the frame size changes, the destination pool is rebuilt and the per-fd cache is
    /// dropped: a new size means the producer reallocated its buffers, and a reused fd number
    /// must not resolve to a mapping of an old buffer.
    ///
    /// # Errors
    /// Fails if the CUDA context cannot be made current, the pool cannot be (re)created, the
    /// fd cannot be sized or imported, the plane's stride is shorter than one 4-byte-per-pixel
    /// row, the plane's span does not fit inside the imported buffer, or the copy fails.
    pub fn import_linear(
        &mut self,
        plane: &DmabufPlane,
        width: u32,
        height: u32,
    ) -> Result<DeviceBuffer> {
        cuda::make_current()?;
        let pool_dims = self.linear_pool.as_ref().map(|p| (p.width(), p.height()));
        if pool_dims != Some((width, height)) {
            self.linear_pool = Some(cuda::BufferPool::new(width, height)?);
            // Stale imports belong to the previous buffer set; re-import lazily below.
            self.linear.clear();
        }
        let fd = plane.fd;
        let ext = match self.linear.entry(fd) {
            std::collections::hash_map::Entry::Occupied(e) => e.into_mut(),
            std::collections::hash_map::Entry::Vacant(e) => {
                // Size from the fd itself (the chunk's size field is unreliable for dmabufs).
                let size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) };
                anyhow::ensure!(size > 0, "lseek(dmabuf) failed");
                e.insert(cuda::ExternalDmabuf::import(fd, size as u64)?)
            }
        };
        // Validate the copy rectangle before handing raw device addresses to the driver: the
        // copy reads `height` rows of `width * 4` bytes, `stride` bytes apart, from `offset`.
        let row_bytes = u64::from(width) * 4;
        let stride = plane.stride as u64;
        let offset = plane.offset as u64;
        anyhow::ensure!(
            stride >= row_bytes,
            "LINEAR dmabuf stride {stride} is shorter than one {width}px row ({row_bytes} bytes)"
        );
        let span = offset
            .saturating_add(stride.saturating_mul(u64::from(height.saturating_sub(1))))
            .saturating_add(row_bytes);
        anyhow::ensure!(
            span <= ext.size,
            "LINEAR dmabuf plane needs {span} bytes but the imported buffer has only {}",
            ext.size
        );
        let pool = self
            .linear_pool
            .as_ref()
            .expect("linear_pool is populated by the size check above");
        let dst = pool.get()?;
        cuda::copy_pitched_to_buffer(ext.ptr + offset, plane.stride as usize, &dst)?;
        Ok(dst)
    }
    /// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via
    /// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates
    /// a dmabuf in a layout we can import. Empty on failure (caller falls back).