feat: M2 — LINEAR-dmabuf CUDA import attempt + graceful zero-copy fallback (gamescope)

gamescope only offers LINEAR dmabufs, which the EGL/GL interop path can't handle (NVIDIA's
EGL lists no LINEAR modifier for sampling). Attempt a direct CUDA external-memory import
(cuImportExternalMemory OPAQUE_FD, cached per buffer fd, one DtoD copy per frame into the
pooled buffer): the FFI + plumbing are in place, and LINEAR(0) is now advertised alongside
the tiled EGL modifiers (tiled first, so KWin still prefers it — regression-tested).

Empirically the 595 desktop driver rejects raw dmabuf fds as OPAQUE_FD (CUDA_ERROR_UNKNOWN),
matching the documented limitation — true LINEAR GPU import needs a Vulkan interop bridge
(import dmabuf via VK_EXT_external_memory_dma_buf, GPU-copy into an exportable allocation,
hand that to CUDA), noted as future work. So the importer now degrades instead of dying:
on GPU-import failure it logs once, disables itself, and falls through to the CPU mmap path.
Validated: gamescope + LUMEN_ZEROCOPY=1 runs full-rate (122.9 fps @720p120, valid HEVC) via
the fallback; KWin keeps real zero-copy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-09 22:42:06 +00:00
parent 7f3897e0d3
commit 751789f932
3 changed files with 207 additions and 13 deletions
+31 -13
View File
@@ -608,12 +608,17 @@ mod pipewire {
} else { } else {
None None
}; };
// Modifiers our EGL stack can import for BGRx (the layout KWin gives); if none, we can't // Modifiers our import stack handles for BGRx: the EGL-importable (tiled) set, plus
// negotiate dmabuf and fall back to the shm path. // LINEAR (0) — NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer)
let modifiers = importer // import via CUDA external memory instead. Tiled stays first so allocators that can do
// both (KWin) prefer it. If none, we can't negotiate dmabuf → shm path.
let mut modifiers = importer
.as_ref() .as_ref()
.map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap())) .map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap()))
.unwrap_or_default(); .unwrap_or_default();
if importer.is_some() && !modifiers.contains(&0) {
modifiers.push(0); // DRM_FORMAT_MOD_LINEAR
}
let want_dmabuf = importer.is_some() && !modifiers.is_empty(); let want_dmabuf = importer.is_some() && !modifiers.is_empty();
if zerocopy && !want_dmabuf { if zerocopy && !want_dmabuf {
tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path"); tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path");
@@ -714,6 +719,7 @@ mod pipewire {
// Zero-copy path: if the buffer is a dmabuf and we have an importer, import it // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it
// into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall
// through to the shm de-pad copy below. // through to the shm de-pad copy below.
let mut gpu_import_broken = false;
if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) { if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) {
if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf {
let plane = crate::zerocopy::DmabufPlane { let plane = crate::zerocopy::DmabufPlane {
@@ -721,11 +727,17 @@ mod pipewire {
offset: datas[0].chunk().offset(), offset: datas[0].chunk().offset(),
stride: datas[0].chunk().stride().max(0) as u32, stride: datas[0].chunk().stride().max(0) as u32,
}; };
// 0 (unset/LINEAR) → import with the implicit modifier; a real tiled // Tiled modifier → EGL/GL de-tile import; LINEAR (0/unset, e.g.
// modifier (if the producer reported one) → import it explicitly. // gamescope) → direct CUDA external-memory import (NVIDIA EGL can't
// sample LINEAR).
let modifier = (ud.modifier != 0).then_some(ud.modifier); let modifier = (ud.modifier != 0).then_some(ud.modifier);
if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) { if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) {
match importer.import(&plane, w as u32, h as u32, fourcc, modifier) { let imported = if modifier.is_some() {
importer.import(&plane, w as u32, h as u32, fourcc, modifier)
} else {
importer.import_linear(&plane, w as u32, h as u32)
};
match imported {
Ok(devbuf) => { Ok(devbuf) => {
static ONCE: std::sync::atomic::AtomicBool = static ONCE: std::sync::atomic::AtomicBool =
std::sync::atomic::AtomicBool::new(true); std::sync::atomic::AtomicBool::new(true);
@@ -744,20 +756,26 @@ mod pipewire {
format: fmt, format: fmt,
payload: FramePayload::Cuda(devbuf), payload: FramePayload::Cuda(devbuf),
}); });
return;
} }
Err(e) => { Err(e) => {
static ONCE: std::sync::atomic::AtomicBool = // GPU import unavailable for this buffer kind (e.g. the
std::sync::atomic::AtomicBool::new(true); // driver rejects LINEAR external-memory import). Disable
if ONCE.swap(false, Ordering::Relaxed) { // the importer and fall through to the CPU mmap path —
tracing::warn!(error = %format!("{e:#}"), // degraded, not dead.
"dmabuf import failed — frames dropped (consider unsetting LUMEN_ZEROCOPY)"); tracing::warn!(error = %format!("{e:#}"),
} "dmabuf GPU import failed — falling back to the CPU copy path");
gpu_import_broken = true;
} }
} }
} else {
return; // format has no DRM fourcc mapping — skip the frame
} }
return;
} }
} }
if gpu_import_broken {
ud.importer = None;
}
let d = &mut datas[0]; let d = &mut datas[0];
// CPU path may also receive LINEAR dmabufs (gamescope offers only those once its // CPU path may also receive LINEAR dmabufs (gamescope offers only those once its
+142
View File
@@ -21,6 +21,7 @@ pub type CUstream = *mut c_void; // opaque CUstream_st*
pub type CUdeviceptr = u64; pub type CUdeviceptr = u64;
pub type CUgraphicsResource = *mut c_void; pub type CUgraphicsResource = *mut c_void;
pub type CUarray = *mut c_void; pub type CUarray = *mut c_void;
pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*
/// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4. /// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2; pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
@@ -48,6 +49,34 @@ pub struct CUDA_MEMCPY2D {
pub Height: usize, pub Height: usize,
} }
/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose
/// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type
/// only the first 4 bytes (the `int fd`) are read.
///
/// Byte layout of the fields below under `#[repr(C)]` (assuming a 4-byte `c_uint`):
/// `type_` 0..4, `_pad` 4..8, `handle` 8..24, `size` 24..32, `flags` 32..36,
/// `reserved` 36..100, `_pad2` 100..104 — 104 bytes in total, 8-byte aligned.
/// `Default` zero-fills every field, so callers only set the members they need.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1
// Explicit filler so the 8-byte-aligned `handle` union starts at offset 8.
_pad: u32,
pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject }
// Size of the memory object behind the handle, in bytes.
pub size: u64,
pub flags: c_uint,
// Left zeroed (via `Default`); never written in this file.
reserved: [c_uint; 16],
// Explicit tail filler rounding the struct up to a multiple of 8 bytes.
_pad2: u32,
}
/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout).
///
/// Byte layout of the fields below under `#[repr(C)]` (assuming a 4-byte `c_uint`):
/// `offset` 0..8, `size` 8..16, `flags` 16..20, `reserved` 20..84, `_pad` 84..88 —
/// 88 bytes in total, 8-byte aligned. `Default` zero-fills every field.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
// Byte offset into the external memory object at which the mapping starts.
pub offset: u64,
// Number of bytes to map.
pub size: u64,
pub flags: c_uint,
// Left zeroed (via `Default`); never written in this file.
reserved: [c_uint; 16],
// Explicit tail filler rounding the struct up to a multiple of 8 bytes.
_pad: u32,
}
/// Value for `CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type_` selecting the `int fd` member of the
/// `handle` union (an opaque file descriptor).
pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1;
#[link(name = "cuda")] #[link(name = "cuda")]
extern "C" { extern "C" {
fn cuInit(flags: c_uint) -> CUresult; fn cuInit(flags: c_uint) -> CUresult;
@@ -90,6 +119,19 @@ extern "C" {
mip_level: c_uint, mip_level: c_uint,
) -> CUresult; ) -> CUresult;
fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult; fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
// External memory (cuda.h, no `_v2` suffix) — imports a (Vulkan-exported) dmabuf fd as
// device memory. Used for LINEAR dmabufs (gamescope), which EGL/GL interop can't sample.
// Callers in this file treat any non-zero `CUresult` from these functions as failure.
//
// Imports the handle described by `mem_handle_desc`; the resulting external-memory handle
// is written to `ext_mem_out`.
fn cuImportExternalMemory(
ext_mem_out: *mut CUexternalMemory,
mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
) -> CUresult;
// Maps the range described by `buffer_desc` of `ext_mem`; the device pointer of the mapping
// is written to `dev_ptr`.
fn cuExternalMemoryGetMappedBuffer(
dev_ptr: *mut CUdeviceptr,
ext_mem: CUexternalMemory,
buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
) -> CUresult;
// Releases a handle previously obtained from `cuImportExternalMemory`.
fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult;
} }
#[inline] #[inline]
@@ -197,6 +239,14 @@ impl BufferPool {
}) })
} }
/// Returns the `width` this pool stores (presumably the pixel width of the buffers it hands
/// out — confirm against `BufferPool::new`).
pub fn width(&self) -> u32 {
self.width
}
/// Returns the `height` this pool stores (presumably the pixel height of the buffers it hands
/// out — confirm against `BufferPool::new`).
pub fn height(&self) -> u32 {
self.height
}
/// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
/// pool when dropped (after the consumer has synchronized, so the GPU is done with it). /// pool when dropped (after the consumer has synchronized, so the GPU is done with it).
pub fn get(&self) -> Result<DeviceBuffer> { pub fn get(&self) -> Result<DeviceBuffer> {
@@ -359,3 +409,95 @@ impl Drop for RegisteredTexture {
} }
} }
} }
/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
pub struct ExternalDmabuf {
// Handle produced by `cuImportExternalMemory`; released with `cuDestroyExternalMemory` on drop.
ext: CUexternalMemory,
/// Device pointer produced by `cuExternalMemoryGetMappedBuffer` for the whole import
/// (mapping offset 0).
pub ptr: CUdeviceptr,
/// Number of bytes that were imported and mapped, as passed to `ExternalDmabuf::import`.
pub size: u64,
}
// Raw driver handles; used from the single capture thread but moved with the importer.
// SAFETY (NOTE(review)): this relies on the handles never being used from two threads at
// once and on the shared CUDA context being made current wherever they are used or dropped —
// `Drop` does so itself; confirm that every other user does too.
unsafe impl Send for ExternalDmabuf {}
impl ExternalDmabuf {
    /// Import `fd` as CUDA external memory and map its full `size` bytes to a device pointer.
    ///
    /// `fd` is NOT consumed: a close-on-exec duplicate is handed to the driver, which owns it
    /// once the import succeeds. The shared context must be current on the calling thread.
    ///
    /// # Errors
    /// Fails if the descriptor cannot be duplicated (the OS error is included), if
    /// `cuImportExternalMemory` rejects the handle, or if the imported memory cannot be mapped.
    /// Nothing is leaked on any failure path: the duplicate is closed, or the import destroyed.
    pub fn import(fd: i32, size: u64) -> Result<ExternalDmabuf> {
        // Duplicate with FD_CLOEXEC set atomically; a plain `dup` would let the descriptor be
        // inherited by any child process spawned while the import is alive.
        // SAFETY: `fcntl(F_DUPFD_CLOEXEC)` touches no memory; a bad `fd` is reported via the
        // negative return value checked below.
        let dup = unsafe { libc::fcntl(fd, libc::F_DUPFD_CLOEXEC, 0) };
        if dup < 0 {
            bail!(
                "dup(dmabuf fd) failed: {}",
                std::io::Error::last_os_error()
            );
        }

        // Union member `int fd` lives in the low 4 bytes of the first word (little-endian).
        let mut handle = [0u64; 2];
        handle[0] = dup as u32 as u64;
        let desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
            type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
            handle,
            size,
            ..Default::default()
        };

        let mut ext: CUexternalMemory = std::ptr::null_mut();
        // SAFETY: both pointers refer to live locals that outlive the call.
        let r = unsafe { cuImportExternalMemory(&mut ext, &desc) };
        if r != 0 {
            // Import failed → the driver did not take the fd, so it is still ours to close.
            // SAFETY: `dup` is a descriptor this function created and still owns.
            unsafe { libc::close(dup) };
            bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?");
        }

        // Map the whole object; callers add the plane offset to the returned pointer.
        let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
            offset: 0,
            size,
            ..Default::default()
        };
        let mut ptr: CUdeviceptr = 0;
        // SAFETY: `ext` was just returned by a successful import; the pointers are live locals.
        let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) };
        if r != 0 {
            // The driver owns the duplicated fd by now; destroying the import releases it.
            // SAFETY: `ext` is a valid handle that has not been destroyed yet.
            unsafe {
                let _ = cuDestroyExternalMemory(ext);
            }
            bail!("cuExternalMemoryGetMappedBuffer failed ({r})");
        }
        Ok(ExternalDmabuf { ext, ptr, size })
    }
}
impl Drop for ExternalDmabuf {
    /// Tear the import down best-effort: make the shared context current (when one exists),
    /// release the mapped device pointer, then destroy the external-memory handle. Driver
    /// errors are ignored because a destructor has no way to report them.
    fn drop(&mut self) {
        // Bind the shared context first so the two releases below run against it.
        unsafe {
            if let Some(shared) = CONTEXT.get() {
                let _ = cuCtxSetCurrent(shared.0);
            }
        }

        // Mapped buffers are freed like ordinary device memory.
        let mapped = self.ptr;
        if mapped != 0 {
            unsafe {
                let _ = cuMemFree_v2(mapped);
            }
        }

        // Release the import itself only after its mapping is gone.
        let handle = self.ext;
        if !handle.is_null() {
            unsafe {
                let _ = cuDestroyExternalMemory(handle);
            }
        }
    }
}
/// Copy a pitched span starting at `src_ptr` (e.g. an [`ExternalDmabuf`] mapping at the chunk
/// offset) into `dst`. The shared context must be current on this thread.
pub fn copy_pitched_to_buffer(
src_ptr: CUdeviceptr,
src_pitch: usize,
dst: &DeviceBuffer,
) -> Result<()> {
let copy = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_DEVICE,
srcDevice: src_ptr,
srcPitch: src_pitch,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: dst.ptr,
dstPitch: dst.pitch,
WidthInBytes: dst.width as usize * 4,
Height: dst.height as usize,
..Default::default()
};
unsafe {
ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2(ext->dev)")?;
// The copy must finish before the dmabuf is requeued to the producer.
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
}
Ok(())
}
+34
View File
@@ -252,6 +252,10 @@ pub struct EglImporter {
egl_image_target: EglImageTargetFn, egl_image_target: EglImageTargetFn,
/// Lazily-created GL blit machinery (recreated if the frame size changes). /// Lazily-created GL blit machinery (recreated if the frame size changes).
blit: Option<GlBlit>, blit: Option<GlBlit>,
/// LINEAR-dmabuf path (gamescope): CUDA external-memory imports cached per buffer fd (the
/// producer's buffer pool keeps fds stable for the stream's life) + the destination pool.
linear: std::collections::HashMap<i32, cuda::ExternalDmabuf>,
linear_pool: Option<cuda::BufferPool>,
gbm: *mut c_void, gbm: *mut c_void,
render_fd: c_int, render_fd: c_int,
} }
@@ -351,11 +355,41 @@ impl EglImporter {
_gl_ctx: gl_ctx, _gl_ctx: gl_ctx,
egl_image_target, egl_image_target,
blit: None, blit: None,
linear: std::collections::HashMap::new(),
linear_pool: None,
gbm, gbm,
render_fd, render_fd,
}) })
} }
/// Import a LINEAR dmabuf via CUDA external memory (no EGL/GL involved — NVIDIA's EGL can't
/// sample LINEAR, but the bytes are directly addressable once imported). The import is
/// cached per fd; per frame this is one device→device copy into a pooled buffer.
pub fn import_linear(
&mut self,
plane: &DmabufPlane,
width: u32,
height: u32,
) -> Result<DeviceBuffer> {
cuda::make_current()?;
if self.linear_pool.as_ref().map(|p| (p.width(), p.height())) != Some((width, height)) {
self.linear_pool = Some(cuda::BufferPool::new(width, height)?);
}
let fd = plane.fd;
let ext = match self.linear.entry(fd) {
std::collections::hash_map::Entry::Occupied(e) => e.into_mut(),
std::collections::hash_map::Entry::Vacant(e) => {
// Size from the fd itself (the chunk's size field is unreliable for dmabufs).
let size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) };
anyhow::ensure!(size > 0, "lseek(dmabuf) failed");
e.insert(cuda::ExternalDmabuf::import(fd, size as u64)?)
}
};
let dst = self.linear_pool.as_ref().unwrap().get()?;
cuda::copy_pitched_to_buffer(ext.ptr + plane.offset as u64, plane.stride as usize, &dst)?;
Ok(dst)
}
/// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via /// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via
/// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates /// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates
/// a dmabuf in a layout we can import. Empty on failure (caller falls back). /// a dmabuf in a layout we can import. Empty on failure (caller falls back).