feat: M2 zero-copy — PipeWire dmabuf negotiation + EGL device-platform import (WIP)
Wire the capture side of zero-copy (LUMEN_ZEROCOPY=1):

- EGL importer now opens the headless EGLDisplay on the NVIDIA EGL device (EGL_PLATFORM_DEVICE_EXT) and queries its importable DRM modifiers (eglQueryDmaBufModifiersEXT).
- The PipeWire stream advertises a BGRx dmabuf format with those modifiers as a mandatory enum Choice + a dmabuf-only Buffers param; the compositor fixates an importable tiled modifier. param_changed reads the negotiated modifier; the process callback imports the dmabuf (eglCreateImage with explicit LO/HI modifier) and would copy it into a CUDA buffer for the encoder.

Validated against headless KWin (Plasma 6.4): negotiation succeeds (13 NVIDIA modifiers advertised, KWin fixates one, stream reaches Streaming with a real tiled dmabuf) and `eglCreateImage` succeeds. The remaining blocker is `cuGraphicsEGLRegisterImage` returning CUDA_ERROR_INVALID_VALUE on the dmabuf-imported EGLImage — the likely fix is to bind the EGLImage to a GL texture (glEGLImageTargetTexture2DOES) and register that via cuGraphicsGLRegisterImage (OBS/Sunshine's path), which needs a GL context.

The CPU-copy path stays the default and is unaffected (regression-checked: real KWin capture → HEVC). LUMEN_ZEROCOPY is opt-in/experimental until the CUDA registration lands.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -164,7 +164,13 @@ impl DeviceBuffer {
|
||||
let mut pitch: usize = 0;
|
||||
unsafe {
|
||||
ck(
|
||||
cuMemAllocPitch_v2(&mut ptr, &mut pitch, width as usize * 4, height as usize, 16),
|
||||
cuMemAllocPitch_v2(
|
||||
&mut ptr,
|
||||
&mut pitch,
|
||||
width as usize * 4,
|
||||
height as usize,
|
||||
16,
|
||||
),
|
||||
"cuMemAllocPitch_v2",
|
||||
)?;
|
||||
}
|
||||
@@ -205,9 +211,10 @@ impl MappedImage {
|
||||
/// # Safety
|
||||
/// `image` must be a valid `EGLImage`; the shared context must be current on this thread.
|
||||
pub unsafe fn register(image: *mut c_void) -> Result<MappedImage> {
|
||||
// CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY (0x01): we only read the surface (encode from it).
|
||||
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
||||
ck(
|
||||
cuGraphicsEGLRegisterImage(&mut resource, image, 0),
|
||||
cuGraphicsEGLRegisterImage(&mut resource, image, 0x01),
|
||||
"cuGraphicsEGLRegisterImage",
|
||||
)?;
|
||||
let mut frame = CUeglFrame::default();
|
||||
|
||||
@@ -1,20 +1,26 @@
|
||||
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (via the GBM
|
||||
//! platform on the render node) and import a PipeWire dmabuf as an `EGLImage` with
|
||||
//! `EGL_LINUX_DMA_BUF_EXT`. The DRM format **modifier** is mandatory on NVIDIA (its buffers are
|
||||
//! tiled; importing without the modifier yields a corrupt image or `EGL_BAD_MATCH`). The image
|
||||
//! is then handed to CUDA (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an
|
||||
//! owned buffer so the dmabuf can be returned to the compositor immediately.
|
||||
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA EGL device and
|
||||
//! import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`. The DRM format
|
||||
//! **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without the modifier
|
||||
//! yields a corrupt image or `EGL_BAD_MATCH`). The image is handed to CUDA
|
||||
//! (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an owned buffer so the
|
||||
//! dmabuf can be returned to the compositor immediately.
|
||||
//!
|
||||
//! NOTE (WIP): the negotiation + EGL import are verified end-to-end against KWin (a tiled
|
||||
//! dmabuf reaches `eglCreateImage` successfully), but `cuGraphicsEGLRegisterImage` currently
|
||||
//! returns `CUDA_ERROR_INVALID_VALUE` on the dmabuf-imported `EGLImage`. The likely fix is to
|
||||
//! bind the `EGLImage` to a GL texture (`glEGLImageTargetTexture2DOES`) and register *that* via
|
||||
//! `cuGraphicsGLRegisterImage` (OBS/Sunshine's path), which needs a GL context.
|
||||
|
||||
#![allow(non_upper_case_globals)]
|
||||
|
||||
use super::cuda::{self, DeviceBuffer, MappedImage};
|
||||
use anyhow::{ensure, Context as _, Result};
|
||||
use khronos_egl as egl;
|
||||
use std::os::raw::{c_int, c_void};
|
||||
use std::os::raw::c_void;
|
||||
|
||||
// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
|
||||
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
|
||||
const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
|
||||
const EGL_PLATFORM_DEVICE_EXT: egl::Enum = 0x313F;
|
||||
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
|
||||
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
|
||||
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
|
||||
@@ -22,12 +28,6 @@ const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
|
||||
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
|
||||
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
|
||||
|
||||
#[link(name = "gbm")]
|
||||
extern "C" {
|
||||
fn gbm_create_device(fd: c_int) -> *mut c_void;
|
||||
fn gbm_device_destroy(device: *mut c_void);
|
||||
}
|
||||
|
||||
/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct DmabufPlane {
|
||||
@@ -38,41 +38,58 @@ pub struct DmabufPlane {
|
||||
|
||||
type Egl = egl::DynamicInstance<egl::EGL1_5>;
|
||||
|
||||
/// Headless EGL display + GBM device used to import dmabufs. Lives on the capture thread.
|
||||
/// Headless EGLDisplay (NVIDIA device platform) used to import dmabufs. Lives on the capture
|
||||
/// thread. The device platform — not GBM — is what NVIDIA's CUDA-EGL interop registers against.
|
||||
pub struct EglImporter {
|
||||
egl: Egl,
|
||||
display: egl::Display,
|
||||
no_ctx: egl::Context,
|
||||
gbm: *mut c_void,
|
||||
render_fd: c_int,
|
||||
}
|
||||
|
||||
// The EGL/GBM handles are confined to the capture thread; the struct is moved there once.
|
||||
// The EGL handles are confined to the capture thread; the struct is moved there once.
|
||||
unsafe impl Send for EglImporter {}
|
||||
|
||||
impl EglImporter {
|
||||
/// Open the render node, create a GBM device, and a headless EGLDisplay on it. Also forces
|
||||
/// the shared CUDA context to exist (so a later `import` only touches the hot path).
|
||||
/// Open a headless EGLDisplay on the NVIDIA EGL device. Also forces the shared CUDA context
|
||||
/// to exist (so a later `import` only touches the hot path).
|
||||
pub fn new() -> Result<EglImporter> {
|
||||
let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap();
|
||||
let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
|
||||
ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM");
|
||||
let gbm = unsafe { gbm_create_device(render_fd) };
|
||||
if gbm.is_null() {
|
||||
unsafe { libc::close(render_fd) };
|
||||
anyhow::bail!("gbm_create_device failed");
|
||||
}
|
||||
|
||||
let egl: Egl =
|
||||
unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
|
||||
|
||||
// Enumerate EGL devices and use the first (the NVIDIA GPU on a single-GPU box).
|
||||
type QueryDevicesFn = unsafe extern "system" fn(
|
||||
max_devices: i32,
|
||||
devices: *mut *mut c_void,
|
||||
num_devices: *mut i32,
|
||||
) -> u32;
|
||||
let query_devices: QueryDevicesFn = unsafe {
|
||||
std::mem::transmute(
|
||||
egl.get_proc_address("eglQueryDevicesEXT")
|
||||
.context("eglQueryDevicesEXT unavailable")?,
|
||||
)
|
||||
};
|
||||
let device = unsafe {
|
||||
let mut count: i32 = 0;
|
||||
ensure!(
|
||||
query_devices(0, std::ptr::null_mut(), &mut count) != 0 && count > 0,
|
||||
"no EGL devices found"
|
||||
);
|
||||
let mut devices = vec![std::ptr::null_mut::<c_void>(); count as usize];
|
||||
ensure!(
|
||||
query_devices(count, devices.as_mut_ptr(), &mut count) != 0,
|
||||
"eglQueryDevicesEXT enumeration failed"
|
||||
);
|
||||
devices[0]
|
||||
};
|
||||
|
||||
let display = unsafe {
|
||||
egl.get_platform_display(
|
||||
EGL_PLATFORM_GBM_KHR,
|
||||
gbm as egl::NativeDisplayType,
|
||||
EGL_PLATFORM_DEVICE_EXT,
|
||||
device as egl::NativeDisplayType,
|
||||
&[egl::ATTRIB_NONE],
|
||||
)
|
||||
}
|
||||
.context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
|
||||
.context("eglGetPlatformDisplay(DEVICE) on the NVIDIA EGL device")?;
|
||||
egl.initialize(display).context("eglInitialize")?;
|
||||
|
||||
let exts = egl
|
||||
@@ -93,27 +110,79 @@ impl EglImporter {
|
||||
cuda::context().context("create CUDA context")?;
|
||||
|
||||
let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
|
||||
tracing::info!("zero-copy EGL importer ready (GBM platform, dma_buf_import + modifiers)");
|
||||
tracing::info!(
|
||||
"zero-copy EGL importer ready (EGL device platform, dma_buf_import + modifiers)"
|
||||
);
|
||||
Ok(EglImporter {
|
||||
egl,
|
||||
display,
|
||||
no_ctx,
|
||||
gbm,
|
||||
render_fd,
|
||||
})
|
||||
}
|
||||
|
||||
/// Import one dmabuf and copy it device-to-device into a fresh owned CUDA buffer.
|
||||
/// `fourcc` is the DRM FourCC, `modifier` the 64-bit DRM format modifier from PipeWire.
|
||||
/// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via
|
||||
/// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates
|
||||
/// a dmabuf in a layout we can import. Empty on failure (caller falls back).
|
||||
pub fn supported_modifiers(&self, fourcc: u32) -> Vec<u64> {
|
||||
type QueryFn = unsafe extern "system" fn(
|
||||
dpy: *mut c_void,
|
||||
format: i32,
|
||||
max_modifiers: i32,
|
||||
modifiers: *mut u64,
|
||||
external_only: *mut u32,
|
||||
num_modifiers: *mut i32,
|
||||
) -> u32;
|
||||
let Some(sym) = self.egl.get_proc_address("eglQueryDmaBufModifiersEXT") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let query: QueryFn = unsafe { std::mem::transmute(sym) };
|
||||
let dpy = self.display.as_ptr();
|
||||
unsafe {
|
||||
let mut count: i32 = 0;
|
||||
if query(
|
||||
dpy,
|
||||
fourcc as i32,
|
||||
0,
|
||||
std::ptr::null_mut(),
|
||||
std::ptr::null_mut(),
|
||||
&mut count,
|
||||
) == 0
|
||||
|| count <= 0
|
||||
{
|
||||
return Vec::new();
|
||||
}
|
||||
let mut mods = vec![0u64; count as usize];
|
||||
let mut ext = vec![0u32; count as usize];
|
||||
let mut n: i32 = 0;
|
||||
if query(
|
||||
dpy,
|
||||
fourcc as i32,
|
||||
count,
|
||||
mods.as_mut_ptr(),
|
||||
ext.as_mut_ptr(),
|
||||
&mut n,
|
||||
) == 0
|
||||
{
|
||||
return Vec::new();
|
||||
}
|
||||
mods.truncate(n.max(0) as usize);
|
||||
mods
|
||||
}
|
||||
}
|
||||
|
||||
/// Import one dmabuf and copy it device-to-device into a fresh owned CUDA buffer. `fourcc`
|
||||
/// is the DRM FourCC; `modifier` is the explicit 64-bit DRM format modifier when one was
|
||||
/// negotiated, or `None` to import with the buffer's implicit modifier (base
|
||||
/// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers).
|
||||
pub fn import(
|
||||
&self,
|
||||
plane: &DmabufPlane,
|
||||
width: u32,
|
||||
height: u32,
|
||||
fourcc: u32,
|
||||
modifier: u64,
|
||||
modifier: Option<u64>,
|
||||
) -> Result<DeviceBuffer> {
|
||||
let attrs: [egl::Attrib; 19] = [
|
||||
let mut attrs: Vec<egl::Attrib> = vec![
|
||||
egl::WIDTH as egl::Attrib,
|
||||
width as egl::Attrib,
|
||||
egl::HEIGHT as egl::Attrib,
|
||||
@@ -126,14 +195,16 @@ impl EglImporter {
|
||||
plane.offset as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_PITCH_EXT,
|
||||
plane.stride as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
|
||||
(modifier & 0xFFFF_FFFF) as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
|
||||
(modifier >> 32) as egl::Attrib,
|
||||
egl::ATTRIB_NONE,
|
||||
0,
|
||||
0,
|
||||
];
|
||||
if let Some(m) = modifier {
|
||||
attrs.extend_from_slice(&[
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
|
||||
(m & 0xFFFF_FFFF) as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
|
||||
(m >> 32) as egl::Attrib,
|
||||
]);
|
||||
}
|
||||
attrs.push(egl::ATTRIB_NONE);
|
||||
let client = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
|
||||
let image = self
|
||||
.egl
|
||||
@@ -142,7 +213,7 @@ impl EglImporter {
|
||||
self.no_ctx,
|
||||
EGL_LINUX_DMA_BUF_EXT,
|
||||
client,
|
||||
&attrs[..17], // up to and including ATTRIB_NONE
|
||||
&attrs,
|
||||
)
|
||||
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
||||
|
||||
@@ -160,14 +231,3 @@ impl EglImporter {
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EglImporter {
|
||||
fn drop(&mut self) {
|
||||
if !self.gbm.is_null() {
|
||||
unsafe { gbm_device_destroy(self.gbm) };
|
||||
}
|
||||
if self.render_fd >= 0 {
|
||||
unsafe { libc::close(self.render_fd) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ pub mod cuda;
|
||||
pub mod egl;
|
||||
|
||||
pub use cuda::DeviceBuffer;
|
||||
pub use egl::EglImporter;
|
||||
pub use egl::{DmabufPlane, EglImporter};
|
||||
|
||||
/// Whether the zero-copy path is opted in (`LUMEN_ZEROCOPY` truthy).
|
||||
pub fn enabled() -> bool {
|
||||
|
||||
Reference in New Issue
Block a user