feat: M2 — complete zero-copy dmabuf→NVENC capture path (EGL/GL→CUDA)
The PipeWire dmabuf now reaches NVENC with no CPU touch. Verified live against headless KWin: a tiled BGRx dmabuf is imported and encoded to a pixel-correct H.265 stream (decoded frame matches the captured desktop — no tiling artifacts, no colour swap). The CPU-copy path stays the default and the runtime fallback.

Capture side (zerocopy::egl): desktop NVIDIA can't register a dmabuf EGLImage with CUDA directly (cuGraphicsEGLRegisterImage is Tegra-only; cuGraphicsGLRegisterImage rejects EGLImage-backed textures), so we follow OBS/Sunshine — bind the EGLImage to a GL texture, render it through a fullscreen-triangle shader into an immutable GL_RGBA8 texture (de-tiling + .bgra swizzle to the BGRx the encoder wants), then register that texture with CUDA and copy it device-to-device into an owned buffer so the dmabuf returns to the compositor immediately.

Encode side (encode/linux::submit_cuda): take a *pooled* CUDA surface via av_hwframe_get_buffer and device→device-copy our imported buffer into it, instead of wrapping our own pointer in a bare AVFrame. A bare frame is rejected with EINVAL (NVENC ignores frames with null buf[0]; the encode path's av_frame_ref needs a refcounted buffer), and a fresh device pointer every frame would thrash NVENC's bounded resource-registration cache — the pool recycles a small set.

Also: gate FFmpeg AV_LOG_DEBUG behind LUMEN_FFMPEG_DEBUG for diagnosing hw-frame rejects, and refresh the now-accurate module docs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -547,7 +547,7 @@ mod pipewire {
|
|||||||
// Zero-copy path: if the buffer is a dmabuf and we have an importer, import it
|
// Zero-copy path: if the buffer is a dmabuf and we have an importer, import it
|
||||||
// into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall
|
// into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall
|
||||||
// through to the shm de-pad copy below.
|
// through to the shm de-pad copy below.
|
||||||
if let (Some(importer), Some(fmt)) = (ud.importer.as_ref(), ud.format) {
|
if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) {
|
||||||
if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf {
|
if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf {
|
||||||
let plane = crate::zerocopy::DmabufPlane {
|
let plane = crate::zerocopy::DmabufPlane {
|
||||||
fd: datas[0].fd(),
|
fd: datas[0].fd(),
|
||||||
|
|||||||
@@ -139,6 +139,9 @@ impl NvencEncoder {
|
|||||||
cuda: bool,
|
cuda: bool,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
ffmpeg::init().context("ffmpeg init")?;
|
ffmpeg::init().context("ffmpeg init")?;
|
||||||
|
if std::env::var_os("LUMEN_FFMPEG_DEBUG").is_some() {
|
||||||
|
unsafe { ffi::av_log_set_level(48) }; // AV_LOG_DEBUG — surface NVENC hw-frame rejects
|
||||||
|
}
|
||||||
let name = codec.nvenc_name();
|
let name = codec.nvenc_name();
|
||||||
let av_codec = encoder::find_by_name(name)
|
let av_codec = encoder::find_by_name(name)
|
||||||
.ok_or_else(|| anyhow!("{name} not built into libavcodec"))?;
|
.ok_or_else(|| anyhow!("{name} not built into libavcodec"))?;
|
||||||
@@ -316,9 +319,16 @@ impl NvencEncoder {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-copy path: wrap the imported CUDA device buffer in an `AV_PIX_FMT_CUDA` frame and
|
/// Zero-copy path: hand the imported CUDA device buffer to NVENC with no CPU touch.
|
||||||
/// send it straight to NVENC (no CPU touch). `buf.ptr` aliases device memory we own, so
|
///
|
||||||
/// `buf[0]` is left null (ffmpeg must not free it); the frame shell is freed after send.
|
/// We take a *pooled* surface from the CUDA hwframes context (`av_hwframe_get_buffer`) and
|
||||||
|
/// device→device-copy our imported buffer into it, rather than wrapping our own pointer in a
|
||||||
|
/// bare frame. Two reasons: (1) NVENC's `nvenc_send_frame` ignores frames whose `buf[0]` is
|
||||||
|
/// null and the generic encode path's `av_frame_ref` needs a refcounted buffer — a bare
|
||||||
|
/// frame is rejected with `EINVAL`; (2) NVENC caches CUDA-resource *registrations* keyed by
|
||||||
|
/// device pointer with a bounded table, so a fresh pointer every frame would thrash/overflow
|
||||||
|
/// it — the pool recycles a small set of pointers. The extra copy is device-local (~8 MB at
|
||||||
|
/// 1080p, sub-millisecond on the GPU) and keeps the host fully off the pixel path.
|
||||||
fn submit_cuda(
|
fn submit_cuda(
|
||||||
&mut self,
|
&mut self,
|
||||||
buf: &crate::zerocopy::DeviceBuffer,
|
buf: &crate::zerocopy::DeviceBuffer,
|
||||||
@@ -330,17 +340,28 @@ impl NvencEncoder {
|
|||||||
.as_ref()
|
.as_ref()
|
||||||
.context("CUDA hw context missing (encoder opened in CPU mode)")?
|
.context("CUDA hw context missing (encoder opened in CPU mode)")?
|
||||||
.frames_ref;
|
.frames_ref;
|
||||||
|
// The device→device copy below uses our shared context directly; make it current on the
|
||||||
|
// encode thread (ffmpeg pushes its own around the pool alloc, so order is fine).
|
||||||
|
crate::zerocopy::cuda::make_current().context("CUDA context current (encode thread)")?;
|
||||||
unsafe {
|
unsafe {
|
||||||
let mut f = ffi::av_frame_alloc();
|
let mut f = ffi::av_frame_alloc();
|
||||||
if f.is_null() {
|
if f.is_null() {
|
||||||
bail!("av_frame_alloc failed");
|
bail!("av_frame_alloc failed");
|
||||||
}
|
}
|
||||||
(*f).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA as c_int;
|
// Pooled CUDA surface: sets format, width/height, data[0]/linesize[0], buf[0] and
|
||||||
(*f).width = self.width as c_int;
|
// hw_frames_ctx. Reused across frames (the pool recycles), keeping NVENC's
|
||||||
(*f).height = self.height as c_int;
|
// registration cache warm.
|
||||||
(*f).hw_frames_ctx = ffi::av_buffer_ref(frames_ref);
|
let r = ffi::av_hwframe_get_buffer(frames_ref, f, 0);
|
||||||
(*f).data[0] = buf.ptr as *mut u8;
|
if r < 0 {
|
||||||
(*f).linesize[0] = buf.pitch as c_int;
|
ffi::av_frame_free(&mut f);
|
||||||
|
bail!("av_hwframe_get_buffer(CUDA) failed ({r})");
|
||||||
|
}
|
||||||
|
let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr;
|
||||||
|
let dst_pitch = (*f).linesize[0] as usize;
|
||||||
|
if let Err(e) = crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch) {
|
||||||
|
ffi::av_frame_free(&mut f);
|
||||||
|
return Err(e).context("copy imported buffer into NVENC surface");
|
||||||
|
}
|
||||||
(*f).pts = pts;
|
(*f).pts = pts;
|
||||||
(*f).pict_type = if idr {
|
(*f).pict_type = if idr {
|
||||||
ffi::AVPictureType::AV_PICTURE_TYPE_I
|
ffi::AVPictureType::AV_PICTURE_TYPE_I
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the EGL-interop
|
//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the GL-interop
|
||||||
//! driver calls (`cuGraphicsEGLRegisterImage` / `cuGraphicsResourceGetMappedEglFrame`) nor
|
//! driver calls we need (`cuGraphicsGLRegisterImage` & co.), so we hand-roll exactly those and
|
||||||
//! `CUeglFrame`, so we hand-roll exactly what we need and link `libcuda.so.1` (the driver
|
//! link `libcuda.so.1` (the driver library — NOT `libcudart`). Symbol names verified against
|
||||||
//! library — NOT `libcudart`). Symbol names verified against `cust_raw` + `cudaEGL.h`: the
|
//! `cust_raw` + `cudaGL.h`: the context/mem ops use the `_v2` ABI suffix; the graphics-interop
|
||||||
//! context/mem ops use the `_v2` ABI suffix; the graphics/EGL-interop ops are unsuffixed.
|
//! ops are unsuffixed. (We use GL interop, not EGL interop: `cuGraphicsEGLRegisterImage` is
|
||||||
|
//! Tegra-only on the desktop driver — see [`super::egl`].)
|
||||||
//!
|
//!
|
||||||
//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
|
//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
|
||||||
//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.
|
//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.
|
||||||
@@ -25,26 +26,6 @@ pub type CUarray = *mut c_void;
|
|||||||
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
|
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
|
||||||
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;
|
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;
|
||||||
|
|
||||||
/// `CUeglFrameType`: ARRAY=0, PITCH=1.
|
|
||||||
pub const CU_EGL_FRAME_TYPE_ARRAY: c_uint = 0;
|
|
||||||
pub const CU_EGL_FRAME_TYPE_PITCH: c_uint = 1;
|
|
||||||
|
|
||||||
/// `CUeglFrame` — exact layout from `cudaEGL.h`. `frame` is a union of `CUarray pArray[3]` and
|
|
||||||
/// `void* pPitch[3]`; both are three pointers, so `[*mut c_void; 3]` models it.
|
|
||||||
#[repr(C)]
|
|
||||||
pub struct CUeglFrame {
|
|
||||||
pub frame: [*mut c_void; 3],
|
|
||||||
pub width: c_uint,
|
|
||||||
pub height: c_uint,
|
|
||||||
pub depth: c_uint,
|
|
||||||
pub pitch: c_uint,
|
|
||||||
pub planeCount: c_uint,
|
|
||||||
pub numChannels: c_uint,
|
|
||||||
pub frameType: c_uint,
|
|
||||||
pub eglColorFormat: c_uint,
|
|
||||||
pub cuFormat: c_uint,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
|
/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@@ -67,13 +48,6 @@ pub struct CUDA_MEMCPY2D {
|
|||||||
pub Height: usize,
|
pub Height: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for CUeglFrame {
|
|
||||||
fn default() -> Self {
|
|
||||||
// SAFETY: all fields are integers or pointers; zero is a valid bit pattern.
|
|
||||||
unsafe { std::mem::zeroed() }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[link(name = "cuda")]
|
#[link(name = "cuda")]
|
||||||
extern "C" {
|
extern "C" {
|
||||||
fn cuInit(flags: c_uint) -> CUresult;
|
fn cuInit(flags: c_uint) -> CUresult;
|
||||||
@@ -91,15 +65,28 @@ extern "C" {
|
|||||||
fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult;
|
fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult;
|
||||||
fn cuCtxSynchronize() -> CUresult;
|
fn cuCtxSynchronize() -> CUresult;
|
||||||
|
|
||||||
fn cuGraphicsEGLRegisterImage(
|
// GL interop (cudaGL.h) — these symbols have NO `_v2` suffix. `cuGraphicsEGLRegisterImage`
|
||||||
|
// is Tegra-only on the desktop driver, so we go EGLImage → GL texture → register the texture.
|
||||||
|
fn cuGraphicsGLRegisterImage(
|
||||||
resource: *mut CUgraphicsResource,
|
resource: *mut CUgraphicsResource,
|
||||||
image: *mut c_void, // EGLImage
|
texture: c_uint, // GLuint
|
||||||
flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_NONE = 0
|
target: c_uint, // GL_TEXTURE_2D = 0x0DE1
|
||||||
|
flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01
|
||||||
) -> CUresult;
|
) -> CUresult;
|
||||||
fn cuGraphicsResourceGetMappedEglFrame(
|
fn cuGraphicsMapResources(
|
||||||
egl_frame: *mut CUeglFrame,
|
count: c_uint,
|
||||||
|
resources: *mut CUgraphicsResource,
|
||||||
|
stream: *mut c_void,
|
||||||
|
) -> CUresult;
|
||||||
|
fn cuGraphicsUnmapResources(
|
||||||
|
count: c_uint,
|
||||||
|
resources: *mut CUgraphicsResource,
|
||||||
|
stream: *mut c_void,
|
||||||
|
) -> CUresult;
|
||||||
|
fn cuGraphicsSubResourceGetMappedArray(
|
||||||
|
array: *mut CUarray,
|
||||||
resource: CUgraphicsResource,
|
resource: CUgraphicsResource,
|
||||||
index: c_uint,
|
array_index: c_uint,
|
||||||
mip_level: c_uint,
|
mip_level: c_uint,
|
||||||
) -> CUresult;
|
) -> CUresult;
|
||||||
fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
|
fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
|
||||||
@@ -197,60 +184,61 @@ impl Drop for DeviceBuffer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A live EGL→CUDA registration. The mapped device memory aliases the dmabuf, so we copy out of
|
/// A live GL-texture→CUDA registration (mapped). The CUDA array aliases the texture/dmabuf, so
|
||||||
/// it immediately and then unregister (the EGL image is destroyed by the caller).
|
/// we copy out of it immediately; unmap + unregister happen on drop.
|
||||||
pub struct MappedImage {
|
pub struct MappedTexture {
|
||||||
resource: CUgraphicsResource,
|
resource: CUgraphicsResource,
|
||||||
/// `frameType` (ARRAY vs PITCH) determines how to copy out.
|
array: CUarray,
|
||||||
frame: CUeglFrame,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MappedImage {
|
impl MappedTexture {
|
||||||
/// Register an `EGLImage` with CUDA and map it to a `CUeglFrame`.
|
/// Register a `GL_TEXTURE_2D` texture with CUDA, map it, and get its array. The desktop
|
||||||
|
/// NVIDIA driver only supports CUDA interop through GL textures (not dmabuf EGLImages
|
||||||
|
/// directly), so the EGLImage is first bound to a GL texture by the caller.
|
||||||
///
|
///
|
||||||
/// # Safety
|
/// # Safety
|
||||||
/// `image` must be a valid `EGLImage`; the shared context must be current on this thread.
|
/// The GL context and the shared CUDA context must both be current on this thread, and
|
||||||
pub unsafe fn register(image: *mut c_void) -> Result<MappedImage> {
|
/// `texture` must be a valid `GL_TEXTURE_2D` bound to the source image.
|
||||||
// CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY (0x01): we only read the surface (encode from it).
|
pub unsafe fn register_gl(texture: u32) -> Result<MappedTexture> {
|
||||||
|
const GL_TEXTURE_2D: c_uint = 0x0DE1;
|
||||||
|
const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
|
||||||
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
||||||
ck(
|
ck(
|
||||||
cuGraphicsEGLRegisterImage(&mut resource, image, 0x01),
|
cuGraphicsGLRegisterImage(
|
||||||
"cuGraphicsEGLRegisterImage",
|
&mut resource,
|
||||||
|
texture,
|
||||||
|
GL_TEXTURE_2D,
|
||||||
|
CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY,
|
||||||
|
),
|
||||||
|
"cuGraphicsGLRegisterImage",
|
||||||
)?;
|
)?;
|
||||||
let mut frame = CUeglFrame::default();
|
if cuGraphicsMapResources(1, &mut resource, std::ptr::null_mut()) != 0 {
|
||||||
let r = cuGraphicsResourceGetMappedEglFrame(&mut frame, resource, 0, 0);
|
|
||||||
if r != 0 {
|
|
||||||
let _ = cuGraphicsUnregisterResource(resource);
|
let _ = cuGraphicsUnregisterResource(resource);
|
||||||
bail!("cuGraphicsResourceGetMappedEglFrame error {r}");
|
bail!("cuGraphicsMapResources failed");
|
||||||
}
|
}
|
||||||
Ok(MappedImage { resource, frame })
|
let mut array: CUarray = std::ptr::null_mut();
|
||||||
|
if cuGraphicsSubResourceGetMappedArray(&mut array, resource, 0, 0) != 0 {
|
||||||
|
let _ = cuGraphicsUnmapResources(1, &mut resource, std::ptr::null_mut());
|
||||||
|
let _ = cuGraphicsUnregisterResource(resource);
|
||||||
|
bail!("cuGraphicsSubResourceGetMappedArray failed");
|
||||||
|
}
|
||||||
|
Ok(MappedTexture { resource, array })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Device-to-device copy of this mapped frame into `dst` (de-tiling if the source is a tiled
|
/// Copy the mapped array into `dst` (array → pitched device memory). The array is the GL
|
||||||
/// CUarray). After this returns the dmabuf is no longer needed.
|
/// blit's already-linear RGBA8 output, so this is a straight copy. After it returns the
|
||||||
|
/// source dmabuf is no longer needed.
|
||||||
pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> {
|
pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> {
|
||||||
let width_bytes = (self.frame.width as usize).min(dst.width as usize) * 4;
|
let copy = CUDA_MEMCPY2D {
|
||||||
let height = (self.frame.height as usize).min(dst.height as usize);
|
srcMemoryType: CU_MEMORYTYPE_ARRAY,
|
||||||
let mut copy = CUDA_MEMCPY2D {
|
srcArray: self.array,
|
||||||
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
dstDevice: dst.ptr,
|
dstDevice: dst.ptr,
|
||||||
dstPitch: dst.pitch,
|
dstPitch: dst.pitch,
|
||||||
WidthInBytes: width_bytes,
|
WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
|
||||||
Height: height,
|
Height: dst.height as usize,
|
||||||
..Default::default()
|
..Default::default()
|
||||||
};
|
};
|
||||||
match self.frame.frameType {
|
|
||||||
CU_EGL_FRAME_TYPE_PITCH => {
|
|
||||||
copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
|
||||||
copy.srcDevice = self.frame.frame[0] as CUdeviceptr;
|
|
||||||
copy.srcPitch = self.frame.pitch as usize;
|
|
||||||
}
|
|
||||||
CU_EGL_FRAME_TYPE_ARRAY => {
|
|
||||||
copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
|
|
||||||
copy.srcArray = self.frame.frame[0] as CUarray;
|
|
||||||
}
|
|
||||||
other => bail!("unexpected CUeglFrame frameType {other}"),
|
|
||||||
}
|
|
||||||
unsafe {
|
unsafe {
|
||||||
ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?;
|
ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?;
|
||||||
// The copy must complete before the dmabuf is requeued / reused.
|
// The copy must complete before the dmabuf is requeued / reused.
|
||||||
@@ -258,19 +246,39 @@ impl MappedImage {
|
|||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn color_format(&self) -> c_uint {
|
|
||||||
self.frame.eglColorFormat
|
|
||||||
}
|
|
||||||
pub fn frame_kind(&self) -> c_uint {
|
|
||||||
self.frame.frameType
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for MappedImage {
|
/// Copy a pitched device buffer into another device region (device→device), e.g. our imported
|
||||||
|
/// [`DeviceBuffer`] into a pooled CUDA surface NVENC owns. Both are 4-byte (BGRx) pixels.
|
||||||
|
/// The caller must have the shared context current on this thread (see [`make_current`]).
|
||||||
|
pub fn copy_device_to_device(
|
||||||
|
src: &DeviceBuffer,
|
||||||
|
dst_ptr: CUdeviceptr,
|
||||||
|
dst_pitch: usize,
|
||||||
|
) -> Result<()> {
|
||||||
|
let copy = CUDA_MEMCPY2D {
|
||||||
|
srcMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
srcDevice: src.ptr,
|
||||||
|
srcPitch: src.pitch,
|
||||||
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
dstDevice: dst_ptr,
|
||||||
|
dstPitch: dst_pitch,
|
||||||
|
WidthInBytes: src.width as usize * 4,
|
||||||
|
Height: src.height as usize,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
unsafe {
|
||||||
|
ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2(dev->dev)")?;
|
||||||
|
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for MappedTexture {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if !self.resource.is_null() {
|
if !self.resource.is_null() {
|
||||||
unsafe {
|
unsafe {
|
||||||
|
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
||||||
let _ = cuGraphicsUnregisterResource(self.resource);
|
let _ = cuGraphicsUnregisterResource(self.resource);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,26 +1,26 @@
|
|||||||
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA EGL device and
|
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (GBM platform on
|
||||||
//! import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`. The DRM format
|
//! the render node) and import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`.
|
||||||
//! **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without the modifier
|
//! The DRM format **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without
|
||||||
//! yields a corrupt image or `EGL_BAD_MATCH`). The image is handed to CUDA
|
//! the modifier yields a corrupt image or `EGL_BAD_MATCH`).
|
||||||
//! (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an owned buffer so the
|
|
||||||
//! dmabuf can be returned to the compositor immediately.
|
|
||||||
//!
|
//!
|
||||||
//! NOTE (WIP): the negotiation + EGL import are verified end-to-end against KWin (a tiled
|
//! Desktop NVIDIA can't register a dmabuf `EGLImage` with CUDA directly — `cuGraphicsEGLRegisterImage`
|
||||||
//! dmabuf reaches `eglCreateImage` successfully), but `cuGraphicsEGLRegisterImage` currently
|
//! is Tegra-only and `cuGraphicsGLRegisterImage` rejects EGLImage-backed textures (their internal
|
||||||
//! returns `CUDA_ERROR_INVALID_VALUE` on the dmabuf-imported `EGLImage`. The likely fix is to
|
//! format is opaque). So we follow OBS/Sunshine: bind the `EGLImage` to a GL texture
|
||||||
//! bind the `EGLImage` to a GL texture (`glEGLImageTargetTexture2DOES`) and register *that* via
|
//! (`glEGLImageTargetTexture2DOES`), render it through a fullscreen-triangle shader into a plain
|
||||||
//! `cuGraphicsGLRegisterImage` (OBS/Sunshine's path), which needs a GL context.
|
//! immutable `GL_RGBA8` texture (de-tiling and swizzling to the BGRx the encoder wants), then
|
||||||
|
//! register *that* texture with CUDA ([`MappedTexture`]) and copy it device-to-device into an
|
||||||
|
//! owned [`DeviceBuffer`] so the dmabuf can be returned to the compositor immediately.
|
||||||
|
|
||||||
#![allow(non_upper_case_globals)]
|
#![allow(non_upper_case_globals)]
|
||||||
|
|
||||||
use super::cuda::{self, DeviceBuffer, MappedImage};
|
use super::cuda::{self, DeviceBuffer, MappedTexture};
|
||||||
use anyhow::{ensure, Context as _, Result};
|
use anyhow::{bail, ensure, Context as _, Result};
|
||||||
use khronos_egl as egl;
|
use khronos_egl as egl;
|
||||||
use std::os::raw::c_void;
|
use std::os::raw::{c_int, c_void};
|
||||||
|
|
||||||
// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
|
// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
|
||||||
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
|
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
|
||||||
const EGL_PLATFORM_DEVICE_EXT: egl::Enum = 0x313F;
|
const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
|
||||||
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
|
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
|
||||||
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
|
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
|
||||||
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
|
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
|
||||||
@@ -28,6 +28,197 @@ const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
|
|||||||
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
|
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
|
||||||
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
|
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
|
||||||
|
|
||||||
|
const GL_TEXTURE_2D: u32 = 0x0DE1;
|
||||||
|
const GL_TEXTURE_MIN_FILTER: u32 = 0x2801;
|
||||||
|
const GL_TEXTURE_MAG_FILTER: u32 = 0x2800;
|
||||||
|
const GL_LINEAR: c_int = 0x2601;
|
||||||
|
const GL_NEAREST: c_int = 0x2600;
|
||||||
|
const GL_RGBA8: u32 = 0x8058;
|
||||||
|
const GL_FRAMEBUFFER: u32 = 0x8D40;
|
||||||
|
const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0;
|
||||||
|
const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5;
|
||||||
|
const GL_TEXTURE0: u32 = 0x84C0;
|
||||||
|
const GL_TRIANGLES: u32 = 0x0004;
|
||||||
|
const GL_VERTEX_SHADER: u32 = 0x8B31;
|
||||||
|
const GL_FRAGMENT_SHADER: u32 = 0x8B30;
|
||||||
|
const GL_COMPILE_STATUS: u32 = 0x8B81;
|
||||||
|
const GL_LINK_STATUS: u32 = 0x8B82;
|
||||||
|
|
||||||
|
// libglvnd's libGL dispatches these to the NVIDIA driver based on the current EGL/GL context.
|
||||||
|
#[link(name = "GL")]
|
||||||
|
extern "C" {
|
||||||
|
fn glGenTextures(n: c_int, textures: *mut u32);
|
||||||
|
fn glBindTexture(target: u32, texture: u32);
|
||||||
|
fn glTexParameteri(target: u32, pname: u32, param: c_int);
|
||||||
|
fn glDeleteTextures(n: c_int, textures: *const u32);
|
||||||
|
fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int);
|
||||||
|
fn glGetError() -> u32;
|
||||||
|
fn glGenFramebuffers(n: c_int, framebuffers: *mut u32);
|
||||||
|
fn glBindFramebuffer(target: u32, framebuffer: u32);
|
||||||
|
fn glFramebufferTexture2D(
|
||||||
|
target: u32,
|
||||||
|
attachment: u32,
|
||||||
|
textarget: u32,
|
||||||
|
texture: u32,
|
||||||
|
level: c_int,
|
||||||
|
);
|
||||||
|
fn glCheckFramebufferStatus(target: u32) -> u32;
|
||||||
|
fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int);
|
||||||
|
fn glGenVertexArrays(n: c_int, arrays: *mut u32);
|
||||||
|
fn glBindVertexArray(array: u32);
|
||||||
|
fn glDrawArrays(mode: u32, first: c_int, count: c_int);
|
||||||
|
fn glActiveTexture(texture: u32);
|
||||||
|
fn glUseProgram(program: u32);
|
||||||
|
fn glFlush();
|
||||||
|
fn glCreateShader(shader_type: u32) -> u32;
|
||||||
|
fn glShaderSource(shader: u32, count: c_int, string: *const *const i8, length: *const c_int);
|
||||||
|
fn glCompileShader(shader: u32);
|
||||||
|
fn glGetShaderiv(shader: u32, pname: u32, params: *mut c_int);
|
||||||
|
fn glDeleteShader(shader: u32);
|
||||||
|
fn glCreateProgram() -> u32;
|
||||||
|
fn glAttachShader(program: u32, shader: u32);
|
||||||
|
fn glLinkProgram(program: u32);
|
||||||
|
fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int);
|
||||||
|
fn glGetUniformLocation(program: u32, name: *const i8) -> c_int;
|
||||||
|
fn glUniform1i(location: c_int, v0: c_int);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[link(name = "gbm")]
|
||||||
|
extern "C" {
|
||||||
|
fn gbm_create_device(fd: c_int) -> *mut c_void;
|
||||||
|
fn gbm_device_destroy(device: *mut c_void);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `glEGLImageTargetTexture2DOES(target, EGLImage)` — loaded via `eglGetProcAddress`.
|
||||||
|
type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void);
|
||||||
|
|
||||||
|
// Fullscreen-triangle blit: sample the dmabuf EGLImage texture and write it (swizzled to BGRA,
|
||||||
|
// to match the BGRx the encoder expects) into a normal GL_RGBA8 texture that CUDA *can* register.
|
||||||
|
const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n";
|
||||||
|
const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n";
|
||||||
|
|
||||||
|
unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
|
||||||
|
let sh = glCreateShader(kind);
|
||||||
|
ensure!(sh != 0, "glCreateShader failed");
|
||||||
|
let ptr = src.as_ptr() as *const i8;
|
||||||
|
let len = src.len() as c_int;
|
||||||
|
glShaderSource(sh, 1, &ptr, &len);
|
||||||
|
glCompileShader(sh);
|
||||||
|
let mut ok: c_int = 0;
|
||||||
|
glGetShaderiv(sh, GL_COMPILE_STATUS, &mut ok);
|
||||||
|
if ok == 0 {
|
||||||
|
glDeleteShader(sh);
|
||||||
|
bail!("GL shader compile failed");
|
||||||
|
}
|
||||||
|
Ok(sh)
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe fn compile_program() -> Result<u32> {
|
||||||
|
let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?;
|
||||||
|
let fs = compile_shader(GL_FRAGMENT_SHADER, FRAG_SRC)?;
|
||||||
|
let prog = glCreateProgram();
|
||||||
|
glAttachShader(prog, vs);
|
||||||
|
glAttachShader(prog, fs);
|
||||||
|
glLinkProgram(prog);
|
||||||
|
glDeleteShader(vs);
|
||||||
|
glDeleteShader(fs);
|
||||||
|
let mut ok: c_int = 0;
|
||||||
|
glGetProgramiv(prog, GL_LINK_STATUS, &mut ok);
|
||||||
|
ensure!(ok != 0, "GL program link failed");
|
||||||
|
glUseProgram(prog);
|
||||||
|
let loc = glGetUniformLocation(prog, c"image".as_ptr());
|
||||||
|
if loc >= 0 {
|
||||||
|
glUniform1i(loc, 0); // sampler -> texture unit 0
|
||||||
|
}
|
||||||
|
glUseProgram(0);
|
||||||
|
Ok(prog)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture.
|
||||||
|
struct GlBlit {
|
||||||
|
program: u32,
|
||||||
|
vao: u32,
|
||||||
|
fbo: u32,
|
||||||
|
/// CUDA-registrable destination (immutable GL_RGBA8).
|
||||||
|
dst_tex: u32,
|
||||||
|
/// Source texture re-targeted to each frame's EGLImage.
|
||||||
|
src_tex: u32,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl GlBlit {
|
||||||
|
unsafe fn new(width: u32, height: u32) -> Result<GlBlit> {
|
||||||
|
let program = compile_program()?;
|
||||||
|
let mut vao = 0u32;
|
||||||
|
glGenVertexArrays(1, &mut vao); // core profile needs a bound VAO for glDrawArrays
|
||||||
|
let mut fbo = 0u32;
|
||||||
|
glGenFramebuffers(1, &mut fbo);
|
||||||
|
|
||||||
|
let mut dst_tex = 0u32;
|
||||||
|
glGenTextures(1, &mut dst_tex);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, dst_tex);
|
||||||
|
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||||
|
|
||||||
|
let mut src_tex = 0u32;
|
||||||
|
glGenTextures(1, &mut src_tex);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, src_tex);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
|
||||||
|
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, 0);
|
||||||
|
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
|
||||||
|
glFramebufferTexture2D(
|
||||||
|
GL_FRAMEBUFFER,
|
||||||
|
GL_COLOR_ATTACHMENT0,
|
||||||
|
GL_TEXTURE_2D,
|
||||||
|
dst_tex,
|
||||||
|
0,
|
||||||
|
);
|
||||||
|
let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||||
|
ensure!(
|
||||||
|
status == GL_FRAMEBUFFER_COMPLETE,
|
||||||
|
"blit FBO incomplete ({status:#x})"
|
||||||
|
);
|
||||||
|
Ok(GlBlit {
|
||||||
|
program,
|
||||||
|
vao,
|
||||||
|
fbo,
|
||||||
|
dst_tex,
|
||||||
|
src_tex,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bind `image` to the source texture and render it into `dst_tex`.
|
||||||
|
///
|
||||||
|
/// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
|
||||||
|
unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
|
||||||
|
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||||
|
let _ = glGetError();
|
||||||
|
egl_image_target(GL_TEXTURE_2D, image);
|
||||||
|
let e = glGetError();
|
||||||
|
glBindTexture(GL_TEXTURE_2D, 0);
|
||||||
|
ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");
|
||||||
|
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, self.fbo);
|
||||||
|
glViewport(0, 0, self.width as c_int, self.height as c_int);
|
||||||
|
glUseProgram(self.program);
|
||||||
|
glActiveTexture(GL_TEXTURE0);
|
||||||
|
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||||
|
glBindVertexArray(self.vao);
|
||||||
|
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||||
|
glBindVertexArray(0);
|
||||||
|
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||||
|
glFlush(); // submit GL work before CUDA maps the texture
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
|
/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
|
||||||
#[derive(Clone, Copy, Debug)]
|
#[derive(Clone, Copy, Debug)]
|
||||||
pub struct DmabufPlane {
|
pub struct DmabufPlane {
|
||||||
@@ -38,12 +229,20 @@ pub struct DmabufPlane {
|
|||||||
|
|
||||||
type Egl = egl::DynamicInstance<egl::EGL1_5>;
|
type Egl = egl::DynamicInstance<egl::EGL1_5>;
|
||||||
|
|
||||||
/// Headless EGLDisplay (NVIDIA device platform) used to import dmabufs. Lives on the capture
|
/// Headless EGLDisplay (GBM platform on the NVIDIA render node) + a surfaceless desktop-GL context used to
|
||||||
/// thread. The device platform — not GBM — is what NVIDIA's CUDA-EGL interop registers against.
|
/// import dmabufs and bridge them to CUDA via a GL texture. Lives on the capture thread (the GL
|
||||||
|
/// context is made current there once).
|
||||||
pub struct EglImporter {
|
pub struct EglImporter {
|
||||||
egl: Egl,
|
egl: Egl,
|
||||||
display: egl::Display,
|
display: egl::Display,
|
||||||
no_ctx: egl::Context,
|
no_ctx: egl::Context,
|
||||||
|
/// Surfaceless GL context (current on the capture thread) for the EGLImage→texture bind.
|
||||||
|
_gl_ctx: egl::Context,
|
||||||
|
egl_image_target: EglImageTargetFn,
|
||||||
|
/// Lazily-created GL blit machinery (recreated if the frame size changes).
|
||||||
|
blit: Option<GlBlit>,
|
||||||
|
gbm: *mut c_void,
|
||||||
|
render_fd: c_int,
|
||||||
}
|
}
|
||||||
|
|
||||||
// The EGL handles are confined to the capture thread; the struct is moved there once.
|
// The EGL handles are confined to the capture thread; the struct is moved there once.
|
||||||
@@ -53,43 +252,28 @@ impl EglImporter {
|
|||||||
/// Open a headless EGLDisplay on the NVIDIA EGL device. Also forces the shared CUDA context
|
/// Open a headless EGLDisplay on the NVIDIA render node (GBM platform). Also forces the shared CUDA context
|
||||||
/// to exist (so a later `import` only touches the hot path).
|
/// to exist (so a later `import` only touches the hot path).
|
||||||
pub fn new() -> Result<EglImporter> {
|
pub fn new() -> Result<EglImporter> {
|
||||||
|
// GBM platform on the NVIDIA render node: this ties the EGLDisplay (and its GL contexts)
|
||||||
|
// to the same DRM device CUDA-GL interop associates with, which the EGL device platform
|
||||||
|
// did not (cuGraphicsGLRegisterImage rejected device-platform GL textures).
|
||||||
|
let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap();
|
||||||
|
let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
|
||||||
|
ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM");
|
||||||
|
let gbm = unsafe { gbm_create_device(render_fd) };
|
||||||
|
if gbm.is_null() {
|
||||||
|
unsafe { libc::close(render_fd) };
|
||||||
|
anyhow::bail!("gbm_create_device failed");
|
||||||
|
}
|
||||||
|
|
||||||
let egl: Egl =
|
let egl: Egl =
|
||||||
unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
|
unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
|
||||||
|
|
||||||
// Enumerate EGL devices and use the first (the NVIDIA GPU on a single-GPU box).
|
|
||||||
type QueryDevicesFn = unsafe extern "system" fn(
|
|
||||||
max_devices: i32,
|
|
||||||
devices: *mut *mut c_void,
|
|
||||||
num_devices: *mut i32,
|
|
||||||
) -> u32;
|
|
||||||
let query_devices: QueryDevicesFn = unsafe {
|
|
||||||
std::mem::transmute(
|
|
||||||
egl.get_proc_address("eglQueryDevicesEXT")
|
|
||||||
.context("eglQueryDevicesEXT unavailable")?,
|
|
||||||
)
|
|
||||||
};
|
|
||||||
let device = unsafe {
|
|
||||||
let mut count: i32 = 0;
|
|
||||||
ensure!(
|
|
||||||
query_devices(0, std::ptr::null_mut(), &mut count) != 0 && count > 0,
|
|
||||||
"no EGL devices found"
|
|
||||||
);
|
|
||||||
let mut devices = vec![std::ptr::null_mut::<c_void>(); count as usize];
|
|
||||||
ensure!(
|
|
||||||
query_devices(count, devices.as_mut_ptr(), &mut count) != 0,
|
|
||||||
"eglQueryDevicesEXT enumeration failed"
|
|
||||||
);
|
|
||||||
devices[0]
|
|
||||||
};
|
|
||||||
|
|
||||||
let display = unsafe {
|
let display = unsafe {
|
||||||
egl.get_platform_display(
|
egl.get_platform_display(
|
||||||
EGL_PLATFORM_DEVICE_EXT,
|
EGL_PLATFORM_GBM_KHR,
|
||||||
device as egl::NativeDisplayType,
|
gbm as egl::NativeDisplayType,
|
||||||
&[egl::ATTRIB_NONE],
|
&[egl::ATTRIB_NONE],
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
.context("eglGetPlatformDisplay(DEVICE) on the NVIDIA EGL device")?;
|
.context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
|
||||||
egl.initialize(display).context("eglInitialize")?;
|
egl.initialize(display).context("eglInitialize")?;
|
||||||
|
|
||||||
let exts = egl
|
let exts = egl
|
||||||
@@ -106,17 +290,58 @@ impl EglImporter {
|
|||||||
"EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
|
"EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// A surfaceless desktop-GL context so we can bind the dmabuf EGLImage to a GL texture
|
||||||
|
// (cuGraphicsEGLRegisterImage is Tegra-only; desktop CUDA interop goes through GL).
|
||||||
|
egl.bind_api(egl::OPENGL_API)
|
||||||
|
.context("eglBindAPI(OpenGL)")?;
|
||||||
|
// The default EGL_SURFACE_TYPE in eglChooseConfig is WINDOW_BIT, which a headless device
|
||||||
|
// display has none of — request a pbuffer-capable config (we run surfaceless anyway).
|
||||||
|
let config = egl
|
||||||
|
.choose_first_config(
|
||||||
|
display,
|
||||||
|
&[
|
||||||
|
egl::SURFACE_TYPE,
|
||||||
|
egl::PBUFFER_BIT,
|
||||||
|
egl::RENDERABLE_TYPE,
|
||||||
|
egl::OPENGL_BIT,
|
||||||
|
egl::NONE,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
.context("eglChooseConfig")?
|
||||||
|
.context("no EGL config for OpenGL")?;
|
||||||
|
let gl_ctx = egl
|
||||||
|
.create_context(
|
||||||
|
display,
|
||||||
|
config,
|
||||||
|
None,
|
||||||
|
&[egl::CONTEXT_CLIENT_VERSION, 3, egl::NONE],
|
||||||
|
)
|
||||||
|
.context("eglCreateContext(OpenGL)")?;
|
||||||
|
egl.make_current(display, None, None, Some(gl_ctx))
|
||||||
|
.context("eglMakeCurrent surfaceless (needs EGL_KHR_surfaceless_context)")?;
|
||||||
|
let egl_image_target: EglImageTargetFn = unsafe {
|
||||||
|
std::mem::transmute(
|
||||||
|
egl.get_proc_address("glEGLImageTargetTexture2DOES")
|
||||||
|
.context("glEGLImageTargetTexture2DOES unavailable")?,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
// Create the shared CUDA context up front so import() is pure hot path.
|
// Create the shared CUDA context up front so import() is pure hot path.
|
||||||
cuda::context().context("create CUDA context")?;
|
cuda::context().context("create CUDA context")?;
|
||||||
|
|
||||||
let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
|
let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"zero-copy EGL importer ready (EGL device platform, dma_buf_import + modifiers)"
|
"zero-copy EGL importer ready (GBM platform + GL texture interop, dma_buf_import + modifiers)"
|
||||||
);
|
);
|
||||||
Ok(EglImporter {
|
Ok(EglImporter {
|
||||||
egl,
|
egl,
|
||||||
display,
|
display,
|
||||||
no_ctx,
|
no_ctx,
|
||||||
|
_gl_ctx: gl_ctx,
|
||||||
|
egl_image_target,
|
||||||
|
blit: None,
|
||||||
|
gbm,
|
||||||
|
render_fd,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -175,7 +400,7 @@ impl EglImporter {
|
|||||||
/// negotiated, or `None` to import with the buffer's implicit modifier (base
|
/// negotiated, or `None` to import with the buffer's implicit modifier (base
|
||||||
/// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers).
|
/// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers).
|
||||||
pub fn import(
|
pub fn import(
|
||||||
&self,
|
&mut self,
|
||||||
plane: &DmabufPlane,
|
plane: &DmabufPlane,
|
||||||
width: u32,
|
width: u32,
|
||||||
height: u32,
|
height: u32,
|
||||||
@@ -217,17 +442,43 @@ impl EglImporter {
|
|||||||
)
|
)
|
||||||
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
||||||
|
|
||||||
// CUDA: register + map + copy out, then drop the registration and the EGL image.
|
// EGLImage → (sampled by a shader) → GL_RGBA8 texture → register *that* with CUDA → map
|
||||||
let result = (|| -> Result<DeviceBuffer> {
|
// → array → copy out. Registering the EGLImage texture directly fails (its layout isn't a
|
||||||
cuda::make_current()?;
|
// CUDA-registrable format); the RGBA8 render target is.
|
||||||
// SAFETY: `image` is a valid EGLImage we just created; context is current.
|
let result = self.blit_and_copy(image.as_ptr(), width, height);
|
||||||
let mapped = unsafe { MappedImage::register(image.as_ptr()) }?;
|
|
||||||
let dst = DeviceBuffer::alloc(width, height)?;
|
|
||||||
mapped.copy_to(&dst)?;
|
|
||||||
Ok(dst)
|
|
||||||
})();
|
|
||||||
|
|
||||||
let _ = self.egl.destroy_image(self.display, image);
|
let _ = self.egl.destroy_image(self.display, image);
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Render the dmabuf `image` into the registrable RGBA8 texture and copy it to an owned CUDA
|
||||||
|
/// buffer. (Re)creates the per-size GL blit machinery as needed.
|
||||||
|
fn blit_and_copy(
|
||||||
|
&mut self,
|
||||||
|
image: *mut c_void,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
) -> Result<DeviceBuffer> {
|
||||||
|
cuda::make_current()?;
|
||||||
|
if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||||
|
self.blit = Some(unsafe { GlBlit::new(width, height)? });
|
||||||
|
}
|
||||||
|
let blit = self.blit.as_ref().unwrap();
|
||||||
|
// SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
|
||||||
|
unsafe { blit.run(self.egl_image_target, image)? };
|
||||||
|
let mapped = unsafe { MappedTexture::register_gl(blit.dst_tex)? };
|
||||||
|
let dst = DeviceBuffer::alloc(width, height)?;
|
||||||
|
mapped.copy_to(&dst)?;
|
||||||
|
Ok(dst)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for EglImporter {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.gbm.is_null() {
|
||||||
|
unsafe { gbm_device_destroy(self.gbm) };
|
||||||
|
}
|
||||||
|
if self.render_fd >= 0 {
|
||||||
|
unsafe { libc::close(self.render_fd) };
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user