feat: M2 zero-copy foundation — EGL→CUDA import + NVENC CUDA-frame path
Scaffolding for dmabuf zero-copy (plan §9), opt-in via LUMEN_ZEROCOPY:
- src/zerocopy/{cuda,egl}.rs: hand-rolled CUDA Driver-API FFI (no Rust crate
exposes the EGL-interop calls / CUeglFrame) with a shared process-wide
CUcontext + pitched device buffers; an EGL importer (GBM platform on the
NVIDIA render node) that turns a dmabuf into an EGLImage, registers it with
CUDA, and copies it device-to-device into an owned buffer. `zerocopy-probe`
subcommand validates the FFI/linking/GPU access — confirmed on the box
(driver 595, EGL_EXT_image_dma_buf_import + modifiers).
- CapturedFrame gains a FramePayload enum (Cpu(Vec<u8>) | Cuda(DeviceBuffer));
the encoder branches: CPU keeps the expand+upload path, CUDA wraps the device
buffer in an AV_PIX_FMT_CUDA frame fed straight to hevc_nvenc (sharing our
CUcontext via a hand-declared AVCUDADeviceContext, since ffmpeg-sys doesn't
bind hwcontext_cuda.h). open_video/the encoder take a `cuda` flag derived from
the first frame's payload.
The capture-side dmabuf negotiation (which produces the Cuda frames) is the
next step; the CPU path is unchanged and remains the default + fallback. Builds
clean, clippy clean, tests pass.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,271 @@
|
||||
//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the EGL-interop
|
||||
//! driver calls (`cuGraphicsEGLRegisterImage` / `cuGraphicsResourceGetMappedEglFrame`) nor
|
||||
//! `CUeglFrame`, so we hand-roll exactly what we need and link `libcuda.so.1` (the driver
|
||||
//! library — NOT `libcudart`). Symbol names verified against `cust_raw` + `cudaEGL.h`: the
|
||||
//! context/mem ops use the `_v2` ABI suffix; the graphics/EGL-interop ops are unsuffixed.
|
||||
//!
|
||||
//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
|
||||
//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.
|
||||
|
||||
#![allow(non_camel_case_types, non_snake_case)]
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use std::os::raw::{c_int, c_uint, c_void};
|
||||
use std::sync::OnceLock;
|
||||
|
||||
/// Status code returned by every driver entry point; `0` is `CUDA_SUCCESS`.
pub type CUresult = c_uint;

/// Device handle as produced by `cuDeviceGet`.
pub type CUdevice = c_int;

/// Opaque context handle (`CUctx_st*`).
pub type CUcontext = *mut c_void;

/// Opaque stream handle (`CUstream_st*`).
pub type CUstream = *mut c_void;

/// Device address, carried as a plain 64-bit integer.
pub type CUdeviceptr = u64;

/// Opaque handle for a registered graphics (here: EGL image) resource.
pub type CUgraphicsResource = *mut c_void;

/// Opaque CUDA array handle.
pub type CUarray = *mut c_void;
|
||||
|
||||
// `CUmemorytype` values from cuda.h (HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4); only the two
// used by the 2D copy descriptor are declared.

/// `CUmemorytype`: linear device memory.
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
/// `CUmemorytype`: a `CUarray`.
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;

// `CUeglFrameType` values (ARRAY=0, PITCH=1).

/// `CUeglFrameType`: the mapped frame is backed by `CUarray`s.
pub const CU_EGL_FRAME_TYPE_ARRAY: c_uint = 0;
/// `CUeglFrameType`: the mapped frame is backed by pitched device pointers.
pub const CU_EGL_FRAME_TYPE_PITCH: c_uint = 1;
|
||||
|
||||
/// Mirror of `CUeglFrame` from `cudaEGL.h`; the field order and `#[repr(C)]` must match the
/// C declaration exactly because the driver writes into this struct.
///
/// In C the first member is a union of `CUarray pArray[3]` and `void* pPitch[3]`. Both arms are
/// three pointers, so a single `[*mut c_void; 3]` stands in for the union; `frameType` tells
/// which interpretation applies.
#[repr(C)]
pub struct CUeglFrame {
    /// Per-plane handles: `CUarray`s or pitched device pointers, depending on `frameType`.
    pub frame: [*mut c_void; 3],
    /// Width of the first plane.
    pub width: c_uint,
    /// Height of the first plane.
    pub height: c_uint,
    /// Depth of the first plane.
    pub depth: c_uint,
    /// Pitch of the first plane (meaningful for pitched frames).
    pub pitch: c_uint,
    /// Number of planes.
    pub planeCount: c_uint,
    /// Number of channels for the plane.
    pub numChannels: c_uint,
    /// `CUeglFrameType`: `CU_EGL_FRAME_TYPE_ARRAY` or `CU_EGL_FRAME_TYPE_PITCH`.
    pub frameType: c_uint,
    /// `CUeglColorFormat` of the frame.
    pub eglColorFormat: c_uint,
    /// `CUarray_format` of the frame.
    pub cuFormat: c_uint,
}
|
||||
|
||||
/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
|
||||
#[repr(C)]
|
||||
#[derive(Default)]
|
||||
pub struct CUDA_MEMCPY2D {
|
||||
pub srcXInBytes: usize,
|
||||
pub srcY: usize,
|
||||
pub srcMemoryType: c_uint,
|
||||
pub srcHost: *const c_void,
|
||||
pub srcDevice: CUdeviceptr,
|
||||
pub srcArray: CUarray,
|
||||
pub srcPitch: usize,
|
||||
pub dstXInBytes: usize,
|
||||
pub dstY: usize,
|
||||
pub dstMemoryType: c_uint,
|
||||
pub dstHost: *mut c_void,
|
||||
pub dstDevice: CUdeviceptr,
|
||||
pub dstArray: CUarray,
|
||||
pub dstPitch: usize,
|
||||
pub WidthInBytes: usize,
|
||||
pub Height: usize,
|
||||
}
|
||||
|
||||
impl Default for CUeglFrame {
|
||||
fn default() -> Self {
|
||||
// SAFETY: all fields are integers or pointers; zero is a valid bit pattern.
|
||||
unsafe { std::mem::zeroed() }
|
||||
}
|
||||
}
|
||||
|
||||
#[link(name = "cuda")]
|
||||
extern "C" {
|
||||
fn cuInit(flags: c_uint) -> CUresult;
|
||||
fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
|
||||
fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult;
|
||||
fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult;
|
||||
fn cuMemAllocPitch_v2(
|
||||
dptr: *mut CUdeviceptr,
|
||||
pitch: *mut usize,
|
||||
width_bytes: usize,
|
||||
height: usize,
|
||||
element_size: c_uint,
|
||||
) -> CUresult;
|
||||
fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult;
|
||||
fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult;
|
||||
fn cuCtxSynchronize() -> CUresult;
|
||||
|
||||
fn cuGraphicsEGLRegisterImage(
|
||||
resource: *mut CUgraphicsResource,
|
||||
image: *mut c_void, // EGLImage
|
||||
flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_NONE = 0
|
||||
) -> CUresult;
|
||||
fn cuGraphicsResourceGetMappedEglFrame(
|
||||
egl_frame: *mut CUeglFrame,
|
||||
resource: CUgraphicsResource,
|
||||
index: c_uint,
|
||||
mip_level: c_uint,
|
||||
) -> CUresult;
|
||||
fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn ck(r: CUresult, what: &str) -> Result<()> {
|
||||
if r == 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("CUDA driver error {r} in {what}")
|
||||
}
|
||||
}
|
||||
|
||||
/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
|
||||
/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct Context(pub CUcontext);
|
||||
unsafe impl Send for Context {}
|
||||
unsafe impl Sync for Context {}
|
||||
|
||||
static CONTEXT: OnceLock<Context> = OnceLock::new();
|
||||
|
||||
/// Get (lazily creating) the shared CUDA context on device 0.
|
||||
pub fn context() -> Result<CUcontext> {
|
||||
if let Some(c) = CONTEXT.get() {
|
||||
return Ok(c.0);
|
||||
}
|
||||
let ctx = unsafe {
|
||||
ck(cuInit(0), "cuInit")?;
|
||||
let mut dev: CUdevice = 0;
|
||||
ck(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?;
|
||||
let mut ctx: CUcontext = std::ptr::null_mut();
|
||||
ck(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate_v2")?;
|
||||
ctx
|
||||
};
|
||||
// Racy first-init is fine: the winner's context is used; a loser leaks one context (rare,
|
||||
// process-lifetime). `get_or_init` keeps a single shared value.
|
||||
Ok(CONTEXT.get_or_init(|| Context(ctx)).0)
|
||||
}
|
||||
|
||||
/// Make the shared context current on the calling thread (required before any CUDA op here).
|
||||
pub fn make_current() -> Result<()> {
|
||||
let ctx = context()?;
|
||||
unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") }
|
||||
}
|
||||
|
||||
/// A device buffer we own (pitched), freed on drop. Used as the zero-copy frame the encoder
|
||||
/// reads — filled by a device-to-device copy from the EGL-mapped dmabuf so the dmabuf can be
|
||||
/// returned to the compositor immediately.
|
||||
pub struct DeviceBuffer {
|
||||
pub ptr: CUdeviceptr,
|
||||
pub pitch: usize,
|
||||
pub width: u32,
|
||||
pub height: u32,
|
||||
}
|
||||
|
||||
impl DeviceBuffer {
|
||||
/// Allocate a pitched device buffer for `width`x`height` 4-byte (BGRA) pixels.
|
||||
pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
|
||||
let mut ptr: CUdeviceptr = 0;
|
||||
let mut pitch: usize = 0;
|
||||
unsafe {
|
||||
ck(
|
||||
cuMemAllocPitch_v2(&mut ptr, &mut pitch, width as usize * 4, height as usize, 16),
|
||||
"cuMemAllocPitch_v2",
|
||||
)?;
|
||||
}
|
||||
Ok(DeviceBuffer {
|
||||
ptr,
|
||||
pitch,
|
||||
width,
|
||||
height,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DeviceBuffer {
|
||||
fn drop(&mut self) {
|
||||
if self.ptr != 0 {
|
||||
// The buffer may be freed on the encode thread; cuMemFree needs a current context.
|
||||
unsafe {
|
||||
if let Some(c) = CONTEXT.get() {
|
||||
let _ = cuCtxSetCurrent(c.0);
|
||||
}
|
||||
let _ = cuMemFree_v2(self.ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A live EGL→CUDA registration. The mapped device memory aliases the dmabuf, so we copy out of
|
||||
/// it immediately and then unregister (the EGL image is destroyed by the caller).
|
||||
pub struct MappedImage {
|
||||
resource: CUgraphicsResource,
|
||||
/// `frameType` (ARRAY vs PITCH) determines how to copy out.
|
||||
frame: CUeglFrame,
|
||||
}
|
||||
|
||||
impl MappedImage {
|
||||
/// Register an `EGLImage` with CUDA and map it to a `CUeglFrame`.
|
||||
///
|
||||
/// # Safety
|
||||
/// `image` must be a valid `EGLImage`; the shared context must be current on this thread.
|
||||
pub unsafe fn register(image: *mut c_void) -> Result<MappedImage> {
|
||||
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
||||
ck(
|
||||
cuGraphicsEGLRegisterImage(&mut resource, image, 0),
|
||||
"cuGraphicsEGLRegisterImage",
|
||||
)?;
|
||||
let mut frame = CUeglFrame::default();
|
||||
let r = cuGraphicsResourceGetMappedEglFrame(&mut frame, resource, 0, 0);
|
||||
if r != 0 {
|
||||
let _ = cuGraphicsUnregisterResource(resource);
|
||||
bail!("cuGraphicsResourceGetMappedEglFrame error {r}");
|
||||
}
|
||||
Ok(MappedImage { resource, frame })
|
||||
}
|
||||
|
||||
/// Device-to-device copy of this mapped frame into `dst` (de-tiling if the source is a tiled
|
||||
/// CUarray). After this returns the dmabuf is no longer needed.
|
||||
pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> {
|
||||
let width_bytes = (self.frame.width as usize).min(dst.width as usize) * 4;
|
||||
let height = (self.frame.height as usize).min(dst.height as usize);
|
||||
let mut copy = CUDA_MEMCPY2D {
|
||||
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||
dstDevice: dst.ptr,
|
||||
dstPitch: dst.pitch,
|
||||
WidthInBytes: width_bytes,
|
||||
Height: height,
|
||||
..Default::default()
|
||||
};
|
||||
match self.frame.frameType {
|
||||
CU_EGL_FRAME_TYPE_PITCH => {
|
||||
copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
||||
copy.srcDevice = self.frame.frame[0] as CUdeviceptr;
|
||||
copy.srcPitch = self.frame.pitch as usize;
|
||||
}
|
||||
CU_EGL_FRAME_TYPE_ARRAY => {
|
||||
copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
|
||||
copy.srcArray = self.frame.frame[0] as CUarray;
|
||||
}
|
||||
other => bail!("unexpected CUeglFrame frameType {other}"),
|
||||
}
|
||||
unsafe {
|
||||
ck(cuMemcpy2D_v2(©), "cuMemcpy2D_v2")?;
|
||||
// The copy must complete before the dmabuf is requeued / reused.
|
||||
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn color_format(&self) -> c_uint {
|
||||
self.frame.eglColorFormat
|
||||
}
|
||||
pub fn frame_kind(&self) -> c_uint {
|
||||
self.frame.frameType
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for MappedImage {
|
||||
fn drop(&mut self) {
|
||||
if !self.resource.is_null() {
|
||||
unsafe {
|
||||
let _ = cuGraphicsUnregisterResource(self.resource);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,173 @@
|
||||
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (via the GBM
|
||||
//! platform on the render node) and import a PipeWire dmabuf as an `EGLImage` with
|
||||
//! `EGL_LINUX_DMA_BUF_EXT`. The DRM format **modifier** is mandatory on NVIDIA (its buffers are
|
||||
//! tiled; importing without the modifier yields a corrupt image or `EGL_BAD_MATCH`). The image
|
||||
//! is then handed to CUDA (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an
|
||||
//! owned buffer so the dmabuf can be returned to the compositor immediately.
|
||||
|
||||
#![allow(non_upper_case_globals)]
|
||||
|
||||
use super::cuda::{self, DeviceBuffer, MappedImage};
|
||||
use anyhow::{ensure, Context as _, Result};
|
||||
use khronos_egl as egl;
|
||||
use std::os::raw::{c_int, c_void};
|
||||
|
||||
// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
|
||||
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
|
||||
const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
|
||||
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
|
||||
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
|
||||
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
|
||||
const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
|
||||
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
|
||||
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
|
||||
|
||||
// The two libgbm entry points needed to back an EGLDisplay with a render node. The device is
// handled as an opaque pointer.
#[link(name = "gbm")]
extern "C" {
    /// Create a GBM device on an open DRM fd; the caller checks the result for null.
    fn gbm_create_device(drm_fd: c_int) -> *mut c_void;
    /// Destroy a device returned by `gbm_create_device`.
    fn gbm_device_destroy(gbm: *mut c_void);
}
|
||||
|
||||
/// Description of a single dmabuf plane as handed over by PipeWire. BGRx frames are
/// single-plane, so one of these describes a whole frame.
#[derive(Debug, Clone, Copy)]
pub struct DmabufPlane {
    /// The dmabuf file descriptor.
    pub fd: i32,
    /// Byte offset of the plane within the buffer.
    pub offset: u32,
    /// Row stride in bytes (passed to EGL as the plane pitch).
    pub stride: u32,
}
|
||||
|
||||
type Egl = egl::DynamicInstance<egl::EGL1_5>;
|
||||
|
||||
/// Headless EGL display + GBM device used to import dmabufs. Lives on the capture thread.
|
||||
pub struct EglImporter {
|
||||
egl: Egl,
|
||||
display: egl::Display,
|
||||
no_ctx: egl::Context,
|
||||
gbm: *mut c_void,
|
||||
render_fd: c_int,
|
||||
}
|
||||
|
||||
// The EGL/GBM handles are confined to the capture thread; the struct is moved there once.
|
||||
unsafe impl Send for EglImporter {}
|
||||
|
||||
impl EglImporter {
|
||||
/// Open the render node, create a GBM device, and a headless EGLDisplay on it. Also forces
|
||||
/// the shared CUDA context to exist (so a later `import` only touches the hot path).
|
||||
pub fn new() -> Result<EglImporter> {
|
||||
let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap();
|
||||
let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
|
||||
ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM");
|
||||
let gbm = unsafe { gbm_create_device(render_fd) };
|
||||
if gbm.is_null() {
|
||||
unsafe { libc::close(render_fd) };
|
||||
anyhow::bail!("gbm_create_device failed");
|
||||
}
|
||||
|
||||
let egl: Egl =
|
||||
unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
|
||||
let display = unsafe {
|
||||
egl.get_platform_display(
|
||||
EGL_PLATFORM_GBM_KHR,
|
||||
gbm as egl::NativeDisplayType,
|
||||
&[egl::ATTRIB_NONE],
|
||||
)
|
||||
}
|
||||
.context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
|
||||
egl.initialize(display).context("eglInitialize")?;
|
||||
|
||||
let exts = egl
|
||||
.query_string(Some(display), egl::EXTENSIONS)
|
||||
.context("query EGL extensions")?
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
ensure!(
|
||||
exts.contains("EGL_EXT_image_dma_buf_import"),
|
||||
"EGL lacks EGL_EXT_image_dma_buf_import"
|
||||
);
|
||||
ensure!(
|
||||
exts.contains("EGL_EXT_image_dma_buf_import_modifiers"),
|
||||
"EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
|
||||
);
|
||||
|
||||
// Create the shared CUDA context up front so import() is pure hot path.
|
||||
cuda::context().context("create CUDA context")?;
|
||||
|
||||
let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
|
||||
tracing::info!("zero-copy EGL importer ready (GBM platform, dma_buf_import + modifiers)");
|
||||
Ok(EglImporter {
|
||||
egl,
|
||||
display,
|
||||
no_ctx,
|
||||
gbm,
|
||||
render_fd,
|
||||
})
|
||||
}
|
||||
|
||||
/// Import one dmabuf and copy it device-to-device into a fresh owned CUDA buffer.
|
||||
/// `fourcc` is the DRM FourCC, `modifier` the 64-bit DRM format modifier from PipeWire.
|
||||
pub fn import(
|
||||
&self,
|
||||
plane: &DmabufPlane,
|
||||
width: u32,
|
||||
height: u32,
|
||||
fourcc: u32,
|
||||
modifier: u64,
|
||||
) -> Result<DeviceBuffer> {
|
||||
let attrs: [egl::Attrib; 19] = [
|
||||
egl::WIDTH as egl::Attrib,
|
||||
width as egl::Attrib,
|
||||
egl::HEIGHT as egl::Attrib,
|
||||
height as egl::Attrib,
|
||||
EGL_LINUX_DRM_FOURCC_EXT,
|
||||
fourcc as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_FD_EXT,
|
||||
plane.fd as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_OFFSET_EXT,
|
||||
plane.offset as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_PITCH_EXT,
|
||||
plane.stride as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
|
||||
(modifier & 0xFFFF_FFFF) as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
|
||||
(modifier >> 32) as egl::Attrib,
|
||||
egl::ATTRIB_NONE,
|
||||
0,
|
||||
0,
|
||||
];
|
||||
let client = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
|
||||
let image = self
|
||||
.egl
|
||||
.create_image(
|
||||
self.display,
|
||||
self.no_ctx,
|
||||
EGL_LINUX_DMA_BUF_EXT,
|
||||
client,
|
||||
&attrs[..17], // up to and including ATTRIB_NONE
|
||||
)
|
||||
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
||||
|
||||
// CUDA: register + map + copy out, then drop the registration and the EGL image.
|
||||
let result = (|| -> Result<DeviceBuffer> {
|
||||
cuda::make_current()?;
|
||||
// SAFETY: `image` is a valid EGLImage we just created; context is current.
|
||||
let mapped = unsafe { MappedImage::register(image.as_ptr()) }?;
|
||||
let dst = DeviceBuffer::alloc(width, height)?;
|
||||
mapped.copy_to(&dst)?;
|
||||
Ok(dst)
|
||||
})();
|
||||
|
||||
let _ = self.egl.destroy_image(self.display, image);
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EglImporter {
|
||||
fn drop(&mut self) {
|
||||
if !self.gbm.is_null() {
|
||||
unsafe { gbm_device_destroy(self.gbm) };
|
||||
}
|
||||
if self.render_fd >= 0 {
|
||||
unsafe { libc::close(self.render_fd) };
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
//! Zero-copy capture→encode (plan §9): the PipeWire dmabuf is imported into CUDA via EGL and
|
||||
//! handed straight to NVENC, eliminating the per-frame CPU copies (at 5K the CPU-copy path
|
||||
//! moves ~3.5 GB/s). Opt in with `LUMEN_ZEROCOPY=1`; the CPU-copy path stays the default and
|
||||
//! the runtime fallback (foreign-allocator / no-dmabuf / import failure).
|
||||
//!
|
||||
//! Pieces: [`cuda`] (driver-API FFI + the shared `CUcontext` + device buffers), [`egl`] (the
|
||||
//! headless EGLDisplay + dmabuf→`EGLImage`→CUDA import). The encoder's CUDA-frame path lives in
|
||||
//! `encode/linux.rs`; the dmabuf negotiation lives in `capture/linux.rs`.
|
||||
|
||||
pub mod cuda;
|
||||
pub mod egl;
|
||||
|
||||
pub use cuda::DeviceBuffer;
|
||||
pub use egl::EglImporter;
|
||||
|
||||
/// Whether the zero-copy path is opted in (`LUMEN_ZEROCOPY` truthy).
///
/// Truthy means `1`, `true`, `yes` or `on`, ignoring surrounding whitespace and ASCII case
/// (so `TRUE` and `On` count as well). An unset, non-UTF-8 or any other value means disabled.
pub fn enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    std::env::var("LUMEN_ZEROCOPY")
        .map(|v| {
            let v = v.trim();
            TRUTHY.iter().any(|t| v.eq_ignore_ascii_case(t))
        })
        .unwrap_or(false)
}
|
||||
|
||||
/// Pack a four-character format tag (e.g. `b"XR24"`) into its DRM FourCC code: the first
/// character ends up in the lowest byte, i.e. the tag read as a little-endian `u32`.
const fn fourcc(c: &[u8; 4]) -> u32 {
    u32::from_le_bytes(*c)
}
|
||||
|
||||
/// Map a SPA/our [`crate::capture::PixelFormat`] to the DRM FourCC EGL expects for import.
|
||||
/// SPA byte order `BGRx` ⇒ DRM `XRGB8888` (memory B,G,R,X), etc.
|
||||
pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option<u32> {
|
||||
use crate::capture::PixelFormat::*;
|
||||
Some(match format {
|
||||
Bgrx => fourcc(b"XR24"), // DRM_FORMAT_XRGB8888
|
||||
Bgra => fourcc(b"AR24"), // DRM_FORMAT_ARGB8888
|
||||
Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888
|
||||
Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888
|
||||
// 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
|
||||
Rgb | Bgr => return None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Standalone probe (the `zerocopy-probe` subcommand): initialize the EGL importer + CUDA
|
||||
/// context and report. De-risks the FFI/linking/GPU-access without needing a capture session.
|
||||
pub fn probe() -> anyhow::Result<()> {
|
||||
let _importer = EglImporter::new()?;
|
||||
let ctx = cuda::context()?;
|
||||
tracing::info!(cuda_ctx = ?ctx, "zero-copy probe OK — EGL display + CUDA context initialized");
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user