From 16a00563a8f31bac19db805658b5a1e55466de68 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Tue, 9 Jun 2026 15:13:05 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20M2=20zero-copy=20foundation=20=E2=80=94?= =?UTF-8?q?=20EGL=E2=86=92CUDA=20import=20+=20NVENC=20CUDA-frame=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scaffolding for dmabuf zero-copy (plan §9), opt-in via LUMEN_ZEROCOPY: - src/zerocopy/{cuda,egl}.rs: hand-rolled CUDA Driver-API FFI (no Rust crate exposes the EGL-interop calls / CUeglFrame) with a shared process-wide CUcontext + pitched device buffers; an EGL importer (GBM platform on the NVIDIA render node) that turns a dmabuf into an EGLImage, registers it with CUDA, and copies it device-to-device into an owned buffer. `zerocopy-probe` subcommand validates the FFI/linking/GPU access — confirmed on the box (driver 595, EGL_EXT_image_dma_buf_import + modifiers). - CapturedFrame gains a FramePayload enum (Cpu(Vec<u8>) | Cuda(DeviceBuffer)); the encoder branches: CPU keeps the expand+upload path, CUDA wraps the device buffer in an AV_PIX_FMT_CUDA frame fed straight to hevc_nvenc (sharing our CUcontext via a hand-declared AVCUDADeviceContext, since ffmpeg-sys doesn't bind hwcontext_cuda.h). open_video/the encoder take a `cuda` flag derived from the first frame's payload. The capture-side dmabuf negotiation (which produces the Cuda frames) is the next step; the CPU path is unchanged and remains the default + fallback. Builds clean, clippy clean, tests pass. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 11 + crates/lumen-host/Cargo.toml | 5 + crates/lumen-host/src/capture.rs | 39 ++- crates/lumen-host/src/capture/linux.rs | 10 +- crates/lumen-host/src/encode.rs | 12 +- crates/lumen-host/src/encode/linux.rs | 258 ++++++++++++++++---- crates/lumen-host/src/gamestream/stream.rs | 1 + crates/lumen-host/src/m0.rs | 13 +- crates/lumen-host/src/main.rs | 5 + crates/lumen-host/src/zerocopy/cuda.rs | 271 +++++++++++++++++++++ crates/lumen-host/src/zerocopy/egl.rs | 173 +++++++++++++ crates/lumen-host/src/zerocopy/mod.rs | 49 ++++ 12 files changed, 777 insertions(+), 70 deletions(-) create mode 100644 crates/lumen-host/src/zerocopy/cuda.rs create mode 100644 crates/lumen-host/src/zerocopy/egl.rs create mode 100644 crates/lumen-host/src/zerocopy/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 8b4d284..8df5a8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1356,6 +1356,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "khronos-egl" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76" +dependencies = [ + "libc", + "libloading", +] + [[package]] name = "latency-probe" version = "0.0.1" @@ -1501,6 +1511,7 @@ dependencies = [ "ffmpeg-next", "futures-util", "hex", + "khronos-egl", "libc", "lumen-core", "mdns-sd", diff --git a/crates/lumen-host/Cargo.toml b/crates/lumen-host/Cargo.toml index b487dfb..8615ce1 100644 --- a/crates/lumen-host/Cargo.toml +++ b/crates/lumen-host/Cargo.toml @@ -58,3 +58,8 @@ opus = "0.3" reis = { version = "0.6.1", features = ["tokio"] } # `StreamExt::next` on reis's tokio event stream in the libei worker loop. futures-util = "0.3" +# Zero-copy capture (plan §9): EGL imports the PipeWire dmabuf, CUDA maps it, NVENC encodes +# it with no CPU roundtrip. 
`khronos-egl` (dynamic = load the NVIDIA libEGL at runtime) gives +# eglCreateImage + the dma_buf import; the CUDA driver API (EGL interop) and libgbm are linked +# via hand-rolled FFI in `src/zerocopy/` (no Rust crate exposes the EGL-interop driver calls). +khronos-egl = { version = "6", features = ["dynamic"] } diff --git a/crates/lumen-host/src/capture.rs b/crates/lumen-host/src/capture.rs index 8771ee2..de282bb 100644 --- a/crates/lumen-host/src/capture.rs +++ b/crates/lumen-host/src/capture.rs @@ -33,17 +33,40 @@ impl PixelFormat { } } -/// A captured frame. For zero-copy the real type would wrap a dmabuf fd + modifier; the -/// CPU buffer is the M0 fallback path (plan §9 risk: per-GPU dmabuf import quirks). +/// A captured frame. [`format`](Self::format)/dimensions describe the pixels regardless of +/// where they live — [`payload`](Self::payload) is either a CPU buffer (the M0/fallback path) +/// or a GPU buffer already on the device (the zero-copy path, plan §9). pub struct CapturedFrame { pub width: u32, pub height: u32, pub pts_ns: u64, - /// Pixel layout of `cpu_bytes`. + /// Pixel layout of the payload. pub format: PixelFormat, - /// Tightly-packed pixels in `format`, `width * height * format.bytes_per_pixel()` - /// bytes (no row padding). - pub cpu_bytes: Vec<u8>, + pub payload: FramePayload, +} + +/// Where a captured frame's pixels live. +pub enum FramePayload { + /// Tightly-packed CPU pixels in `format`, `width*height*bytes_per_pixel` (no row padding). + Cpu(Vec<u8>), + /// A pitched GPU buffer (BGRA-order, on the shared CUDA context) — the zero-copy path. The + /// dmabuf has already been imported + copied into this owned device buffer. + #[cfg(target_os = "linux")] + Cuda(crate::zerocopy::DeviceBuffer), +} + +impl CapturedFrame { + /// True if the frame's pixels are a GPU/CUDA buffer (the zero-copy path). 
+ pub fn is_cuda(&self) -> bool { + #[cfg(target_os = "linux")] + { + matches!(self.payload, FramePayload::Cuda(_)) + } + #[cfg(not(target_os = "linux"))] + { + false + } + } } /// Produces frames from a captured output. Lives on its own thread, feeding the encoder @@ -130,7 +153,7 @@ impl Capturer for SyntheticCapturer { height: self.height, pts_ns, format: PixelFormat::Bgrx, - cpu_bytes: self.buf.clone(), + payload: FramePayload::Cpu(self.buf.clone()), }) } } @@ -173,7 +196,7 @@ impl Capturer for FastSyntheticCapturer { height: self.height, pts_ns: 0, format: PixelFormat::Bgrx, - cpu_bytes: self.buf.clone(), + payload: FramePayload::Cpu(self.buf.clone()), }) } } diff --git a/crates/lumen-host/src/capture/linux.rs b/crates/lumen-host/src/capture/linux.rs index 1e94be2..951f772 100644 --- a/crates/lumen-host/src/capture/linux.rs +++ b/crates/lumen-host/src/capture/linux.rs @@ -15,7 +15,7 @@ //! graceful stop (pipewire `channel` quit + Session close) belongs with the M2 session //! lifecycle. -use super::{CapturedFrame, Capturer, PixelFormat}; +use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; use anyhow::{anyhow, Context, Result}; use std::os::fd::OwnedFd; use std::sync::atomic::{AtomicBool, Ordering}; @@ -148,7 +148,7 @@ fn portal_thread(setup_tx: std::sync::mpsc::Sender Result<()>; } -/// Open an NVENC encoder for packed RGB/BGR CPU frames of the given `format` and mode. -/// `format`, `bitrate_bps`, `codec`, and the mode come from session negotiation; M0 takes -/// them from the first captured frame. +/// Open an NVENC encoder for frames of the given `format` and mode. When `cuda` is true the +/// encoder takes GPU frames (`AV_PIX_FMT_CUDA`) from the zero-copy path; otherwise it takes +/// packed RGB/BGR CPU frames. `format`/`bitrate_bps`/`codec`/mode come from session +/// negotiation; the caller derives `cuda` from the first captured frame's payload. 
pub fn open_video( codec: Codec, format: PixelFormat, @@ -60,15 +61,16 @@ pub fn open_video( height: u32, fps: u32, bitrate_bps: u64, + cuda: bool, ) -> Result> { #[cfg(target_os = "linux")] { - let enc = linux::NvencEncoder::open(codec, format, width, height, fps, bitrate_bps)?; + let enc = linux::NvencEncoder::open(codec, format, width, height, fps, bitrate_bps, cuda)?; Ok(Box::new(enc) as Box) } #[cfg(not(target_os = "linux"))] { - let _ = (codec, format, width, height, fps, bitrate_bps); + let _ = (codec, format, width, height, fps, bitrate_bps, cuda); anyhow::bail!("NVENC encode requires Linux (FFmpeg + NVIDIA driver)") } } diff --git a/crates/lumen-host/src/encode/linux.rs b/crates/lumen-host/src/encode/linux.rs index c4b3568..8701a50 100644 --- a/crates/lumen-host/src/encode/linux.rs +++ b/crates/lumen-host/src/encode/linux.rs @@ -8,12 +8,88 @@ //! every IDR — the output is both a playable raw Annex-B stream and self-contained AUs. use super::{Codec, EncodedFrame, Encoder}; -use crate::capture::{CapturedFrame, PixelFormat}; -use anyhow::{anyhow, Context, Result}; +use crate::capture::{CapturedFrame, FramePayload, PixelFormat}; +use anyhow::{anyhow, bail, Context, Result}; use ffmpeg::format::Pixel; use ffmpeg::util::frame::Video as VideoFrame; use ffmpeg::{codec, encoder, Dictionary, Packet, Rational}; use ffmpeg_next as ffmpeg; +use std::os::raw::c_int; + +use ffmpeg::ffi; // = ffmpeg_sys_next + +/// `AVCUDADeviceContext` (libavutil/hwcontext_cuda.h) — not in the ffmpeg-sys bindings (the +/// crate doesn't allowlist that header), so mirror its stable 3-pointer layout. We set the +/// first field to *our* `CUcontext` so NVENC shares the context the EGL importer maps into. 
+#[repr(C)] +struct AVCUDADeviceContext { + cuda_ctx: *mut std::ffi::c_void, // CUcontext + stream: *mut std::ffi::c_void, // CUstream (null = default) + internal: *mut std::ffi::c_void, // filled by ctx_init +} + +/// CUDA hardware-frame contexts that wrap our shared `CUcontext`, so `hevc_nvenc` reads the +/// imported device buffer directly. Owns two `AVBufferRef`s, unref'd on drop. +struct CudaHw { + device_ref: *mut ffi::AVBufferRef, + frames_ref: *mut ffi::AVBufferRef, +} + +impl CudaHw { + /// Build a CUDA hwdevice wrapping `cu_ctx` and a frames pool (`sw_format` = `pixel`). + unsafe fn new(cu_ctx: *mut std::ffi::c_void, sw_format: Pixel, w: u32, h: u32) -> Result { + let mut device_ref = ffi::av_hwdevice_ctx_alloc(ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA); + if device_ref.is_null() { + bail!("av_hwdevice_ctx_alloc(CUDA) failed"); + } + let dev_ctx = (*device_ref).data as *mut ffi::AVHWDeviceContext; + let cu = (*dev_ctx).hwctx as *mut AVCUDADeviceContext; + (*cu).cuda_ctx = cu_ctx; // share the importer's context + let r = ffi::av_hwdevice_ctx_init(device_ref); + if r < 0 { + ffi::av_buffer_unref(&mut device_ref); + bail!("av_hwdevice_ctx_init failed ({r})"); + } + + let mut frames_ref = ffi::av_hwframe_ctx_alloc(device_ref); + if frames_ref.is_null() { + ffi::av_buffer_unref(&mut device_ref); + bail!("av_hwframe_ctx_alloc failed"); + } + let fc = (*frames_ref).data as *mut ffi::AVHWFramesContext; + (*fc).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA; + (*fc).sw_format = pixel_to_av(sw_format); + (*fc).width = w as c_int; + (*fc).height = h as c_int; + (*fc).initial_pool_size = 0; // we supply the device pointers + let r = ffi::av_hwframe_ctx_init(frames_ref); + if r < 0 { + ffi::av_buffer_unref(&mut frames_ref); + ffi::av_buffer_unref(&mut device_ref); + bail!("av_hwframe_ctx_init failed ({r})"); + } + Ok(CudaHw { + device_ref, + frames_ref, + }) + } +} + +impl Drop for CudaHw { + fn drop(&mut self) { + unsafe { + ffi::av_buffer_unref(&mut 
self.frames_ref); + ffi::av_buffer_unref(&mut self.device_ref); + } + } +} + +/// `ffmpeg::format::Pixel` → raw `AVPixelFormat`. +fn pixel_to_av(p: Pixel) -> ffi::AVPixelFormat { + // `Pixel` is `#[repr(i32)]`-compatible with `AVPixelFormat` (the bindgen enum) via this + // documented conversion in ffmpeg-next. + ffi::AVPixelFormat::from(p) +} /// Map a captured layout to the NVENC input pixel format, and whether a 3→4 byte expand is /// needed (packed RGB/BGR have no padding byte; the NVENC `*0` formats do). @@ -30,11 +106,13 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) { pub struct NvencEncoder { enc: encoder::video::Encoder, - /// Reusable 4-bpp input frame in `nvenc_pixel` (its plane stride may exceed width*4). + /// Reusable 4-bpp CPU input frame (CPU path only; `None` for the zero-copy/CUDA path). /// Mutating it in place across frames is sound only because the encoder is opened with /// `delay=0`/`bf=0`/`max_b_frames=0` and the caller drains `poll()` after each `submit`, /// so libavcodec holds no reference to the previous frame's buffer when we overwrite it. - frame: VideoFrame, + frame: Option, + /// Zero-copy path: CUDA hwdevice/hwframes contexts (the encoder takes `AV_PIX_FMT_CUDA`). + cuda: Option, src_format: PixelFormat, expand: bool, width: u32, @@ -46,6 +124,10 @@ pub struct NvencEncoder { force_kf: bool, } +// `CudaHw` holds raw `AVBufferRef`s; the encoder lives on a single thread. The CPU encoder is +// already `Send` via ffmpeg-next; assert it for the CUDA fields too. 
+unsafe impl Send for NvencEncoder {} + impl NvencEncoder { pub fn open( codec: Codec, @@ -54,6 +136,7 @@ impl NvencEncoder { height: u32, fps: u32, bitrate_bps: u64, + cuda: bool, ) -> Result { ffmpeg::init().context("ffmpeg init")?; let name = codec.nvenc_name(); @@ -75,6 +158,23 @@ impl NvencEncoder { video.set_gop(fps.saturating_mul(2).max(1)); // ~2s keyframe interval video.set_max_b_frames(0); + // For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA + // hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context + // *before* open (NVENC derives the device from `hw_frames_ctx`). + let cuda_hw = if cuda { + let cu_ctx = crate::zerocopy::cuda::context().context("shared CUDA context")?; + let hw = unsafe { CudaHw::new(cu_ctx, nvenc_pixel, width, height)? }; + unsafe { + let raw = video.as_mut_ptr(); + (*raw).pix_fmt = ffi::AVPixelFormat::AV_PIX_FMT_CUDA; + (*raw).hw_device_ctx = ffi::av_buffer_ref(hw.device_ref); + (*raw).hw_frames_ctx = ffi::av_buffer_ref(hw.frames_ref); + } + Some(hw) + } else { + None + }; + // Low-latency NVENC tuning (plan §7 / linux-setup doc). 
let mut opts = Dictionary::new(); opts.set("preset", "p1"); // fastest @@ -87,10 +187,15 @@ impl NvencEncoder { .open_with(opts) .with_context(|| format!("open {name} ({width}x{height}@{fps}, {bitrate_bps} bps)"))?; - let frame = VideoFrame::new(nvenc_pixel, width, height); + let frame = if cuda { + None + } else { + Some(VideoFrame::new(nvenc_pixel, width, height)) + }; Ok(NvencEncoder { enc, frame, + cuda: cuda_hw, src_format: format, expand, width, @@ -112,53 +217,15 @@ impl Encoder for NvencEncoder { self.width, self.height ); - anyhow::ensure!( - captured.format == self.src_format, - "captured format {:?} != encoder source {:?}", - captured.format, - self.src_format - ); - let w = self.width as usize; - let h = self.height as usize; - let src_bpp = self.src_format.bytes_per_pixel(); - let src_row = w * src_bpp; - anyhow::ensure!( - captured.cpu_bytes.len() >= src_row * h, - "captured buffer {} bytes < required {}", - captured.cpu_bytes.len(), - src_row * h - ); - - let stride = self.frame.stride(0); // dst is 4-bpp, aligned - let dst = self.frame.data_mut(0); - if self.expand { - // packed 3-bpp RGB/BGR → 4-bpp *0 (copy 3 bytes, zero the pad byte) - for y in 0..h { - let s = &captured.cpu_bytes[y * src_row..y * src_row + src_row]; - let drow = &mut dst[y * stride..y * stride + w * 4]; - for x in 0..w { - drow[x * 4..x * 4 + 3].copy_from_slice(&s[x * 3..x * 3 + 3]); - drow[x * 4 + 3] = 0; - } - } - } else { - // 4-bpp → 4-bpp, honoring the (possibly larger) dst stride - for y in 0..h { - dst[y * stride..y * stride + src_row] - .copy_from_slice(&captured.cpu_bytes[y * src_row..y * src_row + src_row]); - } - } - self.frame.set_pts(Some(self.frame_idx)); + let pts = self.frame_idx; self.frame_idx += 1; // Force an IDR when requested (client RFI); otherwise let NVENC pick (GOP/P-frame). 
- if self.force_kf { - self.frame.set_kind(ffmpeg::picture::Type::I); - self.force_kf = false; - } else { - self.frame.set_kind(ffmpeg::picture::Type::None); + let idr = self.force_kf; + self.force_kf = false; + match &captured.payload { + FramePayload::Cuda(buf) => self.submit_cuda(buf, pts, idr), + FramePayload::Cpu(bytes) => self.submit_cpu(bytes, captured.format, pts, idr), } - self.enc.send_frame(&self.frame).context("send_frame")?; - Ok(()) } fn request_keyframe(&mut self) { @@ -196,3 +263,96 @@ impl Encoder for NvencEncoder { Ok(()) } } + +impl NvencEncoder { + /// CPU path: expand/copy the packed RGB/BGR bytes into the reusable 4-bpp frame, then send. + fn submit_cpu(&mut self, bytes: &[u8], format: PixelFormat, pts: i64, idr: bool) -> Result<()> { + anyhow::ensure!( + format == self.src_format, + "captured format {:?} != encoder source {:?}", + format, + self.src_format + ); + let w = self.width as usize; + let h = self.height as usize; + let src_bpp = self.src_format.bytes_per_pixel(); + let src_row = w * src_bpp; + anyhow::ensure!( + bytes.len() >= src_row * h, + "captured buffer {} bytes < required {}", + bytes.len(), + src_row * h + ); + let frame = self + .frame + .as_mut() + .context("CPU frame missing (encoder opened in CUDA mode)")?; + let stride = frame.stride(0); // dst is 4-bpp, aligned + let dst = frame.data_mut(0); + if self.expand { + // packed 3-bpp RGB/BGR → 4-bpp *0 (copy 3 bytes, zero the pad byte) + for y in 0..h { + let s = &bytes[y * src_row..y * src_row + src_row]; + let drow = &mut dst[y * stride..y * stride + w * 4]; + for x in 0..w { + drow[x * 4..x * 4 + 3].copy_from_slice(&s[x * 3..x * 3 + 3]); + drow[x * 4 + 3] = 0; + } + } + } else { + // 4-bpp → 4-bpp, honoring the (possibly larger) dst stride + for y in 0..h { + dst[y * stride..y * stride + src_row] + .copy_from_slice(&bytes[y * src_row..y * src_row + src_row]); + } + } + frame.set_pts(Some(pts)); + frame.set_kind(if idr { + ffmpeg::picture::Type::I + } else { + 
ffmpeg::picture::Type::None + }); + self.enc.send_frame(frame).context("send_frame")?; + Ok(()) + } + + /// Zero-copy path: wrap the imported CUDA device buffer in an `AV_PIX_FMT_CUDA` frame and + /// send it straight to NVENC (no CPU touch). `buf.ptr` aliases device memory we own, so + /// `buf[0]` is left null (ffmpeg must not free it); the frame shell is freed after send. + fn submit_cuda( + &mut self, + buf: &crate::zerocopy::DeviceBuffer, + pts: i64, + idr: bool, + ) -> Result<()> { + let frames_ref = self + .cuda + .as_ref() + .context("CUDA hw context missing (encoder opened in CPU mode)")? + .frames_ref; + unsafe { + let mut f = ffi::av_frame_alloc(); + if f.is_null() { + bail!("av_frame_alloc failed"); + } + (*f).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA as c_int; + (*f).width = self.width as c_int; + (*f).height = self.height as c_int; + (*f).hw_frames_ctx = ffi::av_buffer_ref(frames_ref); + (*f).data[0] = buf.ptr as *mut u8; + (*f).linesize[0] = buf.pitch as c_int; + (*f).pts = pts; + (*f).pict_type = if idr { + ffi::AVPictureType::AV_PICTURE_TYPE_I + } else { + ffi::AVPictureType::AV_PICTURE_TYPE_NONE + }; + let r = ffi::avcodec_send_frame(self.enc.as_mut_ptr(), f); + ffi::av_frame_free(&mut f); + if r < 0 { + bail!("avcodec_send_frame(CUDA) failed ({r})"); + } + } + Ok(()) + } +} diff --git a/crates/lumen-host/src/gamestream/stream.rs b/crates/lumen-host/src/gamestream/stream.rs index e0bf092..ae101f7 100644 --- a/crates/lumen-host/src/gamestream/stream.rs +++ b/crates/lumen-host/src/gamestream/stream.rs @@ -121,6 +121,7 @@ fn stream_body( frame.height, cfg.fps, cfg.bitrate_kbps as u64 * 1000, + frame.is_cuda(), ) .context("open NVENC for stream")?; // FEC overhead percent (Sunshine default 20). Override with LUMEN_FEC_PCT (0 = data-only). 
diff --git a/crates/lumen-host/src/m0.rs b/crates/lumen-host/src/m0.rs index 6e9ae10..4c18645 100644 --- a/crates/lumen-host/src/m0.rs +++ b/crates/lumen-host/src/m0.rs @@ -75,9 +75,16 @@ pub fn run(opts: Options) -> Result<()> { bitrate_bps = opts.bitrate_bps, "opening NVENC encoder" ); - let mut encoder = - encode::open_video(opts.codec, first.format, w, h, opts.fps, opts.bitrate_bps) - .context("open encoder")?; + let mut encoder = encode::open_video( + opts.codec, + first.format, + w, + h, + opts.fps, + opts.bitrate_bps, + first.is_cuda(), + ) + .context("open encoder")?; let mut sink = BufWriter::new( File::create(&opts.out).with_context(|| format!("create {}", opts.out.display()))?, diff --git a/crates/lumen-host/src/main.rs b/crates/lumen-host/src/main.rs index 5c6d22a..ae67dd0 100644 --- a/crates/lumen-host/src/main.rs +++ b/crates/lumen-host/src/main.rs @@ -23,6 +23,8 @@ mod pipeline; mod pwinit; mod vdisplay; mod web; +#[cfg(target_os = "linux")] +mod zerocopy; use anyhow::{bail, Result}; use encode::Codec; @@ -52,6 +54,9 @@ fn real_main() -> Result<()> { // Standalone input-injection smoke test (no client needed): open the session's input // backend and inject a scripted mouse/keyboard pattern. Watch a focused app / `wev`. Some("input-test") => input_test(), + // Zero-copy FFI/GPU probe: init the EGL importer + CUDA context (no capture needed). + #[cfg(target_os = "linux")] + Some("zerocopy-probe") => zerocopy::probe(), // M0 pipeline spike. Some("m0") => m0::run(parse_m0(&args[1..])?), Some("-h") | Some("--help") | Some("help") | None => { diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs new file mode 100644 index 0000000..48de338 --- /dev/null +++ b/crates/lumen-host/src/zerocopy/cuda.rs @@ -0,0 +1,271 @@ +//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the EGL-interop +//! driver calls (`cuGraphicsEGLRegisterImage` / `cuGraphicsResourceGetMappedEglFrame`) nor +//! 
`CUeglFrame`, so we hand-roll exactly what we need and link `libcuda.so.1` (the driver +//! library — NOT `libcudart`). Symbol names verified against `cust_raw` + `cudaEGL.h`: the +//! context/mem ops use the `_v2` ABI suffix; the graphics/EGL-interop ops are unsuffixed. +//! +//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture +//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use. + +#![allow(non_camel_case_types, non_snake_case)] + +use anyhow::{bail, Result}; +use std::os::raw::{c_int, c_uint, c_void}; +use std::sync::OnceLock; + +pub type CUresult = c_uint; // CUDA_SUCCESS == 0 +pub type CUdevice = c_int; +pub type CUcontext = *mut c_void; // opaque CUctx_st* +pub type CUstream = *mut c_void; // opaque CUstream_st* +pub type CUdeviceptr = u64; +pub type CUgraphicsResource = *mut c_void; +pub type CUarray = *mut c_void; + +/// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4. +pub const CU_MEMORYTYPE_DEVICE: c_uint = 2; +pub const CU_MEMORYTYPE_ARRAY: c_uint = 3; + +/// `CUeglFrameType`: ARRAY=0, PITCH=1. +pub const CU_EGL_FRAME_TYPE_ARRAY: c_uint = 0; +pub const CU_EGL_FRAME_TYPE_PITCH: c_uint = 1; + +/// `CUeglFrame` — exact layout from `cudaEGL.h`. `frame` is a union of `CUarray pArray[3]` and +/// `void* pPitch[3]`; both are three pointers, so `[*mut c_void; 3]` models it. +#[repr(C)] +pub struct CUeglFrame { + pub frame: [*mut c_void; 3], + pub width: c_uint, + pub height: c_uint, + pub depth: c_uint, + pub pitch: c_uint, + pub planeCount: c_uint, + pub numChannels: c_uint, + pub frameType: c_uint, + pub eglColorFormat: c_uint, + pub cuFormat: c_uint, +} + +/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing. 
+#[repr(C)] +#[derive(Default)] +pub struct CUDA_MEMCPY2D { + pub srcXInBytes: usize, + pub srcY: usize, + pub srcMemoryType: c_uint, + pub srcHost: *const c_void, + pub srcDevice: CUdeviceptr, + pub srcArray: CUarray, + pub srcPitch: usize, + pub dstXInBytes: usize, + pub dstY: usize, + pub dstMemoryType: c_uint, + pub dstHost: *mut c_void, + pub dstDevice: CUdeviceptr, + pub dstArray: CUarray, + pub dstPitch: usize, + pub WidthInBytes: usize, + pub Height: usize, +} + +impl Default for CUeglFrame { + fn default() -> Self { + // SAFETY: all fields are integers or pointers; zero is a valid bit pattern. + unsafe { std::mem::zeroed() } + } +} + +#[link(name = "cuda")] +extern "C" { + fn cuInit(flags: c_uint) -> CUresult; + fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult; + fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult; + fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult; + fn cuMemAllocPitch_v2( + dptr: *mut CUdeviceptr, + pitch: *mut usize, + width_bytes: usize, + height: usize, + element_size: c_uint, + ) -> CUresult; + fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult; + fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult; + fn cuCtxSynchronize() -> CUresult; + + fn cuGraphicsEGLRegisterImage( + resource: *mut CUgraphicsResource, + image: *mut c_void, // EGLImage + flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_NONE = 0 + ) -> CUresult; + fn cuGraphicsResourceGetMappedEglFrame( + egl_frame: *mut CUeglFrame, + resource: CUgraphicsResource, + index: c_uint, + mip_level: c_uint, + ) -> CUresult; + fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult; +} + +#[inline] +fn ck(r: CUresult, what: &str) -> Result<()> { + if r == 0 { + Ok(()) + } else { + bail!("CUDA driver error {r} in {what}") + } +} + +/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live +/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread. 
+#[derive(Clone, Copy)] +pub struct Context(pub CUcontext); +unsafe impl Send for Context {} +unsafe impl Sync for Context {} + +static CONTEXT: OnceLock<Context> = OnceLock::new(); + +/// Get (lazily creating) the shared CUDA context on device 0. +pub fn context() -> Result<CUcontext> { + if let Some(c) = CONTEXT.get() { + return Ok(c.0); + } + let ctx = unsafe { + ck(cuInit(0), "cuInit")?; + let mut dev: CUdevice = 0; + ck(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?; + let mut ctx: CUcontext = std::ptr::null_mut(); + ck(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate_v2")?; + ctx + }; + // Racy first-init is fine: the winner's context is used; a loser leaks one context (rare, + // process-lifetime). `get_or_init` keeps a single shared value. + Ok(CONTEXT.get_or_init(|| Context(ctx)).0) +} + +/// Make the shared context current on the calling thread (required before any CUDA op here). +pub fn make_current() -> Result<()> { + let ctx = context()?; + unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") } +} + +/// A device buffer we own (pitched), freed on drop. Used as the zero-copy frame the encoder +/// reads — filled by a device-to-device copy from the EGL-mapped dmabuf so the dmabuf can be +/// returned to the compositor immediately. +pub struct DeviceBuffer { + pub ptr: CUdeviceptr, + pub pitch: usize, + pub width: u32, + pub height: u32, +} + +impl DeviceBuffer { + /// Allocate a pitched device buffer for `width`x`height` 4-byte (BGRA) pixels. + pub fn alloc(width: u32, height: u32) -> Result<Self> { + let mut ptr: CUdeviceptr = 0; + let mut pitch: usize = 0; + unsafe { + ck( + cuMemAllocPitch_v2(&mut ptr, &mut pitch, width as usize * 4, height as usize, 16), + "cuMemAllocPitch_v2", + )?; + } + Ok(DeviceBuffer { + ptr, + pitch, + width, + height, + }) + } +} + +impl Drop for DeviceBuffer { + fn drop(&mut self) { + if self.ptr != 0 { + // The buffer may be freed on the encode thread; cuMemFree needs a current context. 
+ unsafe { + if let Some(c) = CONTEXT.get() { + let _ = cuCtxSetCurrent(c.0); + } + let _ = cuMemFree_v2(self.ptr); + } + } + } +} + +/// A live EGL→CUDA registration. The mapped device memory aliases the dmabuf, so we copy out of +/// it immediately and then unregister (the EGL image is destroyed by the caller). +pub struct MappedImage { + resource: CUgraphicsResource, + /// `frameType` (ARRAY vs PITCH) determines how to copy out. + frame: CUeglFrame, +} + +impl MappedImage { + /// Register an `EGLImage` with CUDA and map it to a `CUeglFrame`. + /// + /// # Safety + /// `image` must be a valid `EGLImage`; the shared context must be current on this thread. + pub unsafe fn register(image: *mut c_void) -> Result { + let mut resource: CUgraphicsResource = std::ptr::null_mut(); + ck( + cuGraphicsEGLRegisterImage(&mut resource, image, 0), + "cuGraphicsEGLRegisterImage", + )?; + let mut frame = CUeglFrame::default(); + let r = cuGraphicsResourceGetMappedEglFrame(&mut frame, resource, 0, 0); + if r != 0 { + let _ = cuGraphicsUnregisterResource(resource); + bail!("cuGraphicsResourceGetMappedEglFrame error {r}"); + } + Ok(MappedImage { resource, frame }) + } + + /// Device-to-device copy of this mapped frame into `dst` (de-tiling if the source is a tiled + /// CUarray). After this returns the dmabuf is no longer needed. 
+ pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> { + let width_bytes = (self.frame.width as usize).min(dst.width as usize) * 4; + let height = (self.frame.height as usize).min(dst.height as usize); + let mut copy = CUDA_MEMCPY2D { + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: dst.ptr, + dstPitch: dst.pitch, + WidthInBytes: width_bytes, + Height: height, + ..Default::default() + }; + match self.frame.frameType { + CU_EGL_FRAME_TYPE_PITCH => { + copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copy.srcDevice = self.frame.frame[0] as CUdeviceptr; + copy.srcPitch = self.frame.pitch as usize; + } + CU_EGL_FRAME_TYPE_ARRAY => { + copy.srcMemoryType = CU_MEMORYTYPE_ARRAY; + copy.srcArray = self.frame.frame[0] as CUarray; + } + other => bail!("unexpected CUeglFrame frameType {other}"), + } + unsafe { + ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?; + // The copy must complete before the dmabuf is requeued / reused. + ck(cuCtxSynchronize(), "cuCtxSynchronize")?; + } + Ok(()) + } + + pub fn color_format(&self) -> c_uint { + self.frame.eglColorFormat + } + pub fn frame_kind(&self) -> c_uint { + self.frame.frameType + } +} + +impl Drop for MappedImage { + fn drop(&mut self) { + if !self.resource.is_null() { + unsafe { + let _ = cuGraphicsUnregisterResource(self.resource); + } + } + } +} diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs new file mode 100644 index 0000000..893f0c1 --- /dev/null +++ b/crates/lumen-host/src/zerocopy/egl.rs @@ -0,0 +1,173 @@ +//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (via the GBM +//! platform on the render node) and import a PipeWire dmabuf as an `EGLImage` with +//! `EGL_LINUX_DMA_BUF_EXT`. The DRM format **modifier** is mandatory on NVIDIA (its buffers are +//! tiled; importing without the modifier yields a corrupt image or `EGL_BAD_MATCH`). The image +//! is then handed to CUDA (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an +//! 
owned buffer so the dmabuf can be returned to the compositor immediately. + +#![allow(non_upper_case_globals)] + +use super::cuda::{self, DeviceBuffer, MappedImage}; +use anyhow::{ensure, Context as _, Result}; +use khronos_egl as egl; +use std::os::raw::{c_int, c_void}; + +// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl). +const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270; +const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7; +const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271; +const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272; +const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273; +const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274; +const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443; +const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444; + +#[link(name = "gbm")] +extern "C" { + fn gbm_create_device(fd: c_int) -> *mut c_void; + fn gbm_device_destroy(device: *mut c_void); +} + +/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx). +#[derive(Clone, Copy, Debug)] +pub struct DmabufPlane { + pub fd: i32, + pub offset: u32, + pub stride: u32, +} + +type Egl = egl::DynamicInstance; + +/// Headless EGL display + GBM device used to import dmabufs. Lives on the capture thread. +pub struct EglImporter { + egl: Egl, + display: egl::Display, + no_ctx: egl::Context, + gbm: *mut c_void, + render_fd: c_int, +} + +// The EGL/GBM handles are confined to the capture thread; the struct is moved there once. +unsafe impl Send for EglImporter {} + +impl EglImporter { + /// Open the render node, create a GBM device, and a headless EGLDisplay on it. Also forces + /// the shared CUDA context to exist (so a later `import` only touches the hot path). 
+ pub fn new() -> Result { + let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap(); + let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) }; + ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM"); + let gbm = unsafe { gbm_create_device(render_fd) }; + if gbm.is_null() { + unsafe { libc::close(render_fd) }; + anyhow::bail!("gbm_create_device failed"); + } + + let egl: Egl = + unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?; + let display = unsafe { + egl.get_platform_display( + EGL_PLATFORM_GBM_KHR, + gbm as egl::NativeDisplayType, + &[egl::ATTRIB_NONE], + ) + } + .context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?; + egl.initialize(display).context("eglInitialize")?; + + let exts = egl + .query_string(Some(display), egl::EXTENSIONS) + .context("query EGL extensions")? + .to_string_lossy() + .into_owned(); + ensure!( + exts.contains("EGL_EXT_image_dma_buf_import"), + "EGL lacks EGL_EXT_image_dma_buf_import" + ); + ensure!( + exts.contains("EGL_EXT_image_dma_buf_import_modifiers"), + "EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)" + ); + + // Create the shared CUDA context up front so import() is pure hot path. + cuda::context().context("create CUDA context")?; + + let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) }; + tracing::info!("zero-copy EGL importer ready (GBM platform, dma_buf_import + modifiers)"); + Ok(EglImporter { + egl, + display, + no_ctx, + gbm, + render_fd, + }) + } + + /// Import one dmabuf and copy it device-to-device into a fresh owned CUDA buffer. + /// `fourcc` is the DRM FourCC, `modifier` the 64-bit DRM format modifier from PipeWire. 
+    pub fn import(
+        &self,
+        plane: &DmabufPlane,
+        width: u32,
+        height: u32,
+        fourcc: u32,
+        modifier: u64,
+    ) -> Result<DeviceBuffer> {
+        // A negative fd would otherwise be cast into a huge attribute value and surface as an
+        // opaque EGL error.
+        ensure!(plane.fd >= 0, "dmabuf plane has an invalid fd ({})", plane.fd);
+
+        // Key/value attribute list for eglCreateImage, terminated by EGL_NONE. The 64-bit
+        // modifier is split into its low and high 32-bit halves.
+        let attrs: [egl::Attrib; 17] = [
+            egl::WIDTH as egl::Attrib,
+            width as egl::Attrib,
+            egl::HEIGHT as egl::Attrib,
+            height as egl::Attrib,
+            EGL_LINUX_DRM_FOURCC_EXT,
+            fourcc as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_FD_EXT,
+            plane.fd as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
+            plane.offset as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_PITCH_EXT,
+            plane.stride as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
+            (modifier & 0xFFFF_FFFF) as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
+            (modifier >> 32) as egl::Attrib,
+            egl::ATTRIB_NONE,
+        ];
+        // dmabuf import takes no client buffer: the planes are described by `attrs` alone.
+        let client = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
+        let image = self
+            .egl
+            .create_image(
+                self.display,
+                self.no_ctx,
+                EGL_LINUX_DMA_BUF_EXT,
+                client,
+                &attrs,
+            )
+            .context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
+
+        // CUDA: register + map + copy out, then drop the registration and the EGL image.
+        // Run in a closure so the EGL image below is destroyed on the error path too.
+        let result = (|| -> Result<DeviceBuffer> {
+            cuda::make_current()?;
+            // SAFETY: `image` is a valid EGLImage we just created; context is current.
+            let mapped = unsafe { MappedImage::register(image.as_ptr()) }?;
+            let dst = DeviceBuffer::alloc(width, height)?;
+            mapped.copy_to(&dst)?;
+            Ok(dst)
+        })();
+
+        // Best effort: a failed destroy must not mask the import result.
+        let _ = self.egl.destroy_image(self.display, image);
+        result
+    }
+}
+
+impl Drop for EglImporter {
+    /// Tear down in reverse order of construction: EGL display, GBM device, render-node fd.
+    fn drop(&mut self) {
+        // Terminate the display first so EGL no longer references the GBM device that is
+        // destroyed right after. Best effort: nothing useful can be done on failure here.
+        let _ = self.egl.terminate(self.display);
+        if !self.gbm.is_null() {
+            // SAFETY: `gbm` came from `gbm_create_device` and is destroyed exactly once.
+            unsafe { gbm_device_destroy(self.gbm) };
+        }
+        if self.render_fd >= 0 {
+            // SAFETY: `render_fd` is owned by this struct and closed exactly once.
+            unsafe { libc::close(self.render_fd) };
+        }
+    }
+}
diff --git a/crates/lumen-host/src/zerocopy/mod.rs b/crates/lumen-host/src/zerocopy/mod.rs
new file mode 100644
index 0000000..508cad1
--- /dev/null
+++ b/crates/lumen-host/src/zerocopy/mod.rs
@@ -0,0 +1,49 @@
+//! Zero-copy capture→encode (plan §9): the PipeWire dmabuf is imported into CUDA via EGL and
+//!
handed straight to NVENC, eliminating the per-frame CPU copies (at 5K the CPU-copy path
+//! moves ~3.5 GB/s). Opt in with `LUMEN_ZEROCOPY=1`; the CPU-copy path stays the default and
+//! the runtime fallback (foreign-allocator / no-dmabuf / import failure).
+//!
+//! Pieces: [`cuda`] (driver-API FFI + the shared `CUcontext` + device buffers), [`egl`] (the
+//! headless EGLDisplay + dmabuf→`EGLImage`→CUDA import). The encoder's CUDA-frame path lives in
+//! `encode/linux.rs`; the dmabuf negotiation lives in `capture/linux.rs`.
+
+pub mod cuda;
+pub mod egl;
+
+pub use cuda::DeviceBuffer;
+pub use egl::EglImporter;
+
+/// Whether the zero-copy path is opted in (`LUMEN_ZEROCOPY` truthy).
+///
+/// Truthy means `1`, `true`, `yes` or `on` after trimming surrounding whitespace, compared
+/// case-insensitively. An unset or non-UTF-8 variable, or any other value, means "off".
+pub fn enabled() -> bool {
+    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
+    std::env::var("LUMEN_ZEROCOPY")
+        .map(|v| {
+            let v = v.trim();
+            TRUTHY.iter().any(|t| v.eq_ignore_ascii_case(t))
+        })
+        .unwrap_or(false)
+}
+
+/// DRM FourCC for a packed 32-bit format name (little-endian, e.g. `b"XR24"`).
+const fn fourcc(c: &[u8; 4]) -> u32 {
+    // First byte is the least-significant one, i.e. a little-endian u32.
+    u32::from_le_bytes(*c)
+}
+
+/// Map a SPA/our [`crate::capture::PixelFormat`] to the DRM FourCC EGL expects for import.
+/// SPA byte order `BGRx` ⇒ DRM `XRGB8888` (memory B,G,R,X), etc.
+///
+/// Returns `None` for formats that must take the CPU path.
+pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option<u32> {
+    use crate::capture::PixelFormat::*;
+    Some(match format {
+        Bgrx => fourcc(b"XR24"), // DRM_FORMAT_XRGB8888
+        Bgra => fourcc(b"AR24"), // DRM_FORMAT_ARGB8888
+        Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888
+        Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888
+        // 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
+        Rgb | Bgr => return None,
+    })
+}
+
+/// Standalone probe (the `zerocopy-probe` subcommand): initialize the EGL importer + CUDA
+/// context and report. De-risks the FFI/linking/GPU-access without needing a capture session.
+///
+/// # Errors
+/// Propagates the first failure from [`EglImporter::new`] or `cuda::context`.
+pub fn probe() -> anyhow::Result<()> {
+    // Bound to a `_`-prefixed name (not `_`) so the importer stays alive until the function
+    // returns, i.e. until after the success line has been logged.
+    let _egl_importer = EglImporter::new()?;
+    let cuda_ctx = cuda::context()?;
+    tracing::info!(
+        ?cuda_ctx,
+        "zero-copy probe OK — EGL display + CUDA context initialized"
+    );
+    Ok(())
+}