feat: M2 zero-copy foundation — EGL→CUDA import + NVENC CUDA-frame path
Scaffolding for dmabuf zero-copy (plan §9), opt-in via LUMEN_ZEROCOPY:
- src/zerocopy/{cuda,egl}.rs: hand-rolled CUDA Driver-API FFI (no Rust crate
exposes the EGL-interop calls / CUeglFrame) with a shared process-wide
CUcontext + pitched device buffers; an EGL importer (GBM platform on the
NVIDIA render node) that turns a dmabuf into an EGLImage, registers it with
CUDA, and copies it device-to-device into an owned buffer. `zerocopy-probe`
subcommand validates the FFI/linking/GPU access — confirmed on the box
(driver 595, EGL_EXT_image_dma_buf_import + modifiers).
- CapturedFrame gains a FramePayload enum (Cpu(Vec<u8>) | Cuda(DeviceBuffer));
the encoder branches: CPU keeps the expand+upload path, CUDA wraps the device
buffer in an AV_PIX_FMT_CUDA frame fed straight to hevc_nvenc (sharing our
CUcontext via a hand-declared AVCUDADeviceContext, since ffmpeg-sys doesn't
bind hwcontext_cuda.h). open_video/the encoder take a `cuda` flag derived from
the first frame's payload.
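The capture-side dmabuf negotiation (which produces the Cuda frames) is the
next step; the CPU encode path is unchanged and remains the default + fallback.
Note that this commit also switches both portal capture paths (ScreenCast and
RemoteDesktop) from CursorMode::Hidden to CursorMode::Embedded, so the cursor is
now part of the captured frames. Builds clean, clippy clean, tests pass.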
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Generated
+11
@@ -1356,6 +1356,16 @@ dependencies = [
|
|||||||
"wasm-bindgen",
|
"wasm-bindgen",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "khronos-egl"
|
||||||
|
version = "6.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6aae1df220ece3c0ada96b8153459b67eebe9ae9212258bb0134ae60416fdf76"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"libloading",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "latency-probe"
|
name = "latency-probe"
|
||||||
version = "0.0.1"
|
version = "0.0.1"
|
||||||
@@ -1501,6 +1511,7 @@ dependencies = [
|
|||||||
"ffmpeg-next",
|
"ffmpeg-next",
|
||||||
"futures-util",
|
"futures-util",
|
||||||
"hex",
|
"hex",
|
||||||
|
"khronos-egl",
|
||||||
"libc",
|
"libc",
|
||||||
"lumen-core",
|
"lumen-core",
|
||||||
"mdns-sd",
|
"mdns-sd",
|
||||||
|
|||||||
@@ -58,3 +58,8 @@ opus = "0.3"
|
|||||||
reis = { version = "0.6.1", features = ["tokio"] }
|
reis = { version = "0.6.1", features = ["tokio"] }
|
||||||
# `StreamExt::next` on reis's tokio event stream in the libei worker loop.
|
# `StreamExt::next` on reis's tokio event stream in the libei worker loop.
|
||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
|
# Zero-copy capture (plan §9): EGL imports the PipeWire dmabuf, CUDA maps it, NVENC encodes
|
||||||
|
# it with no CPU roundtrip. `khronos-egl` (dynamic = load the NVIDIA libEGL at runtime) gives
|
||||||
|
# eglCreateImage + the dma_buf import; the CUDA driver API (EGL interop) and libgbm are linked
|
||||||
|
# via hand-rolled FFI in `src/zerocopy/` (no Rust crate exposes the EGL-interop driver calls).
|
||||||
|
khronos-egl = { version = "6", features = ["dynamic"] }
|
||||||
|
|||||||
@@ -33,17 +33,40 @@ impl PixelFormat {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A captured frame. For zero-copy the real type would wrap a dmabuf fd + modifier; the
|
/// A captured frame. [`format`](Self::format)/dimensions describe the pixels regardless of
|
||||||
/// CPU buffer is the M0 fallback path (plan §9 risk: per-GPU dmabuf import quirks).
|
/// where they live — [`payload`](Self::payload) is either a CPU buffer (the M0/fallback path)
|
||||||
|
/// or a GPU buffer already on the device (the zero-copy path, plan §9).
|
||||||
pub struct CapturedFrame {
|
pub struct CapturedFrame {
|
||||||
pub width: u32,
|
pub width: u32,
|
||||||
pub height: u32,
|
pub height: u32,
|
||||||
pub pts_ns: u64,
|
pub pts_ns: u64,
|
||||||
/// Pixel layout of `cpu_bytes`.
|
/// Pixel layout of the payload.
|
||||||
pub format: PixelFormat,
|
pub format: PixelFormat,
|
||||||
/// Tightly-packed pixels in `format`, `width * height * format.bytes_per_pixel()`
|
pub payload: FramePayload,
|
||||||
/// bytes (no row padding).
|
}
|
||||||
pub cpu_bytes: Vec<u8>,
|
|
||||||
|
/// Where a captured frame's pixels live.
|
||||||
|
pub enum FramePayload {
|
||||||
|
/// Tightly-packed CPU pixels in `format`, `width*height*bytes_per_pixel` (no row padding).
|
||||||
|
Cpu(Vec<u8>),
|
||||||
|
/// A pitched GPU buffer (BGRA-order, on the shared CUDA context) — the zero-copy path. The
|
||||||
|
/// dmabuf has already been imported + copied into this owned device buffer.
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
Cuda(crate::zerocopy::DeviceBuffer),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CapturedFrame {
|
||||||
|
/// True if the frame's pixels are a GPU/CUDA buffer (the zero-copy path).
|
||||||
|
pub fn is_cuda(&self) -> bool {
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
{
|
||||||
|
matches!(self.payload, FramePayload::Cuda(_))
|
||||||
|
}
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
{
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Produces frames from a captured output. Lives on its own thread, feeding the encoder
|
/// Produces frames from a captured output. Lives on its own thread, feeding the encoder
|
||||||
@@ -130,7 +153,7 @@ impl Capturer for SyntheticCapturer {
|
|||||||
height: self.height,
|
height: self.height,
|
||||||
pts_ns,
|
pts_ns,
|
||||||
format: PixelFormat::Bgrx,
|
format: PixelFormat::Bgrx,
|
||||||
cpu_bytes: self.buf.clone(),
|
payload: FramePayload::Cpu(self.buf.clone()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -173,7 +196,7 @@ impl Capturer for FastSyntheticCapturer {
|
|||||||
height: self.height,
|
height: self.height,
|
||||||
pts_ns: 0,
|
pts_ns: 0,
|
||||||
format: PixelFormat::Bgrx,
|
format: PixelFormat::Bgrx,
|
||||||
cpu_bytes: self.buf.clone(),
|
payload: FramePayload::Cpu(self.buf.clone()),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
//! graceful stop (pipewire `channel` quit + Session close) belongs with the M2 session
|
//! graceful stop (pipewire `channel` quit + Session close) belongs with the M2 session
|
||||||
//! lifecycle.
|
//! lifecycle.
|
||||||
|
|
||||||
use super::{CapturedFrame, Capturer, PixelFormat};
|
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
use std::os::fd::OwnedFd;
|
use std::os::fd::OwnedFd;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
@@ -148,7 +148,7 @@ fn portal_thread(setup_tx: std::sync::mpsc::Sender<Result<(OwnedFd, u32), String
|
|||||||
.select_sources(
|
.select_sources(
|
||||||
&session,
|
&session,
|
||||||
SelectSourcesOptions::default()
|
SelectSourcesOptions::default()
|
||||||
.set_cursor_mode(CursorMode::Hidden)
|
.set_cursor_mode(CursorMode::Embedded)
|
||||||
// Only MONITOR is offered by the wlroots backend
|
// Only MONITOR is offered by the wlroots backend
|
||||||
// (AvailableSourceTypes=1); requesting unsupported types
|
// (AvailableSourceTypes=1); requesting unsupported types
|
||||||
// invalidates the session.
|
// invalidates the session.
|
||||||
@@ -251,7 +251,7 @@ fn portal_thread_remote_desktop(setup_tx: std::sync::mpsc::Sender<Result<(OwnedF
|
|||||||
.select_sources(
|
.select_sources(
|
||||||
&session,
|
&session,
|
||||||
SelectSourcesOptions::default()
|
SelectSourcesOptions::default()
|
||||||
.set_cursor_mode(CursorMode::Hidden)
|
.set_cursor_mode(CursorMode::Embedded)
|
||||||
.set_sources(BitFlags::from_flag(SourceType::Monitor))
|
.set_sources(BitFlags::from_flag(SourceType::Monitor))
|
||||||
.set_multiple(false)
|
.set_multiple(false)
|
||||||
.set_persist_mode(PersistMode::DoNot),
|
.set_persist_mode(PersistMode::DoNot),
|
||||||
@@ -297,7 +297,7 @@ fn portal_thread_remote_desktop(setup_tx: std::sync::mpsc::Sender<Result<(OwnedF
|
|||||||
mod pipewire {
|
mod pipewire {
|
||||||
//! The PipeWire consumer, confined to its own thread (the PW types are `!Send`).
|
//! The PipeWire consumer, confined to its own thread (the PW types are `!Send`).
|
||||||
|
|
||||||
use super::{CapturedFrame, PixelFormat};
|
use super::{CapturedFrame, FramePayload, PixelFormat};
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use pipewire as pw;
|
use pipewire as pw;
|
||||||
use pw::{properties::properties, spa};
|
use pw::{properties::properties, spa};
|
||||||
@@ -462,7 +462,7 @@ mod pipewire {
|
|||||||
height: h as u32,
|
height: h as u32,
|
||||||
pts_ns,
|
pts_ns,
|
||||||
format: fmt,
|
format: fmt,
|
||||||
cpu_bytes: tight,
|
payload: FramePayload::Cpu(tight),
|
||||||
};
|
};
|
||||||
// Drop if the encoder is behind — never block the pipewire loop.
|
// Drop if the encoder is behind — never block the pipewire loop.
|
||||||
let _ = ud.tx.try_send(frame);
|
let _ = ud.tx.try_send(frame);
|
||||||
|
|||||||
@@ -50,9 +50,10 @@ pub trait Encoder: Send {
|
|||||||
fn flush(&mut self) -> Result<()>;
|
fn flush(&mut self) -> Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Open an NVENC encoder for packed RGB/BGR CPU frames of the given `format` and mode.
|
/// Open an NVENC encoder for frames of the given `format` and mode. When `cuda` is true the
|
||||||
/// `format`, `bitrate_bps`, `codec`, and the mode come from session negotiation; M0 takes
|
/// encoder takes GPU frames (`AV_PIX_FMT_CUDA`) from the zero-copy path; otherwise it takes
|
||||||
/// them from the first captured frame.
|
/// packed RGB/BGR CPU frames. `format`/`bitrate_bps`/`codec`/mode come from session
|
||||||
|
/// negotiation; the caller derives `cuda` from the first captured frame's payload.
|
||||||
pub fn open_video(
|
pub fn open_video(
|
||||||
codec: Codec,
|
codec: Codec,
|
||||||
format: PixelFormat,
|
format: PixelFormat,
|
||||||
@@ -60,15 +61,16 @@ pub fn open_video(
|
|||||||
height: u32,
|
height: u32,
|
||||||
fps: u32,
|
fps: u32,
|
||||||
bitrate_bps: u64,
|
bitrate_bps: u64,
|
||||||
|
cuda: bool,
|
||||||
) -> Result<Box<dyn Encoder>> {
|
) -> Result<Box<dyn Encoder>> {
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
{
|
{
|
||||||
let enc = linux::NvencEncoder::open(codec, format, width, height, fps, bitrate_bps)?;
|
let enc = linux::NvencEncoder::open(codec, format, width, height, fps, bitrate_bps, cuda)?;
|
||||||
Ok(Box::new(enc) as Box<dyn Encoder>)
|
Ok(Box::new(enc) as Box<dyn Encoder>)
|
||||||
}
|
}
|
||||||
#[cfg(not(target_os = "linux"))]
|
#[cfg(not(target_os = "linux"))]
|
||||||
{
|
{
|
||||||
let _ = (codec, format, width, height, fps, bitrate_bps);
|
let _ = (codec, format, width, height, fps, bitrate_bps, cuda);
|
||||||
anyhow::bail!("NVENC encode requires Linux (FFmpeg + NVIDIA driver)")
|
anyhow::bail!("NVENC encode requires Linux (FFmpeg + NVIDIA driver)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,12 +8,88 @@
|
|||||||
//! every IDR — the output is both a playable raw Annex-B stream and self-contained AUs.
|
//! every IDR — the output is both a playable raw Annex-B stream and self-contained AUs.
|
||||||
|
|
||||||
use super::{Codec, EncodedFrame, Encoder};
|
use super::{Codec, EncodedFrame, Encoder};
|
||||||
use crate::capture::{CapturedFrame, PixelFormat};
|
use crate::capture::{CapturedFrame, FramePayload, PixelFormat};
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, bail, Context, Result};
|
||||||
use ffmpeg::format::Pixel;
|
use ffmpeg::format::Pixel;
|
||||||
use ffmpeg::util::frame::Video as VideoFrame;
|
use ffmpeg::util::frame::Video as VideoFrame;
|
||||||
use ffmpeg::{codec, encoder, Dictionary, Packet, Rational};
|
use ffmpeg::{codec, encoder, Dictionary, Packet, Rational};
|
||||||
use ffmpeg_next as ffmpeg;
|
use ffmpeg_next as ffmpeg;
|
||||||
|
use std::os::raw::c_int;
|
||||||
|
|
||||||
|
use ffmpeg::ffi; // = ffmpeg_sys_next
|
||||||
|
|
||||||
|
/// `AVCUDADeviceContext` (libavutil/hwcontext_cuda.h) — not in the ffmpeg-sys bindings (the
|
||||||
|
/// crate doesn't allowlist that header), so mirror its stable 3-pointer layout. We set the
|
||||||
|
/// first field to *our* `CUcontext` so NVENC shares the context the EGL importer maps into.
|
||||||
|
#[repr(C)]
|
||||||
|
struct AVCUDADeviceContext {
|
||||||
|
cuda_ctx: *mut std::ffi::c_void, // CUcontext
|
||||||
|
stream: *mut std::ffi::c_void, // CUstream (null = default)
|
||||||
|
internal: *mut std::ffi::c_void, // filled by ctx_init
|
||||||
|
}
|
||||||
|
|
||||||
|
/// CUDA hardware-frame contexts that wrap our shared `CUcontext`, so `hevc_nvenc` reads the
|
||||||
|
/// imported device buffer directly. Owns two `AVBufferRef`s, unref'd on drop.
|
||||||
|
struct CudaHw {
|
||||||
|
device_ref: *mut ffi::AVBufferRef,
|
||||||
|
frames_ref: *mut ffi::AVBufferRef,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CudaHw {
|
||||||
|
/// Build a CUDA hwdevice wrapping `cu_ctx` and a frames pool (`sw_format` = `pixel`).
|
||||||
|
unsafe fn new(cu_ctx: *mut std::ffi::c_void, sw_format: Pixel, w: u32, h: u32) -> Result<Self> {
|
||||||
|
let mut device_ref = ffi::av_hwdevice_ctx_alloc(ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA);
|
||||||
|
if device_ref.is_null() {
|
||||||
|
bail!("av_hwdevice_ctx_alloc(CUDA) failed");
|
||||||
|
}
|
||||||
|
let dev_ctx = (*device_ref).data as *mut ffi::AVHWDeviceContext;
|
||||||
|
let cu = (*dev_ctx).hwctx as *mut AVCUDADeviceContext;
|
||||||
|
(*cu).cuda_ctx = cu_ctx; // share the importer's context
|
||||||
|
let r = ffi::av_hwdevice_ctx_init(device_ref);
|
||||||
|
if r < 0 {
|
||||||
|
ffi::av_buffer_unref(&mut device_ref);
|
||||||
|
bail!("av_hwdevice_ctx_init failed ({r})");
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut frames_ref = ffi::av_hwframe_ctx_alloc(device_ref);
|
||||||
|
if frames_ref.is_null() {
|
||||||
|
ffi::av_buffer_unref(&mut device_ref);
|
||||||
|
bail!("av_hwframe_ctx_alloc failed");
|
||||||
|
}
|
||||||
|
let fc = (*frames_ref).data as *mut ffi::AVHWFramesContext;
|
||||||
|
(*fc).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA;
|
||||||
|
(*fc).sw_format = pixel_to_av(sw_format);
|
||||||
|
(*fc).width = w as c_int;
|
||||||
|
(*fc).height = h as c_int;
|
||||||
|
(*fc).initial_pool_size = 0; // we supply the device pointers
|
||||||
|
let r = ffi::av_hwframe_ctx_init(frames_ref);
|
||||||
|
if r < 0 {
|
||||||
|
ffi::av_buffer_unref(&mut frames_ref);
|
||||||
|
ffi::av_buffer_unref(&mut device_ref);
|
||||||
|
bail!("av_hwframe_ctx_init failed ({r})");
|
||||||
|
}
|
||||||
|
Ok(CudaHw {
|
||||||
|
device_ref,
|
||||||
|
frames_ref,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for CudaHw {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
unsafe {
|
||||||
|
ffi::av_buffer_unref(&mut self.frames_ref);
|
||||||
|
ffi::av_buffer_unref(&mut self.device_ref);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `ffmpeg::format::Pixel` → raw `AVPixelFormat`.
|
||||||
|
fn pixel_to_av(p: Pixel) -> ffi::AVPixelFormat {
|
||||||
|
// `Pixel` is `#[repr(i32)]`-compatible with `AVPixelFormat` (the bindgen enum) via this
|
||||||
|
// documented conversion in ffmpeg-next.
|
||||||
|
ffi::AVPixelFormat::from(p)
|
||||||
|
}
|
||||||
|
|
||||||
/// Map a captured layout to the NVENC input pixel format, and whether a 3→4 byte expand is
|
/// Map a captured layout to the NVENC input pixel format, and whether a 3→4 byte expand is
|
||||||
/// needed (packed RGB/BGR have no padding byte; the NVENC `*0` formats do).
|
/// needed (packed RGB/BGR have no padding byte; the NVENC `*0` formats do).
|
||||||
@@ -30,11 +106,13 @@ fn nvenc_input(format: PixelFormat) -> (Pixel, bool) {
|
|||||||
|
|
||||||
pub struct NvencEncoder {
|
pub struct NvencEncoder {
|
||||||
enc: encoder::video::Encoder,
|
enc: encoder::video::Encoder,
|
||||||
/// Reusable 4-bpp input frame in `nvenc_pixel` (its plane stride may exceed width*4).
|
/// Reusable 4-bpp CPU input frame (CPU path only; `None` for the zero-copy/CUDA path).
|
||||||
/// Mutating it in place across frames is sound only because the encoder is opened with
|
/// Mutating it in place across frames is sound only because the encoder is opened with
|
||||||
/// `delay=0`/`bf=0`/`max_b_frames=0` and the caller drains `poll()` after each `submit`,
|
/// `delay=0`/`bf=0`/`max_b_frames=0` and the caller drains `poll()` after each `submit`,
|
||||||
/// so libavcodec holds no reference to the previous frame's buffer when we overwrite it.
|
/// so libavcodec holds no reference to the previous frame's buffer when we overwrite it.
|
||||||
frame: VideoFrame,
|
frame: Option<VideoFrame>,
|
||||||
|
/// Zero-copy path: CUDA hwdevice/hwframes contexts (the encoder takes `AV_PIX_FMT_CUDA`).
|
||||||
|
cuda: Option<CudaHw>,
|
||||||
src_format: PixelFormat,
|
src_format: PixelFormat,
|
||||||
expand: bool,
|
expand: bool,
|
||||||
width: u32,
|
width: u32,
|
||||||
@@ -46,6 +124,10 @@ pub struct NvencEncoder {
|
|||||||
force_kf: bool,
|
force_kf: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// `CudaHw` holds raw `AVBufferRef`s; the encoder lives on a single thread. The CPU encoder is
|
||||||
|
// already `Send` via ffmpeg-next; assert it for the CUDA fields too.
|
||||||
|
unsafe impl Send for NvencEncoder {}
|
||||||
|
|
||||||
impl NvencEncoder {
|
impl NvencEncoder {
|
||||||
pub fn open(
|
pub fn open(
|
||||||
codec: Codec,
|
codec: Codec,
|
||||||
@@ -54,6 +136,7 @@ impl NvencEncoder {
|
|||||||
height: u32,
|
height: u32,
|
||||||
fps: u32,
|
fps: u32,
|
||||||
bitrate_bps: u64,
|
bitrate_bps: u64,
|
||||||
|
cuda: bool,
|
||||||
) -> Result<Self> {
|
) -> Result<Self> {
|
||||||
ffmpeg::init().context("ffmpeg init")?;
|
ffmpeg::init().context("ffmpeg init")?;
|
||||||
let name = codec.nvenc_name();
|
let name = codec.nvenc_name();
|
||||||
@@ -75,6 +158,23 @@ impl NvencEncoder {
|
|||||||
video.set_gop(fps.saturating_mul(2).max(1)); // ~2s keyframe interval
|
video.set_gop(fps.saturating_mul(2).max(1)); // ~2s keyframe interval
|
||||||
video.set_max_b_frames(0);
|
video.set_max_b_frames(0);
|
||||||
|
|
||||||
|
// For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA
|
||||||
|
// hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context
|
||||||
|
// *before* open (NVENC derives the device from `hw_frames_ctx`).
|
||||||
|
let cuda_hw = if cuda {
|
||||||
|
let cu_ctx = crate::zerocopy::cuda::context().context("shared CUDA context")?;
|
||||||
|
let hw = unsafe { CudaHw::new(cu_ctx, nvenc_pixel, width, height)? };
|
||||||
|
unsafe {
|
||||||
|
let raw = video.as_mut_ptr();
|
||||||
|
(*raw).pix_fmt = ffi::AVPixelFormat::AV_PIX_FMT_CUDA;
|
||||||
|
(*raw).hw_device_ctx = ffi::av_buffer_ref(hw.device_ref);
|
||||||
|
(*raw).hw_frames_ctx = ffi::av_buffer_ref(hw.frames_ref);
|
||||||
|
}
|
||||||
|
Some(hw)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
// Low-latency NVENC tuning (plan §7 / linux-setup doc).
|
// Low-latency NVENC tuning (plan §7 / linux-setup doc).
|
||||||
let mut opts = Dictionary::new();
|
let mut opts = Dictionary::new();
|
||||||
opts.set("preset", "p1"); // fastest
|
opts.set("preset", "p1"); // fastest
|
||||||
@@ -87,10 +187,15 @@ impl NvencEncoder {
|
|||||||
.open_with(opts)
|
.open_with(opts)
|
||||||
.with_context(|| format!("open {name} ({width}x{height}@{fps}, {bitrate_bps} bps)"))?;
|
.with_context(|| format!("open {name} ({width}x{height}@{fps}, {bitrate_bps} bps)"))?;
|
||||||
|
|
||||||
let frame = VideoFrame::new(nvenc_pixel, width, height);
|
let frame = if cuda {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(VideoFrame::new(nvenc_pixel, width, height))
|
||||||
|
};
|
||||||
Ok(NvencEncoder {
|
Ok(NvencEncoder {
|
||||||
enc,
|
enc,
|
||||||
frame,
|
frame,
|
||||||
|
cuda: cuda_hw,
|
||||||
src_format: format,
|
src_format: format,
|
||||||
expand,
|
expand,
|
||||||
width,
|
width,
|
||||||
@@ -112,53 +217,15 @@ impl Encoder for NvencEncoder {
|
|||||||
self.width,
|
self.width,
|
||||||
self.height
|
self.height
|
||||||
);
|
);
|
||||||
anyhow::ensure!(
|
let pts = self.frame_idx;
|
||||||
captured.format == self.src_format,
|
|
||||||
"captured format {:?} != encoder source {:?}",
|
|
||||||
captured.format,
|
|
||||||
self.src_format
|
|
||||||
);
|
|
||||||
let w = self.width as usize;
|
|
||||||
let h = self.height as usize;
|
|
||||||
let src_bpp = self.src_format.bytes_per_pixel();
|
|
||||||
let src_row = w * src_bpp;
|
|
||||||
anyhow::ensure!(
|
|
||||||
captured.cpu_bytes.len() >= src_row * h,
|
|
||||||
"captured buffer {} bytes < required {}",
|
|
||||||
captured.cpu_bytes.len(),
|
|
||||||
src_row * h
|
|
||||||
);
|
|
||||||
|
|
||||||
let stride = self.frame.stride(0); // dst is 4-bpp, aligned
|
|
||||||
let dst = self.frame.data_mut(0);
|
|
||||||
if self.expand {
|
|
||||||
// packed 3-bpp RGB/BGR → 4-bpp *0 (copy 3 bytes, zero the pad byte)
|
|
||||||
for y in 0..h {
|
|
||||||
let s = &captured.cpu_bytes[y * src_row..y * src_row + src_row];
|
|
||||||
let drow = &mut dst[y * stride..y * stride + w * 4];
|
|
||||||
for x in 0..w {
|
|
||||||
drow[x * 4..x * 4 + 3].copy_from_slice(&s[x * 3..x * 3 + 3]);
|
|
||||||
drow[x * 4 + 3] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// 4-bpp → 4-bpp, honoring the (possibly larger) dst stride
|
|
||||||
for y in 0..h {
|
|
||||||
dst[y * stride..y * stride + src_row]
|
|
||||||
.copy_from_slice(&captured.cpu_bytes[y * src_row..y * src_row + src_row]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.frame.set_pts(Some(self.frame_idx));
|
|
||||||
self.frame_idx += 1;
|
self.frame_idx += 1;
|
||||||
// Force an IDR when requested (client RFI); otherwise let NVENC pick (GOP/P-frame).
|
// Force an IDR when requested (client RFI); otherwise let NVENC pick (GOP/P-frame).
|
||||||
if self.force_kf {
|
let idr = self.force_kf;
|
||||||
self.frame.set_kind(ffmpeg::picture::Type::I);
|
|
||||||
self.force_kf = false;
|
self.force_kf = false;
|
||||||
} else {
|
match &captured.payload {
|
||||||
self.frame.set_kind(ffmpeg::picture::Type::None);
|
FramePayload::Cuda(buf) => self.submit_cuda(buf, pts, idr),
|
||||||
|
FramePayload::Cpu(bytes) => self.submit_cpu(bytes, captured.format, pts, idr),
|
||||||
}
|
}
|
||||||
self.enc.send_frame(&self.frame).context("send_frame")?;
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn request_keyframe(&mut self) {
|
fn request_keyframe(&mut self) {
|
||||||
@@ -196,3 +263,96 @@ impl Encoder for NvencEncoder {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl NvencEncoder {
|
||||||
|
/// CPU path: expand/copy the packed RGB/BGR bytes into the reusable 4-bpp frame, then send.
|
||||||
|
fn submit_cpu(&mut self, bytes: &[u8], format: PixelFormat, pts: i64, idr: bool) -> Result<()> {
|
||||||
|
anyhow::ensure!(
|
||||||
|
format == self.src_format,
|
||||||
|
"captured format {:?} != encoder source {:?}",
|
||||||
|
format,
|
||||||
|
self.src_format
|
||||||
|
);
|
||||||
|
let w = self.width as usize;
|
||||||
|
let h = self.height as usize;
|
||||||
|
let src_bpp = self.src_format.bytes_per_pixel();
|
||||||
|
let src_row = w * src_bpp;
|
||||||
|
anyhow::ensure!(
|
||||||
|
bytes.len() >= src_row * h,
|
||||||
|
"captured buffer {} bytes < required {}",
|
||||||
|
bytes.len(),
|
||||||
|
src_row * h
|
||||||
|
);
|
||||||
|
let frame = self
|
||||||
|
.frame
|
||||||
|
.as_mut()
|
||||||
|
.context("CPU frame missing (encoder opened in CUDA mode)")?;
|
||||||
|
let stride = frame.stride(0); // dst is 4-bpp, aligned
|
||||||
|
let dst = frame.data_mut(0);
|
||||||
|
if self.expand {
|
||||||
|
// packed 3-bpp RGB/BGR → 4-bpp *0 (copy 3 bytes, zero the pad byte)
|
||||||
|
for y in 0..h {
|
||||||
|
let s = &bytes[y * src_row..y * src_row + src_row];
|
||||||
|
let drow = &mut dst[y * stride..y * stride + w * 4];
|
||||||
|
for x in 0..w {
|
||||||
|
drow[x * 4..x * 4 + 3].copy_from_slice(&s[x * 3..x * 3 + 3]);
|
||||||
|
drow[x * 4 + 3] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// 4-bpp → 4-bpp, honoring the (possibly larger) dst stride
|
||||||
|
for y in 0..h {
|
||||||
|
dst[y * stride..y * stride + src_row]
|
||||||
|
.copy_from_slice(&bytes[y * src_row..y * src_row + src_row]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
frame.set_pts(Some(pts));
|
||||||
|
frame.set_kind(if idr {
|
||||||
|
ffmpeg::picture::Type::I
|
||||||
|
} else {
|
||||||
|
ffmpeg::picture::Type::None
|
||||||
|
});
|
||||||
|
self.enc.send_frame(frame).context("send_frame")?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Zero-copy path: wrap the imported CUDA device buffer in an `AV_PIX_FMT_CUDA` frame and
|
||||||
|
/// send it straight to NVENC (no CPU touch). `buf.ptr` aliases device memory we own, so
|
||||||
|
/// `buf[0]` is left null (ffmpeg must not free it); the frame shell is freed after send.
|
||||||
|
fn submit_cuda(
|
||||||
|
&mut self,
|
||||||
|
buf: &crate::zerocopy::DeviceBuffer,
|
||||||
|
pts: i64,
|
||||||
|
idr: bool,
|
||||||
|
) -> Result<()> {
|
||||||
|
let frames_ref = self
|
||||||
|
.cuda
|
||||||
|
.as_ref()
|
||||||
|
.context("CUDA hw context missing (encoder opened in CPU mode)")?
|
||||||
|
.frames_ref;
|
||||||
|
unsafe {
|
||||||
|
let mut f = ffi::av_frame_alloc();
|
||||||
|
if f.is_null() {
|
||||||
|
bail!("av_frame_alloc failed");
|
||||||
|
}
|
||||||
|
(*f).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA as c_int;
|
||||||
|
(*f).width = self.width as c_int;
|
||||||
|
(*f).height = self.height as c_int;
|
||||||
|
(*f).hw_frames_ctx = ffi::av_buffer_ref(frames_ref);
|
||||||
|
(*f).data[0] = buf.ptr as *mut u8;
|
||||||
|
(*f).linesize[0] = buf.pitch as c_int;
|
||||||
|
(*f).pts = pts;
|
||||||
|
(*f).pict_type = if idr {
|
||||||
|
ffi::AVPictureType::AV_PICTURE_TYPE_I
|
||||||
|
} else {
|
||||||
|
ffi::AVPictureType::AV_PICTURE_TYPE_NONE
|
||||||
|
};
|
||||||
|
let r = ffi::avcodec_send_frame(self.enc.as_mut_ptr(), f);
|
||||||
|
ffi::av_frame_free(&mut f);
|
||||||
|
if r < 0 {
|
||||||
|
bail!("avcodec_send_frame(CUDA) failed ({r})");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -121,6 +121,7 @@ fn stream_body(
|
|||||||
frame.height,
|
frame.height,
|
||||||
cfg.fps,
|
cfg.fps,
|
||||||
cfg.bitrate_kbps as u64 * 1000,
|
cfg.bitrate_kbps as u64 * 1000,
|
||||||
|
frame.is_cuda(),
|
||||||
)
|
)
|
||||||
.context("open NVENC for stream")?;
|
.context("open NVENC for stream")?;
|
||||||
// FEC overhead percent (Sunshine default 20). Override with LUMEN_FEC_PCT (0 = data-only).
|
// FEC overhead percent (Sunshine default 20). Override with LUMEN_FEC_PCT (0 = data-only).
|
||||||
|
|||||||
@@ -75,8 +75,15 @@ pub fn run(opts: Options) -> Result<()> {
|
|||||||
bitrate_bps = opts.bitrate_bps,
|
bitrate_bps = opts.bitrate_bps,
|
||||||
"opening NVENC encoder"
|
"opening NVENC encoder"
|
||||||
);
|
);
|
||||||
let mut encoder =
|
let mut encoder = encode::open_video(
|
||||||
encode::open_video(opts.codec, first.format, w, h, opts.fps, opts.bitrate_bps)
|
opts.codec,
|
||||||
|
first.format,
|
||||||
|
w,
|
||||||
|
h,
|
||||||
|
opts.fps,
|
||||||
|
opts.bitrate_bps,
|
||||||
|
first.is_cuda(),
|
||||||
|
)
|
||||||
.context("open encoder")?;
|
.context("open encoder")?;
|
||||||
|
|
||||||
let mut sink = BufWriter::new(
|
let mut sink = BufWriter::new(
|
||||||
|
|||||||
@@ -23,6 +23,8 @@ mod pipeline;
|
|||||||
mod pwinit;
|
mod pwinit;
|
||||||
mod vdisplay;
|
mod vdisplay;
|
||||||
mod web;
|
mod web;
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
mod zerocopy;
|
||||||
|
|
||||||
use anyhow::{bail, Result};
|
use anyhow::{bail, Result};
|
||||||
use encode::Codec;
|
use encode::Codec;
|
||||||
@@ -52,6 +54,9 @@ fn real_main() -> Result<()> {
|
|||||||
// Standalone input-injection smoke test (no client needed): open the session's input
|
// Standalone input-injection smoke test (no client needed): open the session's input
|
||||||
// backend and inject a scripted mouse/keyboard pattern. Watch a focused app / `wev`.
|
// backend and inject a scripted mouse/keyboard pattern. Watch a focused app / `wev`.
|
||||||
Some("input-test") => input_test(),
|
Some("input-test") => input_test(),
|
||||||
|
// Zero-copy FFI/GPU probe: init the EGL importer + CUDA context (no capture needed).
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
Some("zerocopy-probe") => zerocopy::probe(),
|
||||||
// M0 pipeline spike.
|
// M0 pipeline spike.
|
||||||
Some("m0") => m0::run(parse_m0(&args[1..])?),
|
Some("m0") => m0::run(parse_m0(&args[1..])?),
|
||||||
Some("-h") | Some("--help") | Some("help") | None => {
|
Some("-h") | Some("--help") | Some("help") | None => {
|
||||||
|
|||||||
@@ -0,0 +1,271 @@
|
|||||||
|
//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the EGL-interop
|
||||||
|
//! driver calls (`cuGraphicsEGLRegisterImage` / `cuGraphicsResourceGetMappedEglFrame`) nor
|
||||||
|
//! `CUeglFrame`, so we hand-roll exactly what we need and link `libcuda.so.1` (the driver
|
||||||
|
//! library — NOT `libcudart`). Symbol names verified against `cust_raw` + `cudaEGL.h`: the
|
||||||
|
//! context/mem ops use the `_v2` ABI suffix; the graphics/EGL-interop ops are unsuffixed.
|
||||||
|
//!
|
||||||
|
//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
|
||||||
|
//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.
|
||||||
|
|
||||||
|
#![allow(non_camel_case_types, non_snake_case)]
|
||||||
|
|
||||||
|
use anyhow::{bail, Result};
|
||||||
|
use std::os::raw::{c_int, c_uint, c_void};
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
pub type CUresult = c_uint; // CUDA_SUCCESS == 0
pub type CUdevice = c_int;
pub type CUcontext = *mut c_void; // opaque CUctx_st*
pub type CUstream = *mut c_void; // opaque CUstream_st*
pub type CUdeviceptr = u64;
pub type CUgraphicsResource = *mut c_void;
pub type CUarray = *mut c_void;

/// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;

/// `CUeglFrameType`: ARRAY=0, PITCH=1.
pub const CU_EGL_FRAME_TYPE_ARRAY: c_uint = 0;
pub const CU_EGL_FRAME_TYPE_PITCH: c_uint = 1;

/// `CUeglFrame` — exact layout from `cudaEGL.h`. `frame` is a union of `CUarray pArray[3]` and
/// `void* pPitch[3]`; both are three pointers, so `[*mut c_void; 3]` models it.
#[repr(C)]
pub struct CUeglFrame {
    pub frame: [*mut c_void; 3],
    pub width: c_uint,
    pub height: c_uint,
    pub depth: c_uint,
    pub pitch: c_uint,
    pub planeCount: c_uint,
    pub numChannels: c_uint,
    pub frameType: c_uint,
    pub eglColorFormat: c_uint,
    pub cuFormat: c_uint,
}

/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
///
/// `Default` is implemented by hand below (all-zero), matching `CUeglFrame`, rather than
/// derived: the derive needs `Default` for the raw-pointer fields, which older toolchains
/// do not provide.
#[repr(C)]
pub struct CUDA_MEMCPY2D {
    pub srcXInBytes: usize,
    pub srcY: usize,
    pub srcMemoryType: c_uint,
    pub srcHost: *const c_void,
    pub srcDevice: CUdeviceptr,
    pub srcArray: CUarray,
    pub srcPitch: usize,
    pub dstXInBytes: usize,
    pub dstY: usize,
    pub dstMemoryType: c_uint,
    pub dstHost: *mut c_void,
    pub dstDevice: CUdeviceptr,
    pub dstArray: CUarray,
    pub dstPitch: usize,
    pub WidthInBytes: usize,
    pub Height: usize,
}

impl Default for CUDA_MEMCPY2D {
    /// All offsets, pitches and extents zero; all pointers/handles null.
    fn default() -> Self {
        // SAFETY: all fields are integers or pointers; zero is a valid bit pattern.
        unsafe { std::mem::zeroed() }
    }
}

impl Default for CUeglFrame {
    /// All dimensions and format codes zero; all plane pointers null.
    fn default() -> Self {
        // SAFETY: all fields are integers or pointers; zero is a valid bit pattern.
        unsafe { std::mem::zeroed() }
    }
}
|
||||||
|
|
||||||
|
#[link(name = "cuda")]
|
||||||
|
extern "C" {
|
||||||
|
fn cuInit(flags: c_uint) -> CUresult;
|
||||||
|
fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
|
||||||
|
fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult;
|
||||||
|
fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult;
|
||||||
|
fn cuMemAllocPitch_v2(
|
||||||
|
dptr: *mut CUdeviceptr,
|
||||||
|
pitch: *mut usize,
|
||||||
|
width_bytes: usize,
|
||||||
|
height: usize,
|
||||||
|
element_size: c_uint,
|
||||||
|
) -> CUresult;
|
||||||
|
fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult;
|
||||||
|
fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult;
|
||||||
|
fn cuCtxSynchronize() -> CUresult;
|
||||||
|
|
||||||
|
fn cuGraphicsEGLRegisterImage(
|
||||||
|
resource: *mut CUgraphicsResource,
|
||||||
|
image: *mut c_void, // EGLImage
|
||||||
|
flags: c_uint, // CU_GRAPHICS_REGISTER_FLAGS_NONE = 0
|
||||||
|
) -> CUresult;
|
||||||
|
fn cuGraphicsResourceGetMappedEglFrame(
|
||||||
|
egl_frame: *mut CUeglFrame,
|
||||||
|
resource: CUgraphicsResource,
|
||||||
|
index: c_uint,
|
||||||
|
mip_level: c_uint,
|
||||||
|
) -> CUresult;
|
||||||
|
fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn ck(r: CUresult, what: &str) -> Result<()> {
|
||||||
|
if r == 0 {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
bail!("CUDA driver error {r} in {what}")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
|
||||||
|
/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct Context(pub CUcontext);
|
||||||
|
unsafe impl Send for Context {}
|
||||||
|
unsafe impl Sync for Context {}
|
||||||
|
|
||||||
|
static CONTEXT: OnceLock<Context> = OnceLock::new();
|
||||||
|
|
||||||
|
/// Get (lazily creating) the shared CUDA context on device 0.
|
||||||
|
pub fn context() -> Result<CUcontext> {
|
||||||
|
if let Some(c) = CONTEXT.get() {
|
||||||
|
return Ok(c.0);
|
||||||
|
}
|
||||||
|
let ctx = unsafe {
|
||||||
|
ck(cuInit(0), "cuInit")?;
|
||||||
|
let mut dev: CUdevice = 0;
|
||||||
|
ck(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?;
|
||||||
|
let mut ctx: CUcontext = std::ptr::null_mut();
|
||||||
|
ck(cuCtxCreate_v2(&mut ctx, 0, dev), "cuCtxCreate_v2")?;
|
||||||
|
ctx
|
||||||
|
};
|
||||||
|
// Racy first-init is fine: the winner's context is used; a loser leaks one context (rare,
|
||||||
|
// process-lifetime). `get_or_init` keeps a single shared value.
|
||||||
|
Ok(CONTEXT.get_or_init(|| Context(ctx)).0)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Make the shared context current on the calling thread (required before any CUDA op here).
|
||||||
|
pub fn make_current() -> Result<()> {
|
||||||
|
let ctx = context()?;
|
||||||
|
unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A device buffer we own (pitched), freed on drop. Used as the zero-copy frame the encoder
|
||||||
|
/// reads — filled by a device-to-device copy from the EGL-mapped dmabuf so the dmabuf can be
|
||||||
|
/// returned to the compositor immediately.
|
||||||
|
pub struct DeviceBuffer {
|
||||||
|
pub ptr: CUdeviceptr,
|
||||||
|
pub pitch: usize,
|
||||||
|
pub width: u32,
|
||||||
|
pub height: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DeviceBuffer {
|
||||||
|
/// Allocate a pitched device buffer for `width`x`height` 4-byte (BGRA) pixels.
|
||||||
|
pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
|
||||||
|
let mut ptr: CUdeviceptr = 0;
|
||||||
|
let mut pitch: usize = 0;
|
||||||
|
unsafe {
|
||||||
|
ck(
|
||||||
|
cuMemAllocPitch_v2(&mut ptr, &mut pitch, width as usize * 4, height as usize, 16),
|
||||||
|
"cuMemAllocPitch_v2",
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
Ok(DeviceBuffer {
|
||||||
|
ptr,
|
||||||
|
pitch,
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for DeviceBuffer {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if self.ptr != 0 {
|
||||||
|
// The buffer may be freed on the encode thread; cuMemFree needs a current context.
|
||||||
|
unsafe {
|
||||||
|
if let Some(c) = CONTEXT.get() {
|
||||||
|
let _ = cuCtxSetCurrent(c.0);
|
||||||
|
}
|
||||||
|
let _ = cuMemFree_v2(self.ptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A live EGL→CUDA registration. The mapped device memory aliases the dmabuf, so we copy out of
|
||||||
|
/// it immediately and then unregister (the EGL image is destroyed by the caller).
|
||||||
|
pub struct MappedImage {
|
||||||
|
resource: CUgraphicsResource,
|
||||||
|
/// `frameType` (ARRAY vs PITCH) determines how to copy out.
|
||||||
|
frame: CUeglFrame,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MappedImage {
|
||||||
|
/// Register an `EGLImage` with CUDA and map it to a `CUeglFrame`.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
/// `image` must be a valid `EGLImage`; the shared context must be current on this thread.
|
||||||
|
pub unsafe fn register(image: *mut c_void) -> Result<MappedImage> {
|
||||||
|
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
||||||
|
ck(
|
||||||
|
cuGraphicsEGLRegisterImage(&mut resource, image, 0),
|
||||||
|
"cuGraphicsEGLRegisterImage",
|
||||||
|
)?;
|
||||||
|
let mut frame = CUeglFrame::default();
|
||||||
|
let r = cuGraphicsResourceGetMappedEglFrame(&mut frame, resource, 0, 0);
|
||||||
|
if r != 0 {
|
||||||
|
let _ = cuGraphicsUnregisterResource(resource);
|
||||||
|
bail!("cuGraphicsResourceGetMappedEglFrame error {r}");
|
||||||
|
}
|
||||||
|
Ok(MappedImage { resource, frame })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Device-to-device copy of this mapped frame into `dst` (de-tiling if the source is a tiled
|
||||||
|
/// CUarray). After this returns the dmabuf is no longer needed.
|
||||||
|
pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> {
|
||||||
|
let width_bytes = (self.frame.width as usize).min(dst.width as usize) * 4;
|
||||||
|
let height = (self.frame.height as usize).min(dst.height as usize);
|
||||||
|
let mut copy = CUDA_MEMCPY2D {
|
||||||
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
dstDevice: dst.ptr,
|
||||||
|
dstPitch: dst.pitch,
|
||||||
|
WidthInBytes: width_bytes,
|
||||||
|
Height: height,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
match self.frame.frameType {
|
||||||
|
CU_EGL_FRAME_TYPE_PITCH => {
|
||||||
|
copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
||||||
|
copy.srcDevice = self.frame.frame[0] as CUdeviceptr;
|
||||||
|
copy.srcPitch = self.frame.pitch as usize;
|
||||||
|
}
|
||||||
|
CU_EGL_FRAME_TYPE_ARRAY => {
|
||||||
|
copy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
|
||||||
|
copy.srcArray = self.frame.frame[0] as CUarray;
|
||||||
|
}
|
||||||
|
other => bail!("unexpected CUeglFrame frameType {other}"),
|
||||||
|
}
|
||||||
|
unsafe {
|
||||||
|
ck(cuMemcpy2D_v2(©), "cuMemcpy2D_v2")?;
|
||||||
|
// The copy must complete before the dmabuf is requeued / reused.
|
||||||
|
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn color_format(&self) -> c_uint {
|
||||||
|
self.frame.eglColorFormat
|
||||||
|
}
|
||||||
|
pub fn frame_kind(&self) -> c_uint {
|
||||||
|
self.frame.frameType
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for MappedImage {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.resource.is_null() {
|
||||||
|
unsafe {
|
||||||
|
let _ = cuGraphicsUnregisterResource(self.resource);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,173 @@
|
|||||||
|
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (via the GBM
|
||||||
|
//! platform on the render node) and import a PipeWire dmabuf as an `EGLImage` with
|
||||||
|
//! `EGL_LINUX_DMA_BUF_EXT`. The DRM format **modifier** is mandatory on NVIDIA (its buffers are
|
||||||
|
//! tiled; importing without the modifier yields a corrupt image or `EGL_BAD_MATCH`). The image
|
||||||
|
//! is then handed to CUDA (`cuGraphicsEGLRegisterImage`) and copied device-to-device into an
|
||||||
|
//! owned buffer so the dmabuf can be returned to the compositor immediately.
|
||||||
|
|
||||||
|
#![allow(non_upper_case_globals)]
|
||||||
|
|
||||||
|
use super::cuda::{self, DeviceBuffer, MappedImage};
|
||||||
|
use anyhow::{ensure, Context as _, Result};
|
||||||
|
use khronos_egl as egl;
|
||||||
|
use std::os::raw::{c_int, c_void};
|
||||||
|
|
||||||
|
// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
|
||||||
|
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
|
||||||
|
const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
|
||||||
|
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
|
||||||
|
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
|
||||||
|
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
|
||||||
|
const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
|
||||||
|
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
|
||||||
|
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
|
||||||
|
|
||||||
|
// The two libgbm entry points needed to back a headless EGL display.
#[link(name = "gbm")]
extern "C" {
    /// Create a GBM device on an open DRM file descriptor; returns null on failure.
    fn gbm_create_device(fd: c_int) -> *mut c_void;
    /// Destroy a device returned by `gbm_create_device`.
    fn gbm_device_destroy(device: *mut c_void);
}
|
||||||
|
|
||||||
|
/// Description of a single dmabuf plane as delivered by PipeWire (BGRx frames have exactly one).
#[derive(Clone, Copy, Debug)]
pub struct DmabufPlane {
    /// The dmabuf file descriptor.
    pub fd: i32,
    /// Byte offset of the plane inside the dmabuf.
    pub offset: u32,
    /// Row stride of the plane in bytes.
    pub stride: u32,
}
|
||||||
|
|
||||||
|
type Egl = egl::DynamicInstance<egl::EGL1_5>;
|
||||||
|
|
||||||
|
/// Headless EGL display + GBM device used to import dmabufs. Lives on the capture thread.
|
||||||
|
pub struct EglImporter {
|
||||||
|
egl: Egl,
|
||||||
|
display: egl::Display,
|
||||||
|
no_ctx: egl::Context,
|
||||||
|
gbm: *mut c_void,
|
||||||
|
render_fd: c_int,
|
||||||
|
}
|
||||||
|
|
||||||
|
// The EGL/GBM handles are confined to the capture thread; the struct is moved there once.
|
||||||
|
unsafe impl Send for EglImporter {}
|
||||||
|
|
||||||
|
impl EglImporter {
|
||||||
|
/// Open the render node, create a GBM device, and a headless EGLDisplay on it. Also forces
|
||||||
|
/// the shared CUDA context to exist (so a later `import` only touches the hot path).
|
||||||
|
pub fn new() -> Result<EglImporter> {
|
||||||
|
let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap();
|
||||||
|
let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
|
||||||
|
ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM");
|
||||||
|
let gbm = unsafe { gbm_create_device(render_fd) };
|
||||||
|
if gbm.is_null() {
|
||||||
|
unsafe { libc::close(render_fd) };
|
||||||
|
anyhow::bail!("gbm_create_device failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
let egl: Egl =
|
||||||
|
unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
|
||||||
|
let display = unsafe {
|
||||||
|
egl.get_platform_display(
|
||||||
|
EGL_PLATFORM_GBM_KHR,
|
||||||
|
gbm as egl::NativeDisplayType,
|
||||||
|
&[egl::ATTRIB_NONE],
|
||||||
|
)
|
||||||
|
}
|
||||||
|
.context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
|
||||||
|
egl.initialize(display).context("eglInitialize")?;
|
||||||
|
|
||||||
|
let exts = egl
|
||||||
|
.query_string(Some(display), egl::EXTENSIONS)
|
||||||
|
.context("query EGL extensions")?
|
||||||
|
.to_string_lossy()
|
||||||
|
.into_owned();
|
||||||
|
ensure!(
|
||||||
|
exts.contains("EGL_EXT_image_dma_buf_import"),
|
||||||
|
"EGL lacks EGL_EXT_image_dma_buf_import"
|
||||||
|
);
|
||||||
|
ensure!(
|
||||||
|
exts.contains("EGL_EXT_image_dma_buf_import_modifiers"),
|
||||||
|
"EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Create the shared CUDA context up front so import() is pure hot path.
|
||||||
|
cuda::context().context("create CUDA context")?;
|
||||||
|
|
||||||
|
let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
|
||||||
|
tracing::info!("zero-copy EGL importer ready (GBM platform, dma_buf_import + modifiers)");
|
||||||
|
Ok(EglImporter {
|
||||||
|
egl,
|
||||||
|
display,
|
||||||
|
no_ctx,
|
||||||
|
gbm,
|
||||||
|
render_fd,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Import one dmabuf and copy it device-to-device into a fresh owned CUDA buffer.
|
||||||
|
/// `fourcc` is the DRM FourCC, `modifier` the 64-bit DRM format modifier from PipeWire.
|
||||||
|
pub fn import(
|
||||||
|
&self,
|
||||||
|
plane: &DmabufPlane,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
fourcc: u32,
|
||||||
|
modifier: u64,
|
||||||
|
) -> Result<DeviceBuffer> {
|
||||||
|
let attrs: [egl::Attrib; 19] = [
|
||||||
|
egl::WIDTH as egl::Attrib,
|
||||||
|
width as egl::Attrib,
|
||||||
|
egl::HEIGHT as egl::Attrib,
|
||||||
|
height as egl::Attrib,
|
||||||
|
EGL_LINUX_DRM_FOURCC_EXT,
|
||||||
|
fourcc as egl::Attrib,
|
||||||
|
EGL_DMA_BUF_PLANE0_FD_EXT,
|
||||||
|
plane.fd as egl::Attrib,
|
||||||
|
EGL_DMA_BUF_PLANE0_OFFSET_EXT,
|
||||||
|
plane.offset as egl::Attrib,
|
||||||
|
EGL_DMA_BUF_PLANE0_PITCH_EXT,
|
||||||
|
plane.stride as egl::Attrib,
|
||||||
|
EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
|
||||||
|
(modifier & 0xFFFF_FFFF) as egl::Attrib,
|
||||||
|
EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
|
||||||
|
(modifier >> 32) as egl::Attrib,
|
||||||
|
egl::ATTRIB_NONE,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
];
|
||||||
|
let client = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
|
||||||
|
let image = self
|
||||||
|
.egl
|
||||||
|
.create_image(
|
||||||
|
self.display,
|
||||||
|
self.no_ctx,
|
||||||
|
EGL_LINUX_DMA_BUF_EXT,
|
||||||
|
client,
|
||||||
|
&attrs[..17], // up to and including ATTRIB_NONE
|
||||||
|
)
|
||||||
|
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
||||||
|
|
||||||
|
// CUDA: register + map + copy out, then drop the registration and the EGL image.
|
||||||
|
let result = (|| -> Result<DeviceBuffer> {
|
||||||
|
cuda::make_current()?;
|
||||||
|
// SAFETY: `image` is a valid EGLImage we just created; context is current.
|
||||||
|
let mapped = unsafe { MappedImage::register(image.as_ptr()) }?;
|
||||||
|
let dst = DeviceBuffer::alloc(width, height)?;
|
||||||
|
mapped.copy_to(&dst)?;
|
||||||
|
Ok(dst)
|
||||||
|
})();
|
||||||
|
|
||||||
|
let _ = self.egl.destroy_image(self.display, image);
|
||||||
|
result
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for EglImporter {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if !self.gbm.is_null() {
|
||||||
|
unsafe { gbm_device_destroy(self.gbm) };
|
||||||
|
}
|
||||||
|
if self.render_fd >= 0 {
|
||||||
|
unsafe { libc::close(self.render_fd) };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
//! Zero-copy capture→encode (plan §9): the PipeWire dmabuf is imported into CUDA via EGL and
|
||||||
|
//! handed straight to NVENC, eliminating the per-frame CPU copies (at 5K the CPU-copy path
|
||||||
|
//! moves ~3.5 GB/s). Opt in with `LUMEN_ZEROCOPY=1`; the CPU-copy path stays the default and
|
||||||
|
//! the runtime fallback (foreign-allocator / no-dmabuf / import failure).
|
||||||
|
//!
|
||||||
|
//! Pieces: [`cuda`] (driver-API FFI + the shared `CUcontext` + device buffers), [`egl`] (the
|
||||||
|
//! headless EGLDisplay + dmabuf→`EGLImage`→CUDA import). The encoder's CUDA-frame path lives in
|
||||||
|
//! `encode/linux.rs`; the dmabuf negotiation lives in `capture/linux.rs`.
|
||||||
|
|
||||||
|
pub mod cuda;
|
||||||
|
pub mod egl;
|
||||||
|
|
||||||
|
pub use cuda::DeviceBuffer;
|
||||||
|
pub use egl::EglImporter;
|
||||||
|
|
||||||
|
/// Whether the zero-copy path is opted in (`LUMEN_ZEROCOPY` truthy).
///
/// Truthy values are `1`, `true`, `yes` and `on`, compared ASCII-case-insensitively after
/// trimming surrounding whitespace (so `TRUE` or `On` work too). An unset or non-Unicode
/// variable, or any other value, yields `false`.
pub fn enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    std::env::var("LUMEN_ZEROCOPY")
        .map(|raw| {
            let value = raw.trim();
            TRUTHY.iter().any(|truthy| value.eq_ignore_ascii_case(truthy))
        })
        .unwrap_or(false)
}
|
||||||
|
|
||||||
|
/// Pack a four-character format name into its DRM FourCC code: the first character ends up in
/// the least-significant byte (little-endian), e.g. `b"XR24"`.
const fn fourcc(c: &[u8; 4]) -> u32 {
    u32::from_le_bytes(*c)
}
|
||||||
|
|
||||||
|
/// Map a SPA/our [`crate::capture::PixelFormat`] to the DRM FourCC EGL expects for import.
|
||||||
|
/// SPA byte order `BGRx` ⇒ DRM `XRGB8888` (memory B,G,R,X), etc.
|
||||||
|
pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option<u32> {
|
||||||
|
use crate::capture::PixelFormat::*;
|
||||||
|
Some(match format {
|
||||||
|
Bgrx => fourcc(b"XR24"), // DRM_FORMAT_XRGB8888
|
||||||
|
Bgra => fourcc(b"AR24"), // DRM_FORMAT_ARGB8888
|
||||||
|
Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888
|
||||||
|
Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888
|
||||||
|
// 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
|
||||||
|
Rgb | Bgr => return None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Standalone probe (the `zerocopy-probe` subcommand): initialize the EGL importer + CUDA
|
||||||
|
/// context and report. De-risks the FFI/linking/GPU-access without needing a capture session.
|
||||||
|
pub fn probe() -> anyhow::Result<()> {
|
||||||
|
let _importer = EglImporter::new()?;
|
||||||
|
let ctx = cuda::context()?;
|
||||||
|
tracing::info!(cuda_ctx = ?ctx, "zero-copy probe OK — EGL display + CUDA context initialized");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user