refactor(windows-host): confine platform code under windows/ + linux/ folders (Goal-1 stage 6)
Move 36 platform-specific files into per-module `windows/` and `linux/` subfolders (and the
shared HID codecs into `inject/proto/`):
capture/{windows,linux}/ encode/{windows,linux}/ inject/{windows,linux,proto}/
audio/{windows,linux}/ vdisplay/{windows,linux}/
src/windows/ (service, wgc_helper, win_adapter, win_display)
src/linux/ (dmabuf_fence, drm_sync, zerocopy/)
Done with `#[path]`, NOT a module rename: every file moves into its folder while the
`crate::*::*` module names stay FLAT, so all caller paths and every internal `super::`/`crate::`
reference are unchanged — only the parent `mod` decls gained `#[path = "..."]`. This is the
codebase's existing pattern (inject's gamepad_windows) and makes the move byte-identical in
behaviour with ZERO reference churn, far lower risk than collapsing to a single
`crate::capture::windows::` namespace (that deeper rename is an optional follow-on; this delivers
the cfg-sprawl folder confinement the stage is about). Done LAST, after the semantic stages, so
the path churn didn't fight them.
Verified: Linux cargo check + clippy (-D warnings) clean; my mod-decl changes fmt-clean (the 3
remaining fmt diffs are pre-existing local-rustfmt-version skew that moved with their files); all
36 `#[path]` targets exist; no internal `#[path]`/`include!`/file-child-mod in any moved file
(the inline `mod X {` blocks are self-contained). Box build to follow.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
//! Consumer-side implicit-fence wait for dmabuf capture (`DMA_BUF_IOCTL_EXPORT_SYNC_FILE`).
|
||||
//!
|
||||
//! Mutter renders its virtual monitor DIRECTLY into the PipeWire dmabuf and hands the buffer over
|
||||
//! at GPU-submit time. With no fencing the consumer can sample mid-render and encode the buffer's
|
||||
//! *previous* contents — the "stale/old frame" flashing on NVIDIA (KWin/gamescope blit into the
|
||||
//! buffer so they don't hit this). The producer-driven fix is PipeWire explicit sync, but
|
||||
//! Mutter+NVIDIA can't produce a sync_fd (`error alloc buffers` / no cogl sync_fd).
|
||||
//!
|
||||
//! So sync from the *consumer* side instead: a dmabuf carries its in-flight GPU work as an implicit
|
||||
//! fence on its reservation object. `DMA_BUF_IOCTL_EXPORT_SYNC_FILE` snapshots that into a sync_file
|
||||
//! fd we can `poll()` — readable once the producer's writes complete. This makes zero-copy capture
|
||||
//! race-free WITHOUT the producer doing anything, *iff* the driver actually attaches the fence. If it
|
||||
//! attaches none, the export yields an already-signaled sync_file (poll returns immediately) — no
|
||||
//! wait, no harm, and `waited=false` tells us the driver doesn't fence (so zero-copy would still race).
|
||||
|
||||
use std::os::fd::RawFd;
|
||||
|
||||
// linux/dma-buf.h ioctls on the DMA_BUF_BASE ('b' = 0x62) magic. _IOWR = dir(3)<<30 | size<<16 | base<<8 | nr.
|
||||
const DMA_BUF_BASE: u64 = 0x62;

/// Assemble a read/write (`_IOWR`) ioctl request number on the dma-buf magic.
///
/// Bit layout: direction `3` shifted to bit 30, the argument `size` shifted to bit 16,
/// [`DMA_BUF_BASE`] shifted to bit 8, and the command number `nr` in the low bits.
const fn iowr(nr: u32, size: usize) -> u64 {
    const DIR_READ_WRITE: u64 = 3;
    let dir_bits = DIR_READ_WRITE << 30;
    let size_bits = (size as u64) << 16;
    let magic_bits = DMA_BUF_BASE << 8;
    dir_bits | size_bits | magic_bits | nr as u64
}
|
||||
|
||||
/// Argument block handed to `DMA_BUF_IOCTL_EXPORT_SYNC_FILE`.
///
/// `#[repr(C)]` keeps the two 4-byte fields in declaration order; the ioctl number is computed
/// from this struct's size.
#[repr(C)]
struct DmaBufExportSyncFile {
    /// Access-direction flags supplied by the caller (this file passes `DMA_BUF_SYNC_READ`).
    flags: u32,
    /// Exported sync_file descriptor, read back after the ioctl; callers pre-set it to `-1`.
    fd: i32,
}
|
||||
|
||||
const DMA_BUF_IOCTL_EXPORT_SYNC_FILE: u64 = iowr(2, std::mem::size_of::<DmaBufExportSyncFile>());
|
||||
/// We will READ the buffer → export the fence(s) we must wait for before reading (the producer's writes).
|
||||
const DMA_BUF_SYNC_READ: u32 = 1 << 0;
|
||||
|
||||
/// Wait until the producer's writes to `dmabuf_fd` complete (or `timeout_ms` elapses). Returns:
|
||||
/// - `Ok(true)` — a render was still in flight and we waited on its fence (the race was real, now closed).
|
||||
/// - `Ok(false)` — no fence / already signaled (the driver attaches no implicit fence; zero-copy can race).
|
||||
/// - `Err` — the ioctl failed (e.g. the kernel/driver lacks `EXPORT_SYNC_FILE`).
|
||||
pub fn wait_read_ready(dmabuf_fd: RawFd, timeout_ms: i32) -> std::io::Result<bool> {
|
||||
let mut req = DmaBufExportSyncFile {
|
||||
flags: DMA_BUF_SYNC_READ,
|
||||
fd: -1,
|
||||
};
|
||||
let r = unsafe { libc::ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &mut req) };
|
||||
if r < 0 {
|
||||
return Err(std::io::Error::last_os_error());
|
||||
}
|
||||
let sync_fd = req.fd;
|
||||
if sync_fd < 0 {
|
||||
return Ok(false); // no sync_file exported
|
||||
}
|
||||
let mut pfd = libc::pollfd {
|
||||
fd: sync_fd,
|
||||
events: libc::POLLIN,
|
||||
revents: 0,
|
||||
};
|
||||
// Non-blocking probe: not-yet-signaled (poll==0) means the producer is still rendering.
|
||||
let pending = unsafe { libc::poll(&mut pfd, 1, 0) } == 0;
|
||||
if pending {
|
||||
pfd.revents = 0;
|
||||
unsafe { libc::poll(&mut pfd, 1, timeout_ms) }; // block until the render fence signals
|
||||
}
|
||||
unsafe { libc::close(sync_fd) };
|
||||
Ok(pending)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The request number is computed rather than copied from linux/dma-buf.h, so pin the value.
    #[test]
    fn ioctl_number_matches_dma_buf_h() {
        let expected: u64 = 0xC008_6202;
        assert_eq!(DMA_BUF_IOCTL_EXPORT_SYNC_FILE, expected);
    }
}
|
||||
@@ -0,0 +1,217 @@
|
||||
//! Minimal DRM timeline-syncobj operations — the consumer side of PipeWire explicit sync
|
||||
//! (`SPA_META_SyncTimeline`).
|
||||
//!
|
||||
//! RETAINED BUT CURRENTLY UNUSED: producer-driven explicit sync is the "right" fix, but no
|
||||
//! compositor we target produces a usable sync_fd today — Mutter+NVIDIA fails buffer allocation
|
||||
//! (`error alloc buffers`, no cogl sync_fd), KWin/gamescope blit so they don't race at all. We sync
|
||||
//! zero-copy from the consumer side instead (see [`crate::dmabuf_fence`]). This module is kept,
|
||||
//! verified (ioctl numbers + a live signal→wait round trip), ready to wire in the moment a producer
|
||||
//! gains working `SPA_META_SyncTimeline`.
|
||||
#![allow(dead_code)]
|
||||
//!
|
||||
//! Compositors that render directly into the PipeWire buffer pool (Mutter's virtual
|
||||
//! monitors) hand buffers over at GPU-submit time; on drivers without implicit dmabuf
|
||||
//! fencing (NVIDIA) reading immediately races the render and shows the buffer's
|
||||
//! *previous* contents. With explicit sync the producer attaches a timeline syncobj:
|
||||
//! wait the acquire point before touching the buffer, signal the release point when done.
|
||||
//!
|
||||
//! Syncobjs are DRM-core objects: any render node can import and wait them, so this
|
||||
//! opens its own fd independent of the capture GPU path.
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use std::os::fd::RawFd;
|
||||
|
||||
// drm.h ioctls on the 'd' (0x64) magic. _IOWR = dir(3)<<30 | size<<16 | 0x64<<8 | nr.
|
||||
/// Assemble a read/write (`_IOWR`) ioctl request number on the DRM magic (`0x64`).
///
/// Bit layout: direction `3` shifted to bit 30, the argument `size` shifted to bit 16, the magic
/// shifted to bit 8, and the command number `nr` in the low bits.
const fn iowr(nr: u32, size: usize) -> u64 {
    const DIR_READ_WRITE: u64 = 3;
    const DRM_MAGIC: u64 = 0x64;
    let dir_bits = DIR_READ_WRITE << 30;
    let size_bits = (size as u64) << 16;
    let magic_bits = DRM_MAGIC << 8;
    dir_bits | size_bits | magic_bits | nr as u64
}
|
||||
|
||||
/// Argument block shared by the syncobj fd↔handle conversion ioctls in this file.
///
/// `FD_TO_HANDLE` is issued with `fd` filled in and `handle` read back; the round-trip test uses
/// the same layout in the opposite direction.
#[repr(C)]
#[derive(Default)]
struct DrmSyncobjHandle {
    /// Device-local syncobj handle.
    handle: u32,
    /// Request flags; always left at the `Default` value (0) in this file.
    flags: u32,
    /// Syncobj file descriptor.
    fd: i32,
    /// Padding; always left at 0 in this file.
    pad: u32,
}
|
||||
|
||||
/// Argument block for `DRM_IOCTL_SYNCOBJ_DESTROY`.
#[repr(C)]
#[derive(Default)]
struct DrmSyncobjDestroy {
    /// Device-local handle to release.
    handle: u32,
    /// Padding; always left at 0 in this file.
    pad: u32,
}
|
||||
|
||||
/// Argument block for `DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT`.
#[repr(C)]
#[derive(Default)]
struct DrmSyncobjTimelineWait {
    /// Address of a `u32` handle array, passed as an integer.
    handles: u64,
    /// Address of a `u64` timeline-point array, passed as an integer.
    points: u64,
    /// Absolute CLOCK_MONOTONIC deadline, nanoseconds.
    timeout_nsec: i64,
    /// Number of entries in the `handles` / `points` arrays.
    count_handles: u32,
    /// Wait flags (this file passes `DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT`).
    flags: u32,
    /// Never read by this file; left at 0 on input.
    first_signaled: u32,
    /// Padding; always left at 0 in this file.
    pad: u32,
}
|
||||
|
||||
/// Argument block for `DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL`.
#[repr(C)]
#[derive(Default)]
struct DrmSyncobjTimelineArray {
    /// Address of a `u32` handle array, passed as an integer.
    handles: u64,
    /// Address of a `u64` timeline-point array, passed as an integer.
    points: u64,
    /// Number of entries in the `handles` / `points` arrays.
    count_handles: u32,
    /// Request flags; this file passes 0.
    flags: u32,
}
|
||||
|
||||
const DRM_IOCTL_SYNCOBJ_DESTROY: u64 = iowr(0xC0, std::mem::size_of::<DrmSyncobjDestroy>());
|
||||
const DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE: u64 = iowr(0xC2, std::mem::size_of::<DrmSyncobjHandle>());
|
||||
const DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT: u64 =
|
||||
iowr(0xCA, std::mem::size_of::<DrmSyncobjTimelineWait>());
|
||||
const DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL: u64 =
|
||||
iowr(0xCD, std::mem::size_of::<DrmSyncobjTimelineArray>());
|
||||
|
||||
/// The producer's point may not be attached yet when the buffer reaches us.
|
||||
const DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT: u32 = 1 << 1;
|
||||
|
||||
/// Handle to a DRM render node used only for timeline-syncobj import / wait / signal.
///
/// Owns the descriptor: it is opened by `DrmSync::open` and closed when the value is dropped.
pub struct DrmSync {
    /// Raw descriptor of the opened render node.
    fd: RawFd,
}
|
||||
|
||||
impl DrmSync {
|
||||
pub fn open() -> Result<DrmSync> {
|
||||
let path = c"/dev/dri/renderD128";
|
||||
let fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
|
||||
if fd < 0 {
|
||||
bail!("open /dev/dri/renderD128 for syncobj ops: {}", errno());
|
||||
}
|
||||
Ok(DrmSync { fd })
|
||||
}
|
||||
|
||||
/// Import a syncobj fd into a (temporary) handle on our device.
|
||||
fn import(&self, syncobj_fd: RawFd) -> Result<u32> {
|
||||
let mut req = DrmSyncobjHandle {
|
||||
fd: syncobj_fd,
|
||||
..Default::default()
|
||||
};
|
||||
let r = unsafe { libc::ioctl(self.fd, DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &mut req) };
|
||||
if r < 0 {
|
||||
bail!("SYNCOBJ_FD_TO_HANDLE: {}", errno());
|
||||
}
|
||||
Ok(req.handle)
|
||||
}
|
||||
|
||||
fn destroy(&self, handle: u32) {
|
||||
let mut req = DrmSyncobjDestroy {
|
||||
handle,
|
||||
..Default::default()
|
||||
};
|
||||
unsafe { libc::ioctl(self.fd, DRM_IOCTL_SYNCOBJ_DESTROY, &mut req) };
|
||||
}
|
||||
|
||||
/// Block until `point` on the producer's timeline is signaled (the buffer's contents
|
||||
/// are ready), or `timeout_ms` passes.
|
||||
pub fn wait_point(&self, syncobj_fd: RawFd, point: u64, timeout_ms: u64) -> Result<()> {
|
||||
let handle = self.import(syncobj_fd)?;
|
||||
let mut now = libc::timespec {
|
||||
tv_sec: 0,
|
||||
tv_nsec: 0,
|
||||
};
|
||||
unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut now) };
|
||||
let deadline = now.tv_sec * 1_000_000_000 + now.tv_nsec + timeout_ms as i64 * 1_000_000;
|
||||
let handles = [handle];
|
||||
let points = [point];
|
||||
let mut req = DrmSyncobjTimelineWait {
|
||||
handles: handles.as_ptr() as u64,
|
||||
points: points.as_ptr() as u64,
|
||||
timeout_nsec: deadline,
|
||||
count_handles: 1,
|
||||
flags: DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT,
|
||||
..Default::default()
|
||||
};
|
||||
let r = unsafe { libc::ioctl(self.fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &mut req) };
|
||||
let saved = errno();
|
||||
self.destroy(handle);
|
||||
if r < 0 {
|
||||
bail!("SYNCOBJ_TIMELINE_WAIT(point {point}): {saved}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Signal `point` on the consumer release timeline — the producer may reuse the
|
||||
/// buffer. Must be called for every buffer that carried sync metadata, even when the
|
||||
/// frame was skipped, or the producer stalls waiting for it.
|
||||
pub fn signal_point(&self, syncobj_fd: RawFd, point: u64) -> Result<()> {
|
||||
let handle = self.import(syncobj_fd)?;
|
||||
let handles = [handle];
|
||||
let points = [point];
|
||||
let mut req = DrmSyncobjTimelineArray {
|
||||
handles: handles.as_ptr() as u64,
|
||||
points: points.as_ptr() as u64,
|
||||
count_handles: 1,
|
||||
flags: 0,
|
||||
};
|
||||
let r = unsafe { libc::ioctl(self.fd, DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &mut req) };
|
||||
let saved = errno();
|
||||
self.destroy(handle);
|
||||
if r < 0 {
|
||||
bail!("SYNCOBJ_TIMELINE_SIGNAL(point {point}): {saved}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DrmSync {
|
||||
fn drop(&mut self) {
|
||||
unsafe { libc::close(self.fd) };
|
||||
}
|
||||
}
|
||||
|
||||
/// Snapshot of the calling thread's most recent OS error as a [`std::io::Error`].
fn errno() -> std::io::Error {
    use std::io::Error;
    Error::last_os_error()
}
|
||||
|
||||
// `DrmSync::open` must not panic the PipeWire thread; everything is Result-based and the
|
||||
// caller degrades to unsynchronized capture (with a loud warning) when it fails.
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The request numbers are computed from struct sizes, so pin them to the drm.h values.
    #[test]
    fn ioctl_numbers_match_drm_h() {
        let expectations: [(u64, u64); 4] = [
            (DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, 0xC010_64C2),
            (DRM_IOCTL_SYNCOBJ_DESTROY, 0xC008_64C0),
            (DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, 0xC028_64CA),
            (DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, 0xC018_64CD),
        ];
        for (actual, expected) in expectations {
            assert_eq!(actual, expected);
        }
    }

    /// Round-trip against the real DRM device when one exists (CI containers skip).
    #[test]
    fn signal_then_wait_roundtrip() {
        let sync = match DrmSync::open() {
            Ok(sync) => sync,
            Err(_) => {
                eprintln!("no render node — skipping");
                return;
            }
        };

        // Create a fresh syncobj (CREATE = 0xBF), export it, signal point 1, wait point 1.
        #[repr(C)]
        #[derive(Default)]
        struct Create {
            handle: u32,
            flags: u32,
        }
        const CREATE: u64 = iowr(0xBF, std::mem::size_of::<Create>());
        const HANDLE_TO_FD: u64 = iowr(0xC1, std::mem::size_of::<DrmSyncobjHandle>());

        let mut created = Create::default();
        let create_rc = unsafe { libc::ioctl(sync.fd, CREATE, &mut created) };
        assert!(create_rc >= 0);

        let mut exported = DrmSyncobjHandle {
            handle: created.handle,
            ..Default::default()
        };
        let export_rc = unsafe { libc::ioctl(sync.fd, HANDLE_TO_FD, &mut exported) };
        assert!(export_rc >= 0);

        sync.signal_point(exported.fd, 1).expect("signal");
        sync.wait_point(exported.fd, 1, 100).expect("wait after signal");

        unsafe { libc::close(exported.fd) };
        sync.destroy(created.handle);
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,833 @@
|
||||
//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (GBM platform on
|
||||
//! the render node) and import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`.
|
||||
//! The DRM format **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without
|
||||
//! the modifier yields a corrupt image or `EGL_BAD_MATCH`).
|
||||
//!
|
||||
//! Desktop NVIDIA can't register a dmabuf `EGLImage` with CUDA directly — `cuGraphicsEGLRegisterImage`
|
||||
//! is Tegra-only and `cuGraphicsGLRegisterImage` rejects EGLImage-backed textures (their internal
|
||||
//! format is opaque). So we follow OBS/Sunshine: bind the `EGLImage` to a GL texture
|
||||
//! (`glEGLImageTargetTexture2DOES`), render it through a fullscreen-triangle shader into a plain
|
||||
//! immutable `GL_RGBA8` texture (de-tiling and swizzling to the BGRx the encoder wants), then
|
||||
//! register *that* texture with CUDA ([`MappedTexture`]) and copy it device-to-device into an
|
||||
//! owned [`DeviceBuffer`] so the dmabuf can be returned to the compositor immediately.
|
||||
|
||||
#![allow(non_upper_case_globals)]
|
||||
|
||||
use super::cuda::{self, DeviceBuffer};
|
||||
use anyhow::{bail, ensure, Context as _, Result};
|
||||
use khronos_egl as egl;
|
||||
use std::os::raw::{c_int, c_void};
|
||||
|
||||
// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
|
||||
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
|
||||
const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
|
||||
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
|
||||
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
|
||||
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
|
||||
const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
|
||||
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
|
||||
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
|
||||
|
||||
const GL_TEXTURE_2D: u32 = 0x0DE1;
|
||||
const GL_TEXTURE_MIN_FILTER: u32 = 0x2801;
|
||||
const GL_TEXTURE_MAG_FILTER: u32 = 0x2800;
|
||||
const GL_LINEAR: c_int = 0x2601;
|
||||
const GL_NEAREST: c_int = 0x2600;
|
||||
const GL_RGBA8: u32 = 0x8058;
|
||||
// Single/dual-channel 8-bit formats for the NV12 convert targets: R8 luma (full-res),
|
||||
// RG8 interleaved chroma (half-res). The `_RED`/`_RG` enums are the matching client formats.
|
||||
const GL_R8: u32 = 0x8229;
|
||||
const GL_RG8: u32 = 0x822B;
|
||||
// Client pixel format/type for texture uploads (self-test only): RGBA bytes.
|
||||
const GL_RGBA: u32 = 0x1908;
|
||||
const GL_UNSIGNED_BYTE: u32 = 0x1401;
|
||||
const GL_FRAMEBUFFER: u32 = 0x8D40;
|
||||
const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0;
|
||||
const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5;
|
||||
const GL_TEXTURE0: u32 = 0x84C0;
|
||||
const GL_TRIANGLES: u32 = 0x0004;
|
||||
const GL_VERTEX_SHADER: u32 = 0x8B31;
|
||||
const GL_FRAGMENT_SHADER: u32 = 0x8B30;
|
||||
const GL_COMPILE_STATUS: u32 = 0x8B81;
|
||||
const GL_LINK_STATUS: u32 = 0x8B82;
|
||||
|
||||
// libglvnd's libGL dispatches these to the NVIDIA driver based on the current EGL/GL context.
|
||||
#[link(name = "GL")]
|
||||
extern "C" {
|
||||
fn glGenTextures(n: c_int, textures: *mut u32);
|
||||
fn glBindTexture(target: u32, texture: u32);
|
||||
fn glTexParameteri(target: u32, pname: u32, param: c_int);
|
||||
fn glDeleteTextures(n: c_int, textures: *const u32);
|
||||
fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int);
|
||||
fn glGetError() -> u32;
|
||||
fn glGenFramebuffers(n: c_int, framebuffers: *mut u32);
|
||||
fn glDeleteFramebuffers(n: c_int, framebuffers: *const u32);
|
||||
fn glBindFramebuffer(target: u32, framebuffer: u32);
|
||||
fn glFramebufferTexture2D(
|
||||
target: u32,
|
||||
attachment: u32,
|
||||
textarget: u32,
|
||||
texture: u32,
|
||||
level: c_int,
|
||||
);
|
||||
fn glCheckFramebufferStatus(target: u32) -> u32;
|
||||
fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int);
|
||||
fn glGenVertexArrays(n: c_int, arrays: *mut u32);
|
||||
fn glDeleteVertexArrays(n: c_int, arrays: *const u32);
|
||||
fn glBindVertexArray(array: u32);
|
||||
fn glDrawArrays(mode: u32, first: c_int, count: c_int);
|
||||
fn glActiveTexture(texture: u32);
|
||||
fn glUseProgram(program: u32);
|
||||
fn glFlush();
|
||||
fn glCreateShader(shader_type: u32) -> u32;
|
||||
fn glShaderSource(shader: u32, count: c_int, string: *const *const i8, length: *const c_int);
|
||||
fn glCompileShader(shader: u32);
|
||||
fn glGetShaderiv(shader: u32, pname: u32, params: *mut c_int);
|
||||
fn glDeleteShader(shader: u32);
|
||||
fn glCreateProgram() -> u32;
|
||||
fn glAttachShader(program: u32, shader: u32);
|
||||
fn glLinkProgram(program: u32);
|
||||
fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int);
|
||||
fn glGetUniformLocation(program: u32, name: *const i8) -> c_int;
|
||||
fn glUniform1i(location: c_int, v0: c_int);
|
||||
fn glDeleteProgram(program: u32);
|
||||
fn glTexSubImage2D(
|
||||
target: u32,
|
||||
level: c_int,
|
||||
xoffset: c_int,
|
||||
yoffset: c_int,
|
||||
width: c_int,
|
||||
height: c_int,
|
||||
format: u32,
|
||||
type_: u32,
|
||||
pixels: *const c_void,
|
||||
);
|
||||
}
|
||||
|
||||
#[link(name = "gbm")]
|
||||
extern "C" {
|
||||
fn gbm_create_device(fd: c_int) -> *mut c_void;
|
||||
fn gbm_device_destroy(device: *mut c_void);
|
||||
}
|
||||
|
||||
/// `glEGLImageTargetTexture2DOES(target, EGLImage)` — loaded via `eglGetProcAddress`.
|
||||
type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void);
|
||||
|
||||
// Fullscreen-triangle blit: sample the dmabuf EGLImage texture and write it (swizzled to BGRA,
|
||||
// to match the BGRx the encoder expects) into a normal GL_RGBA8 texture that CUDA *can* register.
|
||||
const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n";
|
||||
const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n";
|
||||
|
||||
// NV12 BT.709 LIMITED-range convert from full-range RGB in [0,1]. Two passes share `VERT_SRC` and
|
||||
// the same source texture (the de-tiled dmabuf):
|
||||
// Y pass → GL_R8 luma, full-res: Y = (16 + 219·(0.2126R+0.7152G+0.0722B))/255
|
||||
// UV pass → GL_RG8 chroma, half-res (GL_LINEAR averages the 2×2 footprint):
|
||||
// U = (128 + 224·(-0.1146R-0.3854G+0.5000B))/255 → R channel
|
||||
// V = (128 + 224·( 0.5000R-0.4542G-0.0458B))/255 → G channel
|
||||
// RG8's (R=U, G=V) byte order matches NV12's interleaved [U,V]. All outputs clamped to [0,1].
|
||||
// Matches the Windows VideoConverter (BT.709, limited/studio range) so the two hosts look identical.
|
||||
const FRAG_Y_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float Y=(16.0+219.0*(0.2126*c.r+0.7152*c.g+0.0722*c.b))/255.0;o_color=vec4(clamp(Y,0.0,1.0),0.0,0.0,1.0);}\n";
|
||||
const FRAG_UV_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float U=(128.0+224.0*(-0.1146*c.r-0.3854*c.g+0.5000*c.b))/255.0;float V=(128.0+224.0*(0.5000*c.r-0.4542*c.g-0.0458*c.b))/255.0;o_color=vec4(clamp(U,0.0,1.0),clamp(V,0.0,1.0),0.0,1.0);}\n";
|
||||
|
||||
unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
|
||||
let sh = glCreateShader(kind);
|
||||
ensure!(sh != 0, "glCreateShader failed");
|
||||
let ptr = src.as_ptr() as *const i8;
|
||||
let len = src.len() as c_int;
|
||||
glShaderSource(sh, 1, &ptr, &len);
|
||||
glCompileShader(sh);
|
||||
let mut ok: c_int = 0;
|
||||
glGetShaderiv(sh, GL_COMPILE_STATUS, &mut ok);
|
||||
if ok == 0 {
|
||||
glDeleteShader(sh);
|
||||
bail!("GL shader compile failed");
|
||||
}
|
||||
Ok(sh)
|
||||
}
|
||||
|
||||
/// Compile+link the fullscreen-triangle program with fragment source `frag` and bind its `image`
|
||||
/// sampler to texture unit 0.
|
||||
unsafe fn compile_program_with(frag: &[u8]) -> Result<u32> {
|
||||
let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?;
|
||||
let fs = compile_shader(GL_FRAGMENT_SHADER, frag)?;
|
||||
let prog = glCreateProgram();
|
||||
glAttachShader(prog, vs);
|
||||
glAttachShader(prog, fs);
|
||||
glLinkProgram(prog);
|
||||
glDeleteShader(vs);
|
||||
glDeleteShader(fs);
|
||||
let mut ok: c_int = 0;
|
||||
glGetProgramiv(prog, GL_LINK_STATUS, &mut ok);
|
||||
ensure!(ok != 0, "GL program link failed");
|
||||
glUseProgram(prog);
|
||||
let loc = glGetUniformLocation(prog, c"image".as_ptr());
|
||||
if loc >= 0 {
|
||||
glUniform1i(loc, 0); // sampler -> texture unit 0
|
||||
}
|
||||
glUseProgram(0);
|
||||
Ok(prog)
|
||||
}
|
||||
|
||||
unsafe fn compile_program() -> Result<u32> {
|
||||
compile_program_with(FRAG_SRC)
|
||||
}
|
||||
|
||||
/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture.
|
||||
struct GlBlit {
|
||||
program: u32,
|
||||
vao: u32,
|
||||
fbo: u32,
|
||||
/// CUDA-registrable destination (immutable GL_RGBA8).
|
||||
dst_tex: u32,
|
||||
/// Source texture re-targeted to each frame's EGLImage.
|
||||
src_tex: u32,
|
||||
width: u32,
|
||||
height: u32,
|
||||
/// `dst_tex` registered with CUDA once (not per frame); mapped+copied each frame.
|
||||
registered: cuda::RegisteredTexture,
|
||||
/// Recycled CUDA device buffers (the imported frames handed to the encoder).
|
||||
pool: cuda::BufferPool,
|
||||
}
|
||||
|
||||
impl GlBlit {
|
||||
unsafe fn new(width: u32, height: u32) -> Result<GlBlit> {
|
||||
let program = compile_program()?;
|
||||
let mut vao = 0u32;
|
||||
glGenVertexArrays(1, &mut vao); // core profile needs a bound VAO for glDrawArrays
|
||||
let mut fbo = 0u32;
|
||||
glGenFramebuffers(1, &mut fbo);
|
||||
|
||||
let mut dst_tex = 0u32;
|
||||
glGenTextures(1, &mut dst_tex);
|
||||
glBindTexture(GL_TEXTURE_2D, dst_tex);
|
||||
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||
|
||||
let mut src_tex = 0u32;
|
||||
glGenTextures(1, &mut src_tex);
|
||||
glBindTexture(GL_TEXTURE_2D, src_tex);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
|
||||
glFramebufferTexture2D(
|
||||
GL_FRAMEBUFFER,
|
||||
GL_COLOR_ATTACHMENT0,
|
||||
GL_TEXTURE_2D,
|
||||
dst_tex,
|
||||
0,
|
||||
);
|
||||
let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||
ensure!(
|
||||
status == GL_FRAMEBUFFER_COMPLETE,
|
||||
"blit FBO incomplete ({status:#x})"
|
||||
);
|
||||
// Register the (immutable, reused) destination texture with CUDA once, and stand up the
|
||||
// device-buffer pool — both per-resolution, not per-frame. Requires the CUDA context to be
|
||||
// current (the caller makes it current before constructing the blit).
|
||||
let registered = cuda::RegisteredTexture::register_gl(dst_tex)?;
|
||||
let pool = cuda::BufferPool::new(width, height)?;
|
||||
Ok(GlBlit {
|
||||
program,
|
||||
vao,
|
||||
fbo,
|
||||
dst_tex,
|
||||
src_tex,
|
||||
width,
|
||||
height,
|
||||
registered,
|
||||
pool,
|
||||
})
|
||||
}
|
||||
|
||||
/// Bind `image` to the source texture and render it into `dst_tex`.
|
||||
///
|
||||
/// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
|
||||
unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
|
||||
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||
let _ = glGetError();
|
||||
egl_image_target(GL_TEXTURE_2D, image);
|
||||
let e = glGetError();
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");
|
||||
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, self.fbo);
|
||||
glViewport(0, 0, self.width as c_int, self.height as c_int);
|
||||
glUseProgram(self.program);
|
||||
glActiveTexture(GL_TEXTURE0);
|
||||
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||
glBindVertexArray(self.vao);
|
||||
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||
glBindVertexArray(0);
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||
glFlush(); // submit GL work before CUDA maps the texture
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Per-size GL machinery to convert a dmabuf EGLImage into an NV12 (BT.709 limited-range) pair —
|
||||
/// the [`GlBlit`] analogue for the `PUNKTFUNK_NV12` path. Two passes share `src_tex`: a full-res Y
|
||||
/// pass into a CUDA-registrable `GL_R8` texture and a half-res UV pass into a `GL_RG8` texture.
|
||||
/// Feeding NVENC native NV12 deletes its internal RGB→YUV CSC (which otherwise runs on the SM that a
|
||||
/// saturating game pins at 100%); the convert here replaces the BGRx swizzle [`GlBlit`] did, at ~the
|
||||
/// same 3D cost.
|
||||
struct Nv12Blit {
|
||||
y_program: u32,
|
||||
uv_program: u32,
|
||||
vao: u32,
|
||||
y_fbo: u32,
|
||||
uv_fbo: u32,
|
||||
/// CUDA-registrable luma target (immutable `GL_R8`, W×H).
|
||||
y_tex: u32,
|
||||
/// CUDA-registrable chroma target (immutable `GL_RG8`, W/2 × H/2).
|
||||
uv_tex: u32,
|
||||
/// Source texture re-targeted to each frame's EGLImage. `GL_LINEAR` so the UV pass averages 2×2.
|
||||
src_tex: u32,
|
||||
width: u32,
|
||||
height: u32,
|
||||
y_registered: cuda::RegisteredTexture,
|
||||
uv_registered: cuda::RegisteredTexture,
|
||||
/// Recycled NV12 device buffers (two-plane) handed to the encoder.
|
||||
pool: cuda::BufferPool,
|
||||
/// Self-test only: whether `src_tex` has had immutable RGBA8 storage allocated for the upload
|
||||
/// path (the live path retargets `src_tex` via EGLImage instead, never allocating storage).
|
||||
test_src_storage: bool,
|
||||
}
|
||||
|
||||
impl Nv12Blit {
|
||||
unsafe fn new(width: u32, height: u32) -> Result<Nv12Blit> {
|
||||
ensure!(
|
||||
width % 2 == 0 && height % 2 == 0,
|
||||
"NV12 convert needs even dimensions (got {width}x{height})"
|
||||
);
|
||||
let y_program = compile_program_with(FRAG_Y_SRC)?;
|
||||
let uv_program = compile_program_with(FRAG_UV_SRC)?;
|
||||
let mut vao = 0u32;
|
||||
glGenVertexArrays(1, &mut vao);
|
||||
let mut fbos = [0u32; 2];
|
||||
glGenFramebuffers(2, fbos.as_mut_ptr());
|
||||
let (y_fbo, uv_fbo) = (fbos[0], fbos[1]);
|
||||
|
||||
// Luma target: GL_R8 at full resolution.
|
||||
let mut y_tex = 0u32;
|
||||
glGenTextures(1, &mut y_tex);
|
||||
glBindTexture(GL_TEXTURE_2D, y_tex);
|
||||
glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, width as c_int, height as c_int);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||
|
||||
// Chroma target: GL_RG8 at half resolution (R=U, G=V).
|
||||
let mut uv_tex = 0u32;
|
||||
glGenTextures(1, &mut uv_tex);
|
||||
glBindTexture(GL_TEXTURE_2D, uv_tex);
|
||||
glTexStorage2D(
|
||||
GL_TEXTURE_2D,
|
||||
1,
|
||||
GL_RG8,
|
||||
(width / 2) as c_int,
|
||||
(height / 2) as c_int,
|
||||
);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
|
||||
|
||||
// Source: GL_LINEAR so the half-res UV pass averages the 2×2 chroma footprint.
|
||||
let mut src_tex = 0u32;
|
||||
glGenTextures(1, &mut src_tex);
|
||||
glBindTexture(GL_TEXTURE_2D, src_tex);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
|
||||
for (fbo, tex) in [(y_fbo, y_tex), (uv_fbo, uv_tex)] {
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
|
||||
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex, 0);
|
||||
let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||
ensure!(
|
||||
status == GL_FRAMEBUFFER_COMPLETE,
|
||||
"NV12 blit FBO incomplete ({status:#x}) — GL_R8/GL_RG8 not renderable?"
|
||||
);
|
||||
}
|
||||
// Register both convert targets with CUDA once (per-resolution), + the NV12 two-plane pool.
|
||||
let y_registered = cuda::RegisteredTexture::register_gl(y_tex)?;
|
||||
let uv_registered = cuda::RegisteredTexture::register_gl(uv_tex)?;
|
||||
let pool = cuda::BufferPool::new_nv12(width, height)?;
|
||||
Ok(Nv12Blit {
|
||||
y_program,
|
||||
uv_program,
|
||||
vao,
|
||||
y_fbo,
|
||||
uv_fbo,
|
||||
y_tex,
|
||||
uv_tex,
|
||||
src_tex,
|
||||
width,
|
||||
height,
|
||||
y_registered,
|
||||
uv_registered,
|
||||
pool,
|
||||
test_src_storage: false,
|
||||
})
|
||||
}
|
||||
|
||||
/// Bind `image` to the source texture and run both convert passes into `y_tex`/`uv_tex`.
|
||||
///
|
||||
/// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
|
||||
unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
|
||||
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||
let _ = glGetError();
|
||||
egl_image_target(GL_TEXTURE_2D, image);
|
||||
let e = glGetError();
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");
|
||||
self.run_passes()
|
||||
}
|
||||
|
||||
/// Run the two convert passes from whatever is currently in `src_tex` (caller populated it).
|
||||
/// Shared by [`run`](Self::run) (EGLImage source) and the self-test (uploaded RGBA source).
|
||||
///
|
||||
/// # Safety: the GL context is current on this thread.
|
||||
unsafe fn run_passes(&self) -> Result<()> {
|
||||
glActiveTexture(GL_TEXTURE0);
|
||||
glBindVertexArray(self.vao);
|
||||
// Y pass: full-res into the R8 target.
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, self.y_fbo);
|
||||
glViewport(0, 0, self.width as c_int, self.height as c_int);
|
||||
glUseProgram(self.y_program);
|
||||
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||
// UV pass: half-res into the RG8 target (GL_LINEAR averages the 2×2).
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, self.uv_fbo);
|
||||
glViewport(0, 0, (self.width / 2) as c_int, (self.height / 2) as c_int);
|
||||
glUseProgram(self.uv_program);
|
||||
glBindTexture(GL_TEXTURE_2D, self.src_tex);
|
||||
glDrawArrays(GL_TRIANGLES, 0, 3);
|
||||
|
||||
glBindVertexArray(0);
|
||||
glBindFramebuffer(GL_FRAMEBUFFER, 0);
|
||||
glFlush(); // submit GL work before CUDA maps the textures
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Nv12Blit {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
glDeleteTextures(1, &self.y_tex);
|
||||
glDeleteTextures(1, &self.uv_tex);
|
||||
glDeleteTextures(1, &self.src_tex);
|
||||
glDeleteFramebuffers(2, [self.y_fbo, self.uv_fbo].as_ptr());
|
||||
glDeleteVertexArrays(1, &self.vao);
|
||||
glDeleteProgram(self.y_program);
|
||||
glDeleteProgram(self.uv_program);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
///
/// A plain `Copy` value with no `Drop`: it never closes `fd`.
#[derive(Debug, Clone, Copy)]
pub struct DmabufPlane {
    /// The plane's dmabuf file descriptor (passed to EGL as `EGL_DMA_BUF_PLANE0_FD_EXT`).
    pub fd: i32,
    /// Offset of the plane inside the dmabuf (`EGL_DMA_BUF_PLANE0_OFFSET_EXT`).
    pub offset: u32,
    /// Row pitch of the plane (`EGL_DMA_BUF_PLANE0_PITCH_EXT`).
    pub stride: u32,
}
|
||||
|
||||
/// The EGL 1.5 dynamic instance (entry points resolved from libEGL at runtime) used by
/// [`EglImporter`].
type Egl = egl::DynamicInstance<egl::EGL1_5>;
|
||||
|
||||
/// Headless EGLDisplay (NVIDIA device platform) + a surfaceless desktop-GL context used to
|
||||
/// import dmabufs and bridge them to CUDA via a GL texture. Lives on the capture thread (the GL
|
||||
/// context is made current there once).
|
||||
pub struct EglImporter {
|
||||
egl: Egl,
|
||||
display: egl::Display,
|
||||
no_ctx: egl::Context,
|
||||
/// Surfaceless GL context (current on the capture thread) for the EGLImage→texture bind.
|
||||
_gl_ctx: egl::Context,
|
||||
egl_image_target: EglImageTargetFn,
|
||||
/// Lazily-created GL blit machinery (recreated if the frame size changes).
|
||||
blit: Option<GlBlit>,
|
||||
/// Lazily-created NV12 convert machinery (`PUNKTFUNK_NV12` path; recreated on size change).
|
||||
nv12_blit: Option<Nv12Blit>,
|
||||
/// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA),
|
||||
/// created lazily on the first LINEAR frame, + the destination pool.
|
||||
vk: Option<super::vulkan::VkBridge>,
|
||||
linear_pool: Option<cuda::BufferPool>,
|
||||
gbm: *mut c_void,
|
||||
render_fd: c_int,
|
||||
}
|
||||
|
||||
// The EGL handles are confined to the capture thread; the struct is moved there once.
|
||||
// The raw `gbm` pointer field is what makes a manual impl necessary.
// NOTE(review): soundness rests on the importer being used from a single thread after the move —
// nothing in the type enforces that; confirm against the callers.
unsafe impl Send for EglImporter {}
|
||||
|
||||
impl EglImporter {
|
||||
/// Open a headless EGLDisplay on the NVIDIA EGL device. Also forces the shared CUDA context
|
||||
/// to exist (so a later `import` only touches the hot path).
|
||||
pub fn new() -> Result<EglImporter> {
|
||||
// GBM platform on the NVIDIA render node: this ties the EGLDisplay (and its GL contexts)
|
||||
// to the same DRM device CUDA-GL interop associates with, which the EGL device platform
|
||||
// did not (cuGraphicsGLRegisterImage rejected device-platform GL textures).
|
||||
let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap();
|
||||
let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
|
||||
ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM");
|
||||
let gbm = unsafe { gbm_create_device(render_fd) };
|
||||
if gbm.is_null() {
|
||||
unsafe { libc::close(render_fd) };
|
||||
anyhow::bail!("gbm_create_device failed");
|
||||
}
|
||||
|
||||
let egl: Egl =
|
||||
unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
|
||||
let display = unsafe {
|
||||
egl.get_platform_display(
|
||||
EGL_PLATFORM_GBM_KHR,
|
||||
gbm as egl::NativeDisplayType,
|
||||
&[egl::ATTRIB_NONE],
|
||||
)
|
||||
}
|
||||
.context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
|
||||
egl.initialize(display).context("eglInitialize")?;
|
||||
|
||||
let exts = egl
|
||||
.query_string(Some(display), egl::EXTENSIONS)
|
||||
.context("query EGL extensions")?
|
||||
.to_string_lossy()
|
||||
.into_owned();
|
||||
ensure!(
|
||||
exts.contains("EGL_EXT_image_dma_buf_import"),
|
||||
"EGL lacks EGL_EXT_image_dma_buf_import"
|
||||
);
|
||||
ensure!(
|
||||
exts.contains("EGL_EXT_image_dma_buf_import_modifiers"),
|
||||
"EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
|
||||
);
|
||||
|
||||
// A surfaceless desktop-GL context so we can bind the dmabuf EGLImage to a GL texture
|
||||
// (cuGraphicsEGLRegisterImage is Tegra-only; desktop CUDA interop goes through GL).
|
||||
egl.bind_api(egl::OPENGL_API)
|
||||
.context("eglBindAPI(OpenGL)")?;
|
||||
// The default EGL_SURFACE_TYPE in eglChooseConfig is WINDOW_BIT, which a headless device
|
||||
// display has none of — request a pbuffer-capable config (we run surfaceless anyway).
|
||||
let config = egl
|
||||
.choose_first_config(
|
||||
display,
|
||||
&[
|
||||
egl::SURFACE_TYPE,
|
||||
egl::PBUFFER_BIT,
|
||||
egl::RENDERABLE_TYPE,
|
||||
egl::OPENGL_BIT,
|
||||
egl::NONE,
|
||||
],
|
||||
)
|
||||
.context("eglChooseConfig")?
|
||||
.context("no EGL config for OpenGL")?;
|
||||
let gl_ctx = egl
|
||||
.create_context(
|
||||
display,
|
||||
config,
|
||||
None,
|
||||
&[egl::CONTEXT_CLIENT_VERSION, 3, egl::NONE],
|
||||
)
|
||||
.context("eglCreateContext(OpenGL)")?;
|
||||
egl.make_current(display, None, None, Some(gl_ctx))
|
||||
.context("eglMakeCurrent surfaceless (needs EGL_KHR_surfaceless_context)")?;
|
||||
let egl_image_target: EglImageTargetFn = unsafe {
|
||||
std::mem::transmute(
|
||||
egl.get_proc_address("glEGLImageTargetTexture2DOES")
|
||||
.context("glEGLImageTargetTexture2DOES unavailable")?,
|
||||
)
|
||||
};
|
||||
|
||||
// Create the shared CUDA context up front so import() is pure hot path.
|
||||
cuda::context().context("create CUDA context")?;
|
||||
|
||||
let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
|
||||
tracing::info!(
|
||||
"zero-copy EGL importer ready (GBM platform + GL texture interop, dma_buf_import + modifiers)"
|
||||
);
|
||||
Ok(EglImporter {
|
||||
egl,
|
||||
display,
|
||||
no_ctx,
|
||||
_gl_ctx: gl_ctx,
|
||||
egl_image_target,
|
||||
blit: None,
|
||||
nv12_blit: None,
|
||||
vk: None,
|
||||
linear_pool: None,
|
||||
gbm,
|
||||
render_fd,
|
||||
})
|
||||
}
|
||||
|
||||
/// Import a LINEAR dmabuf via the Vulkan bridge (no EGL/GL involved — NVIDIA's EGL can't
|
||||
/// sample LINEAR, and the CUDA driver rejects raw dmabuf fds; Vulkan imports the dmabuf,
|
||||
/// GPU-copies into an exportable allocation, and CUDA reads that). See [`super::vulkan`].
|
||||
pub fn import_linear(
|
||||
&mut self,
|
||||
plane: &DmabufPlane,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<DeviceBuffer> {
|
||||
cuda::make_current()?;
|
||||
if self.linear_pool.as_ref().map(|p| (p.width(), p.height())) != Some((width, height)) {
|
||||
self.linear_pool = Some(cuda::BufferPool::new(width, height)?);
|
||||
}
|
||||
if self.vk.is_none() {
|
||||
self.vk = Some(super::vulkan::VkBridge::new()?);
|
||||
}
|
||||
self.vk.as_mut().unwrap().import_linear(
|
||||
plane.fd,
|
||||
plane.offset,
|
||||
plane.stride,
|
||||
height,
|
||||
self.linear_pool.as_ref().unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
/// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via
|
||||
/// `eglQueryDmaBufModifiersEXT`. We advertise these to PipeWire so the compositor allocates
|
||||
/// a dmabuf in a layout we can import. Empty on failure (caller falls back).
|
||||
pub fn supported_modifiers(&self, fourcc: u32) -> Vec<u64> {
|
||||
type QueryFn = unsafe extern "system" fn(
|
||||
dpy: *mut c_void,
|
||||
format: i32,
|
||||
max_modifiers: i32,
|
||||
modifiers: *mut u64,
|
||||
external_only: *mut u32,
|
||||
num_modifiers: *mut i32,
|
||||
) -> u32;
|
||||
let Some(sym) = self.egl.get_proc_address("eglQueryDmaBufModifiersEXT") else {
|
||||
return Vec::new();
|
||||
};
|
||||
let query: QueryFn = unsafe { std::mem::transmute(sym) };
|
||||
let dpy = self.display.as_ptr();
|
||||
unsafe {
|
||||
let mut count: i32 = 0;
|
||||
if query(
|
||||
dpy,
|
||||
fourcc as i32,
|
||||
0,
|
||||
std::ptr::null_mut(),
|
||||
std::ptr::null_mut(),
|
||||
&mut count,
|
||||
) == 0
|
||||
|| count <= 0
|
||||
{
|
||||
return Vec::new();
|
||||
}
|
||||
let mut mods = vec![0u64; count as usize];
|
||||
let mut ext = vec![0u32; count as usize];
|
||||
let mut n: i32 = 0;
|
||||
if query(
|
||||
dpy,
|
||||
fourcc as i32,
|
||||
count,
|
||||
mods.as_mut_ptr(),
|
||||
ext.as_mut_ptr(),
|
||||
&mut n,
|
||||
) == 0
|
||||
{
|
||||
return Vec::new();
|
||||
}
|
||||
mods.truncate(n.max(0) as usize);
|
||||
mods
|
||||
}
|
||||
}
|
||||
|
||||
/// Import one dmabuf and copy it device-to-device into a fresh owned CUDA buffer. `fourcc`
|
||||
/// is the DRM FourCC; `modifier` is the explicit 64-bit DRM format modifier when one was
|
||||
/// negotiated, or `None` to import with the buffer's implicit modifier (base
|
||||
/// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers).
|
||||
pub fn import(
|
||||
&mut self,
|
||||
plane: &DmabufPlane,
|
||||
width: u32,
|
||||
height: u32,
|
||||
fourcc: u32,
|
||||
modifier: Option<u64>,
|
||||
) -> Result<DeviceBuffer> {
|
||||
self.import_inner(plane, width, height, fourcc, modifier, false)
|
||||
}
|
||||
|
||||
/// Like [`import`](Self::import), but de-tiles **and converts** the dmabuf to NV12 (BT.709
|
||||
/// limited range) on the GPU — the `PUNKTFUNK_NV12` path — so NVENC can encode native YUV with
|
||||
/// no internal RGB→YUV CSC. The returned [`DeviceBuffer`] carries both NV12 planes
|
||||
/// (`DeviceBuffer::is_nv12`). Only the tiled EGL/GL path supports this (LINEAR/Vulkan stays RGB).
|
||||
pub fn import_nv12(
|
||||
&mut self,
|
||||
plane: &DmabufPlane,
|
||||
width: u32,
|
||||
height: u32,
|
||||
fourcc: u32,
|
||||
modifier: Option<u64>,
|
||||
) -> Result<DeviceBuffer> {
|
||||
self.import_inner(plane, width, height, fourcc, modifier, true)
|
||||
}
|
||||
|
||||
fn import_inner(
|
||||
&mut self,
|
||||
plane: &DmabufPlane,
|
||||
width: u32,
|
||||
height: u32,
|
||||
fourcc: u32,
|
||||
modifier: Option<u64>,
|
||||
nv12: bool,
|
||||
) -> Result<DeviceBuffer> {
|
||||
let mut attrs: Vec<egl::Attrib> = vec![
|
||||
egl::WIDTH as egl::Attrib,
|
||||
width as egl::Attrib,
|
||||
egl::HEIGHT as egl::Attrib,
|
||||
height as egl::Attrib,
|
||||
EGL_LINUX_DRM_FOURCC_EXT,
|
||||
fourcc as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_FD_EXT,
|
||||
plane.fd as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_OFFSET_EXT,
|
||||
plane.offset as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_PITCH_EXT,
|
||||
plane.stride as egl::Attrib,
|
||||
];
|
||||
if let Some(m) = modifier {
|
||||
attrs.extend_from_slice(&[
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
|
||||
(m & 0xFFFF_FFFF) as egl::Attrib,
|
||||
EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
|
||||
(m >> 32) as egl::Attrib,
|
||||
]);
|
||||
}
|
||||
attrs.push(egl::ATTRIB_NONE);
|
||||
let client = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
|
||||
let image = self
|
||||
.egl
|
||||
.create_image(
|
||||
self.display,
|
||||
self.no_ctx,
|
||||
EGL_LINUX_DMA_BUF_EXT,
|
||||
client,
|
||||
&attrs,
|
||||
)
|
||||
.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
|
||||
|
||||
// EGLImage → (sampled by a shader) → GL_RGBA8 texture (or NV12 R8+RG8 pair) → register
|
||||
// *that* with CUDA → map → array → copy out. Registering the EGLImage texture directly
|
||||
// fails (its layout isn't a CUDA-registrable format); the render targets are.
|
||||
let result = if nv12 {
|
||||
self.blit_and_copy_nv12(image.as_ptr(), width, height)
|
||||
} else {
|
||||
self.blit_and_copy(image.as_ptr(), width, height)
|
||||
};
|
||||
let _ = self.egl.destroy_image(self.display, image);
|
||||
result
|
||||
}
|
||||
|
||||
/// Render the dmabuf `image` into the registrable RGBA8 texture and copy it to an owned CUDA
|
||||
/// buffer. (Re)creates the per-size GL blit machinery as needed.
|
||||
fn blit_and_copy(
|
||||
&mut self,
|
||||
image: *mut c_void,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<DeviceBuffer> {
|
||||
cuda::make_current()?;
|
||||
if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||
self.blit = Some(unsafe { GlBlit::new(width, height)? });
|
||||
}
|
||||
let egl_image_target = self.egl_image_target;
|
||||
let blit = self.blit.as_mut().unwrap();
|
||||
// SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
|
||||
unsafe { blit.run(egl_image_target, image)? };
|
||||
// Persistent registration (mapped per frame) + a pooled buffer — no per-frame
|
||||
// cuGraphicsGLRegisterImage / cuMemAllocPitch.
|
||||
let dst = blit.pool.get()?;
|
||||
blit.registered.copy_mapped_to(&dst)?;
|
||||
Ok(dst)
|
||||
}
|
||||
|
||||
/// Convert the dmabuf `image` to NV12 (Y in an R8 texture, UV in an RG8 texture) and copy both
|
||||
/// planes into a pooled NV12 [`DeviceBuffer`]. (Re)creates the per-size convert machinery as
|
||||
/// needed. The `PUNKTFUNK_NV12` analogue of [`blit_and_copy`].
|
||||
fn blit_and_copy_nv12(
|
||||
&mut self,
|
||||
image: *mut c_void,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<DeviceBuffer> {
|
||||
cuda::make_current()?;
|
||||
if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||
self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? });
|
||||
}
|
||||
let egl_image_target = self.egl_image_target;
|
||||
let blit = self.nv12_blit.as_mut().unwrap();
|
||||
// SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
|
||||
unsafe { blit.run(egl_image_target, image)? };
|
||||
let dst = blit.pool.get()?;
|
||||
cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?;
|
||||
Ok(dst)
|
||||
}
|
||||
|
||||
/// Self-test entry: upload a packed `width`×`height` RGBA8 host pattern into a GL texture, run
|
||||
/// the NV12 convert passes on the GPU, and copy both planes into a pooled NV12 [`DeviceBuffer`].
|
||||
/// Exercises the exact shaders + CUDA copy the live path uses, but sourced from an uploaded
|
||||
/// texture instead of a dmabuf EGLImage (no compositor needed). `rgba` is tightly packed, 4 B/px.
|
||||
pub fn convert_rgba_for_test(
|
||||
&mut self,
|
||||
rgba: &[u8],
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<DeviceBuffer> {
|
||||
anyhow::ensure!(
|
||||
rgba.len() == width as usize * height as usize * 4,
|
||||
"test RGBA buffer {} bytes != {}x{}x4",
|
||||
rgba.len(),
|
||||
width,
|
||||
height
|
||||
);
|
||||
cuda::make_current()?;
|
||||
if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||
self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? });
|
||||
}
|
||||
let blit = self.nv12_blit.as_mut().unwrap();
|
||||
unsafe {
|
||||
// Upload the host RGBA into `src_tex` (an immutable GL_RGBA8 backing must exist first;
|
||||
// the live path never allocates it — it retargets `src_tex` via EGLImage instead).
|
||||
glBindTexture(GL_TEXTURE_2D, blit.src_tex);
|
||||
if !blit.test_src_storage {
|
||||
glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
|
||||
blit.test_src_storage = true;
|
||||
}
|
||||
let _ = glGetError();
|
||||
glTexSubImage2D(
|
||||
GL_TEXTURE_2D,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
width as c_int,
|
||||
height as c_int,
|
||||
GL_RGBA,
|
||||
GL_UNSIGNED_BYTE,
|
||||
rgba.as_ptr() as *const c_void,
|
||||
);
|
||||
let e = glGetError();
|
||||
glBindTexture(GL_TEXTURE_2D, 0);
|
||||
ensure!(e == 0, "glTexSubImage2D(test source) failed ({e:#x})");
|
||||
blit.run_passes()?;
|
||||
}
|
||||
let dst = blit.pool.get()?;
|
||||
cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?;
|
||||
Ok(dst)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EglImporter {
|
||||
fn drop(&mut self) {
|
||||
if !self.gbm.is_null() {
|
||||
unsafe { gbm_device_destroy(self.gbm) };
|
||||
}
|
||||
if self.render_fd >= 0 {
|
||||
unsafe { libc::close(self.render_fd) };
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,213 @@
|
||||
//! Zero-copy capture→encode (plan §9): the PipeWire dmabuf is imported into CUDA via EGL and
|
||||
//! handed straight to NVENC, eliminating the per-frame CPU copies (at 5K the CPU-copy path
|
||||
//! moves ~3.5 GB/s). Opt in with `PUNKTFUNK_ZEROCOPY=1`; the CPU-copy path stays the default and
|
||||
//! the runtime fallback (foreign-allocator / no-dmabuf / import failure).
|
||||
//!
|
||||
//! Pieces: [`cuda`] (driver-API FFI + the shared `CUcontext` + device buffers), [`egl`] (the
|
||||
//! headless EGLDisplay + dmabuf→`EGLImage`→CUDA import). The encoder's CUDA-frame path lives in
|
||||
//! `encode/linux.rs`; the dmabuf negotiation lives in `capture/linux.rs`.
|
||||
|
||||
pub mod cuda;
|
||||
pub mod egl;
|
||||
pub mod vulkan;
|
||||
|
||||
pub use cuda::DeviceBuffer;
|
||||
pub use egl::{DmabufPlane, EglImporter};
|
||||
|
||||
/// Whether the environment variable `name` (a `PUNKTFUNK_*` flag) is truthy.
///
/// Truthy means the value, after trimming surrounding whitespace, equals `1`, `true`, `yes` or
/// `on`, compared ASCII case-insensitively (so `TRUE` and `On` count too). An unset variable,
/// a non-Unicode value, or any other value is `false`.
fn flag(name: &str) -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    match std::env::var(name) {
        Ok(raw) => {
            let value = raw.trim();
            TRUTHY.iter().any(|truthy| value.eq_ignore_ascii_case(truthy))
        }
        Err(_) => false,
    }
}
|
||||
|
||||
/// Whether the zero-copy path is opted in (`PUNKTFUNK_ZEROCOPY` truthy).
|
||||
pub fn enabled() -> bool {
|
||||
flag("PUNKTFUNK_ZEROCOPY")
|
||||
}
|
||||
|
||||
/// Whether the NV12 convert path is opted in (`PUNKTFUNK_NV12` truthy). When set AND the zero-copy
|
||||
/// tiled-GL path is active, the capturer produces native NV12 (BT.709 limited range) on the GPU and
|
||||
/// feeds NVENC YUV directly — deleting NVENC's internal RGB→YUV CSC (Tier 2A). Off by default: the
|
||||
/// existing RGB/BGRx path is then 100% unchanged.
|
||||
pub fn nv12_enabled() -> bool {
|
||||
flag("PUNKTFUNK_NV12")
|
||||
}
|
||||
|
||||
/// DRM FourCC for a packed 32-bit format name: the four ASCII bytes read as a little-endian
/// `u32` (e.g. `b"XR24"` → `0x3432_5258`).
const fn fourcc(c: &[u8; 4]) -> u32 {
    u32::from_le_bytes(*c)
}
|
||||
|
||||
/// Map a SPA/our [`crate::capture::PixelFormat`] to the DRM FourCC EGL expects for import.
|
||||
/// SPA byte order `BGRx` ⇒ DRM `XRGB8888` (memory B,G,R,X), etc.
|
||||
pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option<u32> {
|
||||
use crate::capture::PixelFormat::*;
|
||||
Some(match format {
|
||||
Bgrx => fourcc(b"XR24"), // DRM_FORMAT_XRGB8888
|
||||
Bgra => fourcc(b"AR24"), // DRM_FORMAT_ARGB8888
|
||||
Rgbx => fourcc(b"XB24"), // DRM_FORMAT_XBGR8888
|
||||
Rgba => fourcc(b"AB24"), // DRM_FORMAT_ABGR8888
|
||||
// 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
|
||||
// Rgb10a2/Nv12/P010 are the Windows HDR / video-processor formats — never produced on Linux.
|
||||
Rgb | Bgr | Rgb10a2 | Nv12 | P010 => return None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Standalone probe (the `zerocopy-probe` subcommand): initialize the EGL importer + CUDA
|
||||
/// context and report. De-risks the FFI/linking/GPU-access without needing a capture session.
|
||||
pub fn probe() -> anyhow::Result<()> {
|
||||
let _importer = EglImporter::new()?;
|
||||
let ctx = cuda::context()?;
|
||||
tracing::info!(cuda_ctx = ?ctx, "zero-copy probe OK — EGL display + CUDA context initialized");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reference BT.709 LIMITED-range conversion of one full-range RGB pixel (`u8` per channel) to
/// `(Y, U, V)` in `f64`, matching the GPU shaders in [`egl`].
///
/// Y lands in [16, 235] and U/V in [16, 240]; the results are not rounded or clamped.
fn bt709_limited(r: u8, g: u8, b: u8) -> (f64, f64, f64) {
    // Normalise each channel to [0, 1].
    let unit = |channel: u8| channel as f64 / 255.0;
    let (r, g, b) = (unit(r), unit(g), unit(b));

    let luma = 0.2126 * r + 0.7152 * g + 0.0722 * b;
    let cb = -0.1146 * r - 0.3854 * g + 0.5000 * b;
    let cr = 0.5000 * r - 0.4542 * g - 0.0458 * b;

    // Scale into the limited-range code values: 219 luma steps from 16, 224 chroma steps
    // centred on 128.
    (16.0 + 219.0 * luma, 128.0 + 224.0 * cb, 128.0 + 224.0 * cr)
}
|
||||
|
||||
/// NV12 colour self-test (the `nv12-selftest` subcommand): stand up the EGL/GL + CUDA stack, upload
|
||||
/// a known synthetic RGBA pattern, run the real NV12 convert shaders on the GPU, read the Y and UV
|
||||
/// planes back, and compare against a Rust BT.709 limited-range reference. Validates colour
|
||||
/// correctness on the GPU **without a display** (the project's green-screen bugs came from exactly
|
||||
/// this kind of plane/layout error). PASS if max abs error Y ≤ 2, U/V ≤ 3.
|
||||
pub fn nv12_selftest() -> anyhow::Result<()> {
|
||||
use anyhow::bail;
|
||||
|
||||
// 64x64, even dims. A 4x4 grid of 16x16 flat-colour blocks (so each 2x2 chroma footprint is
|
||||
// uniform → exact chroma comparison) covering the primaries + gray/black/white, then the rest
|
||||
// is a diagonal gradient (every pixel changes — a Y-channel stress that also exercises the
|
||||
// chroma averaging; the gradient blocks are compared on Y only).
|
||||
const W: u32 = 64;
|
||||
const H: u32 = 64;
|
||||
const BLK: u32 = 16;
|
||||
// (name, r, g, b) for the labelled blocks in row-major grid order; the rest fall to gradient.
|
||||
let named: [(&str, u8, u8, u8); 8] = [
|
||||
("red", 255, 0, 0),
|
||||
("green", 0, 255, 0),
|
||||
("blue", 0, 0, 255),
|
||||
("white", 255, 255, 255),
|
||||
("black", 0, 0, 0),
|
||||
("gray128", 128, 128, 128),
|
||||
("yellow", 255, 255, 0),
|
||||
("cyan", 0, 255, 255),
|
||||
];
|
||||
|
||||
// Build the RGBA pattern + a parallel record of each pixel's (r,g,b) and whether it sits in a
|
||||
// flat block (chroma-comparable) or the gradient (Y-only).
|
||||
let mut rgba = vec![0u8; (W * H * 4) as usize];
|
||||
let mut flat = vec![false; (W * H) as usize];
|
||||
let grid_cols = W / BLK; // 4
|
||||
let pixel_rgb = |x: u32, y: u32| -> (u8, u8, u8, bool) {
|
||||
let bx = x / BLK;
|
||||
let by = y / BLK;
|
||||
let idx = (by * grid_cols + bx) as usize;
|
||||
if idx < named.len() {
|
||||
let (_, r, g, b) = named[idx];
|
||||
(r, g, b, true)
|
||||
} else {
|
||||
// Diagonal gradient — distinct per pixel.
|
||||
let r = ((x * 4) & 0xff) as u8;
|
||||
let g = ((y * 4) & 0xff) as u8;
|
||||
let b = (((x + y) * 2) & 0xff) as u8;
|
||||
(r, g, b, false)
|
||||
}
|
||||
};
|
||||
for y in 0..H {
|
||||
for x in 0..W {
|
||||
let (r, g, b, is_flat) = pixel_rgb(x, y);
|
||||
let i = ((y * W + x) * 4) as usize;
|
||||
rgba[i] = r;
|
||||
rgba[i + 1] = g;
|
||||
rgba[i + 2] = b;
|
||||
rgba[i + 3] = 255;
|
||||
flat[(y * W + x) as usize] = is_flat;
|
||||
}
|
||||
}
|
||||
|
||||
// GPU convert.
|
||||
let mut importer = EglImporter::new()?;
|
||||
let nv12 = importer.convert_rgba_for_test(&rgba, W, H)?;
|
||||
let (uv_ptr, uv_pitch) = nv12
|
||||
.uv
|
||||
.ok_or_else(|| anyhow::anyhow!("self-test buffer is not NV12"))?;
|
||||
// Read both planes back to host (tightly packed).
|
||||
let y_host = cuda::read_plane_to_host(nv12.ptr, nv12.pitch, W as usize, H as usize)?;
|
||||
let uv_host = cuda::read_plane_to_host(uv_ptr, uv_pitch, (W as usize / 2) * 2, H as usize / 2)?;
|
||||
|
||||
// Compare Y over every pixel.
|
||||
let mut max_y_err = 0.0f64;
|
||||
for y in 0..H {
|
||||
for x in 0..W {
|
||||
let (r, g, b, _) = pixel_rgb(x, y);
|
||||
let (ref_y, _, _) = bt709_limited(r, g, b);
|
||||
let got = y_host[(y * W + x) as usize] as f64;
|
||||
max_y_err = max_y_err.max((got - ref_y).abs());
|
||||
}
|
||||
}
|
||||
|
||||
// Compare U/V over flat blocks only (each 2x2 footprint is a single colour → exact reference).
|
||||
// Chroma is W/2 × H/2 samples, interleaved [U,V] per sample.
|
||||
let cw = W / 2;
|
||||
let ch = H / 2;
|
||||
let mut max_u_err = 0.0f64;
|
||||
let mut max_v_err = 0.0f64;
|
||||
for cy in 0..ch {
|
||||
for cx in 0..cw {
|
||||
// The 2x2 source footprint of this chroma sample.
|
||||
let (sx, sy) = (cx * 2, cy * 2);
|
||||
// Only compare where all 4 source pixels are flat (uniform colour).
|
||||
let all_flat =
|
||||
(0..2).all(|dy| (0..2).all(|dx| flat[((sy + dy) * W + (sx + dx)) as usize]));
|
||||
if !all_flat {
|
||||
continue;
|
||||
}
|
||||
let (r, g, b, _) = pixel_rgb(sx, sy);
|
||||
let (_, ref_u, ref_v) = bt709_limited(r, g, b);
|
||||
let base = ((cy * cw + cx) * 2) as usize;
|
||||
let got_u = uv_host[base] as f64;
|
||||
let got_v = uv_host[base + 1] as f64;
|
||||
max_u_err = max_u_err.max((got_u - ref_u).abs());
|
||||
max_v_err = max_v_err.max((got_v - ref_v).abs());
|
||||
}
|
||||
}
|
||||
|
||||
// Per-primary actual-vs-expected (block centre for chroma).
|
||||
println!("NV12 self-test ({W}x{H}, BT.709 limited range)");
|
||||
println!(
|
||||
" {:<8} {:>14} {:>14} {:>14}",
|
||||
"color", "Y exp/got", "U exp/got", "V exp/got"
|
||||
);
|
||||
for (idx, (name, r, g, b)) in named.iter().enumerate() {
|
||||
let bx = (idx as u32 % grid_cols) * BLK + BLK / 2;
|
||||
let by = (idx as u32 / grid_cols) * BLK + BLK / 2;
|
||||
let (ey, eu, ev) = bt709_limited(*r, *g, *b);
|
||||
let gy = y_host[(by * W + bx) as usize] as f64;
|
||||
let (ccx, ccy) = (bx / 2, by / 2);
|
||||
let cbase = ((ccy * cw + ccx) * 2) as usize;
|
||||
let gu = uv_host[cbase] as f64;
|
||||
let gv = uv_host[cbase + 1] as f64;
|
||||
println!(
|
||||
" {:<8} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0} {:>6.1}/{:<6.0}",
|
||||
name, ey, gy, eu, gu, ev, gv
|
||||
);
|
||||
}
|
||||
println!(
|
||||
" max abs error: Y={max_y_err:.2} (≤2) U={max_u_err:.2} (≤3) V={max_v_err:.2} (≤3)"
|
||||
);
|
||||
|
||||
if max_y_err <= 2.0 && max_u_err <= 3.0 && max_v_err <= 3.0 {
|
||||
println!("PASS");
|
||||
Ok(())
|
||||
} else {
|
||||
println!("FAIL");
|
||||
bail!("NV12 self-test FAILED (Y={max_y_err:.2} U={max_u_err:.2} V={max_v_err:.2})");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,366 @@
|
||||
//! Vulkan bridge for LINEAR dmabufs (gamescope's only offer), completing zero-copy where the
|
||||
//! other interops can't: NVIDIA's EGL won't sample LINEAR, and the CUDA driver rejects raw
|
||||
//! dmabuf fds as external memory. Vulkan *does* import dmabufs (`VK_EXT_external_memory_dma_buf`)
|
||||
//! and *does* export `OPAQUE_FD` memory that CUDA officially imports. So:
|
||||
//!
|
||||
//! ```text
|
||||
//! dmabuf fd ──VkImportMemoryFdInfoKHR(DMA_BUF)──▶ VkBuffer (cached per fd)
|
||||
//! │ vkCmdCopyBuffer (GPU, device-local)
|
||||
//! ▼
|
||||
//! exportable VkBuffer ──vkGetMemoryFdKHR(OPAQUE_FD)──▶ cuImportExternalMemory ──▶ CUdeviceptr
|
||||
//! ```
|
||||
//!
|
||||
//! The exportable buffer + its CUDA mapping are created once per resolution; per frame it's one
|
||||
//! GPU buffer copy (fence-waited) and one pitched CUDA copy into the encoder's pooled buffer.
|
||||
//! No CPU ever touches pixels. Imports are cached per fd (PipeWire's buffer pool is stable for
|
||||
//! a stream's life). Falls back cleanly: any init/import error disables the importer and the
|
||||
//! CPU mmap path takes over.
|
||||
|
||||
use super::cuda::{self, DeviceBuffer};
|
||||
use anyhow::{anyhow, bail, Context as _, Result};
|
||||
use ash::vk;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Vulkan objects for one imported source dmabuf (cached per fd).
|
||||
struct SrcBuf {
|
||||
buffer: vk::Buffer,
|
||||
memory: vk::DeviceMemory,
|
||||
size: u64,
|
||||
}
|
||||
|
||||
/// The per-resolution destination: exportable Vulkan memory mapped into CUDA.
|
||||
struct DstBuf {
|
||||
buffer: vk::Buffer,
|
||||
memory: vk::DeviceMemory,
|
||||
size: u64,
|
||||
/// CUDA's view of the same memory (owns the exported OPAQUE_FD).
|
||||
cuda: cuda::ExternalDmabuf,
|
||||
}
|
||||
|
||||
pub struct VkBridge {
|
||||
_entry: ash::Entry,
|
||||
instance: ash::Instance,
|
||||
device: ash::Device,
|
||||
ext_fd: ash::khr::external_memory_fd::Device,
|
||||
queue: vk::Queue,
|
||||
cmd_pool: vk::CommandPool,
|
||||
cmd: vk::CommandBuffer,
|
||||
fence: vk::Fence,
|
||||
mem_props: vk::PhysicalDeviceMemoryProperties,
|
||||
src_cache: HashMap<i32, SrcBuf>,
|
||||
dst: Option<DstBuf>,
|
||||
}
|
||||
|
||||
// Confined to the capture thread; moved there once.
|
||||
// NOTE(review): soundness rests on the bridge being used from one thread only after the move —
// nothing in the type enforces that; confirm against the callers.
unsafe impl Send for VkBridge {}
|
||||
|
||||
impl VkBridge {
|
||||
/// Bring up Vulkan on the NVIDIA GPU with the external-memory extensions.
|
||||
pub fn new() -> Result<VkBridge> {
|
||||
unsafe {
|
||||
let entry = ash::Entry::load().context("load libvulkan")?;
|
||||
let app = vk::ApplicationInfo::default().api_version(vk::API_VERSION_1_1);
|
||||
let instance = entry
|
||||
.create_instance(
|
||||
&vk::InstanceCreateInfo::default().application_info(&app),
|
||||
None,
|
||||
)
|
||||
.context("vkCreateInstance")?;
|
||||
|
||||
// Pick the NVIDIA GPU (matches CUDA device 0 on this single-dGPU host).
|
||||
let phys = instance
|
||||
.enumerate_physical_devices()
|
||||
.context("enumerate GPUs")?
|
||||
.into_iter()
|
||||
.find(|&p| instance.get_physical_device_properties(p).vendor_id == 0x10DE)
|
||||
.ok_or_else(|| anyhow!("no NVIDIA Vulkan device"))?;
|
||||
let mem_props = instance.get_physical_device_memory_properties(phys);
|
||||
|
||||
// Any queue family supporting transfer (graphics/compute imply it).
|
||||
let qf = instance
|
||||
.get_physical_device_queue_family_properties(phys)
|
||||
.iter()
|
||||
.position(|q| {
|
||||
q.queue_flags.intersects(
|
||||
vk::QueueFlags::TRANSFER
|
||||
| vk::QueueFlags::GRAPHICS
|
||||
| vk::QueueFlags::COMPUTE,
|
||||
)
|
||||
})
|
||||
.ok_or_else(|| anyhow!("no transfer-capable queue family"))?
|
||||
as u32;
|
||||
|
||||
let exts = [
|
||||
ash::khr::external_memory_fd::NAME.as_ptr(),
|
||||
ash::ext::external_memory_dma_buf::NAME.as_ptr(),
|
||||
];
|
||||
let prio = [1.0f32];
|
||||
let qci = [vk::DeviceQueueCreateInfo::default()
|
||||
.queue_family_index(qf)
|
||||
.queue_priorities(&prio)];
|
||||
let device = instance
|
||||
.create_device(
|
||||
phys,
|
||||
&vk::DeviceCreateInfo::default()
|
||||
.queue_create_infos(&qci)
|
||||
.enabled_extension_names(&exts),
|
||||
None,
|
||||
)
|
||||
.context("vkCreateDevice (external-memory extensions supported?)")?;
|
||||
let ext_fd = ash::khr::external_memory_fd::Device::new(&instance, &device);
|
||||
let queue = device.get_device_queue(qf, 0);
|
||||
|
||||
let cmd_pool = device
|
||||
.create_command_pool(
|
||||
&vk::CommandPoolCreateInfo::default()
|
||||
.queue_family_index(qf)
|
||||
.flags(vk::CommandPoolCreateFlags::RESET_COMMAND_BUFFER),
|
||||
None,
|
||||
)
|
||||
.context("create command pool")?;
|
||||
let cmd = device
|
||||
.allocate_command_buffers(
|
||||
&vk::CommandBufferAllocateInfo::default()
|
||||
.command_pool(cmd_pool)
|
||||
.level(vk::CommandBufferLevel::PRIMARY)
|
||||
.command_buffer_count(1),
|
||||
)
|
||||
.context("allocate command buffer")?[0];
|
||||
let fence = device
|
||||
.create_fence(&vk::FenceCreateInfo::default(), None)
|
||||
.context("create fence")?;
|
||||
|
||||
tracing::info!("Vulkan bridge ready (dmabuf import → OPAQUE_FD export → CUDA)");
|
||||
Ok(VkBridge {
|
||||
_entry: entry,
|
||||
instance,
|
||||
device,
|
||||
ext_fd,
|
||||
queue,
|
||||
cmd_pool,
|
||||
cmd,
|
||||
fence,
|
||||
mem_props,
|
||||
src_cache: HashMap::new(),
|
||||
dst: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_type(&self, type_bits: u32, flags: vk::MemoryPropertyFlags) -> Result<u32> {
|
||||
(0..self.mem_props.memory_type_count)
|
||||
.find(|&i| {
|
||||
type_bits & (1 << i) != 0
|
||||
&& self.mem_props.memory_types[i as usize]
|
||||
.property_flags
|
||||
.contains(flags)
|
||||
})
|
||||
.ok_or_else(|| anyhow!("no compatible Vulkan memory type"))
|
||||
}
|
||||
|
||||
/// Import `fd` (dup'd internally; Vulkan owns the dup) as a transfer-src buffer of `size`.
|
||||
unsafe fn import_src(&mut self, fd: i32, size: u64) -> Result<()> {
|
||||
let dup = libc::dup(fd);
|
||||
if dup < 0 {
|
||||
bail!("dup(dmabuf fd)");
|
||||
}
|
||||
let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
|
||||
.handle_types(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT);
|
||||
let buffer = self
|
||||
.device
|
||||
.create_buffer(
|
||||
&vk::BufferCreateInfo::default()
|
||||
.size(size)
|
||||
.usage(vk::BufferUsageFlags::TRANSFER_SRC)
|
||||
.push_next(&mut ext_info),
|
||||
None,
|
||||
)
|
||||
.context("create import buffer")?;
|
||||
let mut fd_props = vk::MemoryFdPropertiesKHR::default();
|
||||
self.ext_fd
|
||||
.get_memory_fd_properties(
|
||||
vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT,
|
||||
dup,
|
||||
&mut fd_props,
|
||||
)
|
||||
.context("vkGetMemoryFdPropertiesKHR")?;
|
||||
let reqs = self.device.get_buffer_memory_requirements(buffer);
|
||||
let mem_type = self.memory_type(
|
||||
reqs.memory_type_bits & fd_props.memory_type_bits,
|
||||
vk::MemoryPropertyFlags::empty(),
|
||||
)?;
|
||||
let mut import = vk::ImportMemoryFdInfoKHR::default()
|
||||
.handle_type(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT)
|
||||
.fd(dup); // Vulkan takes ownership of `dup` on success
|
||||
let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
|
||||
let memory = self
|
||||
.device
|
||||
.allocate_memory(
|
||||
&vk::MemoryAllocateInfo::default()
|
||||
.allocation_size(reqs.size.max(size))
|
||||
.memory_type_index(mem_type)
|
||||
.push_next(&mut import)
|
||||
.push_next(&mut dedicated),
|
||||
None,
|
||||
)
|
||||
.map_err(|e| {
|
||||
libc::close(dup); // failed import does not consume the fd
|
||||
anyhow!("import dmabuf memory: {e}")
|
||||
})?;
|
||||
self.device
|
||||
.bind_buffer_memory(buffer, memory, 0)
|
||||
.context("bind import memory")?;
|
||||
self.src_cache.insert(
|
||||
fd,
|
||||
SrcBuf {
|
||||
buffer,
|
||||
memory,
|
||||
size,
|
||||
},
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// (Re)create the exportable destination of at least `size` bytes + its CUDA mapping.
|
||||
unsafe fn ensure_dst(&mut self, size: u64) -> Result<()> {
|
||||
if self.dst.as_ref().is_some_and(|d| d.size >= size) {
|
||||
return Ok(());
|
||||
}
|
||||
if let Some(old) = self.dst.take() {
|
||||
self.device.destroy_buffer(old.buffer, None);
|
||||
self.device.free_memory(old.memory, None);
|
||||
// old.cuda drops its mapping with it
|
||||
}
|
||||
let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
|
||||
.handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
|
||||
let buffer = self
|
||||
.device
|
||||
.create_buffer(
|
||||
&vk::BufferCreateInfo::default()
|
||||
.size(size)
|
||||
.usage(vk::BufferUsageFlags::TRANSFER_DST)
|
||||
.push_next(&mut ext_info),
|
||||
None,
|
||||
)
|
||||
.context("create export buffer")?;
|
||||
let reqs = self.device.get_buffer_memory_requirements(buffer);
|
||||
let mem_type =
|
||||
self.memory_type(reqs.memory_type_bits, vk::MemoryPropertyFlags::DEVICE_LOCAL)?;
|
||||
let mut export = vk::ExportMemoryAllocateInfo::default()
|
||||
.handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
|
||||
let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
|
||||
let memory = self
|
||||
.device
|
||||
.allocate_memory(
|
||||
&vk::MemoryAllocateInfo::default()
|
||||
.allocation_size(reqs.size)
|
||||
.memory_type_index(mem_type)
|
||||
.push_next(&mut export)
|
||||
.push_next(&mut dedicated),
|
||||
None,
|
||||
)
|
||||
.context("allocate exportable memory")?;
|
||||
self.device
|
||||
.bind_buffer_memory(buffer, memory, 0)
|
||||
.context("bind export memory")?;
|
||||
let opaque_fd = self
|
||||
.ext_fd
|
||||
.get_memory_fd(
|
||||
&vk::MemoryGetFdInfoKHR::default()
|
||||
.memory(memory)
|
||||
.handle_type(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD),
|
||||
)
|
||||
.context("vkGetMemoryFdKHR")?;
|
||||
// CUDA imports (and on success owns) the exported fd. Size must match the allocation.
|
||||
let cuda = cuda::ExternalDmabuf::import_owned_fd(opaque_fd, reqs.size)
|
||||
.context("cuImportExternalMemory(OPAQUE_FD from Vulkan)")?;
|
||||
tracing::info!(size, "Vulkan→CUDA exportable staging buffer ready");
|
||||
self.dst = Some(DstBuf {
|
||||
buffer,
|
||||
memory,
|
||||
size: reqs.size,
|
||||
cuda,
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Bridge one LINEAR dmabuf frame into a pooled CUDA buffer: GPU copy dmabuf→exportable,
|
||||
/// then pitched CUDA copy exportable→`pool` buffer.
|
||||
pub fn import_linear(
|
||||
&mut self,
|
||||
fd: i32,
|
||||
offset: u32,
|
||||
stride: u32,
|
||||
height: u32,
|
||||
pool: &cuda::BufferPool,
|
||||
) -> Result<DeviceBuffer> {
|
||||
unsafe {
|
||||
let span = offset as u64 + stride as u64 * height as u64;
|
||||
if !self.src_cache.contains_key(&fd) {
|
||||
let size = libc::lseek(fd, 0, libc::SEEK_END);
|
||||
anyhow::ensure!(size > 0, "lseek(dmabuf)");
|
||||
anyhow::ensure!(size as u64 >= span, "dmabuf smaller than frame span");
|
||||
self.import_src(fd, size as u64)?;
|
||||
}
|
||||
let (src_buffer, src_size) = {
|
||||
let s = &self.src_cache[&fd];
|
||||
(s.buffer, s.size)
|
||||
};
|
||||
let copy_size = src_size.min(span);
|
||||
self.ensure_dst(copy_size)?;
|
||||
let dst = self.dst.as_ref().unwrap();
|
||||
|
||||
// Record + submit the GPU copy, wait on the fence (GPU-GPU, sub-millisecond).
|
||||
self.device
|
||||
.begin_command_buffer(
|
||||
self.cmd,
|
||||
&vk::CommandBufferBeginInfo::default()
|
||||
.flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT),
|
||||
)
|
||||
.context("begin cmd")?;
|
||||
let region = vk::BufferCopy::default().size(copy_size);
|
||||
self.device
|
||||
.cmd_copy_buffer(self.cmd, src_buffer, dst.buffer, &[region]);
|
||||
self.device
|
||||
.end_command_buffer(self.cmd)
|
||||
.context("end cmd")?;
|
||||
let cmds = [self.cmd];
|
||||
let submit = vk::SubmitInfo::default().command_buffers(&cmds);
|
||||
self.device
|
||||
.queue_submit(self.queue, &[submit], self.fence)
|
||||
.context("queue submit")?;
|
||||
self.device
|
||||
.wait_for_fences(&[self.fence], true, 1_000_000_000)
|
||||
.context("fence wait")?;
|
||||
self.device
|
||||
.reset_fences(&[self.fence])
|
||||
.context("reset fence")?;
|
||||
|
||||
// De-stride from the CUDA view of the exportable memory into a pooled buffer.
|
||||
cuda::make_current()?;
|
||||
let out = pool.get()?;
|
||||
cuda::copy_pitched_to_buffer(dst.cuda.ptr + offset as u64, stride as usize, &out)?;
|
||||
Ok(out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VkBridge {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
let _ = self.device.device_wait_idle();
|
||||
for (_, s) in self.src_cache.drain() {
|
||||
self.device.destroy_buffer(s.buffer, None);
|
||||
self.device.free_memory(s.memory, None);
|
||||
}
|
||||
if let Some(d) = self.dst.take() {
|
||||
self.device.destroy_buffer(d.buffer, None);
|
||||
self.device.free_memory(d.memory, None);
|
||||
}
|
||||
self.device.destroy_fence(self.fence, None);
|
||||
self.device.destroy_command_pool(self.cmd_pool, None);
|
||||
self.device.destroy_device(None);
|
||||
self.instance.destroy_instance(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user