Files
punktfunk/crates/punktfunk-host/src/linux/zerocopy/cuda.rs
T
enricobuehler ba68a98873 docs(host): prove every unsafe block in the Linux FFI files + gate them (unsafe-proof program 2/N)
Continues the structural unsafe-proof program (every unsafe carries a documented
proof of soundness; the file gains #![deny(clippy::undocumented_unsafe_blocks)]
so it stays proven). This batch covers all 10 remaining pure-Linux files
(104 blocks), each proof stating the REAL invariant — not boilerplate:

  zerocopy/cuda.rs (26)   leaked process-lifetime libcuda fn-ptr table; opaque
                          CUcontext never dereferenced; free-exactly-once via the
                          Arc<Mutex<PoolInner>> ownership graph; dmabuf fd take/close split
  zerocopy/egl.rs (18)    eglGetProcAddress'd procs with the GL context current;
                          EGLImage liveness; the two-call modifier-query bounds
  zerocopy/vulkan.rs (4)  copy-bounds arithmetic (src_size>=span); Send = thread
                          confinement to the punktfunk-pipewire thread
  dmabuf_fence.rs (4)     poll/ioctl/close fd liveness + ownership
  capture/linux/mod.rs (16)  spa_data repr(transparent) cast; null-checked spa
                          derefs; single-loop-thread buffer ownership until requeue
  inject/linux/gamepad.rs (10)  uinput ioctl request-number ↔ struct-size match
                          (static-asserted); InputEventRaw no-padding for the byte cast
  encode/linux/vaapi.rs (15) + encode/linux/mod.rs (9)  ffmpeg object ownership/
                          free ladders; VAAPI/DRM graph; Send = single-thread transfer
  inject/linux/wlr.rs (2), vdisplay/linux/kwin.rs (1)

No memory-unsafety SUSPECT blocks were found — the unsafe is sound. The vaapi
agent did flag two real AVBufferRef *leaks* (not UB) in DmabufInner::open; marked
inline with NOTE(leak) and addressed in a follow-up.

Verified: cargo clippy -p punktfunk-host --all-targets -- -D warnings is clean
(each file's deny gate hard-errors on any undocumented block).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 09:00:30 +00:00

1137 lines
51 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the GL-interop
//! driver calls we need (`cuGraphicsGLRegisterImage` & co.), so we hand-roll exactly those and
//! `dlopen` `libcuda.so.1` at runtime (the driver library — NOT `libcudart`; NOT a link-time
//! `#[link]`, so one binary runs on NVIDIA and on AMD/Intel where `libcuda` is absent — see
//! [`CudaApi`]). Symbol names verified against
//! `cust_raw` + `cudaGL.h`: the context/mem ops use the `_v2` ABI suffix; the graphics-interop
//! ops are unsuffixed. (We use GL interop, not EGL interop: `cuGraphicsEGLRegisterImage` is
//! Tegra-only on the desktop driver — see [`super::egl`].)
//!
//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.
#![allow(non_camel_case_types, non_snake_case)]
// Every `unsafe` block/impl below carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]
use anyhow::{bail, Result};
use std::os::raw::{c_int, c_uint, c_void};
use std::sync::{Arc, Mutex, OnceLock};
/// Status code returned by every driver entry point declared in this file; `0` is success.
pub type CUresult = c_uint; // CUDA_SUCCESS == 0
/// Device handle: written by `cuDeviceGet`, passed by value to `cuCtxCreate_v2`.
pub type CUdevice = c_int;
/// Opaque context handle; the code in view only hands it back to the driver.
pub type CUcontext = *mut c_void; // opaque CUctx_st*
/// Opaque stream handle; a null value stands for the default (NULL) stream.
pub type CUstream = *mut c_void; // opaque CUstream_st*
/// Device address carried as a plain 64-bit integer rather than a Rust pointer.
pub type CUdeviceptr = u64;
/// Opaque graphics-interop resource handle (written by `cuGraphicsGLRegisterImage`).
pub type CUgraphicsResource = *mut c_void;
/// Opaque array handle (written by `cuGraphicsSubResourceGetMappedArray`).
pub type CUarray = *mut c_void;
/// Opaque external-memory handle (written by `cuImportExternalMemory`).
pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*
/// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
/// `CUmemorytype` ARRAY variant (value 3 in the list on `CU_MEMORYTYPE_DEVICE`).
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;
/// `CUctx_flags` (cuda.h): block the CPU on an OS primitive while waiting for the GPU instead of
/// busy-spinning. On this shared box (compositor + send thread on the same cores) spinning a core
/// to detect copy completion steals CPU from the very threads we want scheduled; BLOCKING_SYNC
/// frees it. Default (`CU_CTX_SCHED_AUTO=0`) heuristically picks SPIN vs YIELD by core count.
const CU_CTX_SCHED_BLOCKING_SYNC: c_uint = 0x04;
/// `cuStreamCreateWithPriority` flag: don't implicitly synchronize with the legacy NULL stream.
const CU_STREAM_NON_BLOCKING: c_uint = 0x01;
/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
///
/// Passed by pointer to `cuMemcpy2DAsync_v2`. `Default` zeroes every field, so callers set only
/// the members relevant to their source/destination memory types and leave the rest at 0/null.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_MEMCPY2D {
// --- Source description ---
/// Byte offset of the first source column.
pub srcXInBytes: usize,
/// First source row.
pub srcY: usize,
/// One of the `CUmemorytype` values (e.g. `CU_MEMORYTYPE_DEVICE`); presumably selects which of
/// `srcHost`/`srcDevice`/`srcArray` the driver reads — confirm against cuda.h.
pub srcMemoryType: c_uint,
pub srcHost: *const c_void,
pub srcDevice: CUdeviceptr,
pub srcArray: CUarray,
/// Source row stride in bytes.
pub srcPitch: usize,
// --- Destination description (mirrors the source fields) ---
pub dstXInBytes: usize,
pub dstY: usize,
pub dstMemoryType: c_uint,
pub dstHost: *mut c_void,
pub dstDevice: CUdeviceptr,
pub dstArray: CUarray,
/// Destination row stride in bytes.
pub dstPitch: usize,
// --- Extent of the copy ---
/// Bytes copied per row.
pub WidthInBytes: usize,
/// Number of rows copied.
pub Height: usize,
}
/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose
/// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type
/// only the first 4 bytes (the `int fd`) are read.
///
/// Passed by pointer to `cuImportExternalMemory`. The private padding/reserved members exist
/// only to reproduce the C layout; `Default` leaves them zeroed.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1
// Explicit padding so the 8-byte-aligned `handle` union starts at offset 8.
_pad: u32,
pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject }
/// Size of the imported memory object (presumably in bytes — confirm against cuda.h).
pub size: u64,
pub flags: c_uint,
reserved: [c_uint; 16],
// Explicit tail padding so the struct size is a multiple of its 8-byte alignment.
_pad2: u32,
}
/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout).
///
/// Passed by pointer to `cuExternalMemoryGetMappedBuffer`. `Default` zeroes every member,
/// including the private `reserved`/`_pad` layout fillers.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
/// Start of the mapped range within the external memory object.
pub offset: u64,
/// Length of the mapped range.
pub size: u64,
pub flags: c_uint,
reserved: [c_uint; 16],
// Explicit tail padding so the struct size is a multiple of its 8-byte alignment.
_pad: u32,
}
/// Value for `CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type_` selecting the opaque-fd variant of the
/// `handle` union (only the leading `int fd` is read for this type).
pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1;
/// CUDA Driver API entry points, resolved at runtime from `libcuda.so.1` via `dlopen` rather than
/// a link-time `#[link(name = "cuda")]`. This is what lets ONE host binary run on NVIDIA
/// (zero-copy via CUDA → NVENC) *and* on AMD/Intel (VAAPI, where the NVIDIA driver — and thus
/// `libcuda` — is absent): with a hard link the loader would refuse to start the binary at all.
/// Every `cu*` call below goes through a same-named wrapper fn that forwards to this table; when
/// the driver isn't present the table is `None` and the wrappers return a non-zero `CUresult`, so
/// `context()` fails cleanly and the capturer falls back to the CPU path. The `cuda_api()` loader
/// is memoised; the library handle is intentionally leaked (process-lifetime, like the context).
///
/// Each field is named after the exported symbol it was resolved from, and its type is the
/// signature the loader asserts for that symbol.
struct CudaApi {
// --- Driver init, device lookup, context ---
cuInit: unsafe extern "C" fn(c_uint) -> CUresult,
cuDeviceGet: unsafe extern "C" fn(*mut CUdevice, c_int) -> CUresult,
cuCtxCreate_v2: unsafe extern "C" fn(*mut CUcontext, c_uint, CUdevice) -> CUresult,
cuCtxSetCurrent: unsafe extern "C" fn(CUcontext) -> CUresult,
// --- Device memory and copies ---
cuMemAllocPitch_v2:
unsafe extern "C" fn(*mut CUdeviceptr, *mut usize, usize, usize, c_uint) -> CUresult,
cuMemFree_v2: unsafe extern "C" fn(CUdeviceptr) -> CUresult,
cuMemcpy2DAsync_v2: unsafe extern "C" fn(*const CUDA_MEMCPY2D, CUstream) -> CUresult,
// --- Streams ---
cuStreamSynchronize: unsafe extern "C" fn(CUstream) -> CUresult,
cuCtxGetStreamPriorityRange: unsafe extern "C" fn(*mut c_int, *mut c_int) -> CUresult,
cuStreamCreateWithPriority: unsafe extern "C" fn(*mut CUstream, c_uint, c_int) -> CUresult,
// --- GL graphics interop ---
cuGraphicsGLRegisterImage:
unsafe extern "C" fn(*mut CUgraphicsResource, c_uint, c_uint, c_uint) -> CUresult,
cuGraphicsMapResources:
unsafe extern "C" fn(c_uint, *mut CUgraphicsResource, *mut c_void) -> CUresult,
cuGraphicsUnmapResources:
unsafe extern "C" fn(c_uint, *mut CUgraphicsResource, *mut c_void) -> CUresult,
cuGraphicsSubResourceGetMappedArray:
unsafe extern "C" fn(*mut CUarray, CUgraphicsResource, c_uint, c_uint) -> CUresult,
cuGraphicsUnregisterResource: unsafe extern "C" fn(CUgraphicsResource) -> CUresult,
// --- External memory import ---
cuImportExternalMemory: unsafe extern "C" fn(
*mut CUexternalMemory,
*const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
) -> CUresult,
cuExternalMemoryGetMappedBuffer: unsafe extern "C" fn(
*mut CUdeviceptr,
CUexternalMemory,
*const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
) -> CUresult,
cuDestroyExternalMemory: unsafe extern "C" fn(CUexternalMemory) -> CUresult,
}
// SAFETY: every field is a bare `extern "C" fn` address into the leaked, process-lifetime
// `libcuda` mapping (`cuda_api` `forget`s the `Library`, so it is never unloaded) — an immutable
// value with no interior mutability and no thread affinity. Moving the table to another thread
// cannot dangle (the code it points at stays mapped) or race (the fields are read-only).
unsafe impl Send for CudaApi {}
// SAFETY: as above — the table is a set of immutable fn-pointer addresses with no interior
// mutability, so concurrent shared reads from multiple threads cannot race; the driver entry
// points they address are themselves thread-safe.
unsafe impl Sync for CudaApi {}
/// `CUresult` returned by the wrappers when `libcuda` isn't loaded (no NVIDIA driver). Non-zero so
/// the existing `ck()`/`!= 0` checks treat it as an ordinary driver error; distinct from any real
/// `CUDA_ERROR_*` (all < 1000). Never produced by the actual driver.
const CU_ERROR_NOT_LOADED: CUresult = 999;
static CUDA_API: OnceLock<Option<CudaApi>> = OnceLock::new();

/// Load `libcuda.so.1` (falling back to `libcuda.so`) and resolve every entry point in
/// [`CudaApi`], exactly once per process.
///
/// Returns `None` if the library cannot be opened (logged at debug — the expected case on
/// AMD/Intel hosts) or if any single symbol is missing. The outcome, success or failure, is
/// memoised in `CUDA_API`, so later calls are a cheap read.
fn cuda_api() -> Option<&'static CudaApi> {
    CUDA_API
        // SAFETY: `Library::new` runs `libcuda.so.1`'s initializers — it is the trusted NVIDIA
        // driver library, so loading has no unexpected effects; the early `return None`/`?`
        // handle its absence. Each `lib.get::<T>(name)` asserts the symbol's real ABI equals `T`:
        // every NUL-terminated name is a documented CUDA Driver API entry point and `T` is the
        // exact `unsafe extern "C" fn(..)` signature from cuda.h/cudaGL.h (`_v2` for ctx/mem ops).
        // Each `Symbol` only borrows `lib` until the end of the struct-literal statement; we
        // deref-copy the raw fn-pointer out first, then `forget(lib)` leaks the mapping so those
        // addresses stay valid for the whole process. Runs once under the `OnceLock` init — no
        // aliasing.
        .get_or_init(|| unsafe {
            let opened = libloading::Library::new("libcuda.so.1")
                .or_else(|_| libloading::Library::new("libcuda.so"));
            let lib = match opened {
                Ok(lib) => lib,
                Err(e) => {
                    tracing::debug!(error = %e, "libcuda not loadable — CUDA zero-copy unavailable (expected on AMD/Intel)");
                    return None;
                }
            };
            // The field types drive `get`'s inference, so each lookup is checked against the
            // signature declared on `CudaApi`. The temporary `Symbol` borrows of `lib` all end
            // with this statement, i.e. before the `forget` below.
            let table = CudaApi {
                cuInit: *lib.get(b"cuInit\0").ok()?,
                cuDeviceGet: *lib.get(b"cuDeviceGet\0").ok()?,
                cuCtxCreate_v2: *lib.get(b"cuCtxCreate_v2\0").ok()?,
                cuCtxSetCurrent: *lib.get(b"cuCtxSetCurrent\0").ok()?,
                cuMemAllocPitch_v2: *lib.get(b"cuMemAllocPitch_v2\0").ok()?,
                cuMemFree_v2: *lib.get(b"cuMemFree_v2\0").ok()?,
                cuMemcpy2DAsync_v2: *lib.get(b"cuMemcpy2DAsync_v2\0").ok()?,
                cuStreamSynchronize: *lib.get(b"cuStreamSynchronize\0").ok()?,
                cuCtxGetStreamPriorityRange: *lib.get(b"cuCtxGetStreamPriorityRange\0").ok()?,
                cuStreamCreateWithPriority: *lib.get(b"cuStreamCreateWithPriority\0").ok()?,
                cuGraphicsGLRegisterImage: *lib.get(b"cuGraphicsGLRegisterImage\0").ok()?,
                cuGraphicsMapResources: *lib.get(b"cuGraphicsMapResources\0").ok()?,
                cuGraphicsUnmapResources: *lib.get(b"cuGraphicsUnmapResources\0").ok()?,
                cuGraphicsSubResourceGetMappedArray: *lib
                    .get(b"cuGraphicsSubResourceGetMappedArray\0")
                    .ok()?,
                cuGraphicsUnregisterResource: *lib.get(b"cuGraphicsUnregisterResource\0").ok()?,
                cuImportExternalMemory: *lib.get(b"cuImportExternalMemory\0").ok()?,
                cuExternalMemoryGetMappedBuffer: *lib
                    .get(b"cuExternalMemoryGetMappedBuffer\0")
                    .ok()?,
                cuDestroyExternalMemory: *lib.get(b"cuDestroyExternalMemory\0").ok()?,
            };
            // Leak the handle: libcuda must stay mapped for as long as the fn pointers are used,
            // which is the rest of the process.
            std::mem::forget(lib);
            Some(table)
        })
        .as_ref()
}
// Same-named wrappers so the call sites below are unchanged. Each forwards through the dlopen'd
// table, or returns `CU_ERROR_NOT_LOADED` when the driver is absent (AMD/Intel) — which the
// `CUresult` checks already handle. Only `context()` is reachable before the driver is confirmed
// present; every other entry runs after `context()` succeeded, so its wrapper always hits `Some`.
/// Forward to the driver's `cuInit` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuInit`.
unsafe fn cuInit(flags: c_uint) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuInit)(flags)
}
/// Forward to the driver's `cuDeviceGet` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuDeviceGet`; `device` must be valid for a write.
unsafe fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuDeviceGet)(device, ordinal)
}
/// Forward to the driver's `cuCtxCreate_v2` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuCtxCreate_v2`; `pctx` must be valid for a write.
unsafe fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuCtxCreate_v2)(pctx, flags, dev)
}
/// Forward to the driver's `cuCtxSetCurrent` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuCtxSetCurrent`.
unsafe fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuCtxSetCurrent)(ctx)
}
/// Forward to the driver's `cuMemAllocPitch_v2` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuMemAllocPitch_v2`; `dptr` and `pitch` must each be valid
/// for a write.
unsafe fn cuMemAllocPitch_v2(
    dptr: *mut CUdeviceptr,
    pitch: *mut usize,
    width_bytes: usize,
    height: usize,
    element_size: c_uint,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuMemAllocPitch_v2)(dptr, pitch, width_bytes, height, element_size)
}
/// Forward to the driver's `cuMemFree_v2` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuMemFree_v2`.
unsafe fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuMemFree_v2)(dptr)
}
/// Forward to the driver's `cuMemcpy2DAsync_v2` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuMemcpy2DAsync_v2`; `copy` must point to a valid descriptor.
unsafe fn cuMemcpy2DAsync_v2(copy: *const CUDA_MEMCPY2D, stream: CUstream) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuMemcpy2DAsync_v2)(copy, stream)
}
/// Forward to the driver's `cuStreamSynchronize` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuStreamSynchronize`.
unsafe fn cuStreamSynchronize(stream: CUstream) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuStreamSynchronize)(stream)
}
/// Forward to the driver's `cuCtxGetStreamPriorityRange` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuCtxGetStreamPriorityRange`; both pointers are passed
/// straight through as out-parameters.
unsafe fn cuCtxGetStreamPriorityRange(least: *mut c_int, greatest: *mut c_int) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuCtxGetStreamPriorityRange)(least, greatest)
}
/// Forward to the driver's `cuStreamCreateWithPriority` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuStreamCreateWithPriority`; `stream` must be valid for a
/// write.
unsafe fn cuStreamCreateWithPriority(
    stream: *mut CUstream,
    flags: c_uint,
    priority: c_int,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuStreamCreateWithPriority)(stream, flags, priority)
}
/// Forward to the driver's `cuGraphicsGLRegisterImage` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuGraphicsGLRegisterImage`; `resource` must be valid for a
/// write.
unsafe fn cuGraphicsGLRegisterImage(
    resource: *mut CUgraphicsResource,
    texture: c_uint,
    target: c_uint,
    flags: c_uint,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsGLRegisterImage)(resource, texture, target, flags)
}
/// Forward to the driver's `cuGraphicsMapResources` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuGraphicsMapResources`; `resources` must point to `count`
/// handles.
unsafe fn cuGraphicsMapResources(
    count: c_uint,
    resources: *mut CUgraphicsResource,
    stream: *mut c_void,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsMapResources)(count, resources, stream)
}
/// Forward to the driver's `cuGraphicsUnmapResources` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuGraphicsUnmapResources`; `resources` must point to `count`
/// handles.
unsafe fn cuGraphicsUnmapResources(
    count: c_uint,
    resources: *mut CUgraphicsResource,
    stream: *mut c_void,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsUnmapResources)(count, resources, stream)
}
/// Forward to the driver's `cuGraphicsSubResourceGetMappedArray` through the dlopen'd table, or
/// report `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuGraphicsSubResourceGetMappedArray`; `array` must be valid
/// for a write.
unsafe fn cuGraphicsSubResourceGetMappedArray(
    array: *mut CUarray,
    resource: CUgraphicsResource,
    array_index: c_uint,
    mip_level: c_uint,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsSubResourceGetMappedArray)(array, resource, array_index, mip_level)
}
/// Forward to the driver's `cuGraphicsUnregisterResource` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuGraphicsUnregisterResource`.
unsafe fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsUnregisterResource)(resource)
}
/// Forward to the driver's `cuImportExternalMemory` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuImportExternalMemory`; `ext_mem_out` must be valid for a
/// write and `mem_handle_desc` must point to a valid descriptor.
unsafe fn cuImportExternalMemory(
    ext_mem_out: *mut CUexternalMemory,
    mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuImportExternalMemory)(ext_mem_out, mem_handle_desc)
}
/// Forward to the driver's `cuExternalMemoryGetMappedBuffer` through the dlopen'd table, or
/// report `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuExternalMemoryGetMappedBuffer`; `dev_ptr` must be valid for
/// a write and `buffer_desc` must point to a valid descriptor.
unsafe fn cuExternalMemoryGetMappedBuffer(
    dev_ptr: *mut CUdeviceptr,
    ext_mem: CUexternalMemory,
    buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuExternalMemoryGetMappedBuffer)(dev_ptr, ext_mem, buffer_desc)
}
/// Forward to the driver's `cuDestroyExternalMemory` through the dlopen'd table, or report
/// `CU_ERROR_NOT_LOADED` when the table is absent.
///
/// # Safety
/// Same contract as the driver's `cuDestroyExternalMemory`.
unsafe fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuDestroyExternalMemory)(ext_mem)
}
#[inline]
fn ck(r: CUresult, what: &str) -> Result<()> {
if r == 0 {
Ok(())
} else {
bail!("CUDA driver error {r} in {what}")
}
}
/// Copy a pitched device plane `(src_ptr, src_pitch)` down to a tightly-packed host buffer of
/// `width_bytes`×`height` (no row padding). Synchronous on the priority stream. Used by the NV12
/// self-test to read planes back for the colour comparison; not on the hot path.
pub fn read_plane_to_host(
src_ptr: CUdeviceptr,
src_pitch: usize,
width_bytes: usize,
height: usize,
) -> Result<Vec<u8>> {
let mut host = vec![0u8; width_bytes * height];
let copy = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_DEVICE,
srcDevice: src_ptr,
srcPitch: src_pitch,
dstMemoryType: 1, // CU_MEMORYTYPE_HOST
dstHost: host.as_mut_ptr() as *mut c_void,
dstPitch: width_bytes,
WidthInBytes: width_bytes,
Height: height,
..Default::default()
};
// SAFETY: `copy_blocking` is unsafe because it issues a CUDA copy; its contract is a valid
// descriptor with the shared context current (the caller's responsibility — self-test path).
// `&copy` is a live local `#[repr(C)] CUDA_MEMCPY2D` that outlives the synchronous call:
// `srcDevice`/`srcPitch` are the caller's live pitched device plane, `dstHost` addresses the
// freshly-allocated `host` `Vec` of exactly `width_bytes*height` bytes, and `WidthInBytes`×
// `Height` fit both. The copy is synchronous, so `host` is fully written before we return it.
unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->host)")? };
Ok(host)
}
/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
///
/// Field `0` is the raw driver handle. The wrapper is `Copy`, so every copy refers to the same
/// underlying context.
#[derive(Clone, Copy)]
pub struct Context(pub CUcontext);
// SAFETY: `CUcontext` is an opaque CUDA driver handle, not a dereferenceable Rust pointer. It is
// created once and never destroyed (process lifetime), and the only thing done with it is
// `cuCtxSetCurrent`, which the Driver API explicitly allows from any thread — so transferring the
// handle to another thread cannot dangle or race (the driver owns the synchronization).
unsafe impl Send for Context {}
// SAFETY: as above — the wrapped handle is an immutable opaque address and the driver does all the
// synchronization, so sharing `&Context` across threads is sound.
unsafe impl Sync for Context {}
/// Storage for the shared context. Empty until the first successful [`context`] call; in the
/// code in view that function is the only initializer.
static CONTEXT: OnceLock<Context> = OnceLock::new();
/// Return the shared CUDA context, creating it on device ordinal 0 the first time.
///
/// # Errors
/// Fails if `libcuda` could not be loaded, or if `cuInit`, `cuDeviceGet` or `cuCtxCreate_v2`
/// reports a driver error.
pub fn context() -> Result<CUcontext> {
    // Fast path: already created.
    if let Some(existing) = CONTEXT.get() {
        return Ok(existing.0);
    }
    if cuda_api().is_none() {
        bail!("libcuda.so.1 not available — no NVIDIA driver (CUDA zero-copy disabled)");
    }
    let mut device: CUdevice = 0;
    let mut created: CUcontext = std::ptr::null_mut();
    // SAFETY: we returned above unless `cuda_api()` is `Some`, so every wrapper here forwards into
    // the live, leaked `libcuda` table rather than the not-loaded stub. `cuInit(0)` passes the
    // API-required flags value 0. `&mut device`/`&mut created` are live, zero/null-initialized
    // stack out-params the driver writes the device handle / new context into; each outlives its
    // synchronous call and they are distinct locals (no aliasing). `cuCtxCreate_v2` yields a valid
    // `CUcontext` on success (`ck` bails otherwise), which is what `created` holds afterwards.
    unsafe {
        ck(cuInit(0), "cuInit")?;
        ck(cuDeviceGet(&mut device, 0), "cuDeviceGet")?;
        ck(
            cuCtxCreate_v2(&mut created, CU_CTX_SCHED_BLOCKING_SYNC, device),
            "cuCtxCreate_v2",
        )?;
    }
    // Two threads may both get this far on first use. Whichever stores first wins and its
    // context is the one everybody uses; the other thread's freshly created context is simply
    // leaked (rare, and process-lifetime anyway).
    let shared = CONTEXT.get_or_init(|| Context(created));
    Ok(shared.0)
}
/// Bind the shared context to the calling thread. Must be done before any CUDA operation in this
/// module is issued from that thread.
///
/// # Errors
/// Propagates a [`context`] failure, or the driver error from `cuCtxSetCurrent`.
pub fn make_current() -> Result<()> {
    let shared = context()?;
    // SAFETY: `shared` came from `context()?`, so it is the live shared `CUcontext` and the driver
    // table is present. `cuCtxSetCurrent` binds that opaque handle to the calling thread; it takes
    // no Rust-memory pointer and is thread-safe (affects only this thread's current context), so
    // there is no aliasing or lifetime hazard.
    let status = unsafe { cuCtxSetCurrent(shared) };
    ck(status, "cuCtxSetCurrent")
}
thread_local! {
/// Per-thread copy stream. `None` until first use; `Some(null)` means "creation failed, use the
/// default (NULL) stream". Per-thread (not shared) so each worker's `cuStreamSynchronize` waits
/// only on ITS OWN copies — the old per-frame `cuCtxSynchronize` was context-wide and also
/// blocked on the other worker thread's in-flight NULL-stream copies.
///
/// A plain `Cell` is enough here: the stored `Option<CUstream>` is a `Copy` raw-pointer value
/// that is only read and replaced whole.
static COPY_STREAM: std::cell::Cell<Option<CUstream>> = const { std::cell::Cell::new(None) };
}
/// Return this thread's copy stream, creating it on first use (the shared context must already
/// be current on the calling thread).
///
/// The stream is requested at the `greatest` priority reported by `cuCtxGetStreamPriorityRange`
/// (numerically the lowest value). That is only a scheduler hint: it is intra-process, and
/// NVIDIA's Linux driver may ignore it or decline to preempt a saturating game's graphics
/// context — "measure-then-keep". It cannot regress: if either driver call fails, a null handle
/// (the NULL stream) is cached and returned instead, and creation is not retried on this thread.
fn copy_stream() -> CUstream {
    COPY_STREAM.with(|slot| {
        if let Some(cached) = slot.get() {
            return cached;
        }
        let (mut least, mut greatest) = (0i32, 0i32);
        let mut created: CUstream = std::ptr::null_mut();
        // SAFETY: `copy_stream` runs with the shared context current (its doc contract), so the
        // wrappers forward into the live `libcuda` table. `&mut least`/`&mut greatest` are live
        // stack `i32`s the driver fills with the priority range; `&mut created` is a live
        // null-init `CUstream` the driver writes the new stream into. All out-params outlive
        // their synchronous calls and are distinct locals. `&&` short-circuits, so the stream is
        // only created once the range query succeeded.
        let ok = unsafe {
            cuCtxGetStreamPriorityRange(&mut least, &mut greatest) == 0
                && cuStreamCreateWithPriority(&mut created, CU_STREAM_NON_BLOCKING, greatest) == 0
        };
        // On any failure fall back to the NULL stream and never hand out whatever the driver may
        // have left in `created`.
        let stream = if ok {
            tracing::debug!(
                priority = greatest,
                "CUDA high-priority copy stream created"
            );
            created
        } else {
            std::ptr::null_mut()
        };
        slot.set(Some(stream));
        stream
    })
}
/// Enqueue `copy` on the calling thread's priority stream, then wait for that stream to drain.
///
/// Stands in for the earlier per-frame `cuMemcpy2D_v2` + context-wide `cuCtxSynchronize` pair. The
/// completion guarantee is the same (once this returns the source dmabuf may be recycled), but
/// the wait covers only our own stream and the copy carries the high-priority hint. `what` labels
/// the copy in the error if enqueueing fails.
///
/// # Safety
/// The shared context must be current on this thread and `copy` must describe memory that is
/// valid for the whole call.
unsafe fn copy_blocking(copy: &CUDA_MEMCPY2D, what: &str) -> Result<()> {
    let stream = copy_stream();
    let enqueued = cuMemcpy2DAsync_v2(copy, stream);
    ck(enqueued, what)?;
    let drained = cuStreamSynchronize(stream);
    ck(drained, "cuStreamSynchronize")
}
/// Allocate a single pitched device buffer sized for `width`×`height` pixels at 4 bytes each.
///
/// Returns the device pointer together with the row pitch chosen by the driver.
///
/// # Errors
/// Propagates the driver error from `cuMemAllocPitch_v2`.
fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
    let row_bytes = width as usize * 4;
    let rows = height as usize;
    let (mut ptr, mut pitch): (CUdeviceptr, usize) = (0, 0);
    // SAFETY: `cuMemAllocPitch_v2` allocates a pitched device buffer (the wrapper forwards to the
    // live table on any path that reached allocation). `&mut ptr` (`CUdeviceptr`) and `&mut pitch`
    // (`usize`) are live, distinct stack out-params the driver writes the allocation pointer and
    // its pitch into; both outlive the synchronous call. Width/height/element-size are by-value
    // ints. No aliasing — two separate locals.
    let status = unsafe { cuMemAllocPitch_v2(&mut ptr, &mut pitch, row_bytes, rows, 16) };
    ck(status, "cuMemAllocPitch_v2")?;
    Ok((ptr, pitch))
}
/// Allocate the two pitched planes of an NV12 surface (8-bit BT.709 4:2:0): a `width`-byte Y plane
/// (W×H, 1 byte/px) and an interleaved chroma plane (W/2 × H/2 samples, 2 bytes/sample → W bytes
/// wide). Both planes share the driver's Y pitch (the wider request), so the encoder's two-plane
/// surface and ours line up. Returns `((y_ptr, y_pitch), (uv_ptr, uv_pitch))`.
fn alloc_pitched_nv12(
width: u32,
height: u32,
) -> Result<((CUdeviceptr, usize), (CUdeviceptr, usize))> {
let mut y_ptr: CUdeviceptr = 0;
let mut y_pitch: usize = 0;
let mut uv_ptr: CUdeviceptr = 0;
let mut uv_pitch: usize = 0;
// SAFETY: two independent `cuMemAllocPitch_v2` calls (wrapper → live table). `&mut y_ptr`/
// `&mut y_pitch` and `&mut uv_ptr`/`&mut uv_pitch` are live, distinct stack out-params the
// driver writes each plane's pointer and pitch into; all outlive their synchronous calls. The
// dimension/element-size args are by-value ints. No aliasing — four separate locals.
unsafe {
ck(
cuMemAllocPitch_v2(
&mut y_ptr,
&mut y_pitch,
width as usize,
height as usize,
16,
),
"cuMemAllocPitch_v2(Y)",
)?;
// Chroma is W/2 samples wide at 2 bytes each = W bytes; H/2 rows.
ck(
cuMemAllocPitch_v2(
&mut uv_ptr,
&mut uv_pitch,
(width as usize / 2) * 2,
(height as usize / 2).max(1),
16,
),
"cuMemAllocPitch_v2(UV)",
)?;
}
Ok(((y_ptr, y_pitch), (uv_ptr, uv_pitch)))
}
/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the
/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and
/// returns its allocation here. Bulk-freed when the last reference drops. For NV12 each free entry
/// is the Y plane *and* its paired UV plane (allocated/recycled/freed together).
struct PoolInner {
/// Idle allocations ready for reuse: the whole buffer for 4-byte pools, the Y plane for NV12.
free: Vec<CUdeviceptr>,
/// NV12 only: the UV plane paired with each Y plane in `free` (same index, same length).
free_uv: Vec<CUdeviceptr>,
}
/// Releases every allocation still sitting in the free lists.
impl Drop for PoolInner {
    fn drop(&mut self) {
        // SAFETY: the pool only exists because allocation succeeded, so the driver table is live.
        // `PoolInner` drops only once every `DeviceBuffer` that referenced it (each holds an `Arc`
        // clone) has been recycled, so `free`/`free_uv` hold every outstanding allocation exactly
        // once and nothing else still uses them — no double-free or use-after-free. We make the
        // shared context current first (drop may run off the allocating thread) so `cuMemFree_v2`
        // targets the right context. Each pointer is a `CUdeviceptr` previously returned by
        // `cuMemAllocPitch_v2`; results are ignored (best-effort teardown).
        unsafe {
            if let Some(shared) = CONTEXT.get() {
                let _ = cuCtxSetCurrent(shared.0);
            }
            // Same order as before: all of `free`, then all of `free_uv`.
            for ptr in self.free.iter().chain(self.free_uv.iter()).copied() {
                let _ = cuMemFree_v2(ptr);
            }
        }
    }
}
/// A pool of reusable pitched device buffers for a fixed resolution. Eliminates the per-frame
/// `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K) that takes the device allocator lock
/// and serializes against the GPU every frame.
///
/// Cloning is cheap and yields another handle onto the same free-list (the `Arc` is shared).
#[derive(Clone)]
pub struct BufferPool {
/// Shared free-list; every buffer handed out keeps its own clone of this `Arc`.
inner: Arc<Mutex<PoolInner>>,
/// Frame width in pixels, fixed at construction.
width: u32,
/// Frame height in pixels, fixed at construction.
height: u32,
/// Row pitch learned from the first allocation and stamped on every buffer from this pool
/// (for NV12 pools this is the Y-plane pitch).
pitch: usize,
/// NV12 pools carry a second (chroma) pitch; `Some` ⇒ buffers from this pool have a UV plane.
uv_pitch: Option<usize>,
}
impl BufferPool {
    /// Build a pool of `width`×`height` 4-byte-per-pixel buffers.
    ///
    /// One buffer is allocated immediately: that is how the pool learns the driver's pitch
    /// (constant for a given width), and it seeds the free-list.
    pub fn new(width: u32, height: u32) -> Result<BufferPool> {
        let (first, pitch) = alloc_pitched(width, height)?;
        let seed = PoolInner {
            free: vec![first],
            free_uv: Vec::new(),
        };
        Ok(BufferPool {
            inner: Arc::new(Mutex::new(seed)),
            width,
            height,
            pitch,
            uv_pitch: None,
        })
    }

    /// Build a pool of NV12 two-plane surfaces (Y + interleaved UV) for `width`×`height`.
    ///
    /// One Y/UV pair is allocated immediately to learn the per-plane pitches (constant for a
    /// given width); the pair seeds the free-lists.
    pub fn new_nv12(width: u32, height: u32) -> Result<BufferPool> {
        let ((first_y, y_pitch), (first_uv, uv_pitch)) = alloc_pitched_nv12(width, height)?;
        let seed = PoolInner {
            free: vec![first_y],
            free_uv: vec![first_uv],
        };
        Ok(BufferPool {
            inner: Arc::new(Mutex::new(seed)),
            width,
            height,
            pitch: y_pitch,
            uv_pitch: Some(uv_pitch),
        })
    }

    /// Frame width this pool was created for.
    pub fn width(&self) -> u32 {
        self.width
    }

    /// Frame height this pool was created for.
    pub fn height(&self) -> u32 {
        self.height
    }

    /// Hand out a buffer: a recycled one when the free-list has any, a fresh allocation
    /// otherwise. Dropping the buffer returns it to this pool (the consumer has synchronized by
    /// then, so the GPU is done with it). Buffers from an NV12 pool carry both the Y plane and
    /// its paired UV plane.
    ///
    /// The free-list lock is released before any fresh allocation is attempted.
    pub fn get(&self) -> Result<DeviceBuffer> {
        let (ptr, uv) = match self.uv_pitch {
            Some(uv_pitch) => {
                let recycled = {
                    let mut guard = self.inner.lock().unwrap();
                    let y = guard.free.pop();
                    y.map(|y| (y, guard.free_uv.pop()))
                };
                let (y_ptr, uv_ptr) = match recycled {
                    // Y and UV are pushed/popped together, so a popped Y always has its UV.
                    Some((y, Some(uv))) => (y, uv),
                    _ => {
                        let ((y, _), (uv, _)) = alloc_pitched_nv12(self.width, self.height)?;
                        (y, uv)
                    }
                };
                (y_ptr, Some((uv_ptr, uv_pitch)))
            }
            None => {
                let recycled = self.inner.lock().unwrap().free.pop();
                let ptr = if let Some(p) = recycled {
                    p
                } else {
                    alloc_pitched(self.width, self.height)?.0
                };
                (ptr, None)
            }
        };
        Ok(DeviceBuffer {
            ptr,
            pitch: self.pitch,
            width: self.width,
            height: self.height,
            uv,
            pool: Some(Arc::clone(&self.inner)),
        })
    }
}
/// A pitched device buffer holding one captured frame. Filled by a copy from the EGL-mapped
/// dmabuf (so the dmabuf can be returned to the compositor immediately) and read by the encoder.
/// When it came from a [`BufferPool`] it recycles on drop; otherwise it frees.
pub struct DeviceBuffer {
/// Device address of the buffer (the Y plane for NV12). `0` is treated as "nothing to release"
/// by `Drop`.
pub ptr: CUdeviceptr,
/// Row stride of [`Self::ptr`] in bytes, as reported by the driver at allocation time.
pub pitch: usize,
/// Frame width in pixels.
pub width: u32,
/// Frame height in pixels.
pub height: u32,
/// NV12 only: the interleaved chroma plane `(ptr, pitch)` paired with the Y plane in
/// [`Self::ptr`]. `None` for the default 4-byte RGB/BGRx path. When `Some`, [`Self::ptr`] is the
/// Y plane (1 byte/px).
pub uv: Option<(CUdeviceptr, usize)>,
/// Free-list to return the allocation to on drop; `None` for standalone buffers, which free
/// their memory instead.
pool: Option<Arc<Mutex<PoolInner>>>,
}
impl DeviceBuffer {
    /// Allocate a standalone 4-byte-per-pixel buffer that is freed (not recycled) on drop.
    /// On the hot path use a [`BufferPool`] instead.
    pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
        alloc_pitched(width, height).map(|(ptr, pitch)| DeviceBuffer {
            ptr,
            pitch,
            width,
            height,
            uv: None,
            pool: None,
        })
    }

    /// Allocate a standalone NV12 two-plane buffer that is freed (not recycled) on drop. On the
    /// hot path use [`BufferPool::new_nv12`] instead; this one serves the self-test.
    pub fn alloc_nv12(width: u32, height: u32) -> Result<DeviceBuffer> {
        alloc_pitched_nv12(width, height).map(|((y_ptr, y_pitch), chroma)| DeviceBuffer {
            ptr: y_ptr,
            pitch: y_pitch,
            width,
            height,
            uv: Some(chroma),
            pool: None,
        })
    }

    /// Whether this buffer has an NV12 chroma plane attached.
    pub fn is_nv12(&self) -> bool {
        self.uv.is_some()
    }
}
/// Returns the allocation to its pool, or frees it when the buffer is standalone.
impl Drop for DeviceBuffer {
    fn drop(&mut self) {
        // A zero pointer is the "nothing allocated" sentinel — there is nothing to release.
        if self.ptr == 0 {
            return;
        }
        if let Some(pool) = &self.pool {
            // Recycle (the consumer synchronized before dropping, so the GPU is done with it). Y
            // and its paired UV go back together so `get` can re-pair them as a unit.
            //
            // A poisoned lock must not panic here: panicking inside `drop` while another panic
            // is unwinding aborts the process, and skipping the push would strand the device
            // memory. The free-lists are plain `Vec`s that stay structurally valid, so recover
            // the guard and carry on.
            let mut free_lists = pool
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            free_lists.free.push(self.ptr);
            if let Some((uv_ptr, _)) = self.uv {
                free_lists.free_uv.push(uv_ptr);
            }
        } else {
            // The buffer may be freed on the encode thread; cuMemFree needs a current context.
            // SAFETY: this is the un-pooled branch (`pool` is `None`), so this `DeviceBuffer`
            // exclusively owns `self.ptr` (and `self.uv`'s `uv_ptr`), each returned by
            // `cuMemAllocPitch_v2` and freed exactly once here — `drop` runs once and the
            // `self.ptr == 0` guard above skips the sentinel/empty case, so no double-free. We set
            // the shared context current first because drop may run on a thread where it isn't, and
            // `cuMemFree_v2` needs it. Wrapper → live table; results ignored (teardown).
            unsafe {
                if let Some(c) = CONTEXT.get() {
                    let _ = cuCtxSetCurrent(c.0);
                }
                let _ = cuMemFree_v2(self.ptr);
                if let Some((uv_ptr, _)) = self.uv {
                    let _ = cuMemFree_v2(uv_ptr);
                }
            }
        }
    }
}
/// A *persistent* GL-texture→CUDA registration. The desktop NVIDIA driver only supports CUDA
/// interop through GL textures (not dmabuf EGLImages directly), so the importer renders the
/// dmabuf into a reusable `GL_RGBA8` texture and registers *that* once — then each frame only
/// maps → copies the mapped array out → unmaps (the map/unmap pair is the GL↔CUDA sync point),
/// instead of registering/unregistering every frame. Unregisters on drop.
pub struct RegisteredTexture {
/// Registration handle filled in by `cuGraphicsGLRegisterImage` in `register_gl`; it is mapped
/// and unmapped around every copy and released with `cuGraphicsUnregisterResource` on drop.
resource: CUgraphicsResource,
}
impl RegisteredTexture {
/// Register a `GL_TEXTURE_2D` once.
///
/// # Safety
/// The GL context and the shared CUDA context must both be current on this thread, and
/// `texture` must be a valid `GL_TEXTURE_2D`.
pub unsafe fn register_gl(texture: u32) -> Result<RegisteredTexture> {
const GL_TEXTURE_2D: c_uint = 0x0DE1;
const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
let mut resource: CUgraphicsResource = std::ptr::null_mut();
ck(
cuGraphicsGLRegisterImage(
&mut resource,
texture,
GL_TEXTURE_2D,
CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY,
),
"cuGraphicsGLRegisterImage",
)?;
Ok(RegisteredTexture { resource })
}
/// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then
/// unmap. The copy is synchronized (on our priority stream) before unmap so `dst` is ready
/// before the source dmabuf is recycled. Always unmaps, even if the copy errors.
pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> {
// SAFETY: `self.resource` is the valid `CUgraphicsResource` from a successful `register_gl`
// (its only constructor), so the wrappers forward to the live table; the caller holds the
// GL+CUDA contexts current (the registration's contract). `cuGraphicsMapResources` maps
// `count == 1` resource via `&mut self.resource` (a live field) on the default stream;
// `cuGraphicsSubResourceGetMappedArray` writes the mapped `CUarray` into the live local
// `array` (index 0, mip 0). On failure we unmap and bail (balanced). `&copy` is a live
// local `CUDA_MEMCPY2D` outliving the synchronous `copy_blocking`: `srcArray` is valid
// while mapped, `dstDevice`/`dstPitch` are `dst`'s live allocation, `width*4`×`height` fit
// both. `copy_blocking` syncs before we unmap, so the array stays valid through the copy;
// we always unmap afterward (even on error), keeping the map/unmap pair balanced.
unsafe {
ck(
cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
"cuGraphicsMapResources",
)?;
let mut array: CUarray = std::ptr::null_mut();
if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
bail!("cuGraphicsSubResourceGetMappedArray failed");
}
let copy = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_ARRAY,
srcArray: array,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: dst.ptr,
dstPitch: dst.pitch,
WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
Height: dst.height as usize,
..Default::default()
};
let res = copy_blocking(&copy, "cuMemcpy2DAsync_v2");
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
res
}
}
/// Map this texture for the frame and copy its array into the device plane `(dst_ptr,
/// dst_pitch)`, taking `width_bytes`×`height` bytes (the GL internal format dictates
/// `width_bytes`: `width*1` for an `R8` luma target, `(width/2)*2` for an `RG8` chroma target).
/// Synchronized on our priority stream before unmap (so the source dmabuf is safe to recycle).
/// Always unmaps, even on copy error.
fn copy_mapped_plane(
&mut self,
dst_ptr: CUdeviceptr,
dst_pitch: usize,
width_bytes: usize,
height: usize,
) -> Result<()> {
// SAFETY: identical contract to `copy_mapped_to` — `self.resource` is the valid
// `CUgraphicsResource` from `register_gl` (wrappers → live table; caller holds GL+CUDA
// contexts current). Map `count == 1` resource via the live `&mut self.resource`; the
// mapped `CUarray` is written into the live local `array` (index 0, mip 0); on failure we
// unmap and bail (balanced). `&copy` is a live local outliving the synchronous
// `copy_blocking`: `srcArray` valid while mapped, `dstDevice`/`dstPitch` are the caller's
// live plane, `width_bytes`×`height` fit it. We always unmap afterward, even on copy error,
// so the map/unmap pair stays balanced and the array outlives the copy.
unsafe {
ck(
cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
"cuGraphicsMapResources",
)?;
let mut array: CUarray = std::ptr::null_mut();
if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
bail!("cuGraphicsSubResourceGetMappedArray failed");
}
let copy = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_ARRAY,
srcArray: array,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: dst_ptr,
dstPitch: dst_pitch,
WidthInBytes: width_bytes,
Height: height,
..Default::default()
};
let res = copy_blocking(&copy, "cuMemcpy2DAsync_v2(plane)");
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
res
}
}
}
/// Copy the two NV12 convert targets (registered `R8` luma + `RG8` chroma GL textures) into `dst`'s
/// Y and UV planes. `dst` must be an NV12 buffer (`dst.uv` set). The luma plane is `width`×`height`
/// bytes; the chroma plane is `(width/2)·2` bytes wide × `height/2` rows. Both copies sync on our
/// priority stream before returning, so the dmabuf is safe to recycle once this returns.
pub fn copy_mapped_nv12(
y_tex: &mut RegisteredTexture,
uv_tex: &mut RegisteredTexture,
dst: &DeviceBuffer,
) -> Result<()> {
let (uv_ptr, uv_pitch) = dst
.uv
.ok_or_else(|| anyhow::anyhow!("copy_mapped_nv12 on a non-NV12 buffer"))?;
let w = dst.width as usize;
let h = dst.height as usize;
y_tex.copy_mapped_plane(dst.ptr, dst.pitch, w, h)?;
uv_tex.copy_mapped_plane(uv_ptr, uv_pitch, (w / 2) * 2, h / 2)
}
/// Copy a pitched device buffer into another device region (device→device), e.g. our imported
/// [`DeviceBuffer`] into a pooled CUDA surface NVENC owns. Both are 4-byte (BGRx) pixels.
/// The caller must have the shared context current on this thread (see [`make_current`]).
pub fn copy_device_to_device(
src: &DeviceBuffer,
dst_ptr: CUdeviceptr,
dst_pitch: usize,
) -> Result<()> {
let copy = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_DEVICE,
srcDevice: src.ptr,
srcPitch: src.pitch,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: dst_ptr,
dstPitch: dst_pitch,
WidthInBytes: src.width as usize * 4,
Height: src.height as usize,
..Default::default()
};
// SAFETY: `copy_blocking` is unsafe (issues a CUDA copy); the caller must have the shared
// context current (documented). `&copy` is a live local device→device `CUDA_MEMCPY2D` outliving
// the synchronous call: `srcDevice`/`srcPitch` are `src`'s live allocation, `dstDevice`/
// `dstPitch` the caller's live region, `width*4`×`height` within both. Wrapper → live table.
unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->dev)") }
}
/// Copy our imported NV12 [`DeviceBuffer`] (Y + UV planes) into NVENC's two-plane CUDA surface
/// `(y_dst, y_pitch)` / `(uv_dst, uv_pitch)` (`av_hwframe_get_buffer`'s `data[0]`/`data[1]` +
/// `linesize[0]`/`linesize[1]`). The Y plane is `width`×`height` bytes; the chroma plane is
/// `(width/2)·2` bytes × `height/2` rows. The caller must have the shared context current.
pub fn copy_nv12_to_device(
src: &DeviceBuffer,
y_dst: CUdeviceptr,
y_pitch: usize,
uv_dst: CUdeviceptr,
uv_pitch: usize,
) -> Result<()> {
let (src_uv_ptr, src_uv_pitch) = src
.uv
.ok_or_else(|| anyhow::anyhow!("copy_nv12_to_device on a non-NV12 buffer"))?;
let w = src.width as usize;
let h = src.height as usize;
let y = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_DEVICE,
srcDevice: src.ptr,
srcPitch: src.pitch,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: y_dst,
dstPitch: y_pitch,
WidthInBytes: w, // 1 byte/px luma
Height: h,
..Default::default()
};
let uv = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_DEVICE,
srcDevice: src_uv_ptr,
srcPitch: src_uv_pitch,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: uv_dst,
dstPitch: uv_pitch,
WidthInBytes: (w / 2) * 2, // 2 bytes/sample interleaved U,V
Height: h / 2,
..Default::default()
};
// SAFETY: two unsafe `copy_blocking` device→device copies; the caller must have the shared
// context current (documented). `&y`/`&uv` are live local `CUDA_MEMCPY2D`s outliving each
// synchronous call. All four device pointers are valid: `src.ptr`/`src_uv_ptr` come from a live
// NV12 `DeviceBuffer` (its `.uv` presence was checked via `ok_or_else`), `y_dst`/`uv_dst` are
// the caller's live NVENC surface planes; the luma copy is `w`×`h`, the chroma copy
// `(w/2)*2`×`h/2`, each within its planes. Wrappers → live table.
unsafe {
copy_blocking(&y, "cuMemcpy2DAsync_v2(nv12 Y dev->dev)")?;
copy_blocking(&uv, "cuMemcpy2DAsync_v2(nv12 UV dev->dev)")
}
}
impl Drop for RegisteredTexture {
    /// Release the GL↔CUDA registration, if there is one. Failures are ignored.
    fn drop(&mut self) {
        let handle = self.resource;
        if handle.is_null() {
            // Nothing was registered, so there is nothing to release.
            return;
        }
        // SAFETY: `handle` is non-null (checked above) and is the valid `CUgraphicsResource` that
        // `register_gl` produced. This `RegisteredTexture` owns it exclusively and drop runs once,
        // so it is unregistered exactly once — no double-unregister and no later use.
        // `cuGraphicsUnregisterResource` releases the GL↔CUDA registration; wrapper → live table
        // (the resource exists ⇒ the driver was present). Result ignored (best-effort teardown).
        let _ = unsafe { cuGraphicsUnregisterResource(handle) };
    }
}
/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
pub struct ExternalDmabuf {
/// External-memory handle written by `cuImportExternalMemory`; destroyed with
/// `cuDestroyExternalMemory` on drop, after the mapping below has been freed.
ext: CUexternalMemory,
/// Device address written by `cuExternalMemoryGetMappedBuffer` for the mapping that starts at
/// offset 0 of the imported memory; freed with `cuMemFree_v2` on drop.
pub ptr: CUdeviceptr,
/// Size in bytes passed at import time; used both as the import size and the mapped length.
pub size: u64,
}
// SAFETY: the fields are opaque CUDA driver handles — an external-memory handle and a device
// pointer — not dereferenceable Rust memory, and the value is uniquely owned (no `Clone`). It is
// used from a single capture thread but constructed on / moved between threads with the importer;
// transferring these handles is sound because uniqueness rules out aliasing and they are destroyed
// exactly once in `Drop`. Only `Send` (not `Sync`) is asserted, matching the single-thread use.
unsafe impl Send for ExternalDmabuf {}
impl ExternalDmabuf {
    /// Import `fd` (NOT consumed — an internal `dup` is handed to the driver, which owns it
    /// from then on) and map its full `size` bytes to a device pointer. The shared context
    /// must be current.
    ///
    /// # Errors
    /// Fails if `dup` fails (the OS error is included in the message) or if
    /// [`ExternalDmabuf::import_owned_fd`] fails.
    pub fn import(fd: i32, size: u64) -> Result<ExternalDmabuf> {
        // SAFETY: `libc::dup` only reads the integer `fd` and returns a new descriptor (or -1); it
        // touches no Rust memory and `fd` is the caller's still-owned dmabuf fd (not consumed
        // here). No aliasing or lifetime concern — a pure syscall on an integer.
        let dup = unsafe { libc::dup(fd) };
        if dup < 0 {
            // Capture errno straight away, before any other call can overwrite it, so the failure
            // reports its cause like the driver-call failures below report their result code.
            let os_err = std::io::Error::last_os_error();
            bail!("dup(dmabuf fd) failed: {os_err}");
        }
        Self::import_owned_fd(dup, size)
    }

    /// Import an fd the caller hands over (e.g. a Vulkan-exported `OPAQUE_FD`) — consumed by
    /// the driver on success, closed by us on failure.
    ///
    /// # Errors
    /// Fails if `cuImportExternalMemory` rejects the fd (the fd is closed here) or if
    /// `cuExternalMemoryGetMappedBuffer` fails (the imported memory is destroyed here).
    pub fn import_owned_fd(dup: i32, size: u64) -> Result<ExternalDmabuf> {
        let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
            type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
            size,
            ..Default::default()
        };
        desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes)
        let mut ext: CUexternalMemory = std::ptr::null_mut();
        // SAFETY: `cuImportExternalMemory` imports the memory described by `&desc`, a live local
        // `#[repr(C)] CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h 64-bit layout) that outlives this
        // synchronous call: `type_` is OPAQUE_FD, `handle[0]` holds the dup'd fd in the union's
        // `int fd` low bytes, `size` is set. `&mut ext` is a live null-init out-param the driver
        // writes the imported handle into. The driver takes ownership of the fd only on success.
        // Distinct locals → no aliasing. Wrapper → live table (caller holds the context current).
        let r = unsafe { cuImportExternalMemory(&mut ext, &desc) };
        if r != 0 {
            // SAFETY: import failed (`r != 0`), so the driver did NOT take ownership of `dup`; we
            // still own it and close it exactly once here on the error path (the success path never
            // closes it — the driver does). `libc::close` acts on the integer fd alone.
            unsafe { libc::close(dup) }; // import failed → the driver did not take the fd
            bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?");
        }
        let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
            offset: 0,
            size,
            ..Default::default()
        };
        let mut ptr: CUdeviceptr = 0;
        // SAFETY: maps a device pointer from `ext` (the valid `CUexternalMemory` just imported) per
        // `&buf`, a live local `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (offset 0, full `size`) that
        // outlives this synchronous call. `&mut ptr` is a live zero-init out-param the driver writes
        // the mapped device address into; distinct locals → no aliasing. Wrapper → live table
        // (context current).
        let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) };
        if r != 0 {
            // SAFETY: mapping failed; `ext` is the valid `CUexternalMemory` we imported and
            // exclusively own. We destroy it exactly once here on the error path (the success path
            // instead moves it into the returned `ExternalDmabuf`, whose `Drop` destroys it),
            // releasing the fd the driver took — no double-destroy or use-after-free.
            unsafe {
                let _ = cuDestroyExternalMemory(ext);
            }
            bail!("cuExternalMemoryGetMappedBuffer failed ({r})");
        }
        Ok(ExternalDmabuf { ext, ptr, size })
    }
}
impl Drop for ExternalDmabuf {
    /// Tear down the mapping first and the external memory behind it second. Failures are
    /// ignored.
    fn drop(&mut self) {
        let (mapped, handle) = (self.ptr, self.ext);
        // SAFETY: an `ExternalDmabuf` is only ever built after a successful import, so the driver
        // table is live. This value exclusively owns `mapped` (the mapped buffer) and `handle` (the
        // external memory); each is released exactly once here because drop runs once and the
        // `!= 0` / `!is_null()` checks skip empty values — no double-free or use-after-free. The
        // shared context is made current first since drop may run off the import thread, and the
        // mapped buffer is freed before the external memory backing it is destroyed. Results
        // ignored (best-effort teardown).
        unsafe {
            if let Some(ctx) = CONTEXT.get() {
                let _ = cuCtxSetCurrent(ctx.0);
            }
            if mapped != 0 {
                // Mapped buffers are released the same way as ordinary device memory.
                let _ = cuMemFree_v2(mapped);
            }
            if !handle.is_null() {
                let _ = cuDestroyExternalMemory(handle);
            }
        }
    }
}
/// Copy a pitched span starting at `src_ptr` (e.g. an [`ExternalDmabuf`] mapping at the chunk
/// offset) into `dst`. The shared context must be current on this thread.
pub fn copy_pitched_to_buffer(
src_ptr: CUdeviceptr,
src_pitch: usize,
dst: &DeviceBuffer,
) -> Result<()> {
let copy = CUDA_MEMCPY2D {
srcMemoryType: CU_MEMORYTYPE_DEVICE,
srcDevice: src_ptr,
srcPitch: src_pitch,
dstMemoryType: CU_MEMORYTYPE_DEVICE,
dstDevice: dst.ptr,
dstPitch: dst.pitch,
WidthInBytes: dst.width as usize * 4,
Height: dst.height as usize,
..Default::default()
};
// copy_blocking syncs our priority stream before returning, so the copy is complete before the
// dmabuf is requeued to the producer.
// SAFETY: `copy_blocking` is unsafe (issues a CUDA copy); the caller must have the shared
// context current (documented). `&copy` is a live local device→device `CUDA_MEMCPY2D` outliving
// the synchronous call: `srcDevice`/`srcPitch` are the caller's live mapped span (e.g. an
// `ExternalDmabuf`), `dstDevice`/`dstPitch` are `dst`'s live allocation, `width*4`×`height`
// within both. Wrapper → live table.
unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(ext->dev)") }
}