//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the GL-interop //! driver calls we need (`cuGraphicsGLRegisterImage` & co.), so we hand-roll exactly those and //! `dlopen` `libcuda.so.1` at runtime (the driver library — NOT `libcudart`; NOT a link-time //! `#[link]`, so one binary runs on NVIDIA and on AMD/Intel where `libcuda` is absent — see //! [`CudaApi`]). Symbol names verified against //! `cust_raw` + `cudaGL.h`: the context/mem ops use the `_v2` ABI suffix; the graphics-interop //! ops are unsuffixed. (We use GL interop, not EGL interop: `cuGraphicsEGLRegisterImage` is //! Tegra-only on the desktop driver — see [`super::egl`].) //! //! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture //! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use. #![allow(non_camel_case_types, non_snake_case)] // Every `unsafe` block/impl below carries a `// SAFETY:` proof; enforce it (unsafe-proof program). #![deny(clippy::undocumented_unsafe_blocks)] use anyhow::{bail, Result}; use std::os::raw::{c_int, c_uint, c_void}; use std::sync::{Arc, Mutex, OnceLock}; pub type CUresult = c_uint; // CUDA_SUCCESS == 0 pub type CUdevice = c_int; pub type CUcontext = *mut c_void; // opaque CUctx_st* pub type CUstream = *mut c_void; // opaque CUstream_st* pub type CUdeviceptr = u64; pub type CUgraphicsResource = *mut c_void; pub type CUarray = *mut c_void; pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st* /// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4. pub const CU_MEMORYTYPE_DEVICE: c_uint = 2; pub const CU_MEMORYTYPE_ARRAY: c_uint = 3; /// `CUctx_flags` (cuda.h): block the CPU on an OS primitive while waiting for the GPU instead of /// busy-spinning. On this shared box (compositor + send thread on the same cores) spinning a core /// to detect copy completion steals CPU from the very threads we want scheduled; BLOCKING_SYNC /// frees it. 
Default (`CU_CTX_SCHED_AUTO=0`) heuristically picks SPIN vs YIELD by core count. const CU_CTX_SCHED_BLOCKING_SYNC: c_uint = 0x04; /// `cuStreamCreateWithPriority` flag: don't implicitly synchronize with the legacy NULL stream. const CU_STREAM_NON_BLOCKING: c_uint = 0x01; /// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing. #[repr(C)] #[derive(Default)] pub struct CUDA_MEMCPY2D { pub srcXInBytes: usize, pub srcY: usize, pub srcMemoryType: c_uint, pub srcHost: *const c_void, pub srcDevice: CUdeviceptr, pub srcArray: CUarray, pub srcPitch: usize, pub dstXInBytes: usize, pub dstY: usize, pub dstMemoryType: c_uint, pub dstHost: *mut c_void, pub dstDevice: CUdeviceptr, pub dstArray: CUarray, pub dstPitch: usize, pub WidthInBytes: usize, pub Height: usize, } /// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose /// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type /// only the first 4 bytes (the `int fd`) are read. #[repr(C)] #[derive(Default)] pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC { pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1 _pad: u32, pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject } pub size: u64, pub flags: c_uint, reserved: [c_uint; 16], _pad2: u32, } /// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout). #[repr(C)] #[derive(Default)] pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC { pub offset: u64, pub size: u64, pub flags: c_uint, reserved: [c_uint; 16], _pad: u32, } pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1; /// CUDA Driver API entry points, resolved at runtime from `libcuda.so.1` via `dlopen` rather than /// a link-time `#[link(name = "cuda")]`. This is what lets ONE host binary run on NVIDIA /// (zero-copy via CUDA → NVENC) *and* on AMD/Intel (VAAPI, where the NVIDIA driver — and thus /// `libcuda` — is absent): with a hard link the loader would refuse to start the binary at all. 
/// Every `cu*` call below goes through a same-named wrapper fn that forwards to this table; when /// the driver isn't present the table is `None` and the wrappers return a non-zero `CUresult`, so /// `context()` fails cleanly and the capturer falls back to the CPU path. The `cuda_api()` loader /// is memoised; the library handle is intentionally leaked (process-lifetime, like the context). struct CudaApi { cuInit: unsafe extern "C" fn(c_uint) -> CUresult, cuDeviceGet: unsafe extern "C" fn(*mut CUdevice, c_int) -> CUresult, cuCtxCreate_v2: unsafe extern "C" fn(*mut CUcontext, c_uint, CUdevice) -> CUresult, cuCtxSetCurrent: unsafe extern "C" fn(CUcontext) -> CUresult, cuMemAllocPitch_v2: unsafe extern "C" fn(*mut CUdeviceptr, *mut usize, usize, usize, c_uint) -> CUresult, cuMemFree_v2: unsafe extern "C" fn(CUdeviceptr) -> CUresult, cuMemcpy2DAsync_v2: unsafe extern "C" fn(*const CUDA_MEMCPY2D, CUstream) -> CUresult, cuStreamSynchronize: unsafe extern "C" fn(CUstream) -> CUresult, cuCtxGetStreamPriorityRange: unsafe extern "C" fn(*mut c_int, *mut c_int) -> CUresult, cuStreamCreateWithPriority: unsafe extern "C" fn(*mut CUstream, c_uint, c_int) -> CUresult, cuGraphicsGLRegisterImage: unsafe extern "C" fn(*mut CUgraphicsResource, c_uint, c_uint, c_uint) -> CUresult, cuGraphicsMapResources: unsafe extern "C" fn(c_uint, *mut CUgraphicsResource, *mut c_void) -> CUresult, cuGraphicsUnmapResources: unsafe extern "C" fn(c_uint, *mut CUgraphicsResource, *mut c_void) -> CUresult, cuGraphicsSubResourceGetMappedArray: unsafe extern "C" fn(*mut CUarray, CUgraphicsResource, c_uint, c_uint) -> CUresult, cuGraphicsUnregisterResource: unsafe extern "C" fn(CUgraphicsResource) -> CUresult, cuImportExternalMemory: unsafe extern "C" fn( *mut CUexternalMemory, *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC, ) -> CUresult, cuExternalMemoryGetMappedBuffer: unsafe extern "C" fn( *mut CUdeviceptr, CUexternalMemory, *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC, ) -> CUresult, cuDestroyExternalMemory: 
unsafe extern "C" fn(CUexternalMemory) -> CUresult, } // SAFETY: every field is a bare `extern "C" fn` address into the leaked, process-lifetime // `libcuda` mapping (`cuda_api` `forget`s the `Library`, so it is never unloaded) — an immutable // value with no interior mutability and no thread affinity. Moving the table to another thread // cannot dangle (the code it points at stays mapped) or race (the fields are read-only). unsafe impl Send for CudaApi {} // SAFETY: as above — the table is a set of immutable fn-pointer addresses with no interior // mutability, so concurrent shared reads from multiple threads cannot race; the driver entry // points they address are themselves thread-safe. unsafe impl Sync for CudaApi {} /// `CUresult` returned by the wrappers when `libcuda` isn't loaded (no NVIDIA driver). Non-zero so /// the existing `ck()`/`!= 0` checks treat it as an ordinary driver error; distinct from any real /// `CUDA_ERROR_*` (all < 1000). Never produced by the actual driver. const CU_ERROR_NOT_LOADED: CUresult = 999; static CUDA_API: OnceLock> = OnceLock::new(); /// Resolve `libcuda.so.1` and its symbols once. `None` when the NVIDIA driver isn't installed /// (the expected case on AMD/Intel hosts) — logged at debug, not an error. fn cuda_api() -> Option<&'static CudaApi> { CUDA_API // SAFETY: `Library::new` runs `libcuda.so.1`'s initializers — it is the trusted NVIDIA // driver library, so loading has no unexpected effects; `?`/`None` handle its absence. // Each `lib.get::(name)` asserts the symbol's real ABI equals `T`: every NUL-terminated // name is a documented CUDA Driver API entry point and `T` is the exact // `unsafe extern "C" fn(..)` signature from cuda.h/cudaGL.h (`_v2` for ctx/mem ops). Each // `Symbol` only borrows `lib` until the end of the struct-literal statement; we deref-copy // the raw fn-pointer out first, then `forget(lib)` leaks the mapping so those addresses // stay valid for the whole process. 
Runs once under the `OnceLock` init — no aliasing. .get_or_init(|| unsafe { let lib = libloading::Library::new("libcuda.so.1") .or_else(|_| libloading::Library::new("libcuda.so")) .map_err(|e| { tracing::debug!(error = %e, "libcuda not loadable — CUDA zero-copy unavailable (expected on AMD/Intel)"); }) .ok()?; // Resolve all symbols; the field types drive `get`'s inference. `lib` is leaked after // construction so the fn pointers stay valid for the process lifetime (the temporary // `Symbol` borrows end with the struct-literal statement, before the forget). let api = CudaApi { cuInit: *lib.get(b"cuInit\0").ok()?, cuDeviceGet: *lib.get(b"cuDeviceGet\0").ok()?, cuCtxCreate_v2: *lib.get(b"cuCtxCreate_v2\0").ok()?, cuCtxSetCurrent: *lib.get(b"cuCtxSetCurrent\0").ok()?, cuMemAllocPitch_v2: *lib.get(b"cuMemAllocPitch_v2\0").ok()?, cuMemFree_v2: *lib.get(b"cuMemFree_v2\0").ok()?, cuMemcpy2DAsync_v2: *lib.get(b"cuMemcpy2DAsync_v2\0").ok()?, cuStreamSynchronize: *lib.get(b"cuStreamSynchronize\0").ok()?, cuCtxGetStreamPriorityRange: *lib.get(b"cuCtxGetStreamPriorityRange\0").ok()?, cuStreamCreateWithPriority: *lib.get(b"cuStreamCreateWithPriority\0").ok()?, cuGraphicsGLRegisterImage: *lib.get(b"cuGraphicsGLRegisterImage\0").ok()?, cuGraphicsMapResources: *lib.get(b"cuGraphicsMapResources\0").ok()?, cuGraphicsUnmapResources: *lib.get(b"cuGraphicsUnmapResources\0").ok()?, cuGraphicsSubResourceGetMappedArray: *lib .get(b"cuGraphicsSubResourceGetMappedArray\0") .ok()?, cuGraphicsUnregisterResource: *lib.get(b"cuGraphicsUnregisterResource\0").ok()?, cuImportExternalMemory: *lib.get(b"cuImportExternalMemory\0").ok()?, cuExternalMemoryGetMappedBuffer: *lib .get(b"cuExternalMemoryGetMappedBuffer\0") .ok()?, cuDestroyExternalMemory: *lib.get(b"cuDestroyExternalMemory\0").ok()?, }; std::mem::forget(lib); // keep libcuda mapped for the fn pointers' lifetime (process) Some(api) }) .as_ref() } // Same-named wrappers so the call sites below are unchanged. 
Each forwards through the dlopen'd // table, or returns `CU_ERROR_NOT_LOADED` when the driver is absent (AMD/Intel) — which the // `CUresult` checks already handle. Only `context()` is reachable before the driver is confirmed // present; every other entry runs after `context()` succeeded, so its wrapper always hits `Some`. unsafe fn cuInit(flags: c_uint) -> CUresult { match cuda_api() { Some(a) => (a.cuInit)(flags), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult { match cuda_api() { Some(a) => (a.cuDeviceGet)(device, ordinal), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult { match cuda_api() { Some(a) => (a.cuCtxCreate_v2)(pctx, flags, dev), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult { match cuda_api() { Some(a) => (a.cuCtxSetCurrent)(ctx), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuMemAllocPitch_v2( dptr: *mut CUdeviceptr, pitch: *mut usize, width_bytes: usize, height: usize, element_size: c_uint, ) -> CUresult { match cuda_api() { Some(a) => (a.cuMemAllocPitch_v2)(dptr, pitch, width_bytes, height, element_size), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult { match cuda_api() { Some(a) => (a.cuMemFree_v2)(dptr), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuMemcpy2DAsync_v2(copy: *const CUDA_MEMCPY2D, stream: CUstream) -> CUresult { match cuda_api() { Some(a) => (a.cuMemcpy2DAsync_v2)(copy, stream), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuStreamSynchronize(stream: CUstream) -> CUresult { match cuda_api() { Some(a) => (a.cuStreamSynchronize)(stream), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuCtxGetStreamPriorityRange(least: *mut c_int, greatest: *mut c_int) -> CUresult { match cuda_api() { Some(a) => (a.cuCtxGetStreamPriorityRange)(least, greatest), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuStreamCreateWithPriority( stream: *mut 
CUstream, flags: c_uint, priority: c_int, ) -> CUresult { match cuda_api() { Some(a) => (a.cuStreamCreateWithPriority)(stream, flags, priority), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuGraphicsGLRegisterImage( resource: *mut CUgraphicsResource, texture: c_uint, target: c_uint, flags: c_uint, ) -> CUresult { match cuda_api() { Some(a) => (a.cuGraphicsGLRegisterImage)(resource, texture, target, flags), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuGraphicsMapResources( count: c_uint, resources: *mut CUgraphicsResource, stream: *mut c_void, ) -> CUresult { match cuda_api() { Some(a) => (a.cuGraphicsMapResources)(count, resources, stream), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuGraphicsUnmapResources( count: c_uint, resources: *mut CUgraphicsResource, stream: *mut c_void, ) -> CUresult { match cuda_api() { Some(a) => (a.cuGraphicsUnmapResources)(count, resources, stream), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuGraphicsSubResourceGetMappedArray( array: *mut CUarray, resource: CUgraphicsResource, array_index: c_uint, mip_level: c_uint, ) -> CUresult { match cuda_api() { Some(a) => (a.cuGraphicsSubResourceGetMappedArray)(array, resource, array_index, mip_level), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult { match cuda_api() { Some(a) => (a.cuGraphicsUnregisterResource)(resource), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuImportExternalMemory( ext_mem_out: *mut CUexternalMemory, mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC, ) -> CUresult { match cuda_api() { Some(a) => (a.cuImportExternalMemory)(ext_mem_out, mem_handle_desc), None => CU_ERROR_NOT_LOADED, } } unsafe fn cuExternalMemoryGetMappedBuffer( dev_ptr: *mut CUdeviceptr, ext_mem: CUexternalMemory, buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC, ) -> CUresult { match cuda_api() { Some(a) => (a.cuExternalMemoryGetMappedBuffer)(dev_ptr, ext_mem, buffer_desc), None => CU_ERROR_NOT_LOADED, } } unsafe fn 
cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult { match cuda_api() { Some(a) => (a.cuDestroyExternalMemory)(ext_mem), None => CU_ERROR_NOT_LOADED, } } #[inline] fn ck(r: CUresult, what: &str) -> Result<()> { if r == 0 { Ok(()) } else { bail!("CUDA driver error {r} in {what}") } } /// Copy a pitched device plane `(src_ptr, src_pitch)` down to a tightly-packed host buffer of /// `width_bytes`×`height` (no row padding). Synchronous on the priority stream. Used by the NV12 /// self-test to read planes back for the colour comparison; not on the hot path. pub fn read_plane_to_host( src_ptr: CUdeviceptr, src_pitch: usize, width_bytes: usize, height: usize, ) -> Result> { let mut host = vec![0u8; width_bytes * height]; let copy = CUDA_MEMCPY2D { srcMemoryType: CU_MEMORYTYPE_DEVICE, srcDevice: src_ptr, srcPitch: src_pitch, dstMemoryType: 1, // CU_MEMORYTYPE_HOST dstHost: host.as_mut_ptr() as *mut c_void, dstPitch: width_bytes, WidthInBytes: width_bytes, Height: height, ..Default::default() }; // SAFETY: `copy_blocking` is unsafe because it issues a CUDA copy; its contract is a valid // descriptor with the shared context current (the caller's responsibility — self-test path). // `©` is a live local `#[repr(C)] CUDA_MEMCPY2D` that outlives the synchronous call: // `srcDevice`/`srcPitch` are the caller's live pitched device plane, `dstHost` addresses the // freshly-allocated `host` `Vec` of exactly `width_bytes*height` bytes, and `WidthInBytes`× // `Height` fit both. The copy is synchronous, so `host` is fully written before we return it. unsafe { copy_blocking(©, "cuMemcpy2DAsync_v2(dev->host)")? }; Ok(host) } /// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live /// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread. #[derive(Clone, Copy)] pub struct Context(pub CUcontext); // SAFETY: `CUcontext` is an opaque CUDA driver handle, not a dereferenceable Rust pointer. 
It is // created once and never destroyed (process lifetime), and the only thing done with it is // `cuCtxSetCurrent`, which the Driver API explicitly allows from any thread — so transferring the // handle to another thread cannot dangle or race (the driver owns the synchronization). unsafe impl Send for Context {} // SAFETY: as above — the wrapped handle is an immutable opaque address and the driver does all the // synchronization, so sharing `&Context` across threads is sound. unsafe impl Sync for Context {} static CONTEXT: OnceLock = OnceLock::new(); /// Get (lazily creating) the shared CUDA context on device 0. pub fn context() -> Result { if let Some(c) = CONTEXT.get() { return Ok(c.0); } if cuda_api().is_none() { bail!("libcuda.so.1 not available — no NVIDIA driver (CUDA zero-copy disabled)"); } // SAFETY: we returned above unless `cuda_api()` is `Some`, so every wrapper here forwards into // the live, leaked `libcuda` table rather than the not-loaded stub. `cuInit(0)` passes the // API-required flags value 0. `&mut dev`/`&mut ctx` are live, zero/null-initialized stack // out-params the driver writes the device handle / new context into; each outlives its // synchronous call and they are distinct locals (no aliasing). `cuCtxCreate_v2` yields a valid // `CUcontext` on success (`ck` bails otherwise), which becomes the block's value. let ctx = unsafe { ck(cuInit(0), "cuInit")?; let mut dev: CUdevice = 0; ck(cuDeviceGet(&mut dev, 0), "cuDeviceGet")?; let mut ctx: CUcontext = std::ptr::null_mut(); ck( cuCtxCreate_v2(&mut ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), "cuCtxCreate_v2", )?; ctx }; // Racy first-init is fine: the winner's context is used; a loser leaks one context (rare, // process-lifetime). `get_or_init` keeps a single shared value. Ok(CONTEXT.get_or_init(|| Context(ctx)).0) } /// Make the shared context current on the calling thread (required before any CUDA op here). 
pub fn make_current() -> Result<()> { let ctx = context()?; // SAFETY: `ctx` came from `context()?`, so it is the live shared `CUcontext` and the driver // table is present. `cuCtxSetCurrent` binds that opaque handle to the calling thread; it takes // no Rust-memory pointer and is thread-safe (affects only this thread's current context), so // there is no aliasing or lifetime hazard. unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") } } thread_local! { /// Per-thread copy stream. `None` until first use; `Some(null)` means "creation failed, use the /// default (NULL) stream". Per-thread (not shared) so each worker's `cuStreamSynchronize` waits /// only on ITS OWN copies — the old per-frame `cuCtxSynchronize` was context-wide and also /// blocked on the other worker thread's in-flight NULL-stream copies. static COPY_STREAM: std::cell::Cell> = const { std::cell::Cell::new(None) }; } /// The calling thread's highest-priority copy stream (lazily created; context must be current). /// Carries the greatest stream priority the driver exposes — a scheduler hint that nudges our /// copies ahead of the game's queued compute. NOTE: stream priority is an intra-process hint and /// NVIDIA's Linux driver may ignore it / not preempt a saturating game's graphics context; this is /// "measure-then-keep", and it never regresses (falls back to the NULL stream). The greatest /// priority is the numerically-lowest value (`greatest` from `cuCtxGetStreamPriorityRange`). fn copy_stream() -> CUstream { COPY_STREAM.with(|cell| { if let Some(s) = cell.get() { return s; } // SAFETY: `copy_stream` runs with the shared context current (its doc contract), so the // wrappers forward into the live `libcuda` table. `&mut least`/`&mut greatest` are live // stack `i32`s the driver fills with the priority range; `&mut s` is a live null-init // `CUstream` the driver writes the new stream into. All out-params outlive their // synchronous calls and are distinct locals. 
On any non-zero result we fall back to a null // (NULL-stream) value and never read an uninitialized handle. let stream = unsafe { let (mut least, mut greatest) = (0i32, 0i32); if cuCtxGetStreamPriorityRange(&mut least, &mut greatest) != 0 { std::ptr::null_mut() } else { let mut s: CUstream = std::ptr::null_mut(); if cuStreamCreateWithPriority(&mut s, CU_STREAM_NON_BLOCKING, greatest) != 0 { std::ptr::null_mut() } else { tracing::debug!( priority = greatest, "CUDA high-priority copy stream created" ); s } } }; cell.set(Some(stream)); stream }) } /// Issue `copy` on this thread's priority stream and block until it completes. Replaces the /// per-frame `cuMemcpy2D_v2` + context-wide `cuCtxSynchronize` pair: same completion guarantee /// (the source dmabuf is safe to recycle once this returns), but the wait is scoped to our own /// stream and the copy carries the high priority hint. unsafe fn copy_blocking(copy: &CUDA_MEMCPY2D, what: &str) -> Result<()> { let stream = copy_stream(); ck(cuMemcpy2DAsync_v2(copy, stream), what)?; ck(cuStreamSynchronize(stream), "cuStreamSynchronize") } /// Allocate one pitched device buffer for `width`x`height` 4-byte pixels; returns `(ptr, pitch)`. fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> { let mut ptr: CUdeviceptr = 0; let mut pitch: usize = 0; // SAFETY: `cuMemAllocPitch_v2` allocates a pitched device buffer (the wrapper forwards to the // live table on any path that reached allocation). `&mut ptr` (`CUdeviceptr`) and `&mut pitch` // (`usize`) are live, distinct stack out-params the driver writes the allocation pointer and // its pitch into; both outlive the synchronous call. Width/height/element-size are by-value // ints. No aliasing — two separate locals. 
unsafe { ck( cuMemAllocPitch_v2( &mut ptr, &mut pitch, width as usize * 4, height as usize, 16, ), "cuMemAllocPitch_v2", )?; } Ok((ptr, pitch)) } /// Allocate the two pitched planes of an NV12 surface (8-bit BT.709 4:2:0): a `width`-byte Y plane /// (W×H, 1 byte/px) and an interleaved chroma plane (W/2 × H/2 samples, 2 bytes/sample → W bytes /// wide). Both planes share the driver's Y pitch (the wider request), so the encoder's two-plane /// surface and ours line up. Returns `((y_ptr, y_pitch), (uv_ptr, uv_pitch))`. fn alloc_pitched_nv12( width: u32, height: u32, ) -> Result<((CUdeviceptr, usize), (CUdeviceptr, usize))> { let mut y_ptr: CUdeviceptr = 0; let mut y_pitch: usize = 0; let mut uv_ptr: CUdeviceptr = 0; let mut uv_pitch: usize = 0; // SAFETY: two independent `cuMemAllocPitch_v2` calls (wrapper → live table). `&mut y_ptr`/ // `&mut y_pitch` and `&mut uv_ptr`/`&mut uv_pitch` are live, distinct stack out-params the // driver writes each plane's pointer and pitch into; all outlive their synchronous calls. The // dimension/element-size args are by-value ints. No aliasing — four separate locals. unsafe { ck( cuMemAllocPitch_v2( &mut y_ptr, &mut y_pitch, width as usize, height as usize, 16, ), "cuMemAllocPitch_v2(Y)", )?; // Chroma is W/2 samples wide at 2 bytes each = W bytes; H/2 rows. ck( cuMemAllocPitch_v2( &mut uv_ptr, &mut uv_pitch, (width as usize / 2) * 2, (height as usize / 2).max(1), 16, ), "cuMemAllocPitch_v2(UV)", )?; } Ok(((y_ptr, y_pitch), (uv_ptr, uv_pitch))) } /// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the /// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and /// returns its allocation here. Bulk-freed when the last reference drops. For NV12 each free entry /// is the Y plane *and* its paired UV plane (allocated/recycled/freed together). 
struct PoolInner { free: Vec, /// NV12 only: the UV plane paired with each Y plane in `free` (same index, same length). free_uv: Vec, } impl Drop for PoolInner { fn drop(&mut self) { // SAFETY: the pool only exists because allocation succeeded, so the driver table is live. // `PoolInner` drops only once every `DeviceBuffer` that referenced it (each holds an `Arc` // clone) has been recycled, so `free`/`free_uv` hold every outstanding allocation exactly // once and nothing else still uses them — no double-free or use-after-free. We make the // shared context current first (drop may run off the allocating thread) so `cuMemFree_v2` // targets the right context. Each `p` is a `CUdeviceptr` previously returned by // `cuMemAllocPitch_v2`; results are ignored (best-effort teardown). unsafe { if let Some(c) = CONTEXT.get() { let _ = cuCtxSetCurrent(c.0); } for &p in &self.free { let _ = cuMemFree_v2(p); } for &p in &self.free_uv { let _ = cuMemFree_v2(p); } } } } /// A pool of reusable pitched device buffers for a fixed resolution. Eliminates the per-frame /// `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K) that takes the device allocator lock /// and serializes against the GPU every frame. #[derive(Clone)] pub struct BufferPool { inner: Arc>, width: u32, height: u32, pitch: usize, /// NV12 pools carry a second (chroma) pitch; `Some` ⇒ buffers from this pool have a UV plane. uv_pitch: Option, } impl BufferPool { /// Create a pool for `width`x`height` 4-byte buffers (allocates one up front to learn the /// driver's pitch, which is constant for a given width). pub fn new(width: u32, height: u32) -> Result { let (ptr, pitch) = alloc_pitched(width, height)?; Ok(BufferPool { inner: Arc::new(Mutex::new(PoolInner { free: vec![ptr], free_uv: Vec::new(), })), width, height, pitch, uv_pitch: None, }) } /// Create a pool of NV12 two-plane surfaces (Y + interleaved UV) for `width`x`height`. 
Allocates /// one pair up front to learn the driver's per-plane pitches (constant for a given width). pub fn new_nv12(width: u32, height: u32) -> Result { let ((y_ptr, y_pitch), (uv_ptr, uv_pitch)) = alloc_pitched_nv12(width, height)?; Ok(BufferPool { inner: Arc::new(Mutex::new(PoolInner { free: vec![y_ptr], free_uv: vec![uv_ptr], })), width, height, pitch: y_pitch, uv_pitch: Some(uv_pitch), }) } pub fn width(&self) -> u32 { self.width } pub fn height(&self) -> u32 { self.height } /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this /// pool when dropped (after the consumer has synchronized, so the GPU is done with it). For an /// NV12 pool the returned buffer carries both the Y and the paired UV plane. pub fn get(&self) -> Result { if let Some(uv_pitch) = self.uv_pitch { let reuse = { let mut g = self.inner.lock().unwrap(); g.free.pop().map(|y| (y, g.free_uv.pop())) }; let (ptr, uv_ptr) = match reuse { // Y and UV are pushed/popped together, so a popped Y always has its UV. Some((y, Some(uv))) => (y, uv), _ => { let ((y, _), (uv, _)) = alloc_pitched_nv12(self.width, self.height)?; (y, uv) } }; return Ok(DeviceBuffer { ptr, pitch: self.pitch, width: self.width, height: self.height, uv: Some((uv_ptr, uv_pitch)), pool: Some(self.inner.clone()), }); } let reuse = self.inner.lock().unwrap().free.pop(); let ptr = match reuse { Some(p) => p, None => alloc_pitched(self.width, self.height)?.0, }; Ok(DeviceBuffer { ptr, pitch: self.pitch, width: self.width, height: self.height, uv: None, pool: Some(self.inner.clone()), }) } } /// A pitched device buffer holding one captured frame. Filled by a copy from the EGL-mapped /// dmabuf (so the dmabuf can be returned to the compositor immediately) and read by the encoder. /// When it came from a [`BufferPool`] it recycles on drop; otherwise it frees. 
pub struct DeviceBuffer { pub ptr: CUdeviceptr, pub pitch: usize, pub width: u32, pub height: u32, /// NV12 only: the interleaved chroma plane `(ptr, pitch)` paired with the Y plane in [`ptr`]. /// `None` for the default 4-byte RGB/BGRx path. When `Some`, [`ptr`] is the Y plane (1 byte/px). pub uv: Option<(CUdeviceptr, usize)>, pool: Option>>, } impl DeviceBuffer { /// Allocate a standalone (un-pooled) pitched buffer. Prefer [`BufferPool`] on the hot path. pub fn alloc(width: u32, height: u32) -> Result { let (ptr, pitch) = alloc_pitched(width, height)?; Ok(DeviceBuffer { ptr, pitch, width, height, uv: None, pool: None, }) } /// Allocate a standalone (un-pooled) NV12 two-plane buffer. Prefer [`BufferPool::new_nv12`] on /// the hot path; used by the self-test. pub fn alloc_nv12(width: u32, height: u32) -> Result { let ((y_ptr, y_pitch), (uv_ptr, uv_pitch)) = alloc_pitched_nv12(width, height)?; Ok(DeviceBuffer { ptr: y_ptr, pitch: y_pitch, width, height, uv: Some((uv_ptr, uv_pitch)), pool: None, }) } /// True if this buffer carries an NV12 chroma plane. pub fn is_nv12(&self) -> bool { self.uv.is_some() } } impl Drop for DeviceBuffer { fn drop(&mut self) { if self.ptr == 0 { return; } if let Some(pool) = &self.pool { // Recycle (the consumer synchronized before dropping, so the GPU is done with it). Y and // its paired UV go back together so `get` can repair them as a unit. let mut g = pool.lock().unwrap(); g.free.push(self.ptr); if let Some((uv_ptr, _)) = self.uv { g.free_uv.push(uv_ptr); } } else { // The buffer may be freed on the encode thread; cuMemFree needs a current context. // SAFETY: this is the un-pooled branch (`pool` is `None`), so this `DeviceBuffer` // exclusively owns `self.ptr` (and `self.uv`'s `uv_ptr`), each returned by // `cuMemAllocPitch_v2` and freed exactly once here — `drop` runs once and the // `self.ptr == 0` guard above skips the sentinel/empty case, so no double-free. 
We set // the shared context current first because drop may run on a thread where it isn't, and // `cuMemFree_v2` needs it. Wrapper → live table; results ignored (teardown). unsafe { if let Some(c) = CONTEXT.get() { let _ = cuCtxSetCurrent(c.0); } let _ = cuMemFree_v2(self.ptr); if let Some((uv_ptr, _)) = self.uv { let _ = cuMemFree_v2(uv_ptr); } } } } } /// A *persistent* GL-texture→CUDA registration. The desktop NVIDIA driver only supports CUDA /// interop through GL textures (not dmabuf EGLImages directly), so the importer renders the /// dmabuf into a reusable `GL_RGBA8` texture and registers *that* once — then each frame only /// maps → copies the mapped array out → unmaps (the map/unmap pair is the GL↔CUDA sync point), /// instead of registering/unregistering every frame. Unregisters on drop. pub struct RegisteredTexture { resource: CUgraphicsResource, } impl RegisteredTexture { /// Register a `GL_TEXTURE_2D` once. /// /// # Safety /// The GL context and the shared CUDA context must both be current on this thread, and /// `texture` must be a valid `GL_TEXTURE_2D`. pub unsafe fn register_gl(texture: u32) -> Result { const GL_TEXTURE_2D: c_uint = 0x0DE1; const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01; let mut resource: CUgraphicsResource = std::ptr::null_mut(); ck( cuGraphicsGLRegisterImage( &mut resource, texture, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY, ), "cuGraphicsGLRegisterImage", )?; Ok(RegisteredTexture { resource }) } /// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then /// unmap. The copy is synchronized (on our priority stream) before unmap so `dst` is ready /// before the source dmabuf is recycled. Always unmaps, even if the copy errors. 
pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> { // SAFETY: `self.resource` is the valid `CUgraphicsResource` from a successful `register_gl` // (its only constructor), so the wrappers forward to the live table; the caller holds the // GL+CUDA contexts current (the registration's contract). `cuGraphicsMapResources` maps // `count == 1` resource via `&mut self.resource` (a live field) on the default stream; // `cuGraphicsSubResourceGetMappedArray` writes the mapped `CUarray` into the live local // `array` (index 0, mip 0). On failure we unmap and bail (balanced). `©` is a live // local `CUDA_MEMCPY2D` outliving the synchronous `copy_blocking`: `srcArray` is valid // while mapped, `dstDevice`/`dstPitch` are `dst`'s live allocation, `width*4`×`height` fit // both. `copy_blocking` syncs before we unmap, so the array stays valid through the copy; // we always unmap afterward (even on error), keeping the map/unmap pair balanced. unsafe { ck( cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()), "cuGraphicsMapResources", )?; let mut array: CUarray = std::ptr::null_mut(); if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 { let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); bail!("cuGraphicsSubResourceGetMappedArray failed"); } let copy = CUDA_MEMCPY2D { srcMemoryType: CU_MEMORYTYPE_ARRAY, srcArray: array, dstMemoryType: CU_MEMORYTYPE_DEVICE, dstDevice: dst.ptr, dstPitch: dst.pitch, WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx) Height: dst.height as usize, ..Default::default() }; let res = copy_blocking(©, "cuMemcpy2DAsync_v2"); let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); res } } /// Map this texture for the frame and copy its array into the device plane `(dst_ptr, /// dst_pitch)`, taking `width_bytes`×`height` bytes (the GL internal format dictates /// `width_bytes`: `width*1` for an `R8` luma target, `(width/2)*2` for an `RG8` chroma 
target). /// Synchronized on our priority stream before unmap (so the source dmabuf is safe to recycle). /// Always unmaps, even on copy error. fn copy_mapped_plane( &mut self, dst_ptr: CUdeviceptr, dst_pitch: usize, width_bytes: usize, height: usize, ) -> Result<()> { // SAFETY: identical contract to `copy_mapped_to` — `self.resource` is the valid // `CUgraphicsResource` from `register_gl` (wrappers → live table; caller holds GL+CUDA // contexts current). Map `count == 1` resource via the live `&mut self.resource`; the // mapped `CUarray` is written into the live local `array` (index 0, mip 0); on failure we // unmap and bail (balanced). `©` is a live local outliving the synchronous // `copy_blocking`: `srcArray` valid while mapped, `dstDevice`/`dstPitch` are the caller's // live plane, `width_bytes`×`height` fit it. We always unmap afterward, even on copy error, // so the map/unmap pair stays balanced and the array outlives the copy. unsafe { ck( cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()), "cuGraphicsMapResources", )?; let mut array: CUarray = std::ptr::null_mut(); if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 { let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); bail!("cuGraphicsSubResourceGetMappedArray failed"); } let copy = CUDA_MEMCPY2D { srcMemoryType: CU_MEMORYTYPE_ARRAY, srcArray: array, dstMemoryType: CU_MEMORYTYPE_DEVICE, dstDevice: dst_ptr, dstPitch: dst_pitch, WidthInBytes: width_bytes, Height: height, ..Default::default() }; let res = copy_blocking(©, "cuMemcpy2DAsync_v2(plane)"); let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); res } } } /// Copy the two NV12 convert targets (registered `R8` luma + `RG8` chroma GL textures) into `dst`'s /// Y and UV planes. `dst` must be an NV12 buffer (`dst.uv` set). The luma plane is `width`×`height` /// bytes; the chroma plane is `(width/2)·2` bytes wide × `height/2` rows. 
Both copies sync on our /// priority stream before returning, so the dmabuf is safe to recycle once this returns. pub fn copy_mapped_nv12( y_tex: &mut RegisteredTexture, uv_tex: &mut RegisteredTexture, dst: &DeviceBuffer, ) -> Result<()> { let (uv_ptr, uv_pitch) = dst .uv .ok_or_else(|| anyhow::anyhow!("copy_mapped_nv12 on a non-NV12 buffer"))?; let w = dst.width as usize; let h = dst.height as usize; y_tex.copy_mapped_plane(dst.ptr, dst.pitch, w, h)?; uv_tex.copy_mapped_plane(uv_ptr, uv_pitch, (w / 2) * 2, h / 2) } /// Copy a pitched device buffer into another device region (device→device), e.g. our imported /// [`DeviceBuffer`] into a pooled CUDA surface NVENC owns. Both are 4-byte (BGRx) pixels. /// The caller must have the shared context current on this thread (see [`make_current`]). pub fn copy_device_to_device( src: &DeviceBuffer, dst_ptr: CUdeviceptr, dst_pitch: usize, ) -> Result<()> { let copy = CUDA_MEMCPY2D { srcMemoryType: CU_MEMORYTYPE_DEVICE, srcDevice: src.ptr, srcPitch: src.pitch, dstMemoryType: CU_MEMORYTYPE_DEVICE, dstDevice: dst_ptr, dstPitch: dst_pitch, WidthInBytes: src.width as usize * 4, Height: src.height as usize, ..Default::default() }; // SAFETY: `copy_blocking` is unsafe (issues a CUDA copy); the caller must have the shared // context current (documented). `©` is a live local device→device `CUDA_MEMCPY2D` outliving // the synchronous call: `srcDevice`/`srcPitch` are `src`'s live allocation, `dstDevice`/ // `dstPitch` the caller's live region, `width*4`×`height` within both. Wrapper → live table. unsafe { copy_blocking(©, "cuMemcpy2DAsync_v2(dev->dev)") } } /// Copy our imported NV12 [`DeviceBuffer`] (Y + UV planes) into NVENC's two-plane CUDA surface /// `(y_dst, y_pitch)` / `(uv_dst, uv_pitch)` (`av_hwframe_get_buffer`'s `data[0]`/`data[1]` + /// `linesize[0]`/`linesize[1]`). The Y plane is `width`×`height` bytes; the chroma plane is /// `(width/2)·2` bytes × `height/2` rows. The caller must have the shared context current. 
pub fn copy_nv12_to_device(
    src: &DeviceBuffer,
    y_dst: CUdeviceptr,
    y_pitch: usize,
    uv_dst: CUdeviceptr,
    uv_pitch: usize,
) -> Result<()> {
    let (src_uv_ptr, src_uv_pitch) = src
        .uv
        .ok_or_else(|| anyhow::anyhow!("copy_nv12_to_device on a non-NV12 buffer"))?;
    let w = src.width as usize;
    let h = src.height as usize;
    let y = CUDA_MEMCPY2D {
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src.ptr,
        srcPitch: src.pitch,
        dstMemoryType: CU_MEMORYTYPE_DEVICE,
        dstDevice: y_dst,
        dstPitch: y_pitch,
        WidthInBytes: w, // 1 byte/px luma
        Height: h,
        ..Default::default()
    };
    let uv = CUDA_MEMCPY2D {
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src_uv_ptr,
        srcPitch: src_uv_pitch,
        dstMemoryType: CU_MEMORYTYPE_DEVICE,
        dstDevice: uv_dst,
        dstPitch: uv_pitch,
        WidthInBytes: (w / 2) * 2, // 2 bytes/sample interleaved U,V
        Height: h / 2,
        ..Default::default()
    };
    // SAFETY: two unsafe `copy_blocking` device→device copies; the caller must have the shared
    // context current (documented). `&y`/`&uv` are live local `CUDA_MEMCPY2D`s outliving each
    // synchronous call. All four device pointers are valid: `src.ptr`/`src_uv_ptr` come from a live
    // NV12 `DeviceBuffer` (its `.uv` presence was checked via `ok_or_else`), `y_dst`/`uv_dst` are
    // the caller's live NVENC surface planes; the luma copy is `w`×`h`, the chroma copy
    // `(w/2)*2`×`h/2`, each within its planes. Wrappers → live table.
    unsafe {
        copy_blocking(&y, "cuMemcpy2DAsync_v2(nv12 Y dev->dev)")?;
        copy_blocking(&uv, "cuMemcpy2DAsync_v2(nv12 UV dev->dev)")
    }
}

impl Drop for RegisteredTexture {
    fn drop(&mut self) {
        if !self.resource.is_null() {
            // SAFETY: `self.resource` is non-null (just checked) and is the valid
            // `CUgraphicsResource` from `register_gl`, owned exclusively by this `RegisteredTexture`
            // and unregistered exactly once here (drop runs once) — no use-after-free or
            // double-unregister. `cuGraphicsUnregisterResource` releases the GL↔CUDA registration;
            // wrapper → live table (the resource exists ⇒ the driver was present). Result ignored
            // (best-effort teardown).
            unsafe {
                let _ = cuGraphicsUnregisterResource(self.resource);
            }
        }
    }
}

/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
pub struct ExternalDmabuf {
    ext: CUexternalMemory,
    pub ptr: CUdeviceptr,
    pub size: u64,
}

// SAFETY: the fields are opaque CUDA driver handles — an external-memory handle and a device
// pointer — not dereferenceable Rust memory, and the value is uniquely owned (no `Clone`). It is
// used from a single capture thread but constructed on / moved between threads with the importer;
// transferring these handles is sound because uniqueness rules out aliasing and they are destroyed
// exactly once in `Drop`. Only `Send` (not `Sync`) is asserted, matching the single-thread use.
unsafe impl Send for ExternalDmabuf {}

impl ExternalDmabuf {
    /// Import `fd` (NOT consumed — an internal `dup` is handed to the driver, which owns it
    /// from then on) and map its full `size` bytes to a device pointer. The shared context
    /// must be current.
    ///
    /// # Errors
    /// Fails if `dup` fails, or with whatever `import_owned_fd` reports for the duplicate.
    pub fn import(fd: i32, size: u64) -> Result<Self> {
        // SAFETY: `libc::dup` only reads the integer `fd` and returns a new descriptor (or -1); it
        // touches no Rust memory and `fd` is the caller's still-owned dmabuf fd (not consumed
        // here). No aliasing or lifetime concern — a pure syscall on an integer.
        let dup = unsafe { libc::dup(fd) };
        if dup < 0 {
            bail!("dup(dmabuf fd) failed");
        }
        Self::import_owned_fd(dup, size)
    }

    /// Import an fd the caller hands over (e.g. a Vulkan-exported `OPAQUE_FD`) — consumed by
    /// the driver on success, closed by us on failure.
pub fn import_owned_fd(dup: i32, size: u64) -> Result { let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC { type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, size, ..Default::default() }; desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes) let mut ext: CUexternalMemory = std::ptr::null_mut(); // SAFETY: `cuImportExternalMemory` imports the memory described by `&desc`, a live local // `#[repr(C)] CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h 64-bit layout) that outlives this // synchronous call: `type_` is OPAQUE_FD, `handle[0]` holds the dup'd fd in the union's // `int fd` low bytes, `size` is set. `&mut ext` is a live null-init out-param the driver // writes the imported handle into. The driver takes ownership of the fd only on success. // Distinct locals → no aliasing. Wrapper → live table (caller holds the context current). let r = unsafe { cuImportExternalMemory(&mut ext, &desc) }; if r != 0 { // SAFETY: import failed (`r != 0`), so the driver did NOT take ownership of `dup`; we // still own it and close it exactly once here on the error path (the success path never // closes it — the driver does). `libc::close` acts on the integer fd alone. unsafe { libc::close(dup) }; // import failed → the driver did not take the fd bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?"); } let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC { offset: 0, size, ..Default::default() }; let mut ptr: CUdeviceptr = 0; // SAFETY: maps a device pointer from `ext` (the valid `CUexternalMemory` just imported) per // `&buf`, a live local `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (offset 0, full `size`) that // outlives this synchronous call. `&mut ptr` is a live zero-init out-param the driver writes // the mapped device address into; distinct locals → no aliasing. Wrapper → live table // (context current). 
let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) }; if r != 0 { // SAFETY: mapping failed; `ext` is the valid `CUexternalMemory` we imported and // exclusively own. We destroy it exactly once here on the error path (the success path // instead moves it into the returned `ExternalDmabuf`, whose `Drop` destroys it), // releasing the fd the driver took — no double-destroy or use-after-free. unsafe { let _ = cuDestroyExternalMemory(ext); } bail!("cuExternalMemoryGetMappedBuffer failed ({r})"); } Ok(ExternalDmabuf { ext, ptr, size }) } } impl Drop for ExternalDmabuf { fn drop(&mut self) { // SAFETY: this `ExternalDmabuf` only exists after a successful import, so the driver table // is live. It exclusively owns `self.ptr` (the mapped buffer) and `self.ext` (the external // memory), each torn down exactly once here (drop runs once; guarded by `!= 0` / `!null`) — // no double-free or use-after-free. We make the shared context current first because drop // may run off the import thread, and we free the mapped buffer before destroying its // backing external memory. Results ignored (best-effort teardown). unsafe { if let Some(c) = CONTEXT.get() { let _ = cuCtxSetCurrent(c.0); } if self.ptr != 0 { let _ = cuMemFree_v2(self.ptr); // mapped buffers are freed like device memory } if !self.ext.is_null() { let _ = cuDestroyExternalMemory(self.ext); } } } } /// Copy a pitched span starting at `src_ptr` (e.g. an [`ExternalDmabuf`] mapping at the chunk /// offset) into `dst`. The shared context must be current on this thread. 
pub fn copy_pitched_to_buffer( src_ptr: CUdeviceptr, src_pitch: usize, dst: &DeviceBuffer, ) -> Result<()> { let copy = CUDA_MEMCPY2D { srcMemoryType: CU_MEMORYTYPE_DEVICE, srcDevice: src_ptr, srcPitch: src_pitch, dstMemoryType: CU_MEMORYTYPE_DEVICE, dstDevice: dst.ptr, dstPitch: dst.pitch, WidthInBytes: dst.width as usize * 4, Height: dst.height as usize, ..Default::default() }; // copy_blocking syncs our priority stream before returning, so the copy is complete before the // dmabuf is requeued to the producer. // SAFETY: `copy_blocking` is unsafe (issues a CUDA copy); the caller must have the shared // context current (documented). `©` is a live local device→device `CUDA_MEMCPY2D` outliving // the synchronous call: `srcDevice`/`srcPitch` are the caller's live mapped span (e.g. an // `ExternalDmabuf`), `dstDevice`/`dstPitch` are `dst`'s live allocation, `width*4`×`height` // within both. Wrapper → live table. unsafe { copy_blocking(©, "cuMemcpy2DAsync_v2(ext->dev)") } }