punktfunk/crates/punktfunk-host/src/linux/zerocopy/cuda.rs

//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the GL-interop
//! driver calls we need (`cuGraphicsGLRegisterImage` & co.), so we hand-roll exactly those and
//! `dlopen` `libcuda.so.1` at runtime (the driver library — NOT `libcudart`; NOT a link-time
//! `#[link]`, so one binary runs on NVIDIA and on AMD/Intel where `libcuda` is absent — see
//! [`CudaApi`]). Symbol names verified against
//! `cust_raw` + `cudaGL.h`: the context/mem ops use the `_v2` ABI suffix; the graphics-interop
//! ops are unsuffixed. (We use GL interop, not EGL interop: `cuGraphicsEGLRegisterImage` is
//! Tegra-only on the desktop driver — see [`super::egl`].)
//!
//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.

#![allow(non_camel_case_types, non_snake_case)]
// Every `unsafe` block/impl below carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]

use anyhow::{bail, Result};
use std::os::raw::{c_int, c_uint, c_void};
use std::sync::{Arc, Mutex, OnceLock};

/// Status code returned by every driver entry point in [`CudaApi`]; `0` means success.
pub type CUresult = c_uint; // CUDA_SUCCESS == 0
/// Device handle written by `cuDeviceGet` and consumed by `cuCtxCreate_v2`.
pub type CUdevice = c_int;
/// Opaque driver context handle (never dereferenced on the Rust side).
pub type CUcontext = *mut c_void; // opaque CUctx_st*
/// Opaque driver stream handle; a null value selects the default (NULL) stream.
pub type CUstream = *mut c_void; // opaque CUstream_st*
/// Device memory address, carried as a 64-bit integer rather than a Rust pointer.
pub type CUdeviceptr = u64;
/// Opaque handle written by `cuGraphicsGLRegisterImage`.
pub type CUgraphicsResource = *mut c_void;
/// Opaque array handle written by `cuGraphicsSubResourceGetMappedArray`.
pub type CUarray = *mut c_void;
/// Opaque handle written by `cuImportExternalMemory`.
pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*

/// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
/// This is the DEVICE member: the copy endpoint is a `CUdeviceptr`.
pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
/// `CUmemorytype` ARRAY member: the copy endpoint is a `CUarray`.
pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;

/// `CUctx_flags` (cuda.h): block the CPU on an OS primitive while waiting for the GPU instead of
/// busy-spinning. On this shared box (compositor + send thread on the same cores) spinning a core
/// to detect copy completion steals CPU from the very threads we want scheduled; BLOCKING_SYNC
/// frees it. Default (`CU_CTX_SCHED_AUTO=0`) heuristically picks SPIN vs YIELD by core count.
const CU_CTX_SCHED_BLOCKING_SYNC: c_uint = 0x04;

/// `cuStreamCreateWithPriority` flag: don't implicitly synchronize with the legacy NULL stream.
const CU_STREAM_NON_BLOCKING: c_uint = 0x01;

/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
///
/// Describes one 2D copy: a source half (`src*`), a destination half (`dst*`) and the extent
/// (`WidthInBytes` × `Height`). Callers in this file fill in only the fields relevant to the
/// chosen memory types and take the rest from `Default` (zeros / null pointers).
#[repr(C)]
#[derive(Default)]
pub struct CUDA_MEMCPY2D {
    // --- source ---
    pub srcXInBytes: usize,
    pub srcY: usize,
    /// A `CUmemorytype` value (e.g. `CU_MEMORYTYPE_DEVICE`).
    pub srcMemoryType: c_uint,
    pub srcHost: *const c_void,
    pub srcDevice: CUdeviceptr,
    pub srcArray: CUarray,
    /// Source row stride in bytes.
    pub srcPitch: usize,
    // --- destination ---
    pub dstXInBytes: usize,
    pub dstY: usize,
    /// A `CUmemorytype` value (e.g. `CU_MEMORYTYPE_DEVICE`).
    pub dstMemoryType: c_uint,
    pub dstHost: *mut c_void,
    pub dstDevice: CUdeviceptr,
    pub dstArray: CUarray,
    /// Destination row stride in bytes.
    pub dstPitch: usize,
    // --- extent ---
    /// Bytes copied per row.
    pub WidthInBytes: usize,
    /// Number of rows copied.
    pub Height: usize,
}

/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose
/// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type
/// only the first 4 bytes (the `int fd`) are read.
///
/// The explicit `_pad`/`_pad2` fields spell out the padding a C compiler inserts on a 64-bit
/// target, so the Rust-side size adds up to 4+4+16+8+4+64+4 = 104 bytes.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
    pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1
    // Aligns the 8-byte-aligned `handle` union after the 4-byte `type_`.
    _pad: u32,
    pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject }
    /// Size of the imported memory object in bytes.
    pub size: u64,
    pub flags: c_uint,
    // Reserved by the driver API; left zeroed via `Default`.
    reserved: [c_uint; 16],
    // Tail padding up to the struct's 8-byte alignment.
    _pad2: u32,
}

/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout).
///
/// Passed to `cuExternalMemoryGetMappedBuffer` to select a byte range of an imported external
/// memory object. Rust-side size: 8+8+4+64+4 = 88 bytes.
#[repr(C)]
#[derive(Default)]
pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
    /// Byte offset of the range inside the external memory object.
    pub offset: u64,
    /// Length of the range in bytes.
    pub size: u64,
    pub flags: c_uint,
    // Reserved by the driver API; left zeroed via `Default`.
    reserved: [c_uint; 16],
    // Tail padding up to the struct's 8-byte alignment.
    _pad: u32,
}

/// Value for `CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type_` when the handle is a POSIX file descriptor.
pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1;

/// CUDA Driver API entry points, resolved at runtime from `libcuda.so.1` via `dlopen` rather than
/// a link-time `#[link(name = "cuda")]`. This is what lets ONE host binary run on NVIDIA
/// (zero-copy via CUDA → NVENC) *and* on AMD/Intel (VAAPI, where the NVIDIA driver — and thus
/// `libcuda` — is absent): with a hard link the loader would refuse to start the binary at all.
/// Every `cu*` call below goes through a same-named wrapper fn that forwards to this table; when
/// the driver isn't present the table is `None` and the wrappers return a non-zero `CUresult`, so
/// `context()` fails cleanly and the capturer falls back to the CPU path. The `cuda_api()` loader
/// is memoised; the library handle is intentionally leaked (process-lifetime, like the context).
///
/// Each field is named after the exact symbol it was resolved from, which is why the struct
/// needs the module-level `non_snake_case` allowance.
struct CudaApi {
    // --- initialisation / context ---
    cuInit: unsafe extern "C" fn(c_uint) -> CUresult,
    cuDeviceGet: unsafe extern "C" fn(*mut CUdevice, c_int) -> CUresult,
    cuCtxCreate_v2: unsafe extern "C" fn(*mut CUcontext, c_uint, CUdevice) -> CUresult,
    cuCtxSetCurrent: unsafe extern "C" fn(CUcontext) -> CUresult,
    // --- device memory and copies ---
    cuMemAllocPitch_v2:
        unsafe extern "C" fn(*mut CUdeviceptr, *mut usize, usize, usize, c_uint) -> CUresult,
    cuMemFree_v2: unsafe extern "C" fn(CUdeviceptr) -> CUresult,
    cuMemcpy2DAsync_v2: unsafe extern "C" fn(*const CUDA_MEMCPY2D, CUstream) -> CUresult,
    // --- streams ---
    cuStreamSynchronize: unsafe extern "C" fn(CUstream) -> CUresult,
    cuCtxGetStreamPriorityRange: unsafe extern "C" fn(*mut c_int, *mut c_int) -> CUresult,
    cuStreamCreateWithPriority: unsafe extern "C" fn(*mut CUstream, c_uint, c_int) -> CUresult,
    // --- GL graphics interop (unsuffixed symbols) ---
    cuGraphicsGLRegisterImage:
        unsafe extern "C" fn(*mut CUgraphicsResource, c_uint, c_uint, c_uint) -> CUresult,
    // The trailing `*mut c_void` of map/unmap is the stream; `CUstream` is the same alias.
    cuGraphicsMapResources:
        unsafe extern "C" fn(c_uint, *mut CUgraphicsResource, *mut c_void) -> CUresult,
    cuGraphicsUnmapResources:
        unsafe extern "C" fn(c_uint, *mut CUgraphicsResource, *mut c_void) -> CUresult,
    cuGraphicsSubResourceGetMappedArray:
        unsafe extern "C" fn(*mut CUarray, CUgraphicsResource, c_uint, c_uint) -> CUresult,
    cuGraphicsUnregisterResource: unsafe extern "C" fn(CUgraphicsResource) -> CUresult,
    // --- external memory import ---
    cuImportExternalMemory: unsafe extern "C" fn(
        *mut CUexternalMemory,
        *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
    ) -> CUresult,
    cuExternalMemoryGetMappedBuffer: unsafe extern "C" fn(
        *mut CUdeviceptr,
        CUexternalMemory,
        *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
    ) -> CUresult,
    cuDestroyExternalMemory: unsafe extern "C" fn(CUexternalMemory) -> CUresult,
}
// SAFETY: every field is a bare `extern "C" fn` address into the leaked, process-lifetime
// `libcuda` mapping (`cuda_api` `forget`s the `Library`, so it is never unloaded) — an immutable
// value with no interior mutability and no thread affinity. Moving the table to another thread
// cannot dangle (the code it points at stays mapped) or race (the fields are read-only).
unsafe impl Send for CudaApi {}
// SAFETY: as above — the table is a set of immutable fn-pointer addresses with no interior
// mutability, so concurrent shared reads from multiple threads cannot race; the driver entry
// points they address are themselves thread-safe.
unsafe impl Sync for CudaApi {}

/// `CUresult` returned by the wrappers when `libcuda` isn't loaded (no NVIDIA driver). Non-zero so
/// the existing `ck()`/`!= 0` checks treat it as an ordinary driver error; distinct from any real
/// `CUDA_ERROR_*` (all < 1000). Never produced by the actual driver.
const CU_ERROR_NOT_LOADED: CUresult = 999;

static CUDA_API: OnceLock<Option<CudaApi>> = OnceLock::new();

/// Resolve `libcuda.so.1` and its symbols once. `None` when the NVIDIA driver isn't installed
/// (the expected case on AMD/Intel hosts) — logged at debug, not an error.
fn cuda_api() -> Option<&'static CudaApi> {
    CUDA_API
        // SAFETY: `Library::new` runs `libcuda.so.1`'s initializers — it is the trusted NVIDIA
        // driver library, so loading has no unexpected effects; `?`/`None` handle its absence.
        // Each `lib.get::<T>(name)` asserts the symbol's real ABI equals `T`: every NUL-terminated
        // name is a documented CUDA Driver API entry point and `T` is the exact
        // `unsafe extern "C" fn(..)` signature from cuda.h/cudaGL.h (`_v2` for ctx/mem ops). Each
        // `Symbol` only borrows `lib` until the end of the struct-literal statement; we deref-copy
        // the raw fn-pointer out first, then `forget(lib)` leaks the mapping so those addresses
        // stay valid for the whole process. Runs once under the `OnceLock` init — no aliasing.
        .get_or_init(|| unsafe {
            let lib = libloading::Library::new("libcuda.so.1")
                .or_else(|_| libloading::Library::new("libcuda.so"))
                .map_err(|e| {
                    tracing::debug!(error = %e, "libcuda not loadable — CUDA zero-copy unavailable (expected on AMD/Intel)");
                })
                .ok()?;
            // Resolve all symbols; the field types drive `get`'s inference. `lib` is leaked after
            // construction so the fn pointers stay valid for the process lifetime (the temporary
            // `Symbol` borrows end with the struct-literal statement, before the forget).
            let api = CudaApi {
                cuInit: *lib.get(b"cuInit\0").ok()?,
                cuDeviceGet: *lib.get(b"cuDeviceGet\0").ok()?,
                cuCtxCreate_v2: *lib.get(b"cuCtxCreate_v2\0").ok()?,
                cuCtxSetCurrent: *lib.get(b"cuCtxSetCurrent\0").ok()?,
                cuMemAllocPitch_v2: *lib.get(b"cuMemAllocPitch_v2\0").ok()?,
                cuMemFree_v2: *lib.get(b"cuMemFree_v2\0").ok()?,
                cuMemcpy2DAsync_v2: *lib.get(b"cuMemcpy2DAsync_v2\0").ok()?,
                cuStreamSynchronize: *lib.get(b"cuStreamSynchronize\0").ok()?,
                cuCtxGetStreamPriorityRange: *lib.get(b"cuCtxGetStreamPriorityRange\0").ok()?,
                cuStreamCreateWithPriority: *lib.get(b"cuStreamCreateWithPriority\0").ok()?,
                cuGraphicsGLRegisterImage: *lib.get(b"cuGraphicsGLRegisterImage\0").ok()?,
                cuGraphicsMapResources: *lib.get(b"cuGraphicsMapResources\0").ok()?,
                cuGraphicsUnmapResources: *lib.get(b"cuGraphicsUnmapResources\0").ok()?,
                cuGraphicsSubResourceGetMappedArray: *lib
                    .get(b"cuGraphicsSubResourceGetMappedArray\0")
                    .ok()?,
                cuGraphicsUnregisterResource: *lib.get(b"cuGraphicsUnregisterResource\0").ok()?,
                cuImportExternalMemory: *lib.get(b"cuImportExternalMemory\0").ok()?,
                cuExternalMemoryGetMappedBuffer: *lib
                    .get(b"cuExternalMemoryGetMappedBuffer\0")
                    .ok()?,
                cuDestroyExternalMemory: *lib.get(b"cuDestroyExternalMemory\0").ok()?,
            };
            std::mem::forget(lib); // keep libcuda mapped for the fn pointers' lifetime (process)
            Some(api)
        })
        .as_ref()
}

// Same-named wrappers so the call sites below are unchanged. Each forwards through the dlopen'd
// table, or returns `CU_ERROR_NOT_LOADED` when the driver is absent (AMD/Intel) — which the
// `CUresult` checks already handle. Only `context()` is reachable before the driver is confirmed
// present; every other entry runs after `context()` succeeded, so its wrapper always hits `Some`.
/// Forwards to the driver's `cuInit` through the dlopen'd table; yields `CU_ERROR_NOT_LOADED`
/// when `libcuda` is absent.
///
/// # Safety
/// Same contract as the driver entry point being forwarded to.
unsafe fn cuInit(flags: c_uint) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuInit)(flags)
}
/// Forwards to the driver's `cuDeviceGet`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is absent.
///
/// # Safety
/// `device` must be valid for the driver to write a `CUdevice` through.
unsafe fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuDeviceGet)(device, ordinal)
}
/// Forwards to the driver's `cuCtxCreate_v2`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is
/// absent.
///
/// # Safety
/// `pctx` must be valid for the driver to write a `CUcontext` through.
unsafe fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuCtxCreate_v2)(pctx, flags, dev)
}
/// Forwards to the driver's `cuCtxSetCurrent`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is
/// absent.
///
/// # Safety
/// `ctx` must be a context handle the driver accepts (e.g. one it created).
unsafe fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuCtxSetCurrent)(ctx)
}
/// Forwards to the driver's `cuMemAllocPitch_v2`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is
/// absent.
///
/// # Safety
/// `dptr` and `pitch` must be valid for the driver to write the allocation and its pitch through.
unsafe fn cuMemAllocPitch_v2(
    dptr: *mut CUdeviceptr,
    pitch: *mut usize,
    width_bytes: usize,
    height: usize,
    element_size: c_uint,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuMemAllocPitch_v2)(dptr, pitch, width_bytes, height, element_size)
}
/// Forwards to the driver's `cuMemFree_v2`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is absent.
///
/// # Safety
/// `dptr` must be a device allocation that is not used again after this call.
unsafe fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuMemFree_v2)(dptr)
}
/// Forwards to the driver's `cuMemcpy2DAsync_v2`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is
/// absent.
///
/// # Safety
/// `copy` must point to a valid descriptor whose source and destination stay valid until the
/// copy has completed on `stream`.
unsafe fn cuMemcpy2DAsync_v2(copy: *const CUDA_MEMCPY2D, stream: CUstream) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuMemcpy2DAsync_v2)(copy, stream)
}
/// Forwards to the driver's `cuStreamSynchronize`; yields `CU_ERROR_NOT_LOADED` when `libcuda` is
/// absent.
///
/// # Safety
/// `stream` must be null (default stream) or a stream handle the driver accepts.
unsafe fn cuStreamSynchronize(stream: CUstream) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuStreamSynchronize)(stream)
}
/// Forwards to the driver's `cuCtxGetStreamPriorityRange`; yields `CU_ERROR_NOT_LOADED` when
/// `libcuda` is absent.
///
/// # Safety
/// `least` and `greatest` must be valid for the driver to write an `int` through.
unsafe fn cuCtxGetStreamPriorityRange(least: *mut c_int, greatest: *mut c_int) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuCtxGetStreamPriorityRange)(least, greatest)
}
/// Forwards to the driver's `cuStreamCreateWithPriority`; yields `CU_ERROR_NOT_LOADED` when
/// `libcuda` is absent.
///
/// # Safety
/// `stream` must be valid for the driver to write a `CUstream` through.
unsafe fn cuStreamCreateWithPriority(
    stream: *mut CUstream,
    flags: c_uint,
    priority: c_int,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuStreamCreateWithPriority)(stream, flags, priority)
}
/// Forwards to the driver's `cuGraphicsGLRegisterImage`; yields `CU_ERROR_NOT_LOADED` when
/// `libcuda` is absent.
///
/// # Safety
/// `resource` must be valid for the driver to write a `CUgraphicsResource` through; the remaining
/// arguments must satisfy the driver entry point's own contract.
unsafe fn cuGraphicsGLRegisterImage(
    resource: *mut CUgraphicsResource,
    texture: c_uint,
    target: c_uint,
    flags: c_uint,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsGLRegisterImage)(resource, texture, target, flags)
}
/// Forwards to the driver's `cuGraphicsMapResources`; yields `CU_ERROR_NOT_LOADED` when `libcuda`
/// is absent.
///
/// # Safety
/// `resources` must point to `count` readable resource handles.
unsafe fn cuGraphicsMapResources(
    count: c_uint,
    resources: *mut CUgraphicsResource,
    stream: *mut c_void,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsMapResources)(count, resources, stream)
}
/// Forwards to the driver's `cuGraphicsUnmapResources`; yields `CU_ERROR_NOT_LOADED` when
/// `libcuda` is absent.
///
/// # Safety
/// `resources` must point to `count` readable resource handles.
unsafe fn cuGraphicsUnmapResources(
    count: c_uint,
    resources: *mut CUgraphicsResource,
    stream: *mut c_void,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsUnmapResources)(count, resources, stream)
}
/// Forwards to the driver's `cuGraphicsSubResourceGetMappedArray`; yields `CU_ERROR_NOT_LOADED`
/// when `libcuda` is absent.
///
/// # Safety
/// `array` must be valid for the driver to write a `CUarray` through; `resource` must be a handle
/// the driver accepts.
unsafe fn cuGraphicsSubResourceGetMappedArray(
    array: *mut CUarray,
    resource: CUgraphicsResource,
    array_index: c_uint,
    mip_level: c_uint,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsSubResourceGetMappedArray)(array, resource, array_index, mip_level)
}
/// Forwards to the driver's `cuGraphicsUnregisterResource`; yields `CU_ERROR_NOT_LOADED` when
/// `libcuda` is absent.
///
/// # Safety
/// `resource` must be a registered resource handle that is not used again after this call.
unsafe fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuGraphicsUnregisterResource)(resource)
}
/// Forwards to the driver's `cuImportExternalMemory`; yields `CU_ERROR_NOT_LOADED` when `libcuda`
/// is absent.
///
/// # Safety
/// `ext_mem_out` must be valid for the driver to write a handle through and `mem_handle_desc`
/// must point to a valid, readable descriptor.
unsafe fn cuImportExternalMemory(
    ext_mem_out: *mut CUexternalMemory,
    mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuImportExternalMemory)(ext_mem_out, mem_handle_desc)
}
/// Forwards to the driver's `cuExternalMemoryGetMappedBuffer`; yields `CU_ERROR_NOT_LOADED` when
/// `libcuda` is absent.
///
/// # Safety
/// `dev_ptr` must be valid for the driver to write a `CUdeviceptr` through, `ext_mem` must be an
/// imported handle, and `buffer_desc` must point to a valid, readable descriptor.
unsafe fn cuExternalMemoryGetMappedBuffer(
    dev_ptr: *mut CUdeviceptr,
    ext_mem: CUexternalMemory,
    buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuExternalMemoryGetMappedBuffer)(dev_ptr, ext_mem, buffer_desc)
}
/// Forwards to the driver's `cuDestroyExternalMemory`; yields `CU_ERROR_NOT_LOADED` when `libcuda`
/// is absent.
///
/// # Safety
/// `ext_mem` must be an imported handle that is not used again after this call.
unsafe fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult {
    let Some(api) = cuda_api() else {
        return CU_ERROR_NOT_LOADED;
    };
    (api.cuDestroyExternalMemory)(ext_mem)
}

#[inline]
fn ck(r: CUresult, what: &str) -> Result<()> {
    if r == 0 {
        Ok(())
    } else {
        bail!("CUDA driver error {r} in {what}")
    }
}

/// Copy a pitched device plane `(src_ptr, src_pitch)` down to a tightly-packed host buffer of
/// `width_bytes`×`height` (no row padding). Synchronous on the priority stream. Used by the NV12
/// self-test to read planes back for the colour comparison; not on the hot path.
pub fn read_plane_to_host(
    src_ptr: CUdeviceptr,
    src_pitch: usize,
    width_bytes: usize,
    height: usize,
) -> Result<Vec<u8>> {
    let mut host = vec![0u8; width_bytes * height];
    let copy = CUDA_MEMCPY2D {
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src_ptr,
        srcPitch: src_pitch,
        dstMemoryType: 1, // CU_MEMORYTYPE_HOST
        dstHost: host.as_mut_ptr() as *mut c_void,
        dstPitch: width_bytes,
        WidthInBytes: width_bytes,
        Height: height,
        ..Default::default()
    };
    // SAFETY: `copy_blocking` is unsafe because it issues a CUDA copy; its contract is a valid
    // descriptor with the shared context current (the caller's responsibility — self-test path).
    // `&copy` is a live local `#[repr(C)] CUDA_MEMCPY2D` that outlives the synchronous call:
    // `srcDevice`/`srcPitch` are the caller's live pitched device plane, `dstHost` addresses the
    // freshly-allocated `host` `Vec` of exactly `width_bytes*height` bytes, and `WidthInBytes`×
    // `Height` fit both. The copy is synchronous, so `host` is fully written before we return it.
    unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->host)")? };
    Ok(host)
}

/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
///
/// The wrapper is `Copy`: it carries only the opaque handle value, never ownership.
#[derive(Clone, Copy)]
pub struct Context(pub CUcontext);
// SAFETY: `CUcontext` is an opaque CUDA driver handle, not a dereferenceable Rust pointer. It is
// created once and never destroyed (process lifetime), and the only thing done with it is
// `cuCtxSetCurrent`, which the Driver API explicitly allows from any thread — so transferring the
// handle to another thread cannot dangle or race (the driver owns the synchronization).
unsafe impl Send for Context {}
// SAFETY: as above — the wrapped handle is an immutable opaque address and the driver does all the
// synchronization, so sharing `&Context` across threads is sound.
unsafe impl Sync for Context {}

/// The single shared context, set by the first successful `context()` call and never cleared.
static CONTEXT: OnceLock<Context> = OnceLock::new();

/// Return the process-wide CUDA context on device 0, creating it on first use.
///
/// # Errors
/// Fails when `libcuda` could not be loaded, or when `cuInit`, `cuDeviceGet` or `cuCtxCreate_v2`
/// reports a driver error.
pub fn context() -> Result<CUcontext> {
    match CONTEXT.get() {
        // Fast path: already created by an earlier call.
        Some(shared) => return Ok(shared.0),
        None if cuda_api().is_none() => {
            bail!("libcuda.so.1 not available — no NVIDIA driver (CUDA zero-copy disabled)");
        }
        None => {}
    }
    let mut device: CUdevice = 0;
    let mut created: CUcontext = std::ptr::null_mut();
    // SAFETY: the `match` above returned unless `cuda_api()` is `Some`, so every wrapper here
    // forwards into the live, leaked `libcuda` table rather than the not-loaded stub. `cuInit(0)`
    // passes the API-required flags value 0. `&mut device`/`&mut created` are live,
    // zero/null-initialized stack out-params the driver writes the device handle / new context
    // into; each outlives its synchronous call and they are distinct locals (no aliasing).
    // `cuCtxCreate_v2` yields a valid `CUcontext` on success (`ck` bails otherwise).
    unsafe {
        ck(cuInit(0), "cuInit")?;
        ck(cuDeviceGet(&mut device, 0), "cuDeviceGet")?;
        ck(
            cuCtxCreate_v2(&mut created, CU_CTX_SCHED_BLOCKING_SYNC, device),
            "cuCtxCreate_v2",
        )?;
    }
    // Two threads may both get here on first use. Whichever reaches `get_or_init` first publishes
    // its context; the other one's freshly created context is simply leaked (rare, and bounded to
    // the process lifetime). Every caller observes the same published value.
    let shared = CONTEXT.get_or_init(|| Context(created));
    Ok(shared.0)
}

/// Bind the shared context to the calling thread. Must be done on a thread before it issues any
/// of the CUDA operations in this module.
///
/// # Errors
/// Propagates a `context()` failure, or a driver error from `cuCtxSetCurrent`.
pub fn make_current() -> Result<()> {
    let shared = context()?;
    // SAFETY: `shared` came from `context()?`, so it is the live shared `CUcontext` and the driver
    // table is present. `cuCtxSetCurrent` binds that opaque handle to the calling thread; it takes
    // no Rust-memory pointer and is thread-safe (affects only this thread's current context), so
    // there is no aliasing or lifetime hazard.
    let status = unsafe { cuCtxSetCurrent(shared) };
    ck(status, "cuCtxSetCurrent")
}

thread_local! {
    /// Per-thread copy stream, filled in lazily by `copy_stream()`.
    ///
    /// `None` until first use; `Some(null)` means "creation failed, use the default (NULL)
    /// stream" — the failure is cached, so creation is attempted at most once per thread.
    /// Per-thread (not shared) so each worker's `cuStreamSynchronize` waits only on ITS OWN
    /// copies — the old per-frame `cuCtxSynchronize` was context-wide and also blocked on the
    /// other worker thread's in-flight NULL-stream copies.
    static COPY_STREAM: std::cell::Cell<Option<CUstream>> = const { std::cell::Cell::new(None) };
}

/// Return this thread's copy stream, creating it on first call (the shared context must already
/// be current on the thread).
///
/// The stream is created with the greatest priority the driver reports — numerically the lowest
/// value, i.e. `greatest` from `cuCtxGetStreamPriorityRange` — as a scheduler hint to run our
/// copies ahead of the game's queued compute. NOTE: stream priority is an intra-process hint and
/// NVIDIA's Linux driver may ignore it / not preempt a saturating game's graphics context; this is
/// "measure-then-keep", and it never regresses: if either driver call fails, a null handle (the
/// NULL stream) is cached and returned instead.
fn copy_stream() -> CUstream {
    COPY_STREAM.with(|slot| {
        if let Some(cached) = slot.get() {
            return cached;
        }
        let (mut lowest, mut highest) = (0i32, 0i32);
        let mut created: CUstream = std::ptr::null_mut();
        // SAFETY: `copy_stream` runs with the shared context current (its doc contract), so the
        // wrappers forward into the live `libcuda` table. `&mut lowest`/`&mut highest` are live
        // stack `i32`s the driver fills with the priority range; `&mut created` is a live
        // null-init `CUstream` the driver writes the new stream into. All out-params outlive
        // their synchronous calls and are distinct locals. The stream is only created when the
        // range query succeeded (`&&` short-circuits), and `created` is only used below when both
        // calls returned 0.
        let have_stream = unsafe {
            cuCtxGetStreamPriorityRange(&mut lowest, &mut highest) == 0
                && cuStreamCreateWithPriority(&mut created, CU_STREAM_NON_BLOCKING, highest) == 0
        };
        let stream = if have_stream {
            tracing::debug!(
                priority = highest,
                "CUDA high-priority copy stream created"
            );
            created
        } else {
            // Fall back to the default (NULL) stream.
            std::ptr::null_mut()
        };
        slot.set(Some(stream));
        stream
    })
}

/// Enqueue `copy` on the calling thread's priority stream, then wait for that stream to drain.
/// Stands in for the former per-frame `cuMemcpy2D_v2` + context-wide `cuCtxSynchronize` pair: the
/// completion guarantee is the same (once this returns the source dmabuf may be recycled), but
/// only our own stream is waited on and the copy carries the high-priority hint.
///
/// # Safety
/// The shared context must be current on this thread, and the source/destination memory that
/// `copy` describes must be valid for the extent it specifies until this function returns.
unsafe fn copy_blocking(copy: &CUDA_MEMCPY2D, what: &str) -> Result<()> {
    let queue = copy_stream();
    let issued = cuMemcpy2DAsync_v2(copy, queue);
    ck(issued, what)?;
    let drained = cuStreamSynchronize(queue);
    ck(drained, "cuStreamSynchronize")
}

/// Allocate a single pitched device buffer sized for `width`x`height` pixels of 4 bytes each.
/// Returns the device pointer together with the row pitch (in bytes) chosen by the driver.
///
/// # Errors
/// Fails when `cuMemAllocPitch_v2` reports a driver error.
fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
    let row_bytes = width as usize * 4;
    let rows = height as usize;
    let (mut base, mut stride): (CUdeviceptr, usize) = (0, 0);
    // SAFETY: `cuMemAllocPitch_v2` allocates a pitched device buffer (the wrapper forwards to the
    // live table on any path that reached allocation). `&mut base` (`CUdeviceptr`) and
    // `&mut stride` (`usize`) are live, distinct stack out-params the driver writes the
    // allocation pointer and its pitch into; both outlive the synchronous call. The row
    // width/row count/element-size are by-value ints. No aliasing — two separate locals.
    let status = unsafe { cuMemAllocPitch_v2(&mut base, &mut stride, row_bytes, rows, 16) };
    ck(status, "cuMemAllocPitch_v2")?;
    Ok((base, stride))
}

/// Allocate the two pitched planes of an NV12 surface (8-bit BT.709 4:2:0): a `width`-byte Y plane
/// (W×H, 1 byte/px) and an interleaved chroma plane (W/2 × H/2 samples, 2 bytes/sample → W bytes
/// wide). Both planes share the driver's Y pitch (the wider request), so the encoder's two-plane
/// surface and ours line up. Returns `((y_ptr, y_pitch), (uv_ptr, uv_pitch))`.
fn alloc_pitched_nv12(
    width: u32,
    height: u32,
) -> Result<((CUdeviceptr, usize), (CUdeviceptr, usize))> {
    let mut y_ptr: CUdeviceptr = 0;
    let mut y_pitch: usize = 0;
    let mut uv_ptr: CUdeviceptr = 0;
    let mut uv_pitch: usize = 0;
    // SAFETY: two independent `cuMemAllocPitch_v2` calls (wrapper → live table). `&mut y_ptr`/
    // `&mut y_pitch` and `&mut uv_ptr`/`&mut uv_pitch` are live, distinct stack out-params the
    // driver writes each plane's pointer and pitch into; all outlive their synchronous calls. The
    // dimension/element-size args are by-value ints. No aliasing — four separate locals.
    unsafe {
        ck(
            cuMemAllocPitch_v2(
                &mut y_ptr,
                &mut y_pitch,
                width as usize,
                height as usize,
                16,
            ),
            "cuMemAllocPitch_v2(Y)",
        )?;
        // Chroma is W/2 samples wide at 2 bytes each = W bytes; H/2 rows.
        ck(
            cuMemAllocPitch_v2(
                &mut uv_ptr,
                &mut uv_pitch,
                (width as usize / 2) * 2,
                (height as usize / 2).max(1),
                16,
            ),
            "cuMemAllocPitch_v2(UV)",
        )?;
    }
    Ok(((y_ptr, y_pitch), (uv_ptr, uv_pitch)))
}

/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the
/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and
/// returns its allocation here. Bulk-freed when the last reference drops. For NV12 each free entry
/// is the Y plane *and* its paired UV plane (allocated/recycled/freed together).
struct PoolInner {
    /// Currently unused allocations, used as a stack (pushed on recycle, popped on reuse). For an
    /// NV12 pool these are the Y planes.
    free: Vec<CUdeviceptr>,
    /// NV12 only: the UV plane paired with each Y plane in `free` (same index, same length).
    /// Stays empty for non-NV12 pools.
    free_uv: Vec<CUdeviceptr>,
}

impl Drop for PoolInner {
    /// Release every allocation still sitting on the free-lists (primary planes first, then the
    /// NV12 chroma planes). Best-effort: driver results are ignored.
    fn drop(&mut self) {
        let shared = CONTEXT.get();
        // SAFETY: the pool only exists because allocation succeeded, so the driver table is live.
        // `PoolInner` drops only once every `DeviceBuffer` that referenced it (each holds an `Arc`
        // clone) has been recycled, so `free`/`free_uv` hold every outstanding allocation exactly
        // once and nothing else still uses them — no double-free or use-after-free. We make the
        // shared context current first (drop may run off the allocating thread) so `cuMemFree_v2`
        // targets the right context. Each `plane` is a `CUdeviceptr` previously returned by
        // `cuMemAllocPitch_v2`; results are ignored (best-effort teardown).
        unsafe {
            if let Some(ctx) = shared {
                let _ = cuCtxSetCurrent(ctx.0);
            }
            for &plane in self.free.iter().chain(self.free_uv.iter()) {
                let _ = cuMemFree_v2(plane);
            }
        }
    }
}

/// A pool of reusable pitched device buffers for a fixed resolution. Eliminates the per-frame
/// `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K) that takes the device allocator lock
/// and serializes against the GPU every frame.
///
/// Cloning is cheap: clones share the same free-list through the `Arc`.
#[derive(Clone)]
pub struct BufferPool {
    /// Shared free-list; every [`DeviceBuffer`] handed out keeps a clone so it can recycle itself.
    inner: Arc<Mutex<PoolInner>>,
    /// Width in pixels every buffer from this pool is allocated for.
    width: u32,
    /// Height in pixels every buffer from this pool is allocated for.
    height: u32,
    /// Row pitch in bytes learned from the pool's first allocation and stamped on every buffer
    /// handed out (for NV12 pools: the Y-plane pitch).
    pitch: usize,
    /// NV12 pools carry a second (chroma) pitch; `Some` ⇒ buffers from this pool have a UV plane.
    uv_pitch: Option<usize>,
}

impl BufferPool {
    /// Build a pool of `width`x`height` 4-byte-per-pixel buffers. One buffer is allocated
    /// immediately: that both seeds the free-list and reveals the driver's pitch, which is
    /// constant for a given width.
    pub fn new(width: u32, height: u32) -> Result<BufferPool> {
        let (first, pitch) = alloc_pitched(width, height)?;
        let free_list = PoolInner {
            free: vec![first],
            free_uv: Vec::new(),
        };
        Ok(BufferPool {
            inner: Arc::new(Mutex::new(free_list)),
            width,
            height,
            pitch,
            uv_pitch: None,
        })
    }

    /// Build a pool of NV12 two-plane surfaces (Y + interleaved UV) for `width`x`height`. One
    /// Y/UV pair is allocated immediately: that both seeds the free-lists and reveals the
    /// driver's per-plane pitches (constant for a given width).
    pub fn new_nv12(width: u32, height: u32) -> Result<BufferPool> {
        let ((first_y, y_pitch), (first_uv, uv_pitch)) = alloc_pitched_nv12(width, height)?;
        let free_list = PoolInner {
            free: vec![first_y],
            free_uv: vec![first_uv],
        };
        Ok(BufferPool {
            inner: Arc::new(Mutex::new(free_list)),
            width,
            height,
            pitch: y_pitch,
            uv_pitch: Some(uv_pitch),
        })
    }

    /// Width in pixels of the buffers this pool hands out.
    pub fn width(&self) -> u32 {
        self.width
    }

    /// Height in pixels of the buffers this pool hands out.
    pub fn height(&self) -> u32 {
        self.height
    }

    /// Hand out a buffer: a recycled one when the free-list has any, otherwise a fresh
    /// allocation. Dropping the buffer returns it to this pool (the consumer has synchronized by
    /// then, so the GPU is done with it). Buffers from an NV12 pool carry both the Y plane and
    /// its paired UV plane.
    ///
    /// The pool lock is held only while popping from the free-list, never across an allocation.
    ///
    /// # Panics
    /// Panics if the pool mutex is poisoned.
    pub fn get(&self) -> Result<DeviceBuffer> {
        let (ptr, uv) = match self.uv_pitch {
            Some(uv_pitch) => {
                let recycled = {
                    let mut pool = self.inner.lock().unwrap();
                    pool.free.pop().map(|y| (y, pool.free_uv.pop()))
                };
                let (y_plane, uv_plane) = match recycled {
                    // Y and UV are pushed/popped together, so a popped Y always has its UV.
                    Some((y, Some(uv))) => (y, uv),
                    _ => {
                        let ((y, _), (uv, _)) = alloc_pitched_nv12(self.width, self.height)?;
                        (y, uv)
                    }
                };
                (y_plane, Some((uv_plane, uv_pitch)))
            }
            None => {
                // Bind the pop result first so the guard is released before any allocation.
                let recycled = self.inner.lock().unwrap().free.pop();
                let plane = match recycled {
                    Some(p) => p,
                    None => alloc_pitched(self.width, self.height)?.0,
                };
                (plane, None)
            }
        };
        Ok(DeviceBuffer {
            ptr,
            pitch: self.pitch,
            width: self.width,
            height: self.height,
            uv,
            pool: Some(Arc::clone(&self.inner)),
        })
    }
}

/// A pitched device buffer holding one captured frame. Filled by a copy from the EGL-mapped
/// dmabuf (so the dmabuf can be returned to the compositor immediately) and read by the encoder.
/// When it came from a [`BufferPool`] it recycles on drop; otherwise it frees.
pub struct DeviceBuffer {
    /// Device address of the (first) plane.
    pub ptr: CUdeviceptr,
    /// Row pitch of [`Self::ptr`] in bytes.
    pub pitch: usize,
    /// Frame width in pixels.
    pub width: u32,
    /// Frame height in pixels.
    pub height: u32,
    /// NV12 only: the interleaved chroma plane `(ptr, pitch)` paired with the Y plane in
    /// [`Self::ptr`]. `None` for the default 4-byte RGB/BGRx path. When `Some`, [`Self::ptr`] is
    /// the Y plane (1 byte/px).
    pub uv: Option<(CUdeviceptr, usize)>,
    /// The free-list to return to on drop; `None` for standalone (un-pooled) buffers.
    pool: Option<Arc<Mutex<PoolInner>>>,
}

impl DeviceBuffer {
    /// Allocate a standalone single-plane pitched buffer that is not tied to any pool (it frees
    /// its memory on drop). Prefer [`BufferPool`] on the hot path.
    pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
        let plane = alloc_pitched(width, height)?;
        Ok(Self {
            ptr: plane.0,
            pitch: plane.1,
            width,
            height,
            uv: None,
            pool: None,
        })
    }

    /// Allocate a standalone NV12 buffer (Y plane plus interleaved UV plane) that is not tied to
    /// any pool. Prefer [`BufferPool::new_nv12`] on the hot path; used by the self-test.
    pub fn alloc_nv12(width: u32, height: u32) -> Result<DeviceBuffer> {
        let (luma, chroma) = alloc_pitched_nv12(width, height)?;
        Ok(Self {
            ptr: luma.0,
            pitch: luma.1,
            width,
            height,
            uv: Some(chroma),
            pool: None,
        })
    }

    /// Whether this buffer has a chroma plane, i.e. is laid out as NV12.
    pub fn is_nv12(&self) -> bool {
        self.uv.is_some()
    }
}

impl Drop for DeviceBuffer {
    fn drop(&mut self) {
        // A zero device pointer means there is nothing to release.
        if self.ptr == 0 {
            return;
        }
        let chroma = self.uv.map(|(uv_ptr, _)| uv_ptr);
        match &self.pool {
            Some(pool) => {
                // Recycle (the consumer synchronized before dropping, so the GPU is done with
                // it). Y and its paired UV go back together so `get` can re-pair them as a unit.
                let mut free_lists = pool.lock().unwrap();
                free_lists.free.push(self.ptr);
                if let Some(uv_ptr) = chroma {
                    free_lists.free_uv.push(uv_ptr);
                }
            }
            None => {
                // The buffer may be freed on the encode thread; cuMemFree needs a current context.
                let shared_ctx = CONTEXT.get();
                // SAFETY: this is the un-pooled branch (`pool` is `None`), so this `DeviceBuffer`
                // exclusively owns `self.ptr` (and `self.uv`'s pointer, held in `chroma`), each
                // returned by `cuMemAllocPitch_v2` and freed exactly once here — `drop` runs once
                // and the `self.ptr == 0` guard above skips the sentinel/empty case, so no
                // double-free. We set the shared context current first because drop may run on a
                // thread where it isn't, and `cuMemFree_v2` needs it. Wrapper → live table;
                // results ignored (teardown).
                unsafe {
                    if let Some(ctx) = shared_ctx {
                        let _ = cuCtxSetCurrent(ctx.0);
                    }
                    let _ = cuMemFree_v2(self.ptr);
                    if let Some(uv_ptr) = chroma {
                        let _ = cuMemFree_v2(uv_ptr);
                    }
                }
            }
        }
    }
}

/// A *persistent* GL-texture→CUDA registration. The desktop NVIDIA driver only supports CUDA
/// interop through GL textures (not dmabuf EGLImages directly), so the importer renders the
/// dmabuf into a reusable `GL_RGBA8` texture and registers *that* once — then each frame only
/// maps → copies the mapped array out → unmaps (the map/unmap pair is the GL↔CUDA sync point),
/// instead of registering/unregistering every frame. Unregisters on drop.
pub struct RegisteredTexture {
    /// Handle written by `cuGraphicsGLRegisterImage` in `register_gl`; passed to the map/unmap
    /// calls each frame and to `cuGraphicsUnregisterResource` on drop (skipped when null).
    resource: CUgraphicsResource,
}

impl RegisteredTexture {
    /// Register a `GL_TEXTURE_2D` once.
    ///
    /// # Safety
    /// The GL context and the shared CUDA context must both be current on this thread, and
    /// `texture` must be a valid `GL_TEXTURE_2D`.
    pub unsafe fn register_gl(texture: u32) -> Result<RegisteredTexture> {
        const GL_TEXTURE_2D: c_uint = 0x0DE1;
        const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
        let mut resource: CUgraphicsResource = std::ptr::null_mut();
        ck(
            cuGraphicsGLRegisterImage(
                &mut resource,
                texture,
                GL_TEXTURE_2D,
                CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY,
            ),
            "cuGraphicsGLRegisterImage",
        )?;
        Ok(RegisteredTexture { resource })
    }

    /// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then
    /// unmap. The copy is synchronized (on our priority stream) before unmap so `dst` is ready
    /// before the source dmabuf is recycled. Always unmaps, even if the copy errors.
    pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> {
        // SAFETY: `self.resource` is the valid `CUgraphicsResource` from a successful `register_gl`
        // (its only constructor), so the wrappers forward to the live table; the caller holds the
        // GL+CUDA contexts current (the registration's contract). `cuGraphicsMapResources` maps
        // `count == 1` resource via `&mut self.resource` (a live field) on the default stream;
        // `cuGraphicsSubResourceGetMappedArray` writes the mapped `CUarray` into the live local
        // `array` (index 0, mip 0). On failure we unmap and bail (balanced). `&copy` is a live
        // local `CUDA_MEMCPY2D` outliving the synchronous `copy_blocking`: `srcArray` is valid
        // while mapped, `dstDevice`/`dstPitch` are `dst`'s live allocation, `width*4`×`height` fit
        // both. `copy_blocking` syncs before we unmap, so the array stays valid through the copy;
        // we always unmap afterward (even on error), keeping the map/unmap pair balanced.
        unsafe {
            ck(
                cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
                "cuGraphicsMapResources",
            )?;
            let mut array: CUarray = std::ptr::null_mut();
            if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
                let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
                bail!("cuGraphicsSubResourceGetMappedArray failed");
            }
            let copy = CUDA_MEMCPY2D {
                srcMemoryType: CU_MEMORYTYPE_ARRAY,
                srcArray: array,
                dstMemoryType: CU_MEMORYTYPE_DEVICE,
                dstDevice: dst.ptr,
                dstPitch: dst.pitch,
                WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
                Height: dst.height as usize,
                ..Default::default()
            };
            let res = copy_blocking(&copy, "cuMemcpy2DAsync_v2");
            let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
            res
        }
    }

    /// Map this texture for the frame and copy its array into the device plane `(dst_ptr,
    /// dst_pitch)`, taking `width_bytes`×`height` bytes (the GL internal format dictates
    /// `width_bytes`: `width*1` for an `R8` luma target, `(width/2)*2` for an `RG8` chroma target).
    /// Synchronized on our priority stream before unmap (so the source dmabuf is safe to recycle).
    /// Always unmaps, even on copy error.
    fn copy_mapped_plane(
        &mut self,
        dst_ptr: CUdeviceptr,
        dst_pitch: usize,
        width_bytes: usize,
        height: usize,
    ) -> Result<()> {
        // SAFETY: identical contract to `copy_mapped_to` — `self.resource` is the valid
        // `CUgraphicsResource` from `register_gl` (wrappers → live table; caller holds GL+CUDA
        // contexts current). Map `count == 1` resource via the live `&mut self.resource`; the
        // mapped `CUarray` is written into the live local `array` (index 0, mip 0); on failure we
        // unmap and bail (balanced). `&copy` is a live local outliving the synchronous
        // `copy_blocking`: `srcArray` valid while mapped, `dstDevice`/`dstPitch` are the caller's
        // live plane, `width_bytes`×`height` fit it. We always unmap afterward, even on copy error,
        // so the map/unmap pair stays balanced and the array outlives the copy.
        unsafe {
            ck(
                cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
                "cuGraphicsMapResources",
            )?;
            let mut array: CUarray = std::ptr::null_mut();
            if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
                let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
                bail!("cuGraphicsSubResourceGetMappedArray failed");
            }
            let copy = CUDA_MEMCPY2D {
                srcMemoryType: CU_MEMORYTYPE_ARRAY,
                srcArray: array,
                dstMemoryType: CU_MEMORYTYPE_DEVICE,
                dstDevice: dst_ptr,
                dstPitch: dst_pitch,
                WidthInBytes: width_bytes,
                Height: height,
                ..Default::default()
            };
            let res = copy_blocking(&copy, "cuMemcpy2DAsync_v2(plane)");
            let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
            res
        }
    }
}

/// Fill `dst`'s Y and UV planes from the two NV12 convert targets (registered `R8` luma and `RG8`
/// chroma GL textures). `dst` must be an NV12 buffer (`dst.uv` set), otherwise an error is
/// returned before anything is mapped. Luma is `width`×`height` bytes; chroma is `(width/2)·2`
/// bytes wide × `height/2` rows. Each copy syncs on our priority stream before returning, so the
/// dmabuf is safe to recycle once this returns.
pub fn copy_mapped_nv12(
    y_tex: &mut RegisteredTexture,
    uv_tex: &mut RegisteredTexture,
    dst: &DeviceBuffer,
) -> Result<()> {
    let Some((uv_ptr, uv_pitch)) = dst.uv else {
        return Err(anyhow::anyhow!("copy_mapped_nv12 on a non-NV12 buffer"));
    };
    let (width, height) = (dst.width as usize, dst.height as usize);

    // Luma first; a failure here skips the chroma copy.
    y_tex.copy_mapped_plane(dst.ptr, dst.pitch, width, height)?;

    let chroma_width_bytes = (width / 2) * 2;
    let chroma_rows = height / 2;
    uv_tex.copy_mapped_plane(uv_ptr, uv_pitch, chroma_width_bytes, chroma_rows)
}

/// Copy a pitched device buffer into another device region (device→device), e.g. our imported
/// [`DeviceBuffer`] into a pooled CUDA surface NVENC owns. Both are 4-byte (BGRx) pixels.
/// The caller must have the shared context current on this thread (see [`make_current`]).
pub fn copy_device_to_device(
    src: &DeviceBuffer,
    dst_ptr: CUdeviceptr,
    dst_pitch: usize,
) -> Result<()> {
    let copy = CUDA_MEMCPY2D {
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src.ptr,
        srcPitch: src.pitch,
        dstMemoryType: CU_MEMORYTYPE_DEVICE,
        dstDevice: dst_ptr,
        dstPitch: dst_pitch,
        WidthInBytes: src.width as usize * 4,
        Height: src.height as usize,
        ..Default::default()
    };
    // SAFETY: `copy_blocking` is unsafe (issues a CUDA copy); the caller must have the shared
    // context current (documented). `&copy` is a live local device→device `CUDA_MEMCPY2D` outliving
    // the synchronous call: `srcDevice`/`srcPitch` are `src`'s live allocation, `dstDevice`/
    // `dstPitch` the caller's live region, `width*4`×`height` within both. Wrapper → live table.
    unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(dev->dev)") }
}

/// Copy both planes of our imported NV12 [`DeviceBuffer`] into NVENC's two-plane CUDA surface
/// `(y_dst, y_pitch)` / `(uv_dst, uv_pitch)` (`av_hwframe_get_buffer`'s `data[0]`/`data[1]` +
/// `linesize[0]`/`linesize[1]`). Luma is `width`×`height` bytes; chroma is `(width/2)·2` bytes ×
/// `height/2` rows. Returns an error before copying anything if `src` has no UV plane. The caller
/// must have the shared context current.
pub fn copy_nv12_to_device(
    src: &DeviceBuffer,
    y_dst: CUdeviceptr,
    y_pitch: usize,
    uv_dst: CUdeviceptr,
    uv_pitch: usize,
) -> Result<()> {
    let Some((src_uv_ptr, src_uv_pitch)) = src.uv else {
        return Err(anyhow::anyhow!("copy_nv12_to_device on a non-NV12 buffer"));
    };
    let (width, height) = (src.width as usize, src.height as usize);

    // Both planes use the same device→device descriptor shape; only addresses and extents differ.
    let plane_copy = |src_ptr: CUdeviceptr,
                      src_pitch: usize,
                      dst_ptr: CUdeviceptr,
                      dst_pitch: usize,
                      width_bytes: usize,
                      rows: usize| CUDA_MEMCPY2D {
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src_ptr,
        srcPitch: src_pitch,
        dstMemoryType: CU_MEMORYTYPE_DEVICE,
        dstDevice: dst_ptr,
        dstPitch: dst_pitch,
        WidthInBytes: width_bytes,
        Height: rows,
        ..Default::default()
    };
    // 1 byte/px luma.
    let luma = plane_copy(src.ptr, src.pitch, y_dst, y_pitch, width, height);
    // 2 bytes/sample interleaved U,V at half resolution.
    let chroma = plane_copy(
        src_uv_ptr,
        src_uv_pitch,
        uv_dst,
        uv_pitch,
        (width / 2) * 2,
        height / 2,
    );

    // SAFETY: two unsafe `copy_blocking` device→device copies; the caller must have the shared
    // context current (documented). `&luma`/`&chroma` are live local `CUDA_MEMCPY2D`s outliving
    // each synchronous call. All four device pointers are valid: `src.ptr`/`src_uv_ptr` come from
    // a live NV12 `DeviceBuffer` (its `.uv` presence was checked above), `y_dst`/`uv_dst` are the
    // caller's live NVENC surface planes; the luma copy is `width`×`height`, the chroma copy
    // `(width/2)*2`×`height/2`, each within its planes. Wrappers → live table.
    unsafe {
        copy_blocking(&luma, "cuMemcpy2DAsync_v2(nv12 Y dev->dev)")?;
        copy_blocking(&chroma, "cuMemcpy2DAsync_v2(nv12 UV dev->dev)")
    }
}

impl Drop for RegisteredTexture {
    fn drop(&mut self) {
        // Nothing was registered, so there is nothing to undo.
        if self.resource.is_null() {
            return;
        }
        // SAFETY: `self.resource` is non-null (checked above) and is the valid
        // `CUgraphicsResource` from `register_gl`, owned exclusively by this `RegisteredTexture`
        // and unregistered exactly once here (drop runs once) — no use-after-free or
        // double-unregister. `cuGraphicsUnregisterResource` releases the GL↔CUDA registration;
        // wrapper → live table (the resource exists ⇒ the driver was present). Result ignored
        // (best-effort teardown).
        unsafe {
            let _ = cuGraphicsUnregisterResource(self.resource);
        }
    }
}

/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
pub struct ExternalDmabuf {
    /// External-memory handle written by `cuImportExternalMemory`; destroyed on drop.
    ext: CUexternalMemory,
    /// Device address written by `cuExternalMemoryGetMappedBuffer`; freed on drop with
    /// `cuMemFree_v2`, before `ext` is destroyed.
    pub ptr: CUdeviceptr,
    /// Byte length given at import time; the mapping covers this many bytes from offset 0.
    pub size: u64,
}

// SAFETY: the fields are opaque CUDA driver handles — an external-memory handle and a device
// pointer — not dereferenceable Rust memory, and the value is uniquely owned (no `Clone`). The
// manual impl is needed only because `ext` is a raw `*mut c_void`, which is not `Send` by default.
// The value is used from a single capture thread but constructed on / moved between threads with
// the importer; transferring these handles is sound because uniqueness rules out aliasing and they
// are destroyed exactly once in `Drop`. Only `Send` (not `Sync`) is asserted, matching the
// single-thread use.
unsafe impl Send for ExternalDmabuf {}

impl ExternalDmabuf {
    /// Import `fd` (NOT consumed — an internal `dup` is handed to the driver, which owns it
    /// from then on) and map its full `size` bytes to a device pointer. The shared context
    /// must be current.
    ///
    /// # Errors
    /// Fails if `dup` fails (the OS error is included in the message) or if
    /// [`Self::import_owned_fd`] fails.
    pub fn import(fd: i32, size: u64) -> Result<ExternalDmabuf> {
        // SAFETY: `libc::dup` only reads the integer `fd` and returns a new descriptor (or -1); it
        // touches no Rust memory and `fd` is the caller's still-owned dmabuf fd (not consumed
        // here). No aliasing or lifetime concern — a pure syscall on an integer.
        let dup = unsafe { libc::dup(fd) };
        if dup < 0 {
            // Read errno right after the failing call so fd exhaustion and a bad fd are
            // distinguishable in the report.
            bail!(
                "dup(dmabuf fd) failed: {}",
                std::io::Error::last_os_error()
            );
        }
        Self::import_owned_fd(dup, size)
    }

    /// Import an fd the caller hands over (e.g. a Vulkan-exported `OPAQUE_FD`) — consumed by
    /// the driver on success, closed by us on failure.
    ///
    /// # Errors
    /// Fails if `cuImportExternalMemory` or `cuExternalMemoryGetMappedBuffer` returns a non-zero
    /// result; the driver result code is included in the message. On either failure nothing is
    /// left for the caller to clean up: the fd is closed, or the imported memory is destroyed.
    pub fn import_owned_fd(dup: i32, size: u64) -> Result<ExternalDmabuf> {
        let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
            type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
            size,
            ..Default::default()
        };
        desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes)
        let mut ext: CUexternalMemory = std::ptr::null_mut();
        // SAFETY: `cuImportExternalMemory` imports the memory described by `&desc`, a live local
        // `#[repr(C)] CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h 64-bit layout) that outlives this
        // synchronous call: `type_` is OPAQUE_FD, `handle[0]` holds the dup'd fd in the union's
        // `int fd` low bytes, `size` is set. `&mut ext` is a live null-init out-param the driver
        // writes the imported handle into. The driver takes ownership of the fd only on success.
        // Distinct locals → no aliasing. Wrapper → live table (caller holds the context current).
        let r = unsafe { cuImportExternalMemory(&mut ext, &desc) };
        if r != 0 {
            // SAFETY: import failed (`r != 0`), so the driver did NOT take ownership of `dup`; we
            // still own it and close it exactly once here on the error path (the success path never
            // closes it — the driver does). `libc::close` acts on the integer fd alone.
            unsafe { libc::close(dup) }; // import failed → the driver did not take the fd
            bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?");
        }
        // Map the whole import: offset 0, full `size`.
        let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
            offset: 0,
            size,
            ..Default::default()
        };
        let mut ptr: CUdeviceptr = 0;
        // SAFETY: maps a device pointer from `ext` (the valid `CUexternalMemory` just imported) per
        // `&buf`, a live local `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (offset 0, full `size`) that
        // outlives this synchronous call. `&mut ptr` is a live zero-init out-param the driver writes
        // the mapped device address into; distinct locals → no aliasing. Wrapper → live table
        // (context current).
        let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) };
        if r != 0 {
            // SAFETY: mapping failed; `ext` is the valid `CUexternalMemory` we imported and
            // exclusively own. We destroy it exactly once here on the error path (the success path
            // instead moves it into the returned `ExternalDmabuf`, whose `Drop` destroys it),
            // releasing the fd the driver took — no double-destroy or use-after-free.
            unsafe {
                let _ = cuDestroyExternalMemory(ext);
            }
            bail!("cuExternalMemoryGetMappedBuffer failed ({r})");
        }
        Ok(ExternalDmabuf { ext, ptr, size })
    }
}

impl Drop for ExternalDmabuf {
    fn drop(&mut self) {
        let shared_ctx = CONTEXT.get();
        let has_mapping = self.ptr != 0;
        let has_import = !self.ext.is_null();
        // SAFETY: this `ExternalDmabuf` only exists after a successful import, so the driver table
        // is live. It exclusively owns `self.ptr` (the mapped buffer) and `self.ext` (the external
        // memory), each torn down exactly once here (drop runs once; guarded by `has_mapping` /
        // `has_import`, i.e. `!= 0` / `!null`) — no double-free or use-after-free. We make the
        // shared context current first because drop may run off the import thread, and we free
        // the mapped buffer before destroying its backing external memory. Results ignored
        // (best-effort teardown).
        unsafe {
            if let Some(ctx) = shared_ctx {
                let _ = cuCtxSetCurrent(ctx.0);
            }
            if has_mapping {
                // Mapped buffers are freed like device memory.
                let _ = cuMemFree_v2(self.ptr);
            }
            if has_import {
                let _ = cuDestroyExternalMemory(self.ext);
            }
        }
    }
}

/// Copy a pitched span starting at `src_ptr` (e.g. an [`ExternalDmabuf`] mapping at the chunk
/// offset) into `dst`. The shared context must be current on this thread.
pub fn copy_pitched_to_buffer(
    src_ptr: CUdeviceptr,
    src_pitch: usize,
    dst: &DeviceBuffer,
) -> Result<()> {
    let copy = CUDA_MEMCPY2D {
        srcMemoryType: CU_MEMORYTYPE_DEVICE,
        srcDevice: src_ptr,
        srcPitch: src_pitch,
        dstMemoryType: CU_MEMORYTYPE_DEVICE,
        dstDevice: dst.ptr,
        dstPitch: dst.pitch,
        WidthInBytes: dst.width as usize * 4,
        Height: dst.height as usize,
        ..Default::default()
    };
    // copy_blocking syncs our priority stream before returning, so the copy is complete before the
    // dmabuf is requeued to the producer.
    // SAFETY: `copy_blocking` is unsafe (issues a CUDA copy); the caller must have the shared
    // context current (documented). `&copy` is a live local device→device `CUDA_MEMCPY2D` outliving
    // the synchronous call: `srcDevice`/`srcPitch` are the caller's live mapped span (e.g. an
    // `ExternalDmabuf`), `dstDevice`/`dstPitch` are `dst`'s live allocation, `width*4`×`height`
    // within both. Wrapper → live table.
    unsafe { copy_blocking(&copy, "cuMemcpy2DAsync_v2(ext->dev)") }
}