rename: lumen → punktfunk, everywhere

Full project rename, decided 2026-06-10: - Crates/binaries: punktfunk-core / punktfunk-host / punktfunk-client-rs. - C ABI: punktfunk_* symbols, Punktfunk* types, include/punktfunk_core.h, PUNKTFUNK_FEATURE_QUIC guard (header regenerated; cbindgen renames updated, incl. PUNKTFUNK_BTN_*/PUNKTFUNK_AXIS_* wire constants). - Protocol: punktfunk/1 — control-plane magic LMN1 → PKF1, nonce salt lmn1 → pkf1. WIRE BREAK: clients must be rebuilt from this revision. - Env knobs: PUNKTFUNK_VIDEO_SOURCE / PUNKTFUNK_COMPOSITOR / PUNKTFUNK_ZEROCOPY / …. - Host config dir: ~/.config/punktfunk (the box's dir was migrated in place — the persistent identity is unchanged, pinned fingerprints stay valid). - Swift package: PunktfunkKit + PunktfunkCore.xcframework + PunktfunkConnection (Sources/PunktfunkClient app + tests renamed with it); build-xcframework.sh updated. - scripts/: 60-punktfunk.rules, punktfunk-host.service; OpenAPI doc regenerated. Also: scripts/headless/run-headless-kde.sh — full headless Plasma bringup. Root cause of "desktop but no apps/settings" over the stream: plasmashell launched without XDG_MENU_PREFIX=plasma-, so the launcher resolved a nonexistent applications.menu and rendered an empty menu. The script sets the complete KDE session env (menu prefix, KDE_FULL_SESSION, session version) and rebuilds ksycoca before starting plasmashell. Gate: 97/97 tests, clippy -D warnings (both feature sets), fmt, C-ABI harness PASS, zero lumen references left outside .git. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-10 13:11:59 +00:00
parent b8b23c8fb2
commit bfd64ce871
119 changed files with 1245 additions and 1185 deletions
@@ -0,0 +1,509 @@
+//! Minimal CUDA Driver API FFI for the zero-copy path. No Rust crate exposes the GL-interop
+//! driver calls we need (`cuGraphicsGLRegisterImage` & co.), so we hand-roll exactly those and
+//! link `libcuda.so.1` (the driver library — NOT `libcudart`). Symbol names verified against
+//! `cust_raw` + `cudaGL.h`: the context/mem ops use the `_v2` ABI suffix; the graphics-interop
+//! ops are unsuffixed. (We use GL interop, not EGL interop: `cuGraphicsEGLRegisterImage` is
+//! Tegra-only on the desktop driver — see [`super::egl`].)
+//!
+//! One process-wide `CUcontext` is created lazily and shared by the EGL importer (capture
+//! thread) and ffmpeg's `hevc_nvenc` (encode thread); each thread makes it current before use.
+
+#![allow(non_camel_case_types, non_snake_case)]
+
+use anyhow::{bail, Result};
+use std::os::raw::{c_int, c_uint, c_void};
+use std::sync::{Arc, Mutex, OnceLock};
+
+/// Status code returned by every driver call declared below; `0` means success (see `ck`).
+pub type CUresult = c_uint; // CUDA_SUCCESS == 0
+/// Device handle as written by `cuDeviceGet`.
+pub type CUdevice = c_int;
+pub type CUcontext = *mut c_void; // opaque CUctx_st*
+pub type CUstream = *mut c_void; // opaque CUstream_st*
+/// Device address, declared here as a fixed 64-bit integer.
+pub type CUdeviceptr = u64;
+/// Opaque handle written by `cuGraphicsGLRegisterImage`.
+pub type CUgraphicsResource = *mut c_void;
+/// Opaque array handle written by `cuGraphicsSubResourceGetMappedArray`.
+pub type CUarray = *mut c_void;
+pub type CUexternalMemory = *mut c_void; // opaque CUextMemory_st*
+
+/// `CUmemorytype` (cuda.h): HOST=1, DEVICE=2, ARRAY=3, UNIFIED=4.
+/// Only the two values this module stores into `CUDA_MEMCPY2D` are defined.
+pub const CU_MEMORYTYPE_DEVICE: c_uint = 2;
+pub const CU_MEMORYTYPE_ARRAY: c_uint = 3;
+
+/// `CUDA_MEMCPY2D` (cuda.h, `_v2` ABI). Field order is load-bearing.
+///
+/// Passed by pointer to `cuMemcpy2D_v2`. The copy helpers in this module fill in only the
+/// fields that matter for their source/destination memory types and take everything else from
+/// `Default` (zero offsets and pitches, null pointers).
+#[repr(C)]
+#[derive(Default)]
+pub struct CUDA_MEMCPY2D {
+    // ---- source description ----
+    pub srcXInBytes: usize,
+    pub srcY: usize,
+    pub srcMemoryType: c_uint, // one of the CU_MEMORYTYPE_* values
+    pub srcHost: *const c_void,
+    pub srcDevice: CUdeviceptr,
+    pub srcArray: CUarray,
+    pub srcPitch: usize,
+    // ---- destination description ----
+    pub dstXInBytes: usize,
+    pub dstY: usize,
+    pub dstMemoryType: c_uint, // one of the CU_MEMORYTYPE_* values
+    pub dstHost: *mut c_void,
+    pub dstDevice: CUdeviceptr,
+    pub dstArray: CUarray,
+    pub dstPitch: usize,
+    // ---- extent of the copied rectangle ----
+    pub WidthInBytes: usize,
+    pub Height: usize,
+}
+
+/// `CUDA_EXTERNAL_MEMORY_HANDLE_DESC` (cuda.h, 64-bit layout). `handle` is a union whose
+/// largest member is the win32 two-pointer struct (16 bytes, align 8); for the OPAQUE_FD type
+/// only the first 4 bytes (the `int fd`) are read.
+///
+/// The private `_pad`/`_pad2`/`reserved` fields exist only to reproduce the C layout; they are
+/// left at their `Default` (zero) values by the code in this module.
+#[repr(C)]
+#[derive(Default)]
+pub struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
+    pub type_: c_uint, // CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1
+    _pad: u32, // explicit padding so `handle` starts on an 8-byte boundary
+    pub handle: [u64; 2], // union { int fd; {void*,void*} win32; void* nvSciBufObject }
+    pub size: u64, // size in bytes of the memory behind the handle
+    pub flags: c_uint,
+    reserved: [c_uint; 16],
+    _pad2: u32, // explicit tail padding up to the struct's 8-byte alignment
+}
+
+/// `CUDA_EXTERNAL_MEMORY_BUFFER_DESC` (cuda.h, 64-bit layout).
+///
+/// Describes the byte range of an imported external memory object that
+/// `cuExternalMemoryGetMappedBuffer` should map; this module always maps from offset 0.
+#[repr(C)]
+#[derive(Default)]
+pub struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
+    pub offset: u64, // byte offset into the external memory object
+    pub size: u64, // number of bytes to map
+    pub flags: c_uint,
+    reserved: [c_uint; 16],
+    _pad: u32, // explicit tail padding up to the struct's 8-byte alignment
+}
+
+/// Value for `CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type_` when the handle is a file descriptor.
+pub const CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD: c_uint = 1;
+
+// Raw driver entry points. Every function returns a `CUresult`; callers in this module route
+// it through `ck` or compare it against 0 directly.
+#[link(name = "cuda")]
+extern "C" {
+    // Context and memory operations.
+    fn cuInit(flags: c_uint) -> CUresult;
+    fn cuDeviceGet(device: *mut CUdevice, ordinal: c_int) -> CUresult;
+    fn cuCtxCreate_v2(pctx: *mut CUcontext, flags: c_uint, dev: CUdevice) -> CUresult;
+    fn cuCtxSetCurrent(ctx: CUcontext) -> CUresult;
+    fn cuMemAllocPitch_v2(
+        dptr: *mut CUdeviceptr,
+        pitch: *mut usize,
+        width_bytes: usize,
+        height: usize,
+        element_size: c_uint,
+    ) -> CUresult;
+    fn cuMemFree_v2(dptr: CUdeviceptr) -> CUresult;
+    fn cuMemcpy2D_v2(copy: *const CUDA_MEMCPY2D) -> CUresult;
+    fn cuCtxSynchronize() -> CUresult;
+
+    // GL interop (cudaGL.h) — these symbols have NO `_v2` suffix. `cuGraphicsEGLRegisterImage`
+    // is Tegra-only on the desktop driver, so we go EGLImage → GL texture → register the texture.
+    fn cuGraphicsGLRegisterImage(
+        resource: *mut CUgraphicsResource,
+        texture: c_uint, // GLuint
+        target: c_uint,  // GL_TEXTURE_2D = 0x0DE1
+        flags: c_uint,   // CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01
+    ) -> CUresult;
+    // `stream` is declared as a raw pointer; this module always passes null.
+    fn cuGraphicsMapResources(
+        count: c_uint,
+        resources: *mut CUgraphicsResource,
+        stream: *mut c_void,
+    ) -> CUresult;
+    fn cuGraphicsUnmapResources(
+        count: c_uint,
+        resources: *mut CUgraphicsResource,
+        stream: *mut c_void,
+    ) -> CUresult;
+    fn cuGraphicsSubResourceGetMappedArray(
+        array: *mut CUarray,
+        resource: CUgraphicsResource,
+        array_index: c_uint,
+        mip_level: c_uint,
+    ) -> CUresult;
+    fn cuGraphicsUnregisterResource(resource: CUgraphicsResource) -> CUresult;
+
+    // External memory (cuda.h, no `_v2` suffix) — imports a (Vulkan-exported) dmabuf fd as
+    // device memory. Used for LINEAR dmabufs (gamescope), which EGL/GL interop can't sample.
+    fn cuImportExternalMemory(
+        ext_mem_out: *mut CUexternalMemory,
+        mem_handle_desc: *const CUDA_EXTERNAL_MEMORY_HANDLE_DESC,
+    ) -> CUresult;
+    fn cuExternalMemoryGetMappedBuffer(
+        dev_ptr: *mut CUdeviceptr,
+        ext_mem: CUexternalMemory,
+        buffer_desc: *const CUDA_EXTERNAL_MEMORY_BUFFER_DESC,
+    ) -> CUresult;
+    fn cuDestroyExternalMemory(ext_mem: CUexternalMemory) -> CUresult;
+}
+
+/// Convert a raw driver status into a `Result`: `0` is success, any other value becomes an
+/// error that carries the numeric code and `what` (the name of the call that produced it).
+#[inline]
+fn ck(r: CUresult, what: &str) -> Result<()> {
+    match r {
+        0 => Ok(()),
+        _ => bail!("CUDA driver error {r} in {what}"),
+    }
+}
+
+/// The shared process-wide CUDA context (created once). Wrapped so it's `Send`/`Sync` to live
+/// in a `OnceLock`; the raw `CUcontext` is thread-safe to make current from any thread.
+#[derive(Clone, Copy)]
+pub struct Context(pub CUcontext);
+// SAFETY (as stated above): the wrapped handle is only ever passed to `cuCtxSetCurrent` /
+// handed out by value; nothing dereferences it on the Rust side.
+unsafe impl Send for Context {}
+unsafe impl Sync for Context {}
+
+/// Storage for the lazily created shared context; populated by `context()`.
+static CONTEXT: OnceLock<Context> = OnceLock::new();
+
+/// Return the process-wide CUDA context, creating it on device ordinal 0 the first time.
+///
+/// # Errors
+/// Fails if `cuInit`, `cuDeviceGet` or `cuCtxCreate_v2` reports a non-zero status.
+pub fn context() -> Result<CUcontext> {
+    // Fast path: somebody already published a context.
+    if let Some(shared) = CONTEXT.get() {
+        return Ok(shared.0);
+    }
+    let mut device: CUdevice = 0;
+    let mut created: CUcontext = std::ptr::null_mut();
+    unsafe {
+        ck(cuInit(0), "cuInit")?;
+        ck(cuDeviceGet(&mut device, 0), "cuDeviceGet")?;
+        ck(cuCtxCreate_v2(&mut created, 0, device), "cuCtxCreate_v2")?;
+    }
+    // Two threads may both get here on first use. Whichever publishes first wins and its
+    // context is returned to everyone; the other thread's freshly created context is simply
+    // never used again (a one-off, process-lifetime leak that is accepted by design).
+    let Context(winner) = *CONTEXT.get_or_init(|| Context(created));
+    Ok(winner)
+}
+
+/// Bind the shared context to the calling thread. Every CUDA operation in this module expects
+/// that to have happened on the thread issuing it.
+///
+/// # Errors
+/// Fails if the context cannot be created or `cuCtxSetCurrent` reports a non-zero status.
+pub fn make_current() -> Result<()> {
+    let shared = context()?;
+    let status = unsafe { cuCtxSetCurrent(shared) };
+    ck(status, "cuCtxSetCurrent")
+}
+
+/// Allocate a single pitched device buffer sized for `width` x `height` pixels of 4 bytes each.
+///
+/// Returns the device pointer together with the row pitch (in bytes) chosen by the driver.
+///
+/// # Errors
+/// Fails if `cuMemAllocPitch_v2` reports a non-zero status.
+fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
+    // Element size handed to the driver's pitch computation.
+    const ELEMENT_SIZE: c_uint = 16;
+
+    let row_bytes = width as usize * 4;
+    let rows = height as usize;
+    let mut dev_ptr: CUdeviceptr = 0;
+    let mut row_pitch: usize = 0;
+    let status = unsafe {
+        cuMemAllocPitch_v2(&mut dev_ptr, &mut row_pitch, row_bytes, rows, ELEMENT_SIZE)
+    };
+    ck(status, "cuMemAllocPitch_v2")?;
+    Ok((dev_ptr, row_pitch))
+}
+
+/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the
+/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and
+/// returns its allocation here. Bulk-freed when the last reference drops.
+struct PoolInner {
+    /// Device pointers currently not handed out; popped by `BufferPool::get`, pushed back by
+    /// `DeviceBuffer`'s `Drop`.
+    free: Vec<CUdeviceptr>,
+}
+
+impl Drop for PoolInner {
+    /// Release every allocation still sitting in the free-list. Driver errors are ignored:
+    /// there is nothing useful to do with them during teardown.
+    fn drop(&mut self) {
+        // Make the shared context current first (if it exists), mirroring what the un-pooled
+        // `DeviceBuffer` drop does before calling `cuMemFree_v2`.
+        if let Some(shared) = CONTEXT.get() {
+            let _ = unsafe { cuCtxSetCurrent(shared.0) };
+        }
+        self.free.iter().for_each(|&dev_ptr| {
+            let _ = unsafe { cuMemFree_v2(dev_ptr) };
+        });
+    }
+}
+
+/// A pool of reusable pitched device buffers for a fixed resolution. Eliminates the per-frame
+/// `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K) that takes the device allocator lock
+/// and serializes against the GPU every frame.
+///
+/// Cloning is cheap: clones share the same free-list through the inner `Arc`.
+#[derive(Clone)]
+pub struct BufferPool {
+    /// Shared free-list; also referenced by every `DeviceBuffer` handed out by `get`.
+    inner: Arc<Mutex<PoolInner>>,
+    /// Pixel width every buffer of this pool is sized for.
+    width: u32,
+    /// Pixel height every buffer of this pool is sized for.
+    height: u32,
+    /// Row pitch (bytes) learned from the first allocation and stamped on every buffer.
+    pitch: usize,
+}
+
+impl BufferPool {
+    /// Create a pool for `width`x`height` 4-byte buffers (allocates one up front to learn the
+    /// driver's pitch, which is constant for a given width).
+    ///
+    /// # Errors
+    /// Fails if the initial device allocation fails.
+    pub fn new(width: u32, height: u32) -> Result<BufferPool> {
+        let (ptr, pitch) = alloc_pitched(width, height)?;
+        Ok(BufferPool {
+            inner: Arc::new(Mutex::new(PoolInner { free: vec![ptr] })),
+            width,
+            height,
+            pitch,
+        })
+    }
+
+    /// Pixel width the pool's buffers are sized for.
+    pub fn width(&self) -> u32 {
+        self.width
+    }
+
+    /// Pixel height the pool's buffers are sized for.
+    pub fn height(&self) -> u32 {
+        self.height
+    }
+
+    /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
+    /// pool when dropped (after the consumer has synchronized, so the GPU is done with it).
+    ///
+    /// # Errors
+    /// Fails if a fresh allocation is needed and fails, or if the driver hands back a row pitch
+    /// different from the one the pool was created with.
+    pub fn get(&self) -> Result<DeviceBuffer> {
+        // The free-list is a plain `Vec`; a poisoned lock cannot leave it half-updated, so
+        // recover the guard instead of panicking.
+        let recycled = self
+            .inner
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner())
+            .free
+            .pop();
+        let ptr = match recycled {
+            Some(p) => p,
+            None => {
+                let (fresh, pitch) = alloc_pitched(self.width, self.height)?;
+                // Every buffer from this pool is labelled with `self.pitch`. Verify the driver
+                // agrees rather than assuming it: a mismatch would make later 2D copies write
+                // with the wrong row stride.
+                if pitch != self.pitch {
+                    unsafe {
+                        let _ = cuMemFree_v2(fresh);
+                    }
+                    bail!(
+                        "cuMemAllocPitch_v2 returned pitch {pitch}, but the pool expects {}",
+                        self.pitch
+                    );
+                }
+                fresh
+            }
+        };
+        Ok(DeviceBuffer {
+            ptr,
+            pitch: self.pitch,
+            width: self.width,
+            height: self.height,
+            pool: Some(self.inner.clone()),
+        })
+    }
+}
+
+/// A pitched device buffer holding one captured frame. Filled by a copy from the EGL-mapped
+/// dmabuf (so the dmabuf can be returned to the compositor immediately) and read by the encoder.
+/// When it came from a [`BufferPool`] it recycles on drop; otherwise it frees.
+pub struct DeviceBuffer {
+    /// Device address of the first row. A value of 0 makes `Drop` a no-op.
+    pub ptr: CUdeviceptr,
+    /// Distance in bytes between the starts of consecutive rows.
+    pub pitch: usize,
+    /// Width in pixels (the copy helpers treat each pixel as 4 bytes).
+    pub width: u32,
+    /// Height in rows.
+    pub height: u32,
+    /// Free-list to return `ptr` to on drop; `None` for standalone allocations.
+    pool: Option<Arc<Mutex<PoolInner>>>,
+}
+
+impl DeviceBuffer {
+    /// Allocate a buffer that is not tied to any pool and is freed outright when dropped.
+    /// On the per-frame path a [`BufferPool`] is the better choice.
+    ///
+    /// # Errors
+    /// Fails if the device allocation fails.
+    pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
+        alloc_pitched(width, height).map(|(ptr, pitch)| DeviceBuffer {
+            ptr,
+            pitch,
+            width,
+            height,
+            pool: None,
+        })
+    }
+}
+
+impl Drop for DeviceBuffer {
+    /// Return the allocation to its pool, or free it if the buffer is standalone.
+    /// A zero `ptr` means there is nothing to release.
+    fn drop(&mut self) {
+        if self.ptr == 0 {
+            return;
+        }
+        if let Some(pool) = &self.pool {
+            // Recycle (the consumer synchronized before dropping, so the GPU is done with it).
+            // A destructor must not panic: if the lock is poisoned, take the guard anyway —
+            // the free-list is a plain `Vec` and stays valid — instead of unwinding (which
+            // would abort when already panicking and would lose the allocation).
+            pool.lock()
+                .unwrap_or_else(|poisoned| poisoned.into_inner())
+                .free
+                .push(self.ptr);
+        } else {
+            // The buffer may be freed on the encode thread; cuMemFree needs a current context.
+            unsafe {
+                if let Some(c) = CONTEXT.get() {
+                    let _ = cuCtxSetCurrent(c.0);
+                }
+                let _ = cuMemFree_v2(self.ptr);
+            }
+        }
+    }
+}
+
+/// A *persistent* GL-texture→CUDA registration. The desktop NVIDIA driver only supports CUDA
+/// interop through GL textures (not dmabuf EGLImages directly), so the importer renders the
+/// dmabuf into a reusable `GL_RGBA8` texture and registers *that* once — then each frame only
+/// maps → copies the mapped array out → unmaps (the map/unmap pair is the GL↔CUDA sync point),
+/// instead of registering/unregistering every frame. Unregisters on drop.
+pub struct RegisteredTexture {
+    /// Handle written by `cuGraphicsGLRegisterImage`; unregistered in `Drop` when non-null.
+    resource: CUgraphicsResource,
+}
+
+impl RegisteredTexture {
+    /// Register a `GL_TEXTURE_2D` once.
+    ///
+    /// The registration is requested with the read-only flag, i.e. CUDA only reads the texture.
+    ///
+    /// # Errors
+    /// Fails if `cuGraphicsGLRegisterImage` reports a non-zero status.
+    ///
+    /// # Safety
+    /// The GL context and the shared CUDA context must both be current on this thread, and
+    /// `texture` must be a valid `GL_TEXTURE_2D`.
+    pub unsafe fn register_gl(texture: u32) -> Result<RegisteredTexture> {
+        const GL_TEXTURE_2D: c_uint = 0x0DE1;
+        const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
+        let mut resource: CUgraphicsResource = std::ptr::null_mut();
+        ck(
+            cuGraphicsGLRegisterImage(
+                &mut resource,
+                texture,
+                GL_TEXTURE_2D,
+                CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY,
+            ),
+            "cuGraphicsGLRegisterImage",
+        )?;
+        Ok(RegisteredTexture { resource })
+    }
+
+    /// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then
+    /// unmap. The `cuCtxSynchronize` ensures `dst` is ready before the source dmabuf is recycled.
+    ///
+    /// The copied rectangle is taken from `dst`'s width/height.
+    /// NOTE(review): assumes the registered texture is at least as large as `dst` and that the
+    /// shared CUDA context is current on the calling thread — confirm against the caller.
+    ///
+    /// # Errors
+    /// Fails if mapping, resolving the mapped array, the copy, or the synchronize fails.
+    pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> {
+        unsafe {
+            // Null stream argument for map/unmap throughout.
+            ck(
+                cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
+                "cuGraphicsMapResources",
+            )?;
+            let mut array: CUarray = std::ptr::null_mut();
+            if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
+                // Never leave the resource mapped on the error path.
+                let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
+                bail!("cuGraphicsSubResourceGetMappedArray failed");
+            }
+            // Array source → pitched device destination; the source fields other than
+            // `srcArray` stay at their zero defaults.
+            let copy = CUDA_MEMCPY2D {
+                srcMemoryType: CU_MEMORYTYPE_ARRAY,
+                srcArray: array,
+                dstMemoryType: CU_MEMORYTYPE_DEVICE,
+                dstDevice: dst.ptr,
+                dstPitch: dst.pitch,
+                WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
+                Height: dst.height as usize,
+                ..Default::default()
+            };
+            // Collect both statuses first so the unmap below runs regardless of the outcome;
+            // errors are reported only after the resource has been unmapped.
+            let r = cuMemcpy2D_v2(&copy);
+            let s = cuCtxSynchronize();
+            let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
+            ck(r, "cuMemcpy2D_v2")?;
+            ck(s, "cuCtxSynchronize")?;
+        }
+        Ok(())
+    }
+}
+
+/// Device→device 2D copy of `src` into the region at `dst_ptr` whose rows are `dst_pitch`
+/// bytes apart — e.g. our imported [`DeviceBuffer`] into a pooled CUDA surface NVENC owns.
+/// Pixels are 4 bytes (BGRx) on both sides; the copied rectangle is `src`'s width x height.
+/// The shared context must already be current on this thread (see [`make_current`]).
+///
+/// # Errors
+/// Fails if the copy or the following `cuCtxSynchronize` reports a non-zero status.
+pub fn copy_device_to_device(
+    src: &DeviceBuffer,
+    dst_ptr: CUdeviceptr,
+    dst_pitch: usize,
+) -> Result<()> {
+    let row_bytes = src.width as usize * 4;
+    let rows = src.height as usize;
+    let params = CUDA_MEMCPY2D {
+        WidthInBytes: row_bytes,
+        Height: rows,
+        srcMemoryType: CU_MEMORYTYPE_DEVICE,
+        srcDevice: src.ptr,
+        srcPitch: src.pitch,
+        dstMemoryType: CU_MEMORYTYPE_DEVICE,
+        dstDevice: dst_ptr,
+        dstPitch: dst_pitch,
+        ..Default::default()
+    };
+    let copied = unsafe { cuMemcpy2D_v2(&params) };
+    ck(copied, "cuMemcpy2D_v2(dev->dev)")?;
+    // Block until the copy has completed before returning to the caller.
+    let synced = unsafe { cuCtxSynchronize() };
+    ck(synced, "cuCtxSynchronize")
+}
+
+impl Drop for RegisteredTexture {
+    /// Unregister the texture from CUDA; a null handle is skipped and driver errors are ignored.
+    fn drop(&mut self) {
+        if self.resource.is_null() {
+            return;
+        }
+        let _ = unsafe { cuGraphicsUnregisterResource(self.resource) };
+    }
+}
+
+/// A dmabuf fd imported as CUDA external memory and mapped to a device pointer — the LINEAR
+/// path (gamescope): the buffer's bytes are directly addressable, no GL de-tiling needed.
+/// Cached per PipeWire buffer (the fd pool is stable for a stream's life); destroyed on drop.
+pub struct ExternalDmabuf {
+    /// External-memory handle written by `cuImportExternalMemory`.
+    ext: CUexternalMemory,
+    /// Device pointer of the mapping obtained from `cuExternalMemoryGetMappedBuffer`.
+    pub ptr: CUdeviceptr,
+    /// Number of bytes imported and mapped.
+    pub size: u64,
+}
+
+// Raw driver handles; used from the single capture thread but moved with the importer.
+unsafe impl Send for ExternalDmabuf {}
+
+impl ExternalDmabuf {
+    /// Import `fd` (NOT consumed — an internal `dup` is handed to the driver, which owns it
+    /// from then on) and map its full `size` bytes to a device pointer. The shared context
+    /// must be current.
+    ///
+    /// # Errors
+    /// Fails if `dup` fails or if the import/mapping in `import_owned_fd` fails.
+    pub fn import(fd: i32, size: u64) -> Result<ExternalDmabuf> {
+        let dup = unsafe { libc::dup(fd) };
+        if dup < 0 {
+            bail!("dup(dmabuf fd) failed");
+        }
+        Self::import_owned_fd(dup, size)
+    }
+
+    /// Import an fd the caller hands over (e.g. a Vulkan-exported `OPAQUE_FD`) — consumed by
+    /// the driver on success, closed by us on failure.
+    ///
+    /// # Errors
+    /// Fails if `cuImportExternalMemory` or `cuExternalMemoryGetMappedBuffer` reports a
+    /// non-zero status.
+    pub fn import_owned_fd(dup: i32, size: u64) -> Result<ExternalDmabuf> {
+        let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
+            type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
+            size,
+            ..Default::default()
+        };
+        desc.handle[0] = dup as u32 as u64; // union member `int fd` (little-endian low bytes)
+        let mut ext: CUexternalMemory = std::ptr::null_mut();
+        let r = unsafe { cuImportExternalMemory(&mut ext, &desc) };
+        if r != 0 {
+            unsafe { libc::close(dup) }; // import failed → the driver did not take the fd
+            bail!("cuImportExternalMemory failed ({r}) — LINEAR dmabuf import unsupported?");
+        }
+        // From here on the fd is not closed by this code: destroying `ext` is the only cleanup.
+        let buf = CUDA_EXTERNAL_MEMORY_BUFFER_DESC {
+            offset: 0,
+            size,
+            ..Default::default()
+        };
+        let mut ptr: CUdeviceptr = 0;
+        let r = unsafe { cuExternalMemoryGetMappedBuffer(&mut ptr, ext, &buf) };
+        if r != 0 {
+            // Mapping failed: release the imported memory object before bailing.
+            unsafe {
+                let _ = cuDestroyExternalMemory(ext);
+            }
+            bail!("cuExternalMemoryGetMappedBuffer failed ({r})");
+        }
+        Ok(ExternalDmabuf { ext, ptr, size })
+    }
+}
+
+impl Drop for ExternalDmabuf {
+    /// Tear down in reverse order of creation: free the mapped buffer first, then destroy the
+    /// external-memory object. Driver errors are ignored.
+    fn drop(&mut self) {
+        unsafe {
+            // Make the shared context current (if it exists) before touching driver objects.
+            if let Some(c) = CONTEXT.get() {
+                let _ = cuCtxSetCurrent(c.0);
+            }
+            if self.ptr != 0 {
+                let _ = cuMemFree_v2(self.ptr); // mapped buffers are freed like device memory
+            }
+            if !self.ext.is_null() {
+                let _ = cuDestroyExternalMemory(self.ext);
+            }
+        }
+    }
+}
+
+/// Device→device 2D copy from a raw pitched region into `dst`. `src_ptr` is the address of the
+/// first source row (e.g. an [`ExternalDmabuf`] mapping at the chunk offset) and `src_pitch`
+/// the byte distance between source rows; the copied rectangle is `dst`'s width x height at
+/// 4 bytes per pixel. The shared context must be current on this thread.
+///
+/// # Errors
+/// Fails if the copy or the following `cuCtxSynchronize` reports a non-zero status.
+pub fn copy_pitched_to_buffer(
+    src_ptr: CUdeviceptr,
+    src_pitch: usize,
+    dst: &DeviceBuffer,
+) -> Result<()> {
+    let row_bytes = dst.width as usize * 4;
+    let rows = dst.height as usize;
+    let params = CUDA_MEMCPY2D {
+        WidthInBytes: row_bytes,
+        Height: rows,
+        srcMemoryType: CU_MEMORYTYPE_DEVICE,
+        srcDevice: src_ptr,
+        srcPitch: src_pitch,
+        dstMemoryType: CU_MEMORYTYPE_DEVICE,
+        dstDevice: dst.ptr,
+        dstPitch: dst.pitch,
+        ..Default::default()
+    };
+    let copied = unsafe { cuMemcpy2D_v2(&params) };
+    ck(copied, "cuMemcpy2D_v2(ext->dev)")?;
+    // The dmabuf goes back to its producer after this returns, so wait for the copy to finish.
+    let synced = unsafe { cuCtxSynchronize() };
+    ck(synced, "cuCtxSynchronize")
+}
@@ -0,0 +1,528 @@
+//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (GBM platform on
+//! the render node) and import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`.
+//! The DRM format **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without
+//! the modifier yields a corrupt image or `EGL_BAD_MATCH`).
+//!
+//! Desktop NVIDIA can't register a dmabuf `EGLImage` with CUDA directly — `cuGraphicsEGLRegisterImage`
+//! is Tegra-only and `cuGraphicsGLRegisterImage` rejects EGLImage-backed textures (their internal
+//! format is opaque). So we follow OBS/Sunshine: bind the `EGLImage` to a GL texture
+//! (`glEGLImageTargetTexture2DOES`), render it through a fullscreen-triangle shader into a plain
+//! immutable `GL_RGBA8` texture (de-tiling and swizzling to the BGRx the encoder wants), then
+//! register *that* texture with CUDA ([`cuda::RegisteredTexture`]) and copy it device-to-device into an
+//! owned [`DeviceBuffer`] so the dmabuf can be returned to the compositor immediately.
+
+#![allow(non_upper_case_globals)]
+
+use super::cuda::{self, DeviceBuffer};
+use anyhow::{bail, ensure, Context as _, Result};
+use khronos_egl as egl;
+use std::os::raw::{c_int, c_void};
+
+// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
+// Target/platform enums are typed `egl::Enum`; the attribute-list keys are typed `egl::Attrib`.
+const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
+const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
+const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
+const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
+const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
+const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
+const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
+const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;
+
+// GL enums used by the blit. `GL_LINEAR`/`GL_NEAREST` are `c_int` because they are passed as
+// the `param` argument of `glTexParameteri`; the rest are passed where the bindings take `u32`.
+const GL_TEXTURE_2D: u32 = 0x0DE1;
+const GL_TEXTURE_MIN_FILTER: u32 = 0x2801;
+const GL_TEXTURE_MAG_FILTER: u32 = 0x2800;
+const GL_LINEAR: c_int = 0x2601;
+const GL_NEAREST: c_int = 0x2600;
+const GL_RGBA8: u32 = 0x8058;
+const GL_FRAMEBUFFER: u32 = 0x8D40;
+const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0;
+const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5;
+const GL_TEXTURE0: u32 = 0x84C0;
+const GL_TRIANGLES: u32 = 0x0004;
+const GL_VERTEX_SHADER: u32 = 0x8B31;
+const GL_FRAGMENT_SHADER: u32 = 0x8B30;
+const GL_COMPILE_STATUS: u32 = 0x8B81;
+const GL_LINK_STATUS: u32 = 0x8B82;
+
+// libglvnd's libGL dispatches these to the NVIDIA driver based on the current EGL/GL context.
+// All of them therefore require a GL context to be current on the calling thread.
+#[link(name = "GL")]
+extern "C" {
+    // Textures.
+    fn glGenTextures(n: c_int, textures: *mut u32);
+    fn glBindTexture(target: u32, texture: u32);
+    fn glTexParameteri(target: u32, pname: u32, param: c_int);
+    fn glDeleteTextures(n: c_int, textures: *const u32);
+    fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int);
+    fn glGetError() -> u32;
+    // Framebuffer objects.
+    fn glGenFramebuffers(n: c_int, framebuffers: *mut u32);
+    fn glBindFramebuffer(target: u32, framebuffer: u32);
+    fn glFramebufferTexture2D(
+        target: u32,
+        attachment: u32,
+        textarget: u32,
+        texture: u32,
+        level: c_int,
+    );
+    fn glCheckFramebufferStatus(target: u32) -> u32;
+    // Draw state and submission.
+    fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int);
+    fn glGenVertexArrays(n: c_int, arrays: *mut u32);
+    fn glBindVertexArray(array: u32);
+    fn glDrawArrays(mode: u32, first: c_int, count: c_int);
+    fn glActiveTexture(texture: u32);
+    fn glUseProgram(program: u32);
+    fn glFlush();
+    // Shaders and programs.
+    fn glCreateShader(shader_type: u32) -> u32;
+    fn glShaderSource(shader: u32, count: c_int, string: *const *const i8, length: *const c_int);
+    fn glCompileShader(shader: u32);
+    fn glGetShaderiv(shader: u32, pname: u32, params: *mut c_int);
+    fn glDeleteShader(shader: u32);
+    fn glCreateProgram() -> u32;
+    fn glAttachShader(program: u32, shader: u32);
+    fn glLinkProgram(program: u32);
+    fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int);
+    fn glGetUniformLocation(program: u32, name: *const i8) -> c_int;
+    fn glUniform1i(location: c_int, v0: c_int);
+}
+
+// Minimal libgbm surface: just enough to create a GBM device on an open DRM fd and destroy it.
+// The device is handled as an opaque pointer.
+#[link(name = "gbm")]
+extern "C" {
+    fn gbm_create_device(fd: c_int) -> *mut c_void;
+    fn gbm_device_destroy(device: *mut c_void);
+}
+
+/// `glEGLImageTargetTexture2DOES(target, EGLImage)` — loaded via `eglGetProcAddress`.
+/// Arguments: the GL texture target and the `EGLImage` handle as a raw pointer.
+type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void);
+
+// Fullscreen-triangle blit: sample the dmabuf EGLImage texture and write it (swizzled to BGRA,
+// to match the BGRx the encoder expects) into a normal GL_RGBA8 texture that CUDA *can* register.
+// The vertex shader derives its three vertices from `gl_VertexID` alone, so no vertex buffers
+// are needed. The sources are byte strings without a trailing NUL; `compile_shader` passes an
+// explicit length.
+const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n";
+const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n";
+
+/// Compile one shader stage of type `kind` from `src` and return the GL shader name.
+/// On a compile failure the shader object is deleted before the error is returned.
+///
+/// # Safety
+/// A GL context must be current on the calling thread.
+unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
+    let shader = glCreateShader(kind);
+    if shader == 0 {
+        bail!("glCreateShader failed");
+    }
+    // The source is handed over as a single (pointer, length) pair, so no NUL terminator is needed.
+    let source_ptr = src.as_ptr() as *const i8;
+    let source_len = src.len() as c_int;
+    glShaderSource(shader, 1, &source_ptr, &source_len);
+    glCompileShader(shader);
+
+    let mut status: c_int = 0;
+    glGetShaderiv(shader, GL_COMPILE_STATUS, &mut status);
+    if status != 0 {
+        return Ok(shader);
+    }
+    glDeleteShader(shader);
+    bail!("GL shader compile failed")
+}
+
+/// Build the blit program from `VERT_SRC` + `FRAG_SRC`, bind its `image` sampler to texture
+/// unit 0, and return the GL program name. No program is left bound on return.
+///
+/// # Errors
+/// Fails if either stage fails to compile, the program object cannot be created, or linking
+/// fails. Shader objects are released on every path.
+///
+/// # Safety
+/// A GL context must be current on the calling thread.
+unsafe fn compile_program() -> Result<u32> {
+    let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?;
+    let fs = match compile_shader(GL_FRAGMENT_SHADER, FRAG_SRC) {
+        Ok(fs) => fs,
+        Err(e) => {
+            // Don't leak the already-compiled vertex shader when the fragment stage fails.
+            glDeleteShader(vs);
+            return Err(e);
+        }
+    };
+    let prog = glCreateProgram();
+    if prog == 0 {
+        glDeleteShader(vs);
+        glDeleteShader(fs);
+        bail!("glCreateProgram failed");
+    }
+    glAttachShader(prog, vs);
+    glAttachShader(prog, fs);
+    glLinkProgram(prog);
+    // The shader objects are no longer needed by us once linking has been attempted.
+    glDeleteShader(vs);
+    glDeleteShader(fs);
+    let mut ok: c_int = 0;
+    glGetProgramiv(prog, GL_LINK_STATUS, &mut ok);
+    // NOTE(review): on link failure the program object itself is not deleted — the bindings
+    // visible in this file do not declare `glDeleteProgram`.
+    ensure!(ok != 0, "GL program link failed");
+    glUseProgram(prog);
+    let loc = glGetUniformLocation(prog, c"image".as_ptr());
+    if loc >= 0 {
+        glUniform1i(loc, 0); // sampler -> texture unit 0
+    }
+    glUseProgram(0);
+    Ok(prog)
+}
+
+/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture.
+struct GlBlit {
+    /// Linked blit program (see `compile_program`).
+    program: u32,
+    /// Vertex array object bound around the draw call.
+    vao: u32,
+    /// Framebuffer object with `dst_tex` as its colour attachment.
+    fbo: u32,
+    /// CUDA-registrable destination (immutable GL_RGBA8).
+    dst_tex: u32,
+    /// Source texture re-targeted to each frame's EGLImage.
+    src_tex: u32,
+    /// Size in pixels of `dst_tex`, also used as the viewport.
+    width: u32,
+    height: u32,
+    /// `dst_tex` registered with CUDA once (not per frame); mapped+copied each frame.
+    registered: cuda::RegisteredTexture,
+    /// Recycled CUDA device buffers (the imported frames handed to the encoder).
+    pool: cuda::BufferPool,
+}
+
+impl GlBlit {
+    /// Create the program, VAO, FBO and both textures for a `width` x `height` blit, then
+    /// register the destination texture with CUDA and create the device-buffer pool.
+    ///
+    /// NOTE(review): the GL objects created here are not released on the error paths below —
+    /// confirm whether that matters for callers that retry.
+    ///
+    /// # Safety
+    /// The GL context and the shared CUDA context must both be current on this thread.
+    unsafe fn new(width: u32, height: u32) -> Result<GlBlit> {
+        let program = compile_program()?;
+        let mut vao = 0u32;
+        glGenVertexArrays(1, &mut vao); // core profile needs a bound VAO for glDrawArrays
+        let mut fbo = 0u32;
+        glGenFramebuffers(1, &mut fbo);
+
+        // Destination: single-level immutable RGBA8 storage, nearest filtering.
+        let mut dst_tex = 0u32;
+        glGenTextures(1, &mut dst_tex);
+        glBindTexture(GL_TEXTURE_2D, dst_tex);
+        glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+        // Source: no storage of its own; `run` points it at the frame's EGLImage.
+        let mut src_tex = 0u32;
+        glGenTextures(1, &mut src_tex);
+        glBindTexture(GL_TEXTURE_2D, src_tex);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+        glBindTexture(GL_TEXTURE_2D, 0);
+
+        // Attach the destination as the FBO's colour target and verify completeness.
+        glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+        glFramebufferTexture2D(
+            GL_FRAMEBUFFER,
+            GL_COLOR_ATTACHMENT0,
+            GL_TEXTURE_2D,
+            dst_tex,
+            0,
+        );
+        let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+        glBindFramebuffer(GL_FRAMEBUFFER, 0);
+        ensure!(
+            status == GL_FRAMEBUFFER_COMPLETE,
+            "blit FBO incomplete ({status:#x})"
+        );
+        // Register the (immutable, reused) destination texture with CUDA once, and stand up the
+        // device-buffer pool — both per-resolution, not per-frame. Requires the CUDA context to be
+        // current (the caller makes it current before constructing the blit).
+        let registered = cuda::RegisteredTexture::register_gl(dst_tex)?;
+        let pool = cuda::BufferPool::new(width, height)?;
+        Ok(GlBlit {
+            program,
+            vao,
+            fbo,
+            dst_tex,
+            src_tex,
+            width,
+            height,
+            registered,
+            pool,
+        })
+    }
+
+    /// Bind `image` to the source texture and render it into `dst_tex`.
+    ///
+    /// # Errors
+    /// Fails if `glEGLImageTargetTexture2DOES` leaves a GL error; the draw itself is not checked.
+    ///
+    /// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
+    unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
+        glBindTexture(GL_TEXTURE_2D, self.src_tex);
+        let _ = glGetError(); // discard any stale error so the check below is about this call
+        egl_image_target(GL_TEXTURE_2D, image);
+        let e = glGetError();
+        glBindTexture(GL_TEXTURE_2D, 0);
+        ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");
+
+        // Draw the three-vertex fullscreen triangle into the FBO, sampling `src_tex` from
+        // texture unit 0, then restore the default bindings.
+        glBindFramebuffer(GL_FRAMEBUFFER, self.fbo);
+        glViewport(0, 0, self.width as c_int, self.height as c_int);
+        glUseProgram(self.program);
+        glActiveTexture(GL_TEXTURE0);
+        glBindTexture(GL_TEXTURE_2D, self.src_tex);
+        glBindVertexArray(self.vao);
+        glDrawArrays(GL_TRIANGLES, 0, 3);
+        glBindVertexArray(0);
+        glBindFramebuffer(GL_FRAMEBUFFER, 0);
+        glFlush(); // submit GL work before CUDA maps the texture
+        Ok(())
+    }
+}
+
+/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
+#[derive(Clone, Copy, Debug)]
+pub struct DmabufPlane {
+    /// File descriptor of the dmabuf backing this plane.
+    pub fd: i32,
+    /// Byte offset of the plane's data within the dmabuf.
+    pub offset: u32,
+    /// Row stride (pitch) of the plane in bytes.
+    pub stride: u32,
+}
+
+/// The dynamically loaded EGL 1.5 entry-point table used throughout this module.
+type Egl = egl::DynamicInstance<egl::EGL1_5>;
+
+/// Headless EGLDisplay (GBM platform on the DRM render node) + a surfaceless desktop-GL context
+/// used to import dmabufs and bridge them to CUDA via a GL texture. Lives on the capture thread
+/// (the GL context is made current there once).
+pub struct EglImporter {
+    /// Loaded EGL entry points.
+    egl: Egl,
+    /// The display obtained for the GBM device.
+    display: egl::Display,
+    // NOTE(review): presumably the "no context" sentinel passed when creating EGLImages —
+    // confirm against the import code.
+    no_ctx: egl::Context,
+    /// Surfaceless GL context (current on the capture thread) for the EGLImage→texture bind.
+    _gl_ctx: egl::Context,
+    /// `glEGLImageTargetTexture2DOES`, resolved at construction time.
+    egl_image_target: EglImageTargetFn,
+    /// Lazily-created GL blit machinery (recreated if the frame size changes).
+    blit: Option<GlBlit>,
+    /// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA),
+    /// created lazily on the first LINEAR frame, + the destination pool.
+    vk: Option<super::vulkan::VkBridge>,
+    linear_pool: Option<cuda::BufferPool>,
+    /// GBM device created on `render_fd`; the native display behind `display`.
+    gbm: *mut c_void,
+    /// Open fd of the DRM render node the GBM device was created on.
+    render_fd: c_int,
+}
+
+// The EGL handles are confined to the capture thread; the struct is moved there once.
+unsafe impl Send for EglImporter {}
+
+impl EglImporter {
+    /// Open a headless EGLDisplay on the GBM device of the NVIDIA render node
+    /// (`/dev/dri/renderD128`), create the surfaceless GL context and make it current on the
+    /// calling thread. Also forces the shared CUDA context to exist (so a later `import` only
+    /// touches the hot path).
+    ///
+    /// # Errors
+    /// Fails if the render node cannot be opened, GBM/EGL initialization fails, a required EGL
+    /// extension is missing, or the CUDA context cannot be created. On every failure path the
+    /// render-node fd, the GBM device and (once initialized) the EGL display are released again.
+    pub fn new() -> Result<EglImporter> {
+        /// Owns the render-node fd and the GBM device until they are handed to the importer.
+        struct GbmHandles {
+            gbm: *mut c_void,
+            render_fd: c_int,
+        }
+        impl Drop for GbmHandles {
+            fn drop(&mut self) {
+                // SAFETY: `gbm` is either null or the device created on `render_fd`, and
+                // `render_fd` is an fd this function opened; both are released exactly once.
+                unsafe {
+                    if !self.gbm.is_null() {
+                        gbm_device_destroy(self.gbm);
+                    }
+                    libc::close(self.render_fd);
+                }
+            }
+        }
+
+        /// Unbinds the context and terminates the display if setup fails after `eglInitialize`.
+        struct DisplayGuard<'a> {
+            egl: &'a Egl,
+            display: egl::Display,
+        }
+        impl Drop for DisplayGuard<'_> {
+            fn drop(&mut self) {
+                // Best effort: we are already on an error path.
+                let _ = self.egl.make_current(self.display, None, None, None);
+                let _ = self.egl.terminate(self.display);
+            }
+        }
+
+        // GBM platform on the NVIDIA render node: this ties the EGLDisplay (and its GL contexts)
+        // to the same DRM device CUDA-GL interop associates with, which the EGL device platform
+        // did not (cuGraphicsGLRegisterImage rejected device-platform GL textures).
+        let path = std::ffi::CString::new("/dev/dri/renderD128")
+            .expect("static path contains no interior NUL");
+        // SAFETY: `path` is a valid NUL-terminated C string that outlives the call.
+        let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
+        ensure!(
+            render_fd >= 0,
+            "open /dev/dri/renderD128 for GBM: {}",
+            std::io::Error::last_os_error()
+        );
+        let mut handles = GbmHandles {
+            gbm: std::ptr::null_mut(),
+            render_fd,
+        };
+        // SAFETY: `render_fd` is the open DRM render node from above.
+        handles.gbm = unsafe { gbm_create_device(render_fd) };
+        if handles.gbm.is_null() {
+            anyhow::bail!("gbm_create_device failed");
+        }
+
+        let egl: Egl =
+            unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
+        let display = unsafe {
+            egl.get_platform_display(
+                EGL_PLATFORM_GBM_KHR,
+                handles.gbm as egl::NativeDisplayType,
+                &[egl::ATTRIB_NONE],
+            )
+        }
+        .context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
+        egl.initialize(display).context("eglInitialize")?;
+        // Declared after `handles` and `egl`, so on an early return the display is terminated
+        // before libEGL is unloaded and before the GBM device underneath it is destroyed.
+        let display_guard = DisplayGuard {
+            egl: &egl,
+            display,
+        };
+
+        let exts = egl
+            .query_string(Some(display), egl::EXTENSIONS)
+            .context("query EGL extensions")?
+            .to_string_lossy()
+            .into_owned();
+        ensure!(
+            exts.contains("EGL_EXT_image_dma_buf_import"),
+            "EGL lacks EGL_EXT_image_dma_buf_import"
+        );
+        ensure!(
+            exts.contains("EGL_EXT_image_dma_buf_import_modifiers"),
+            "EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
+        );
+
+        // A surfaceless desktop-GL context so we can bind the dmabuf EGLImage to a GL texture
+        // (cuGraphicsEGLRegisterImage is Tegra-only; desktop CUDA interop goes through GL).
+        egl.bind_api(egl::OPENGL_API)
+            .context("eglBindAPI(OpenGL)")?;
+        // The default EGL_SURFACE_TYPE in eglChooseConfig is WINDOW_BIT, which a headless
+        // display has none of — request a pbuffer-capable config (we run surfaceless anyway).
+        let config = egl
+            .choose_first_config(
+                display,
+                &[
+                    egl::SURFACE_TYPE,
+                    egl::PBUFFER_BIT,
+                    egl::RENDERABLE_TYPE,
+                    egl::OPENGL_BIT,
+                    egl::NONE,
+                ],
+            )
+            .context("eglChooseConfig")?
+            .context("no EGL config for OpenGL")?;
+        let gl_ctx = egl
+            .create_context(
+                display,
+                config,
+                None,
+                &[egl::CONTEXT_CLIENT_VERSION, 3, egl::NONE],
+            )
+            .context("eglCreateContext(OpenGL)")?;
+        egl.make_current(display, None, None, Some(gl_ctx))
+            .context("eglMakeCurrent surfaceless (needs EGL_KHR_surfaceless_context)")?;
+        // SAFETY: the symbol is looked up by its exact name; `EglImageTargetFn` is assumed to
+        // match the `glEGLImageTargetTexture2DOES` prototype (declared earlier in this file).
+        let egl_image_target: EglImageTargetFn = unsafe {
+            std::mem::transmute(
+                egl.get_proc_address("glEGLImageTargetTexture2DOES")
+                    .context("glEGLImageTargetTexture2DOES unavailable")?,
+            )
+        };
+
+        // Create the shared CUDA context up front so import() is pure hot path.
+        cuda::context().context("create CUDA context")?;
+
+        // SAFETY: EGL_NO_CONTEXT is a valid sentinel value for an `egl::Context`.
+        let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
+        tracing::info!(
+            "zero-copy EGL importer ready (GBM platform + GL texture interop, dma_buf_import + modifiers)"
+        );
+
+        // Nothing below can fail: disarm the guards and hand ownership to the importer, whose
+        // own `Drop` takes over the cleanup.
+        std::mem::forget(display_guard);
+        let gbm = handles.gbm;
+        std::mem::forget(handles);
+        Ok(EglImporter {
+            egl,
+            display,
+            no_ctx,
+            _gl_ctx: gl_ctx,
+            egl_image_target,
+            blit: None,
+            vk: None,
+            linear_pool: None,
+            gbm,
+            render_fd,
+        })
+    }
+
+    /// LINEAR-dmabuf import: bypasses EGL/GL entirely and goes through the Vulkan bridge
+    /// (NVIDIA's EGL can't sample LINEAR, and the CUDA driver rejects raw dmabuf fds; Vulkan
+    /// imports the dmabuf, GPU-copies into an exportable allocation, and CUDA reads that). See
+    /// [`super::vulkan`].
+    ///
+    /// The destination pool is rebuilt whenever `width`×`height` changes; the bridge itself is
+    /// created on first use.
+    pub fn import_linear(
+        &mut self,
+        plane: &DmabufPlane,
+        width: u32,
+        height: u32,
+    ) -> Result<DeviceBuffer> {
+        cuda::make_current()?;
+
+        let pool_fits = matches!(
+            &self.linear_pool,
+            Some(pool) if (pool.width(), pool.height()) == (width, height)
+        );
+        if !pool_fits {
+            let fresh_pool = cuda::BufferPool::new(width, height)?;
+            self.linear_pool = Some(fresh_pool);
+        }
+        if self.vk.is_none() {
+            let bridge = super::vulkan::VkBridge::new()?;
+            self.vk = Some(bridge);
+        }
+
+        // Both options were populated just above, so these unwraps cannot fail.
+        let pool = self.linear_pool.as_ref().unwrap();
+        let bridge = self.vk.as_mut().unwrap();
+        bridge.import_linear(plane.fd, plane.offset, plane.stride, height, pool)
+    }
+
+    /// Ask EGL (`eglQueryDmaBufModifiersEXT`) which DRM format modifiers it can import for
+    /// `fourcc`. We advertise these to PipeWire so the compositor allocates a dmabuf in a layout
+    /// we can import.
+    ///
+    /// Returns an empty list when the entry point is missing, a query fails, or no modifier is
+    /// reported (the caller falls back).
+    pub fn supported_modifiers(&self, fourcc: u32) -> Vec<u64> {
+        type QueryFn = unsafe extern "system" fn(
+            dpy: *mut c_void,
+            format: i32,
+            max_modifiers: i32,
+            modifiers: *mut u64,
+            external_only: *mut u32,
+            num_modifiers: *mut i32,
+        ) -> u32;
+
+        let query = match self.egl.get_proc_address("eglQueryDmaBufModifiersEXT") {
+            // SAFETY: the symbol was resolved by name and `QueryFn` mirrors its C prototype.
+            Some(sym) => unsafe { std::mem::transmute::<_, QueryFn>(sym) },
+            None => return Vec::new(),
+        };
+        let dpy = self.display.as_ptr();
+        let format = fourcc as i32;
+
+        // Pass 1: no output arrays, just ask how many modifiers exist.
+        let mut available: i32 = 0;
+        // SAFETY: null output arrays are paired with `max_modifiers == 0`; `available` is a
+        // valid out-pointer.
+        let counted = unsafe {
+            query(
+                dpy,
+                format,
+                0,
+                std::ptr::null_mut(),
+                std::ptr::null_mut(),
+                &mut available,
+            )
+        };
+        if counted == 0 || available <= 0 {
+            return Vec::new();
+        }
+
+        // Pass 2: fetch them into buffers of exactly that size.
+        let capacity = available as usize;
+        let mut modifiers = vec![0u64; capacity];
+        let mut external_only = vec![0u32; capacity];
+        let mut written: i32 = 0;
+        // SAFETY: both arrays hold `available` elements, matching `max_modifiers`.
+        let fetched = unsafe {
+            query(
+                dpy,
+                format,
+                available,
+                modifiers.as_mut_ptr(),
+                external_only.as_mut_ptr(),
+                &mut written,
+            )
+        };
+        if fetched == 0 {
+            return Vec::new();
+        }
+        modifiers.truncate(written.max(0) as usize);
+        modifiers
+    }
+
+    /// Wrap one dmabuf in an `EGLImage` and copy its contents device-to-device into an owned
+    /// CUDA buffer.
+    ///
+    /// `fourcc` is the DRM FourCC. `modifier` is the explicit 64-bit DRM format modifier when one
+    /// was negotiated; with `None` the image is created without modifier attributes (base
+    /// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers).
+    /// The `EGLImage` is destroyed again before returning, whether or not the copy succeeded.
+    pub fn import(
+        &mut self,
+        plane: &DmabufPlane,
+        width: u32,
+        height: u32,
+        fourcc: u32,
+        modifier: Option<u64>,
+    ) -> Result<DeviceBuffer> {
+        let base_attrs: [egl::Attrib; 12] = [
+            egl::WIDTH as egl::Attrib,
+            width as egl::Attrib,
+            egl::HEIGHT as egl::Attrib,
+            height as egl::Attrib,
+            EGL_LINUX_DRM_FOURCC_EXT,
+            fourcc as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_FD_EXT,
+            plane.fd as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
+            plane.offset as egl::Attrib,
+            EGL_DMA_BUF_PLANE0_PITCH_EXT,
+            plane.stride as egl::Attrib,
+        ];
+        let mut attrs: Vec<egl::Attrib> = base_attrs.to_vec();
+        if let Some(bits) = modifier {
+            // The 64-bit modifier travels as two 32-bit halves.
+            let low = (bits & 0xFFFF_FFFF) as egl::Attrib;
+            let high = (bits >> 32) as egl::Attrib;
+            attrs.push(EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT);
+            attrs.push(low);
+            attrs.push(EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT);
+            attrs.push(high);
+        }
+        attrs.push(egl::ATTRIB_NONE);
+
+        // EGL_LINUX_DMA_BUF_EXT takes a null client buffer; the attributes carry everything.
+        let null_buffer = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
+        let created = self.egl.create_image(
+            self.display,
+            self.no_ctx,
+            EGL_LINUX_DMA_BUF_EXT,
+            null_buffer,
+            &attrs,
+        );
+        let image = created.context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;
+
+        // EGLImage → (sampled by a shader) → GL_RGBA8 texture → register *that* with CUDA → map
+        // → array → copy out. Registering the EGLImage texture directly fails (its layout isn't a
+        // CUDA-registrable format); the RGBA8 render target is.
+        let copied = self.blit_and_copy(image.as_ptr(), width, height);
+        // Best effort: the copy result matters more than a failed destroy.
+        let _ = self.egl.destroy_image(self.display, image);
+        copied
+    }
+
+    /// Draw the dmabuf-backed `image` into the CUDA-registrable RGBA8 texture, then copy that
+    /// texture into an owned CUDA buffer taken from the blit's pool. The per-size GL blit
+    /// machinery is built on first use and rebuilt whenever `width`×`height` changes.
+    fn blit_and_copy(
+        &mut self,
+        image: *mut c_void,
+        width: u32,
+        height: u32,
+    ) -> Result<DeviceBuffer> {
+        cuda::make_current()?;
+
+        let size_matches = matches!(
+            &self.blit,
+            Some(existing) if (existing.width, existing.height) == (width, height)
+        );
+        if !size_matches {
+            let rebuilt = unsafe { GlBlit::new(width, height)? };
+            self.blit = Some(rebuilt);
+        }
+
+        let bind_image = self.egl_image_target;
+        // Populated just above if it was missing or had the wrong size.
+        let blit = self.blit.as_mut().unwrap();
+        // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
+        unsafe { blit.run(bind_image, image)? };
+
+        // Persistent registration (mapped per frame) + a pooled buffer — no per-frame
+        // cuGraphicsGLRegisterImage / cuMemAllocPitch.
+        let target = blit.pool.get()?;
+        blit.registered.copy_mapped_to(&target)?;
+        Ok(target)
+    }
+}
+
+/// Tears the importer down in dependency order: GL/CUDA/Vulkan helpers first, then the EGL
+/// context and display, and only then the GBM device and the render-node fd they sit on.
+impl Drop for EglImporter {
+    fn drop(&mut self) {
+        // Struct fields are only dropped *after* this body returns, so release everything that
+        // may still reference the GL context or the device while both are alive.
+        self.blit = None;
+        self.linear_pool = None;
+        self.vk = None;
+
+        // Best effort: unbind and destroy the surfaceless context, then shut the display down.
+        // Errors are ignored because there is nothing useful left to do with them here.
+        let _ = self.egl.make_current(self.display, None, None, None);
+        let _ = self.egl.destroy_context(self.display, self._gl_ctx);
+        let _ = self.egl.terminate(self.display);
+
+        if !self.gbm.is_null() {
+            // SAFETY: `gbm` was created by `gbm_create_device` in `new` and is destroyed once.
+            unsafe { gbm_device_destroy(self.gbm) };
+        }
+        if self.render_fd >= 0 {
+            // SAFETY: `render_fd` was opened in `new` and is owned exclusively by this struct.
+            unsafe { libc::close(self.render_fd) };
+        }
+    }
+}
@@ -0,0 +1,50 @@
+//! Zero-copy capture→encode (plan §9): the PipeWire dmabuf is imported into CUDA via EGL and
+//! handed straight to NVENC, eliminating the per-frame CPU copies (at 5K the CPU-copy path
+//! moves ~3.5 GB/s). Opt in with `PUNKTFUNK_ZEROCOPY=1`; the CPU-copy path stays the default and
+//! the runtime fallback (foreign-allocator / no-dmabuf / import failure).
+//!
+//! Pieces: [`cuda`] (driver-API FFI + the shared `CUcontext` + device buffers), [`egl`] (the
+//! headless EGLDisplay + dmabuf→`EGLImage`→CUDA import), [`vulkan`] (the bridge for LINEAR
+//! dmabufs, which the EGL path cannot sample). The encoder's CUDA-frame path lives in
+//! `encode/linux.rs`; the dmabuf negotiation lives in `capture/linux.rs`.
+
+pub mod cuda;
+pub mod egl;
+pub mod vulkan;
+
+pub use cuda::DeviceBuffer;
+pub use egl::{DmabufPlane, EglImporter};
+
+/// Reports whether the zero-copy path was opted into via `PUNKTFUNK_ZEROCOPY`.
+///
+/// After trimming surrounding whitespace the value must be exactly `1`, `true`, `yes` or `on`
+/// (case-sensitive). An unset or non-UTF-8 variable counts as disabled.
+pub fn enabled() -> bool {
+    match std::env::var("PUNKTFUNK_ZEROCOPY") {
+        Ok(raw) => ["1", "true", "yes", "on"].contains(&raw.trim()),
+        Err(_) => false,
+    }
+}
+
+/// Packs a four-character format name into its DRM FourCC code: the first character ends up in
+/// the least-significant byte (little-endian), e.g. `b"XR24"` → `0x3432_5258`.
+const fn fourcc(c: &[u8; 4]) -> u32 {
+    u32::from_le_bytes(*c)
+}
+
+/// Translate a SPA/our [`crate::capture::PixelFormat`] into the DRM FourCC EGL expects for
+/// import. SPA names list bytes in memory order while DRM names the packed little-endian word,
+/// so SPA `BGRx` ⇒ DRM `XRGB8888` (memory B,G,R,X), and so on.
+///
+/// Returns `None` for formats that have no dmabuf import here (the caller uses the CPU path).
+pub fn drm_fourcc(format: crate::capture::PixelFormat) -> Option<u32> {
+    use crate::capture::PixelFormat::*;
+    let code: &[u8; 4] = match format {
+        Bgrx => b"XR24", // DRM_FORMAT_XRGB8888
+        Bgra => b"AR24", // DRM_FORMAT_ARGB8888
+        Rgbx => b"XB24", // DRM_FORMAT_XBGR8888
+        Rgba => b"AB24", // DRM_FORMAT_ABGR8888
+        // 24-bit packed RGB/BGR have no straightforward dmabuf import here; use the CPU path.
+        Rgb | Bgr => return None,
+    };
+    Some(fourcc(code))
+}
+
+/// Standalone probe (the `zerocopy-probe` subcommand): bring up the EGL importer and the CUDA
+/// context, log the result, and tear the importer down again. De-risks the FFI/linking/GPU-access
+/// without needing a capture session.
+pub fn probe() -> anyhow::Result<()> {
+    let importer = EglImporter::new()?;
+    let cuda_ctx = cuda::context()?;
+    tracing::info!(cuda_ctx = ?cuda_ctx, "zero-copy probe OK — EGL display + CUDA context initialized");
+    // Keep the importer alive until after the report, as before.
+    drop(importer);
+    Ok(())
+}
@@ -0,0 +1,366 @@
+//! Vulkan bridge for LINEAR dmabufs (gamescope's only offer), completing zero-copy where the
+//! other interops can't: NVIDIA's EGL won't sample LINEAR, and the CUDA driver rejects raw
+//! dmabuf fds as external memory. Vulkan *does* import dmabufs (`VK_EXT_external_memory_dma_buf`)
+//! and *does* export `OPAQUE_FD` memory that CUDA officially imports. So:
+//!
+//! ```text
+//!   dmabuf fd ──VkImportMemoryFdInfoKHR(DMA_BUF)──▶ VkBuffer (cached per fd)
+//!        │ vkCmdCopyBuffer (GPU, device-local)
+//!        ▼
+//!   exportable VkBuffer ──vkGetMemoryFdKHR(OPAQUE_FD)──▶ cuImportExternalMemory ──▶ CUdeviceptr
+//! ```
+//!
+//! The exportable buffer + its CUDA mapping are created once per resolution; per frame it's one
+//! GPU buffer copy (fence-waited) and one pitched CUDA copy into the encoder's pooled buffer.
+//! No CPU ever touches pixels. Imports are cached per fd (PipeWire's buffer pool is stable for
+//! a stream's life). Falls back cleanly: any init/import error disables the importer and the
+//! CPU mmap path takes over.
+
+use super::cuda::{self, DeviceBuffer};
+use anyhow::{anyhow, bail, Context as _, Result};
+use ash::vk;
+use std::collections::HashMap;
+
+/// Vulkan objects for one imported source dmabuf (cached per fd).
+struct SrcBuf {
+    /// `TRANSFER_SRC` buffer bound to the imported memory at offset 0.
+    buffer: vk::Buffer,
+    /// Device memory imported from a `dup` of the dmabuf fd.
+    memory: vk::DeviceMemory,
+    /// Size the buffer was created with (the dmabuf size reported by `lseek` at import time).
+    size: u64,
+}
+
+/// The per-resolution destination: exportable Vulkan memory mapped into CUDA.
+struct DstBuf {
+    /// `TRANSFER_DST` buffer the per-frame GPU copy writes into.
+    buffer: vk::Buffer,
+    /// Dedicated, device-local allocation exported as `OPAQUE_FD`.
+    memory: vk::DeviceMemory,
+    /// Size of the allocation (the buffer's memory-requirement size, which may exceed the size
+    /// originally requested).
+    size: u64,
+    /// CUDA's view of the same memory (owns the exported OPAQUE_FD).
+    cuda: cuda::ExternalDmabuf,
+}
+
+/// Vulkan state for bridging LINEAR dmabufs to CUDA: one device and queue, a single reusable
+/// command buffer with its fence, the cache of imported source dmabufs and the exportable
+/// destination buffer. All Vulkan objects are destroyed in `Drop`.
+pub struct VkBridge {
+    /// Keeps the loaded Vulkan library alive for as long as the instance/device exist.
+    _entry: ash::Entry,
+    instance: ash::Instance,
+    device: ash::Device,
+    /// `VK_KHR_external_memory_fd` device-level entry points (fd import/export).
+    ext_fd: ash::khr::external_memory_fd::Device,
+    /// Queue 0 of the family chosen in `new`.
+    queue: vk::Queue,
+    cmd_pool: vk::CommandPool,
+    /// The single primary command buffer, re-recorded for every frame.
+    cmd: vk::CommandBuffer,
+    /// Signalled by each frame's submit; waited on and reset before `import_linear` returns.
+    fence: vk::Fence,
+    /// Memory types of the physical device, used by `memory_type`.
+    mem_props: vk::PhysicalDeviceMemoryProperties,
+    /// Imported source buffers, keyed by the caller's dmabuf fd number.
+    src_cache: HashMap<i32, SrcBuf>,
+    /// The exportable staging buffer, created on first use and replaced when a larger one is
+    /// needed.
+    dst: Option<DstBuf>,
+}
+
+// SAFETY: confined to the capture thread; moved there once.
+// NOTE(review): this is a usage contract, not something the type enforces — confirm that no
+// caller touches the bridge from two threads.
+unsafe impl Send for VkBridge {}
+
+impl VkBridge {
+    /// Bring up Vulkan on the NVIDIA GPU with the external-memory extensions.
+    ///
+    /// Creates an instance, picks the first physical device with vendor id `0x10DE`, creates a
+    /// device with one queue from a transfer-capable family, and allocates the command pool,
+    /// command buffer and fence used for the per-frame copy.
+    ///
+    /// # Errors
+    /// Fails if libvulkan cannot be loaded, no NVIDIA device or suitable queue family exists, or
+    /// any Vulkan object cannot be created. Everything created up to the failure is destroyed
+    /// again before the error is returned.
+    pub fn new() -> Result<VkBridge> {
+        // SAFETY: plain Vulkan bring-up; every handle passed to a call below was created from
+        // the same instance/device earlier in this function.
+        unsafe {
+            let entry = ash::Entry::load().context("load libvulkan")?;
+            let app = vk::ApplicationInfo::default().api_version(vk::API_VERSION_1_1);
+            let instance = entry
+                .create_instance(
+                    &vk::InstanceCreateInfo::default().application_info(&app),
+                    None,
+                )
+                .context("vkCreateInstance")?;
+
+            // Until the device exists the instance has no owner, so run this stage in a closure
+            // and destroy the instance by hand if any step fails.
+            type DeviceStage = (ash::Device, u32, vk::PhysicalDeviceMemoryProperties);
+            let device_stage = (|| -> Result<DeviceStage> {
+                // Pick the NVIDIA GPU (matches CUDA device 0 on this single-dGPU host).
+                let phys = instance
+                    .enumerate_physical_devices()
+                    .context("enumerate GPUs")?
+                    .into_iter()
+                    .find(|&p| instance.get_physical_device_properties(p).vendor_id == 0x10DE)
+                    .ok_or_else(|| anyhow!("no NVIDIA Vulkan device"))?;
+                let mem_props = instance.get_physical_device_memory_properties(phys);
+
+                // Any queue family supporting transfer (graphics/compute imply it).
+                let qf = instance
+                    .get_physical_device_queue_family_properties(phys)
+                    .iter()
+                    .position(|q| {
+                        q.queue_flags.intersects(
+                            vk::QueueFlags::TRANSFER
+                                | vk::QueueFlags::GRAPHICS
+                                | vk::QueueFlags::COMPUTE,
+                        )
+                    })
+                    .ok_or_else(|| anyhow!("no transfer-capable queue family"))?
+                    as u32;
+
+                let exts = [
+                    ash::khr::external_memory_fd::NAME.as_ptr(),
+                    ash::ext::external_memory_dma_buf::NAME.as_ptr(),
+                ];
+                let prio = [1.0f32];
+                let qci = [vk::DeviceQueueCreateInfo::default()
+                    .queue_family_index(qf)
+                    .queue_priorities(&prio)];
+                let device = instance
+                    .create_device(
+                        phys,
+                        &vk::DeviceCreateInfo::default()
+                            .queue_create_infos(&qci)
+                            .enabled_extension_names(&exts),
+                        None,
+                    )
+                    .context("vkCreateDevice (external-memory extensions supported?)")?;
+                Ok((device, qf, mem_props))
+            })();
+            let (device, qf, mem_props) = match device_stage {
+                Ok(parts) => parts,
+                Err(err) => {
+                    instance.destroy_instance(None);
+                    return Err(err);
+                }
+            };
+            let ext_fd = ash::khr::external_memory_fd::Device::new(&instance, &device);
+            let queue = device.get_device_queue(qf, 0);
+
+            // From here on the bridge owns instance + device. If a later step fails, dropping
+            // `bridge` runs `Drop for VkBridge`, which destroys whatever exists so far
+            // (destroying a null fence / command pool is a no-op in Vulkan).
+            let mut bridge = VkBridge {
+                _entry: entry,
+                instance,
+                device,
+                ext_fd,
+                queue,
+                cmd_pool: vk::CommandPool::null(),
+                cmd: vk::CommandBuffer::null(),
+                fence: vk::Fence::null(),
+                mem_props,
+                src_cache: HashMap::new(),
+                dst: None,
+            };
+            bridge.cmd_pool = bridge
+                .device
+                .create_command_pool(
+                    &vk::CommandPoolCreateInfo::default()
+                        .queue_family_index(qf)
+                        .flags(vk::CommandPoolCreateFlags::RESET_COMMAND_BUFFER),
+                    None,
+                )
+                .context("create command pool")?;
+            bridge.cmd = bridge
+                .device
+                .allocate_command_buffers(
+                    &vk::CommandBufferAllocateInfo::default()
+                        .command_pool(bridge.cmd_pool)
+                        .level(vk::CommandBufferLevel::PRIMARY)
+                        .command_buffer_count(1),
+                )
+                .context("allocate command buffer")?[0];
+            bridge.fence = bridge
+                .device
+                .create_fence(&vk::FenceCreateInfo::default(), None)
+                .context("create fence")?;
+
+            tracing::info!("Vulkan bridge ready (dmabuf import → OPAQUE_FD export → CUDA)");
+            Ok(bridge)
+        }
+    }
+
+    /// Find the lowest memory-type index that is allowed by `type_bits` (bit `i` set ⇒ type `i`
+    /// permitted) and whose property flags include all of `flags`.
+    ///
+    /// # Errors
+    /// Returns an error when no memory type of the device satisfies both conditions.
+    fn memory_type(&self, type_bits: u32, flags: vk::MemoryPropertyFlags) -> Result<u32> {
+        for index in 0..self.mem_props.memory_type_count {
+            if type_bits & (1 << index) == 0 {
+                continue;
+            }
+            let props = self.mem_props.memory_types[index as usize].property_flags;
+            if props.contains(flags) {
+                return Ok(index);
+            }
+        }
+        Err(anyhow!("no compatible Vulkan memory type"))
+    }
+
+    /// Import `fd` (dup'd internally; Vulkan owns the dup) as a transfer-src buffer of `size`
+    /// bytes and cache it under `fd`. An entry already cached for `fd` is destroyed and replaced.
+    ///
+    /// # Errors
+    /// Fails if the fd cannot be duplicated or any Vulkan call fails. On failure nothing is
+    /// leaked: the duplicate is closed unless Vulkan already took it, and the buffer/memory
+    /// created so far are destroyed.
+    ///
+    /// # Safety
+    /// `fd` must be a valid dmabuf descriptor of at least `size` bytes, and the GPU must not be
+    /// using a buffer previously cached for the same `fd`.
+    unsafe fn import_src(&mut self, fd: i32, size: u64) -> Result<()> {
+        let dup = libc::dup(fd);
+        if dup < 0 {
+            bail!("dup(dmabuf fd): {}", std::io::Error::last_os_error());
+        }
+        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT);
+        let created = self.device.create_buffer(
+            &vk::BufferCreateInfo::default()
+                .size(size)
+                .usage(vk::BufferUsageFlags::TRANSFER_SRC)
+                .push_next(&mut ext_info),
+            None,
+        );
+        let buffer = match created {
+            Ok(buffer) => buffer,
+            Err(err) => {
+                libc::close(dup);
+                return Err(anyhow::Error::new(err).context("create import buffer"));
+            }
+        };
+
+        // Everything that can fail while `buffer` (and possibly `dup`) still needs cleanup runs
+        // inside this closure; the single error arm below releases what is left.
+        let mut fd_consumed = false;
+        let imported = (|| -> Result<vk::DeviceMemory> {
+            let mut fd_props = vk::MemoryFdPropertiesKHR::default();
+            self.ext_fd
+                .get_memory_fd_properties(
+                    vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT,
+                    dup,
+                    &mut fd_props,
+                )
+                .context("vkGetMemoryFdPropertiesKHR")?;
+            let reqs = self.device.get_buffer_memory_requirements(buffer);
+            // The type must be acceptable to both the buffer and the imported fd.
+            let mem_type = self.memory_type(
+                reqs.memory_type_bits & fd_props.memory_type_bits,
+                vk::MemoryPropertyFlags::empty(),
+            )?;
+            let mut import = vk::ImportMemoryFdInfoKHR::default()
+                .handle_type(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT)
+                .fd(dup); // Vulkan takes ownership of `dup` on success
+            let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
+            let memory = self
+                .device
+                .allocate_memory(
+                    &vk::MemoryAllocateInfo::default()
+                        .allocation_size(reqs.size.max(size))
+                        .memory_type_index(mem_type)
+                        .push_next(&mut import)
+                        .push_next(&mut dedicated),
+                    None,
+                )
+                .map_err(|e| anyhow!("import dmabuf memory: {e}"))?;
+            // A failed import does not consume the fd; a successful one does.
+            fd_consumed = true;
+            if let Err(err) = self.device.bind_buffer_memory(buffer, memory, 0) {
+                self.device.free_memory(memory, None);
+                return Err(anyhow::Error::new(err).context("bind import memory"));
+            }
+            Ok(memory)
+        })();
+        let memory = match imported {
+            Ok(memory) => memory,
+            Err(err) => {
+                if !fd_consumed {
+                    libc::close(dup);
+                }
+                self.device.destroy_buffer(buffer, None);
+                return Err(err);
+            }
+        };
+
+        let replaced = self.src_cache.insert(
+            fd,
+            SrcBuf {
+                buffer,
+                memory,
+                size,
+            },
+        );
+        if let Some(old) = replaced {
+            // Do not leak the objects of an entry that was cached under the same fd number.
+            self.device.destroy_buffer(old.buffer, None);
+            self.device.free_memory(old.memory, None);
+        }
+        Ok(())
+    }
+
+    /// (Re)create the exportable destination of at least `size` bytes + its CUDA mapping.
+    ///
+    /// Returns immediately when the current destination is already large enough. Otherwise the
+    /// old one is destroyed and a new dedicated, device-local, `OPAQUE_FD`-exportable allocation
+    /// is created and imported into CUDA.
+    ///
+    /// # Errors
+    /// Fails if any Vulkan call or the CUDA import fails; the buffer and memory created so far
+    /// are destroyed and `self.dst` is left as `None`.
+    ///
+    /// # Safety
+    /// The GPU must not be using the current destination buffer when it has to be replaced.
+    unsafe fn ensure_dst(&mut self, size: u64) -> Result<()> {
+        if self.dst.as_ref().is_some_and(|d| d.size >= size) {
+            return Ok(());
+        }
+        if let Some(old) = self.dst.take() {
+            self.device.destroy_buffer(old.buffer, None);
+            self.device.free_memory(old.memory, None);
+            // old.cuda drops its mapping with it
+        }
+        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
+        let buffer = self
+            .device
+            .create_buffer(
+                &vk::BufferCreateInfo::default()
+                    .size(size)
+                    .usage(vk::BufferUsageFlags::TRANSFER_DST)
+                    .push_next(&mut ext_info),
+                None,
+            )
+            .context("create export buffer")?;
+        let reqs = self.device.get_buffer_memory_requirements(buffer);
+
+        // Stage 1: allocate the exportable memory. Only `buffer` needs cleanup on failure.
+        let allocated = (|| -> Result<vk::DeviceMemory> {
+            let mem_type =
+                self.memory_type(reqs.memory_type_bits, vk::MemoryPropertyFlags::DEVICE_LOCAL)?;
+            let mut export = vk::ExportMemoryAllocateInfo::default()
+                .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
+            let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
+            self.device
+                .allocate_memory(
+                    &vk::MemoryAllocateInfo::default()
+                        .allocation_size(reqs.size)
+                        .memory_type_index(mem_type)
+                        .push_next(&mut export)
+                        .push_next(&mut dedicated),
+                    None,
+                )
+                .context("allocate exportable memory")
+        })();
+        let memory = match allocated {
+            Ok(memory) => memory,
+            Err(err) => {
+                self.device.destroy_buffer(buffer, None);
+                return Err(err);
+            }
+        };
+
+        // Stage 2: bind, export and hand the fd to CUDA. Both objects need cleanup on failure.
+        let mapped = (|| -> Result<cuda::ExternalDmabuf> {
+            self.device
+                .bind_buffer_memory(buffer, memory, 0)
+                .context("bind export memory")?;
+            let opaque_fd = self
+                .ext_fd
+                .get_memory_fd(
+                    &vk::MemoryGetFdInfoKHR::default()
+                        .memory(memory)
+                        .handle_type(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD),
+                )
+                .context("vkGetMemoryFdKHR")?;
+            // CUDA imports (and on success owns) the exported fd. Size must match the allocation.
+            // NOTE(review): whether `import_owned_fd` closes the fd when it fails is not visible
+            // from here, so it is deliberately not closed again on the error path.
+            cuda::ExternalDmabuf::import_owned_fd(opaque_fd, reqs.size)
+                .context("cuImportExternalMemory(OPAQUE_FD from Vulkan)")
+        })();
+        let cuda = match mapped {
+            Ok(cuda) => cuda,
+            Err(err) => {
+                self.device.destroy_buffer(buffer, None);
+                self.device.free_memory(memory, None);
+                return Err(err);
+            }
+        };
+
+        tracing::info!(size, "Vulkan→CUDA exportable staging buffer ready");
+        self.dst = Some(DstBuf {
+            buffer,
+            memory,
+            size: reqs.size,
+            cuda,
+        });
+        Ok(())
+    }
+
+    /// Bridge one LINEAR dmabuf frame into a pooled CUDA buffer: GPU copy dmabuf→exportable,
+    /// then pitched CUDA copy exportable→`pool` buffer.
+    ///
+    /// `offset` and `stride` are in bytes; the frame occupies `offset + stride * height` bytes of
+    /// the dmabuf. The Vulkan import of `fd` is cached under the fd number. A cached import that
+    /// is too small for the requested frame is discarded and re-imported rather than silently
+    /// copying a truncated frame.
+    ///
+    /// # Errors
+    /// Fails if the dmabuf is smaller than the frame span, or if any Vulkan/CUDA step fails.
+    pub fn import_linear(
+        &mut self,
+        fd: i32,
+        offset: u32,
+        stride: u32,
+        height: u32,
+        pool: &cuda::BufferPool,
+    ) -> Result<DeviceBuffer> {
+        // SAFETY: all handles used below belong to `self.device`; every submitted copy is
+        // fence-waited before this function returns successfully, so no cached buffer is in use
+        // by the GPU when it is evicted here.
+        unsafe {
+            let span = offset as u64 + stride as u64 * height as u64;
+
+            // A cache hit is only usable if the imported buffer covers the whole frame. A
+            // smaller entry means the fd number now refers to a different (or resized) dmabuf.
+            // NOTE(review): a reused fd number whose new dmabuf is at least as large is not
+            // detected here — confirm the caller's buffer-pool lifetime guarantees.
+            let stale = self.src_cache.get(&fd).is_some_and(|s| s.size < span);
+            if stale {
+                if let Some(old) = self.src_cache.remove(&fd) {
+                    self.device.destroy_buffer(old.buffer, None);
+                    self.device.free_memory(old.memory, None);
+                }
+            }
+            if !self.src_cache.contains_key(&fd) {
+                let size = libc::lseek(fd, 0, libc::SEEK_END);
+                anyhow::ensure!(size > 0, "lseek(dmabuf)");
+                anyhow::ensure!(size as u64 >= span, "dmabuf smaller than frame span");
+                self.import_src(fd, size as u64)?;
+            }
+            let src_buffer = self.src_cache[&fd].buffer;
+
+            // The source covers at least `span` bytes (checked above), so copy exactly the
+            // frame; the CUDA de-stride below then never reads past what was copied.
+            let copy_size = span;
+            self.ensure_dst(copy_size)?;
+            let dst = self.dst.as_ref().unwrap();
+
+            // Record + submit the GPU copy, wait on the fence (GPU-GPU, sub-millisecond).
+            self.device
+                .begin_command_buffer(
+                    self.cmd,
+                    &vk::CommandBufferBeginInfo::default()
+                        .flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT),
+                )
+                .context("begin cmd")?;
+            let region = vk::BufferCopy::default().size(copy_size);
+            self.device
+                .cmd_copy_buffer(self.cmd, src_buffer, dst.buffer, &[region]);
+            self.device
+                .end_command_buffer(self.cmd)
+                .context("end cmd")?;
+            let cmds = [self.cmd];
+            let submit = vk::SubmitInfo::default().command_buffers(&cmds);
+            self.device
+                .queue_submit(self.queue, &[submit], self.fence)
+                .context("queue submit")?;
+            // Timeout is in nanoseconds: wait at most one second for the copy.
+            self.device
+                .wait_for_fences(&[self.fence], true, 1_000_000_000)
+                .context("fence wait")?;
+            self.device
+                .reset_fences(&[self.fence])
+                .context("reset fence")?;
+
+            // De-stride from the CUDA view of the exportable memory into a pooled buffer.
+            cuda::make_current()?;
+            let out = pool.get()?;
+            cuda::copy_pitched_to_buffer(dst.cuda.ptr + offset as u64, stride as usize, &out)?;
+            Ok(out)
+        }
+    }
+}
+
+/// Waits for the device to go idle, then destroys every Vulkan object the bridge created:
+/// cached source buffers, the staging destination, fence, command pool, device and instance.
+impl Drop for VkBridge {
+    fn drop(&mut self) {
+        // SAFETY: all handles were created from `self.device` / `self.instance` and are
+        // destroyed exactly once, children before their parents.
+        unsafe {
+            // Best effort: a failed wait cannot be handled meaningfully during teardown.
+            let _ = self.device.device_wait_idle();
+
+            let cached_sources = std::mem::take(&mut self.src_cache);
+            for src in cached_sources.into_values() {
+                self.device.destroy_buffer(src.buffer, None);
+                self.device.free_memory(src.memory, None);
+            }
+            match self.dst.take() {
+                Some(staging) => {
+                    self.device.destroy_buffer(staging.buffer, None);
+                    self.device.free_memory(staging.memory, None);
+                    // `staging.cuda` is dropped at the end of this arm.
+                }
+                None => {}
+            }
+
+            self.device.destroy_fence(self.fence, None);
+            self.device.destroy_command_pool(self.cmd_pool, None);
+            self.device.destroy_device(None);
+            self.instance.destroy_instance(None);
+        }
+    }
+}