From 826da9968eaa55a3d410069b7f7d1df1d3ef8797 Mon Sep 17 00:00:00 2001
From: enricobuehler <buehler@unom.io>
Date: Tue, 9 Jun 2026 23:18:38 +0000
Subject: [PATCH] =?UTF-8?q?feat:=20M2=20=E2=80=94=20Vulkan=20bridge:=20TRU?=
 =?UTF-8?q?E=20zero-copy=20for=20gamescope's=20LINEAR=20dmabufs=20(Phase?=
 =?UTF-8?q?=203)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The missing zero-copy path is closed. NVIDIA's EGL won't sample LINEAR and the CUDA driver
rejects raw dmabuf fds — but Vulkan imports dmabufs (VK_EXT_external_memory_dma_buf) and
exports OPAQUE_FD memory that CUDA officially imports. zerocopy/vulkan.rs (ash):

  dmabuf fd → VkBuffer (import cached per fd) → vkCmdCopyBuffer (GPU) →
  exportable VkBuffer → vkGetMemoryFdKHR(OPAQUE_FD) → cuImportExternalMemory → CUdeviceptr

The exportable buffer + CUDA mapping are per-resolution; per frame it's one GPU buffer copy
(fence-waited) + one pitched CUDA copy into the encoder's pool. No CPU touches pixels.
EglImporter::import_linear now routes through the bridge (lazy init; any failure still falls
back to the CPU mmap path). cuda::ExternalDmabuf gained import_owned_fd for the
Vulkan-exported fd.

Validated live: gamescope 720p120 → "Vulkan→CUDA exportable staging buffer ready
size=3686400" (exactly 1280*720*4), full-rate 122.7 fps, decoded frame pixel-correct
(vkcube). KWin's tiled EGL path regression-tested intact. NV12 negotiation dropped — moot
now that BGRx is fully zero-copy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                               |  10 +
 crates/lumen-host/Cargo.toml             |   4 +
 crates/lumen-host/src/zerocopy/cuda.rs   |   6 +
 crates/lumen-host/src/zerocopy/egl.rs    |  37 ++-
 crates/lumen-host/src/zerocopy/mod.rs    |   1 +
 crates/lumen-host/src/zerocopy/vulkan.rs | 366 +++++++++++++++++++++++
 6 files changed, 404 insertions(+), 20 deletions(-)
 create mode 100644 crates/lumen-host/src/zerocopy/vulkan.rs
diff --git a/Cargo.lock b/Cargo.lock
index 40cee1c..1cbb471 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -121,6 +121,15 @@ dependencies = [
  "rustversion",
 ]
 
+[[package]]
+name = "ash"
+version = "0.38.0+1.3.281"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bb44936d800fea8f016d7f2311c6a4f97aebd5dc86f09906139ec848cf3a46f"
+dependencies = [
+ "libloading",
+]
+
 [[package]]
 name = "ashpd"
 version = "0.13.11"
@@ -1486,6 +1495,7 @@ dependencies = [
  "aes",
  "aes-gcm",
  "anyhow",
+ "ash",
  "ashpd",
  "axum",
  "axum-server",
diff --git a/crates/lumen-host/Cargo.toml b/crates/lumen-host/Cargo.toml
index beb0954..2dc6002 100644
--- a/crates/lumen-host/Cargo.toml
+++ b/crates/lumen-host/Cargo.toml
@@ -70,3 +70,7 @@ futures-util = "0.3"
 # eglCreateImage + the dma_buf import; the CUDA driver API (EGL interop) and libgbm are linked
 # via hand-rolled FFI in `src/zerocopy/` (no Rust crate exposes the EGL-interop driver calls).
 khronos-egl = { version = "6", features = ["dynamic"] }
+# Vulkan bridge for LINEAR dmabufs (gamescope): import via VK_EXT_external_memory_dma_buf,
+# GPU-copy into an exportable allocation, export OPAQUE_FD → cuImportExternalMemory (the
+# officially-supported CUDA pairing; raw dmabuf fds are rejected by the desktop driver).
+ash = "0.38"
diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs
index c4edcb0..a9b7ebe 100644
--- a/crates/lumen-host/src/zerocopy/cuda.rs
+++ b/crates/lumen-host/src/zerocopy/cuda.rs
@@ -431,6 +431,12 @@ impl ExternalDmabuf {
         if dup < 0 {
             bail!("dup(dmabuf fd) failed");
         }
+        Self::import_owned_fd(dup, size)
+    }
+
+    /// Import an fd the caller hands over (e.g. a Vulkan-exported `OPAQUE_FD`) — consumed by
+    /// the driver on success, closed by us on failure.
+    pub fn import_owned_fd(dup: i32, size: u64) -> Result<ExternalDmabuf> {
         let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC {
             type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD,
             size,
diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs
index 45fbcef..bf7eaf8 100644
--- a/crates/lumen-host/src/zerocopy/egl.rs
+++ b/crates/lumen-host/src/zerocopy/egl.rs
@@ -252,9 +252,9 @@ pub struct EglImporter {
     egl_image_target: EglImageTargetFn,
     /// Lazily-created GL blit machinery (recreated if the frame size changes).
     blit: Option<GlBlit>,
-    /// LINEAR-dmabuf path (gamescope): CUDA external-memory imports cached per buffer fd (the
-    /// producer's buffer pool keeps fds stable for the stream's life) + the destination pool.
-    linear: std::collections::HashMap<i32, cuda::ExternalDmabuf>,
+    /// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA),
+    /// created lazily on the first LINEAR frame, + the destination pool.
+    vk: Option<super::vulkan::VkBridge>,
     linear_pool: Option<cuda::BufferPool>,
     gbm: *mut c_void,
     render_fd: c_int,
@@ -355,16 +355,16 @@ impl EglImporter {
             _gl_ctx: gl_ctx,
             egl_image_target,
             blit: None,
-            linear: std::collections::HashMap::new(),
+            vk: None,
             linear_pool: None,
             gbm,
             render_fd,
         })
     }
 
-    /// Import a LINEAR dmabuf via CUDA external memory (no EGL/GL involved — NVIDIA's EGL can't
-    /// sample LINEAR, but the bytes are directly addressable once imported). The import is
-    /// cached per fd; per frame this is one device→device copy into a pooled buffer.
+    /// Import a LINEAR dmabuf via the Vulkan bridge (no EGL/GL involved — NVIDIA's EGL can't
+    /// sample LINEAR, and the CUDA driver rejects raw dmabuf fds; Vulkan imports the dmabuf,
+    /// GPU-copies into an exportable allocation, and CUDA reads that). See [`super::vulkan`].
     pub fn import_linear(
         &mut self,
         plane: &DmabufPlane,
@@ -375,19 +375,16 @@ impl EglImporter {
         if self.linear_pool.as_ref().map(|p| (p.width(), p.height())) != Some((width, height)) {
             self.linear_pool = Some(cuda::BufferPool::new(width, height)?);
         }
-        let fd = plane.fd;
-        let ext = match self.linear.entry(fd) {
-            std::collections::hash_map::Entry::Occupied(e) => e.into_mut(),
-            std::collections::hash_map::Entry::Vacant(e) => {
-                // Size from the fd itself (the chunk's size field is unreliable for dmabufs).
-                let size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) };
-                anyhow::ensure!(size > 0, "lseek(dmabuf) failed");
-                e.insert(cuda::ExternalDmabuf::import(fd, size as u64)?)
-            }
-        };
-        let dst = self.linear_pool.as_ref().unwrap().get()?;
-        cuda::copy_pitched_to_buffer(ext.ptr + plane.offset as u64, plane.stride as usize, &dst)?;
-        Ok(dst)
+        if self.vk.is_none() {
+            self.vk = Some(super::vulkan::VkBridge::new()?);
+        }
+        self.vk.as_mut().unwrap().import_linear(
+            plane.fd,
+            plane.offset,
+            plane.stride,
+            height,
+            self.linear_pool.as_ref().unwrap(),
+        )
     }
 
     /// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via
diff --git a/crates/lumen-host/src/zerocopy/mod.rs b/crates/lumen-host/src/zerocopy/mod.rs
index 09c5b35..b8d9aed 100644
--- a/crates/lumen-host/src/zerocopy/mod.rs
+++ b/crates/lumen-host/src/zerocopy/mod.rs
@@ -9,6 +9,7 @@
 
 pub mod cuda;
 pub mod egl;
+pub mod vulkan;
 
 pub use cuda::DeviceBuffer;
 pub use egl::{DmabufPlane, EglImporter};
diff --git a/crates/lumen-host/src/zerocopy/vulkan.rs b/crates/lumen-host/src/zerocopy/vulkan.rs
new file mode 100644
index 0000000..98cc78e
--- /dev/null
+++ b/crates/lumen-host/src/zerocopy/vulkan.rs
@@ -0,0 +1,366 @@
+//! Vulkan bridge for LINEAR dmabufs (gamescope's only offer), completing zero-copy where the
+//! other interops can't: NVIDIA's EGL won't sample LINEAR, and the CUDA driver rejects raw
+//! dmabuf fds as external memory. Vulkan *does* import dmabufs (`VK_EXT_external_memory_dma_buf`)
+//! and *does* export `OPAQUE_FD` memory that CUDA officially imports. So:
+//!
+//! ```text
+//!   dmabuf fd ──VkImportMemoryFdInfoKHR(DMA_BUF)──▶ VkBuffer (cached per fd)
+//!        │ vkCmdCopyBuffer (GPU, device-local)
+//!        ▼
+//!   exportable VkBuffer ──vkGetMemoryFdKHR(OPAQUE_FD)──▶ cuImportExternalMemory ──▶ CUdeviceptr
+//! ```
+//!
+//! The exportable buffer + its CUDA mapping are created once per resolution; per frame it's one
+//! GPU buffer copy (fence-waited) and one pitched CUDA copy into the encoder's pooled buffer.
+//! No CPU ever touches pixels. Imports are cached per fd (PipeWire's buffer pool is stable for
+//! a stream's life). Falls back cleanly: any init/import error disables the importer and the
+//! CPU mmap path takes over.
+
+use super::cuda::{self, DeviceBuffer};
+use anyhow::{anyhow, bail, Context as _, Result};
+use ash::vk;
+use std::collections::HashMap;
+
+/// Vulkan objects for one imported source dmabuf (cached per fd).
+struct SrcBuf {
+    buffer: vk::Buffer,
+    memory: vk::DeviceMemory,
+    size: u64,
+}
+
+/// The per-resolution destination: exportable Vulkan memory mapped into CUDA.
+struct DstBuf {
+    buffer: vk::Buffer,
+    memory: vk::DeviceMemory,
+    size: u64,
+    /// CUDA's view of the same memory (owns the exported OPAQUE_FD).
+    cuda: cuda::ExternalDmabuf,
+}
+
+pub struct VkBridge {
+    _entry: ash::Entry,
+    instance: ash::Instance,
+    device: ash::Device,
+    ext_fd: ash::khr::external_memory_fd::Device,
+    queue: vk::Queue,
+    cmd_pool: vk::CommandPool,
+    cmd: vk::CommandBuffer,
+    fence: vk::Fence,
+    mem_props: vk::PhysicalDeviceMemoryProperties,
+    src_cache: HashMap<i32, SrcBuf>,
+    dst: Option<DstBuf>,
+}
+
+// Confined to the capture thread; moved there once.
+unsafe impl Send for VkBridge {}
+
+impl VkBridge {
+    /// Bring up Vulkan on the NVIDIA GPU with the external-memory extensions.
+    pub fn new() -> Result<VkBridge> {
+        unsafe {
+            let entry = ash::Entry::load().context("load libvulkan")?;
+            let app = vk::ApplicationInfo::default().api_version(vk::API_VERSION_1_1);
+            let instance = entry
+                .create_instance(
+                    &vk::InstanceCreateInfo::default().application_info(&app),
+                    None,
+                )
+                .context("vkCreateInstance")?;
+
+            // Pick the NVIDIA GPU (matches CUDA device 0 on this single-dGPU host).
+            let phys = instance
+                .enumerate_physical_devices()
+                .context("enumerate GPUs")?
+                .into_iter()
+                .find(|&p| instance.get_physical_device_properties(p).vendor_id == 0x10DE)
+                .ok_or_else(|| anyhow!("no NVIDIA Vulkan device"))?;
+            let mem_props = instance.get_physical_device_memory_properties(phys);
+
+            // Any queue family supporting transfer (graphics/compute imply it).
+            let qf = instance
+                .get_physical_device_queue_family_properties(phys)
+                .iter()
+                .position(|q| {
+                    q.queue_flags.intersects(
+                        vk::QueueFlags::TRANSFER
+                            | vk::QueueFlags::GRAPHICS
+                            | vk::QueueFlags::COMPUTE,
+                    )
+                })
+                .ok_or_else(|| anyhow!("no transfer-capable queue family"))?
+                as u32;
+
+            let exts = [
+                ash::khr::external_memory_fd::NAME.as_ptr(),
+                ash::ext::external_memory_dma_buf::NAME.as_ptr(),
+            ];
+            let prio = [1.0f32];
+            let qci = [vk::DeviceQueueCreateInfo::default()
+                .queue_family_index(qf)
+                .queue_priorities(&prio)];
+            let device = instance
+                .create_device(
+                    phys,
+                    &vk::DeviceCreateInfo::default()
+                        .queue_create_infos(&qci)
+                        .enabled_extension_names(&exts),
+                    None,
+                )
+                .context("vkCreateDevice (external-memory extensions supported?)")?;
+            let ext_fd = ash::khr::external_memory_fd::Device::new(&instance, &device);
+            let queue = device.get_device_queue(qf, 0);
+
+            let cmd_pool = device
+                .create_command_pool(
+                    &vk::CommandPoolCreateInfo::default()
+                        .queue_family_index(qf)
+                        .flags(vk::CommandPoolCreateFlags::RESET_COMMAND_BUFFER),
+                    None,
+                )
+                .context("create command pool")?;
+            let cmd = device
+                .allocate_command_buffers(
+                    &vk::CommandBufferAllocateInfo::default()
+                        .command_pool(cmd_pool)
+                        .level(vk::CommandBufferLevel::PRIMARY)
+                        .command_buffer_count(1),
+                )
+                .context("allocate command buffer")?[0];
+            let fence = device
+                .create_fence(&vk::FenceCreateInfo::default(), None)
+                .context("create fence")?;
+
+            tracing::info!("Vulkan bridge ready (dmabuf import → OPAQUE_FD export → CUDA)");
+            Ok(VkBridge {
+                _entry: entry,
+                instance,
+                device,
+                ext_fd,
+                queue,
+                cmd_pool,
+                cmd,
+                fence,
+                mem_props,
+                src_cache: HashMap::new(),
+                dst: None,
+            })
+        }
+    }
+
+    fn memory_type(&self, type_bits: u32, flags: vk::MemoryPropertyFlags) -> Result<u32> {
+        (0..self.mem_props.memory_type_count)
+            .find(|&i| {
+                type_bits & (1 << i) != 0
+                    && self.mem_props.memory_types[i as usize]
+                        .property_flags
+                        .contains(flags)
+            })
+            .ok_or_else(|| anyhow!("no compatible Vulkan memory type"))
+    }
+
+    /// Import `fd` (dup'd internally; Vulkan owns the dup) as a transfer-src buffer of `size`.
+    unsafe fn import_src(&mut self, fd: i32, size: u64) -> Result<()> {
+        let dup = libc::dup(fd);
+        if dup < 0 {
+            bail!("dup(dmabuf fd)");
+        }
+        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT);
+        let buffer = self
+            .device
+            .create_buffer(
+                &vk::BufferCreateInfo::default()
+                    .size(size)
+                    .usage(vk::BufferUsageFlags::TRANSFER_SRC)
+                    .push_next(&mut ext_info),
+                None,
+            )
+            .context("create import buffer")?;
+        let mut fd_props = vk::MemoryFdPropertiesKHR::default();
+        self.ext_fd
+            .get_memory_fd_properties(
+                vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT,
+                dup,
+                &mut fd_props,
+            )
+            .context("vkGetMemoryFdPropertiesKHR")?;
+        let reqs = self.device.get_buffer_memory_requirements(buffer);
+        let mem_type = self.memory_type(
+            reqs.memory_type_bits & fd_props.memory_type_bits,
+            vk::MemoryPropertyFlags::empty(),
+        )?;
+        let mut import = vk::ImportMemoryFdInfoKHR::default()
+            .handle_type(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT)
+            .fd(dup); // Vulkan takes ownership of `dup` on success
+        let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
+        let memory = self
+            .device
+            .allocate_memory(
+                &vk::MemoryAllocateInfo::default()
+                    .allocation_size(reqs.size.max(size))
+                    .memory_type_index(mem_type)
+                    .push_next(&mut import)
+                    .push_next(&mut dedicated),
+                None,
+            )
+            .map_err(|e| {
+                libc::close(dup); // failed import does not consume the fd
+                anyhow!("import dmabuf memory: {e}")
+            })?;
+        self.device
+            .bind_buffer_memory(buffer, memory, 0)
+            .context("bind import memory")?;
+        self.src_cache.insert(
+            fd,
+            SrcBuf {
+                buffer,
+                memory,
+                size,
+            },
+        );
+        Ok(())
+    }
+
+    /// (Re)create the exportable destination of at least `size` bytes + its CUDA mapping.
+    unsafe fn ensure_dst(&mut self, size: u64) -> Result<()> {
+        if self.dst.as_ref().is_some_and(|d| d.size >= size) {
+            return Ok(());
+        }
+        if let Some(old) = self.dst.take() {
+            self.device.destroy_buffer(old.buffer, None);
+            self.device.free_memory(old.memory, None);
+            // old.cuda drops its mapping with it
+        }
+        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
+        let buffer = self
+            .device
+            .create_buffer(
+                &vk::BufferCreateInfo::default()
+                    .size(size)
+                    .usage(vk::BufferUsageFlags::TRANSFER_DST)
+                    .push_next(&mut ext_info),
+                None,
+            )
+            .context("create export buffer")?;
+        let reqs = self.device.get_buffer_memory_requirements(buffer);
+        let mem_type =
+            self.memory_type(reqs.memory_type_bits, vk::MemoryPropertyFlags::DEVICE_LOCAL)?;
+        let mut export = vk::ExportMemoryAllocateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
+        let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
+        let memory = self
+            .device
+            .allocate_memory(
+                &vk::MemoryAllocateInfo::default()
+                    .allocation_size(reqs.size)
+                    .memory_type_index(mem_type)
+                    .push_next(&mut export)
+                    .push_next(&mut dedicated),
+                None,
+            )
+            .context("allocate exportable memory")?;
+        self.device
+            .bind_buffer_memory(buffer, memory, 0)
+            .context("bind export memory")?;
+        let opaque_fd = self
+            .ext_fd
+            .get_memory_fd(
+                &vk::MemoryGetFdInfoKHR::default()
+                    .memory(memory)
+                    .handle_type(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD),
+            )
+            .context("vkGetMemoryFdKHR")?;
+        // CUDA imports (and on success owns) the exported fd. Size must match the allocation.
+        let cuda = cuda::ExternalDmabuf::import_owned_fd(opaque_fd, reqs.size)
+            .context("cuImportExternalMemory(OPAQUE_FD from Vulkan)")?;
+        tracing::info!(size, "Vulkan→CUDA exportable staging buffer ready");
+        self.dst = Some(DstBuf {
+            buffer,
+            memory,
+            size: reqs.size,
+            cuda,
+        });
+        Ok(())
+    }
+
+    /// Bridge one LINEAR dmabuf frame into a pooled CUDA buffer: GPU copy dmabuf→exportable,
+    /// then pitched CUDA copy exportable→`pool` buffer.
+    pub fn import_linear(
+        &mut self,
+        fd: i32,
+        offset: u32,
+        stride: u32,
+        height: u32,
+        pool: &cuda::BufferPool,
+    ) -> Result<DeviceBuffer> {
+        unsafe {
+            let span = offset as u64 + stride as u64 * height as u64;
+            if !self.src_cache.contains_key(&fd) {
+                let size = libc::lseek(fd, 0, libc::SEEK_END);
+                anyhow::ensure!(size > 0, "lseek(dmabuf)");
+                anyhow::ensure!(size as u64 >= span, "dmabuf smaller than frame span");
+                self.import_src(fd, size as u64)?;
+            }
+            let (src_buffer, src_size) = {
+                let s = &self.src_cache[&fd];
+                (s.buffer, s.size)
+            };
+            let copy_size = src_size.min(span);
+            self.ensure_dst(copy_size)?;
+            let dst = self.dst.as_ref().unwrap();
+
+            // Record + submit the GPU copy, wait on the fence (GPU-GPU, sub-millisecond).
+            self.device
+                .begin_command_buffer(
+                    self.cmd,
+                    &vk::CommandBufferBeginInfo::default()
+                        .flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT),
+                )
+                .context("begin cmd")?;
+            let region = vk::BufferCopy::default().size(copy_size);
+            self.device
+                .cmd_copy_buffer(self.cmd, src_buffer, dst.buffer, &[region]);
+            self.device
+                .end_command_buffer(self.cmd)
+                .context("end cmd")?;
+            let cmds = [self.cmd];
+            let submit = vk::SubmitInfo::default().command_buffers(&cmds);
+            self.device
+                .queue_submit(self.queue, &[submit], self.fence)
+                .context("queue submit")?;
+            self.device
+                .wait_for_fences(&[self.fence], true, 1_000_000_000)
+                .context("fence wait")?;
+            self.device
+                .reset_fences(&[self.fence])
+                .context("reset fence")?;
+
+            // De-stride from the CUDA view of the exportable memory into a pooled buffer.
+            cuda::make_current()?;
+            let out = pool.get()?;
+            cuda::copy_pitched_to_buffer(dst.cuda.ptr + offset as u64, stride as usize, &out)?;
+            Ok(out)
+        }
+    }
+}
+
+impl Drop for VkBridge {
+    fn drop(&mut self) {
+        unsafe {
+            let _ = self.device.device_wait_idle();
+            for (_, s) in self.src_cache.drain() {
+                self.device.destroy_buffer(s.buffer, None);
+                self.device.free_memory(s.memory, None);
+            }
+            if let Some(d) = self.dst.take() {
+                self.device.destroy_buffer(d.buffer, None);
+                self.device.free_memory(d.memory, None);
+            }
+            self.device.destroy_fence(self.fence, None);
+            self.device.destroy_command_pool(self.cmd_pool, None);
+            self.device.destroy_device(None);
+            self.instance.destroy_instance(None);
+        }
+    }
+}