From 826da9968eaa55a3d410069b7f7d1df1d3ef8797 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Tue, 9 Jun 2026 23:18:38 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20M2=20=E2=80=94=20Vulkan=20bridge:=20TRU?= =?UTF-8?q?E=20zero-copy=20for=20gamescope's=20LINEAR=20dmabufs=20(Phase?= =?UTF-8?q?=203)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The missing zero-copy path is closed. NVIDIA's EGL won't sample LINEAR and the CUDA driver rejects raw dmabuf fds — but Vulkan imports dmabufs (VK_EXT_external_memory_dma_buf) and exports OPAQUE_FD memory that CUDA officially imports. zerocopy/vulkan.rs (ash): dmabuf fd → VkBuffer (import cached per fd) → vkCmdCopyBuffer (GPU) → exportable VkBuffer → vkGetMemoryFdKHR(OPAQUE_FD) → cuImportExternalMemory → CUdeviceptr The exportable buffer + CUDA mapping are per-resolution; per frame it's one GPU buffer copy (fence-waited) + one pitched CUDA copy into the encoder's pool. No CPU touches pixels. EglImporter::import_linear now routes through the bridge (lazy init; any failure still falls back to the CPU mmap path). cuda::ExternalDmabuf gained import_owned_fd for the Vulkan-exported fd. Validated live: gamescope 720p120 → "Vulkan→CUDA exportable staging buffer ready size=3686400" (exactly 1280*720*4), full-rate 122.7 fps, decoded frame pixel-correct (vkcube). KWin's tiled EGL path regression-tested intact. NV12 negotiation dropped — moot now that BGRx is fully zero-copy. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 10 + crates/lumen-host/Cargo.toml | 4 + crates/lumen-host/src/zerocopy/cuda.rs | 6 + crates/lumen-host/src/zerocopy/egl.rs | 37 ++- crates/lumen-host/src/zerocopy/mod.rs | 1 + crates/lumen-host/src/zerocopy/vulkan.rs | 366 +++++++++++++++++++++++ 6 files changed, 404 insertions(+), 20 deletions(-) create mode 100644 crates/lumen-host/src/zerocopy/vulkan.rs diff --git a/Cargo.lock b/Cargo.lock index 40cee1c..1cbb471 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,6 +121,15 @@ dependencies = [ "rustversion", ] +[[package]] +name = "ash" +version = "0.38.0+1.3.281" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bb44936d800fea8f016d7f2311c6a4f97aebd5dc86f09906139ec848cf3a46f" +dependencies = [ + "libloading", +] + [[package]] name = "ashpd" version = "0.13.11" @@ -1486,6 +1495,7 @@ dependencies = [ "aes", "aes-gcm", "anyhow", + "ash", "ashpd", "axum", "axum-server", diff --git a/crates/lumen-host/Cargo.toml b/crates/lumen-host/Cargo.toml index beb0954..2dc6002 100644 --- a/crates/lumen-host/Cargo.toml +++ b/crates/lumen-host/Cargo.toml @@ -70,3 +70,7 @@ futures-util = "0.3" # eglCreateImage + the dma_buf import; the CUDA driver API (EGL interop) and libgbm are linked # via hand-rolled FFI in `src/zerocopy/` (no Rust crate exposes the EGL-interop driver calls). khronos-egl = { version = "6", features = ["dynamic"] } +# Vulkan bridge for LINEAR dmabufs (gamescope): import via VK_EXT_external_memory_dma_buf, +# GPU-copy into an exportable allocation, export OPAQUE_FD → cuImportExternalMemory (the +# officially-supported CUDA pairing; raw dmabuf fds are rejected by the desktop driver). 
+ash = "0.38" diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs index c4edcb0..a9b7ebe 100644 --- a/crates/lumen-host/src/zerocopy/cuda.rs +++ b/crates/lumen-host/src/zerocopy/cuda.rs @@ -431,6 +431,12 @@ impl ExternalDmabuf { if dup < 0 { bail!("dup(dmabuf fd) failed"); } + Self::import_owned_fd(dup, size) + } + + /// Import an fd the caller hands over (e.g. a Vulkan-exported `OPAQUE_FD`) — consumed by + /// the driver on success, closed by us on failure. + pub fn import_owned_fd(dup: i32, size: u64) -> Result { let mut desc = CUDA_EXTERNAL_MEMORY_HANDLE_DESC { type_: CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, size, diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs index 45fbcef..bf7eaf8 100644 --- a/crates/lumen-host/src/zerocopy/egl.rs +++ b/crates/lumen-host/src/zerocopy/egl.rs @@ -252,9 +252,9 @@ pub struct EglImporter { egl_image_target: EglImageTargetFn, /// Lazily-created GL blit machinery (recreated if the frame size changes). blit: Option, - /// LINEAR-dmabuf path (gamescope): CUDA external-memory imports cached per buffer fd (the - /// producer's buffer pool keeps fds stable for the stream's life) + the destination pool. - linear: std::collections::HashMap, + /// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA), + /// created lazily on the first LINEAR frame, + the destination pool. + vk: Option, linear_pool: Option, gbm: *mut c_void, render_fd: c_int, @@ -355,16 +355,16 @@ impl EglImporter { _gl_ctx: gl_ctx, egl_image_target, blit: None, - linear: std::collections::HashMap::new(), + vk: None, linear_pool: None, gbm, render_fd, }) } - /// Import a LINEAR dmabuf via CUDA external memory (no EGL/GL involved — NVIDIA's EGL can't - /// sample LINEAR, but the bytes are directly addressable once imported). The import is - /// cached per fd; per frame this is one device→device copy into a pooled buffer. 
+ /// Import a LINEAR dmabuf via the Vulkan bridge (no EGL/GL involved — NVIDIA's EGL can't + /// sample LINEAR, and the CUDA driver rejects raw dmabuf fds; Vulkan imports the dmabuf, + /// GPU-copies into an exportable allocation, and CUDA reads that). See [`super::vulkan`]. pub fn import_linear( &mut self, plane: &DmabufPlane, @@ -375,19 +375,16 @@ impl EglImporter { if self.linear_pool.as_ref().map(|p| (p.width(), p.height())) != Some((width, height)) { self.linear_pool = Some(cuda::BufferPool::new(width, height)?); } - let fd = plane.fd; - let ext = match self.linear.entry(fd) { - std::collections::hash_map::Entry::Occupied(e) => e.into_mut(), - std::collections::hash_map::Entry::Vacant(e) => { - // Size from the fd itself (the chunk's size field is unreliable for dmabufs). - let size = unsafe { libc::lseek(fd, 0, libc::SEEK_END) }; - anyhow::ensure!(size > 0, "lseek(dmabuf) failed"); - e.insert(cuda::ExternalDmabuf::import(fd, size as u64)?) - } - }; - let dst = self.linear_pool.as_ref().unwrap().get()?; - cuda::copy_pitched_to_buffer(ext.ptr + plane.offset as u64, plane.stride as usize, &dst)?; - Ok(dst) + if self.vk.is_none() { + self.vk = Some(super::vulkan::VkBridge::new()?); + } + self.vk.as_mut().unwrap().import_linear( + plane.fd, + plane.offset, + plane.stride, + height, + self.linear_pool.as_ref().unwrap(), + ) } /// The DRM format modifiers the NVIDIA EGL stack can import for `fourcc`, via diff --git a/crates/lumen-host/src/zerocopy/mod.rs b/crates/lumen-host/src/zerocopy/mod.rs index 09c5b35..b8d9aed 100644 --- a/crates/lumen-host/src/zerocopy/mod.rs +++ b/crates/lumen-host/src/zerocopy/mod.rs @@ -9,6 +9,7 @@ pub mod cuda; pub mod egl; +pub mod vulkan; pub use cuda::DeviceBuffer; pub use egl::{DmabufPlane, EglImporter}; diff --git a/crates/lumen-host/src/zerocopy/vulkan.rs b/crates/lumen-host/src/zerocopy/vulkan.rs new file mode 100644 index 0000000..98cc78e --- /dev/null +++ b/crates/lumen-host/src/zerocopy/vulkan.rs @@ -0,0 +1,366 @@ +//! 
Vulkan bridge for LINEAR dmabufs (gamescope's only offer), completing zero-copy where the
+//! other interops can't: NVIDIA's EGL won't sample LINEAR, and the CUDA driver rejects raw
+//! dmabuf fds as external memory. Vulkan *does* import dmabufs (`VK_EXT_external_memory_dma_buf`)
+//! and *does* export `OPAQUE_FD` memory that CUDA officially imports. So:
+//!
+//! ```text
+//! dmabuf fd ──VkImportMemoryFdInfoKHR(DMA_BUF)──▶ VkBuffer (cached per fd)
+//!                                                    │ vkCmdCopyBuffer (GPU, device-local)
+//!                                                    ▼
+//! exportable VkBuffer ──vkGetMemoryFdKHR(OPAQUE_FD)──▶ cuImportExternalMemory ──▶ CUdeviceptr
+//! ```
+//!
+//! The exportable buffer + its CUDA mapping are created once per resolution; per frame it's one
+//! GPU buffer copy (fence-waited) and one pitched CUDA copy into the encoder's pooled buffer.
+//! No CPU ever touches pixels. Imports are cached per fd (PipeWire's buffer pool is stable for
+//! a stream's life). Falls back cleanly: any init/import error disables the importer and the
+//! CPU mmap path takes over.
+
+use super::cuda::{self, DeviceBuffer};
+use anyhow::{anyhow, bail, Context as _, Result};
+use ash::vk;
+use std::collections::HashMap;
+
+/// Vulkan objects for one imported source dmabuf (cached per fd).
+struct SrcBuf {
+    /// Transfer-source buffer bound to the imported memory.
+    buffer: vk::Buffer,
+    /// The imported dmabuf memory backing `buffer`.
+    memory: vk::DeviceMemory,
+    /// Size in bytes the buffer was created with (the dmabuf's size at import time).
+    size: u64,
+}
+
+/// The per-resolution destination: exportable Vulkan memory mapped into CUDA.
+struct DstBuf {
+    /// Transfer-destination buffer bound to the exportable memory.
+    buffer: vk::Buffer,
+    /// The exportable allocation backing `buffer`.
+    memory: vk::DeviceMemory,
+    /// Size in bytes of the allocation (what Vulkan actually reserved, not what was asked for).
+    size: u64,
+    /// CUDA's view of the same memory (owns the exported OPAQUE_FD).
+    cuda: cuda::ExternalDmabuf,
+}
+
+/// Owns the Vulkan instance/device and the buffers used to shuttle LINEAR dmabuf frames into
+/// CUDA-visible memory.
+pub struct VkBridge {
+    /// Keeps the loaded Vulkan library alive for as long as the handles below exist.
+    _entry: ash::Entry,
+    instance: ash::Instance,
+    device: ash::Device,
+    /// Loader for the `VK_KHR_external_memory_fd` device functions.
+    ext_fd: ash::khr::external_memory_fd::Device,
+    /// The single queue every copy is submitted to.
+    queue: vk::Queue,
+    cmd_pool: vk::CommandPool,
+    /// The one command buffer, re-recorded for every frame.
+    cmd: vk::CommandBuffer,
+    /// Signalled when the frame's copy has finished; reset after each wait.
+    fence: vk::Fence,
+    mem_props: vk::PhysicalDeviceMemoryProperties,
+    /// Imported source dmabufs, keyed by the producer's fd number.
+    src_cache: HashMap<i32, SrcBuf>,
+    /// The exportable staging buffer, created on the first frame and regrown on demand.
+    dst: Option<DstBuf>,
+}
+
+// Confined to the capture thread; moved there once.
+// SAFETY: sound only under the invariant stated above — the bridge is handed to one thread and
+// used solely from it afterwards. It is intentionally not `Sync`.
+unsafe impl Send for VkBridge {}
+
+impl VkBridge {
+    /// How long a single frame's GPU copy may take before `import_linear` gives up.
+    const FENCE_TIMEOUT_NS: u64 = 1_000_000_000;
+
+    /// Bring up Vulkan on the NVIDIA GPU with the external-memory extensions.
+    ///
+    /// Creates the instance, a logical device with one transfer-capable queue, and the
+    /// command pool / command buffer / fence that every frame's copy reuses.
+    ///
+    /// # Errors
+    /// Fails if libvulkan cannot be loaded, no NVIDIA device or transfer-capable queue family
+    /// is found, or any Vulkan object cannot be created. Whatever was created before the
+    /// failing step is destroyed again before returning.
+    pub fn new() -> Result<Self> {
+        // SAFETY: plain Vulkan bring-up; every handle passed to a call below was returned by
+        // an earlier call in this block and has not been destroyed yet.
+        unsafe {
+            let entry = ash::Entry::load().context("load libvulkan")?;
+            let app = vk::ApplicationInfo::default().api_version(vk::API_VERSION_1_1);
+            let instance = entry
+                .create_instance(
+                    &vk::InstanceCreateInfo::default().application_info(&app),
+                    None,
+                )
+                .context("vkCreateInstance")?;
+
+            // From here on a failure must unwind what already exists, innermost first.
+            let (phys, qf, device) = match Self::create_device(&instance) {
+                Ok(parts) => parts,
+                Err(e) => {
+                    instance.destroy_instance(None);
+                    return Err(e);
+                }
+            };
+            let (cmd_pool, cmd, fence) = match Self::create_submit_objects(&device, qf) {
+                Ok(parts) => parts,
+                Err(e) => {
+                    device.destroy_device(None);
+                    instance.destroy_instance(None);
+                    return Err(e);
+                }
+            };
+
+            let mem_props = instance.get_physical_device_memory_properties(phys);
+            let ext_fd = ash::khr::external_memory_fd::Device::new(&instance, &device);
+            let queue = device.get_device_queue(qf, 0);
+
+            tracing::info!("Vulkan bridge ready (dmabuf import → OPAQUE_FD export → CUDA)");
+            Ok(VkBridge {
+                _entry: entry,
+                instance,
+                device,
+                ext_fd,
+                queue,
+                cmd_pool,
+                cmd,
+                fence,
+                mem_props,
+                src_cache: HashMap::new(),
+                dst: None,
+            })
+        }
+    }
+
+    /// Pick the NVIDIA GPU and a transfer-capable queue family, then create the logical device
+    /// with the two external-memory extensions enabled.
+    ///
+    /// Returns the physical device, the chosen queue family index and the device. Creates
+    /// nothing that needs cleanup unless it succeeds.
+    unsafe fn create_device(
+        instance: &ash::Instance,
+    ) -> Result<(vk::PhysicalDevice, u32, ash::Device)> {
+        /// PCI vendor id reported by NVIDIA devices.
+        const NVIDIA_VENDOR_ID: u32 = 0x10DE;
+
+        // Pick the NVIDIA GPU (matches CUDA device 0 on this single-dGPU host).
+        let phys = instance
+            .enumerate_physical_devices()
+            .context("enumerate GPUs")?
+            .into_iter()
+            .find(|&p| instance.get_physical_device_properties(p).vendor_id == NVIDIA_VENDOR_ID)
+            .ok_or_else(|| anyhow!("no NVIDIA Vulkan device"))?;
+
+        // Any queue family supporting transfer (graphics/compute imply it).
+        let qf = instance
+            .get_physical_device_queue_family_properties(phys)
+            .iter()
+            .position(|q| {
+                q.queue_flags.intersects(
+                    vk::QueueFlags::TRANSFER
+                        | vk::QueueFlags::GRAPHICS
+                        | vk::QueueFlags::COMPUTE,
+                )
+            })
+            .and_then(|idx| u32::try_from(idx).ok())
+            .ok_or_else(|| anyhow!("no transfer-capable queue family"))?;
+
+        let exts = [
+            ash::khr::external_memory_fd::NAME.as_ptr(),
+            ash::ext::external_memory_dma_buf::NAME.as_ptr(),
+        ];
+        let prio = [1.0f32];
+        let qci = [vk::DeviceQueueCreateInfo::default()
+            .queue_family_index(qf)
+            .queue_priorities(&prio)];
+        let device = instance
+            .create_device(
+                phys,
+                &vk::DeviceCreateInfo::default()
+                    .queue_create_infos(&qci)
+                    .enabled_extension_names(&exts),
+                None,
+            )
+            .context("vkCreateDevice (external-memory extensions supported?)")?;
+        Ok((phys, qf, device))
+    }
+
+    /// Create the resettable command pool, its single primary command buffer and the fence.
+    ///
+    /// On failure the pool is destroyed again, so the caller only has to tear down the
+    /// device.
+    unsafe fn create_submit_objects(
+        device: &ash::Device,
+        qf: u32,
+    ) -> Result<(vk::CommandPool, vk::CommandBuffer, vk::Fence)> {
+        let cmd_pool = device
+            .create_command_pool(
+                &vk::CommandPoolCreateInfo::default()
+                    .queue_family_index(qf)
+                    .flags(vk::CommandPoolCreateFlags::RESET_COMMAND_BUFFER),
+                None,
+            )
+            .context("create command pool")?;
+
+        let cmd = match device.allocate_command_buffers(
+            &vk::CommandBufferAllocateInfo::default()
+                .command_pool(cmd_pool)
+                .level(vk::CommandBufferLevel::PRIMARY)
+                .command_buffer_count(1),
+        ) {
+            Ok(bufs) => bufs[0],
+            Err(e) => {
+                device.destroy_command_pool(cmd_pool, None);
+                return Err(e).context("allocate command buffer");
+            }
+        };
+        let fence = match device.create_fence(&vk::FenceCreateInfo::default(), None) {
+            Ok(fence) => fence,
+            Err(e) => {
+                // Destroying the pool also frees the command buffer allocated from it.
+                device.destroy_command_pool(cmd_pool, None);
+                return Err(e).context("create fence");
+            }
+        };
+        Ok((cmd_pool, cmd, fence))
+    }
+
+    /// Index of the first memory type that is allowed by `type_bits` and has all of `flags`.
+    fn memory_type(&self, type_bits: u32, flags: vk::MemoryPropertyFlags) -> Result<u32> {
+        (0..self.mem_props.memory_type_count)
+            .find(|&i| {
+                type_bits & (1 << i) != 0
+                    && self.mem_props.memory_types[i as usize]
+                        .property_flags
+                        .contains(flags)
+            })
+            .ok_or_else(|| anyhow!("no compatible Vulkan memory type"))
+    }
+
+    /// Destroy the Vulkan objects of one cached source import.
+    unsafe fn destroy_src(device: &ash::Device, src: SrcBuf) {
+        device.destroy_buffer(src.buffer, None);
+        device.free_memory(src.memory, None);
+    }
+
+    /// Destroy a staging buffer: the Vulkan objects first, then `dst.cuda` is dropped when
+    /// `dst` goes out of scope.
+    unsafe fn destroy_dst(device: &ash::Device, dst: DstBuf) {
+        device.destroy_buffer(dst.buffer, None);
+        device.free_memory(dst.memory, None);
+    }
+
+    /// Import `fd` (dup'd internally; Vulkan owns the dup) as a transfer-src buffer of `size`.
+    ///
+    /// On success the import is stored in `src_cache` under `fd`. On failure nothing is
+    /// cached and the buffer, the dup'd fd and any allocated memory are released.
+    unsafe fn import_src(&mut self, fd: i32, size: u64) -> Result<()> {
+        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT);
+        let buffer = self
+            .device
+            .create_buffer(
+                &vk::BufferCreateInfo::default()
+                    .size(size)
+                    .usage(vk::BufferUsageFlags::TRANSFER_SRC)
+                    .push_next(&mut ext_info),
+                None,
+            )
+            .context("create import buffer")?;
+
+        match self.import_src_memory(fd, buffer, size) {
+            Ok(memory) => {
+                self.src_cache.insert(
+                    fd,
+                    SrcBuf {
+                        buffer,
+                        memory,
+                        size,
+                    },
+                );
+                Ok(())
+            }
+            Err(e) => {
+                self.device.destroy_buffer(buffer, None);
+                Err(e)
+            }
+        }
+    }
+
+    /// Import a dup of `fd` as dedicated memory for `buffer` and bind it.
+    ///
+    /// The dup belongs to us until `vkAllocateMemory` succeeds, so every earlier error path
+    /// closes it; afterwards freeing the memory is what releases it.
+    unsafe fn import_src_memory(
+        &self,
+        fd: i32,
+        buffer: vk::Buffer,
+        size: u64,
+    ) -> Result<vk::DeviceMemory> {
+        let dup = libc::dup(fd);
+        if dup < 0 {
+            bail!("dup(dmabuf fd)");
+        }
+
+        let mut fd_props = vk::MemoryFdPropertiesKHR::default();
+        if let Err(e) = self.ext_fd.get_memory_fd_properties(
+            vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT,
+            dup,
+            &mut fd_props,
+        ) {
+            libc::close(dup);
+            return Err(e).context("vkGetMemoryFdPropertiesKHR");
+        }
+
+        // The memory type must satisfy both the buffer and what the fd can be imported as.
+        let reqs = self.device.get_buffer_memory_requirements(buffer);
+        let mem_type = match self.memory_type(
+            reqs.memory_type_bits & fd_props.memory_type_bits,
+            vk::MemoryPropertyFlags::empty(),
+        ) {
+            Ok(index) => index,
+            Err(e) => {
+                libc::close(dup);
+                return Err(e);
+            }
+        };
+
+        let mut import = vk::ImportMemoryFdInfoKHR::default()
+            .handle_type(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT)
+            .fd(dup); // Vulkan takes ownership of `dup` on success
+        let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
+        let memory = match self.device.allocate_memory(
+            &vk::MemoryAllocateInfo::default()
+                .allocation_size(reqs.size.max(size))
+                .memory_type_index(mem_type)
+                .push_next(&mut import)
+                .push_next(&mut dedicated),
+            None,
+        ) {
+            Ok(memory) => memory,
+            Err(e) => {
+                libc::close(dup); // failed import does not consume the fd
+                bail!("import dmabuf memory: {e}");
+            }
+        };
+
+        if let Err(e) = self.device.bind_buffer_memory(buffer, memory, 0) {
+            self.device.free_memory(memory, None);
+            return Err(e).context("bind import memory");
+        }
+        Ok(memory)
+    }
+
+    /// (Re)create the exportable destination of at least `size` bytes + its CUDA mapping.
+    ///
+    /// A no-op when the current destination is already large enough. Otherwise the old one
+    /// is destroyed first; if creating the replacement fails, `self.dst` stays `None` and the
+    /// partially-built objects are released.
+    unsafe fn ensure_dst(&mut self, size: u64) -> Result<()> {
+        if self.dst.as_ref().is_some_and(|d| d.size >= size) {
+            return Ok(());
+        }
+        if let Some(old) = self.dst.take() {
+            Self::destroy_dst(&self.device, old);
+        }
+
+        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
+        let buffer = self
+            .device
+            .create_buffer(
+                &vk::BufferCreateInfo::default()
+                    .size(size)
+                    .usage(vk::BufferUsageFlags::TRANSFER_DST)
+                    .push_next(&mut ext_info),
+                None,
+            )
+            .context("create export buffer")?;
+
+        match self.export_dst_memory(buffer) {
+            Ok((memory, alloc_size, cuda)) => {
+                tracing::info!(size, "Vulkan→CUDA exportable staging buffer ready");
+                self.dst = Some(DstBuf {
+                    buffer,
+                    memory,
+                    size: alloc_size,
+                    cuda,
+                });
+                Ok(())
+            }
+            Err(e) => {
+                self.device.destroy_buffer(buffer, None);
+                Err(e)
+            }
+        }
+    }
+
+    /// Allocate exportable device-local memory for `buffer`, bind it, export it as an
+    /// `OPAQUE_FD` and import that fd into CUDA.
+    ///
+    /// Returns the memory, its allocation size and the CUDA mapping; frees the memory again
+    /// if any step after the allocation fails.
+    unsafe fn export_dst_memory(
+        &self,
+        buffer: vk::Buffer,
+    ) -> Result<(vk::DeviceMemory, u64, cuda::ExternalDmabuf)> {
+        let reqs = self.device.get_buffer_memory_requirements(buffer);
+        let mem_type =
+            self.memory_type(reqs.memory_type_bits, vk::MemoryPropertyFlags::DEVICE_LOCAL)?;
+        let mut export = vk::ExportMemoryAllocateInfo::default()
+            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
+        let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
+        let memory = self
+            .device
+            .allocate_memory(
+                &vk::MemoryAllocateInfo::default()
+                    .allocation_size(reqs.size)
+                    .memory_type_index(mem_type)
+                    .push_next(&mut export)
+                    .push_next(&mut dedicated),
+                None,
+            )
+            .context("allocate exportable memory")?;
+
+        if let Err(e) = self.device.bind_buffer_memory(buffer, memory, 0) {
+            self.device.free_memory(memory, None);
+            return Err(e).context("bind export memory");
+        }
+        let opaque_fd = match self.ext_fd.get_memory_fd(
+            &vk::MemoryGetFdInfoKHR::default()
+                .memory(memory)
+                .handle_type(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD),
+        ) {
+            Ok(fd) => fd,
+            Err(e) => {
+                self.device.free_memory(memory, None);
+                return Err(e).context("vkGetMemoryFdKHR");
+            }
+        };
+
+        // CUDA imports (and on success owns) the exported fd. Size must match the allocation.
+        match cuda::ExternalDmabuf::import_owned_fd(opaque_fd, reqs.size) {
+            Ok(mapping) => Ok((memory, reqs.size, mapping)),
+            Err(e) => {
+                self.device.free_memory(memory, None);
+                Err(e).context("cuImportExternalMemory(OPAQUE_FD from Vulkan)")
+            }
+        }
+    }
+
+    /// Bridge one LINEAR dmabuf frame into a pooled CUDA buffer: GPU copy dmabuf→exportable,
+    /// then pitched CUDA copy exportable→`pool` buffer.
+    ///
+    /// * `fd` — the producer's dmabuf fd; only borrowed, a dup is imported and cached.
+    /// * `offset`, `stride`, `height` — byte offset of the plane, bytes per row, row count.
+    /// * `pool` — source of the destination buffer that is returned.
+    ///
+    /// # Errors
+    /// Fails if the geometry is empty, the dmabuf is smaller than
+    /// `offset + stride * height`, or any Vulkan/CUDA step fails. After a failed fence wait
+    /// the queue is drained before returning, so the bridge can be called again.
+    pub fn import_linear(
+        &mut self,
+        fd: i32,
+        offset: u32,
+        stride: u32,
+        height: u32,
+        pool: &cuda::BufferPool,
+    ) -> Result<DeviceBuffer> {
+        // A zero-sized frame would turn into a 0-byte buffer/copy, which Vulkan forbids.
+        anyhow::ensure!(
+            stride > 0 && height > 0,
+            "empty LINEAR frame (stride or height is 0)"
+        );
+        // Bytes of the dmabuf this frame touches, counted from its start.
+        let span = u64::from(offset) + u64::from(stride) * u64::from(height);
+
+        // SAFETY: all handles used below are owned by `self` and alive; every submission is
+        // waited on (or the queue drained) before this function returns, so nothing recorded
+        // here is still executing when a cached buffer is destroyed or re-recorded.
+        unsafe {
+            // The span must be checked on every frame, not just on first import: a cached
+            // import that is too small means this fd number now names a different buffer
+            // (or the geometry changed), and clamping the copy would let the CUDA copy below
+            // read rows that were never written.
+            let cached_fits = self.src_cache.get(&fd).is_some_and(|s| s.size >= span);
+            if !cached_fits {
+                if let Some(stale) = self.src_cache.remove(&fd) {
+                    Self::destroy_src(&self.device, stale);
+                }
+                // Size from the fd itself rather than from producer-supplied metadata.
+                let size = libc::lseek(fd, 0, libc::SEEK_END);
+                anyhow::ensure!(size > 0, "lseek(dmabuf)");
+                anyhow::ensure!(size as u64 >= span, "dmabuf smaller than frame span");
+                self.import_src(fd, size as u64)?;
+            }
+            let src_buffer = self.src_cache[&fd].buffer;
+
+            self.ensure_dst(span)?;
+            let Some(dst) = self.dst.as_ref() else {
+                bail!("exportable staging buffer missing");
+            };
+
+            // Record + submit the GPU copy, wait on the fence (GPU-GPU, sub-millisecond).
+            self.device
+                .begin_command_buffer(
+                    self.cmd,
+                    &vk::CommandBufferBeginInfo::default()
+                        .flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT),
+                )
+                .context("begin cmd")?;
+            let region = vk::BufferCopy::default().size(span);
+            self.device
+                .cmd_copy_buffer(self.cmd, src_buffer, dst.buffer, &[region]);
+            self.device
+                .end_command_buffer(self.cmd)
+                .context("end cmd")?;
+            let cmds = [self.cmd];
+            let submit = vk::SubmitInfo::default().command_buffers(&cmds);
+            self.device
+                .queue_submit(self.queue, &[submit], self.fence)
+                .context("queue submit")?;
+
+            match self
+                .device
+                .wait_for_fences(&[self.fence], true, Self::FENCE_TIMEOUT_NS)
+            {
+                Ok(()) => self
+                    .device
+                    .reset_fences(&[self.fence])
+                    .context("reset fence")?,
+                Err(e) => {
+                    // The submission may still be running: the command buffer is pending and
+                    // the fence in flight, and neither may be reused in that state. Drain
+                    // the queue (this can block) and only then reset the fence.
+                    if self.device.device_wait_idle().is_ok() {
+                        let _ = self.device.reset_fences(&[self.fence]);
+                    }
+                    return Err(e).context("fence wait");
+                }
+            }
+
+            // NOTE(review): visibility of the Vulkan write to CUDA relies on the fence wait
+            // alone; no ownership transfer to an external queue family is recorded — confirm
+            // that is sufficient on the target driver.
+            // De-stride from the CUDA view of the exportable memory into a pooled buffer.
+            cuda::make_current()?;
+            let out = pool.get()?;
+            cuda::copy_pitched_to_buffer(dst.cuda.ptr + offset as u64, stride as usize, &out)?;
+            Ok(out)
+        }
+    }
+}
+
+impl Drop for VkBridge {
+    /// Wait for the device to go idle, then destroy everything in reverse creation order.
+    fn drop(&mut self) {
+        // SAFETY: `self` is being dropped, so nothing else can use these handles, and the
+        // idle wait means the GPU is no longer using them either.
+        unsafe {
+            // Best effort: if the wait fails there is nothing better to do than proceed.
+            let _ = self.device.device_wait_idle();
+            for (_, src) in self.src_cache.drain() {
+                Self::destroy_src(&self.device, src);
+            }
+            if let Some(dst) = self.dst.take() {
+                Self::destroy_dst(&self.device, dst);
+            }
+            self.device.destroy_fence(self.fence, None);
+            // Destroying the pool also frees `self.cmd`.
+            self.device.destroy_command_pool(self.cmd_pool, None);
+            self.device.destroy_device(None);
+            self.instance.destroy_instance(None);
+        }
+    }
+}