punktfunk/crates/punktfunk-host/src/linux/zerocopy/vulkan.rs

//! Vulkan bridge for LINEAR dmabufs (gamescope's only offer), completing zero-copy where the
//! other interops can't: NVIDIA's EGL won't sample LINEAR, and the CUDA driver rejects raw
//! dmabuf fds as external memory. Vulkan *does* import dmabufs (`VK_EXT_external_memory_dma_buf`)
//! and *does* export `OPAQUE_FD` memory that CUDA officially imports. So:
//!
//! ```text
//!   dmabuf fd ──VkImportMemoryFdInfoKHR(DMA_BUF)──▶ VkBuffer (cached per fd)
//!        │ vkCmdCopyBuffer (GPU, device-local)
//!        ▼
//!   exportable VkBuffer ──vkGetMemoryFdKHR(OPAQUE_FD)──▶ cuImportExternalMemory ──▶ CUdeviceptr
//! ```
//!
//! The exportable buffer + its CUDA mapping are created once per resolution; per frame it's one
//! GPU buffer copy (fence-waited) and one pitched CUDA copy into the encoder's pooled buffer.
//! No CPU ever touches pixels. Imports are cached per fd (PipeWire's buffer pool is stable for
//! a stream's life). Falls back cleanly: any init/import error disables the importer and the
//! CPU mmap path takes over.

// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program).
#![deny(clippy::undocumented_unsafe_blocks)]

use super::cuda::{self, DeviceBuffer};
use anyhow::{anyhow, bail, Context as _, Result};
use ash::vk;
use std::collections::HashMap;

/// Vulkan objects for one imported source dmabuf (cached per fd).
struct SrcBuf {
    buffer: vk::Buffer,
    memory: vk::DeviceMemory,
    size: u64,
}

/// The per-resolution destination: exportable Vulkan memory mapped into CUDA.
struct DstBuf {
    buffer: vk::Buffer,
    memory: vk::DeviceMemory,
    size: u64,
    /// CUDA's view of the same memory (owns the exported OPAQUE_FD).
    cuda: cuda::ExternalDmabuf,
}

pub struct VkBridge {
    _entry: ash::Entry,
    instance: ash::Instance,
    device: ash::Device,
    ext_fd: ash::khr::external_memory_fd::Device,
    queue: vk::Queue,
    cmd_pool: vk::CommandPool,
    cmd: vk::CommandBuffer,
    fence: vk::Fence,
    mem_props: vk::PhysicalDeviceMemoryProperties,
    src_cache: HashMap<i32, SrcBuf>,
    dst: Option<DstBuf>,
}

// SAFETY: `VkBridge` owns ash Vulkan handles (instance/device/queue/command pool+buffer/fence), a
// CUDA external-memory mapping, and an fd→buffer cache — none `Sync`, and a single queue +
// command buffer must be externally synchronized. It is created inside `EglImporter::import_linear`
// on the dedicated `punktfunk-pipewire` capture thread and every method (`import_linear`, `Drop`)
// runs on that thread; it is never shared via `&` across threads. `Send` asserts only that
// transferring ownership is sound (so the bridge can live inside the `Send` `EglImporter`); the live
// handles are never touched off-thread, and `Sync` is deliberately NOT implied.
unsafe impl Send for VkBridge {}

impl VkBridge {
    /// Bring up Vulkan on the NVIDIA GPU with the external-memory extensions.
    pub fn new() -> Result<VkBridge> {
        // SAFETY: standard ash bring-up — every call is `unsafe` only because ash cannot statically
        // verify Vulkan handle/CreateInfo validity. `ash::Entry::load` dlopens a real system
        // libvulkan. Each `*CreateInfo`/`AllocateInfo` is built by ash's builders from locals (`app`,
        // `exts`, `prio`, `qci`, and the inline infos) that all live for the duration of the
        // synchronous `create_*`/`enumerate_*` call that reads them — in particular the
        // `enabled_extension_names(&exts)` and `queue_priorities(&prio)` borrows outlive their calls.
        // Every handle passed (`instance`, `phys`, `device`, `qf`, `cmd_pool`) was just created and
        // checked via `?`/`ok_or_else` in this same function, so no invalid handle is ever used. This
        // constructor shares nothing across threads.
        unsafe {
            let entry = ash::Entry::load().context("load libvulkan")?;
            let app = vk::ApplicationInfo::default().api_version(vk::API_VERSION_1_1);
            let instance = entry
                .create_instance(
                    &vk::InstanceCreateInfo::default().application_info(&app),
                    None,
                )
                .context("vkCreateInstance")?;

            // Pick the NVIDIA GPU (matches CUDA device 0 on this single-dGPU host).
            let phys = instance
                .enumerate_physical_devices()
                .context("enumerate GPUs")?
                .into_iter()
                .find(|&p| instance.get_physical_device_properties(p).vendor_id == 0x10DE)
                .ok_or_else(|| anyhow!("no NVIDIA Vulkan device"))?;
            let mem_props = instance.get_physical_device_memory_properties(phys);

            // Any queue family supporting transfer (graphics/compute imply it).
            let qf = instance
                .get_physical_device_queue_family_properties(phys)
                .iter()
                .position(|q| {
                    q.queue_flags.intersects(
                        vk::QueueFlags::TRANSFER
                            | vk::QueueFlags::GRAPHICS
                            | vk::QueueFlags::COMPUTE,
                    )
                })
                .ok_or_else(|| anyhow!("no transfer-capable queue family"))?
                as u32;

            let exts = [
                ash::khr::external_memory_fd::NAME.as_ptr(),
                ash::ext::external_memory_dma_buf::NAME.as_ptr(),
            ];
            let prio = [1.0f32];
            let qci = [vk::DeviceQueueCreateInfo::default()
                .queue_family_index(qf)
                .queue_priorities(&prio)];
            let device = instance
                .create_device(
                    phys,
                    &vk::DeviceCreateInfo::default()
                        .queue_create_infos(&qci)
                        .enabled_extension_names(&exts),
                    None,
                )
                .context("vkCreateDevice (external-memory extensions supported?)")?;
            let ext_fd = ash::khr::external_memory_fd::Device::new(&instance, &device);
            let queue = device.get_device_queue(qf, 0);

            let cmd_pool = device
                .create_command_pool(
                    &vk::CommandPoolCreateInfo::default()
                        .queue_family_index(qf)
                        .flags(vk::CommandPoolCreateFlags::RESET_COMMAND_BUFFER),
                    None,
                )
                .context("create command pool")?;
            let cmd = device
                .allocate_command_buffers(
                    &vk::CommandBufferAllocateInfo::default()
                        .command_pool(cmd_pool)
                        .level(vk::CommandBufferLevel::PRIMARY)
                        .command_buffer_count(1),
                )
                .context("allocate command buffer")?[0];
            let fence = device
                .create_fence(&vk::FenceCreateInfo::default(), None)
                .context("create fence")?;

            tracing::info!("Vulkan bridge ready (dmabuf import → OPAQUE_FD export → CUDA)");
            Ok(VkBridge {
                _entry: entry,
                instance,
                device,
                ext_fd,
                queue,
                cmd_pool,
                cmd,
                fence,
                mem_props,
                src_cache: HashMap::new(),
                dst: None,
            })
        }
    }

    fn memory_type(&self, type_bits: u32, flags: vk::MemoryPropertyFlags) -> Result<u32> {
        (0..self.mem_props.memory_type_count)
            .find(|&i| {
                type_bits & (1 << i) != 0
                    && self.mem_props.memory_types[i as usize]
                        .property_flags
                        .contains(flags)
            })
            .ok_or_else(|| anyhow!("no compatible Vulkan memory type"))
    }

    /// Import `fd` (dup'd internally; Vulkan owns the dup) as a transfer-src buffer of `size`.
    unsafe fn import_src(&mut self, fd: i32, size: u64) -> Result<()> {
        let dup = libc::dup(fd);
        if dup < 0 {
            bail!("dup(dmabuf fd)");
        }
        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
            .handle_types(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT);
        let buffer = self
            .device
            .create_buffer(
                &vk::BufferCreateInfo::default()
                    .size(size)
                    .usage(vk::BufferUsageFlags::TRANSFER_SRC)
                    .push_next(&mut ext_info),
                None,
            )
            .context("create import buffer")?;
        let mut fd_props = vk::MemoryFdPropertiesKHR::default();
        self.ext_fd
            .get_memory_fd_properties(
                vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT,
                dup,
                &mut fd_props,
            )
            .context("vkGetMemoryFdPropertiesKHR")?;
        let reqs = self.device.get_buffer_memory_requirements(buffer);
        let mem_type = self.memory_type(
            reqs.memory_type_bits & fd_props.memory_type_bits,
            vk::MemoryPropertyFlags::empty(),
        )?;
        let mut import = vk::ImportMemoryFdInfoKHR::default()
            .handle_type(vk::ExternalMemoryHandleTypeFlags::DMA_BUF_EXT)
            .fd(dup); // Vulkan takes ownership of `dup` on success
        let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
        let memory = self
            .device
            .allocate_memory(
                &vk::MemoryAllocateInfo::default()
                    .allocation_size(reqs.size.max(size))
                    .memory_type_index(mem_type)
                    .push_next(&mut import)
                    .push_next(&mut dedicated),
                None,
            )
            .map_err(|e| {
                libc::close(dup); // failed import does not consume the fd
                anyhow!("import dmabuf memory: {e}")
            })?;
        self.device
            .bind_buffer_memory(buffer, memory, 0)
            .context("bind import memory")?;
        self.src_cache.insert(
            fd,
            SrcBuf {
                buffer,
                memory,
                size,
            },
        );
        Ok(())
    }

    /// (Re)create the exportable destination of at least `size` bytes + its CUDA mapping.
    unsafe fn ensure_dst(&mut self, size: u64) -> Result<()> {
        if self.dst.as_ref().is_some_and(|d| d.size >= size) {
            return Ok(());
        }
        if let Some(old) = self.dst.take() {
            self.device.destroy_buffer(old.buffer, None);
            self.device.free_memory(old.memory, None);
            // old.cuda drops its mapping with it
        }
        let mut ext_info = vk::ExternalMemoryBufferCreateInfo::default()
            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
        let buffer = self
            .device
            .create_buffer(
                &vk::BufferCreateInfo::default()
                    .size(size)
                    .usage(vk::BufferUsageFlags::TRANSFER_DST)
                    .push_next(&mut ext_info),
                None,
            )
            .context("create export buffer")?;
        let reqs = self.device.get_buffer_memory_requirements(buffer);
        let mem_type =
            self.memory_type(reqs.memory_type_bits, vk::MemoryPropertyFlags::DEVICE_LOCAL)?;
        let mut export = vk::ExportMemoryAllocateInfo::default()
            .handle_types(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD);
        let mut dedicated = vk::MemoryDedicatedAllocateInfo::default().buffer(buffer);
        let memory = self
            .device
            .allocate_memory(
                &vk::MemoryAllocateInfo::default()
                    .allocation_size(reqs.size)
                    .memory_type_index(mem_type)
                    .push_next(&mut export)
                    .push_next(&mut dedicated),
                None,
            )
            .context("allocate exportable memory")?;
        self.device
            .bind_buffer_memory(buffer, memory, 0)
            .context("bind export memory")?;
        let opaque_fd = self
            .ext_fd
            .get_memory_fd(
                &vk::MemoryGetFdInfoKHR::default()
                    .memory(memory)
                    .handle_type(vk::ExternalMemoryHandleTypeFlags::OPAQUE_FD),
            )
            .context("vkGetMemoryFdKHR")?;
        // CUDA imports (and on success owns) the exported fd. Size must match the allocation.
        let cuda = cuda::ExternalDmabuf::import_owned_fd(opaque_fd, reqs.size)
            .context("cuImportExternalMemory(OPAQUE_FD from Vulkan)")?;
        tracing::info!(size, "Vulkan→CUDA exportable staging buffer ready");
        self.dst = Some(DstBuf {
            buffer,
            memory,
            size: reqs.size,
            cuda,
        });
        Ok(())
    }

    /// Bridge one LINEAR dmabuf frame into a pooled CUDA buffer: GPU copy dmabuf→exportable,
    /// then pitched CUDA copy exportable→`pool` buffer.
    pub fn import_linear(
        &mut self,
        fd: i32,
        offset: u32,
        stride: u32,
        height: u32,
        pool: &cuda::BufferPool,
    ) -> Result<DeviceBuffer> {
        // SAFETY: `fd` is the live dmabuf fd handed in by the caller (borrowed; `import_src` dup's it
        // internally and Vulkan owns the dup). `libc::lseek` only queries the fd's size. The unsafe
        // `import_src`/`ensure_dst` are called with a valid fd and a checked size. The bounds are
        // proven: `import_src` asserts `size >= span` (so the cached `src_size >= span`),
        // `copy_size = src_size.min(span)`, and `ensure_dst(copy_size)` makes `dst` at least
        // `copy_size` — so the GPU `cmd_copy_buffer` of `copy_size` bytes reads/writes within both
        // buffers, and the later CUDA pitched copy reading `[offset, span)` from `dst.cuda.ptr` (=
        // `offset + stride*height = span <= copy_size`) stays inside the freshly-copied region. The
        // `*Info`/`region`/`cmds`/`submit` are locals that outlive the synchronous calls reading them.
        // `cmd`/`queue`/`fence` are this bridge's own handles, used on this single thread only. The
        // host-side `wait_for_fences` fully retires the Vulkan copy BEFORE CUDA reads the shared
        // memory, so there is no GPU write/read data race. `dst` is an `&self.dst` shared borrow that
        // does not alias the `&self.device` calls.
        unsafe {
            let span = offset as u64 + stride as u64 * height as u64;
            if !self.src_cache.contains_key(&fd) {
                let size = libc::lseek(fd, 0, libc::SEEK_END);
                anyhow::ensure!(size > 0, "lseek(dmabuf)");
                anyhow::ensure!(size as u64 >= span, "dmabuf smaller than frame span");
                self.import_src(fd, size as u64)?;
            }
            let (src_buffer, src_size) = {
                let s = &self.src_cache[&fd];
                (s.buffer, s.size)
            };
            let copy_size = src_size.min(span);
            self.ensure_dst(copy_size)?;
            let dst = self.dst.as_ref().unwrap();

            // Record + submit the GPU copy, wait on the fence (GPU-GPU, sub-millisecond).
            self.device
                .begin_command_buffer(
                    self.cmd,
                    &vk::CommandBufferBeginInfo::default()
                        .flags(vk::CommandBufferUsageFlags::ONE_TIME_SUBMIT),
                )
                .context("begin cmd")?;
            let region = vk::BufferCopy::default().size(copy_size);
            self.device
                .cmd_copy_buffer(self.cmd, src_buffer, dst.buffer, &[region]);
            self.device
                .end_command_buffer(self.cmd)
                .context("end cmd")?;
            let cmds = [self.cmd];
            let submit = vk::SubmitInfo::default().command_buffers(&cmds);
            self.device
                .queue_submit(self.queue, &[submit], self.fence)
                .context("queue submit")?;
            self.device
                .wait_for_fences(&[self.fence], true, 1_000_000_000)
                .context("fence wait")?;
            self.device
                .reset_fences(&[self.fence])
                .context("reset fence")?;

            // De-stride from the CUDA view of the exportable memory into a pooled buffer.
            cuda::make_current()?;
            let out = pool.get()?;
            cuda::copy_pitched_to_buffer(dst.cuda.ptr + offset as u64, stride as usize, &out)?;
            Ok(out)
        }
    }
}

impl Drop for VkBridge {
    fn drop(&mut self) {
        // SAFETY: runs once when the bridge is dropped on its owning capture thread.
        // `device_wait_idle` first drains all in-flight GPU work, so no queued command still
        // references these objects. Every handle freed (the `src_cache` buffers+memories, the `dst`
        // buffer+memory, `fence`, `cmd_pool`, `device`, `instance`) was created by this `VkBridge`
        // and owned exclusively by it, so each `destroy_*`/`free_*` runs exactly once with no
        // double-free, in dependency order (child objects before `device`, `device` before
        // `instance`). `dst.cuda` is dropped after `free_memory`, which is safe because CUDA holds
        // its own dup'd OPAQUE_FD reference to the underlying allocation. No other thread touches
        // these handles.
        unsafe {
            let _ = self.device.device_wait_idle();
            for (_, s) in self.src_cache.drain() {
                self.device.destroy_buffer(s.buffer, None);
                self.device.free_memory(s.memory, None);
            }
            if let Some(d) = self.dst.take() {
                self.device.destroy_buffer(d.buffer, None);
                self.device.free_memory(d.memory, None);
            }
            self.device.destroy_fence(self.fence, None);
            self.device.destroy_command_pool(self.cmd_pool, None);
            self.device.destroy_device(None);
            self.instance.destroy_instance(None);
        }
    }
}