diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs index 13da31e..5867dfe 100644 --- a/crates/lumen-host/src/zerocopy/cuda.rs +++ b/crates/lumen-host/src/zerocopy/cuda.rs @@ -12,7 +12,7 @@ use anyhow::{bail, Result}; use std::os::raw::{c_int, c_uint, c_void}; -use std::sync::OnceLock; +use std::sync::{Arc, Mutex, OnceLock}; pub type CUresult = c_uint; // CUDA_SUCCESS == 0 pub type CUdevice = c_int; @@ -134,45 +134,121 @@ pub fn make_current() -> Result<()> { unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") } } -/// A device buffer we own (pitched), freed on drop. Used as the zero-copy frame the encoder -/// reads — filled by a device-to-device copy from the EGL-mapped dmabuf so the dmabuf can be -/// returned to the compositor immediately. +/// Allocate one pitched device buffer for `width`x`height` 4-byte pixels; returns `(ptr, pitch)`. +fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> { + let mut ptr: CUdeviceptr = 0; + let mut pitch: usize = 0; + unsafe { + ck( + cuMemAllocPitch_v2( + &mut ptr, + &mut pitch, + width as usize * 4, + height as usize, + 16, + ), + "cuMemAllocPitch_v2", + )?; + } + Ok((ptr, pitch)) +} + +/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the +/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and +/// returns its allocation here. Bulk-freed when the last reference drops. +struct PoolInner { + free: Vec<CUdeviceptr>, +} + +impl Drop for PoolInner { + fn drop(&mut self) { + unsafe { + if let Some(c) = CONTEXT.get() { + let _ = cuCtxSetCurrent(c.0); + } + for &p in &self.free { + let _ = cuMemFree_v2(p); + } + } + } +} + +/// A pool of reusable pitched device buffers for a fixed resolution. Eliminates the per-frame +/// `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K) that takes the device allocator lock +/// and serializes against the GPU every frame. 
+#[derive(Clone)] +pub struct BufferPool { + inner: Arc<Mutex<PoolInner>>, + width: u32, + height: u32, + pitch: usize, +} + +impl BufferPool { + /// Create a pool for `width`x`height` 4-byte buffers (allocates one up front to learn the + /// driver's pitch, which is constant for a given width). + pub fn new(width: u32, height: u32) -> Result<Self> { + let (ptr, pitch) = alloc_pitched(width, height)?; + Ok(BufferPool { + inner: Arc::new(Mutex::new(PoolInner { free: vec![ptr] })), + width, + height, + pitch, + }) + } + + /// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this + /// pool when dropped (after the consumer has synchronized, so the GPU is done with it). + pub fn get(&self) -> Result<DeviceBuffer> { + let reuse = self.inner.lock().unwrap().free.pop(); + let ptr = match reuse { + Some(p) => p, + None => alloc_pitched(self.width, self.height)?.0, + }; + Ok(DeviceBuffer { + ptr, + pitch: self.pitch, + width: self.width, + height: self.height, + pool: Some(self.inner.clone()), + }) + } +} + +/// A pitched device buffer holding one captured frame. Filled by a copy from the EGL-mapped +/// dmabuf (so the dmabuf can be returned to the compositor immediately) and read by the encoder. +/// When it came from a [`BufferPool`] it recycles on drop; otherwise it frees. pub struct DeviceBuffer { pub ptr: CUdeviceptr, pub pitch: usize, pub width: u32, pub height: u32, + pool: Option<Arc<Mutex<PoolInner>>>, } impl DeviceBuffer { - /// Allocate a pitched device buffer for `width`x`height` 4-byte (BGRA) pixels. + /// Allocate a standalone (un-pooled) pitched buffer. Prefer [`BufferPool`] on the hot path. 
 pub fn alloc(width: u32, height: u32) -> Result<Self> { - let mut ptr: CUdeviceptr = 0; - let mut pitch: usize = 0; - unsafe { - ck( - cuMemAllocPitch_v2( - &mut ptr, - &mut pitch, - width as usize * 4, - height as usize, - 16, - ), - "cuMemAllocPitch_v2", - )?; - } + let (ptr, pitch) = alloc_pitched(width, height)?; Ok(DeviceBuffer { ptr, pitch, width, height, + pool: None, }) } } impl Drop for DeviceBuffer { fn drop(&mut self) { - if self.ptr != 0 { + if self.ptr == 0 { + return; + } + if let Some(pool) = &self.pool { + // Recycle (the consumer synchronized before dropping, so the GPU is done with it). + pool.lock().unwrap().free.push(self.ptr); + } else { // The buffer may be freed on the encode thread; cuMemFree needs a current context. unsafe { if let Some(c) = CONTEXT.get() { @@ -184,22 +260,22 @@ impl Drop for DeviceBuffer { } } -/// A live GL-texture→CUDA registration (mapped). The CUDA array aliases the texture/dmabuf, so -/// we copy out of it immediately; unmap + unregister happen on drop. -pub struct MappedTexture { +/// A *persistent* GL-texture→CUDA registration. The desktop NVIDIA driver only supports CUDA +/// interop through GL textures (not dmabuf EGLImages directly), so the importer renders the +/// dmabuf into a reusable `GL_RGBA8` texture and registers *that* once — then each frame only +/// maps → copies the mapped array out → unmaps (the map/unmap pair is the GL↔CUDA sync point), +/// instead of registering/unregistering every frame. Unregisters on drop. +pub struct RegisteredTexture { resource: CUgraphicsResource, - array: CUarray, } -impl MappedTexture { - /// Register a `GL_TEXTURE_2D` texture with CUDA, map it, and get its array. The desktop - /// NVIDIA driver only supports CUDA interop through GL textures (not dmabuf EGLImages - /// directly), so the EGLImage is first bound to a GL texture by the caller. +impl RegisteredTexture { + /// Register a `GL_TEXTURE_2D` once. 
 /// /// # Safety /// The GL context and the shared CUDA context must both be current on this thread, and - /// `texture` must be a valid `GL_TEXTURE_2D` bound to the source image. - pub unsafe fn register_gl(texture: u32) -> Result<Self> { + /// `texture` must be a valid `GL_TEXTURE_2D`. + pub unsafe fn register_gl(texture: u32) -> Result<Self> { const GL_TEXTURE_2D: c_uint = 0x0DE1; const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01; let mut resource: CUgraphicsResource = std::ptr::null_mut(); @@ -212,37 +288,37 @@ impl MappedTexture { ), "cuGraphicsGLRegisterImage", )?; - if cuGraphicsMapResources(1, &mut resource, std::ptr::null_mut()) != 0 { - let _ = cuGraphicsUnregisterResource(resource); - bail!("cuGraphicsMapResources failed"); - } - let mut array: CUarray = std::ptr::null_mut(); - if cuGraphicsSubResourceGetMappedArray(&mut array, resource, 0, 0) != 0 { - let _ = cuGraphicsUnmapResources(1, &mut resource, std::ptr::null_mut()); - let _ = cuGraphicsUnregisterResource(resource); - bail!("cuGraphicsSubResourceGetMappedArray failed"); - } - Ok(MappedTexture { resource, array }) + Ok(RegisteredTexture { resource }) } - /// Copy the mapped array into `dst` (array → pitched device memory). The array is the GL - /// blit's already-linear RGBA8 output, so this is a straight copy. After it returns the - /// source dmabuf is no longer needed. - pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> { - let copy = CUDA_MEMCPY2D { - srcMemoryType: CU_MEMORYTYPE_ARRAY, - srcArray: self.array, - dstMemoryType: CU_MEMORYTYPE_DEVICE, - dstDevice: dst.ptr, - dstPitch: dst.pitch, - WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx) - Height: dst.height as usize, - ..Default::default() - }; + /// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then + /// unmap. The `cuCtxSynchronize` ensures `dst` is ready before the source dmabuf is recycled. 
+ pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> { unsafe { - ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?; - // The copy must complete before the dmabuf is requeued / reused. - ck(cuCtxSynchronize(), "cuCtxSynchronize")?; + ck( + cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()), + "cuGraphicsMapResources", + )?; + let mut array: CUarray = std::ptr::null_mut(); + if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 { + let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); + bail!("cuGraphicsSubResourceGetMappedArray failed"); + } + let copy = CUDA_MEMCPY2D { + srcMemoryType: CU_MEMORYTYPE_ARRAY, + srcArray: array, + dstMemoryType: CU_MEMORYTYPE_DEVICE, + dstDevice: dst.ptr, + dstPitch: dst.pitch, + WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx) + Height: dst.height as usize, + ..Default::default() + }; + let r = cuMemcpy2D_v2(&copy); + let s = cuCtxSynchronize(); + let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); + ck(r, "cuMemcpy2D_v2")?; + ck(s, "cuCtxSynchronize")?; } Ok(()) } @@ -274,11 +350,10 @@ pub fn copy_device_to_device( Ok(()) } -impl Drop for MappedTexture { +impl Drop for RegisteredTexture { fn drop(&mut self) { if !self.resource.is_null() { unsafe { - let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut()); let _ = cuGraphicsUnregisterResource(self.resource); } } diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs index d6f5814..23c424d 100644 --- a/crates/lumen-host/src/zerocopy/egl.rs +++ b/crates/lumen-host/src/zerocopy/egl.rs @@ -13,7 +13,7 @@ #![allow(non_upper_case_globals)] -use super::cuda::{self, DeviceBuffer, MappedTexture}; +use super::cuda::{self, DeviceBuffer}; use anyhow::{bail, ensure, Context as _, Result}; use khronos_egl as egl; use std::os::raw::{c_int, c_void}; @@ -145,6 +145,10 @@ struct GlBlit { src_tex: u32, width: u32, height: u32, + /// `dst_tex` 
registered with CUDA once (not per frame); mapped+copied each frame. + registered: cuda::RegisteredTexture, + /// Recycled CUDA device buffers (the imported frames handed to the encoder). + pool: cuda::BufferPool, } impl GlBlit { @@ -183,6 +187,11 @@ impl GlBlit { status == GL_FRAMEBUFFER_COMPLETE, "blit FBO incomplete ({status:#x})" ); + // Register the (immutable, reused) destination texture with CUDA once, and stand up the + // device-buffer pool — both per-resolution, not per-frame. Requires the CUDA context to be + // current (the caller makes it current before constructing the blit). + let registered = cuda::RegisteredTexture::register_gl(dst_tex)?; + let pool = cuda::BufferPool::new(width, height)?; Ok(GlBlit { program, vao, @@ -191,6 +200,8 @@ impl GlBlit { src_tex, width, height, + registered, + pool, }) } @@ -462,12 +473,14 @@ impl EglImporter { if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) { self.blit = Some(unsafe { GlBlit::new(width, height)? }); } - let blit = self.blit.as_ref().unwrap(); + let egl_image_target = self.egl_image_target; + let blit = self.blit.as_mut().unwrap(); // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage. - unsafe { blit.run(self.egl_image_target, image)? }; - let mapped = unsafe { MappedTexture::register_gl(blit.dst_tex)? }; - let dst = DeviceBuffer::alloc(width, height)?; - mapped.copy_to(&dst)?; + unsafe { blit.run(egl_image_target, image)? }; + // Persistent registration (mapped per frame) + a pooled buffer — no per-frame + // cuGraphicsGLRegisterImage / cuMemAllocPitch. + let dst = blit.pool.get()?; + blit.registered.copy_mapped_to(&dst)?; Ok(dst) } }