perf: M2 — amortize per-frame zero-copy overhead (pool buffers + register once)
The zero-copy import did real per-frame GPU churn that capped high-fps throughput: a fresh ~29MB cuMemAllocPitch + cuMemFree, a cuGraphicsGLRegisterImage/unregister, and a map of the *same* persistent blit texture — every frame.

Two fixes:

- BufferPool: a recycled free-list of pitched device buffers per resolution. DeviceBuffer returns its allocation to the pool on drop (after the encoder has synchronized) instead of freeing it — this kills the per-frame 29MB alloc/free that took the device allocator lock and serialized against the GPU.
- RegisteredTexture: register the (reused) GL_RGBA8 blit destination with CUDA ONCE when the GlBlit is built; each frame only maps → copies the array → unmaps, instead of registering/unregistering every frame.

This is the "zero-copy should be overhead-free" cleanup. Verified that the import still produces correct frames; the remaining per-frame cuCtxSynchronize pair (shared-context coupling) is the next step (CUDA stream + events).

lumen-host builds, clippy/fmt/tests clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -12,7 +12,7 @@
|
|||||||
|
|
||||||
use anyhow::{bail, Result};
|
use anyhow::{bail, Result};
|
||||||
use std::os::raw::{c_int, c_uint, c_void};
|
use std::os::raw::{c_int, c_uint, c_void};
|
||||||
use std::sync::OnceLock;
|
use std::sync::{Arc, Mutex, OnceLock};
|
||||||
|
|
||||||
pub type CUresult = c_uint; // CUDA_SUCCESS == 0
|
pub type CUresult = c_uint; // CUDA_SUCCESS == 0
|
||||||
pub type CUdevice = c_int;
|
pub type CUdevice = c_int;
|
||||||
@@ -134,45 +134,121 @@ pub fn make_current() -> Result<()> {
|
|||||||
unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") }
|
unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A device buffer we own (pitched), freed on drop. Used as the zero-copy frame the encoder
|
/// Allocate one pitched device buffer for `width`x`height` 4-byte pixels; returns `(ptr, pitch)`.
|
||||||
/// reads — filled by a device-to-device copy from the EGL-mapped dmabuf so the dmabuf can be
|
fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
|
||||||
/// returned to the compositor immediately.
|
let mut ptr: CUdeviceptr = 0;
|
||||||
|
let mut pitch: usize = 0;
|
||||||
|
unsafe {
|
||||||
|
ck(
|
||||||
|
cuMemAllocPitch_v2(
|
||||||
|
&mut ptr,
|
||||||
|
&mut pitch,
|
||||||
|
width as usize * 4,
|
||||||
|
height as usize,
|
||||||
|
16,
|
||||||
|
),
|
||||||
|
"cuMemAllocPitch_v2",
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
Ok((ptr, pitch))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Free-list of recycled device allocations for one resolution. Shared (via `Arc`) between the
|
||||||
|
/// capture thread that hands out buffers and the encode thread where a [`DeviceBuffer`] drops and
|
||||||
|
/// returns its allocation here. Bulk-freed when the last reference drops.
|
||||||
|
struct PoolInner {
|
||||||
|
free: Vec<CUdeviceptr>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for PoolInner {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
unsafe {
|
||||||
|
if let Some(c) = CONTEXT.get() {
|
||||||
|
let _ = cuCtxSetCurrent(c.0);
|
||||||
|
}
|
||||||
|
for &p in &self.free {
|
||||||
|
let _ = cuMemFree_v2(p);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A pool of reusable pitched device buffers for a fixed resolution. Eliminates the per-frame
|
||||||
|
/// `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K) that takes the device allocator lock
|
||||||
|
/// and serializes against the GPU every frame.
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct BufferPool {
|
||||||
|
inner: Arc<Mutex<PoolInner>>,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
pitch: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BufferPool {
|
||||||
|
/// Create a pool for `width`x`height` 4-byte buffers (allocates one up front to learn the
|
||||||
|
/// driver's pitch, which is constant for a given width).
|
||||||
|
pub fn new(width: u32, height: u32) -> Result<BufferPool> {
|
||||||
|
let (ptr, pitch) = alloc_pitched(width, height)?;
|
||||||
|
Ok(BufferPool {
|
||||||
|
inner: Arc::new(Mutex::new(PoolInner { free: vec![ptr] })),
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
pitch,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Take a buffer — recycled if one is free, else freshly allocated. The buffer returns to this
|
||||||
|
/// pool when dropped (after the consumer has synchronized, so the GPU is done with it).
|
||||||
|
pub fn get(&self) -> Result<DeviceBuffer> {
|
||||||
|
let reuse = self.inner.lock().unwrap().free.pop();
|
||||||
|
let ptr = match reuse {
|
||||||
|
Some(p) => p,
|
||||||
|
None => alloc_pitched(self.width, self.height)?.0,
|
||||||
|
};
|
||||||
|
Ok(DeviceBuffer {
|
||||||
|
ptr,
|
||||||
|
pitch: self.pitch,
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
pool: Some(self.inner.clone()),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A pitched device buffer holding one captured frame. Filled by a copy from the EGL-mapped
|
||||||
|
/// dmabuf (so the dmabuf can be returned to the compositor immediately) and read by the encoder.
|
||||||
|
/// When it came from a [`BufferPool`] it recycles on drop; otherwise it frees.
|
||||||
pub struct DeviceBuffer {
|
pub struct DeviceBuffer {
|
||||||
pub ptr: CUdeviceptr,
|
pub ptr: CUdeviceptr,
|
||||||
pub pitch: usize,
|
pub pitch: usize,
|
||||||
pub width: u32,
|
pub width: u32,
|
||||||
pub height: u32,
|
pub height: u32,
|
||||||
|
pool: Option<Arc<Mutex<PoolInner>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DeviceBuffer {
|
impl DeviceBuffer {
|
||||||
/// Allocate a pitched device buffer for `width`x`height` 4-byte (BGRA) pixels.
|
/// Allocate a standalone (un-pooled) pitched buffer. Prefer [`BufferPool`] on the hot path.
|
||||||
pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
|
pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
|
||||||
let mut ptr: CUdeviceptr = 0;
|
let (ptr, pitch) = alloc_pitched(width, height)?;
|
||||||
let mut pitch: usize = 0;
|
|
||||||
unsafe {
|
|
||||||
ck(
|
|
||||||
cuMemAllocPitch_v2(
|
|
||||||
&mut ptr,
|
|
||||||
&mut pitch,
|
|
||||||
width as usize * 4,
|
|
||||||
height as usize,
|
|
||||||
16,
|
|
||||||
),
|
|
||||||
"cuMemAllocPitch_v2",
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
Ok(DeviceBuffer {
|
Ok(DeviceBuffer {
|
||||||
ptr,
|
ptr,
|
||||||
pitch,
|
pitch,
|
||||||
width,
|
width,
|
||||||
height,
|
height,
|
||||||
|
pool: None,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for DeviceBuffer {
|
impl Drop for DeviceBuffer {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if self.ptr != 0 {
|
if self.ptr == 0 {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if let Some(pool) = &self.pool {
|
||||||
|
// Recycle (the consumer synchronized before dropping, so the GPU is done with it).
|
||||||
|
pool.lock().unwrap().free.push(self.ptr);
|
||||||
|
} else {
|
||||||
// The buffer may be freed on the encode thread; cuMemFree needs a current context.
|
// The buffer may be freed on the encode thread; cuMemFree needs a current context.
|
||||||
unsafe {
|
unsafe {
|
||||||
if let Some(c) = CONTEXT.get() {
|
if let Some(c) = CONTEXT.get() {
|
||||||
@@ -184,22 +260,22 @@ impl Drop for DeviceBuffer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A live GL-texture→CUDA registration (mapped). The CUDA array aliases the texture/dmabuf, so
|
/// A *persistent* GL-texture→CUDA registration. The desktop NVIDIA driver only supports CUDA
|
||||||
/// we copy out of it immediately; unmap + unregister happen on drop.
|
/// interop through GL textures (not dmabuf EGLImages directly), so the importer renders the
|
||||||
pub struct MappedTexture {
|
/// dmabuf into a reusable `GL_RGBA8` texture and registers *that* once — then each frame only
|
||||||
|
/// maps → copies the mapped array out → unmaps (the map/unmap pair is the GL↔CUDA sync point),
|
||||||
|
/// instead of registering/unregistering every frame. Unregisters on drop.
|
||||||
|
pub struct RegisteredTexture {
|
||||||
resource: CUgraphicsResource,
|
resource: CUgraphicsResource,
|
||||||
array: CUarray,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MappedTexture {
|
impl RegisteredTexture {
|
||||||
/// Register a `GL_TEXTURE_2D` texture with CUDA, map it, and get its array. The desktop
|
/// Register a `GL_TEXTURE_2D` once.
|
||||||
/// NVIDIA driver only supports CUDA interop through GL textures (not dmabuf EGLImages
|
|
||||||
/// directly), so the EGLImage is first bound to a GL texture by the caller.
|
|
||||||
///
|
///
|
||||||
/// # Safety
|
/// # Safety
|
||||||
/// The GL context and the shared CUDA context must both be current on this thread, and
|
/// The GL context and the shared CUDA context must both be current on this thread, and
|
||||||
/// `texture` must be a valid `GL_TEXTURE_2D` bound to the source image.
|
/// `texture` must be a valid `GL_TEXTURE_2D`.
|
||||||
pub unsafe fn register_gl(texture: u32) -> Result<MappedTexture> {
|
pub unsafe fn register_gl(texture: u32) -> Result<RegisteredTexture> {
|
||||||
const GL_TEXTURE_2D: c_uint = 0x0DE1;
|
const GL_TEXTURE_2D: c_uint = 0x0DE1;
|
||||||
const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
|
const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
|
||||||
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
let mut resource: CUgraphicsResource = std::ptr::null_mut();
|
||||||
@@ -212,37 +288,37 @@ impl MappedTexture {
|
|||||||
),
|
),
|
||||||
"cuGraphicsGLRegisterImage",
|
"cuGraphicsGLRegisterImage",
|
||||||
)?;
|
)?;
|
||||||
if cuGraphicsMapResources(1, &mut resource, std::ptr::null_mut()) != 0 {
|
Ok(RegisteredTexture { resource })
|
||||||
let _ = cuGraphicsUnregisterResource(resource);
|
|
||||||
bail!("cuGraphicsMapResources failed");
|
|
||||||
}
|
|
||||||
let mut array: CUarray = std::ptr::null_mut();
|
|
||||||
if cuGraphicsSubResourceGetMappedArray(&mut array, resource, 0, 0) != 0 {
|
|
||||||
let _ = cuGraphicsUnmapResources(1, &mut resource, std::ptr::null_mut());
|
|
||||||
let _ = cuGraphicsUnregisterResource(resource);
|
|
||||||
bail!("cuGraphicsSubResourceGetMappedArray failed");
|
|
||||||
}
|
|
||||||
Ok(MappedTexture { resource, array })
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy the mapped array into `dst` (array → pitched device memory). The array is the GL
|
/// Map the texture for this frame, copy its (already-linear RGBA8) array into `dst`, then
|
||||||
/// blit's already-linear RGBA8 output, so this is a straight copy. After it returns the
|
/// unmap. The `cuCtxSynchronize` ensures `dst` is ready before the source dmabuf is recycled.
|
||||||
/// source dmabuf is no longer needed.
|
pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> {
|
||||||
pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> {
|
|
||||||
let copy = CUDA_MEMCPY2D {
|
|
||||||
srcMemoryType: CU_MEMORYTYPE_ARRAY,
|
|
||||||
srcArray: self.array,
|
|
||||||
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
|
||||||
dstDevice: dst.ptr,
|
|
||||||
dstPitch: dst.pitch,
|
|
||||||
WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
|
|
||||||
Height: dst.height as usize,
|
|
||||||
..Default::default()
|
|
||||||
};
|
|
||||||
unsafe {
|
unsafe {
|
||||||
ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?;
|
ck(
|
||||||
// The copy must complete before the dmabuf is requeued / reused.
|
cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut()),
|
||||||
ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
|
"cuGraphicsMapResources",
|
||||||
|
)?;
|
||||||
|
let mut array: CUarray = std::ptr::null_mut();
|
||||||
|
if cuGraphicsSubResourceGetMappedArray(&mut array, self.resource, 0, 0) != 0 {
|
||||||
|
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
||||||
|
bail!("cuGraphicsSubResourceGetMappedArray failed");
|
||||||
|
}
|
||||||
|
let copy = CUDA_MEMCPY2D {
|
||||||
|
srcMemoryType: CU_MEMORYTYPE_ARRAY,
|
||||||
|
srcArray: array,
|
||||||
|
dstMemoryType: CU_MEMORYTYPE_DEVICE,
|
||||||
|
dstDevice: dst.ptr,
|
||||||
|
dstPitch: dst.pitch,
|
||||||
|
WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
|
||||||
|
Height: dst.height as usize,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let r = cuMemcpy2D_v2(&copy);
|
||||||
|
let s = cuCtxSynchronize();
|
||||||
|
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
||||||
|
ck(r, "cuMemcpy2D_v2")?;
|
||||||
|
ck(s, "cuCtxSynchronize")?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -274,11 +350,10 @@ pub fn copy_device_to_device(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for MappedTexture {
|
impl Drop for RegisteredTexture {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
if !self.resource.is_null() {
|
if !self.resource.is_null() {
|
||||||
unsafe {
|
unsafe {
|
||||||
let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
|
|
||||||
let _ = cuGraphicsUnregisterResource(self.resource);
|
let _ = cuGraphicsUnregisterResource(self.resource);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
#![allow(non_upper_case_globals)]
|
#![allow(non_upper_case_globals)]
|
||||||
|
|
||||||
use super::cuda::{self, DeviceBuffer, MappedTexture};
|
use super::cuda::{self, DeviceBuffer};
|
||||||
use anyhow::{bail, ensure, Context as _, Result};
|
use anyhow::{bail, ensure, Context as _, Result};
|
||||||
use khronos_egl as egl;
|
use khronos_egl as egl;
|
||||||
use std::os::raw::{c_int, c_void};
|
use std::os::raw::{c_int, c_void};
|
||||||
@@ -145,6 +145,10 @@ struct GlBlit {
|
|||||||
src_tex: u32,
|
src_tex: u32,
|
||||||
width: u32,
|
width: u32,
|
||||||
height: u32,
|
height: u32,
|
||||||
|
/// `dst_tex` registered with CUDA once (not per frame); mapped+copied each frame.
|
||||||
|
registered: cuda::RegisteredTexture,
|
||||||
|
/// Recycled CUDA device buffers (the imported frames handed to the encoder).
|
||||||
|
pool: cuda::BufferPool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl GlBlit {
|
impl GlBlit {
|
||||||
@@ -183,6 +187,11 @@ impl GlBlit {
|
|||||||
status == GL_FRAMEBUFFER_COMPLETE,
|
status == GL_FRAMEBUFFER_COMPLETE,
|
||||||
"blit FBO incomplete ({status:#x})"
|
"blit FBO incomplete ({status:#x})"
|
||||||
);
|
);
|
||||||
|
// Register the (immutable, reused) destination texture with CUDA once, and stand up the
|
||||||
|
// device-buffer pool — both per-resolution, not per-frame. Requires the CUDA context to be
|
||||||
|
// current (the caller makes it current before constructing the blit).
|
||||||
|
let registered = cuda::RegisteredTexture::register_gl(dst_tex)?;
|
||||||
|
let pool = cuda::BufferPool::new(width, height)?;
|
||||||
Ok(GlBlit {
|
Ok(GlBlit {
|
||||||
program,
|
program,
|
||||||
vao,
|
vao,
|
||||||
@@ -191,6 +200,8 @@ impl GlBlit {
|
|||||||
src_tex,
|
src_tex,
|
||||||
width,
|
width,
|
||||||
height,
|
height,
|
||||||
|
registered,
|
||||||
|
pool,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -462,12 +473,14 @@ impl EglImporter {
|
|||||||
if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
|
||||||
self.blit = Some(unsafe { GlBlit::new(width, height)? });
|
self.blit = Some(unsafe { GlBlit::new(width, height)? });
|
||||||
}
|
}
|
||||||
let blit = self.blit.as_ref().unwrap();
|
let egl_image_target = self.egl_image_target;
|
||||||
|
let blit = self.blit.as_mut().unwrap();
|
||||||
// SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
|
// SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
|
||||||
unsafe { blit.run(self.egl_image_target, image)? };
|
unsafe { blit.run(egl_image_target, image)? };
|
||||||
let mapped = unsafe { MappedTexture::register_gl(blit.dst_tex)? };
|
// Persistent registration (mapped per frame) + a pooled buffer — no per-frame
|
||||||
let dst = DeviceBuffer::alloc(width, height)?;
|
// cuGraphicsGLRegisterImage / cuMemAllocPitch.
|
||||||
mapped.copy_to(&dst)?;
|
let dst = blit.pool.get()?;
|
||||||
|
blit.registered.copy_mapped_to(&dst)?;
|
||||||
Ok(dst)
|
Ok(dst)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user