diff --git a/crates/lumen-host/src/zerocopy/cuda.rs b/crates/lumen-host/src/zerocopy/cuda.rs
index 13da31e..5867dfe 100644
--- a/crates/lumen-host/src/zerocopy/cuda.rs
+++ b/crates/lumen-host/src/zerocopy/cuda.rs
@@ -12,7 +12,7 @@
 
 use anyhow::{bail, Result};
 use std::os::raw::{c_int, c_uint, c_void};
-use std::sync::OnceLock;
+use std::sync::{Arc, Mutex, OnceLock};
 
 pub type CUresult = c_uint; // CUDA_SUCCESS == 0
 pub type CUdevice = c_int;
@@ -134,45 +134,121 @@ pub fn make_current() -> Result<()> {
     unsafe { ck(cuCtxSetCurrent(ctx), "cuCtxSetCurrent") }
 }
 
-/// A device buffer we own (pitched), freed on drop. Used as the zero-copy frame the encoder
-/// reads — filled by a device-to-device copy from the EGL-mapped dmabuf so the dmabuf can be
-/// returned to the compositor immediately.
+/// Allocates one pitched device buffer sized for `width`x`height` pixels at 4 bytes per pixel.
+///
+/// Returns `(ptr, pitch)`: the device pointer and the row pitch reported by the driver call.
+///
+/// # Errors
+/// Propagates the error `ck` produces for the status returned by `cuMemAllocPitch_v2`.
+fn alloc_pitched(width: u32, height: u32) -> Result<(CUdeviceptr, usize)> {
+    let row_bytes = width as usize * 4;
+    let rows = height as usize;
+    let mut dev_ptr: CUdeviceptr = 0;
+    let mut row_pitch: usize = 0;
+    // SAFETY: both out-parameters point at locals that outlive the driver call.
+    unsafe {
+        let status = cuMemAllocPitch_v2(&mut dev_ptr, &mut row_pitch, row_bytes, rows, 16);
+        ck(status, "cuMemAllocPitch_v2")?;
+    }
+    Ok((dev_ptr, row_pitch))
+}
+
+/// Free-list of recycled device allocations for one resolution, kept behind `Arc<Mutex<_>>` so
+/// the capture thread handing out buffers and the encode thread dropping a [`DeviceBuffer`] can
+/// both reach it. Anything still parked here is freed when the last reference drops.
+struct PoolInner {
+    free: Vec<CUdeviceptr>,
+}
+
+impl Drop for PoolInner {
+    fn drop(&mut self) {
+        // Best effort: statuses from the driver calls are ignored since `drop` cannot fail.
+        unsafe {
+            // `cuMemFree_v2` needs a current context, and this may run on the encode thread.
+            let _ = CONTEXT.get().map(|shared_ctx| cuCtxSetCurrent(shared_ctx.0));
+            for dev_ptr in self.free.iter().copied() {
+                let _ = cuMemFree_v2(dev_ptr);
+            }
+        }
+    }
+}
+
+/// Pool of reusable pitched device buffers, all of one fixed resolution. Recycling them avoids a
+/// per-frame `cuMemAllocPitch`/`cuMemFree` (a ~29 MB allocation at 5K), which takes the device
+/// allocator lock and serializes against the GPU. Clones share one free-list through `inner`.
+#[derive(Clone)]
+pub struct BufferPool {
+    inner: Arc<Mutex<PoolInner>>,
+    width: u32,
+    height: u32,
+    pitch: usize,
+}
+
+impl BufferPool {
+    /// Build a pool of `width`x`height` 4-byte-pixel buffers. One buffer is allocated eagerly so
+    /// the driver's pitch (taken to be constant for a given width) is known; it seeds the pool.
+    pub fn new(width: u32, height: u32) -> Result<BufferPool> {
+        let (seed, pitch) = alloc_pitched(width, height)?;
+        Ok(Self {
+            inner: Arc::new(Mutex::new(PoolInner { free: vec![seed] })),
+            width,
+            height,
+            pitch,
+        })
+    }
+
+    /// Hand out a buffer — a recycled one if the free-list has any, otherwise a fresh allocation.
+    /// It parks itself back here on drop (the consumer is expected to have synchronized by then).
+    pub fn get(&self) -> Result<DeviceBuffer> {
+        let parked = self.inner.lock().unwrap().free.pop();
+        let ptr = match parked {
+            Some(recycled) => recycled,
+            None => alloc_pitched(self.width, self.height).map(|(fresh, _pitch)| fresh)?,
+        };
+        Ok(DeviceBuffer {
+            ptr,
+            pitch: self.pitch,
+            width: self.width,
+            height: self.height,
+            pool: Some(Arc::clone(&self.inner)),
+        })
+    }
+}
+
+/// One captured frame in pitched device memory: filled by a copy of the imported dmabuf's pixels
+/// (so the dmabuf can return to the compositor immediately) and later read by the encoder.
 pub struct DeviceBuffer {
     pub ptr: CUdeviceptr,
     pub pitch: usize,
     pub width: u32,
     pub height: u32,
+    /// Free-list this allocation returns to on drop; `None` means it is freed instead.
+    pool: Option<Arc<Mutex<PoolInner>>>,
 }
 
 impl DeviceBuffer {
-    /// Allocate a pitched device buffer for `width`x`height` 4-byte (BGRA) pixels.
+    /// Allocate a standalone (un-pooled) pitched buffer. Prefer [`BufferPool`] on the hot path.
     pub fn alloc(width: u32, height: u32) -> Result<DeviceBuffer> {
-        let mut ptr: CUdeviceptr = 0;
-        let mut pitch: usize = 0;
-        unsafe {
-            ck(
-                cuMemAllocPitch_v2(
-                    &mut ptr,
-                    &mut pitch,
-                    width as usize * 4,
-                    height as usize,
-                    16,
-                ),
-                "cuMemAllocPitch_v2",
-            )?;
-        }
+        let (ptr, pitch) = alloc_pitched(width, height)?;
         Ok(DeviceBuffer {
             ptr,
             pitch,
             width,
             height,
+            pool: None,
         })
     }
 }
 
 impl Drop for DeviceBuffer {
     fn drop(&mut self) {
-        if self.ptr != 0 {
+        if self.ptr == 0 {
+            return;
+        }
+        if let Some(pool) = &self.pool {
+            // Recycle (the consumer synchronized before dropping, so the GPU is done with it).
+            pool.lock().unwrap().free.push(self.ptr);
+        } else {
             // The buffer may be freed on the encode thread; cuMemFree needs a current context.
             unsafe {
                 if let Some(c) = CONTEXT.get() {
@@ -184,22 +260,22 @@ impl Drop for DeviceBuffer {
     }
 }
 
-/// A live GL-texture→CUDA registration (mapped). The CUDA array aliases the texture/dmabuf, so
-/// we copy out of it immediately; unmap + unregister happen on drop.
-pub struct MappedTexture {
+/// A *persistent* GL-texture→CUDA registration; unregisters on drop. The desktop NVIDIA driver
+/// only supports CUDA interop through GL textures (not dmabuf EGLImages directly), so the
+/// importer renders the dmabuf into a reusable `GL_RGBA8` texture and registers *that* once.
+/// Per frame it is only mapped → copied out → unmapped (map/unmap is the GL↔CUDA sync point).
+pub struct RegisteredTexture {
+    /// Registration handle set by `register_gl`; a null handle is skipped on drop.
     resource: CUgraphicsResource,
-    array: CUarray,
 }
 
-impl MappedTexture {
-    /// Register a `GL_TEXTURE_2D` texture with CUDA, map it, and get its array. The desktop
-    /// NVIDIA driver only supports CUDA interop through GL textures (not dmabuf EGLImages
-    /// directly), so the EGLImage is first bound to a GL texture by the caller.
+impl RegisteredTexture {
+    /// Register a `GL_TEXTURE_2D` once.
     ///
     /// # Safety
     /// The GL context and the shared CUDA context must both be current on this thread, and
-    /// `texture` must be a valid `GL_TEXTURE_2D` bound to the source image.
-    pub unsafe fn register_gl(texture: u32) -> Result<MappedTexture> {
+    /// `texture` must be a valid `GL_TEXTURE_2D`.
+    pub unsafe fn register_gl(texture: u32) -> Result<RegisteredTexture> {
         const GL_TEXTURE_2D: c_uint = 0x0DE1;
         const CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: c_uint = 0x01;
         let mut resource: CUgraphicsResource = std::ptr::null_mut();
@@ -212,37 +288,37 @@ impl MappedTexture {
             ),
             "cuGraphicsGLRegisterImage",
         )?;
-        if cuGraphicsMapResources(1, &mut resource, std::ptr::null_mut()) != 0 {
-            let _ = cuGraphicsUnregisterResource(resource);
-            bail!("cuGraphicsMapResources failed");
-        }
-        let mut array: CUarray = std::ptr::null_mut();
-        if cuGraphicsSubResourceGetMappedArray(&mut array, resource, 0, 0) != 0 {
-            let _ = cuGraphicsUnmapResources(1, &mut resource, std::ptr::null_mut());
-            let _ = cuGraphicsUnregisterResource(resource);
-            bail!("cuGraphicsSubResourceGetMappedArray failed");
-        }
-        Ok(MappedTexture { resource, array })
+        Ok(RegisteredTexture { resource })
     }
 
-    /// Copy the mapped array into `dst` (array → pitched device memory). The array is the GL
-    /// blit's already-linear RGBA8 output, so this is a straight copy. After it returns the
-    /// source dmabuf is no longer needed.
-    pub fn copy_to(&self, dst: &DeviceBuffer) -> Result<()> {
-        let copy = CUDA_MEMCPY2D {
-            srcMemoryType: CU_MEMORYTYPE_ARRAY,
-            srcArray: self.array,
-            dstMemoryType: CU_MEMORYTYPE_DEVICE,
-            dstDevice: dst.ptr,
-            dstPitch: dst.pitch,
-            WidthInBytes: dst.width as usize * 4, // 4 bytes/px (BGRx)
-            Height: dst.height as usize,
-            ..Default::default()
-        };
+    /// Map the registered texture, copy its (already-linear RGBA8) array into `dst`, then
+    /// `cuCtxSynchronize` (so `dst` is complete before the source dmabuf is recycled) and unmap.
+    pub fn copy_mapped_to(&mut self, dst: &DeviceBuffer) -> Result<()> {
         unsafe {
-            ck(cuMemcpy2D_v2(&copy), "cuMemcpy2D_v2")?;
-            // The copy must complete before the dmabuf is requeued / reused.
-            ck(cuCtxSynchronize(), "cuCtxSynchronize")?;
+            let map_status = cuGraphicsMapResources(1, &mut self.resource, std::ptr::null_mut());
+            ck(map_status, "cuGraphicsMapResources")?;
+            let mut mapped: CUarray = std::ptr::null_mut();
+            let lookup = cuGraphicsSubResourceGetMappedArray(&mut mapped, self.resource, 0, 0);
+            if lookup != 0 {
+                let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
+                bail!("cuGraphicsSubResourceGetMappedArray failed");
+            }
+            let copy = CUDA_MEMCPY2D {
+                srcMemoryType: CU_MEMORYTYPE_ARRAY,
+                srcArray: mapped,
+                dstMemoryType: CU_MEMORYTYPE_DEVICE,
+                dstDevice: dst.ptr,
+                dstPitch: dst.pitch,
+                WidthInBytes: dst.width as usize * 4, // 4 bytes per pixel
+                Height: dst.height as usize,
+                ..Default::default()
+            };
+            // Sync and unmap run even when the copy failed; the first failure is reported after.
+            let copy_status = cuMemcpy2D_v2(&copy);
+            let sync_status = cuCtxSynchronize();
+            let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
+            ck(copy_status, "cuMemcpy2D_v2")?;
+            ck(sync_status, "cuCtxSynchronize")?;
         }
         Ok(())
     }
@@ -274,11 +350,10 @@ pub fn copy_device_to_device(
     Ok(())
 }
 
-impl Drop for MappedTexture {
+impl Drop for RegisteredTexture {
     fn drop(&mut self) {
         if !self.resource.is_null() {
             unsafe {
-                let _ = cuGraphicsUnmapResources(1, &mut self.resource, std::ptr::null_mut());
                 let _ = cuGraphicsUnregisterResource(self.resource);
             }
         }
diff --git a/crates/lumen-host/src/zerocopy/egl.rs b/crates/lumen-host/src/zerocopy/egl.rs
index d6f5814..23c424d 100644
--- a/crates/lumen-host/src/zerocopy/egl.rs
+++ b/crates/lumen-host/src/zerocopy/egl.rs
@@ -13,7 +13,7 @@
 
 #![allow(non_upper_case_globals)]
 
-use super::cuda::{self, DeviceBuffer, MappedTexture};
+use super::cuda::{self, DeviceBuffer};
 use anyhow::{bail, ensure, Context as _, Result};
 use khronos_egl as egl;
 use std::os::raw::{c_int, c_void};
@@ -145,6 +145,10 @@ struct GlBlit {
     src_tex: u32,
     width: u32,
     height: u32,
+    /// `dst_tex` registered with CUDA once (not per frame); mapped+copied each frame.
+    registered: cuda::RegisteredTexture,
+    /// Recycled CUDA device buffers (the imported frames handed to the encoder).
+    pool: cuda::BufferPool,
 }
 
 impl GlBlit {
@@ -183,6 +187,11 @@ impl GlBlit {
             status == GL_FRAMEBUFFER_COMPLETE,
             "blit FBO incomplete ({status:#x})"
         );
+        // Register the (immutable, reused) destination texture with CUDA once, and stand up the
+        // device-buffer pool — both per-resolution, not per-frame. Requires the CUDA context to be
+        // current (the caller makes it current before constructing the blit).
+        let registered = cuda::RegisteredTexture::register_gl(dst_tex)?;
+        let pool = cuda::BufferPool::new(width, height)?;
         Ok(GlBlit {
             program,
             vao,
@@ -191,6 +200,8 @@ impl GlBlit {
             src_tex,
             width,
             height,
+            registered,
+            pool,
         })
     }
 
@@ -462,12 +473,14 @@ impl EglImporter {
         if self.blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
             self.blit = Some(unsafe { GlBlit::new(width, height)? });
         }
-        let blit = self.blit.as_ref().unwrap();
+        let egl_image_target = self.egl_image_target;
+        let blit = self.blit.as_mut().unwrap();
         // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
-        unsafe { blit.run(self.egl_image_target, image)? };
-        let mapped = unsafe { MappedTexture::register_gl(blit.dst_tex)? };
-        let dst = DeviceBuffer::alloc(width, height)?;
-        mapped.copy_to(&dst)?;
+        unsafe { blit.run(egl_image_target, image)? };
+        // Persistent registration (mapped per frame) + a pooled buffer — no per-frame
+        // cuGraphicsGLRegisterImage / cuMemAllocPitch.
+        let dst = blit.pool.get()?;
+        blit.registered.copy_mapped_to(&dst)?;
         Ok(dst)
     }
 }