punktfunk/crates/punktfunk-host/src/zerocopy/egl.rs

//! EGL side of the zero-copy path: open a headless EGLDisplay on the NVIDIA GPU (GBM platform on
//! the render node) and import a PipeWire dmabuf as an `EGLImage` with `EGL_LINUX_DMA_BUF_EXT`.
//! The DRM format **modifier** is mandatory on NVIDIA (its buffers are tiled; importing without
//! the modifier yields a corrupt image or `EGL_BAD_MATCH`).
//!
//! Desktop NVIDIA can't register a dmabuf `EGLImage` with CUDA directly — `cuGraphicsEGLRegisterImage`
//! is Tegra-only and `cuGraphicsGLRegisterImage` rejects EGLImage-backed textures (their internal
//! format is opaque). So we follow OBS/Sunshine: bind the `EGLImage` to a GL texture
//! (`glEGLImageTargetTexture2DOES`), render it through a fullscreen-triangle shader into a plain
//! immutable `GL_RGBA8` texture (de-tiling and swizzling to the BGRx the encoder wants), then
//! register *that* texture with CUDA ([`MappedTexture`]) and copy it device-to-device into an
//! owned [`DeviceBuffer`] so the dmabuf can be returned to the compositor immediately.

#![allow(non_upper_case_globals)]

use super::cuda::{self, DeviceBuffer};
use anyhow::{bail, ensure, Context as _, Result};
use khronos_egl as egl;
use std::os::raw::{c_int, c_void};

// EGL_EXT_image_dma_buf_import / _modifiers + platform enums (not defined by khronos-egl).
// `egl::Enum` values are passed as the image target / platform selector; `egl::Attrib` values are
// keys in the attribute list handed to `eglCreateImage`.
const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270;
const EGL_PLATFORM_GBM_KHR: egl::Enum = 0x31D7;
const EGL_LINUX_DRM_FOURCC_EXT: egl::Attrib = 0x3271;
const EGL_DMA_BUF_PLANE0_FD_EXT: egl::Attrib = 0x3272;
const EGL_DMA_BUF_PLANE0_OFFSET_EXT: egl::Attrib = 0x3273;
const EGL_DMA_BUF_PLANE0_PITCH_EXT: egl::Attrib = 0x3274;
// The 64-bit DRM modifier is passed to EGL as two 32-bit halves.
const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: egl::Attrib = 0x3443;
const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: egl::Attrib = 0x3444;

// Raw OpenGL enums for the hand-declared `extern "C"` entry points below. Filter values are
// typed `c_int` because they are passed as the `param` of `glTexParameteri`.
const GL_TEXTURE_2D: u32 = 0x0DE1;
const GL_TEXTURE_MIN_FILTER: u32 = 0x2801;
const GL_TEXTURE_MAG_FILTER: u32 = 0x2800;
const GL_LINEAR: c_int = 0x2601;
const GL_NEAREST: c_int = 0x2600;
const GL_RGBA8: u32 = 0x8058;
// Single/dual-channel 8-bit internal formats for the NV12 convert targets: R8 luma (full-res),
// RG8 interleaved chroma (half-res). In the code visible here they are only used as
// `glTexStorage2D` internal formats; no matching `GL_RED`/`GL_RG` client-format enums are declared.
const GL_R8: u32 = 0x8229;
const GL_RG8: u32 = 0x822B;
// Client pixel format/type for texture uploads (self-test only): RGBA bytes.
const GL_RGBA: u32 = 0x1908;
const GL_UNSIGNED_BYTE: u32 = 0x1401;
const GL_FRAMEBUFFER: u32 = 0x8D40;
const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0;
const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5;
const GL_TEXTURE0: u32 = 0x84C0;
const GL_TRIANGLES: u32 = 0x0004;
const GL_VERTEX_SHADER: u32 = 0x8B31;
const GL_FRAGMENT_SHADER: u32 = 0x8B30;
const GL_COMPILE_STATUS: u32 = 0x8B81;
const GL_LINK_STATUS: u32 = 0x8B82;

// libglvnd's libGL dispatches these to the NVIDIA driver based on the current EGL/GL context.
// Every function here therefore requires a GL context to be current on the calling thread.
// GL types are spelled with Rust primitives: `u32` for GLuint/GLenum, `c_int` for GLint/GLsizei.
// NOTE(review): the `*const i8` string parameters assume `c_char == i8`; on targets where
// `c_char` is `u8` (e.g. aarch64 Linux) passing `CStr::as_ptr()` would not type-check — confirm
// whether such targets matter.
#[link(name = "GL")]
extern "C" {
    // --- textures ---
    fn glGenTextures(n: c_int, textures: *mut u32);
    fn glBindTexture(target: u32, texture: u32);
    fn glTexParameteri(target: u32, pname: u32, param: c_int);
    fn glDeleteTextures(n: c_int, textures: *const u32);
    fn glTexStorage2D(target: u32, levels: c_int, internalformat: u32, width: c_int, height: c_int);
    fn glGetError() -> u32;
    // --- framebuffers ---
    fn glGenFramebuffers(n: c_int, framebuffers: *mut u32);
    fn glDeleteFramebuffers(n: c_int, framebuffers: *const u32);
    fn glBindFramebuffer(target: u32, framebuffer: u32);
    fn glFramebufferTexture2D(
        target: u32,
        attachment: u32,
        textarget: u32,
        texture: u32,
        level: c_int,
    );
    fn glCheckFramebufferStatus(target: u32) -> u32;
    fn glViewport(x: c_int, y: c_int, width: c_int, height: c_int);
    // --- vertex arrays / drawing ---
    fn glGenVertexArrays(n: c_int, arrays: *mut u32);
    fn glDeleteVertexArrays(n: c_int, arrays: *const u32);
    fn glBindVertexArray(array: u32);
    fn glDrawArrays(mode: u32, first: c_int, count: c_int);
    fn glActiveTexture(texture: u32);
    fn glUseProgram(program: u32);
    fn glFlush();
    // --- shaders / programs ---
    fn glCreateShader(shader_type: u32) -> u32;
    fn glShaderSource(shader: u32, count: c_int, string: *const *const i8, length: *const c_int);
    fn glCompileShader(shader: u32);
    fn glGetShaderiv(shader: u32, pname: u32, params: *mut c_int);
    fn glDeleteShader(shader: u32);
    fn glCreateProgram() -> u32;
    fn glAttachShader(program: u32, shader: u32);
    fn glLinkProgram(program: u32);
    fn glGetProgramiv(program: u32, pname: u32, params: *mut c_int);
    fn glGetUniformLocation(program: u32, name: *const i8) -> c_int;
    fn glUniform1i(location: c_int, v0: c_int);
    fn glDeleteProgram(program: u32);
    // --- host upload (used by the self-test path only) ---
    fn glTexSubImage2D(
        target: u32,
        level: c_int,
        xoffset: c_int,
        yoffset: c_int,
        width: c_int,
        height: c_int,
        format: u32,
        type_: u32,
        pixels: *const c_void,
    );
}

// Minimal libgbm surface: the GBM device is only used as the native display handle for
// `eglGetPlatformDisplay(EGL_PLATFORM_GBM_KHR, ..)`. The `struct gbm_device *` is treated as an
// opaque `*mut c_void`.
#[link(name = "gbm")]
extern "C" {
    // Returns null on failure (checked by the caller). `fd` is an open DRM node.
    fn gbm_create_device(fd: c_int) -> *mut c_void;
    fn gbm_device_destroy(device: *mut c_void);
}

/// `glEGLImageTargetTexture2DOES(target, EGLImage)` — loaded via `eglGetProcAddress`.
///
/// Re-targets the texture currently bound to `target` to the given `EGLImage`. It has no return
/// value; callers in this file detect failure by clearing and then reading `glGetError`.
type EglImageTargetFn = unsafe extern "system" fn(u32, *mut c_void);

// Fullscreen-triangle blit: sample the dmabuf EGLImage texture and write it (swizzled to BGRA,
// to match the BGRx the encoder expects) into a normal GL_RGBA8 texture that CUDA *can* register.
// The vertex shader needs no vertex buffers: it derives the three corners (0,0), (2,0), (0,2)
// from `gl_VertexID`, which doubles as the texture coordinate; the clip-space triangle covers the
// whole viewport. The sources are byte strings without a NUL terminator — their length is passed
// explicitly to `glShaderSource`.
const VERT_SRC: &[u8] = b"#version 330 core\nout vec2 v_tex;\nvoid main(){vec2 p=vec2(float((gl_VertexID<<1)&2),float(gl_VertexID&2));v_tex=p;gl_Position=vec4(p*2.0-1.0,0.0,1.0);}\n";
const FRAG_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){o_color=texture(image,v_tex).bgra;}\n";

// NV12 BT.709 LIMITED-range convert from full-range RGB in [0,1]. Two passes share `VERT_SRC` and
// the same source texture (the de-tiled dmabuf):
//   Y pass  → GL_R8 luma, full-res:   Y = (16 + 219·(0.2126R+0.7152G+0.0722B))/255
//   UV pass → GL_RG8 chroma, half-res (GL_LINEAR averages the 2×2 footprint):
//     U = (128 + 224·(-0.1146R-0.3854G+0.5000B))/255  → R channel
//     V = (128 + 224·( 0.5000R-0.4542G-0.0458B))/255  → G channel
// RG8's (R=U, G=V) byte order matches NV12's interleaved [U,V]. All outputs clamped to [0,1].
// Matches the Windows VideoConverter (BT.709, limited/studio range) so the two hosts look identical.
// Unlike `FRAG_SRC`, these read `.rgb` without a swizzle, i.e. they take the sampled channels as
// R, G, B in that order.
const FRAG_Y_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float Y=(16.0+219.0*(0.2126*c.r+0.7152*c.g+0.0722*c.b))/255.0;o_color=vec4(clamp(Y,0.0,1.0),0.0,0.0,1.0);}\n";
const FRAG_UV_SRC: &[u8] = b"#version 330 core\nuniform sampler2D image;\nin vec2 v_tex;\nout vec4 o_color;\nvoid main(){vec3 c=texture(image,v_tex).rgb;float U=(128.0+224.0*(-0.1146*c.r-0.3854*c.g+0.5000*c.b))/255.0;float V=(128.0+224.0*(0.5000*c.r-0.4542*c.g-0.0458*c.b))/255.0;o_color=vec4(clamp(U,0.0,1.0),clamp(V,0.0,1.0),0.0,1.0);}\n";

/// Compile one GLSL shader stage of type `kind` (e.g. `GL_VERTEX_SHADER`) from `src`.
///
/// `src` does not need a NUL terminator: its length is passed explicitly to `glShaderSource`.
/// Returns the shader object name on success. On a compile failure the shader object is deleted
/// and the driver's info log is included in the error, so a broken shader can be diagnosed from
/// the log alone.
///
/// # Safety
/// A GL context must be current on the calling thread.
unsafe fn compile_shader(kind: u32, src: &[u8]) -> Result<u32> {
    // Declared locally so this function is self-contained; resolved from the already-linked libGL.
    extern "C" {
        fn glGetShaderInfoLog(
            shader: u32,
            max_length: c_int,
            length: *mut c_int,
            info_log: *mut std::os::raw::c_char,
        );
    }

    let sh = glCreateShader(kind);
    ensure!(sh != 0, "glCreateShader failed");
    let ptr = src.as_ptr() as *const i8;
    let len = src.len() as c_int;
    glShaderSource(sh, 1, &ptr, &len);
    glCompileShader(sh);
    let mut ok: c_int = 0;
    glGetShaderiv(sh, GL_COMPILE_STATUS, &mut ok);
    if ok == 0 {
        // Fetch the (possibly truncated) compile log before the shader object goes away.
        let mut log = [0u8; 1024];
        let mut written: c_int = 0;
        glGetShaderInfoLog(
            sh,
            log.len() as c_int,
            &mut written,
            log.as_mut_ptr().cast(),
        );
        glDeleteShader(sh);
        let written = (written.max(0) as usize).min(log.len());
        let log = String::from_utf8_lossy(&log[..written]);
        bail!(
            "GL shader compile failed (stage {kind:#x}): {}",
            log.trim_end()
        );
    }
    Ok(sh)
}

/// Compile+link the fullscreen-triangle program with fragment source `frag` and bind its `image`
/// sampler to texture unit 0.
unsafe fn compile_program_with(frag: &[u8]) -> Result<u32> {
    let vs = compile_shader(GL_VERTEX_SHADER, VERT_SRC)?;
    let fs = compile_shader(GL_FRAGMENT_SHADER, frag)?;
    let prog = glCreateProgram();
    glAttachShader(prog, vs);
    glAttachShader(prog, fs);
    glLinkProgram(prog);
    glDeleteShader(vs);
    glDeleteShader(fs);
    let mut ok: c_int = 0;
    glGetProgramiv(prog, GL_LINK_STATUS, &mut ok);
    ensure!(ok != 0, "GL program link failed");
    glUseProgram(prog);
    let loc = glGetUniformLocation(prog, c"image".as_ptr());
    if loc >= 0 {
        glUniform1i(loc, 0); // sampler -> texture unit 0
    }
    glUseProgram(0);
    Ok(prog)
}

/// Build the plain BGRA-swizzle blit program, i.e. the fullscreen triangle paired with `FRAG_SRC`.
///
/// # Safety
/// Same contract as `compile_program_with`, which does all the work.
unsafe fn compile_program() -> Result<u32> {
    let swizzle_frag: &[u8] = FRAG_SRC;
    compile_program_with(swizzle_frag)
}

/// Per-size GL machinery to blit a dmabuf EGLImage into a CUDA-registrable `GL_RGBA8` texture.
///
/// One instance is tied to a single `width`×`height`; `EglImporter::blit_and_copy` replaces it
/// when the frame size changes. All `u32` fields are GL object names created in `GlBlit::new`.
struct GlBlit {
    /// Linked fullscreen-triangle program (BGRA swizzle fragment shader).
    program: u32,
    /// Empty vertex array object bound around the draw call.
    vao: u32,
    /// Framebuffer whose colour attachment 0 is `dst_tex`.
    fbo: u32,
    /// CUDA-registrable destination (immutable GL_RGBA8).
    dst_tex: u32,
    /// Source texture re-targeted to each frame's EGLImage.
    src_tex: u32,
    /// Frame width in pixels (also the viewport width).
    width: u32,
    /// Frame height in pixels (also the viewport height).
    height: u32,
    /// `dst_tex` registered with CUDA once (not per frame); mapped+copied each frame.
    registered: cuda::RegisteredTexture,
    /// Recycled CUDA device buffers (the imported frames handed to the encoder).
    pool: cuda::BufferPool,
}

impl GlBlit {
    unsafe fn new(width: u32, height: u32) -> Result<GlBlit> {
        let program = compile_program()?;
        let mut vao = 0u32;
        glGenVertexArrays(1, &mut vao); // core profile needs a bound VAO for glDrawArrays
        let mut fbo = 0u32;
        glGenFramebuffers(1, &mut fbo);

        let mut dst_tex = 0u32;
        glGenTextures(1, &mut dst_tex);
        glBindTexture(GL_TEXTURE_2D, dst_tex);
        glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

        let mut src_tex = 0u32;
        glGenTextures(1, &mut src_tex);
        glBindTexture(GL_TEXTURE_2D, src_tex);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
        glBindTexture(GL_TEXTURE_2D, 0);

        glBindFramebuffer(GL_FRAMEBUFFER, fbo);
        glFramebufferTexture2D(
            GL_FRAMEBUFFER,
            GL_COLOR_ATTACHMENT0,
            GL_TEXTURE_2D,
            dst_tex,
            0,
        );
        let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
        glBindFramebuffer(GL_FRAMEBUFFER, 0);
        ensure!(
            status == GL_FRAMEBUFFER_COMPLETE,
            "blit FBO incomplete ({status:#x})"
        );
        // Register the (immutable, reused) destination texture with CUDA once, and stand up the
        // device-buffer pool — both per-resolution, not per-frame. Requires the CUDA context to be
        // current (the caller makes it current before constructing the blit).
        let registered = cuda::RegisteredTexture::register_gl(dst_tex)?;
        let pool = cuda::BufferPool::new(width, height)?;
        Ok(GlBlit {
            program,
            vao,
            fbo,
            dst_tex,
            src_tex,
            width,
            height,
            registered,
            pool,
        })
    }

    /// Bind `image` to the source texture and render it into `dst_tex`.
    ///
    /// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
    unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
        glBindTexture(GL_TEXTURE_2D, self.src_tex);
        let _ = glGetError();
        egl_image_target(GL_TEXTURE_2D, image);
        let e = glGetError();
        glBindTexture(GL_TEXTURE_2D, 0);
        ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");

        glBindFramebuffer(GL_FRAMEBUFFER, self.fbo);
        glViewport(0, 0, self.width as c_int, self.height as c_int);
        glUseProgram(self.program);
        glActiveTexture(GL_TEXTURE0);
        glBindTexture(GL_TEXTURE_2D, self.src_tex);
        glBindVertexArray(self.vao);
        glDrawArrays(GL_TRIANGLES, 0, 3);
        glBindVertexArray(0);
        glBindFramebuffer(GL_FRAMEBUFFER, 0);
        glFlush(); // submit GL work before CUDA maps the texture
        Ok(())
    }
}

/// Per-size GL machinery to convert a dmabuf EGLImage into an NV12 (BT.709 limited-range) pair —
/// the [`GlBlit`] analogue for the `PUNKTFUNK_NV12` path. Two passes share `src_tex`: a full-res Y
/// pass into a CUDA-registrable `GL_R8` texture and a half-res UV pass into a `GL_RG8` texture.
/// Feeding NVENC native NV12 deletes its internal RGB→YUV CSC (which otherwise runs on the SM that a
/// saturating game pins at 100%); the convert here replaces the BGRx swizzle [`GlBlit`] did, at ~the
/// same 3D cost.
///
/// All `u32` fields other than `width`/`height` are GL object names created in `Nv12Blit::new`
/// and deleted in its `Drop` impl. `new` rejects odd dimensions.
struct Nv12Blit {
    /// Program for the luma pass (`FRAG_Y_SRC`).
    y_program: u32,
    /// Program for the chroma pass (`FRAG_UV_SRC`).
    uv_program: u32,
    /// Empty vertex array object bound around both draw calls.
    vao: u32,
    /// Framebuffer whose colour attachment 0 is `y_tex`.
    y_fbo: u32,
    /// Framebuffer whose colour attachment 0 is `uv_tex`.
    uv_fbo: u32,
    /// CUDA-registrable luma target (immutable `GL_R8`, W×H).
    y_tex: u32,
    /// CUDA-registrable chroma target (immutable `GL_RG8`, W/2 × H/2).
    uv_tex: u32,
    /// Source texture re-targeted to each frame's EGLImage. `GL_LINEAR` so the UV pass averages 2×2.
    src_tex: u32,
    /// Frame width in pixels (even).
    width: u32,
    /// Frame height in pixels (even).
    height: u32,
    /// `y_tex` registered with CUDA once, at construction.
    y_registered: cuda::RegisteredTexture,
    /// `uv_tex` registered with CUDA once, at construction.
    uv_registered: cuda::RegisteredTexture,
    /// Recycled NV12 device buffers (two-plane) handed to the encoder.
    pool: cuda::BufferPool,
    /// Self-test only: whether `src_tex` has had immutable RGBA8 storage allocated for the upload
    /// path (the live path retargets `src_tex` via EGLImage instead, never allocating storage).
    test_src_storage: bool,
}

impl Nv12Blit {
    unsafe fn new(width: u32, height: u32) -> Result<Nv12Blit> {
        ensure!(
            width % 2 == 0 && height % 2 == 0,
            "NV12 convert needs even dimensions (got {width}x{height})"
        );
        let y_program = compile_program_with(FRAG_Y_SRC)?;
        let uv_program = compile_program_with(FRAG_UV_SRC)?;
        let mut vao = 0u32;
        glGenVertexArrays(1, &mut vao);
        let mut fbos = [0u32; 2];
        glGenFramebuffers(2, fbos.as_mut_ptr());
        let (y_fbo, uv_fbo) = (fbos[0], fbos[1]);

        // Luma target: GL_R8 at full resolution.
        let mut y_tex = 0u32;
        glGenTextures(1, &mut y_tex);
        glBindTexture(GL_TEXTURE_2D, y_tex);
        glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, width as c_int, height as c_int);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

        // Chroma target: GL_RG8 at half resolution (R=U, G=V).
        let mut uv_tex = 0u32;
        glGenTextures(1, &mut uv_tex);
        glBindTexture(GL_TEXTURE_2D, uv_tex);
        glTexStorage2D(
            GL_TEXTURE_2D,
            1,
            GL_RG8,
            (width / 2) as c_int,
            (height / 2) as c_int,
        );
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

        // Source: GL_LINEAR so the half-res UV pass averages the 2×2 chroma footprint.
        let mut src_tex = 0u32;
        glGenTextures(1, &mut src_tex);
        glBindTexture(GL_TEXTURE_2D, src_tex);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
        glBindTexture(GL_TEXTURE_2D, 0);

        for (fbo, tex) in [(y_fbo, y_tex), (uv_fbo, uv_tex)] {
            glBindFramebuffer(GL_FRAMEBUFFER, fbo);
            glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex, 0);
            let status = glCheckFramebufferStatus(GL_FRAMEBUFFER);
            glBindFramebuffer(GL_FRAMEBUFFER, 0);
            ensure!(
                status == GL_FRAMEBUFFER_COMPLETE,
                "NV12 blit FBO incomplete ({status:#x}) — GL_R8/GL_RG8 not renderable?"
            );
        }
        // Register both convert targets with CUDA once (per-resolution), + the NV12 two-plane pool.
        let y_registered = cuda::RegisteredTexture::register_gl(y_tex)?;
        let uv_registered = cuda::RegisteredTexture::register_gl(uv_tex)?;
        let pool = cuda::BufferPool::new_nv12(width, height)?;
        Ok(Nv12Blit {
            y_program,
            uv_program,
            vao,
            y_fbo,
            uv_fbo,
            y_tex,
            uv_tex,
            src_tex,
            width,
            height,
            y_registered,
            uv_registered,
            pool,
            test_src_storage: false,
        })
    }

    /// Bind `image` to the source texture and run both convert passes into `y_tex`/`uv_tex`.
    ///
    /// # Safety: the GL context is current on this thread; `image` is a valid `EGLImage`.
    unsafe fn run(&self, egl_image_target: EglImageTargetFn, image: *mut c_void) -> Result<()> {
        glBindTexture(GL_TEXTURE_2D, self.src_tex);
        let _ = glGetError();
        egl_image_target(GL_TEXTURE_2D, image);
        let e = glGetError();
        glBindTexture(GL_TEXTURE_2D, 0);
        ensure!(e == 0, "glEGLImageTargetTexture2DOES failed ({e:#x})");
        self.run_passes()
    }

    /// Run the two convert passes from whatever is currently in `src_tex` (caller populated it).
    /// Shared by [`run`](Self::run) (EGLImage source) and the self-test (uploaded RGBA source).
    ///
    /// # Safety: the GL context is current on this thread.
    unsafe fn run_passes(&self) -> Result<()> {
        glActiveTexture(GL_TEXTURE0);
        glBindVertexArray(self.vao);
        // Y pass: full-res into the R8 target.
        glBindFramebuffer(GL_FRAMEBUFFER, self.y_fbo);
        glViewport(0, 0, self.width as c_int, self.height as c_int);
        glUseProgram(self.y_program);
        glBindTexture(GL_TEXTURE_2D, self.src_tex);
        glDrawArrays(GL_TRIANGLES, 0, 3);
        // UV pass: half-res into the RG8 target (GL_LINEAR averages the 2×2).
        glBindFramebuffer(GL_FRAMEBUFFER, self.uv_fbo);
        glViewport(0, 0, (self.width / 2) as c_int, (self.height / 2) as c_int);
        glUseProgram(self.uv_program);
        glBindTexture(GL_TEXTURE_2D, self.src_tex);
        glDrawArrays(GL_TRIANGLES, 0, 3);

        glBindVertexArray(0);
        glBindFramebuffer(GL_FRAMEBUFFER, 0);
        glFlush(); // submit GL work before CUDA maps the textures
        Ok(())
    }
}

/// Deletes every GL object the converter created: the three textures, both framebuffers, the
/// VAO and the two programs. The GL calls need the owning context current on this thread.
impl Drop for Nv12Blit {
    fn drop(&mut self) {
        let textures = [self.y_tex, self.uv_tex, self.src_tex];
        let framebuffers = [self.y_fbo, self.uv_fbo];
        unsafe {
            glDeleteTextures(textures.len() as c_int, textures.as_ptr());
            glDeleteFramebuffers(framebuffers.len() as c_int, framebuffers.as_ptr());
            glDeleteVertexArrays(1, &self.vao);
            for program in [self.y_program, self.uv_program] {
                glDeleteProgram(program);
            }
        }
    }
}

/// One dmabuf plane as delivered by PipeWire (single-plane for BGRx).
///
/// The three fields are forwarded verbatim as the plane-0 fd / offset / pitch attributes of the
/// `EGL_LINUX_DMA_BUF_EXT` import (and to the Vulkan bridge on the LINEAR path). The struct does
/// not own `fd`: nothing in this file closes it.
#[derive(Clone, Copy, Debug)]
pub struct DmabufPlane {
    /// dmabuf file descriptor of the plane.
    pub fd: i32,
    /// Offset of the plane's data within the dmabuf (passed as `EGL_DMA_BUF_PLANE0_OFFSET_EXT`).
    pub offset: u32,
    /// Row pitch of the plane (passed as `EGL_DMA_BUF_PLANE0_PITCH_EXT`).
    pub stride: u32,
}

/// libEGL loaded at runtime (`Egl::load_required()` in `EglImporter::new`) with the EGL 1.5 API.
type Egl = egl::DynamicInstance<egl::EGL1_5>;

/// Headless EGLDisplay (GBM platform on the `/dev/dri/renderD128` render node) + a surfaceless
/// desktop-GL context used to import dmabufs and bridge them to CUDA via a GL texture. Lives on
/// the capture thread (the GL context is made current there once).
pub struct EglImporter {
    /// Dynamically loaded libEGL entry points.
    egl: Egl,
    /// Display obtained from the GBM device in `gbm`.
    display: egl::Display,
    /// `EGL_NO_CONTEXT` wrapped as an `egl::Context`, passed to `eglCreateImage` for dmabuf imports.
    no_ctx: egl::Context,
    /// Surfaceless GL context (current on the capture thread) for the EGLImage→texture bind.
    _gl_ctx: egl::Context,
    /// `glEGLImageTargetTexture2DOES`, resolved once via `eglGetProcAddress`.
    egl_image_target: EglImageTargetFn,
    /// Lazily-created GL blit machinery (recreated if the frame size changes).
    blit: Option<GlBlit>,
    /// Lazily-created NV12 convert machinery (`PUNKTFUNK_NV12` path; recreated on size change).
    nv12_blit: Option<Nv12Blit>,
    /// LINEAR-dmabuf path (gamescope): a Vulkan bridge (dmabuf → exportable OPAQUE_FD → CUDA),
    /// created lazily on the first LINEAR frame, + the destination pool.
    vk: Option<super::vulkan::VkBridge>,
    /// Destination pool for the LINEAR path; recreated when the frame size changes.
    linear_pool: Option<cuda::BufferPool>,
    /// `struct gbm_device *` backing `display`; destroyed in `Drop`.
    gbm: *mut c_void,
    /// Render-node fd the GBM device was created on; closed in `Drop`.
    render_fd: c_int,
}

// The EGL handles are confined to the capture thread; the struct is moved there once.
// SAFETY: `Send` is asserted manually because of the raw `gbm` pointer (and the EGL handles).
// Soundness relies on the usage contract above — the compiler does not enforce that the importer
// is only used (and dropped) on the thread where its GL context is current.
unsafe impl Send for EglImporter {}

impl EglImporter {
    /// Open a headless EGLDisplay on the NVIDIA EGL device. Also forces the shared CUDA context
    /// to exist (so a later `import` only touches the hot path).
    pub fn new() -> Result<EglImporter> {
        // GBM platform on the NVIDIA render node: this ties the EGLDisplay (and its GL contexts)
        // to the same DRM device CUDA-GL interop associates with, which the EGL device platform
        // did not (cuGraphicsGLRegisterImage rejected device-platform GL textures).
        let path = std::ffi::CString::new("/dev/dri/renderD128").unwrap();
        let render_fd = unsafe { libc::open(path.as_ptr(), libc::O_RDWR | libc::O_CLOEXEC) };
        ensure!(render_fd >= 0, "open /dev/dri/renderD128 for GBM");
        let gbm = unsafe { gbm_create_device(render_fd) };
        if gbm.is_null() {
            unsafe { libc::close(render_fd) };
            anyhow::bail!("gbm_create_device failed");
        }

        let egl: Egl =
            unsafe { Egl::load_required() }.context("load libEGL (EGL 1.5 dynamic instance)")?;
        let display = unsafe {
            egl.get_platform_display(
                EGL_PLATFORM_GBM_KHR,
                gbm as egl::NativeDisplayType,
                &[egl::ATTRIB_NONE],
            )
        }
        .context("eglGetPlatformDisplay(GBM) on the NVIDIA render node")?;
        egl.initialize(display).context("eglInitialize")?;

        let exts = egl
            .query_string(Some(display), egl::EXTENSIONS)
            .context("query EGL extensions")?
            .to_string_lossy()
            .into_owned();
        ensure!(
            exts.contains("EGL_EXT_image_dma_buf_import"),
            "EGL lacks EGL_EXT_image_dma_buf_import"
        );
        ensure!(
            exts.contains("EGL_EXT_image_dma_buf_import_modifiers"),
            "EGL lacks EGL_EXT_image_dma_buf_import_modifiers (needed for NVIDIA tiled dmabufs)"
        );

        // A surfaceless desktop-GL context so we can bind the dmabuf EGLImage to a GL texture
        // (cuGraphicsEGLRegisterImage is Tegra-only; desktop CUDA interop goes through GL).
        egl.bind_api(egl::OPENGL_API)
            .context("eglBindAPI(OpenGL)")?;
        // The default EGL_SURFACE_TYPE in eglChooseConfig is WINDOW_BIT, which a headless device
        // display has none of — request a pbuffer-capable config (we run surfaceless anyway).
        let config = egl
            .choose_first_config(
                display,
                &[
                    egl::SURFACE_TYPE,
                    egl::PBUFFER_BIT,
                    egl::RENDERABLE_TYPE,
                    egl::OPENGL_BIT,
                    egl::NONE,
                ],
            )
            .context("eglChooseConfig")?
            .context("no EGL config for OpenGL")?;
        let gl_ctx = egl
            .create_context(
                display,
                config,
                None,
                &[egl::CONTEXT_CLIENT_VERSION, 3, egl::NONE],
            )
            .context("eglCreateContext(OpenGL)")?;
        egl.make_current(display, None, None, Some(gl_ctx))
            .context("eglMakeCurrent surfaceless (needs EGL_KHR_surfaceless_context)")?;
        let egl_image_target: EglImageTargetFn = unsafe {
            std::mem::transmute(
                egl.get_proc_address("glEGLImageTargetTexture2DOES")
                    .context("glEGLImageTargetTexture2DOES unavailable")?,
            )
        };

        // Create the shared CUDA context up front so import() is pure hot path.
        cuda::context().context("create CUDA context")?;

        let no_ctx = unsafe { egl::Context::from_ptr(egl::NO_CONTEXT) };
        tracing::info!(
            "zero-copy EGL importer ready (GBM platform + GL texture interop, dma_buf_import + modifiers)"
        );
        Ok(EglImporter {
            egl,
            display,
            no_ctx,
            _gl_ctx: gl_ctx,
            egl_image_target,
            blit: None,
            nv12_blit: None,
            vk: None,
            linear_pool: None,
            gbm,
            render_fd,
        })
    }

    /// LINEAR-dmabuf import through the Vulkan bridge; EGL/GL are not involved (NVIDIA's EGL can't
    /// sample LINEAR, and the CUDA driver rejects raw dmabuf fds; Vulkan imports the dmabuf,
    /// GPU-copies into an exportable allocation, and CUDA reads that). See [`super::vulkan`].
    ///
    /// The destination pool is rebuilt whenever `width`×`height` differs from the pool's size,
    /// and the bridge is created on first use.
    pub fn import_linear(
        &mut self,
        plane: &DmabufPlane,
        width: u32,
        height: u32,
    ) -> Result<DeviceBuffer> {
        cuda::make_current()?;

        let pool_fits = matches!(
            &self.linear_pool,
            Some(pool) if (pool.width(), pool.height()) == (width, height)
        );
        if !pool_fits {
            let fresh_pool = cuda::BufferPool::new(width, height)?;
            self.linear_pool = Some(fresh_pool);
        }
        if self.vk.is_none() {
            let bridge = super::vulkan::VkBridge::new()?;
            self.vk = Some(bridge);
        }

        let bridge = self.vk.as_mut().unwrap();
        let pool = self.linear_pool.as_ref().unwrap();
        bridge.import_linear(plane.fd, plane.offset, plane.stride, height, pool)
    }

    /// Ask the EGL stack (`eglQueryDmaBufModifiersEXT`) which DRM format modifiers it can import
    /// for `fourcc`. These are advertised to PipeWire so the compositor allocates a dmabuf in a
    /// layout we can import. Any failure — missing entry point, failed query, zero modifiers —
    /// yields an empty vector (caller falls back).
    pub fn supported_modifiers(&self, fourcc: u32) -> Vec<u64> {
        type QueryFn = unsafe extern "system" fn(
            dpy: *mut c_void,
            format: i32,
            max_modifiers: i32,
            modifiers: *mut u64,
            external_only: *mut u32,
            num_modifiers: *mut i32,
        ) -> u32;

        let query: QueryFn = match self.egl.get_proc_address("eglQueryDmaBufModifiersEXT") {
            Some(sym) => unsafe { std::mem::transmute(sym) },
            None => return Vec::new(),
        };
        let dpy = self.display.as_ptr();
        let format = fourcc as i32;

        // First call: size query only (no output arrays).
        let mut available: i32 = 0;
        let probed = unsafe {
            query(
                dpy,
                format,
                0,
                std::ptr::null_mut(),
                std::ptr::null_mut(),
                &mut available,
            )
        };
        if probed == 0 || available <= 0 {
            return Vec::new();
        }

        // Second call: fetch up to `available` modifiers (the external-only flags are discarded).
        let capacity = available as usize;
        let mut modifiers = vec![0u64; capacity];
        let mut external_only = vec![0u32; capacity];
        let mut returned: i32 = 0;
        let fetched = unsafe {
            query(
                dpy,
                format,
                available,
                modifiers.as_mut_ptr(),
                external_only.as_mut_ptr(),
                &mut returned,
            )
        };
        if fetched == 0 {
            return Vec::new();
        }
        modifiers.truncate(returned.max(0) as usize);
        modifiers
    }

    /// Import a single dmabuf and hand back an owned CUDA buffer holding a device-to-device copy
    /// of it. `fourcc` is the DRM FourCC. `modifier` is the explicit 64-bit DRM format modifier
    /// when one was negotiated; pass `None` to import with the buffer's implicit modifier (base
    /// `EGL_EXT_image_dma_buf_import`, which the NVIDIA driver resolves for its own buffers).
    pub fn import(
        &mut self,
        plane: &DmabufPlane,
        width: u32,
        height: u32,
        fourcc: u32,
        modifier: Option<u64>,
    ) -> Result<DeviceBuffer> {
        // RGB path: no NV12 conversion.
        let convert_to_nv12 = false;
        self.import_inner(plane, width, height, fourcc, modifier, convert_to_nv12)
    }

    /// NV12 variant of [`import`](Self::import): the dmabuf is de-tiled **and converted** to NV12
    /// (BT.709 limited range) on the GPU — the `PUNKTFUNK_NV12` path — so NVENC can encode native
    /// YUV with no internal RGB→YUV CSC. The returned [`DeviceBuffer`] carries both NV12 planes
    /// (`DeviceBuffer::is_nv12`). Only the tiled EGL/GL path supports this (LINEAR/Vulkan stays
    /// RGB).
    pub fn import_nv12(
        &mut self,
        plane: &DmabufPlane,
        width: u32,
        height: u32,
        fourcc: u32,
        modifier: Option<u64>,
    ) -> Result<DeviceBuffer> {
        let convert_to_nv12 = true;
        self.import_inner(plane, width, height, fourcc, modifier, convert_to_nv12)
    }

    /// Shared body of [`import`](Self::import) / [`import_nv12`](Self::import_nv12): wrap the
    /// dmabuf plane in an `EGLImage`, run the RGB blit or the NV12 convert (selected by `nv12`),
    /// and destroy the image again whether or not the copy succeeded.
    fn import_inner(
        &mut self,
        plane: &DmabufPlane,
        width: u32,
        height: u32,
        fourcc: u32,
        modifier: Option<u64>,
        nv12: bool,
    ) -> Result<DeviceBuffer> {
        // Key/value attribute list: size, format and the single plane's fd/offset/pitch.
        let base_attrs: [egl::Attrib; 12] = [
            egl::WIDTH as egl::Attrib,
            width as egl::Attrib,
            egl::HEIGHT as egl::Attrib,
            height as egl::Attrib,
            EGL_LINUX_DRM_FOURCC_EXT,
            fourcc as egl::Attrib,
            EGL_DMA_BUF_PLANE0_FD_EXT,
            plane.fd as egl::Attrib,
            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
            plane.offset as egl::Attrib,
            EGL_DMA_BUF_PLANE0_PITCH_EXT,
            plane.stride as egl::Attrib,
        ];
        let mut attrs: Vec<egl::Attrib> = base_attrs.to_vec();
        // An explicit modifier is split into its low and high 32-bit halves.
        attrs.extend(modifier.into_iter().flat_map(|m| {
            [
                EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
                (m & 0xFFFF_FFFF) as egl::Attrib,
                EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
                (m >> 32) as egl::Attrib,
            ]
        }));
        attrs.push(egl::ATTRIB_NONE);

        // dmabuf imports take a null client buffer; the buffer is described by the attributes.
        let null_client = unsafe { egl::ClientBuffer::from_ptr(std::ptr::null_mut()) };
        let image = self
            .egl
            .create_image(
                self.display,
                self.no_ctx,
                EGL_LINUX_DMA_BUF_EXT,
                null_client,
                &attrs,
            )
            .context("eglCreateImage(EGL_LINUX_DMA_BUF_EXT) — modifier mismatch?")?;

        // EGLImage → (sampled by a shader) → GL_RGBA8 texture (or NV12 R8+RG8 pair) → register
        // *that* with CUDA → map → array → copy out. Registering the EGLImage texture directly
        // fails (its layout isn't a CUDA-registrable format); the render targets are.
        let raw_image = image.as_ptr();
        let outcome = if !nv12 {
            self.blit_and_copy(raw_image, width, height)
        } else {
            self.blit_and_copy_nv12(raw_image, width, height)
        };
        // Best-effort: the pixels were already copied out, so a destroy failure is ignored.
        let _ = self.egl.destroy_image(self.display, image);
        outcome
    }

    /// Draw the dmabuf `image` into the CUDA-registrable RGBA8 texture, then copy that texture
    /// into a pooled, owned CUDA buffer. The per-size GL blit machinery is built on first use and
    /// rebuilt whenever the frame size changes.
    fn blit_and_copy(
        &mut self,
        image: *mut c_void,
        width: u32,
        height: u32,
    ) -> Result<DeviceBuffer> {
        cuda::make_current()?;

        let size_matches =
            matches!(&self.blit, Some(b) if (b.width, b.height) == (width, height));
        if !size_matches {
            let rebuilt = unsafe { GlBlit::new(width, height)? };
            self.blit = Some(rebuilt);
        }

        let retarget = self.egl_image_target;
        let blit = self.blit.as_mut().unwrap();
        // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
        unsafe { blit.run(retarget, image)? };

        // Persistent registration (mapped per frame) + a pooled buffer — no per-frame
        // cuGraphicsGLRegisterImage / cuMemAllocPitch.
        let frame = blit.pool.get()?;
        blit.registered.copy_mapped_to(&frame)?;
        Ok(frame)
    }

    /// NV12 counterpart of [`blit_and_copy`]: convert the dmabuf `image` (Y into an R8 texture,
    /// UV into an RG8 texture) and copy both planes into a pooled NV12 [`DeviceBuffer`]. The
    /// per-size convert machinery is built on first use and rebuilt when the frame size changes.
    /// This is the `PUNKTFUNK_NV12` path.
    fn blit_and_copy_nv12(
        &mut self,
        image: *mut c_void,
        width: u32,
        height: u32,
    ) -> Result<DeviceBuffer> {
        cuda::make_current()?;

        let size_matches =
            matches!(&self.nv12_blit, Some(b) if (b.width, b.height) == (width, height));
        if !size_matches {
            let rebuilt = unsafe { Nv12Blit::new(width, height)? };
            self.nv12_blit = Some(rebuilt);
        }

        let retarget = self.egl_image_target;
        let converter = self.nv12_blit.as_mut().unwrap();
        // SAFETY: GL + CUDA contexts current on this thread; `image` is a valid EGLImage.
        unsafe { converter.run(retarget, image)? };

        let frame = converter.pool.get()?;
        cuda::copy_mapped_nv12(
            &mut converter.y_registered,
            &mut converter.uv_registered,
            &frame,
        )?;
        Ok(frame)
    }

    /// Self-test entry: upload a packed `width`×`height` RGBA8 host pattern into a GL texture, run
    /// the NV12 convert passes on the GPU, and copy both planes into a pooled NV12 [`DeviceBuffer`].
    /// Exercises the exact shaders + CUDA copy the live path uses, but sourced from an uploaded
    /// texture instead of a dmabuf EGLImage (no compositor needed). `rgba` is tightly packed, 4 B/px.
    pub fn convert_rgba_for_test(
        &mut self,
        rgba: &[u8],
        width: u32,
        height: u32,
    ) -> Result<DeviceBuffer> {
        anyhow::ensure!(
            rgba.len() == width as usize * height as usize * 4,
            "test RGBA buffer {} bytes != {}x{}x4",
            rgba.len(),
            width,
            height
        );
        cuda::make_current()?;
        if self.nv12_blit.as_ref().map(|b| (b.width, b.height)) != Some((width, height)) {
            self.nv12_blit = Some(unsafe { Nv12Blit::new(width, height)? });
        }
        let blit = self.nv12_blit.as_mut().unwrap();
        unsafe {
            // Upload the host RGBA into `src_tex` (an immutable GL_RGBA8 backing must exist first;
            // the live path never allocates it — it retargets `src_tex` via EGLImage instead).
            glBindTexture(GL_TEXTURE_2D, blit.src_tex);
            if !blit.test_src_storage {
                glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, width as c_int, height as c_int);
                blit.test_src_storage = true;
            }
            let _ = glGetError();
            glTexSubImage2D(
                GL_TEXTURE_2D,
                0,
                0,
                0,
                width as c_int,
                height as c_int,
                GL_RGBA,
                GL_UNSIGNED_BYTE,
                rgba.as_ptr() as *const c_void,
            );
            let e = glGetError();
            glBindTexture(GL_TEXTURE_2D, 0);
            ensure!(e == 0, "glTexSubImage2D(test source) failed ({e:#x})");
            blit.run_passes()?;
        }
        let dst = blit.pool.get()?;
        cuda::copy_mapped_nv12(&mut blit.y_registered, &mut blit.uv_registered, &dst)?;
        Ok(dst)
    }
}

/// Tears the importer down in dependency order.
///
/// Field drops run only *after* this body, so without the explicit resets below the GL blits and
/// their CUDA registrations would be released after the GBM device was destroyed and the render
/// node closed. The order here is: GPU resources, then the GL context and display, then the GBM
/// device, then the fd. EGL errors during teardown are ignored (best effort).
impl Drop for EglImporter {
    fn drop(&mut self) {
        // 1. Resources that live inside the GL/CUDA/Vulkan contexts, while those still exist.
        self.blit = None;
        self.nv12_blit = None;
        self.vk = None;
        self.linear_pool = None;

        // 2. Release and destroy the GL context, then terminate the display built on `gbm`.
        let _ = self.egl.make_current(self.display, None, None, None);
        let _ = self.egl.destroy_context(self.display, self._gl_ctx);
        let _ = self.egl.terminate(self.display);

        // 3. Only now is it safe to destroy the native display and close its render node.
        if !self.gbm.is_null() {
            unsafe { gbm_device_destroy(self.gbm) };
        }
        if self.render_fd >= 0 {
            unsafe { libc::close(self.render_fd) };
        }
    }
}