From b488bd1d99b5bb1af23802f245fecd71d8a72979 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Sat, 4 Jul 2026 11:59:53 +0000 Subject: [PATCH] =?UTF-8?q?feat(client-linux):=20in-process=20GL=20present?= =?UTF-8?q?er=20=E2=80=94=20hardware=20decode=20ships=20on=20the=20Steam?= =?UTF-8?q?=20Deck?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VAAPI decode stays; what changes is who touches the YUV. The direct path hands the NV12 dmabuf (tiled AMD modifier since Mesa 25.1) to GdkDmabufTexture, and GTK's tiled-NV12 import renders corrupt/gray/washed-out on the Deck. Moonlight and mpv are clean on the same box because they import the dmabuf into their own EGL context and convert with their own shader — video_gl.rs is that architecture for the GTK client: per-plane EGLImages (R8 + GR88, modifier passed through) → our YUV→RGB shader (matrix/range from the stream's CICP signaling, unit-tested) → RGBA texture in a GdkGLContext-shared context → fence-synced GdkGLTexture. GTK composites plain RGBA; no YUV negotiation, no compositor CSC. The Deck's decoder default flips back to hardware (the software stopgap is gone); desktops keep the direct dmabuf path (offload/scan-out eligible). PUNKTFUNK_PRESENT=direct|gl overrides either way. New failure ladder: GL converter init failure or a convert-error streak raises a shared flag and the session pump demotes the decoder to software with a keyframe re-request — the same mechanism also closes the old silent-black-screen gap where a rejected dmabuf import had no recovery at all. 
Co-Authored-By: Claude Fable 5 --- Cargo.lock | 1 + clients/linux/Cargo.toml | 3 + clients/linux/src/launch.rs | 9 + clients/linux/src/main.rs | 2 + clients/linux/src/session.rs | 15 + clients/linux/src/ui_stream.rs | 68 ++++ clients/linux/src/video.rs | 40 +- clients/linux/src/video_gl.rs | 662 +++++++++++++++++++++++++++++++++ 8 files changed, 781 insertions(+), 19 deletions(-) create mode 100644 clients/linux/src/video_gl.rs diff --git a/Cargo.lock b/Cargo.lock index c942981..50a8338 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2895,6 +2895,7 @@ dependencies = [ "async-channel", "ffmpeg-next", "gtk4", + "khronos-egl", "libadwaita", "mdns-sd", "opus", diff --git a/clients/linux/Cargo.toml b/clients/linux/Cargo.toml index 005b898..8689814 100644 --- a/clients/linux/Cargo.toml +++ b/clients/linux/Cargo.toml @@ -31,6 +31,9 @@ pipewire = "0.9" # Gamepads: capture + feedback (full DualSense fidelity — touchpad/motion/triggers/LEDs # need the hidapi driver). sdl3 = { version = "0.18", features = ["hidapi"] } +# The VAAPI GL presenter (video_gl.rs): EGL dmabuf import into a GDK-shared context, dlopened +# at runtime (`dynamic`) so GPU-less boxes and the software path never touch libEGL. +khronos-egl = { version = "6", features = ["dynamic"] } mdns-sd = "0.20" # Game-library fetch from the host's management API over mTLS + fingerprint pinning. diff --git a/clients/linux/src/launch.rs b/clients/linux/src/launch.rs index ecf036f..bca3844 100644 --- a/clients/linux/src/launch.rs +++ b/clients/linux/src/launch.rs @@ -106,6 +106,9 @@ pub fn start_session_with( } let mode = resolve_mode(&app); let s = app.settings.borrow(); + // The presenter raises this when hardware frames can't be displayed; the session pump + // demotes the decoder to software (see `SessionParams::force_software`). 
+ let force_software = Arc::new(AtomicBool::new(false)); let params = SessionParams { host: req.addr.clone(), port: req.port, @@ -125,6 +128,7 @@ pub fn start_session_with( pin, identity: app.identity.clone(), connect_timeout: opts.connect_timeout, + force_software: force_software.clone(), }; let inhibit = s.inhibit_shortcuts; let show_stats = s.show_stats; @@ -149,6 +153,7 @@ pub fn start_session_with( inhibit, show_stats, frames: Some(frames), + force_software, waiting: opts.waiting, page: None, }; @@ -198,6 +203,9 @@ struct SessionUi { stop: Arc, /// Decoded-frame receiver, handed to the stream page once on `Connected`. frames: Option>, + /// Shared with the session pump — the stream page's presenter raises it to demote + /// the decoder to software when hardware frames can't be displayed. + force_software: Arc, /// The "waiting for approval" dialog (request-access flow), dismissed on the first event. waiting: Option, page: Option, @@ -259,6 +267,7 @@ impl SessionUi { window: self.app.window.clone(), connector, frames: self.frames.take().expect("Connected delivered once"), + force_software: self.force_software.clone(), clock_offset_ns, escape_rx: self.app.gamepad.escape_events(), disconnect_rx: self.app.gamepad.disconnect_events(), diff --git a/clients/linux/src/main.rs b/clients/linux/src/main.rs index 578f408..92823be 100644 --- a/clients/linux/src/main.rs +++ b/clients/linux/src/main.rs @@ -39,6 +39,8 @@ mod ui_stream; mod ui_trust; #[cfg(target_os = "linux")] mod video; +#[cfg(target_os = "linux")] +mod video_gl; #[cfg(target_os = "linux")] fn main() -> gtk::glib::ExitCode { diff --git a/clients/linux/src/session.rs b/clients/linux/src/session.rs index ce3fc69..2f41226 100644 --- a/clients/linux/src/session.rs +++ b/clients/linux/src/session.rs @@ -43,6 +43,11 @@ pub struct SessionParams { /// connection until the operator clicks Approve in its console (so this must exceed the /// host's approval window — see `PENDING_APPROVAL_WAIT`). 
pub connect_timeout: Duration, + /// Raised by the PRESENTER when hardware frames can't be displayed (GL converter init + /// failed / dmabuf import rejected): the pump demotes the decoder to software and + /// re-requests a keyframe. Decode itself succeeds in that state, so nothing else + /// would recover — without this the stream stays black. + pub force_software: Arc, } /// The session pump's share of the unified stats window (design/stats-unification.md): @@ -238,6 +243,7 @@ fn pump( return; } }; + let force_software = params.force_software.clone(); // Audio is best-effort: a session without it still streams. Gamepads are the // app-lifetime service's job (the UI attaches it on Connected). Audio runs on its own // thread (one puller per plane), blocking on the audio queue like the Apple client. @@ -331,6 +337,15 @@ fn pump( // Survivable (loss until the next IDR/RFI recovery) — keep feeding. Err(e) => tracing::debug!(error = %e, "decode error (recovering)"), } + // The presenter's verdict: hardware frames can't be displayed (GL converter + // init failed / dmabuf import rejected) — demote to software here, on the + // decoder's own thread. Decode succeeds in that state, so the error-streak + // demotion above never fires. + if force_software.swap(false, Ordering::Relaxed) { + if let Err(e) = decoder.force_software() { + break Some(format!("software decoder rebuild: {e}")); + } + } // A decode error / VAAPI→software demotion asks for a fresh IDR: the infinite // GOP has no periodic keyframe, so a rebuilt/erroring decoder would stay // gray/frozen until an unrelated packet drop happened to request one. 
Route it diff --git a/clients/linux/src/ui_stream.rs b/clients/linux/src/ui_stream.rs index 1ecbfd7..2fd0e57 100644 --- a/clients/linux/src/ui_stream.rs +++ b/clients/linux/src/ui_stream.rs @@ -111,6 +111,10 @@ pub struct StreamPageArgs { pub window: adw::ApplicationWindow, pub connector: Arc, pub frames: async_channel::Receiver, + /// Shared with the session pump: the presenter raises it when hardware frames can't + /// be displayed (GL converter init failed / dmabuf import rejected) and the pump + /// demotes the decoder to software. + pub force_software: Arc, /// Host-clock offset from the session's clock handshake — added to the local wall /// clock to express paintable-set time in the host's capture clock (present latency). pub clock_offset_ns: i64, @@ -253,6 +257,7 @@ pub fn new(args: StreamPageArgs) -> StreamPage { window, connector, frames, + force_software, clock_offset_ns, escape_rx, disconnect_rx, @@ -291,6 +296,7 @@ pub fn new(args: StreamPageArgs) -> StreamPage { spawn_frame_consumer( &w.picture, frames, + force_software, clock_offset_ns, presented.clone(), hdr.clone(), @@ -584,9 +590,33 @@ impl ColorStateCache { } } +/// How hardware (dmabuf) frames reach the screen. +#[derive(PartialEq, Clone, Copy)] +enum HwPresent { + /// Hand the NV12 dmabuf straight to `GdkDmabufTexture` — GTK (or the compositor via + /// offload) imports + converts. The desktop default: subsurface/scan-out eligible. + Direct, + /// Convert in-process first (`video_gl`): own EGL import + own YUV→RGB shader → RGBA + /// `GdkGLTexture`. The Steam Deck default — GTK's tiled-NV12 import is broken there + /// (Mesa ≥ 25.1 tiled VCN export), and this is the Moonlight-proven route around it. 
+ Gl, +} + +impl HwPresent { + fn pick() -> HwPresent { + match std::env::var("PUNKTFUNK_PRESENT").ok().as_deref() { + Some("direct") => HwPresent::Direct, + Some("gl") => HwPresent::Gl, + _ if crate::gamepad::is_steam_deck() => HwPresent::Gl, + _ => HwPresent::Direct, + } + } +} + fn spawn_frame_consumer( picture: >k::Picture, frames: async_channel::Receiver, + force_software: Arc, clock_offset_ns: i64, presented_stats: Rc, hdr: Rc>, @@ -599,6 +629,11 @@ fn spawn_frame_consumer( // (SDR↔HDR flip) just rebuilds once. let mut yuv_state = ColorStateCache::default(); let mut rgb_state = ColorStateCache::default(); + let hw_present = HwPresent::pick(); + // Lazy (first dmabuf frame) so software-decode sessions never touch EGL. `Err` after + // a failed init = don't retry every frame. + let mut gl_conv: Option> = None; + let mut gl_fails = 0u32; glib::spawn_future_local(async move { // Window samples (µs): end-to-end capture→displayed (host-clock corrected) and // the client-local display stage decoded→displayed. @@ -646,6 +681,39 @@ fn spawn_frame_consumer( picture.set_paintable(Some(&tex)); presented = true; } + DecodedImage::Dmabuf(d) if hw_present == HwPresent::Gl => { + // In-process conversion (see `HwPresent::Gl`). Init once; a failed + // init or a streak of convert failures demotes the DECODER to + // software via the shared flag — never fall back to the direct path + // here, it's the known-broken one on this hardware. 
+ let conv = gl_conv.get_or_insert_with(|| { + crate::video_gl::GlConverter::new(&picture).map_err(|e| { + tracing::warn!(error = %format!("{e:#}"), + "GL presenter unavailable — demoting to software decode"); + }) + }); + match conv { + Ok(c) => { + let color = d.color; + match c.convert(d, rgb_state.get(color, true).as_ref()) { + Ok(tex) => { + gl_fails = 0; + picture.set_paintable(Some(&tex)); + presented = true; + } + Err(e) => { + gl_fails += 1; + tracing::warn!(error = %format!("{e:#}"), fails = gl_fails, + "GL convert failed"); + if gl_fails >= 3 { + force_software.store(true, Ordering::Relaxed); + } + } + } + } + Err(()) => force_software.store(true, Ordering::Relaxed), + } + } DecodedImage::Dmabuf(d) => { let mut b = gdk::DmabufTextureBuilder::new() .set_display(&picture.display()) diff --git a/clients/linux/src/video.rs b/clients/linux/src/video.rs index 0c8beb1..dd5dbc0 100644 --- a/clients/linux/src/video.rs +++ b/clients/linux/src/video.rs @@ -187,25 +187,12 @@ impl Decoder { .ok() .filter(|v| !v.is_empty()) .unwrap_or_else(|| pref.to_string()); - // The Steam Deck's VAAPI zero-copy path renders corrupt/gray/washed-out — validated live; - // software decode is clean, correct-colour, and the Deck's APU handles 1280×800 HEVC - // easily. Likely cause: since Mesa 25.1 radeonsi exports VCN decode surfaces TILED (with - // AMD modifiers) instead of linear, and inside the Flatpak both the VAAPI driver and GTK's - // GL come from the runtime's Mesa 26.x — GTK's tiled-NV12 dmabuf import mishandles the new - // layout (desktop AMD/Intel boxes validated Tier-1 ran distro Mesa with linear export). - // So `auto` resolves to software on a Deck; an explicit `vaapi` (Settings or - // PUNKTFUNK_DECODER=vaapi) still forces the hw path for testing — the first-frame - // descriptor dump logs the modifier (LINEAR = 0x0), and GSK_RENDERER=ngl|vulkan bisects - // the import side. 
- let choice = if (choice == "auto" || choice.is_empty()) && crate::gamepad::is_steam_deck() { - tracing::info!( - "Steam Deck — defaulting to software decode (AMD VAAPI dmabuf is broken on this \ - SteamOS+Mesa combo); set the decoder to `vaapi` to override" - ); - "software".to_string() - } else { - choice - }; + // Deck note: `auto` means VAAPI here too. GTK's tiled-NV12 dmabuf import is broken on + // the Deck (Mesa ≥ 25.1 exports VCN surfaces TILED; artifacts/gray/washed-out), but the + // presenter routes Deck frames through the in-process GL converter (`video_gl`) instead + // of GdkDmabufTexture — and if THAT can't initialize, it demotes this decoder to + // software mid-session via [`Decoder::force_software`]. The broken direct path is never + // the fallback. if choice != "software" { match VaapiDecoder::new(codec_id) { Ok(v) => { @@ -239,6 +226,21 @@ impl Decoder { std::mem::take(&mut self.want_keyframe) } + /// Demote to software decode on the PRESENTER's verdict (dmabuf presentation impossible: + /// GL converter init failed, texture import rejected). Decode itself succeeds in that + /// state, so the error-streak demotion never fires — without this the stream would stay + /// black forever. No-op when already software. + pub fn force_software(&mut self) -> Result<()> { + if matches!(self.backend, Backend::Software(_)) { + return Ok(()); + } + tracing::warn!("presenter can't display hardware frames — demoting to software decode"); + self.backend = Backend::Software(SoftwareDecoder::new(self.codec_id)?); + self.vaapi_fails = 0; + self.want_keyframe = true; + Ok(()) + } + /// Feed one access unit; returns the decoded frame (the host's streams are /// one-in/one-out). A software decode error after packet loss is survivable — log /// upstream and keep feeding. 
A VAAPI error re-requests an IDR and retries the hardware diff --git a/clients/linux/src/video_gl.rs b/clients/linux/src/video_gl.rs new file mode 100644 index 0000000..7b4512d --- /dev/null +++ b/clients/linux/src/video_gl.rs @@ -0,0 +1,662 @@ +//! VAAPI dmabuf → RGBA GL texture converter — the Steam Deck's hardware-decode presenter. +//! +//! The direct path hands the decoder's NV12 dmabuf (fds + AMD tiled modifier) to +//! `GdkDmabufTexture` and lets GTK import + color-convert it. On the Deck that renders +//! corrupt/gray/washed-out: since Mesa 25.1 radeonsi exports VCN decode surfaces TILED, and +//! GTK's tiled-NV12 import mishandles the layout (the Flatpak runtime's Mesa drives both +//! sides). Moonlight-qt and mpv are clean on the same box because they never let a toolkit +//! near the YUV: they import the dmabuf into their own EGL context and convert with their +//! own shader. This module is that architecture for the GTK client: +//! +//! VAAPI frame → per-plane `EGLImage`s (R8 luma + GR88 chroma, modifier passed through) +//! → our YUV→RGB shader (matrix + range from the stream's real CICP signaling) +//! → an RGBA texture in a `GdkGLContext`-shared context → `GdkGLTexture` (fence-synced). +//! +//! GTK then composites a plain RGBA texture — no YUV format negotiation, no modifier +//! handling, no compositor CSC. Same-Mesa export/import is the exact proven-working path. +//! Everything runs on the GTK main thread (the converter is driven by the frame consumer); +//! one 800p–4K NV12→RGB pass is sub-millisecond GPU work. +//! +//! Failure at any step (GLX-backed GDK context, missing EGL extensions, import rejection) +//! is surfaced as an error — the caller falls back to software decode, never to the broken +//! direct path. 
+ +use crate::video::{ColorDesc, DmabufFrame}; +use anyhow::{anyhow, bail, Context as _, Result}; +use gtk::{gdk, prelude::*}; +use khronos_egl as egl; +use std::ffi::c_void; +use std::sync::{Arc, Mutex}; + +// --- EGL_EXT_image_dma_buf_import(+_modifiers) constants (khronos-egl exposes none) ------ +const EGL_LINUX_DMA_BUF_EXT: egl::Enum = 0x3270; +const EGL_LINUX_DRM_FOURCC_EXT: usize = 0x3271; +const EGL_DMA_BUF_PLANE0_FD_EXT: usize = 0x3272; +const EGL_DMA_BUF_PLANE0_OFFSET_EXT: usize = 0x3273; +const EGL_DMA_BUF_PLANE0_PITCH_EXT: usize = 0x3274; +const EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT: usize = 0x3443; +const EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT: usize = 0x3444; +const EGL_WIDTH: usize = 0x3057; +const EGL_HEIGHT: usize = 0x3056; +const EGL_NONE: usize = 0x3038; +const DRM_FORMAT_MOD_INVALID: u64 = 0x00ff_ffff_ffff_ffff; + +/// `fourcc('N','V','1','2')` — the only decoder output today (8-bit 4:2:0). P010 joins when +/// the Linux host grows 10-bit. +const DRM_FORMAT_NV12: u32 = 0x3231_564e; +const DRM_FORMAT_R8: u32 = 0x2020_3852; +const DRM_FORMAT_GR88: u32 = 0x3838_5247; + +// --- The slice of GL we use (loaded via eglGetProcAddress — Mesa/NVIDIA both implement +// --- EGL_KHR_get_all_proc_addresses, so core functions resolve too) ---------------------- +const GL_TEXTURE_2D: u32 = 0x0DE1; +const GL_TEXTURE0: u32 = 0x84C0; +const GL_TEXTURE_MIN_FILTER: u32 = 0x2801; +const GL_TEXTURE_MAG_FILTER: u32 = 0x2800; +const GL_TEXTURE_WRAP_S: u32 = 0x2802; +const GL_TEXTURE_WRAP_T: u32 = 0x2803; +const GL_LINEAR: i32 = 0x2601; +const GL_CLAMP_TO_EDGE: i32 = 0x812F; +const GL_FRAMEBUFFER: u32 = 0x8D40; +const GL_COLOR_ATTACHMENT0: u32 = 0x8CE0; +const GL_FRAMEBUFFER_COMPLETE: u32 = 0x8CD5; +const GL_RGBA8: u32 = 0x8058; +const GL_RGBA: u32 = 0x1908; +const GL_UNSIGNED_BYTE: u32 = 0x1401; +const GL_TRIANGLES: u32 = 0x0004; +const GL_VERTEX_SHADER: u32 = 0x8B31; +const GL_FRAGMENT_SHADER: u32 = 0x8B30; +const GL_COMPILE_STATUS: u32 = 0x8B81; +const GL_LINK_STATUS: u32 = 
0x8B82; +const GL_SYNC_GPU_COMMANDS_COMPLETE: u32 = 0x9117; + +macro_rules! gl_fns { + ($($name:ident : fn($($arg:ty),*) $(-> $ret:ty)?;)*) => { + #[allow(non_snake_case)] + struct GlFns { $($name: unsafe extern "C" fn($($arg),*) $(-> $ret)?,)* } + impl GlFns { + #[allow(non_snake_case)] + fn load(egl: &Egl) -> Result { + $( + // eglGetProcAddress returns a plain fn pointer; the signature is fixed + // by the GL spec for each name. + let $name = egl + .get_proc_address(concat!("gl", stringify!($name))) + .ok_or_else(|| anyhow!(concat!("gl", stringify!($name), " unresolvable")))?; + )* + // SAFETY: each pointer came from eglGetProcAddress for exactly that GL entry + // point; the transmute only fixes the signature the spec defines for it. + unsafe { + Ok(GlFns { $($name: std::mem::transmute:: $ret)?>($name),)* }) + } + } + } + }; +} + +gl_fns! { + GenTextures: fn(i32, *mut u32); + DeleteTextures: fn(i32, *const u32); + BindTexture: fn(u32, u32); + TexParameteri: fn(u32, u32, i32); + TexImage2D: fn(u32, i32, i32, i32, i32, i32, u32, u32, *const c_void); + ActiveTexture: fn(u32); + EGLImageTargetTexture2DOES: fn(u32, *const c_void); + GenFramebuffers: fn(i32, *mut u32); + DeleteFramebuffers: fn(i32, *const u32); + BindFramebuffer: fn(u32, u32); + FramebufferTexture2D: fn(u32, u32, u32, u32, i32); + CheckFramebufferStatus: fn(u32) -> u32; + Viewport: fn(i32, i32, i32, i32); + CreateShader: fn(u32) -> u32; + ShaderSource: fn(u32, i32, *const *const u8, *const i32); + CompileShader: fn(u32); + GetShaderiv: fn(u32, u32, *mut i32); + GetShaderInfoLog: fn(u32, i32, *mut i32, *mut u8); + DeleteShader: fn(u32); + CreateProgram: fn() -> u32; + AttachShader: fn(u32, u32); + LinkProgram: fn(u32); + GetProgramiv: fn(u32, u32, *mut i32); + UseProgram: fn(u32); + GetUniformLocation: fn(u32, *const u8) -> i32; + Uniform1i: fn(i32, i32); + Uniform3fv: fn(i32, i32, *const f32); + UniformMatrix3fv: fn(i32, i32, u8, *const f32); + GenVertexArrays: fn(i32, *mut u32); + 
+    DeleteVertexArrays: fn(i32, *const u32);
+    DeleteProgram: fn(u32);
+    BindVertexArray: fn(u32);
+    DrawArrays: fn(u32, i32, i32);
+    FenceSync: fn(u32, u32) -> *const c_void;
+    DeleteSync: fn(*const c_void);
+    Flush: fn();
+    GetError: fn() -> u32;
+}
+
+type Egl = egl::DynamicInstance;
+type EglCreateImageKhr = unsafe extern "C" fn(
+    *mut c_void, // EGLDisplay
+    *mut c_void, // EGLContext (EGL_NO_CONTEXT for dmabuf)
+    egl::Enum,
+    *mut c_void, // EGLClientBuffer (null for dmabuf)
+    *const usize,
+) -> *const c_void;
+type EglDestroyImageKhr = unsafe extern "C" fn(*mut c_void, *const c_void) -> egl::Boolean;
+
+/// The YUV→RGB conversion for a stream's CICP signaling: `rgb = mat * (yuv + off)`, with the
+/// limited/full-range expansion folded in. Pure — unit-tested against the reference
+/// white/black points.
+///
+/// Returns `(mat, off)`:
+/// - `mat` is a 3×3 matrix in column-major order (GL convention): the three columns are the
+///   Y, U and V contributions to (R, G, B).
+/// - `off` is added to the sampled, normalized (0..1) YUV triple *before* the matrix. The
+///   luma offset removes the limited-range black level (code 16); the chroma offsets
+///   re-centre Cb/Cr on zero.
+pub fn yuv_to_rgb(desc: ColorDesc) -> ([f32; 9], [f32; 3]) {
+    // BT.601 (5/6), BT.2020 (9/10); everything else — incl. unspecified — is the host's
+    // BT.709 SDR default (mirrors the software path's swscale coefficient choice).
+    let (kr, kb) = match desc.matrix {
+        5 | 6 => (0.299, 0.114),
+        9 | 10 => (0.2627, 0.0593),
+        _ => (0.2126, 0.0722),
+    };
+    let kg = 1.0 - kr - kb;
+    // Luma scale, luma offset, chroma scale: identity for full range, 219/224-step
+    // expansion for limited range.
+    let (sy, oy, sc) = if desc.full_range {
+        (1.0f32, 0.0f32, 1.0f32)
+    } else {
+        (255.0 / 219.0, -16.0 / 255.0, 255.0 / 224.0)
+    };
+    // 8-bit Cb/Cr are centred on code 128 in BOTH ranges, and a normalized sampler returns
+    // code/255 — so the zero-chroma point is 128/255, not 0.5. Using 0.5 leaves a residual
+    // of 1/510 on each chroma channel, i.e. a tint of about one code value on neutral gray.
+    let oc = -128.0f32 / 255.0;
+    let (kr, kb, kg) = (kr as f32, kb as f32, kg as f32);
+    // Column-major: columns are the Y, U, V contributions to (R, G, B).
+    let mat = [
+        sy,
+        sy,
+        sy, // Y column
+        0.0,
+        -2.0 * (1.0 - kb) * kb / kg * sc,
+        2.0 * (1.0 - kb) * sc, // U column
+        2.0 * (1.0 - kr) * sc,
+        -2.0 * (1.0 - kr) * kr / kg * sc,
+        0.0, // V column
+    ];
+    (mat, [oy, oc, oc])
+}
+
+/// An output texture GTK has released, waiting to be recycled (or its fence deleted). GL
+/// objects can only be touched with our context current, so releases park here and
+/// [`GlConverter::convert`] drains them.
+struct Retired { + tex: u32, + sync: usize, // GLsync as usize — the release closure must be Send + size: (u32, u32), +} + +pub struct GlConverter { + ctx: gdk::GLContext, + egl: Egl, + egl_display: *mut c_void, + create_image: EglCreateImageKhr, + destroy_image: EglDestroyImageKhr, + gl: GlFns, + program: u32, + vao: u32, + fbo: u32, + u_mat: i32, + u_off: i32, + /// Uniforms match this signaling; a change (mid-stream SDR↔HDR) re-uploads them. + uniforms_for: Option, + /// Free output textures + fences returned by GTK's release funcs (shared with the + /// `Send` release closures; drained/recycled at each convert). + retired: Arc>>, +} + +impl GlConverter { + /// Build against the widget's display. Must run on the GTK main thread; fails cleanly + /// on a GLX-backed GDK context or missing EGL dmabuf-import extensions (the caller + /// falls back to software decode). + pub fn new(widget: &impl IsA) -> Result { + let display = widget.display(); + let ctx = display.create_gl_context().context("create GdkGLContext")?; + ctx.realize().context("realize GdkGLContext")?; + ctx.make_current(); + + // SAFETY (whole block): the GdkGLContext is current on this thread, so EGL/GL + // queries and object creation target it; pointers are only used while it lives. + unsafe { + let egl = Egl::load_required().context("dlopen libEGL")?; + let egl_display = egl + .get_current_display() + .ok_or_else(|| anyhow!("GDK context is not EGL-backed (GLX?)"))?; + let exts = egl + .query_string(Some(egl_display), egl::EXTENSIONS) + .context("EGL_EXTENSIONS")? + .to_string_lossy() + .into_owned(); + for need in ["EGL_EXT_image_dma_buf_import", "EGL_KHR_image_base"] { + if !exts.contains(need) { + bail!("EGL lacks {need}"); + } + } + // Tiled surfaces carry an explicit modifier — without the _modifiers extension + // the import would silently assume implied/linear and sample garbage. 
+ if !exts.contains("EGL_EXT_image_dma_buf_import_modifiers") { + bail!("EGL lacks EGL_EXT_image_dma_buf_import_modifiers"); + } + let create_image: EglCreateImageKhr = + std::mem::transmute::( + egl.get_proc_address("eglCreateImageKHR") + .ok_or_else(|| anyhow!("no eglCreateImageKHR"))?, + ); + let destroy_image: EglDestroyImageKhr = + std::mem::transmute::( + egl.get_proc_address("eglDestroyImageKHR") + .ok_or_else(|| anyhow!("no eglDestroyImageKHR"))?, + ); + let gl = GlFns::load(&egl)?; + + let es = ctx.api().contains(gdk::GLAPI::GLES); + let program = build_program(&gl, es)?; + (gl.UseProgram)(program); + let u_mat = (gl.GetUniformLocation)(program, c"u_mat".as_ptr() as *const u8); + let u_off = (gl.GetUniformLocation)(program, c"u_off".as_ptr() as *const u8); + let u_y = (gl.GetUniformLocation)(program, c"u_y".as_ptr() as *const u8); + let u_c = (gl.GetUniformLocation)(program, c"u_c".as_ptr() as *const u8); + (gl.Uniform1i)(u_y, 0); + (gl.Uniform1i)(u_c, 1); + let mut vao = 0u32; + (gl.GenVertexArrays)(1, &mut vao); + let mut fbo = 0u32; + (gl.GenFramebuffers)(1, &mut fbo); + + tracing::info!( + gles = es, + "GL presenter ready — VAAPI dmabufs convert in-process (own EGL import + shader)" + ); + Ok(GlConverter { + ctx, + egl, + egl_display: egl_display.as_ptr(), + create_image, + destroy_image, + gl, + program, + vao, + fbo, + u_mat, + u_off, + uniforms_for: None, + retired: Arc::new(Mutex::new(Vec::new())), + }) + } + } + + /// Convert one decoded frame into an RGBA `GdkTexture`. The source surface (guard) is + /// held until GTK releases the output texture — the GPU read is long finished by then. + /// `color_state` tags the output (full-range RGB, transfer left baked — same semantics + /// as the software path's tagged `GdkMemoryTexture`); `None` = untagged sRGB. 
+ pub fn convert( + &mut self, + frame: DmabufFrame, + color_state: Option<&gdk::ColorState>, + ) -> Result { + if frame.fourcc != DRM_FORMAT_NV12 { + bail!("GL presenter handles NV12 only (got {:#x})", frame.fourcc); + } + if frame.planes.len() < 2 { + bail!("NV12 needs 2 planes (got {})", frame.planes.len()); + } + self.ctx.make_current(); + let gl = &self.gl; + + // SAFETY (whole body): our context is current; every GL/EGL object created here is + // either destroyed before return or owned by the pool/release machinery. + unsafe { + // Recycle what GTK released since last frame (GL objects need the context, so + // the release closures only park entries — this is where they die/revive). + let size = (frame.width, frame.height); + let mut out_tex = 0u32; + { + let mut retired = self.retired.lock().unwrap(); + retired.retain_mut(|r| { + if r.sync != 0 { + (gl.DeleteSync)(r.sync as *const c_void); + r.sync = 0; + } + if out_tex == 0 && r.size == size { + out_tex = r.tex; + false + } else if r.size != size { + (gl.DeleteTextures)(1, &r.tex); // stale size (mode change) + false + } else { + true // spare same-size texture for a later frame + } + }); + } + if out_tex == 0 { + (gl.GenTextures)(1, &mut out_tex); + (gl.BindTexture)(GL_TEXTURE_2D, out_tex); + (gl.TexParameteri)(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + (gl.TexParameteri)(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + (gl.TexImage2D)( + GL_TEXTURE_2D, + 0, + GL_RGBA8 as i32, + frame.width as i32, + frame.height as i32, + 0, + GL_RGBA, + GL_UNSIGNED_BYTE, + std::ptr::null(), + ); + } + + // Import both planes with the surface's modifier — exactly the layer-wise + // import Moonlight/mpv drive on this hardware. 
+ let y = &frame.planes[0]; + let c = &frame.planes[1]; + let img_y = + self.plane_image(frame.width, frame.height, DRM_FORMAT_R8, y, frame.modifier)?; + let img_c = match self.plane_image( + frame.width.div_ceil(2), + frame.height.div_ceil(2), + DRM_FORMAT_GR88, + c, + frame.modifier, + ) { + Ok(img) => img, + Err(e) => { + (self.destroy_image)(self.egl_display, img_y); + return Err(e); + } + }; + + let mut planes = [0u32; 2]; + (gl.GenTextures)(2, planes.as_mut_ptr()); + for (tex, img) in planes.iter().zip([img_y, img_c]) { + (gl.BindTexture)(GL_TEXTURE_2D, *tex); + (gl.TexParameteri)(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + (gl.TexParameteri)(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + (gl.TexParameteri)(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + (gl.TexParameteri)(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + (gl.EGLImageTargetTexture2DOES)(GL_TEXTURE_2D, img); + } + + (gl.UseProgram)(self.program); + if self.uniforms_for != Some(frame.color) { + let (mat, off) = yuv_to_rgb(frame.color); + (gl.UniformMatrix3fv)(self.u_mat, 1, 0, mat.as_ptr()); + (gl.Uniform3fv)(self.u_off, 1, off.as_ptr()); + self.uniforms_for = Some(frame.color); + } + (gl.BindFramebuffer)(GL_FRAMEBUFFER, self.fbo); + (gl.FramebufferTexture2D)( + GL_FRAMEBUFFER, + GL_COLOR_ATTACHMENT0, + GL_TEXTURE_2D, + out_tex, + 0, + ); + let status = (gl.CheckFramebufferStatus)(GL_FRAMEBUFFER); + if status != GL_FRAMEBUFFER_COMPLETE { + (gl.BindFramebuffer)(GL_FRAMEBUFFER, 0); + (gl.DeleteTextures)(2, planes.as_ptr()); + (self.destroy_image)(self.egl_display, img_y); + (self.destroy_image)(self.egl_display, img_c); + (gl.DeleteTextures)(1, &out_tex); + bail!("FBO incomplete ({status:#x})"); + } + (gl.Viewport)(0, 0, frame.width as i32, frame.height as i32); + (gl.BindVertexArray)(self.vao); + (gl.ActiveTexture)(GL_TEXTURE0); + (gl.BindTexture)(GL_TEXTURE_2D, planes[0]); + (gl.ActiveTexture)(GL_TEXTURE0 + 1); + (gl.BindTexture)(GL_TEXTURE_2D, planes[1]); + 
(gl.DrawArrays)(GL_TRIANGLES, 0, 3); + (gl.BindFramebuffer)(GL_FRAMEBUFFER, 0); + + let sync = (gl.FenceSync)(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + (gl.Flush)(); + // The draw is queued: plane textures + images can go now (the driver keeps the + // underlying buffers alive until the queued commands execute). + (gl.DeleteTextures)(2, planes.as_ptr()); + (self.destroy_image)(self.egl_display, img_y); + (self.destroy_image)(self.egl_display, img_c); + + let err = (gl.GetError)(); + if err != 0 { + (gl.DeleteTextures)(1, &out_tex); + bail!("GL error {err:#x} during convert"); + } + + let mut b = gdk::GLTextureBuilder::new() + .set_context(Some(&self.ctx)) + .set_id(out_tex) + .set_width(frame.width as i32) + .set_height(frame.height as i32) + .set_format(gdk::MemoryFormat::R8g8b8a8) + .set_sync(Some(sync)); + if let Some(state) = color_state { + b = b.set_color_state(state); + } + let retired = self.retired.clone(); + let guard = frame.guard; + let sync_bits = sync as usize; // GLsync as usize — the closure must be Send + let texture = b.build_with_release_func(move || { + drop(guard); // the decoder surface outlived every GPU read of it + retired.lock().unwrap().push(Retired { + tex: out_tex, + sync: sync_bits, + size, + }); + }); + Ok(texture) + } + } + + /// One single-plane `EGLImage` over a dmabuf plane (R8 luma / GR88 chroma), modifier + /// passed explicitly. + /// + /// # Safety + /// `self.ctx` must be current; the fd stays owned by the caller (EGL dups internally). 
+    unsafe fn plane_image(
+        &self,
+        width: u32,
+        height: u32,
+        fourcc: u32,
+        plane: &crate::video::DmabufPlane,
+        modifier: u64,
+    ) -> Result<*const c_void> {
+        // Attribute list for a single-plane import: size, DRM fourcc, then plane 0's
+        // fd / offset / pitch, as key/value pairs.
+        let mut attribs = vec![
+            EGL_WIDTH,
+            width as usize,
+            EGL_HEIGHT,
+            height as usize,
+            EGL_LINUX_DRM_FOURCC_EXT,
+            fourcc as usize,
+            EGL_DMA_BUF_PLANE0_FD_EXT,
+            plane.fd as usize,
+            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
+            plane.offset as usize,
+            EGL_DMA_BUF_PLANE0_PITCH_EXT,
+            plane.stride as usize,
+        ];
+        // The modifier attributes are only appended for a real, non-zero modifier; an
+        // invalid or zero (linear in drm_fourcc.h) modifier is left implicit.
+        // NOTE(review): presumably so drivers without the modifiers extension can still
+        // import linear buffers — confirm.
+        if modifier != DRM_FORMAT_MOD_INVALID && modifier != 0 {
+            attribs.extend_from_slice(&[
+                EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
+                (modifier & 0xffff_ffff) as usize,
+                EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
+                (modifier >> 32) as usize,
+            ]);
+        }
+        attribs.push(EGL_NONE);
+        // SAFETY: attribs is a valid EGL_NONE-terminated list; display/context are live.
+        let img = unsafe {
+            (self.create_image)(
+                self.egl_display,
+                std::ptr::null_mut(), // EGL_NO_CONTEXT — dmabuf import
+                EGL_LINUX_DMA_BUF_EXT,
+                std::ptr::null_mut(),
+                attribs.as_ptr(),
+            )
+        };
+        if img.is_null() {
+            // NOTE(review): if `get_error()` yields khronos-egl's fieldless `Error` enum,
+            // `e as u32` prints the variant index rather than the EGL error code —
+            // `Error::native()` may be what is intended here; confirm against the crate.
+            bail!(
+                "eglCreateImageKHR rejected plane ({}x{} {:#x} mod {:#018x}): {:#x}",
+                width,
+                height,
+                fourcc,
+                modifier,
+                self.egl.get_error().map(|e| e as u32).unwrap_or(0)
+            );
+        }
+        Ok(img)
+    }
+}
+
+impl Drop for GlConverter {
+    /// Delete our objects from the shared context group (the context lives in GDK's share
+    /// group — per-session leftovers would pile up across sessions). Textures GTK still
+    /// holds at this moment release into `retired` afterwards, where nobody drains them:
+    /// those names leak, but it's ≤ the pool depth once per session, not per frame.
+    fn drop(&mut self) {
+        self.ctx.make_current();
+        let gl = &self.gl;
+        // SAFETY: context current; only objects this converter created are deleted.
+        unsafe {
+            // Recover the guard from a poisoned lock instead of unwrapping: the list is
+            // only a set of GL names to delete, and a panic inside `drop` during an
+            // unwind would abort the process.
+            let mut retired = self
+                .retired
+                .lock()
+                .unwrap_or_else(std::sync::PoisonError::into_inner);
+            for r in retired.drain(..) {
+                if r.sync != 0 {
+                    (gl.DeleteSync)(r.sync as *const c_void);
+                }
+                (gl.DeleteTextures)(1, &r.tex);
+            }
+            drop(retired);
+            (gl.DeleteFramebuffers)(1, &self.fbo);
+            (gl.DeleteVertexArrays)(1, &self.vao);
+            (gl.DeleteProgram)(self.program);
+        }
+    }
+}
+
+/// Compile the fullscreen-triangle NV12→RGB program (GLSL 300 es / 330 core per the GDK
+/// context's API). `gl_VertexID` drives the geometry — no buffers at all.
+///
+/// Returns the linked program name. On any compile or link failure every object created
+/// here (both shaders and the program) is deleted again before the error is returned, so
+/// a failed init leaves nothing behind in the context's share group.
+///
+/// # Errors
+/// Fails with the driver's info log when a shader does not compile, or with
+/// "program link failed" when the program does not link.
+///
+/// # Safety
+/// A GL context must be current; `gl` must belong to it.
+unsafe fn build_program(gl: &GlFns, es: bool) -> Result<u32> {
+    let header = if es {
+        "#version 300 es\nprecision highp float;\n"
+    } else {
+        "#version 330 core\n"
+    };
+    // One oversized triangle: vertex ids 0/1/2 land on (-1,-1), (3,-1), (-1,3).
+    let vs_src = format!(
+        "{header}
+out vec2 v_uv;
+void main() {{
+    vec2 p = vec2(float((gl_VertexID & 1) << 2) - 1.0, float((gl_VertexID & 2) << 1) - 1.0);
+    v_uv = p * 0.5 + 0.5;
+    gl_Position = vec4(p, 0.0, 1.0);
+}}"
+    );
+    // Luma from `u_y`.r, chroma from `u_c`.rg; `u_off` is added before `u_mat` is applied.
+    let fs_src = format!(
+        "{header}
+in vec2 v_uv;
+out vec4 frag;
+uniform sampler2D u_y;
+uniform sampler2D u_c;
+uniform mat3 u_mat;
+uniform vec3 u_off;
+void main() {{
+    vec3 yuv = vec3(texture(u_y, v_uv).r, texture(u_c, v_uv).rg);
+    frag = vec4(clamp(u_mat * (yuv + u_off), 0.0, 1.0), 1.0);
+}}"
+    );
+    // SAFETY: caller holds a current context; sources are valid UTF-8 with explicit lengths.
+    unsafe {
+        let compile = |kind: u32, src: &str| -> Result<u32> {
+            let sh = (gl.CreateShader)(kind);
+            // The source is passed with an explicit length, so no NUL terminator is needed.
+            let ptr = src.as_ptr();
+            let len = src.len() as i32;
+            (gl.ShaderSource)(sh, 1, &ptr, &len);
+            (gl.CompileShader)(sh);
+            let mut ok = 0i32;
+            (gl.GetShaderiv)(sh, GL_COMPILE_STATUS, &mut ok);
+            if ok == 0 {
+                let mut log = vec![0u8; 1024];
+                let mut n = 0i32;
+                (gl.GetShaderInfoLog)(sh, log.len() as i32, &mut n, log.as_mut_ptr());
+                (gl.DeleteShader)(sh);
+                // Never trust the reported length beyond the buffer we handed out.
+                let written = (n.max(0) as usize).min(log.len());
+                bail!("shader compile: {}", String::from_utf8_lossy(&log[..written]));
+            }
+            Ok(sh)
+        };
+        let vs = compile(GL_VERTEX_SHADER, &vs_src)?;
+        let fs = match compile(GL_FRAGMENT_SHADER, &fs_src) {
+            Ok(fs) => fs,
+            Err(e) => {
+                (gl.DeleteShader)(vs);
+                return Err(e);
+            }
+        };
+        let prog = (gl.CreateProgram)();
+        (gl.AttachShader)(prog, vs);
+        (gl.AttachShader)(prog, fs);
+        (gl.LinkProgram)(prog);
+        // The shader objects are no longer needed once the link has been attempted.
+        (gl.DeleteShader)(vs);
+        (gl.DeleteShader)(fs);
+        let mut ok = 0i32;
+        (gl.GetProgramiv)(prog, GL_LINK_STATUS, &mut ok);
+        if ok == 0 {
+            // A failed link still leaves a live program name; delete it so a failed init
+            // does not leak into the shared context group.
+            (gl.DeleteProgram)(prog);
+            bail!("program link failed");
+        }
+        Ok(prog)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A `ColorDesc` with primaries/transfer fixed to 1 and the given matrix code and range.
+    fn desc(matrix: u8, full_range: bool) -> ColorDesc {
+        ColorDesc {
+            primaries: 1,
+            transfer: 1,
+            matrix,
+            full_range,
+        }
+    }
+
+    /// CPU mirror of the shader's `u_mat * (yuv + u_off)`.
+    fn apply(mat: &[f32; 9], off: &[f32; 3], yuv: [f32; 3]) -> [f32; 3] {
+        let v = [yuv[0] + off[0], yuv[1] + off[1], yuv[2] + off[2]];
+        // Column-major: out[r] = Σ mat[col*3 + r] * v[col]
+        core::array::from_fn(|r| (0..3).map(|c| mat[c * 3 + r] * v[c]).sum())
+    }
+
+    /// Reference white (Y=235, U=V=128 limited) → RGB 1.0; reference black (Y=16) → 0.0.
+    #[test]
+    fn bt709_limited_white_black() {
+        let (mat, off) = yuv_to_rgb(desc(1, false));
+        let white = apply(&mat, &off, [235.0 / 255.0, 128.0 / 255.0, 128.0 / 255.0]);
+        let black = apply(&mat, &off, [16.0 / 255.0, 128.0 / 255.0, 128.0 / 255.0]);
+        for (w, b) in white.iter().zip(black) {
+            assert!((w - 1.0).abs() < 0.005, "white {white:?}");
+            assert!(b.abs() < 0.005, "black {black:?}");
+        }
+    }
+
+    /// Full-range identity points: Y=1 → white, Y=0 → black, and a 601-vs-709 red spot
+    /// check (pure V excursion produces R = 2(1−Kr)·0.5).
+    #[test]
+    fn full_range_and_red_excursion() {
+        let (mat, off) = yuv_to_rgb(desc(5, true));
+        let white = apply(&mat, &off, [1.0, 0.5, 0.5]);
+        assert!(white.iter().all(|v| (v - 1.0).abs() < 1e-5), "{white:?}");
+        let red = apply(&mat, &off, [0.0, 0.5, 1.0]);
+        assert!((red[0] - 2.0 * (1.0 - 0.299) * 0.5).abs() < 1e-4, "{red:?}");
+        // 709 differs from 601 in the same spot — guards the matrix-code dispatch.
+        let (mat709, off709) = yuv_to_rgb(desc(1, true));
+        let red709 = apply(&mat709, &off709, [0.0, 0.5, 1.0]);
+        assert!(
+            (red709[0] - 2.0 * (1.0 - 0.2126) * 0.5).abs() < 1e-4,
+            "{red709:?}"
+        );
+        assert!((red[0] - red709[0]).abs() > 0.05);
+    }
+}