From 64b167946f2eaee793119570215c881e49f9f900 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Wed, 17 Jun 2026 07:24:27 +0000 Subject: [PATCH] =?UTF-8?q?fix(client-linux):=20VAAPI=20green=20screen=20o?= =?UTF-8?q?n=20AMD=20=E2=80=94=20flatten=20NV12=20planes=20across=20DRM=20?= =?UTF-8?q?layers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First AMD test (Steam Deck, Mesa radeonsi) showed a mostly-green image with red whites — the classic fingerprint of NV12 chroma read as 0. Root cause (confirmed against FFmpeg/GTK/mpv source): FFmpeg's VAAPI export uses VA_EXPORT_SURFACE_SEPARATE_LAYERS unconditionally, so an NV12 surface comes back as TWO single-plane layers — layers[0]=R8 (luma), layers[1]=GR88 (chroma) — sharing one object/fd, the UV plane reached via offset. map_dmabuf took layers[0] only and used its format (R8) as the GTK fourcc, so GdkDmabufTexture got a luma-only texture with the chroma plane dropped → chroma defaults to 0 → green field, red highlights. Fix (matches mpv's dmabuf_interop_gl flatten pattern): - Derive the combined fourcc from the decoder's sw_format (AVHWFramesContext.sw_format → NV12 → DRM_FORMAT_NV12), NOT from the per-plane component formats. The frame format is absent from the separate-layer descriptor and must be deduced from sw_format. - Flatten every plane across every layer in declared order (Y then UV), each with its own fd (objects[plane.object_index].fd), offset, pitch. - One-time descriptor dump (objects/layers/formats/modifier) so a new driver's real layout is visible in the logs. - Unit test locks the DRM FourCC magic numbers (NV12=0x3231564e). Software decode (swscale, reads colorspace from the VUI) was always correct, which isolated the bug to this path. PUNKTFUNK_DECODER=software is the immediate workaround on an un-rebuilt binary. Awaiting Steam Deck reconfirm (no AMD VAAPI on the NVIDIA dev box to live-verify). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 10 +- crates/punktfunk-client-linux/src/video.rs | 124 ++++++++++++++++++--- 2 files changed, 118 insertions(+), 16 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2b03a56..f01cc2f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -108,8 +108,14 @@ Low-latency desktop/game streaming stack, Linux-first, with a shared Rust protoc default, saved-hosts list, .deb + RPM-subpackage CI (deb.yml/rpm.yml). **VAAPI decode → DRM-PRIME dmabuf → `GdkDmabufTexture`** (BT.709 color state; Tier-1 zero-copy on Intel/AMD, `PUNKTFUNK_DECODER=software|vaapi` override) with a proven fallback ladder — - no VAAPI device (NVIDIA) or mid-session VAAPI error → software decode; needs an - Intel/AMD client box to live-verify the hw path. Next: the stage-2 raw-Wayland + no VAAPI device (NVIDIA) or mid-session VAAPI error → software decode. **First AMD test + (Steam Deck) hit a green-screen bug, fixed:** FFmpeg's VAAPI export uses + `SEPARATE_LAYERS`, so NV12 arrives as two single-plane layers (R8 luma + GR88 chroma, + one shared fd); the mapper took `layers[0]` only → GTK got a luma-only R8 texture, chroma + read as 0 → green field / red whites. Fix derives the combined fourcc from the decoder + `sw_format` (→ `DRM_FORMAT_NV12`) and flattens all planes across all layers (mpv's + pattern); a first-frame descriptor dump logs the real layout. Awaiting Steam Deck + reconfirm. Next: the stage-2 raw-Wayland presenter (wp_presentation feedback, tearing-control, Vulkan Video on NVIDIA) — **wgpu/winit rejected** (no dmabuf import / presentation feedback / shortcuts-inhibit). **Windows stage 1 done 2026-06-15** (`crates/punktfunk-client-windows`, binary diff --git a/crates/punktfunk-client-linux/src/video.rs b/crates/punktfunk-client-linux/src/video.rs index 5351356..fa459c4 100644 --- a/crates/punktfunk-client-linux/src/video.rs +++ b/crates/punktfunk-client-linux/src/video.rs @@ -43,7 +43,8 @@ pub struct CpuFrame { pub struct DmabufFrame { pub width: u32, pub height: u32, - /// DRM fourcc of the layer (NV12 for 8-bit VAAPI output). + /// Combined DRM fourcc of the whole surface (NV12 for 8-bit VAAPI output), derived + /// from the decoder's software format — NOT the per-plane component formats. pub fourcc: u32, pub modifier: u64, pub planes: Vec, @@ -292,12 +293,31 @@ impl VaapiDecoder { /// Map the VAAPI surface to DRM PRIME (zero copy) and lift the descriptor into a /// `DmabufFrame`. The mapped frame keeps the surface alive via its buffer refs. + /// + /// FFmpeg's VAAPI export uses `VA_EXPORT_SURFACE_SEPARATE_LAYERS`, so an NV12 surface + /// comes back as TWO layers (`R8` luma + `GR88` chroma), each one plane — NOT a single + /// `NV12` layer. The previous code took `layers[0]` only: GTK then saw an `R8` + /// single-plane texture with the chroma dropped, painting the screen green. The fix: + /// derive the COMBINED fourcc from the decoder's software pixel format (NV12 → + /// `DRM_FORMAT_NV12`) and flatten every plane across every layer in order (Y then UV). unsafe fn map_dmabuf(&mut self) -> Result { use ffmpeg::ffi; unsafe { if (*self.frame).format != ffi::AVPixelFormat::AV_PIX_FMT_VAAPI as i32 { bail!("decoder returned a software frame (no VAAPI surface)"); } + // The real pixel layout lives on the hardware frames context, not the + // DRM-PRIME layer formats (those are the per-plane R8/GR88 component formats). + let sw_format = { + let hwfc = (*self.frame).hw_frames_ctx; + if hwfc.is_null() { + bail!("VAAPI frame without a hardware frames context"); + } + (*((*hwfc).data as *const ffi::AVHWFramesContext)).sw_format + }; + let fourcc = drm_fourcc_for(sw_format) + .ok_or_else(|| anyhow!("unsupported VAAPI output format {sw_format:?}"))?; + let drm = ffi::av_frame_alloc(); (*drm).format = ffi::AVPixelFormat::AV_PIX_FMT_DRM_PRIME as i32; let r = ffi::av_hwframe_map(drm, self.frame, ffi::AV_HWFRAME_MAP_READ as i32); @@ -309,24 +329,35 @@ impl VaapiDecoder { let desc = (*drm).data[0] as *const ffi::AVDRMFrameDescriptor; let guard = DrmFrameGuard(drm); let d = &*desc; - if d.nb_layers < 1 { - bail!("DRM descriptor without layers"); + if d.nb_layers < 1 || d.nb_objects < 1 { + bail!("DRM descriptor without layers/objects"); } - let layer = &d.layers[0]; - let mut planes = Vec::with_capacity(layer.nb_planes as usize); - for p in &layer.planes[..layer.nb_planes as usize] { - let obj = &d.objects[p.object_index as usize]; - planes.push(DmabufPlane { - fd: obj.fd, - offset: p.offset as u32, - stride: p.pitch as u32, - }); + + // Flatten planes across ALL layers, in declared order — the combined fourcc's + // plane order (Y, then UV for NV12) matches the layer order FFmpeg emits. + let mut planes = Vec::new(); + for layer in &d.layers[..d.nb_layers as usize] { + for p in &layer.planes[..layer.nb_planes as usize] { + let obj = &d.objects[p.object_index as usize]; + planes.push(DmabufPlane { + fd: obj.fd, + offset: p.offset as u32, + stride: p.pitch as u32, + }); + } } + + // The whole surface shares one tiling modifier (one BO on radeonsi); GTK takes + // a single modifier for the texture. + let modifier = d.objects[0].format_modifier; + + log_descriptor_once(d, sw_format, fourcc, modifier); + Ok(DmabufFrame { width: (*self.frame).width as u32, height: (*self.frame).height as u32, - fourcc: layer.format, - modifier: d.objects[0].format_modifier, + fourcc, + modifier, planes, guard, }) @@ -334,6 +365,50 @@ impl VaapiDecoder { } } +/// `fourcc(a,b,c,d)` — the DRM FourCC packing (little-endian, `a | b<<8 | c<<16 | d<<24`). +const fn fourcc(a: u8, b: u8, c: u8, d: u8) -> u32 { + (a as u32) | ((b as u32) << 8) | ((c as u32) << 16) | ((d as u32) << 24) +} + +/// The combined DRM FourCC for a decoder software pixel format. The host streams 8-bit +/// 4:2:0 (NV12); P010 is here for the eventual 10-bit/HDR path. +fn drm_fourcc_for(sw: ffmpeg_next::ffi::AVPixelFormat) -> Option { + use ffmpeg_next::ffi::AVPixelFormat::*; + Some(match sw { + AV_PIX_FMT_NV12 => fourcc(b'N', b'V', b'1', b'2'), + AV_PIX_FMT_P010LE => fourcc(b'P', b'0', b'1', b'0'), + _ => return None, + }) +} + +/// One-time dump of the DRM descriptor layout (objects, layers, planes, modifier) — so a +/// new client/driver combination's real layout is visible in the logs without a debugger. +fn log_descriptor_once( + d: &ffmpeg_next::ffi::AVDRMFrameDescriptor, + sw: ffmpeg_next::ffi::AVPixelFormat, + fourcc: u32, + modifier: u64, +) { + use std::sync::atomic::{AtomicBool, Ordering}; + static ONCE: AtomicBool = AtomicBool::new(true); + if !ONCE.swap(false, Ordering::Relaxed) { + return; + } + let layers: Vec<(u32, i32)> = d.layers[..d.nb_layers.max(0) as usize] + .iter() + .map(|l| (l.format, l.nb_planes)) + .collect(); + tracing::info!( + sw_format = ?sw, + chosen_fourcc = format_args!("{:#010x}", fourcc), + nb_objects = d.nb_objects, + nb_layers = d.nb_layers, + ?layers, + modifier = format_args!("{:#018x}", modifier), + "VAAPI dmabuf descriptor layout (first frame)" + ); +} + impl Drop for VaapiDecoder { fn drop(&mut self) { use ffmpeg::ffi; @@ -345,3 +420,24 @@ impl Drop for VaapiDecoder { } } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Lock the DRM FourCC magic numbers against typos — these are the exact values + /// `` defines, and a wrong one is what painted the Steam Deck green. + #[test] + fn drm_fourcc_constants() { + assert_eq!(fourcc(b'N', b'V', b'1', b'2'), 0x3231_564e); + assert_eq!(fourcc(b'P', b'0', b'1', b'0'), 0x3031_3050); + assert_eq!( + drm_fourcc_for(ffmpeg::ffi::AVPixelFormat::AV_PIX_FMT_NV12), + Some(0x3231_564e) + ); + assert_eq!( + drm_fourcc_for(ffmpeg::ffi::AVPixelFormat::AV_PIX_FMT_RGBA), + None + ); + } +}