diff --git a/crates/punktfunk-host/src/capture/linux.rs b/crates/punktfunk-host/src/capture/linux.rs index c3068f8..abd7d7e 100644 --- a/crates/punktfunk-host/src/capture/linux.rs +++ b/crates/punktfunk-host/src/capture/linux.rs @@ -79,7 +79,7 @@ impl PortalCapturer { node_id, "ScreenCast portal session started; connecting PipeWire" ); - Ok(spawn_pipewire(Some(fd), node_id, None, false)?.into_capturer(node_id, None)) + Ok(spawn_pipewire(Some(fd), node_id, None)?.into_capturer(node_id, None)) } /// Build a capturer from an already-created virtual output ([`crate::vdisplay::VirtualOutput`]): @@ -93,7 +93,7 @@ impl PortalCapturer { ); let node_id = vout.node_id; Ok( - spawn_pipewire(vout.remote_fd, node_id, vout.preferred_mode, vout.mutter)? + spawn_pipewire(vout.remote_fd, node_id, vout.preferred_mode)? .into_capturer(node_id, Some(vout.keepalive)), ) } @@ -133,7 +133,6 @@ fn spawn_pipewire( fd: Option, node_id: u32, preferred: Option<(u32, u32, u32)>, - mutter: bool, ) -> Result { // Frames flow from the pipewire thread over a small bounded channel. let (frame_tx, frame_rx) = sync_channel::(8); @@ -158,7 +157,6 @@ fn spawn_pipewire( zerocopy, preferred, quit_rx, - mutter, ) { tracing::error!(error = %format!("{e:#}"), "pipewire capture thread failed"); } @@ -468,66 +466,6 @@ mod pipewire { negotiated: Arc, /// Present when zero-copy is enabled: imports a dmabuf → CUDA device buffer. importer: Option, - /// Explicit-sync (SyncTimeline) syncobj ops; lazily opened on the first sync-carrying - /// buffer. `sync_tried` keeps a failed open from retrying per frame. - sync: Option, - sync_tried: bool, - /// Announce SyncTimeline in `param_changed` (post-format, the OBS pattern — at - /// connect time the meta acts as a hard filter and producers without support - /// fail buffer allocation outright; observed on KWin). - want_sync: bool, - /// Which Buffers pod shape to re-emit alongside the meta announcement. 
- want_dmabuf: bool, - /// Mutter virtual monitor: dmabufs MUST carry explicit sync (see MUTTER_SYNC_UNUSABLE). - mutter: bool, - } - - /// Mutter renders directly into the pool, so its dmabufs are only safe with explicit - /// sync. Set when sync negotiation failed (unallocated buffers) or Mutter delivered - /// dmabufs without the SyncTimeline meta: the pipeline's first-frame-timeout retry - /// then rebuilds this capture on the synchronous CPU/shm path (Mutter downloads the - /// frame, which orders against its render) — slower, never stale. - static MUTTER_SYNC_UNUSABLE: AtomicBool = AtomicBool::new(false); - - /// The explicit-sync points of one buffer: producers that render directly into the - /// pool (Mutter virtual monitors) attach a `SPA_META_SyncTimeline` plus two - /// `SPA_DATA_SyncObj` datas (acquire first, release second — PipeWire convention). - /// Reading before the acquire point fires shows the buffer's PREVIOUS contents on - /// drivers without implicit dmabuf fencing (NVIDIA) — the "stale frame flashes" bug. - struct SyncPoints { - acquire_fd: i32, - release_fd: i32, - acquire_point: u64, - release_point: u64, - } - - /// Extract the sync points, if the producer attached them. 
- unsafe fn sync_points(b: *const spa::sys::spa_buffer) -> Option<SyncPoints> { - unsafe { - if b.is_null() { - return None; - } - let meta = spa::sys::spa_buffer_find_meta_data( - b, - spa::sys::SPA_META_SyncTimeline, - std::mem::size_of::<spa::sys::spa_meta_sync_timeline>(), - ) as *const spa::sys::spa_meta_sync_timeline; - if meta.is_null() { - return None; - } - let datas = std::slice::from_raw_parts((*b).datas, (*b).n_datas as usize); - let mut objs = datas - .iter() - .filter(|d| d.type_ == spa::sys::SPA_DATA_SyncObj && d.fd >= 0); - let acquire = objs.next()?; - let release = objs.next()?; - Some(SyncPoints { - acquire_fd: acquire.fd as i32, - release_fd: release.fd as i32, - acquire_point: (*meta).acquire_point, - release_point: (*meta).release_point, - }) - } } /// Log a frame-drop reason once per process (the process callback runs per frame; a stuck @@ -573,33 +511,6 @@ mod pipewire { } } - /// Announce SyncTimeline (explicit sync) support: producers that render directly - /// into the buffer pool (Mutter virtual monitors) then attach per-buffer acquire / - /// release timeline points instead of relying on implicit dmabuf fencing — which - /// NVIDIA doesn't do. Producers without explicit-sync support simply ignore this. 
- fn build_sync_timeline_meta() -> Result> { - serialize_pod(pw::spa::pod::Object { - type_: pw::spa::utils::SpaTypes::ObjectParamMeta.as_raw(), - id: pw::spa::param::ParamType::Meta.as_raw(), - properties: vec![ - pw::spa::pod::Property { - key: pw::spa::sys::SPA_PARAM_META_type, - flags: pw::spa::pod::PropertyFlags::empty(), - value: pw::spa::pod::Value::Id(pw::spa::utils::Id( - pw::spa::sys::SPA_META_SyncTimeline, - )), - }, - pw::spa::pod::Property { - key: pw::spa::sys::SPA_PARAM_META_size, - flags: pw::spa::pod::PropertyFlags::empty(), - value: pw::spa::pod::Value::Int(std::mem::size_of::< - pw::spa::sys::spa_meta_sync_timeline, - >() as i32), - }, - ], - }) - } - fn serialize_pod(obj: pw::spa::pod::Object) -> Result> { Ok(pw::spa::pod::serialize::PodSerializer::serialize( std::io::Cursor::new(Vec::new()), @@ -739,60 +650,31 @@ mod pipewire { /// without this bit the buffer-type intersection is empty and the link silently stalls in /// "negotiating". A LINEAR dmabuf is mmap-able by MAP_BUFFERS, so the CPU de-pad copy works. fn build_mappable_buffers() -> Result> { - let mask = (1i32 << pw::spa::sys::SPA_DATA_MemPtr) - | (1i32 << pw::spa::sys::SPA_DATA_MemFd) - | (1i32 << pw::spa::sys::SPA_DATA_DmaBuf); serialize_pod(pw::spa::pod::Object { type_: pw::spa::utils::SpaTypes::ObjectParamBuffers.as_raw(), id: pw::spa::param::ParamType::Buffers.as_raw(), properties: vec![pw::spa::pod::Property { key: pw::spa::sys::SPA_PARAM_BUFFERS_dataType, flags: pw::spa::pod::PropertyFlags::empty(), - value: pw::spa::pod::Value::Int(mask), + value: pw::spa::pod::Value::Int( + (1i32 << pw::spa::sys::SPA_DATA_MemPtr) + | (1i32 << pw::spa::sys::SPA_DATA_MemFd) + | (1i32 << pw::spa::sys::SPA_DATA_DmaBuf), + ), }], }) } /// Build a Buffers param requesting dmabuf-only buffers. 
- /// `sync` (Mutter explicit sync): the dataType must be a CHOICE_FLAGS holding ONLY - /// the DmaBuf bit — Mutter enables explicit sync only when the negotiated - /// buffer_types are exactly DmaBuf, and only its sync Buffers pod reserves the - /// blocks for the two SyncObj datas (the syncobjs are NOT a dataType bit). The - /// plain-Int form keeps the non-sync path byte-identical to what KWin/gamescope - /// already negotiate. - fn build_dmabuf_buffers(sync: bool) -> Result> { - let mask = 1i32 << pw::spa::sys::SPA_DATA_DmaBuf; - let value = if sync { - pw::spa::pod::Value::Choice(pw::spa::pod::ChoiceValue::Int(pw::spa::utils::Choice( - pw::spa::utils::ChoiceFlags::empty(), - pw::spa::utils::ChoiceEnum::Flags { - default: mask, - flags: vec![mask], - }, - ))) - } else { - pw::spa::pod::Value::Int(mask) - }; - let mut properties = vec![pw::spa::pod::Property { - key: pw::spa::sys::SPA_PARAM_BUFFERS_dataType, - flags: pw::spa::pod::PropertyFlags::empty(), - value, - }]; - if sync { - // Pin blocks to media + 2 SyncObj datas: the producer offers a sync pod - // (blocks=3) AND a non-sync fallback (blocks=1) — without this filter the - // fallback can win the intersection while the SyncTimeline meta still - // negotiates, and the producer then asserts on its own 1-block buffers. 
- properties.push(pw::spa::pod::Property { - key: pw::spa::sys::SPA_PARAM_BUFFERS_blocks, - flags: pw::spa::pod::PropertyFlags::empty(), - value: pw::spa::pod::Value::Int(3), - }); - } + fn build_dmabuf_buffers() -> Result> { serialize_pod(pw::spa::pod::Object { type_: pw::spa::utils::SpaTypes::ObjectParamBuffers.as_raw(), id: pw::spa::param::ParamType::Buffers.as_raw(), - properties, + properties: vec![pw::spa::pod::Property { + key: pw::spa::sys::SPA_PARAM_BUFFERS_dataType, + flags: pw::spa::pod::PropertyFlags::empty(), + value: pw::spa::pod::Value::Int(1i32 << pw::spa::sys::SPA_DATA_DmaBuf), + }], }) } @@ -806,7 +688,6 @@ mod pipewire { zerocopy: bool, preferred: Option<(u32, u32, u32)>, quit_rx: pw::channel::Receiver<()>, - mutter: bool, ) -> Result<()> { crate::pwinit::ensure_init(); @@ -855,26 +736,9 @@ mod pipewire { if importer.is_some() && !modifiers.contains(&0) { modifiers.push(0); // DRM_FORMAT_MOD_LINEAR } - // Explicit sync (SyncTimeline meta + SyncObj datas), announced post-format in - // param_changed — only on Mutter: it is the one compositor that renders directly - // into the pool, and announcing the meta to producers whose syncobj path is - // broken makes them fail buffer allocation outright (observed on KWin + NVIDIA). - // PUNKTFUNK_EXPLICIT_SYNC=0 is the escape hatch. - let want_sync = mutter - && !MUTTER_SYNC_UNUSABLE.load(Ordering::Relaxed) - && std::env::var("PUNKTFUNK_EXPLICIT_SYNC").as_deref() != Ok("0"); - // On Mutter, dmabufs without explicit sync are not safe to consume — fall back to - // the shm/CPU path (Mutter's synchronous download) when sync is unavailable. 
- let want_dmabuf = importer.is_some() && !modifiers.is_empty() && (!mutter || want_sync); + let want_dmabuf = importer.is_some() && !modifiers.is_empty(); if zerocopy && !want_dmabuf { - if mutter && !want_sync { - tracing::warn!( - "Mutter without working explicit sync — using the synchronous CPU \ - path (dmabuf capture would show stale frames on NVIDIA)" - ); - } else { - tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path"); - } + tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path"); } else if want_dmabuf { tracing::info!( count = modifiers.len(), @@ -891,11 +755,6 @@ mod pipewire { active, negotiated, importer, - sync: None, - sync_tried: false, - want_sync, - want_dmabuf, - mutter, }; let stream = pw::stream::StreamBox::new( @@ -916,24 +775,10 @@ mod pipewire { let _listener = stream .add_local_listener_with_user_data(data) - .state_changed(|_stream, ud, old, new| { + .state_changed(|_stream, _ud, old, new| { tracing::info!(?old, ?new, "pipewire stream state"); - // A sync-announced negotiation the producer cannot satisfy fails buffer - // allocation ("error alloc buffers") — no process callback will ever run, - // so flag it HERE and starve the first-frame timeout: the pipeline retry - // then rebuilds this capture on the synchronous CPU path. - if matches!(new, pw::stream::StreamState::Error(_)) - && ud.want_sync - && !MUTTER_SYNC_UNUSABLE.swap(true, Ordering::Relaxed) - { - tracing::warn!( - "explicit-sync buffer negotiation failed (Mutter without \ - DRM_CAP_SYNCOBJ_TIMELINE / cogl sync_fd, e.g. 
NVIDIA) — retrying \ - this capture on the synchronous CPU path" - ); - } }) - .param_changed(|stream, ud, id, param| { + .param_changed(|_stream, ud, id, param| { let Some(param) = param else { return }; if id != pw::spa::param::ParamType::Format.as_raw() { return; @@ -967,78 +812,198 @@ mod pipewire { "negotiated a pixel format the encoder cannot consume — frames will be skipped" ); } - // Post-format renegotiation (the OBS pattern): announce SyncTimeline - // + a Buffers pod accepting the SyncObj datas. Producers without - // explicit-sync support ignore it here, instead of failing allocation - // as they do when it arrives at connect time. - if ud.want_sync && ud.want_dmabuf { - let update = (|| -> Result<()> { - let buffers = build_dmabuf_buffers(true)?; - let meta = build_sync_timeline_meta()?; - let mut pods = vec![ - Pod::from_bytes(&buffers).context("buffers pod")?, - Pod::from_bytes(&meta).context("meta pod")?, - ]; - stream - .update_params(&mut pods) - .context("update_params(SyncTimeline)")?; - Ok(()) - })(); - if let Err(e) = update { - tracing::warn!(error = %format!("{e:#}"), - "explicit-sync param update failed — capturing without it"); - } - } } }) .process(|stream, ud| { // PipeWire dispatches this from a C trampoline with no catch_unwind; a // panic crossing that FFI boundary would abort the whole host. Contain it. let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { - // Raw dequeue: the safe Buffer wrapper hides the spa_buffer metas that - // carry the explicit-sync timeline points. - let raw = unsafe { stream.dequeue_raw_buffer() }; - if raw.is_null() { + let Some(mut buffer) = stream.dequeue_buffer() else { + return; + }; + // No active stream: release the buffer without the (expensive at 5K) de-pad. 
+ if !ud.active.load(Ordering::Relaxed) { return; } - let spa_buf = unsafe { (*raw).buffer }; - let sync = unsafe { sync_points(spa_buf) }; - if let Some(s) = &sync { - if !ud.sync_tried { - ud.sync_tried = true; - match crate::drm_sync::DrmSync::open() { - Ok(d) => { + let datas = buffer.datas_mut(); + if datas.is_empty() { + return; + } + let sz = ud.info.size(); + let (w, h) = (sz.width as usize, sz.height as usize); + if w == 0 || h == 0 { + return; // format not negotiated yet + } + + // Implicit-fence wait: Mutter renders into the dmabuf and hands it over at + // GPU-submit time; with no producer explicit sync (Mutter+NVIDIA can't) we snapshot + // the buffer's implicit fence and wait the producer's render before sampling — + // closing the stale/old-frame race on NVIDIA. No-op for shm buffers or drivers that + // attach no fence. Covers both the GPU import and the CPU mmap read below. + if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { + match crate::dmabuf_fence::wait_read_ready(datas[0].fd(), 100) { + Ok(waited) => { + static F1: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(true); + if F1.swap(false, Ordering::Relaxed) { tracing::info!( - "pipewire explicit sync active (SyncTimeline — producer \ - renders are awaited before encode)" + waited, + "dmabuf implicit-fence sync active (waited=true → driver fences \ + the render, race closed; false → no implicit fence, zero-copy \ + may still show stale frames)" ); - ud.sync = Some(d); } - Err(e) => tracing::warn!( - error = %format!("{e:#}"), - "explicit sync offered but syncobj ops unavailable — reading \ - unsynchronized" - ), } - } - // Wait for the producer's render to land before ANY read (GPU import - // or CPU mmap). A bounded wait: a wedged producer must not stall the - // PipeWire loop. 
- if let Some(drm) = &ud.sync { - if drm.wait_point(s.acquire_fd, s.acquire_point, 100).is_err() { - warn_once("explicit-sync acquire wait failed — frame may be stale"); + Err(e) => { + static F2: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(true); + if F2.swap(false, Ordering::Relaxed) { + tracing::warn!( + error = %format!("{e}"), + "dmabuf EXPORT_SYNC_FILE failed — no implicit-fence sync; NVIDIA \ + zero-copy may show stale frames (no producer explicit sync)" + ); + } } } } - consume_frame(ud, spa_buf, sync.is_some()); - // The producer reuses the buffer only after the release point fires — - // signal it on EVERY path, even skipped frames, or the producer stalls. - if let (Some(s), Some(drm)) = (&sync, &ud.sync) { - if drm.signal_point(s.release_fd, s.release_point).is_err() { - warn_once("explicit-sync release signal failed — producer may stall"); + + // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it + // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall + // through to the shm de-pad copy below. + let mut gpu_import_broken = false; + if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) { + if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { + let plane = crate::zerocopy::DmabufPlane { + fd: datas[0].fd(), + offset: datas[0].chunk().offset(), + stride: datas[0].chunk().stride().max(0) as u32, + }; + // Tiled modifier → EGL/GL de-tile import; LINEAR (0/unset, e.g. + // gamescope) → direct CUDA external-memory import (NVIDIA EGL can't + // sample LINEAR). 
+ let modifier = (ud.modifier != 0).then_some(ud.modifier); + if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) { + let imported = if modifier.is_some() { + importer.import(&plane, w as u32, h as u32, fourcc, modifier) + } else { + importer.import_linear(&plane, w as u32, h as u32) + }; + match imported { + Ok(devbuf) => { + static ONCE: std::sync::atomic::AtomicBool = + std::sync::atomic::AtomicBool::new(true); + if ONCE.swap(false, Ordering::Relaxed) { + tracing::info!(w, h, modifier = ud.modifier, + "zero-copy: dmabuf imported to CUDA (no CPU copy)"); + } + let pts_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos() as u64) + .unwrap_or(0); + let _ = ud.tx.try_send(CapturedFrame { + width: w as u32, + height: h as u32, + pts_ns, + format: fmt, + payload: FramePayload::Cuda(devbuf), + }); + return; + } + Err(e) => { + // GPU import unavailable for this buffer kind (e.g. the + // driver rejects LINEAR external-memory import). Disable + // the importer and fall through to the CPU mmap path — + // degraded, not dead. + tracing::warn!(error = %format!("{e:#}"), + "dmabuf GPU import failed — falling back to the CPU copy path"); + gpu_import_broken = true; + } + } + } else { + return; // format has no DRM fourcc mapping — skip the frame + } } } - unsafe { stream.queue_raw_buffer(raw) }; + if gpu_import_broken { + ud.importer = None; + } + + let d = &mut datas[0]; + // CPU path may also receive LINEAR dmabufs (gamescope offers only those once its + // modifier-bearing format pod wins); capture the fd before `data()` borrows `d`. 
+ let dmabuf_fd = + (d.type_() == pw::spa::buffer::DataType::DmaBuf).then(|| d.fd()); + let (size, offset, stride) = { + let c = d.chunk(); + ( + c.size() as usize, + c.offset() as usize, + c.stride().max(0) as usize, + ) + }; + let Some(fmt) = ud.format else { return }; // unsupported/not negotiated + let bpp = fmt.bytes_per_pixel(); + let row = w * bpp; + let stride = if stride == 0 { row } else { stride }; + if stride < row { + warn_once("chunk stride < row — frames dropped"); + return; + } + let needed = stride * (h - 1) + row; + // dmabuf chunks commonly report size 0; fall back to the computed span. + let size = if size == 0 { needed } else { size }; + // MAP_BUFFERS only maps buffers flagged mappable; Vulkan-exported dmabufs + // (gamescope) usually aren't, so mmap the fd ourselves for the de-pad read. + let _mapping; // keeps a manual mmap alive for the copy below + let buf: &[u8] = if let Some(data) = d.data() { + data + } else if let Some(fd) = dmabuf_fd.filter(|&fd| fd > 0) { + match DmabufMap::new(fd, offset + needed) { + Some(m) => { + _mapping = m; + unsafe { + std::slice::from_raw_parts(_mapping.ptr as *const u8, _mapping.len) + } + } + None => { + warn_once("mmap(dmabuf) failed — frames dropped"); + return; + } + } + } else { + warn_once("buffer has no mappable data — frames dropped"); + return; + }; + // Need stride*(h-1)+row valid bytes within [offset, offset+size). + if offset > buf.len() { + return; + } + let avail = buf.len() - offset; + if needed > avail || needed > size { + warn_once("buffer smaller than frame span — frames dropped"); + return; + } + let region = &buf[offset..offset + size.min(avail)]; + // De-pad into a tightly-packed buffer (chunk stride may exceed w*bpp). 
+ let mut tight = vec![0u8; row * h]; + for y in 0..h { + tight[y * row..y * row + row] + .copy_from_slice(&region[y * stride..y * stride + row]); + } + let pts_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_nanos() as u64) + .unwrap_or(0); + let frame = CapturedFrame { + width: w as u32, + height: h as u32, + pts_ns, + format: fmt, + payload: FramePayload::Cpu(tight), + }; + // Drop if the encoder is behind — never block the pipewire loop. + let _ = ud.tx.try_send(frame); })); if outcome.is_err() { tracing::error!("panic in pipewire process callback — frame dropped"); @@ -1099,18 +1064,10 @@ mod pipewire { // `param_changed` (the two-step DMA-BUF handshake). Otherwise offer the multi-format shm // pod and let MAP_BUFFERS map it. let shm_values = serialize_pod(obj)?; - // The SyncTimeline announcement itself happens post-format in param_changed (the - // OBS pattern); at connect time we only declare the data types we accept. let (dmabuf_values, buffers_values) = if want_dmabuf { ( Some(build_dmabuf_format(&modifiers, preferred)?), - // Sync path: NO Buffers pod at connect — buffers must not be allocated - // until the SyncTimeline meta is on the table (param_changed), or the - // producer's syncobj setup meets already-allocated 1-block buffers - // (observed: gnome-shell "n_datas >= SYNCOBJ_MINIMUM_N_DATAS" assertions). - (!want_sync) - .then(|| build_dmabuf_buffers(false)) - .transpose()?, + Some(build_dmabuf_buffers()?), ) } else { // CPU path still accepts mappable dmabufs (gamescope offers only those once its @@ -1144,199 +1101,4 @@ mod pipewire { mainloop.run(); Ok(()) } - - /// Consume one dequeued buffer: GPU-import or de-pad-copy `datas[0]` and hand the - /// frame to the encoder. Extracted from the process callback so the explicit-sync - /// release point can be signaled on every early return. 
- fn consume_frame(ud: &mut UserData, spa_buf: *mut spa::sys::spa_buffer, has_sync: bool) { - // No active stream: release the buffer without the (expensive at 5K) de-pad. - if !ud.active.load(Ordering::Relaxed) { - return; - } - let datas: &mut [pw::spa::buffer::Data] = unsafe { - if spa_buf.is_null() || (*spa_buf).n_datas == 0 || (*spa_buf).datas.is_null() { - &mut [] - } else { - // Same transparent cast libspa's Buffer::datas_mut performs. - std::slice::from_raw_parts_mut( - (*spa_buf).datas as *mut pw::spa::buffer::Data, - (*spa_buf).n_datas as usize, - ) - } - }; - if datas.is_empty() { - return; - } - // A sync-announced negotiation that the producer could not complete leaves the - // buffers unallocated (type SPA_ID_INVALID, fd -1). Disable sync and starve the - // first-frame timeout so the pipeline retries on the safe path. - if datas[0].as_raw().type_ == u32::MAX - /* SPA_ID_INVALID */ - { - if ud.want_sync && !MUTTER_SYNC_UNUSABLE.swap(true, Ordering::Relaxed) { - tracing::error!( - "explicit-sync negotiation produced unallocated buffers — retrying \ - this capture on the synchronous CPU path" - ); - } else { - warn_once("buffer arrived unallocated — frames dropped"); - } - return; - } - if ud.mutter && !has_sync && datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { - // Mutter renders straight into the pool; without explicit sync the encode - // races the render on NVIDIA and flashes the buffer's previous contents. - // Don't consume these — starve the first-frame timeout so the pipeline - // retries on the synchronous CPU path. 
- if !MUTTER_SYNC_UNUSABLE.swap(true, Ordering::Relaxed) { - tracing::error!( - "Mutter delivered dmabufs without explicit sync — refusing them \ - (stale-frame corruption); retrying on the synchronous CPU path" - ); - } - return; - } - let sz = ud.info.size(); - let (w, h) = (sz.width as usize, sz.height as usize); - if w == 0 || h == 0 { - return; // format not negotiated yet - } - - // Zero-copy path: if the buffer is a dmabuf and we have an importer, import it - // into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall - // through to the shm de-pad copy below. - let mut gpu_import_broken = false; - if let (Some(importer), Some(fmt)) = (ud.importer.as_mut(), ud.format) { - if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf { - let plane = crate::zerocopy::DmabufPlane { - fd: datas[0].fd(), - offset: datas[0].chunk().offset(), - stride: datas[0].chunk().stride().max(0) as u32, - }; - // Tiled modifier → EGL/GL de-tile import; LINEAR (0/unset, e.g. - // gamescope) → direct CUDA external-memory import (NVIDIA EGL can't - // sample LINEAR). 
- let modifier = (ud.modifier != 0).then_some(ud.modifier); - if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) { - let imported = if modifier.is_some() { - importer.import(&plane, w as u32, h as u32, fourcc, modifier) - } else { - importer.import_linear(&plane, w as u32, h as u32) - }; - match imported { - Ok(devbuf) => { - static ONCE: std::sync::atomic::AtomicBool = - std::sync::atomic::AtomicBool::new(true); - if ONCE.swap(false, Ordering::Relaxed) { - tracing::info!( - w, - h, - modifier = ud.modifier, - "zero-copy: dmabuf imported to CUDA (no CPU copy)" - ); - } - let pts_ns = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_nanos() as u64) - .unwrap_or(0); - let _ = ud.tx.try_send(CapturedFrame { - width: w as u32, - height: h as u32, - pts_ns, - format: fmt, - payload: FramePayload::Cuda(devbuf), - }); - return; - } - Err(e) => { - // GPU import unavailable for this buffer kind (e.g. the - // driver rejects LINEAR external-memory import). Disable - // the importer and fall through to the CPU mmap path — - // degraded, not dead. - tracing::warn!(error = %format!("{e:#}"), - "dmabuf GPU import failed — falling back to the CPU copy path"); - gpu_import_broken = true; - } - } - } else { - return; // format has no DRM fourcc mapping — skip the frame - } - } - } - if gpu_import_broken { - ud.importer = None; - } - - let d = &mut datas[0]; - // CPU path may also receive LINEAR dmabufs (gamescope offers only those once its - // modifier-bearing format pod wins); capture the fd before `data()` borrows `d`. 
- let dmabuf_fd = (d.type_() == pw::spa::buffer::DataType::DmaBuf).then(|| d.fd()); - let (size, offset, stride) = { - let c = d.chunk(); - ( - c.size() as usize, - c.offset() as usize, - c.stride().max(0) as usize, - ) - }; - let Some(fmt) = ud.format else { return }; // unsupported/not negotiated - let bpp = fmt.bytes_per_pixel(); - let row = w * bpp; - let stride = if stride == 0 { row } else { stride }; - if stride < row { - warn_once("chunk stride < row — frames dropped"); - return; - } - let needed = stride * (h - 1) + row; - // dmabuf chunks commonly report size 0; fall back to the computed span. - let size = if size == 0 { needed } else { size }; - // MAP_BUFFERS only maps buffers flagged mappable; Vulkan-exported dmabufs - // (gamescope) usually aren't, so mmap the fd ourselves for the de-pad read. - let _mapping; // keeps a manual mmap alive for the copy below - let buf: &[u8] = if let Some(data) = d.data() { - data - } else if let Some(fd) = dmabuf_fd.filter(|&fd| fd > 0) { - match DmabufMap::new(fd, offset + needed) { - Some(m) => { - _mapping = m; - unsafe { std::slice::from_raw_parts(_mapping.ptr as *const u8, _mapping.len) } - } - None => { - warn_once("mmap(dmabuf) failed — frames dropped"); - return; - } - } - } else { - warn_once("buffer has no mappable data — frames dropped"); - return; - }; - // Need stride*(h-1)+row valid bytes within [offset, offset+size). - if offset > buf.len() { - return; - } - let avail = buf.len() - offset; - if needed > avail || needed > size { - warn_once("buffer smaller than frame span — frames dropped"); - return; - } - let region = &buf[offset..offset + size.min(avail)]; - // De-pad into a tightly-packed buffer (chunk stride may exceed w*bpp). 
- let mut tight = vec![0u8; row * h]; - for y in 0..h { - tight[y * row..y * row + row].copy_from_slice(&region[y * stride..y * stride + row]); - } - let pts_ns = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_nanos() as u64) - .unwrap_or(0); - let frame = CapturedFrame { - width: w as u32, - height: h as u32, - pts_ns, - format: fmt, - payload: FramePayload::Cpu(tight), - }; - // Drop if the encoder is behind — never block the pipewire loop. - let _ = ud.tx.try_send(frame); - } } diff --git a/crates/punktfunk-host/src/dmabuf_fence.rs b/crates/punktfunk-host/src/dmabuf_fence.rs new file mode 100644 index 0000000..223d6b7 --- /dev/null +++ b/crates/punktfunk-host/src/dmabuf_fence.rs @@ -0,0 +1,75 @@ +//! Consumer-side implicit-fence wait for dmabuf capture (`DMA_BUF_IOCTL_EXPORT_SYNC_FILE`). +//! +//! Mutter renders its virtual monitor DIRECTLY into the PipeWire dmabuf and hands the buffer over +//! at GPU-submit time. With no fencing the consumer can sample mid-render and encode the buffer's +//! *previous* contents — the "stale/old frame" flashing on NVIDIA (KWin/gamescope blit into the +//! buffer so they don't hit this). The producer-driven fix is PipeWire explicit sync, but +//! Mutter+NVIDIA can't produce a sync_fd (`error alloc buffers` / no cogl sync_fd). +//! +//! So sync from the *consumer* side instead: a dmabuf carries its in-flight GPU work as an implicit +//! fence on its reservation object. `DMA_BUF_IOCTL_EXPORT_SYNC_FILE` snapshots that into a sync_file +//! fd we can `poll()` — readable once the producer's writes complete. This makes zero-copy capture +//! race-free WITHOUT the producer doing anything, *iff* the driver actually attaches the fence. If it +//! attaches none, the export yields an already-signaled sync_file (poll returns immediately) — no +//! wait, no harm, and `waited=false` tells us the driver doesn't fence (so zero-copy would still race). 
+ +use std::os::fd::RawFd; + +// linux/dma-buf.h ioctls on the DMA_BUF_BASE ('b' = 0x62) magic. _IOWR = dir(3)<<30 | size<<16 | base<<8 | nr. +const DMA_BUF_BASE: u64 = 0x62; +const fn iowr(nr: u32, size: usize) -> u64 { + (3u64 << 30) | ((size as u64) << 16) | (DMA_BUF_BASE << 8) | nr as u64 +} + +#[repr(C)] +struct DmaBufExportSyncFile { + flags: u32, + fd: i32, +} + +const DMA_BUF_IOCTL_EXPORT_SYNC_FILE: u64 = iowr(2, std::mem::size_of::<DmaBufExportSyncFile>()); +/// We will READ the buffer → export the fence(s) we must wait for before reading (the producer's writes). +const DMA_BUF_SYNC_READ: u32 = 1 << 0; + +/// Wait until the producer's writes to `dmabuf_fd` complete (or `timeout_ms` elapses). Returns: +/// - `Ok(true)` — a render was still in flight and we waited on its fence (the race was real, now closed). +/// - `Ok(false)` — no fence / already signaled (the driver attaches no implicit fence; zero-copy can race). +/// - `Err` — the ioctl failed (e.g. the kernel/driver lacks `EXPORT_SYNC_FILE`). +pub fn wait_read_ready(dmabuf_fd: RawFd, timeout_ms: i32) -> std::io::Result<bool> { + let mut req = DmaBufExportSyncFile { + flags: DMA_BUF_SYNC_READ, + fd: -1, + }; + let r = unsafe { libc::ioctl(dmabuf_fd, DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &mut req) }; + if r < 0 { + return Err(std::io::Error::last_os_error()); + } + let sync_fd = req.fd; + if sync_fd < 0 { + return Ok(false); // no sync_file exported + } + let mut pfd = libc::pollfd { + fd: sync_fd, + events: libc::POLLIN, + revents: 0, + }; + // Non-blocking probe: not-yet-signaled (poll==0) means the producer is still rendering. + let pending = unsafe { libc::poll(&mut pfd, 1, 0) } == 0; + if pending { + pfd.revents = 0; + unsafe { libc::poll(&mut pfd, 1, timeout_ms) }; // block until the render fence signals + } + unsafe { libc::close(sync_fd) }; + Ok(pending) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// The ioctl number must match linux/dma-buf.h exactly — it's computed, so lock it down. 
+ #[test] + fn ioctl_number_matches_dma_buf_h() { + assert_eq!(DMA_BUF_IOCTL_EXPORT_SYNC_FILE, 0xC008_6202); + } +} diff --git a/crates/punktfunk-host/src/drm_sync.rs b/crates/punktfunk-host/src/drm_sync.rs index 9ca7e35..b8c92ba 100644 --- a/crates/punktfunk-host/src/drm_sync.rs +++ b/crates/punktfunk-host/src/drm_sync.rs @@ -1,6 +1,14 @@ //! Minimal DRM timeline-syncobj operations — the consumer side of PipeWire explicit sync //! (`SPA_META_SyncTimeline`). //! +//! RETAINED BUT CURRENTLY UNUSED: producer-driven explicit sync is the "right" fix, but no +//! compositor we target produces a usable sync_fd today — Mutter+NVIDIA fails buffer allocation +//! (`error alloc buffers`, no cogl sync_fd), KWin/gamescope blit so they don't race at all. We sync +//! zero-copy from the consumer side instead (see [`crate::dmabuf_fence`]). This module is kept, +//! verified (ioctl numbers + a live signal→wait round trip), ready to wire in the moment a producer +//! gains working `SPA_META_SyncTimeline`. +#![allow(dead_code)] +//! //! Compositors that render directly into the PipeWire buffer pool (Mutter's virtual //! monitors) hand buffers over at GPU-submit time; on drivers without implicit dmabuf //! fencing (NVIDIA) reading immediately races the render and shows the buffer's diff --git a/crates/punktfunk-host/src/main.rs b/crates/punktfunk-host/src/main.rs index d7681ba..ede2031 100644 --- a/crates/punktfunk-host/src/main.rs +++ b/crates/punktfunk-host/src/main.rs @@ -16,6 +16,7 @@ mod audio; mod capture; mod discovery; +mod dmabuf_fence; mod drm_sync; mod encode; mod gamestream; diff --git a/crates/punktfunk-host/src/vdisplay.rs b/crates/punktfunk-host/src/vdisplay.rs index 56e4551..c584e40 100644 --- a/crates/punktfunk-host/src/vdisplay.rs +++ b/crates/punktfunk-host/src/vdisplay.rs @@ -35,11 +35,6 @@ pub struct VirtualOutput { pub preferred_mode: Option<(u32, u32, u32)>, /// Keeps the output — and whatever connection/thread backs it — alive; dropped on teardown. 
pub keepalive: Box, - /// This is a Mutter virtual monitor: the compositor renders DIRECTLY into the - /// PipeWire buffer pool, so consuming its dmabufs needs explicit sync (SyncTimeline) - /// — without it, NVIDIA's missing implicit fencing shows stale frames. The capture - /// layer keys its sync negotiation / shm fallback on this. - pub mutter: bool, } /// Pluggable virtual-output creation, per compositor. diff --git a/crates/punktfunk-host/src/vdisplay/gamescope.rs b/crates/punktfunk-host/src/vdisplay/gamescope.rs index 2364961..0c22039 100644 --- a/crates/punktfunk-host/src/vdisplay/gamescope.rs +++ b/crates/punktfunk-host/src/vdisplay/gamescope.rs @@ -83,7 +83,6 @@ impl VirtualDisplay for GamescopeDisplay { point_injector_at_eis(); tracing::info!(node_id, "gamescope: attaching to existing PipeWire node"); return Ok(VirtualOutput { - mutter: false, node_id, remote_fd: None, preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)), @@ -108,7 +107,6 @@ impl VirtualDisplay for GamescopeDisplay { "gamescope virtual output ready" ); Ok(VirtualOutput { - mutter: false, node_id, remote_fd: None, preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)), @@ -138,7 +136,6 @@ fn create_managed_session(client: &str, mode: Mode) -> Result { "gamescope session: reusing the running session (same mode — no Steam restart)" ); return Ok(VirtualOutput { - mutter: false, node_id, remote_fd: None, preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)), @@ -165,7 +162,6 @@ fn create_managed_session(client: &str, mode: Mode) -> Result { "gamescope session: launched gamescope-session-plus at the client's mode" ); Ok(VirtualOutput { - mutter: false, node_id, remote_fd: None, preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)), diff --git a/crates/punktfunk-host/src/vdisplay/kwin.rs b/crates/punktfunk-host/src/vdisplay/kwin.rs index 2ed92ca..b3ae402 100644 --- a/crates/punktfunk-host/src/vdisplay/kwin.rs +++ 
b/crates/punktfunk-host/src/vdisplay/kwin.rs @@ -104,7 +104,6 @@ impl VirtualDisplay for KwinDisplay { mode.refresh_hz }; Ok(VirtualOutput { - mutter: false, node_id, remote_fd: None, preferred_mode: Some((mode.width, mode.height, achieved_hz)), diff --git a/crates/punktfunk-host/src/vdisplay/mutter.rs b/crates/punktfunk-host/src/vdisplay/mutter.rs index 44ee28d..7ec3560 100644 --- a/crates/punktfunk-host/src/vdisplay/mutter.rs +++ b/crates/punktfunk-host/src/vdisplay/mutter.rs @@ -85,7 +85,6 @@ impl VirtualDisplay for MutterDisplay { "Mutter virtual monitor ready" ); Ok(VirtualOutput { - mutter: true, node_id, remote_fd: None, preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)), diff --git a/crates/punktfunk-host/src/vdisplay/wlroots.rs b/crates/punktfunk-host/src/vdisplay/wlroots.rs index 2fa0d91..33d7488 100644 --- a/crates/punktfunk-host/src/vdisplay/wlroots.rs +++ b/crates/punktfunk-host/src/vdisplay/wlroots.rs @@ -123,7 +123,6 @@ impl VirtualDisplay for WlrootsDisplay { "sway headless output ready" ); Ok(VirtualOutput { - mutter: false, node_id, remote_fd: Some(fd), preferred_mode: Some((mode.width, mode.height, mode.refresh_hz)),