diff --git a/CLAUDE.md b/CLAUDE.md index 748c2ec..5383d85 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -472,7 +472,10 @@ Pinned crate facts: `ashpd` 0.13 + `pipewire` 0.9 (must match ashpd's) + `ffmpeg (`ffmpeg-sys-next` auto-detects the system FFmpeg, so it builds against **FFmpeg 7.x/libavcodec 61 or 8.x/libavcodec 62** — validated live on Ubuntu 26.04 (8) and Bazzite F43 (7.1); the zero-copy FFI also link-needs `libGL`/`libgbm`/`libcuda` at build time). Env knobs: `PUNKTFUNK_VIDEO_SOURCE=virtual|portal`, -`PUNKTFUNK_COMPOSITOR=kwin|gamescope|mutter`, `PUNKTFUNK_ZEROCOPY=1`, `PUNKTFUNK_GAMESCOPE_APP=...`, +`PUNKTFUNK_COMPOSITOR=kwin|gamescope|mutter`, `PUNKTFUNK_ZEROCOPY=1|0` (Linux default: ON for +VAAPI/AMD/Intel with a one-shot CPU downgrade if the dmabuf offer never negotiates, OFF/opt-in for +NVENC), `PUNKTFUNK_VAAPI_LOW_POWER=1|0` (pin the VAAPI entrypoint; auto = full-feature then VDEnc +fallback for modern Intel), `PUNKTFUNK_GAMESCOPE_APP=...`, `PUNKTFUNK_INPUT_BACKEND=...`, `PUNKTFUNK_PERF=1` (per-stage timing), `PUNKTFUNK_VIDEO_DROP=N` (FEC test), `PUNKTFUNK_FEC_PCT=N`, `PUNKTFUNK_DSCP=1` (opt-in DSCP/SO_PRIORITY media QoS on the data + GameStream video/audio sockets; no-op on the wire on Windows without a qWAVE policy), diff --git a/crates/punktfunk-host/src/capture/linux/mod.rs b/crates/punktfunk-host/src/capture/linux/mod.rs index 5d22df5..5a4c0e1 100644 --- a/crates/punktfunk-host/src/capture/linux/mod.rs +++ b/crates/punktfunk-host/src/capture/linux/mod.rs @@ -47,6 +47,11 @@ pub struct PortalCapturer { /// renegotiation before declaring the source lost. Cleared whenever a frame arrives or the stream /// is `Streaming`. stall_since: Option, + /// True when this capture runs the VAAPI dmabuf passthrough (a LINEAR-dmabuf-only offer). 
If + /// that offer never negotiates, [`next_frame`](Capturer::next_frame)'s timeout branch latches + /// the process-wide downgrade ([`crate::zerocopy::note_vaapi_dmabuf_failed`]) so the pipeline + /// rebuild retries on the CPU offer instead of failing identically forever. + vaapi_dmabuf: bool, /// The PipeWire node this capturer consumes — surfaced in error messages for diagnosis. node_id: u32, /// Stops the PipeWire loop on teardown (sent in `Drop`). Without it a dropped or failed @@ -125,6 +130,9 @@ struct PwHandles { active: Arc, negotiated: Arc, streaming: Arc, + /// This capture will offer LINEAR-dmabuf-only for the VAAPI passthrough (see + /// [`PortalCapturer::vaapi_dmabuf`]). + vaapi_dmabuf: bool, quit: ::pipewire::channel::Sender<()>, join: thread::JoinHandle<()>, } @@ -139,6 +147,7 @@ impl PwHandles { negotiated: self.negotiated, streaming: self.streaming, stall_since: None, + vaapi_dmabuf: self.vaapi_dmabuf, node_id, quit: Some(self.quit), join: Some(self.join), @@ -174,6 +183,12 @@ fn spawn_pipewire( // inner `mod pipewire` shadows the crate name at this scope. let (quit_tx, quit_rx) = ::pipewire::channel::channel::<()>(); let zerocopy = allow_zerocopy && crate::zerocopy::enabled(); + // Mirror of the thread's `vaapi_passthrough` decision (deterministic from here: on a VAAPI + // backend the EGL→CUDA importer is never built) — kept on the capturer so `next_frame`'s + // negotiation-timeout branch knows a failed negotiation was the LINEAR-dmabuf offer. 
+ let vaapi_dmabuf = zerocopy + && std::env::var("PUNKTFUNK_FORCE_SHM").as_deref() != Ok("1") + && crate::encode::linux_zero_copy_is_vaapi(); let join = thread::Builder::new() .name("punktfunk-pipewire".into()) .spawn(move || { @@ -197,6 +212,7 @@ fn spawn_pipewire( active, negotiated, streaming, + vaapi_dmabuf, quit: quit_tx, join, }) @@ -218,6 +234,17 @@ impl Capturer for PortalCapturer { or capture never started)", self.node_id )) + } else if self.vaapi_dmabuf && !crate::zerocopy::vaapi_dmabuf_forced() { + // The LINEAR-dmabuf-only offer (VAAPI passthrough default) was never accepted. + // Latch the process-wide downgrade so the encode loop's pipeline rebuild + // retries on the CPU offer instead of failing this same negotiation forever. + crate::zerocopy::note_vaapi_dmabuf_failed(); + Err(anyhow!( + "no PipeWire frame within 10s (node {}): the compositor never accepted \ + the LINEAR-dmabuf offer (VAAPI zero-copy) — downgrading this host to the \ + CPU capture path; the pipeline rebuild will renegotiate without dmabuf", + self.node_id + )) } else { Err(anyhow!( "no PipeWire frame within 10s (node {}): format negotiation never \ @@ -1139,8 +1166,12 @@ mod pipewire { }; // Build the EGL→CUDA importer up front; if it fails, log and fall back to the CPU path - // (we simply won't request dmabuf below). - let importer = if zerocopy { + // (we simply won't request dmabuf below). Skipped entirely when the encode backend is + // VAAPI: those frames go to the raw-dmabuf passthrough, and building the importer there + // would waste a CUDA probe — or worse, on an NVIDIA box forced to PUNKTFUNK_ENCODER=vaapi, + // succeed and produce CUDA payloads the VAAPI encoder must reject. 
+ let backend_is_vaapi = crate::encode::linux_zero_copy_is_vaapi(); + let importer = if zerocopy && !backend_is_vaapi { match crate::zerocopy::EglImporter::new() { Ok(i) => Some(i), Err(e) => { @@ -1157,10 +1188,7 @@ mod pipewire { let force_shm = std::env::var("PUNKTFUNK_FORCE_SHM").as_deref() == Ok("1"); // VAAPI zero-copy passthrough: zero-copy on, no EGL→CUDA importer (any non-NVIDIA host), and // the encoder backend is VAAPI → hand the raw dmabuf to the encoder (it imports + GPU-CSCs). - let vaapi_passthrough = zerocopy - && !force_shm - && importer.is_none() - && crate::encode::linux_zero_copy_is_vaapi(); + let vaapi_passthrough = zerocopy && !force_shm && importer.is_none() && backend_is_vaapi; // Modifiers our import stack handles for BGRx: the EGL-importable (tiled) set, plus LINEAR // (0) — NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer) import via // CUDA external memory instead. For the VAAPI passthrough path we advertise LINEAR only: @@ -1190,6 +1218,19 @@ mod pipewire { sample = ?&modifiers[..modifiers.len().min(6)], "zero-copy: advertising EGL-importable dmabuf modifiers" ); + } else if backend_is_vaapi && crate::capture::gpu_encode() { + // A VAAPI session on the CPU path pays three full-frame CPU touches (mmap de-pad + + // swscale RGB→NV12 + surface upload) — make the silent fallback visible. 
+ tracing::warn!( + "VAAPI encode with the CPU capture path (per-frame de-pad + swscale CSC + \ + upload) — zero-copy was disabled ({}); clear PUNKTFUNK_ZEROCOPY to restore \ + the dmabuf default", + if std::env::var_os("PUNKTFUNK_ZEROCOPY").is_some() { + "PUNKTFUNK_ZEROCOPY is set falsy" + } else { + "downgraded after a failed dmabuf negotiation" + } + ); } if want_dmabuf && !vaapi_passthrough && crate::zerocopy::nv12_enabled() { tracing::info!( diff --git a/crates/punktfunk-host/src/linux/zerocopy/mod.rs b/crates/punktfunk-host/src/linux/zerocopy/mod.rs index 3cf4e98..7dcb673 100644 --- a/crates/punktfunk-host/src/linux/zerocopy/mod.rs +++ b/crates/punktfunk-host/src/linux/zerocopy/mod.rs @@ -1,7 +1,10 @@ //! Zero-copy capture→encode (plan §9): the PipeWire dmabuf is imported into CUDA via EGL and //! handed straight to NVENC, eliminating the per-frame CPU copies (at 5K the CPU-copy path -//! moves ~3.5 GB/s). Opt in with `PUNKTFUNK_ZEROCOPY=1`; the CPU-copy path stays the default and -//! the runtime fallback (foreign-allocator / no-dmabuf / import failure). +//! moves ~3.5 GB/s). On NVENC opt in with `PUNKTFUNK_ZEROCOPY=1` (the CPU-copy path stays that +//! backend's default and the runtime fallback: foreign-allocator / no-dmabuf / import failure). +//! On the VAAPI (AMD/Intel) backend zero-copy is the **default** — its LINEAR-dmabuf passthrough +//! replaces a triple CPU touch (mmap de-pad + swscale CSC + surface upload) — with a one-shot +//! downgrade to the CPU path if the compositor never accepts the dmabuf offer. //! //! Pieces: [`cuda`] (driver-API FFI + the shared `CUcontext` + device buffers), [`egl`] (the //! headless EGLDisplay + dmabuf→`EGLImage`→CUDA import). The encoder's CUDA-frame path lives in @@ -11,19 +14,53 @@ pub mod cuda; pub mod egl; pub mod vulkan; +use std::sync::atomic::{AtomicBool, Ordering}; + pub use cuda::DeviceBuffer; pub use egl::{DmabufPlane, EglImporter}; -/// Whether a `PUNKTFUNK_*` flag is truthy (`1`/`true`/`yes`/`on`). 
-fn flag(name: &str) -> bool {
+/// Whether a `PUNKTFUNK_*` flag is truthy (`1`/`true`/`yes`/`on`), or `None` when unset.
+fn flag_opt(name: &str) -> Option<bool> {
     std::env::var(name)
+        .ok()
         .map(|v| matches!(v.trim(), "1" | "true" | "yes" | "on"))
-        .unwrap_or(false)
 }
 
-/// Whether the zero-copy path is opted in (`PUNKTFUNK_ZEROCOPY` truthy).
+/// Whether a `PUNKTFUNK_*` flag is truthy (`1`/`true`/`yes`/`on`); unset ⇒ false.
+fn flag(name: &str) -> bool {
+    flag_opt(name).unwrap_or(false)
+}
+
+/// One-shot downgrade latch: a VAAPI-passthrough capture whose dmabuf-only offer never negotiated
+/// (the compositor can't allocate a LINEAR BGRx dmabuf) flips this, so the encode loop's pipeline
+/// rebuild lands on the CPU offer instead of failing the same negotiation forever. Only consulted
+/// when `PUNKTFUNK_ZEROCOPY` is unset — an explicit `=1` keeps forcing the dmabuf offer.
+static VAAPI_DMABUF_FAILED: AtomicBool = AtomicBool::new(false);
+
+/// Record that the VAAPI LINEAR-dmabuf offer failed negotiation (see [`VAAPI_DMABUF_FAILED`]).
+pub fn note_vaapi_dmabuf_failed() {
+    VAAPI_DMABUF_FAILED.store(true, Ordering::Relaxed);
+}
+
+/// True when `PUNKTFUNK_ZEROCOPY` is explicitly truthy — the operator forced the dmabuf offer, so
+/// a failed negotiation keeps erroring loudly instead of silently downgrading to the CPU path.
+pub fn vaapi_dmabuf_forced() -> bool {
+    flag_opt("PUNKTFUNK_ZEROCOPY") == Some(true)
+}
+
+/// Whether the zero-copy path is on. `PUNKTFUNK_ZEROCOPY` decides when set (truthy = on, else
+/// off). Unset defaults **on for the VAAPI (AMD/Intel) backend** — the stock AMD/Intel install
+/// gets the GPU dmabuf path, not three full-frame CPU touches — unless a failed negotiation
+/// downgraded it ([`note_vaapi_dmabuf_failed`]); and **off for NVENC**, whose EGL→CUDA import
+/// stays opt-in (Mutter+NVIDIA has known dmabuf-capture races; see `PUNKTFUNK_FORCE_SHM`).
 pub fn enabled() -> bool {
-    flag("PUNKTFUNK_ZEROCOPY")
+    match flag_opt("PUNKTFUNK_ZEROCOPY") {
+        Some(v) => v,
+        None => {
+            crate::encode::linux_zero_copy_is_vaapi()
+                && !VAAPI_DMABUF_FAILED.load(Ordering::Relaxed)
+        }
+    }
 }
 
 /// Whether the NV12 convert path is opted in (`PUNKTFUNK_NV12` truthy). When set AND the zero-copy