feat(host/encode): VAAPI zero-copy dmabuf import (AMD/Intel GPU CSC)
apple / swift (push) Successful in 57s
ci / rust (push) Successful in 1m39s
ci / web (push) Successful in 32s
ci / docs-site (push) Successful in 31s
android / android (push) Successful in 3m29s
windows-host / package (push) Successful in 3m39s
deb / build-publish (push) Successful in 3m7s
decky / build-publish (push) Successful in 22s
ci / bench (push) Successful in 4m43s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 16s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 2m27s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 3m24s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 22s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 2m18s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m22s
docker / deploy-docs (push) Successful in 21s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m53s
apple / swift (push) Successful in 57s
ci / rust (push) Successful in 1m39s
ci / web (push) Successful in 32s
ci / docs-site (push) Successful in 31s
android / android (push) Successful in 3m29s
windows-host / package (push) Successful in 3m39s
deb / build-publish (push) Successful in 3m7s
decky / build-publish (push) Successful in 22s
ci / bench (push) Successful in 4m43s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 16s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 2m27s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 3m24s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 22s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 2m18s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m22s
docker / deploy-docs (push) Successful in 21s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m53s
Phase 2 of AMD/Intel support: the VAAPI encoder now takes the capture dmabuf directly and does the RGB->NV12 colour conversion on the GPU's video engine, eliminating the host-side de-pad + swscale CSC + upload the CPU path pays. - capture: a vendor-neutral FramePayload::Dmabuf (dup'd fd + fourcc/modifier/ layout). When zero-copy is on, the EGL->CUDA importer is unavailable (any non-NVIDIA host), and the backend is VAAPI, the capturer advertises LINEAR dmabuf and hands the raw buffer to the encoder instead of CPU-copying it. - encode/vaapi: the encoder self-configures from the first frame's payload (no open_video signature change). The dmabuf arm wraps the buffer as an AV_PIX_FMT_DRM_PRIME frame and pushes it through a filter graph buffer(drm_prime) -> hwmap(vaapi) -> scale_vaapi=nv12 -> buffersink; the encoder takes NV12 surfaces straight from the sink. The Phase 1 CPU-upload path is kept as the other arm (used when capture produces CPU frames). Live-validated on a Radeon 780M (real Sway/xdpw desktop capture): correct, pixel-perfect HEVC, and ~10x less host CPU at 1440p (4.2s -> 0.4s of CPU for 300 frames) -- the de-pad/CSC/upload moves to the GPU. NVIDIA unchanged (zero-copy still imports to CUDA; the passthrough path only engages on non-NVIDIA hosts). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
//! instead of leaking it to process exit. The portal thread (when used) still parks on its zbus
|
||||
//! connection until process exit.
|
||||
|
||||
use super::{CapturedFrame, Capturer, FramePayload, PixelFormat};
|
||||
use super::{CapturedFrame, Capturer, DmabufFrame, FramePayload, PixelFormat};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use std::os::fd::OwnedFd;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
@@ -425,11 +425,11 @@ fn portal_thread_remote_desktop(setup_tx: std::sync::mpsc::Sender<Result<(OwnedF
|
||||
mod pipewire {
|
||||
//! The PipeWire consumer, confined to its own thread (the PW types are `!Send`).
|
||||
|
||||
use super::{CapturedFrame, FramePayload, PixelFormat};
|
||||
use super::{CapturedFrame, DmabufFrame, FramePayload, PixelFormat};
|
||||
use anyhow::{Context, Result};
|
||||
use pipewire as pw;
|
||||
use pw::{properties::properties, spa};
|
||||
use std::os::fd::OwnedFd;
|
||||
use std::os::fd::{FromRawFd, OwnedFd};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::mpsc::SyncSender;
|
||||
use std::sync::Arc;
|
||||
@@ -464,8 +464,12 @@ mod pipewire {
|
||||
/// Set once a video format is agreed (`param_changed`), so a first-frame timeout can tell
|
||||
/// "format never negotiated" apart from "negotiated but no buffers arrived".
|
||||
negotiated: Arc<AtomicBool>,
|
||||
/// Present when zero-copy is enabled: imports a dmabuf → CUDA device buffer.
|
||||
/// Present when zero-copy is enabled on NVIDIA: imports a dmabuf → CUDA device buffer.
|
||||
importer: Option<crate::zerocopy::EglImporter>,
|
||||
/// VAAPI zero-copy: hand the raw dmabuf to the encoder (which imports + GPU-CSCs it) instead
|
||||
/// of a CUDA import. Set when zero-copy is on, the EGL→CUDA importer is unavailable, and the
|
||||
/// encoder backend is VAAPI (AMD/Intel).
|
||||
vaapi_passthrough: bool,
|
||||
/// `PUNKTFUNK_NV12`: on the tiled EGL/GL zero-copy path, convert to NV12 on the GPU and feed
|
||||
/// NVENC native YUV (Tier 2A). Off ⇒ the BGRx path is unchanged.
|
||||
nv12: bool,
|
||||
@@ -767,6 +771,57 @@ mod pipewire {
|
||||
}
|
||||
}
|
||||
|
||||
// VAAPI zero-copy passthrough: hand the raw dmabuf straight to the encoder, which imports
|
||||
// it into a VA surface and does RGB→NV12 on the GPU video engine. No CUDA importer here.
|
||||
if ud.vaapi_passthrough {
|
||||
if let Some(fmt) = ud.format {
|
||||
if datas[0].type_() == pw::spa::buffer::DataType::DmaBuf {
|
||||
if let Some(fourcc) = crate::zerocopy::drm_fourcc(fmt) {
|
||||
let chunk = datas[0].chunk();
|
||||
let offset = chunk.offset();
|
||||
let stride = chunk.stride().max(0) as u32;
|
||||
// dup the fd so it survives the SPA buffer recycle — the encode thread
|
||||
// imports it. (Content stability across the brief map+CSC window relies on
|
||||
// the compositor's buffer-pool depth, like any zero-copy capture.)
|
||||
let dup =
|
||||
unsafe { libc::fcntl(datas[0].fd() as i32, libc::F_DUPFD_CLOEXEC, 0) };
|
||||
if dup >= 0 {
|
||||
let pts_ns = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.map(|d| d.as_nanos() as u64)
|
||||
.unwrap_or(0);
|
||||
let _ = ud.tx.try_send(CapturedFrame {
|
||||
width: w as u32,
|
||||
height: h as u32,
|
||||
pts_ns,
|
||||
format: fmt,
|
||||
payload: FramePayload::Dmabuf(DmabufFrame {
|
||||
fd: unsafe { OwnedFd::from_raw_fd(dup) },
|
||||
fourcc,
|
||||
modifier: ud.modifier,
|
||||
offset,
|
||||
stride,
|
||||
}),
|
||||
});
|
||||
static ONCE: std::sync::atomic::AtomicBool =
|
||||
std::sync::atomic::AtomicBool::new(true);
|
||||
if ONCE.swap(false, Ordering::Relaxed) {
|
||||
tracing::info!(
|
||||
w,
|
||||
h,
|
||||
modifier = ud.modifier,
|
||||
fourcc = format_args!("{:#010x}", fourcc),
|
||||
"zero-copy: handing dmabuf to VAAPI (GPU import + CSC)"
|
||||
);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Not a dmabuf (or unmappable format) — fall through to the CPU de-pad path.
|
||||
}
|
||||
|
||||
// Zero-copy path: if the buffer is a dmabuf and we have an importer, import it
|
||||
// into a CUDA device buffer (no CPU touch) and deliver that. Otherwise fall
|
||||
// through to the shm de-pad copy below.
|
||||
@@ -998,28 +1053,39 @@ mod pipewire {
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// Modifiers our import stack handles for BGRx: the EGL-importable (tiled) set, plus
|
||||
// LINEAR (0) — NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer)
|
||||
// import via CUDA external memory instead. Tiled stays first so allocators that can do
|
||||
// both (KWin) prefer it. If none, we can't negotiate dmabuf → shm path.
|
||||
let mut modifiers = importer
|
||||
.as_ref()
|
||||
.map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap()))
|
||||
.unwrap_or_default();
|
||||
if importer.is_some() && !modifiers.contains(&0) {
|
||||
modifiers.push(0); // DRM_FORMAT_MOD_LINEAR
|
||||
}
|
||||
// PUNKTFUNK_FORCE_SHM=1 forces the race-free download path (SHM, no dmabuf) — required on
|
||||
// Mutter+NVIDIA where dmabuf capture has no working sync and shows stale frames. KWin/
|
||||
// gamescope don't need it (they blit into the buffer, so no read-before-render race).
|
||||
let force_shm = std::env::var("PUNKTFUNK_FORCE_SHM").as_deref() == Ok("1");
|
||||
let want_dmabuf = importer.is_some() && !modifiers.is_empty() && !force_shm;
|
||||
// VAAPI zero-copy passthrough: zero-copy on, no EGL→CUDA importer (any non-NVIDIA host), and
|
||||
// the encoder backend is VAAPI → hand the raw dmabuf to the encoder (it imports + GPU-CSCs).
|
||||
let vaapi_passthrough = zerocopy
|
||||
&& !force_shm
|
||||
&& importer.is_none()
|
||||
&& crate::encode::linux_zero_copy_is_vaapi();
|
||||
// Modifiers our import stack handles for BGRx: the EGL-importable (tiled) set, plus LINEAR
|
||||
// (0) — NVIDIA's EGL won't list it, but LINEAR dmabufs (gamescope's only offer) import via
|
||||
// CUDA external memory instead. For the VAAPI passthrough path we advertise LINEAR only:
|
||||
// radeonsi/iHD import it and any compositor can allocate it.
|
||||
let mut modifiers = importer
|
||||
.as_ref()
|
||||
.map(|i| i.supported_modifiers(crate::zerocopy::drm_fourcc(PixelFormat::Bgrx).unwrap()))
|
||||
.unwrap_or_default();
|
||||
if (importer.is_some() || vaapi_passthrough) && !modifiers.contains(&0) {
|
||||
modifiers.push(0); // DRM_FORMAT_MOD_LINEAR
|
||||
}
|
||||
let want_dmabuf =
|
||||
(importer.is_some() || vaapi_passthrough) && !modifiers.is_empty() && !force_shm;
|
||||
if force_shm {
|
||||
tracing::info!(
|
||||
"capture: PUNKTFUNK_FORCE_SHM — race-free SHM download path (no dmabuf, no zero-copy)"
|
||||
);
|
||||
} else if zerocopy && !want_dmabuf {
|
||||
tracing::warn!("zero-copy: no EGL-importable dmabuf modifiers — using CPU path");
|
||||
tracing::warn!("zero-copy: no importable dmabuf modifiers — using CPU path");
|
||||
} else if vaapi_passthrough {
|
||||
tracing::info!(
|
||||
"zero-copy: advertising LINEAR dmabuf for direct VAAPI import (GPU CSC)"
|
||||
);
|
||||
} else if want_dmabuf {
|
||||
tracing::info!(
|
||||
count = modifiers.len(),
|
||||
@@ -1027,7 +1093,7 @@ mod pipewire {
|
||||
"zero-copy: advertising EGL-importable dmabuf modifiers"
|
||||
);
|
||||
}
|
||||
if want_dmabuf && crate::zerocopy::nv12_enabled() {
|
||||
if want_dmabuf && !vaapi_passthrough && crate::zerocopy::nv12_enabled() {
|
||||
tracing::info!(
|
||||
"PUNKTFUNK_NV12: tiled dmabufs convert to NV12 (BT.709 limited) on the GPU — NVENC \
|
||||
fed native YUV (no internal RGB→YUV CSC)"
|
||||
@@ -1042,6 +1108,7 @@ mod pipewire {
|
||||
active,
|
||||
negotiated,
|
||||
importer,
|
||||
vaapi_passthrough,
|
||||
nv12: crate::zerocopy::nv12_enabled(),
|
||||
dbg_log_n: 0,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user