4cc57d5c39
apple / swift (push) Successful in 56s
ci / rust (push) Successful in 1m36s
android / android (push) Successful in 1m56s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 28s
deb / build-publish (push) Successful in 2m26s
decky / build-publish (push) Successful in 11s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 5s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
ci / bench (push) Successful in 4m33s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m15s
docker / deploy-docs (push) Successful in 18s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 7m58s
The Windows host capped at ~60 fps with 35-40 ms latency on a GPU-heavy game: the per-frame capture→encode path shared the 3D engine with the game and got scheduled behind it. Rework to minimize 3D-engine work per frame: - VideoConverter (D3D11 video processor): capture → NVENC-native NV12/P010 so NVENC skips its internal RGB→YUV (a 3D/compute step). Wired into both DDA (dxgi.rs) and WGC (wgc.rs). New PixelFormat::Nv12/P010 + NVENC YUV input. - GPU scheduling hardening (Apollo-style): D3DKMTSetProcessSchedulingPriorityClass HIGH, absolute SetGPUThreadPriority, SetMaximumFrameLatency(1). - WGC SDR zero-copy (hold pool frames; no CopyResource). DDA keeps a fast CopyResource to decouple its single-frame acquire/release from the async convert. - Pipelined helper encode loop (PUNKTFUNK_ENCODE_DEPTH, default 1) + perf split (cap_wait / encode / write). Live on the RTX 4090: hard 60 fps ceiling removed (now scene-scaling 40-200+), latency much reduced. Residual cap in GPU-pinned scenes is the irreducible RGB→YUV convert (no fixed-function unit on NVIDIA — VideoProcessing engine reads 0%) waiting behind an uncapped game under WDDM context time-slicing; Linux avoids it via gamescope capping the game to the display refresh. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
248 lines
9.3 KiB
Rust
248 lines
9.3 KiB
Rust
//! Software H.264 encoder (openh264) — the GPU-less encode path for the Windows host (and a
|
|
//! fallback when NVENC is unavailable). Low-latency screen-content config: single-reference,
|
|
//! no B-frames (Baseline), bitrate rate-control, in-band SPS/PPS each IDR, BT.709 limited range.
|
|
//! Synchronous: `submit` encodes immediately and stashes the AU for `poll` (no internal queue).
|
|
|
|
use super::{EncodedFrame, Encoder};
|
|
use crate::capture::{CapturedFrame, FramePayload, PixelFormat};
|
|
use anyhow::{bail, ensure, Context, Result};
|
|
use openh264::encoder::{
|
|
BitRate, Complexity, Encoder as Oh264, EncoderConfig, FrameRate, FrameType, IntraFramePeriod,
|
|
Profile, RateControlMode, SpsPpsStrategy, UsageType,
|
|
};
|
|
use openh264::formats::{BgraSliceU8, RgbSliceU8, YUVBuffer};
|
|
use openh264::OpenH264API;
|
|
|
|
pub struct OpenH264Encoder {
|
|
enc: Oh264,
|
|
yuv: YUVBuffer,
|
|
width: u32,
|
|
height: u32,
|
|
fps: u32,
|
|
src_format: PixelFormat,
|
|
/// BGRA scratch for the 3-bpp (Bgr) and R/B-swapped (Rgba/Rgbx) formats openh264 can't wrap
|
|
/// directly. Reused across frames.
|
|
scratch: Vec<u8>,
|
|
frame_idx: i64,
|
|
force_kf: bool,
|
|
/// At most one AU per submit (no lookahead), handed back by the next `poll`.
|
|
pending: Option<EncodedFrame>,
|
|
}
|
|
|
|
// openh264's Encoder holds a raw C handle (not auto-Send); it lives on the single encode thread.
|
|
unsafe impl Send for OpenH264Encoder {}
|
|
|
|
impl OpenH264Encoder {
|
|
pub fn open(
|
|
format: PixelFormat,
|
|
width: u32,
|
|
height: u32,
|
|
fps: u32,
|
|
bitrate_bps: u64,
|
|
) -> Result<Self> {
|
|
// validate_dimensions() ran in open_video: even, non-zero, <= 4096.
|
|
let bps: u32 = bitrate_bps.try_into().unwrap_or(u32::MAX);
|
|
let cfg = EncoderConfig::new()
|
|
.usage_type(UsageType::ScreenContentRealTime)
|
|
.max_frame_rate(FrameRate::from_hz(fps.max(1) as f32))
|
|
.rate_control_mode(RateControlMode::Bitrate)
|
|
.bitrate(BitRate::from_bps(bps))
|
|
.skip_frames(false)
|
|
.intra_frame_period(IntraFramePeriod::from_num_frames(intra_period_frames(fps)))
|
|
.sps_pps_strategy(SpsPpsStrategy::ConstantId) // SPS/PPS in-band on every IDR
|
|
.num_threads(num_threads())
|
|
.scene_change_detect(false) // no surprise IDRs (bitrate spikes / freeze)
|
|
.adaptive_quantization(true)
|
|
.complexity(Complexity::Low) // latency over BD-rate
|
|
.profile(Profile::Baseline); // no B-frames; BT.709 limited is the crate default VUI
|
|
let api = OpenH264API::from_source(); // statically-bundled build (default `source` feature)
|
|
let enc = Oh264::with_api_config(api, cfg).context("openh264 Encoder::with_api_config")?;
|
|
let yuv = YUVBuffer::new(width as usize, height as usize);
|
|
tracing::info!(
|
|
"openh264 software encoder: {width}x{height}@{fps} {} Mbps (Baseline, screen-content)",
|
|
bps / 1_000_000
|
|
);
|
|
Ok(Self {
|
|
enc,
|
|
yuv,
|
|
width,
|
|
height,
|
|
fps,
|
|
src_format: format,
|
|
scratch: Vec::new(),
|
|
frame_idx: 0,
|
|
force_kf: false,
|
|
pending: None,
|
|
})
|
|
}
|
|
|
|
/// Normalize a packed source buffer into the reused BGRA `scratch` ([B,G,R,A]). `rgb_order`
|
|
/// = source is R,G,B (swap into B,G,R); otherwise source is already B,G,R.
|
|
fn normalize_to_bgra(&mut self, src: &[u8], src_bpp: usize, rgb_order: bool) {
|
|
let w = self.width as usize;
|
|
let h = self.height as usize;
|
|
self.scratch.resize(w * h * 4, 0);
|
|
for px in 0..(w * h) {
|
|
let s = &src[px * src_bpp..px * src_bpp + 3];
|
|
let d = &mut self.scratch[px * 4..px * 4 + 4];
|
|
if rgb_order {
|
|
d[0] = s[2];
|
|
d[1] = s[1];
|
|
d[2] = s[0];
|
|
} else {
|
|
d[0] = s[0];
|
|
d[1] = s[1];
|
|
d[2] = s[2];
|
|
}
|
|
d[3] = 0xff;
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Encoder for OpenH264Encoder {
|
|
fn submit(&mut self, captured: &CapturedFrame) -> Result<()> {
|
|
ensure!(
|
|
captured.width == self.width && captured.height == self.height,
|
|
"captured {}x{} != encoder {}x{}",
|
|
captured.width,
|
|
captured.height,
|
|
self.width,
|
|
self.height
|
|
);
|
|
ensure!(
|
|
captured.format == self.src_format,
|
|
"captured format {:?} != encoder source {:?}",
|
|
captured.format,
|
|
self.src_format
|
|
);
|
|
// Refutable once the capture backend adds `FramePayload::D3d11`; today `Cpu` is the only
|
|
// non-Linux variant, so the pattern is (temporarily) irrefutable.
|
|
#[allow(irrefutable_let_patterns)]
|
|
let FramePayload::Cpu(bytes) = &captured.payload
|
|
else {
|
|
bail!("openh264 backend requires a CPU frame payload");
|
|
};
|
|
let w = self.width as usize;
|
|
let h = self.height as usize;
|
|
ensure!(
|
|
bytes.len() >= w * h * self.src_format.bytes_per_pixel(),
|
|
"captured buffer {} bytes too small for {w}x{h} {:?}",
|
|
bytes.len(),
|
|
self.src_format
|
|
);
|
|
|
|
match self.src_format {
|
|
PixelFormat::Rgb => self
|
|
.yuv
|
|
.read_rgb(RgbSliceU8::new(&bytes[..w * h * 3], (w, h))),
|
|
PixelFormat::Bgra | PixelFormat::Bgrx => self
|
|
.yuv
|
|
.read_rgb(BgraSliceU8::new(&bytes[..w * h * 4], (w, h))),
|
|
PixelFormat::Rgba | PixelFormat::Rgbx => {
|
|
self.normalize_to_bgra(bytes, 4, true);
|
|
self.yuv.read_rgb(BgraSliceU8::new(&self.scratch, (w, h)));
|
|
}
|
|
PixelFormat::Bgr => {
|
|
self.normalize_to_bgra(bytes, 3, false);
|
|
self.yuv.read_rgb(BgraSliceU8::new(&self.scratch, (w, h)));
|
|
}
|
|
// 10-bit HDR comes only from the GPU NVENC path; the software 8-bit H.264 encoder
|
|
// can't represent it (and never receives it — the capturer pairs Rgb10a2 with NVENC).
|
|
PixelFormat::Rgb10a2 => {
|
|
anyhow::bail!("software H.264 encoder cannot encode 10-bit HDR (Rgb10a2)")
|
|
}
|
|
// NV12/P010 are GPU-resident video-processor outputs for the NVENC path; the software
|
|
// encoder never receives them (it only gets CPU RGB frames).
|
|
PixelFormat::Nv12 | PixelFormat::P010 => {
|
|
anyhow::bail!(
|
|
"software encoder cannot encode YUV GPU textures (NV12/P010 → NVENC only)"
|
|
)
|
|
}
|
|
}
|
|
|
|
if self.force_kf {
|
|
self.enc.force_intra_frame();
|
|
self.force_kf = false;
|
|
}
|
|
let bs = self.enc.encode(&self.yuv).context("openh264 encode")?;
|
|
let mut data = Vec::new();
|
|
bs.write_vec(&mut data); // AnnexB start codes; SPS/PPS prepended on IDR
|
|
if !data.is_empty() {
|
|
let keyframe = matches!(bs.frame_type(), FrameType::IDR | FrameType::I);
|
|
let pts_ns = self.frame_idx as u64 * 1_000_000_000 / self.fps.max(1) as u64;
|
|
self.pending = Some(EncodedFrame {
|
|
data,
|
|
pts_ns,
|
|
keyframe,
|
|
});
|
|
}
|
|
self.frame_idx += 1;
|
|
Ok(())
|
|
}
|
|
|
|
fn request_keyframe(&mut self) {
|
|
self.force_kf = true;
|
|
}
|
|
|
|
fn poll(&mut self) -> Result<Option<EncodedFrame>> {
|
|
Ok(self.pending.take())
|
|
}
|
|
|
|
fn flush(&mut self) -> Result<()> {
|
|
Ok(()) // synchronous: nothing buffered
|
|
}
|
|
}
|
|
|
|
/// Number of frames between automatic IDRs.
///
/// The default approximates an infinite GOP (recovery is via `request_keyframe`/RFI): roughly
/// ten minutes' worth of frames, i.e. `max(fps, 1) * 600`, saturating at `u32::MAX`. Setting the
/// env var `PUNKTFUNK_OH264_GOP` to an unsigned integer overrides this (0 = encoder-auto);
/// values that do not parse are ignored.
fn intra_period_frames(fps: u32) -> u32 {
    let from_env = std::env::var("PUNKTFUNK_OH264_GOP")
        .ok()
        .and_then(|raw| raw.trim().parse::<u32>().ok());
    match from_env {
        Some(frames) => frames,
        None => fps.max(1).saturating_mul(600), // ~10 min between automatic IDRs
    }
}
|
|
|
|
/// Number of encoder threads.
///
/// Defaults to 2 (latency over throughput). The env var `PUNKTFUNK_OH264_THREADS` overrides it
/// when it holds a value that parses as `u16`; anything else falls back to the default.
fn num_threads() -> u16 {
    const DEFAULT_THREADS: u16 = 2;
    match std::env::var("PUNKTFUNK_OH264_THREADS") {
        Ok(raw) => raw.trim().parse::<u16>().unwrap_or(DEFAULT_THREADS),
        Err(_) => DEFAULT_THREADS,
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::capture::{CapturedFrame, FramePayload, PixelFormat};

    /// H.264 NAL unit type of a sequence parameter set.
    const NAL_SPS: u8 = 7;

    /// True when `data` contains an AnnexB start code directly followed by a NAL header of type
    /// `nal_type`. Matching on `00 00 01` covers both the 3-byte and the 4-byte start code,
    /// since the longer form ends with the shorter one.
    fn contains_nal(data: &[u8], nal_type: u8) -> bool {
        data.windows(4)
            .any(|win| win[..3] == [0, 0, 1] && (win[3] & 0x1f) == nal_type)
    }

    /// A single flat-gray BGRx frame must come out as an AnnexB IDR that carries an SPS.
    #[test]
    fn encodes_synthetic_frame_to_annexb_idr() {
        let (w, h, fps) = (1280u32, 720u32, 60u32);
        let mut enc =
            OpenH264Encoder::open(PixelFormat::Bgrx, w, h, fps, 8_000_000).expect("open openh264");
        // A flat gray BGRx frame.
        let frame = CapturedFrame {
            width: w,
            height: h,
            pts_ns: 0,
            format: PixelFormat::Bgrx,
            payload: FramePayload::Cpu(vec![0x80u8; (w * h * 4) as usize]),
        };
        enc.submit(&frame).expect("submit");
        let au = enc.poll().expect("poll").expect("an AU");
        assert!(au.keyframe, "first frame must be an IDR");
        // AnnexB start code + an SPS NAL (type 7) somewhere in the first frame.
        assert!(
            au.data.starts_with(&[0, 0, 0, 1]) || au.data.starts_with(&[0, 0, 1]),
            "expected AnnexB start code"
        );
        assert!(
            contains_nal(&au.data, NAL_SPS),
            "IDR must carry an SPS NAL (type 7)"
        );
        // `poll` hands each AU out exactly once.
        assert!(
            enc.poll().expect("second poll").is_none(),
            "the AU must be handed out only once"
        );
    }
}
|