From cbbeaa5c29802c8f684f14e2bc6c963e62921e48 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Mon, 15 Jun 2026 00:43:19 +0000 Subject: [PATCH] feat(host/windows): openh264 software H.264 encoder (GPU-less path) Windows Encoder impl via the openh264 crate (statically-bundled, BSD-2): low-latency screen-content config (Baseline/no-B-frames, bitrate RC, BT.709 limited, near-infinite GOP + forced-IDR recovery via request_keyframe), packed CPU pixels (BGRx/BGRA/RGB/RGBA/RGBx/BGR) -> I420 -> AnnexB with in-band SPS/PPS each IDR. Synchronous: submit encodes immediately, poll hands back the one AU, flush is a no-op. Windows open_video factory selects it (PUNKTFUNK_ENCODER=software|nvenc|auto; NVENC arm lands later), H.264-only with a clear error otherwise, SW bitrate ceiling. Unit-tested live on the VM: synthetic BGRx -> AnnexB IDR + SPS NAL. Unblocks the GPU-less capture->encode->FEC->send pipeline. Compiles clean on Windows + Linux. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 56 ++++++ crates/punktfunk-host/Cargo.toml | 3 + crates/punktfunk-host/src/encode.rs | 29 ++- crates/punktfunk-host/src/encode/sw.rs | 233 +++++++++++++++++++++++++ 4 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 crates/punktfunk-host/src/encode/sw.rs diff --git a/Cargo.lock b/Cargo.lock index 89439b3..68570ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -448,6 +448,12 @@ version = "3.20.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + [[package]] name = "bytes" version = "1.11.1" @@ -2049,6 +2055,15 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "nasm-rs" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "706bf8a5e8c8ddb99128c3291d31bd21f4bcde17f0f4c20ec678d85c74faa149" +dependencies = [ + "log", +] + [[package]] name = "ndk" version = "0.9.0" @@ -2240,6 +2255,27 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" +[[package]] +name = "openh264" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a12b82c14f702c2cece4e0fc28896c6a6bed5317dc13448c86ac41df91a6f82" +dependencies = [ + "openh264-sys2", + "wide", +] + +[[package]] +name = "openh264-sys2" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa9e072e9b270f3b291c80488dc160abc31ecc214ab3bfde937213cfd8c83b32" +dependencies = [ + "cc", + "nasm-rs", + "walkdir", +] + [[package]] name = "openssl-probe" version = "0.2.1" @@ -2579,6 +2615,7 @@ dependencies = [ "khronos-egl", "libc", "mdns-sd", + "openh264", "opus", "pipewire", "punktfunk-core", @@ -3071,6 +3108,15 @@ version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +[[package]] +name = "safe_arch" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323" +dependencies = [ + "bytemuck", +] + [[package]] name = "same-file" version = "1.0.6" @@ -4124,6 +4170,16 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "wide" +version = "0.7.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce5da8ecb62bcd8ec8b7ea19f69a51275e91299be594ea5cc6ef7819e16cd03" +dependencies = [ + "bytemuck", + "safe_arch", +] + [[package]] name = "winapi-util" version = "0.1.11" diff --git a/crates/punktfunk-host/Cargo.toml b/crates/punktfunk-host/Cargo.toml index 7e92778..2b30dc6 100644 --- a/crates/punktfunk-host/Cargo.toml +++ 
b/crates/punktfunk-host/Cargo.toml @@ -117,3 +117,6 @@ windows = { version = "0.62", features = [ "Win32_UI_WindowsAndMessaging", "Win32_System_StationsAndDesktops", ] } +# Software H.264 encoder (GPU-less path + NVENC fallback). The default `source` feature statically +# compiles OpenH264 (BSD-2) — no system lib, builds on MSVC; nasm on PATH adds the SIMD fast path. +openh264 = "0.9" diff --git a/crates/punktfunk-host/src/encode.rs b/crates/punktfunk-host/src/encode.rs index 94c37c0..03c7add 100644 --- a/crates/punktfunk-host/src/encode.rs +++ b/crates/punktfunk-host/src/encode.rs @@ -155,15 +155,40 @@ pub fn open_video( } Err(last.unwrap_or_else(|| anyhow::anyhow!("encoder open failed at every probed bitrate"))) } - #[cfg(not(target_os = "linux"))] + #[cfg(target_os = "windows")] + { + let _ = cuda; // always false on Windows (no Cuda payload) + let pref = std::env::var("PUNKTFUNK_ENCODER") + .unwrap_or_default() + .to_ascii_lowercase(); + if matches!(pref.as_str(), "nvenc" | "hw" | "nvidia") { + anyhow::bail!( + "NVENC hardware encode is not yet implemented on Windows — omit PUNKTFUNK_ENCODER \ + or set it to 'software' to use the openh264 encoder" + ); + } + anyhow::ensure!( + codec == Codec::H264, + "the Windows software encoder supports H.264 only; client negotiated {codec:?} \ + (request H264, or use a GPU host once NVENC lands)" + ); + // Software H.264 realistically caps far below the negotiated hardware rates. 
+ const SW_BITRATE_CEIL: u64 = 100_000_000; + let enc = + sw::OpenH264Encoder::open(format, width, height, fps, bitrate_bps.min(SW_BITRATE_CEIL))?; + Ok(Box::new(enc) as Box) + } + #[cfg(not(any(target_os = "linux", target_os = "windows")))] { let _ = (codec, format, width, height, fps, bitrate_bps, cuda); - anyhow::bail!("NVENC encode requires Linux (FFmpeg + NVIDIA driver)") + anyhow::bail!("video encode requires Linux or Windows") } } #[cfg(target_os = "linux")] mod linux; +#[cfg(target_os = "windows")] +mod sw; #[cfg(test)] mod tests { diff --git a/crates/punktfunk-host/src/encode/sw.rs b/crates/punktfunk-host/src/encode/sw.rs new file mode 100644 index 0000000..4a4298d --- /dev/null +++ b/crates/punktfunk-host/src/encode/sw.rs @@ -0,0 +1,233 @@ +//! Software H.264 encoder (openh264) — the GPU-less encode path for the Windows host (and a +//! fallback when NVENC is unavailable). Low-latency screen-content config: single-reference, +//! no B-frames (Baseline), bitrate rate-control, in-band SPS/PPS each IDR, BT.709 limited range. +//! Synchronous: `submit` encodes immediately and stashes the AU for `poll` (no internal queue). + +use super::{EncodedFrame, Encoder}; +use crate::capture::{CapturedFrame, FramePayload, PixelFormat}; +use anyhow::{bail, ensure, Context, Result}; +use openh264::encoder::{ + BitRate, Complexity, Encoder as Oh264, EncoderConfig, FrameRate, FrameType, IntraFramePeriod, + Profile, RateControlMode, SpsPpsStrategy, UsageType, +}; +use openh264::formats::{BgraSliceU8, RgbSliceU8, YUVBuffer}; +use openh264::OpenH264API; + +pub struct OpenH264Encoder { + enc: Oh264, + yuv: YUVBuffer, + width: u32, + height: u32, + fps: u32, + src_format: PixelFormat, + /// BGRA scratch for the 3-bpp (Bgr) and R/B-swapped (Rgba/Rgbx) formats openh264 can't wrap + /// directly. Reused across frames. + scratch: Vec<u8>, + frame_idx: i64, + force_kf: bool, + /// At most one AU per submit (no lookahead), handed back by the next `poll`.
+ pending: Option<EncodedFrame>, +} + +// openh264's Encoder holds a raw C handle (not auto-Send); it lives on the single encode thread. +unsafe impl Send for OpenH264Encoder {} + +impl OpenH264Encoder { + pub fn open( + format: PixelFormat, + width: u32, + height: u32, + fps: u32, + bitrate_bps: u64, + ) -> Result<Self> { + // validate_dimensions() ran in open_video: even, non-zero, <= 4096. + let bps: u32 = bitrate_bps.try_into().unwrap_or(u32::MAX); + let cfg = EncoderConfig::new() + .usage_type(UsageType::ScreenContentRealTime) + .max_frame_rate(FrameRate::from_hz(fps.max(1) as f32)) + .rate_control_mode(RateControlMode::Bitrate) + .bitrate(BitRate::from_bps(bps)) + .skip_frames(false) + .intra_frame_period(IntraFramePeriod::from_num_frames(intra_period_frames(fps))) + .sps_pps_strategy(SpsPpsStrategy::ConstantId) // SPS/PPS in-band on every IDR + .num_threads(num_threads()) + .scene_change_detect(false) // no surprise IDRs (bitrate spikes / freeze) + .adaptive_quantization(true) + .complexity(Complexity::Low) // latency over BD-rate + .profile(Profile::Baseline); // no B-frames; BT.709 limited is the crate default VUI + let api = OpenH264API::from_source(); // statically-bundled build (default `source` feature) + let enc = Oh264::with_api_config(api, cfg).context("openh264 Encoder::with_api_config")?; + let yuv = YUVBuffer::new(width as usize, height as usize); + tracing::info!( + "openh264 software encoder: {width}x{height}@{fps} {} Mbps (Baseline, screen-content)", + bps / 1_000_000 + ); + Ok(Self { + enc, + yuv, + width, + height, + fps, + src_format: format, + scratch: Vec::new(), + frame_idx: 0, + force_kf: false, + pending: None, + }) + } + + /// Normalize a packed source buffer into the reused BGRA `scratch` ([B,G,R,A]). `rgb_order` + /// = source is R,G,B (swap into B,G,R); otherwise source is already B,G,R.
+ fn normalize_to_bgra(&mut self, src: &[u8], src_bpp: usize, rgb_order: bool) { + let w = self.width as usize; + let h = self.height as usize; + self.scratch.resize(w * h * 4, 0); + for px in 0..(w * h) { + let s = &src[px * src_bpp..px * src_bpp + 3]; + let d = &mut self.scratch[px * 4..px * 4 + 4]; + if rgb_order { + d[0] = s[2]; + d[1] = s[1]; + d[2] = s[0]; + } else { + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + } + d[3] = 0xff; + } + } +} + +impl Encoder for OpenH264Encoder { + fn submit(&mut self, captured: &CapturedFrame) -> Result<()> { + ensure!( + captured.width == self.width && captured.height == self.height, + "captured {}x{} != encoder {}x{}", + captured.width, + captured.height, + self.width, + self.height + ); + ensure!( + captured.format == self.src_format, + "captured format {:?} != encoder source {:?}", + captured.format, + self.src_format + ); + // Refutable once the capture backend adds `FramePayload::D3d11`; today `Cpu` is the only + // non-Linux variant, so the pattern is (temporarily) irrefutable. 
+ #[allow(irrefutable_let_patterns)] + let FramePayload::Cpu(bytes) = &captured.payload + else { + bail!("openh264 backend requires a CPU frame payload"); + }; + let w = self.width as usize; + let h = self.height as usize; + ensure!( + bytes.len() >= w * h * self.src_format.bytes_per_pixel(), + "captured buffer {} bytes too small for {w}x{h} {:?}", + bytes.len(), + self.src_format + ); + + match self.src_format { + PixelFormat::Rgb => self.yuv.read_rgb(RgbSliceU8::new(&bytes[..w * h * 3], (w, h))), + PixelFormat::Bgra | PixelFormat::Bgrx => { + self.yuv.read_rgb(BgraSliceU8::new(&bytes[..w * h * 4], (w, h))) + } + PixelFormat::Rgba | PixelFormat::Rgbx => { + self.normalize_to_bgra(bytes, 4, true); + self.yuv.read_rgb(BgraSliceU8::new(&self.scratch, (w, h))); + } + PixelFormat::Bgr => { + self.normalize_to_bgra(bytes, 3, false); + self.yuv.read_rgb(BgraSliceU8::new(&self.scratch, (w, h))); + } + } + + if self.force_kf { + self.enc.force_intra_frame(); + self.force_kf = false; + } + let bs = self.enc.encode(&self.yuv).context("openh264 encode")?; + let mut data = Vec::new(); + bs.write_vec(&mut data); // AnnexB start codes; SPS/PPS prepended on IDR + if !data.is_empty() { + let keyframe = matches!(bs.frame_type(), FrameType::IDR | FrameType::I); + let pts_ns = self.frame_idx as u64 * 1_000_000_000 / self.fps.max(1) as u64; + self.pending = Some(EncodedFrame { + data, + pts_ns, + keyframe, + }); + } + self.frame_idx += 1; + Ok(()) + } + + fn request_keyframe(&mut self) { + self.force_kf = true; + } + + fn poll(&mut self) -> Result<Option<EncodedFrame>> { + Ok(self.pending.take()) + } + + fn flush(&mut self) -> Result<()> { + Ok(()) // synchronous: nothing buffered + } +} + +/// Approximate infinite-GOP: insert IDRs rarely (recovery is via `request_keyframe`/RFI). Env +/// `PUNKTFUNK_OH264_GOP` overrides (0 = encoder-auto).
+fn intra_period_frames(fps: u32) -> u32 { + if let Ok(v) = std::env::var("PUNKTFUNK_OH264_GOP") { + if let Ok(n) = v.trim().parse::<u32>() { + return n; + } + } + fps.max(1).saturating_mul(600) // ~10 min between automatic IDRs +} + +/// Encode threads. Env `PUNKTFUNK_OH264_THREADS` overrides; default 2 (latency over throughput). +fn num_threads() -> u16 { + std::env::var("PUNKTFUNK_OH264_THREADS") + .ok() + .and_then(|v| v.trim().parse::<u16>().ok()) + .unwrap_or(2) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::capture::{CapturedFrame, FramePayload, PixelFormat}; + + #[test] + fn encodes_synthetic_frame_to_annexb_idr() { + let (w, h, fps) = (1280u32, 720u32, 60u32); + let mut enc = + OpenH264Encoder::open(PixelFormat::Bgrx, w, h, fps, 8_000_000).expect("open openh264"); + // A flat gray BGRx frame. + let frame = CapturedFrame { + width: w, + height: h, + pts_ns: 0, + format: PixelFormat::Bgrx, + payload: FramePayload::Cpu(vec![0x80u8; (w * h * 4) as usize]), + }; + enc.submit(&frame).expect("submit"); + let au = enc.poll().expect("poll").expect("an AU"); + assert!(au.keyframe, "first frame must be an IDR"); + // AnnexB start code + an SPS NAL (type 7) somewhere in the first frame. + assert!( + au.data.starts_with(&[0, 0, 0, 1]) || au.data.starts_with(&[0, 0, 1]), + "expected AnnexB start code" + ); + let has_sps = au + .data + .windows(5) + .any(|w| w[0] == 0 && w[1] == 0 && w[2] == 0 && w[3] == 1 && (w[4] & 0x1f) == 7); + assert!(has_sps, "IDR must carry an SPS NAL (type 7)"); + } +}