diff --git a/clients/apple/Sources/PunktfunkClient/SpeedTestSheet.swift b/clients/apple/Sources/PunktfunkClient/SpeedTestSheet.swift index 5ccd59a..660502c 100644 --- a/clients/apple/Sources/PunktfunkClient/SpeedTestSheet.swift +++ b/clients/apple/Sources/PunktfunkClient/SpeedTestSheet.swift @@ -23,10 +23,12 @@ private final class ProbeToken: @unchecked Sendable { /// What the host is asked to burst: the host's full probe ceiling (it clamps to ≤ 3 Gbps), /// so the measurement surfaces the link's real ceiling instead of an artificial cap — /// bursting ABOVE what the link can carry is how the probe finds where delivery falls off. -/// Two seconds rides out scheduler jitter. File-scope so the detached probe task reads them -/// without crossing into the view's main actor. +/// Five seconds (was 2 s) averages out the scheduler/recv jitter that made a short probe swing +/// wildly (50 vs 900 Mbps on the same link) — long enough for the host's steady-state send and +/// the client's recv drain to settle. File-scope so the detached probe task reads them without +/// crossing into the view's main actor. private let probeTargetKbps: UInt32 = 3_000_000 -private let probeDurationMs: UInt32 = 2_000 +private let probeDurationMs: UInt32 = 5_000 struct SpeedTestSheet: View { @Environment(\.dismiss) private var dismiss diff --git a/crates/punktfunk-core/src/transport/udp.rs b/crates/punktfunk-core/src/transport/udp.rs index 7ec5936..c7801d1 100644 --- a/crates/punktfunk-core/src/transport/udp.rs +++ b/crates/punktfunk-core/src/transport/udp.rs @@ -108,10 +108,14 @@ fn send_one_gso(fd: libc::c_int, buf: &[u8], gso_size: u16) -> std::io::Result<( Ok(()) } -/// Apple (macOS/iOS) batched-receive enable state. Darwin has no `recvmmsg(2)`, so our macOS client -/// does one `recv` per packet (non-allocating, but a syscall each); `recvmsg_x(2)` is the batched -/// equivalent. Opt-in via `PUNKTFUNK_RECVMSG_X` (it's FFI we can't exercise off-Apple — the scalar -/// recv-loop is the tested default), with auto-fallback if the syscall ever errors unexpectedly. +/// Apple (macOS/iOS) batched-receive enable state. Darwin has no `recvmmsg(2)`, so without this our +/// macOS client does one `recv` syscall per packet — at a few hundred Mbps that's ~40-90k syscalls/s +/// on one core, and when the recv loop can't drain fast enough the kernel socket buffer backs up and +/// drops, which the client sees as a sustained stream stalling/freezing around 300-400 Mbps. +/// `recvmsg_x(2)` is the batched equivalent (the recv counterpart of Linux `recvmmsg`), cutting the +/// syscall rate ~30x. **Default ON** (the multi-Gbps Mac path); the `swift test` loopback on the +/// Apple CI runner exercises it, and it auto-falls-back to the scalar loop if the syscall ever errors +/// unexpectedly. Set `PUNKTFUNK_RECVMSG_X=0` to force the scalar fallback. #[cfg(target_vendor = "apple")] mod recvx { use std::sync::atomic::{AtomicU8, Ordering}; @@ -122,7 +126,10 @@ mod recvx { 1 => true, 2 => false, _ => { - let on = std::env::var_os("PUNKTFUNK_RECVMSG_X").is_some(); + // On unless explicitly disabled with PUNKTFUNK_RECVMSG_X=0. + let on = std::env::var("PUNKTFUNK_RECVMSG_X") + .map(|v| v != "0") + .unwrap_or(true); STATE.store(if on { 1 } else { 2 }, Ordering::Relaxed); on } diff --git a/crates/punktfunk-host/src/capture.rs b/crates/punktfunk-host/src/capture.rs index ed07a58..d5e58fc 100644 --- a/crates/punktfunk-host/src/capture.rs +++ b/crates/punktfunk-host/src/capture.rs @@ -165,6 +165,12 @@ pub struct FastSyntheticCapturer { height: u32, frame_idx: u64, buf: Vec, + /// PUNKTFUNK_SYNTH_NOISE: every frame is fresh high-entropy noise NVENC can't compress or + /// predict, so the encoder hits its (CBR) bitrate target — a throughput test of the real + /// encode→FEC→send→recv path. The default flat/band content compresses to ~nothing, so it + /// can't generate real Mbps (the encoder is content-driven). xorshift over u64 chunks. + noise: bool, + rng: u64, } impl FastSyntheticCapturer { @@ -175,20 +181,38 @@ impl FastSyntheticCapturer { height, frame_idx: 0, buf: vec![0u8; width as usize * height as usize * 4], + noise: std::env::var_os("PUNKTFUNK_SYNTH_NOISE").is_some(), + rng: 0x9e3779b97f4a7c15, } } } impl Capturer for FastSyntheticCapturer { fn next_frame(&mut self) -> Result { - let (w, h) = (self.width as usize, self.height as usize); - let row = w * 4; - let shade = (self.frame_idx % 256) as u8; - self.buf.fill(shade); - let band_h = (h / 20).max(1); - let band_y = (self.frame_idx as usize * 6) % h; - for y in band_y..(band_y + band_h).min(h) { - self.buf[y * row..(y + 1) * row].fill(0xff); + if self.noise { + // Fresh, every-frame-decorrelated noise: reseed from the frame index so consecutive + // frames share no structure (forces large P-frames too, not just the keyframe). + let mut s = self + .rng + .wrapping_add(self.frame_idx.wrapping_mul(0x2545F491_4F6CDD1D)) + | 1; + for c in self.buf.chunks_exact_mut(8) { + s ^= s << 13; + s ^= s >> 7; + s ^= s << 17; + c.copy_from_slice(&s.to_le_bytes()); + } + self.rng = s; + } else { + let (w, h) = (self.width as usize, self.height as usize); + let row = w * 4; + let shade = (self.frame_idx % 256) as u8; + self.buf.fill(shade); + let band_h = (h / 20).max(1); + let band_y = (self.frame_idx as usize * 6) % h; + for y in band_y..(band_y + band_h).min(h) { + self.buf[y * row..(y + 1) * row].fill(0xff); + } } self.frame_idx += 1; Ok(CapturedFrame { diff --git a/packaging/kde/host.env b/packaging/kde/host.env index bc36c87..5997740 100644 --- a/packaging/kde/host.env +++ b/packaging/kde/host.env @@ -10,6 +10,12 @@ PUNKTFUNK_COMPOSITOR=kwin PUNKTFUNK_VIDEO_SOURCE=virtual PUNKTFUNK_ZEROCOPY=1 PUNKTFUNK_INPUT_BACKEND=libei +# UDP Generic Segmentation Offload on the send path: coalesce a frame's equal-size packets into +# kernel super-buffers (one sendmsg per ~64 packets instead of one per packet) — the dominant +# lever above ~1 Gbps, where per-packet send syscalls/pps become the host bottleneck. Safe: it +# auto-falls back to sendmmsg on any kernel/path that rejects UDP_SEGMENT. Set PUNKTFUNK_GSO=0 to +# force it off if a NIC/middlebox mishandles GSO segments. +PUNKTFUNK_GSO=1 # Make the per-session streamed output the SOLE desktop, so plasmashell + windows render on it # rather than on the headless session's `kwin --virtual` bootstrap output (without this the client # sees only the wallpaper of an empty extended output). KWin re-homes the desktop; the bootstrap is