From 0324719b6ec70359e0445887f6683189851f77ad Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Tue, 16 Jun 2026 10:21:33 +0000 Subject: [PATCH] feat(host/windows): USO batched send for the GameStream video plane MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GameStream video sender did one send() syscall per packet on Windows (the #[cfg(not(target_os="linux"))] sendmmsg_all fallback), capping throughput at high packet rates. Wire it to UDP Send Offload (the Windows analogue of Linux GSO) so each paced 16-packet burst goes out in one WSASendMsg(UDP_SEND_MSG_SIZE) syscall instead of 16, preserving the microburst pacing. Expose a reusable punktfunk_core::transport::send_uso_all (Windows-only) that reuses the proven native-plane USO primitive (send_one_uso + the uso on/off latch + uso_unsupported), with the same uniform-size guard and ≤512-segment chunking as UdpTransport::send_gso. It returns how many leading packets it sent via USO; the GameStream sendmmsg_all sends any remainder (USO off via PUNKTFUNK_GSO=0, a size-mixed burst, or a frame's short final packet) with per-packet send. On-wire packet boundaries are unchanged. Resolves #4 in docs/apollo-comparison.md. Linux build unaffected; punktfunk-core type-checks for x86_64-pc-windows-msvc. Host Windows compile deferred to CI / dev box. 
Co-Authored-By: Claude Opus 4.8 --- crates/punktfunk-core/src/transport/mod.rs | 4 ++ crates/punktfunk-core/src/transport/udp.rs | 47 +++++++++++++++++++ .../punktfunk-host/src/gamestream/stream.rs | 21 +++++++-- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/crates/punktfunk-core/src/transport/mod.rs b/crates/punktfunk-core/src/transport/mod.rs index 2694a48..8f19b0a 100644 --- a/crates/punktfunk-core/src/transport/mod.rs +++ b/crates/punktfunk-core/src/transport/mod.rs @@ -5,6 +5,10 @@ mod loopback; mod udp; pub use loopback::{loopback_pair, LoopbackTransport}; +/// Windows-only: reusable USO (UDP Send Offload) batch send for callers that own their own connected +/// socket (the GameStream video sender) rather than going through [`UdpTransport`]. +#[cfg(target_os = "windows")] +pub use udp::send_uso_all; pub use udp::{spawn_data_punch, UdpTransport, PUNCH_MAGIC}; /// A datagram transport. `recv` is non-blocking: it returns `Ok(None)` when no packet diff --git a/crates/punktfunk-core/src/transport/udp.rs b/crates/punktfunk-core/src/transport/udp.rs index c711ded..94df11d 100644 --- a/crates/punktfunk-core/src/transport/udp.rs +++ b/crates/punktfunk-core/src/transport/udp.rs @@ -245,6 +245,53 @@ fn send_one_uso(socket: &std::net::UdpSocket, buf: &[u8], seg_size: u16) -> std: Ok(()) } +/// Reusable Windows USO batch send for callers that own their OWN connected `UdpSocket` and are not +/// the [`UdpTransport`] data plane — specifically the GameStream video sender, whose paced bursts of +/// equal-size RTP/FEC packets are otherwise sent one `send` syscall at a time on Windows. Coalesces +/// a uniform burst (last packet may be shorter) into ≤512-segment `WSASendMsg(UDP_SEND_MSG_SIZE)` +/// super-buffers and returns the count of leading packets sent; the caller sends any remainder with +/// its own per-packet path. Returns `Ok(0)` (caller sends everything scalar) when USO is disabled +/// (`PUNKTFUNK_GSO=0`) or the packets aren't uniform-size. 
On a USO-unsupported error it latches USO +/// off process-wide and returns the count sent so far; a transient full-buffer also returns the +/// count-so-far. Same uniform-size rule and `seg`/512 chunking as the [`UdpTransport`] `send_gso` +/// Windows path, reusing its [`send_one_uso`] primitive. +#[cfg(target_os = "windows")] +pub fn send_uso_all(socket: &std::net::UdpSocket, packets: &[&[u8]]) -> std::io::Result<usize> { + if packets.is_empty() || !uso::active() { + return Ok(0); + } + // USO needs every segment but the last to be exactly `seg` bytes; bail to the scalar caller path + // otherwise (a frame's final/short packet or a size-mixed burst). + let seg = packets[0].len(); + let last = packets.len() - 1; + if seg == 0 || packets[..last].iter().any(|p| p.len() != seg) || packets[last].len() > seg { + return Ok(0); + } + let max_seg = 512usize; // Win11 x64 accepts up to ~512 segments per WSASendMsg + let mut scratch: Vec<u8> = Vec::with_capacity(seg * packets.len().min(max_seg)); + let mut sent = 0usize; + for chunk in packets.chunks(max_seg) { + scratch.clear(); + for p in chunk { + scratch.extend_from_slice(p); + } + match send_one_uso(socket, &scratch, seg as u16) { + Ok(()) => sent += chunk.len(), + // Send buffer momentarily full — stop here; the caller sends the rest (and the pacing + // loop / blocking socket absorbs it). Never block or tear down here. + Err(e) if is_transient_io(&e) => break, + // USO unsupported on this OS/NIC/path — latch off; the caller sends the rest scalar and + // every later burst skips USO via `uso::active()`. + Err(e) if uso_unsupported(&e) => { + uso::disable(); + break; + } + Err(e) => return Err(e), + } + } + Ok(sent) +} + /// Apple (macOS/iOS) batched-receive enable state. 
Darwin has no `recvmmsg(2)`, so without this our /// macOS client does one `recv` syscall per packet — at a few hundred Mbps that's ~40-90k syscalls/s /// on one core, and when the recv loop can't drain fast enough the kernel socket buffer backs up and diff --git a/crates/punktfunk-host/src/gamestream/stream.rs b/crates/punktfunk-host/src/gamestream/stream.rs index 1b55f9b..b2eab42 100644 --- a/crates/punktfunk-host/src/gamestream/stream.rs +++ b/crates/punktfunk-host/src/gamestream/stream.rs @@ -180,9 +180,24 @@ fn sendmmsg_all(sock: &UdpSocket, pkts: &[Vec<u8>]) -> std::io::Result<()> { Ok(()) } -/// Portable fallback (non-Linux dev builds — GameStream hosting never ships there): one -/// syscall per packet. -#[cfg(not(target_os = "linux"))] +/// Windows: coalesce each paced burst's equal-size packets into `WSASendMsg(UDP_SEND_MSG_SIZE)` +/// super-buffers (UDP Send Offload — the Windows analogue of Linux GSO), so a 16-packet burst is one +/// syscall instead of 16. Reuses the proven core USO primitive; it returns how many leading packets +/// it sent, and we send any remainder (USO off via `PUNKTFUNK_GSO=0`, a size-mixed burst, or a +/// frame's short final packet) with a per-packet `send`. The socket is connected. +#[cfg(target_os = "windows")] +fn sendmmsg_all(sock: &UdpSocket, pkts: &[Vec<u8>]) -> std::io::Result<()> { + let refs: Vec<&[u8]> = pkts.iter().map(|p| p.as_slice()).collect(); + let n = punktfunk_core::transport::send_uso_all(sock, &refs)?; + for p in &pkts[n..] { + sock.send(p)?; + } + Ok(()) +} + +/// Portable fallback (other non-Linux dev builds, e.g. macOS — GameStream hosting never ships there): +/// one syscall per packet. +#[cfg(not(any(target_os = "linux", target_os = "windows")))] fn sendmmsg_all(sock: &UdpSocket, pkts: &[Vec<u8>]) -> std::io::Result<()> { for p in pkts { sock.send(p)?;