From 0f7f1be3c3d185d587ebf25e4066b38e88a100ce Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Sat, 20 Jun 2026 14:49:59 +0000 Subject: [PATCH] fix(core/transport): treat ENOBUFS as a transient drop, not a fatal error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WiFi drivers (e.g. ath11k on the Steam Deck) return ENOBUFS — not EAGAIN/EWOULDBLOCK — when the tx queue is momentarily full. Rust maps ENOBUFS to ErrorKind::Uncategorized, so `is_transient_io` (which only matched WouldBlock/ConnectionRefused/ConnectionReset) treated it as a real error and tore the whole stream down on a single transient burst. This presented as a vicious Heisenbug on the Deck: the native host streamed flawlessly on loopback and under a debugger (anything slow enough not to fill the small ~416 KB wlan0 buffer), but died at full rate cross-machine over WiFi — flaky hang-or-SIGKILL because tx-queue-full is probabilistic. Diagnosed live via a forced core dump (gdb on the hung core): the data-plane thread had bailed on a fatal send error. Treat ENOBUFS (and asynchronous network-path blips ENETUNREACH / EHOSTUNREACH / ENETDOWN / EHOSTDOWN) as a lossy drop like WouldBlock — FEC + the next frame recover. Validated: 6/6 back-to-back cross-machine streams over the Deck's WiFi, host stable, p50 ~4.4 ms (one run dropped 4/300 frames *gracefully*, 0 mismatched — the fix working as intended). Also surface a data-plane bind/hole-punch failure directly in punktfunk1 (it was previously only reported after teardown, which a stall could swallow entirely). 
Co-Authored-By: Claude Opus 4.8 --- crates/punktfunk-core/src/transport/udp.rs | 28 +++++++++++++++++++++- crates/punktfunk-host/src/punktfunk1.rs | 13 +++++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/crates/punktfunk-core/src/transport/udp.rs b/crates/punktfunk-core/src/transport/udp.rs index 81a9672..29152e4 100644 --- a/crates/punktfunk-core/src/transport/udp.rs +++ b/crates/punktfunk-core/src/transport/udp.rs @@ -26,9 +26,35 @@ const RECV_BUF: usize = MAX_DATAGRAM_BYTES + 1; /// so erroring out here kills a stream that the very next packet would resume. If the peer is /// genuinely gone, the QUIC control plane times out and ends the session cleanly instead. (This is /// the classic connected-UDP "ICMP errors are advisory" rule, doubly true with hole-punching.) +/// - `ENOBUFS`: a WiFi/wlan driver (e.g. `ath11k` on the Steam Deck) returns this — NOT `EAGAIN`/ +/// `WouldBlock` — when its tx queue is momentarily full. Rust maps `ENOBUFS` to +/// `ErrorKind::Uncategorized`, so the `WouldBlock` arm misses it; without this a transient +/// tx-queue burst tears the whole stream down (observed live: the host streamed flawlessly on +/// loopback / under a debugger — anything slow enough not to fill the small wlan0 buffer — but +/// died at full rate over WiFi). Same lossy-drop contract as `WouldBlock`; FEC + the next frame +/// recover. Asynchronous network-path blips (`ENETUNREACH`/`EHOSTUNREACH`/`ENETDOWN`/`EHOSTDOWN`) +/// are droppable for the same reason a stale ICMP is. fn is_transient_io(e: &std::io::Error) -> bool { use std::io::ErrorKind::{ConnectionRefused, ConnectionReset, WouldBlock}; - matches!(e.kind(), WouldBlock | ConnectionRefused | ConnectionReset) + if matches!(e.kind(), WouldBlock | ConnectionRefused | ConnectionReset) { + return true; + } + // `ENOBUFS` & friends have no stable `ErrorKind`, so match the raw errno (unix only). 
+ #[cfg(unix)] + { + matches!( + e.raw_os_error(), + Some(libc::ENOBUFS) + | Some(libc::ENETUNREACH) + | Some(libc::EHOSTUNREACH) + | Some(libc::ENETDOWN) + | Some(libc::EHOSTDOWN) + ) + } + #[cfg(not(unix))] + { + false + } } /// Build one `mmsghdr` per `iovec` (each a single-buffer message) for `sendmmsg`/`recvmmsg`. Shared diff --git a/crates/punktfunk-host/src/punktfunk1.rs b/crates/punktfunk-host/src/punktfunk1.rs index 0086609..bd4fbb5 100644 --- a/crates/punktfunk-host/src/punktfunk1.rs +++ b/crates/punktfunk-host/src/punktfunk1.rs @@ -837,12 +837,19 @@ async fn serve_session( // can be on different subnets; control + side planes ride the client-initiated QUIC, but // the raw video UDP needs the client to open the path first). Falls back to the // client-reported address for clients that don't punch (flat-LAN, unchanged). - let (transport, punched) = UdpTransport::connect_via_punch( + let (transport, punched) = match UdpTransport::connect_via_punch( &format!("0.0.0.0:{udp_port}"), &client_udp.to_string(), std::time::Duration::from_millis(2500), - ) - .context("bind data plane")?; + ) { + Ok(v) => v, + Err(e) => { + // Surface the failure here directly: a data-plane bind error would otherwise be + // reported only after teardown (and a teardown stall could swallow it entirely). + tracing::error!(error = %e, %client_udp, udp_port, "data-plane socket bind/hole-punch failed"); + return Err(anyhow::Error::new(e)).context("bind data plane"); + } + }; tracing::info!( %client_udp, punched,