Bigger-bet #1 from the latency plan. virtual_stream ran capture+encode+seal+ paced-send on ONE thread, so frame N+1's capture/encode couldn't start until frame N's entire paced tail had left the wire — the pacing budget (~0.9×interval) was serialized in front of the next encode. Port GameStream's spawn_sender model to the native path: - A dedicated send thread (`send_loop`) owns the WHOLE Session (so no socket clone or shared/Arc stats needed — `seal_frame` mutates the nonce, `send_sealed` + the probe bursts all live there) and does FEC+seal + microburst-paced send. - The encode thread captures+encodes + handles reconfig and hands each AU over a bounded sync_channel(3) as a FrameMsg (data, capture_ns, flags, deadline, encode_us). It BLOCKS on backpressure if the send falls behind — frames slow down rather than a dropped frame freezing the infinite-GOP stream (we don't drop). Clean shutdown: drop the channel → send thread drains/exits → join. - Probes (run_probe_burst) move to the send thread since they need the Session; a burst naturally pauses video (the encode thread blocks on the full channel). - Per-frame encode_us/pace_us histogram moved to the send thread (carries encode_us in the FrameMsg) and now reflects the overlap. Removes the encode↔paced-tail serialization (~2-8 ms @60-120 fps), independent of the pacing policy, no quality cost. Substrate for the future NVENC slice wrapper. Verified live on this box (appliance restarted onto it): a client streamed the KWin desktop (1.49 MB H.265, clean, no panic) and a 200 Mbps speed-test probe completed through the send thread (0 drops). Build + clippy + fmt green. Real-NIC sustained soak (reconfig under load, line-rate, mode switches) pending the Ubuntu third host. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+158
-67
@@ -671,15 +671,15 @@ async fn serve_session(
|
|||||||
let compositor = compositor
|
let compositor = compositor
|
||||||
.expect("the Virtual source resolves a compositor during the handshake");
|
.expect("the Virtual source resolves a compositor during the handshake");
|
||||||
virtual_stream(
|
virtual_stream(
|
||||||
&mut session,
|
session,
|
||||||
mode,
|
mode,
|
||||||
seconds,
|
seconds,
|
||||||
&stop_stream,
|
stop_stream,
|
||||||
&reconfig_rx,
|
&reconfig_rx,
|
||||||
compositor,
|
compositor,
|
||||||
bitrate_kbps,
|
bitrate_kbps,
|
||||||
&probe_rx,
|
probe_rx,
|
||||||
&probe_result_tx,
|
probe_result_tx,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1469,6 +1469,104 @@ fn percentile(sorted_or_not: &mut [u32], q: f64) -> u32 {
|
|||||||
sorted_or_not[i]
|
sorted_or_not[i]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// One encoded frame handed from the capture/encode thread to the send thread (the encode|send
|
||||||
|
/// split). The send thread does FEC+seal+paced-send while this thread captures+encodes the next.
|
||||||
|
struct FrameMsg {
|
||||||
|
data: Vec<u8>,
|
||||||
|
capture_ns: u64,
|
||||||
|
flags: u32,
|
||||||
|
/// When this frame's packets should have fully left (the next frame's due time) = the pacing
|
||||||
|
/// budget. In the past when the send thread is behind → immediate send (catch up).
|
||||||
|
deadline: std::time::Instant,
|
||||||
|
/// capture→encoded latency (µs), measured on the encode thread, carried for the perf histogram.
|
||||||
|
encode_us: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The dedicated send thread: it owns the whole [`Session`] (so no socket clone or shared stats are
|
||||||
|
/// needed) and does FEC+seal + microburst-paced send OFF the capture/encode thread, plus the
|
||||||
|
/// speed-test probe bursts (which also need the Session). Decoupling the paced send from encoding
|
||||||
|
/// lets the encode of frame N+1 overlap the transmit of frame N instead of waiting behind its tail.
|
||||||
|
/// Runs until the encode thread drops the frame channel (end of stream) or `stop` is set.
|
||||||
|
fn send_loop(
|
||||||
|
mut session: Session,
|
||||||
|
frame_rx: std::sync::mpsc::Receiver<FrameMsg>,
|
||||||
|
probe_rx: std::sync::mpsc::Receiver<ProbeRequest>,
|
||||||
|
probe_result_tx: tokio::sync::mpsc::UnboundedSender<ProbeResult>,
|
||||||
|
stop: Arc<AtomicBool>,
|
||||||
|
perf: bool,
|
||||||
|
burst_cap: usize,
|
||||||
|
) {
|
||||||
|
let mut last_perf = std::time::Instant::now();
|
||||||
|
let mut last_bytes = 0u64;
|
||||||
|
let mut last_send_dropped = 0u64;
|
||||||
|
let mut encode_us: Vec<u32> = Vec::new();
|
||||||
|
let mut pace_us: Vec<u32> = Vec::new();
|
||||||
|
let (mut paced_frames, mut immediate_frames) = (0u64, 0u64);
|
||||||
|
loop {
|
||||||
|
if stop.load(Ordering::SeqCst) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// Probes run here (they need the Session); a burst pauses video — the encode thread blocks
|
||||||
|
// on the full frame channel meanwhile, which is exactly the intended pause.
|
||||||
|
service_probes(&mut session, &stop, &probe_rx, &probe_result_tx);
|
||||||
|
// Short timeout so we keep re-checking `stop` + probes when no frames are flowing.
|
||||||
|
match frame_rx.recv_timeout(std::time::Duration::from_millis(50)) {
|
||||||
|
Ok(msg) => match paced_submit(
|
||||||
|
&mut session,
|
||||||
|
&msg.data,
|
||||||
|
msg.capture_ns,
|
||||||
|
msg.flags,
|
||||||
|
msg.deadline,
|
||||||
|
burst_cap,
|
||||||
|
) {
|
||||||
|
Ok(stat) => {
|
||||||
|
if perf {
|
||||||
|
encode_us.push(msg.encode_us);
|
||||||
|
pace_us.push(stat.spread_us);
|
||||||
|
if stat.paced {
|
||||||
|
paced_frames += 1;
|
||||||
|
} else {
|
||||||
|
immediate_frames += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!(error = %format!("{e:#}"), "send failed — stopping stream");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {}
|
||||||
|
Err(std::sync::mpsc::RecvTimeoutError::Disconnected) => break, // encode thread done
|
||||||
|
}
|
||||||
|
if perf && last_perf.elapsed() >= std::time::Duration::from_secs(2) {
|
||||||
|
let s = session.stats();
|
||||||
|
let secs = last_perf.elapsed().as_secs_f64();
|
||||||
|
// Attempted (sealed) transmit rate; `send_dropped` is what didn't reach the wire.
|
||||||
|
let tx_mbps = (s.bytes_sent - last_bytes) as f64 * 8.0 / secs / 1_000_000.0;
|
||||||
|
tracing::info!(
|
||||||
|
tx_mbps = format!("{tx_mbps:.0}"),
|
||||||
|
send_dropped = s.packets_send_dropped - last_send_dropped,
|
||||||
|
send_dropped_total = s.packets_send_dropped,
|
||||||
|
encode_us_p50 = percentile(&mut encode_us, 0.50),
|
||||||
|
encode_us_p99 = percentile(&mut encode_us, 0.99),
|
||||||
|
pace_us_p50 = percentile(&mut pace_us, 0.50),
|
||||||
|
pace_us_p99 = percentile(&mut pace_us, 0.99),
|
||||||
|
pace_us_max = pace_us.last().copied().unwrap_or(0),
|
||||||
|
immediate_frames,
|
||||||
|
paced_frames,
|
||||||
|
"perf"
|
||||||
|
);
|
||||||
|
last_perf = std::time::Instant::now();
|
||||||
|
last_bytes = s.bytes_sent;
|
||||||
|
last_send_dropped = s.packets_send_dropped;
|
||||||
|
encode_us.clear();
|
||||||
|
pace_us.clear();
|
||||||
|
paced_frames = 0;
|
||||||
|
immediate_frames = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Real capture→encode→punktfunk/1: a native virtual output at the client's mode, NVENC AUs
|
/// Real capture→encode→punktfunk/1: a native virtual output at the client's mode, NVENC AUs
|
||||||
/// stamped with the capture wall clock (the client derives per-frame pipeline latency).
|
/// stamped with the capture wall clock (the client derives per-frame pipeline latency).
|
||||||
///
|
///
|
||||||
@@ -1479,15 +1577,15 @@ fn percentile(sorted_or_not: &mut [u32], q: f64) -> u32 {
|
|||||||
/// carry speed-test bursts (see [`service_probes`]).
|
/// carry speed-test bursts (see [`service_probes`]).
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
fn virtual_stream(
|
fn virtual_stream(
|
||||||
session: &mut Session,
|
session: Session,
|
||||||
mode: punktfunk_core::Mode,
|
mode: punktfunk_core::Mode,
|
||||||
seconds: u32,
|
seconds: u32,
|
||||||
stop: &AtomicBool,
|
stop: Arc<AtomicBool>,
|
||||||
reconfig: &std::sync::mpsc::Receiver<punktfunk_core::Mode>,
|
reconfig: &std::sync::mpsc::Receiver<punktfunk_core::Mode>,
|
||||||
compositor: crate::vdisplay::Compositor,
|
compositor: crate::vdisplay::Compositor,
|
||||||
bitrate_kbps: u32,
|
bitrate_kbps: u32,
|
||||||
probe_rx: &std::sync::mpsc::Receiver<ProbeRequest>,
|
probe_rx: std::sync::mpsc::Receiver<ProbeRequest>,
|
||||||
probe_result_tx: &tokio::sync::mpsc::UnboundedSender<ProbeResult>,
|
probe_result_tx: tokio::sync::mpsc::UnboundedSender<ProbeResult>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
compositor = compositor.id(),
|
compositor = compositor.id(),
|
||||||
@@ -1499,32 +1597,43 @@ fn virtual_stream(
|
|||||||
let (mut capturer, mut enc, mut frame, mut interval) =
|
let (mut capturer, mut enc, mut frame, mut interval) =
|
||||||
build_pipeline_with_retry(&mut vd, mode, bitrate_kbps)?;
|
build_pipeline_with_retry(&mut vd, mode, bitrate_kbps)?;
|
||||||
|
|
||||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(seconds as u64);
|
|
||||||
let mut next = std::time::Instant::now();
|
|
||||||
let mut sent: u64 = 0;
|
|
||||||
// Throughput/drop + per-frame-latency instrumentation (PUNKTFUNK_PERF) — makes a high-bitrate
|
|
||||||
// soak observable: wire goodput, send-buffer drops, and the per-frame encode + pacing-tail
|
|
||||||
// distributions (so the pacing latency the microburst cap is meant to recover is visible).
|
|
||||||
let perf = std::env::var("PUNKTFUNK_PERF").is_ok();
|
let perf = std::env::var("PUNKTFUNK_PERF").is_ok();
|
||||||
let mut last_perf = std::time::Instant::now();
|
// Microburst cap (applied in send_loop/paced_submit): a frame ≤ this bursts out immediately;
|
||||||
let mut last_bytes = 0u64;
|
// only a bigger frame's overflow is spread. PUNKTFUNK_PACE_BURST_KB overrides the 128 KB default.
|
||||||
let mut last_send_dropped = 0u64;
|
|
||||||
let mut encode_us: Vec<u32> = Vec::new();
|
|
||||||
let mut pace_us: Vec<u32> = Vec::new();
|
|
||||||
let (mut paced_frames, mut immediate_frames) = (0u64, 0u64);
|
|
||||||
// Microburst cap: a frame whose sealed size is ≤ this goes out in one immediate burst (no
|
|
||||||
// pacing latency); only the overflow of a bigger frame (IDR / sustained high bitrate) is spread.
|
|
||||||
// 128 KB is a conservative default (well under the ~150 Mbps@60 frame size where bursts started
|
|
||||||
// dropping). Raise it via PUNKTFUNK_PACE_BURST_KB after confirming send_dropped stays 0 on your
|
|
||||||
// link, to recover more pacing tail on higher-bitrate streams.
|
|
||||||
let burst_cap = std::env::var("PUNKTFUNK_PACE_BURST_KB")
|
let burst_cap = std::env::var("PUNKTFUNK_PACE_BURST_KB")
|
||||||
.ok()
|
.ok()
|
||||||
.and_then(|s| s.parse::<usize>().ok())
|
.and_then(|s| s.parse::<usize>().ok())
|
||||||
.unwrap_or(128)
|
.unwrap_or(128)
|
||||||
* 1024;
|
* 1024;
|
||||||
|
|
||||||
|
// Encode|send split: this thread captures+encodes (the GPU work) + handles reconfig, and hands
|
||||||
|
// each AU to a dedicated send thread that owns the Session and does FEC+seal+paced-send — so the
|
||||||
|
// encode of frame N+1 overlaps the paced transmit of frame N instead of waiting behind its tail.
|
||||||
|
// The bounded channel applies backpressure (the encode thread blocks if the send falls behind,
|
||||||
|
// so frames slow down rather than a dropped frame freezing the infinite-GOP stream).
|
||||||
|
let (frame_tx, frame_rx) = std::sync::mpsc::sync_channel::<FrameMsg>(3);
|
||||||
|
let send_thread = std::thread::Builder::new()
|
||||||
|
.name("punktfunk-send".into())
|
||||||
|
.spawn({
|
||||||
|
let stop = stop.clone();
|
||||||
|
move || {
|
||||||
|
send_loop(
|
||||||
|
session,
|
||||||
|
frame_rx,
|
||||||
|
probe_rx,
|
||||||
|
probe_result_tx,
|
||||||
|
stop,
|
||||||
|
perf,
|
||||||
|
burst_cap,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.context("spawn send thread")?;
|
||||||
|
|
||||||
|
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(seconds as u64);
|
||||||
|
let mut next = std::time::Instant::now();
|
||||||
|
let mut sent: u64 = 0;
|
||||||
while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
|
while !stop.load(Ordering::SeqCst) && std::time::Instant::now() < deadline {
|
||||||
// Service speed-test probes between frames (each burst pauses video for its duration).
|
|
||||||
service_probes(session, stop, probe_rx, probe_result_tx);
|
|
||||||
// Drain to the NEWEST requested mode (a resize drag queues many) so we rebuild once,
|
// Drain to the NEWEST requested mode (a resize drag queues many) so we rebuild once,
|
||||||
// not once per stale intermediate mode.
|
// not once per stale intermediate mode.
|
||||||
let mut want = None;
|
let mut want = None;
|
||||||
@@ -1552,61 +1661,43 @@ fn virtual_stream(
|
|||||||
}
|
}
|
||||||
let capture_ns = now_ns();
|
let capture_ns = now_ns();
|
||||||
enc.submit(&frame).context("encoder submit")?;
|
enc.submit(&frame).context("encoder submit")?;
|
||||||
// The deadline for this frame's packets: pace the send up to here so a high-bitrate frame
|
// The deadline for this frame's packets (the next frame's due time); the send thread paces
|
||||||
// spreads over the interval instead of bursting all at once into the NIC (a real link drops
|
// up to here so a high-bitrate frame spreads over the interval instead of bursting.
|
||||||
// the burst, freezing the infinite-GOP stream until the next keyframe — the 1 Gbps+ fix).
|
|
||||||
next += interval;
|
next += interval;
|
||||||
|
let mut send_gone = false;
|
||||||
while let Some(au) = enc.poll().context("encoder poll")? {
|
while let Some(au) = enc.poll().context("encoder poll")? {
|
||||||
let flags = if au.keyframe {
|
let flags = if au.keyframe {
|
||||||
(FLAG_PIC | FLAG_SOF) as u32
|
(FLAG_PIC | FLAG_SOF) as u32
|
||||||
} else {
|
} else {
|
||||||
FLAG_PIC as u32
|
FLAG_PIC as u32
|
||||||
};
|
};
|
||||||
let t_encode_done = now_ns();
|
let encode_us = (now_ns().saturating_sub(capture_ns) / 1000) as u32;
|
||||||
let stat = paced_submit(session, &au.data, capture_ns, flags, next, burst_cap)?;
|
let msg = FrameMsg {
|
||||||
sent += 1;
|
data: au.data,
|
||||||
if perf {
|
capture_ns,
|
||||||
encode_us.push((t_encode_done.saturating_sub(capture_ns) / 1000) as u32);
|
flags,
|
||||||
pace_us.push(stat.spread_us);
|
deadline: next,
|
||||||
if stat.paced {
|
encode_us,
|
||||||
paced_frames += 1;
|
};
|
||||||
} else {
|
// Hand to the send thread; this blocks (backpressure) if it's behind. An Err means it
|
||||||
immediate_frames += 1;
|
// exited (send failure / stop) — end the encode loop too.
|
||||||
}
|
if frame_tx.send(msg).is_err() {
|
||||||
|
send_gone = true;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
sent += 1;
|
||||||
}
|
}
|
||||||
if perf && last_perf.elapsed() >= std::time::Duration::from_secs(2) {
|
if send_gone {
|
||||||
let s = session.stats();
|
break;
|
||||||
let secs = last_perf.elapsed().as_secs_f64();
|
|
||||||
// Attempted (sealed) transmit rate; `send_dropped` below is what didn't reach the wire.
|
|
||||||
let tx_mbps = (s.bytes_sent - last_bytes) as f64 * 8.0 / secs / 1_000_000.0;
|
|
||||||
tracing::info!(
|
|
||||||
tx_mbps = format!("{tx_mbps:.0}"),
|
|
||||||
frames = sent,
|
|
||||||
send_dropped = s.packets_send_dropped - last_send_dropped,
|
|
||||||
send_dropped_total = s.packets_send_dropped,
|
|
||||||
encode_us_p50 = percentile(&mut encode_us, 0.50),
|
|
||||||
encode_us_p99 = percentile(&mut encode_us, 0.99),
|
|
||||||
pace_us_p50 = percentile(&mut pace_us, 0.50),
|
|
||||||
pace_us_p99 = percentile(&mut pace_us, 0.99),
|
|
||||||
pace_us_max = pace_us.last().copied().unwrap_or(0),
|
|
||||||
immediate_frames,
|
|
||||||
paced_frames,
|
|
||||||
"perf"
|
|
||||||
);
|
|
||||||
last_perf = std::time::Instant::now();
|
|
||||||
last_bytes = s.bytes_sent;
|
|
||||||
last_send_dropped = s.packets_send_dropped;
|
|
||||||
encode_us.clear();
|
|
||||||
pace_us.clear();
|
|
||||||
paced_frames = 0;
|
|
||||||
immediate_frames = 0;
|
|
||||||
}
|
}
|
||||||
match next.checked_duration_since(std::time::Instant::now()) {
|
match next.checked_duration_since(std::time::Instant::now()) {
|
||||||
Some(d) => std::thread::sleep(d),
|
Some(d) => std::thread::sleep(d),
|
||||||
None => next = std::time::Instant::now(),
|
None => next = std::time::Instant::now(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Signal the send thread to drain + exit (drop the channel), then join it.
|
||||||
|
drop(frame_tx);
|
||||||
|
let _ = send_thread.join();
|
||||||
tracing::info!(sent, "punktfunk/1 virtual stream complete");
|
tracing::info!(sent, "punktfunk/1 virtual stream complete");
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user