diff --git a/crates/lumen-host/src/encode/linux.rs b/crates/lumen-host/src/encode/linux.rs index 7bc3633..0577d20 100644 --- a/crates/lumen-host/src/encode/linux.rs +++ b/crates/lumen-host/src/encode/linux.rs @@ -158,8 +158,15 @@ impl NvencEncoder { video.set_frame_rate(Some(Rational(fps as i32, 1))); video.set_bit_rate(bitrate_bps as usize); video.set_max_bit_rate(bitrate_bps as usize); - video.set_gop(fps.saturating_mul(2).max(1)); // ~2s keyframe interval video.set_max_b_frames(0); + // Infinite GOP — NO periodic IDR. A keyframe at 5120x1440 is ~20-40x a P-frame, so a + // periodic IDR is a recurring multi-millisecond encode+packetize+send spike — the ~2s + // "freeze". NVENC emits one IDR at stream start, then P-frames only; `forced-idr` (below) + // turns a client recovery request (RFI, via `request_keyframe`) into an IDR on demand. + // This is the Moonlight/Sunshine low-latency model. + unsafe { + (*video.as_mut_ptr()).gop_size = -1; + } // For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA // hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context @@ -185,6 +192,7 @@ impl NvencEncoder { opts.set("rc", "cbr"); opts.set("bf", "0"); opts.set("delay", "0"); + opts.set("forced-idr", "1"); // RFI/request_keyframe → real IDR under the infinite GOP let enc = video .open_with(opts) diff --git a/crates/lumen-host/src/gamestream/stream.rs b/crates/lumen-host/src/gamestream/stream.rs index c654700..bcc0253 100644 --- a/crates/lumen-host/src/gamestream/stream.rs +++ b/crates/lumen-host/src/gamestream/stream.rs @@ -160,11 +160,12 @@ fn stream_body( .unwrap_or(20); let mut pk = VideoPacketizer::new(cfg.packet_size, fec_pct, cfg.min_fec); - // Pace at a steady rate (capped at 60fps), re-encoding the last captured frame when the - // compositor produced no new one. wlroots only emits frames on damage, so a static or + // Pace at the client's negotiated frame rate, re-encoding the last captured frame when the + // compositor produced no new one. Compositors only emit frames on damage, so a static or // slow-updating desktop would otherwise starve the client into a "network too slow" abort. - // Re-encoding an unchanged frame is cheap — NVENC emits a near-empty P-frame. - let target_fps = cfg.fps.clamp(1, 60); + // Re-encoding an unchanged frame is cheap — NVENC emits a near-empty P-frame. The upper + // bound just guards against an absurd client request (the encoder is opened at `cfg.fps`). + let target_fps = cfg.fps.clamp(1, 240); let frame_interval = Duration::from_secs_f64(1.0 / target_fps as f64); let mut sent_pkts: u64 = 0; let mut fps_count: u32 = 0; @@ -178,18 +179,29 @@ fn stream_body( let mut rng = rand::thread_rng(); let mut dropped: u64 = 0; + // Per-stage timing (LUMEN_PERF=1): max µs/stage per second + unique vs re-encoded frames, + // to pinpoint stalls. `unique` counts genuinely-new captured frames (vs re-encoded holds). + let perf = std::env::var_os("LUMEN_PERF").is_some(); + let (mut mx_cap, mut mx_enc, mut mx_pkt, mut mx_send, mut mx_pkts, mut uniq) = + (0u128, 0u128, 0u128, 0u128, 0usize, 0u32); + // Absolute next-frame deadline — the single pacing clock for the loop. + let mut next_frame = Instant::now(); + while running.load(Ordering::SeqCst) { let tick = Instant::now(); // Advance to the freshest captured frame if one arrived; otherwise reuse the last. if let Some(f) = capturer.try_latest().context("capture frame")? { frame = f; + uniq += 1; } + let t_cap = tick.elapsed(); // Honor a client recovery request (RFI / request-IDR): force a keyframe so the client // resyncs immediately instead of waiting for the next GOP boundary. if force_idr.swap(false, Ordering::SeqCst) { enc.request_keyframe(); } enc.submit(&frame).context("encoder submit")?; + let t_enc = tick.elapsed(); // 90 kHz RTP timestamp from wall-clock, so a variable capture rate stays correct. let ts = (stream_start.elapsed().as_secs_f64() * 90_000.0) as u32; @@ -202,47 +214,72 @@ fn stream_body( }; batch.extend(pk.packetize(&au.data, ft, ts)); } + let t_pkt = tick.elapsed(); - // Pace the frame's packets evenly across the frame interval rather than blasting them - // at line rate (a real link drops microbursts). The per-packet schedule also paces the - // frame itself; sub-500µs sleeps are skipped (unreliable), batching the spread. + // Send the frame's packets at line rate. Per-packet pacing (microburst shaping) must NOT + // run on this thread — it serializes against capture/encode and was the prime cause of + // the fps oscillation. Pacing belongs on a dedicated send thread (TODO: split encode|send); + // cadence is driven below by a single absolute deadline. let mut client_gone = false; let n = batch.len(); - if n > 0 { - let per_ns = frame_interval.as_nanos() as u64 / n as u64; - for (i, pkt) in batch.iter().enumerate() { - if drop_pct > 0 && rng.gen_range(0..100) < drop_pct { - dropped += 1; // simulated loss: built the packet, skip the send - } else if sock.send(pkt).is_err() { - client_gone = true; - break; - } else { - sent_pkts += 1; - } - let target = tick + Duration::from_nanos(per_ns * (i as u64 + 1)); - if let Some(ahead) = target.checked_duration_since(Instant::now()) { - if ahead >= Duration::from_micros(500) { - std::thread::sleep(ahead); - } - } + for pkt in &batch { + if drop_pct > 0 && rng.gen_range(0..100) < drop_pct { + dropped += 1; // simulated loss: built the packet, skip the send + } else if sock.send(pkt).is_err() { + client_gone = true; + break; + } else { + sent_pkts += 1; } } if client_gone { tracing::info!(sent_pkts, "video: client unreachable — stopping stream"); break; } + if perf { + let t_send = tick.elapsed(); + mx_cap = mx_cap.max(t_cap.as_micros()); + mx_enc = mx_enc.max((t_enc - t_cap).as_micros()); + mx_pkt = mx_pkt.max((t_pkt - t_enc).as_micros()); + mx_send = mx_send.max((t_send - t_pkt).as_micros()); + mx_pkts = mx_pkts.max(n); + } fps_count += 1; if fps_t.elapsed() >= Duration::from_secs(1) { - tracing::info!(fps = fps_count, sent_pkts, dropped, "video: streaming"); + if perf { + // Max µs/stage this second: cap=drain channel, enc=submit (zero-copy device + // copy + NVENC), pkt=poll+FEC+packetize, send=paced packet send. `uniq`=new + // captured frames (vs re-encoded). `pkts`=max packets in one frame (IDR spike). + tracing::info!( + fps = fps_count, + uniq, + enc_us = mx_enc, + pkt_us = mx_pkt, + send_us = mx_send, + cap_us = mx_cap, + max_pkts = mx_pkts, + "video: streaming (perf)" + ); + mx_cap = 0; + mx_enc = 0; + mx_pkt = 0; + mx_send = 0; + mx_pkts = 0; + uniq = 0; + } else { + tracing::info!(fps = fps_count, sent_pkts, dropped, "video: streaming"); + } fps_count = 0; fps_t = Instant::now(); } - // Backstop the frame rate when few/no packets were produced (the packet pacing above - // otherwise consumes ~one frame interval on its own). - let elapsed = tick.elapsed(); - if elapsed < frame_interval { - std::thread::sleep(frame_interval - elapsed); + // Single pacing authority: hold a steady cadence at the target rate from an absolute + // clock. No double-sleep. If a slow frame put us behind, resync to now rather than + // bursting to catch up. + next_frame += frame_interval; + match next_frame.checked_duration_since(Instant::now()) { + Some(d) => std::thread::sleep(d), + None => next_frame = Instant::now(), } } Ok(())