feat(punktfunk/1): request-IDR recovery for a wedged client decode
apple / swift (push) Successful in 1m17s
ci / rust (push) Failing after 31s
ci / web (push) Failing after 42s
ci / docs-site (push) Failing after 40s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 4s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Failing after 10s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 6s
docker / deploy-docs (push) Has been skipped
rpm / build-publish (push) Failing after 15s
deb / build-publish (push) Failing after 43s
apple / swift (push) Successful in 1m17s
ci / rust (push) Failing after 31s
ci / web (push) Failing after 42s
ci / docs-site (push) Failing after 40s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 4s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Failing after 10s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 5s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 6s
docker / deploy-docs (push) Has been skipped
rpm / build-publish (push) Failing after 15s
deb / build-publish (push) Failing after 43s
Fixes the intermittent first-connect freeze. The host streams infinite GOP — one opening IDR, then P-frames only (recovery keyframes just on loss) — so when the client's decoder wedges on the cold first session (a lost/corrupt opening IDR, a bad early P-frame) the picture stays frozen until the far-off next keyframe. The client had no way to ask for one; now it does.

Add a RequestKeyframe control message (client -> host, reliable control stream), mirroring Reconfigure:

- core: quic.rs RequestKeyframe (type 0x03) + roundtrip test; client.rs CtrlRequest::Keyframe + NativeClient::request_keyframe; abi.rs punktfunk_connection_request_keyframe (header regenerated).
- host: m3.rs decodes it in the control loop and signals the encode loop, which coalesces a burst and calls enc.request_keyframe() — wiring the existing NvencEncoder hook (force_kf -> next frame pict_type=I), the same recovery the GameStream path already had via force_idr.
- apple: PunktfunkConnection.requestKeyframe(); StreamPump (stage-1) requests on layer.status==.failed; Stage2Pipeline (stage-2) on a sync submit failure and on the async decode-error callback via a thread-safe KeyframeRecovery. All throttled to <=1/250ms (the decode stays wedged for several frames until the IDR lands, so per-frame requests would flood the control stream).

Self-healing: a lost recovery IDR is re-requested after the throttle; the host coalesces bursts into a single IDR.

Validated: cargo fmt + clippy clean; core + host test suites green (incl. new request_keyframe_roundtrip); swift build + test (39 passed); xcframework rebuilt (all 5 slices), header regenerated with no unrelated drift.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -332,6 +332,21 @@ public final class PunktfunkConnection {
|
||||
_ = punktfunk_connection_request_mode(h, width, height, refreshHz)
|
||||
}
|
||||
|
||||
/// Requests an immediate IDR keyframe from the host's encoder so a wedged local decoder
/// can recover.
///
/// The host's infinite-GOP stream opens with a single IDR and then carries P-frames only,
/// so a stalled decode (a lost/corrupt opening IDR, a bad early P-frame — most likely on
/// the cold first connect) would otherwise stay frozen until the next loss-triggered
/// recovery keyframe, which may be far off.
///
/// Fire-and-forget: the recovered keyframe is the only acknowledgement. Callers must
/// THROTTLE — the decode stays wedged for several frames until the IDR lands, so issuing
/// a request per frame would flood the control stream. Does nothing when there is no
/// handle or a close has already been requested.
public func requestKeyframe() {
    abiLock.lock()
    defer { abiLock.unlock() }

    // No handle means there is nothing to send the request on.
    guard let live = handle else { return }
    // A pending close drops the request silently.
    if closeRequested { return }

    // The ABI call's result is deliberately discarded (fire-and-forget).
    _ = punktfunk_connection_request_keyframe(live)
}
|
||||
|
||||
/// The currently active session mode (updated by accepted `requestMode` switches).
|
||||
public func currentMode() -> (width: UInt32, height: UInt32, refreshHz: UInt32) {
|
||||
abiLock.lock()
|
||||
|
||||
@@ -44,11 +44,36 @@ private final class PumpToken: @unchecked Sendable {
|
||||
/// Marks the token as no longer live, under the token's lock.
func cancel() {
    lock.lock()
    defer { lock.unlock() }
    live = false
}
|
||||
}
|
||||
|
||||
/// Throttled host keyframe requests for decode recovery.
///
/// The decoder's async error callback (a VT thread) and the pump thread (a submit failure)
/// both signal a wedge; this coalesces them so the control stream isn't flooded while the
/// decode stays stalled for several frames until the requested IDR lands. Bound to the live
/// connection in `start`, unbound in `stop`.
///
/// All mutable state is guarded by `lock`, which is what backs the `@unchecked Sendable`
/// conformance.
private final class KeyframeRecovery: @unchecked Sendable {
    private let lock = NSLock()
    /// Minimum spacing between two forwarded requests, in nanoseconds.
    private let minIntervalNs: UInt64
    private var connection: PunktfunkConnection?
    /// Uptime (ns) of the last request that passed the throttle; `nil` until the first one
    /// after a `bind`.
    private var lastRequestNs: UInt64?

    /// - Parameter minIntervalNs: Minimum spacing between forwarded requests. Defaults to
    ///   250 ms.
    init(minIntervalNs: UInt64 = 250_000_000) {
        self.minIntervalNs = minIntervalNs
    }

    /// Attaches the connection that requests are forwarded to (or detaches with `nil`) and
    /// clears the throttle so the next `request()` is forwarded immediately.
    func bind(_ c: PunktfunkConnection?) {
        lock.lock()
        defer { lock.unlock() }
        connection = c
        lastRequestNs = nil
    }

    /// Forwards a keyframe request to the bound connection unless one already passed the
    /// throttle within the last `minIntervalNs`. Does nothing while unbound.
    func request() {
        // The connection call happens after the lock is released, so `lock` is never held
        // across `requestKeyframe()`.
        claimConnectionIfDue()?.requestKeyframe()
    }

    /// Returns the bound connection when a request is due and records the request time;
    /// returns `nil` when throttled or unbound.
    private func claimConnectionIfDue() -> PunktfunkConnection? {
        lock.lock()
        defer { lock.unlock() }
        let now = DispatchTime.now().uptimeNanoseconds
        // Due only when strictly more than `minIntervalNs` has elapsed since the last
        // forwarded request (or when none has been made since the last bind).
        if let last = lastRequestNs, now &- last <= minIntervalNs {
            return nil
        }
        lastRequestNs = now
        return connection
    }
}
|
||||
|
||||
public final class Stage2Pipeline {
|
||||
private let ring = ReadyRing()
|
||||
private let presenter: MetalVideoPresenter
|
||||
private let decoder: VideoDecoder
|
||||
private let presentMeter: LatencyMeter
|
||||
private let recovery = KeyframeRecovery()
|
||||
private var token = PumpToken()
|
||||
private var offsetNs: Int64 = 0
|
||||
|
||||
@@ -63,9 +88,13 @@ public final class Stage2Pipeline {
|
||||
self.presenter = presenter
|
||||
self.presentMeter = presentMeter
|
||||
let ring = ring
|
||||
let recovery = recovery
|
||||
self.decoder = VideoDecoder(
|
||||
onDecoded: { ring.submit($0) },
|
||||
onDecodeError: { _ in /* the pump resets the session via reset() on the next IDR */ })
|
||||
// Async decode failure (a bad P-frame referencing a lost/corrupt IDR): the pump
|
||||
// resets to re-gate on the next IDR, and we ask the host to send one now (infinite
|
||||
// GOP — it wouldn't otherwise come soon). Throttled in KeyframeRecovery.
|
||||
onDecodeError: { _ in recovery.request() })
|
||||
}
|
||||
|
||||
/// Start pulling AUs into the decoder. `onFrame` fires per AU at receipt (capture→client
|
||||
@@ -77,9 +106,11 @@ public final class Stage2Pipeline {
|
||||
onSessionEnd: (@Sendable () -> Void)?
|
||||
) {
|
||||
offsetNs = connection.clockOffsetNs
|
||||
recovery.bind(connection) // arm host-keyframe recovery for this session
|
||||
token = PumpToken() // fresh token per start — cancel is permanent (like StreamPump)
|
||||
let token = token
|
||||
let decoder = decoder
|
||||
let recovery = recovery
|
||||
let thread = Thread {
|
||||
var format: CMVideoFormatDescription?
|
||||
while token.isLive {
|
||||
@@ -92,8 +123,10 @@ public final class Stage2Pipeline {
|
||||
guard let f = format, token.isLive else { continue }
|
||||
if !decoder.decode(au: au, format: f) {
|
||||
// Submit/decoder error: drop the session and re-gate on the next IDR's
|
||||
// in-band parameter sets (a delta frame can't recover) — stage-1's policy.
|
||||
// in-band parameter sets (a delta frame can't recover) — stage-1's policy
|
||||
// — and ask the host for that IDR now (infinite GOP; throttled).
|
||||
decoder.reset()
|
||||
recovery.request()
|
||||
}
|
||||
} catch {
|
||||
if token.isLive { onSessionEnd?() }
|
||||
@@ -125,6 +158,7 @@ public final class Stage2Pipeline {
|
||||
/// Stops the pipeline: cancels the pump token, detaches keyframe recovery, and resets the
/// decoder.
public func stop() {
    token.cancel()
    // Unbind before resetting the decoder so that nothing signalled during teardown (a
    // decode-error callback or a late pump-thread failure) can still send a keyframe
    // request for a session that is being stopped.
    // NOTE(review): whether `decoder.reset()` itself fires `onDecodeError` depends on
    // VideoDecoder, which is not visible here — unbinding first is safe either way.
    recovery.bind(nil)
    decoder.reset()
}
|
||||
|
||||
/// Cancels the current pump token on deallocation, so a pump loop gated on `token.isLive`
/// stops. Unlike `stop()`, this does not reset the decoder or unbind keyframe recovery.
deinit { token.cancel() }
|
||||
|
||||
@@ -41,6 +41,7 @@ final class StreamPump {
|
||||
|
||||
let thread = Thread {
|
||||
var format: CMVideoFormatDescription?
|
||||
var lastKeyframeRequest = Date.distantPast
|
||||
while token.isLive {
|
||||
do {
|
||||
guard let au = try connection.nextAU(timeoutMs: 100) else { continue }
|
||||
@@ -49,13 +50,19 @@ final class StreamPump {
|
||||
format = f // refreshed on every IDR (mode changes included)
|
||||
}
|
||||
if layer.status == .failed {
|
||||
// Decode wedged: flush and re-gate on the next in-band parameter
|
||||
// sets — resuming with a delta frame can't recover. (A
|
||||
// request-IDR channel on punktfunk/1 is a host-side TODO; with the
|
||||
// host's infinite GOP this may otherwise stay black until the
|
||||
// next recovery keyframe.)
|
||||
// Decode wedged: flush and re-gate on the next in-band parameter sets
|
||||
// (resuming with a delta frame can't recover), AND ask the host for a
|
||||
// fresh IDR. With the host's infinite GOP the next keyframe could be
|
||||
// far off, so without the request the picture stays frozen — the
|
||||
// intermittent first-connect freeze. Throttled: the layer stays .failed
|
||||
// across several polls until the IDR lands, and one request suffices.
|
||||
layer.flush()
|
||||
format = AnnexB.formatDescription(fromIDR: au.data)
|
||||
let now = Date()
|
||||
if now.timeIntervalSince(lastKeyframeRequest) > 0.25 {
|
||||
connection.requestKeyframe()
|
||||
lastKeyframeRequest = now
|
||||
}
|
||||
}
|
||||
guard let f = format,
|
||||
let sample = AnnexB.sampleBuffer(au: au, format: f),
|
||||
|
||||
Reference in New Issue
Block a user