Files
punktfunk/clients/apple/Tests/PunktfunkKitTests/VideoToolboxRoundTripTests.swift
T
enricobuehler 7b10714b62
ci / web (push) Failing after 38s
ci / rust (push) Successful in 53s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 3s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 4s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 16s
ci / docs-site (push) Failing after 39s
docker / deploy-docs (push) Successful in 16s
apple / swift (push) Successful in 1m17s
feat(apple): stage-2 presenter — explicit decode + Metal present + glass-to-glass
Opt-in (Settings -> Presenter; `punktfunk.presenter`, default stage-1). Stage-1's
AVSampleBufferDisplayLayer decodes AND presents internally with no per-frame
callback, so neither decode nor present can be stamped or hand-paced. Stage-2
takes explicit control:

- VideoDecoder: VTDecompressionSession, async output callback stamps
  decode-completion, session rebuilt on every IDR / format change. Unit-tested
  (testVideoDecoderAsyncCallbackDeliversPixels).
- MetalVideoPresenter: CAMetalLayer + CVMetalTextureCache + a runtime-compiled
  BT.709 limited-range NV12->RGB shader, present at the next vsync. The
  CVMetalTextures + pixel buffer are held until the GPU completes.
- Stage2Pipeline: pump thread -> decoder -> newest-ready 1-slot ring; the hosting
  view's display link drains it once per vsync and stamps capture->present
  (the display-link target time projected into CLOCK_REALTIME).
- LatencyMeter gains record(ptsNs:atNs:offsetNs:); the HUD shows a capture->present
  (glass-to-glass, modulo host render->capture) line, skew-corrected via
  clockOffsetNs. Measured live ~11 ms p50 vs ~2.2 ms capture->client.
- StreamView / StreamViewIOS host the CAMetalLayer as a sublayer + a CADisplayLink
  (NSView.displayLink on macOS) when stage-2; input capture + HUD unchanged. The
  session-active gates switch from `pump != nil` to `connection != nil` so capture
  engages without a StreamPump.

Validated: builds macOS/iOS/tvOS; the decode half is unit-tested; the Metal
present is live-validated on glass (correct image + the capture->present number).
Colorspace is BT.709 SDR for now; 10-bit/HDR + a pacing policy are later.
Plan: docs-site/content/docs/apple-stage2-presenter.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-12 15:29:23 +02:00

221 lines
9.8 KiB
Swift

// Real-bitstream proof of the decode-prep path: VTCompressionSession encodes HEVC, we
// rebuild the host's wire shape (Annex-B AU with in-band VPS/SPS/PPS, exactly what
// punktfunk-host emits on every IDR), run it through AnnexB, and hand the result to a real
// VTDecompressionSession. Pixels out = the whole client decode path is sound.
import AVFoundation
import CoreMedia
import VideoToolbox
import XCTest
@testable import PunktfunkKit
/// Mutable box the decode callbacks (which run off the test thread) write their result into.
///
/// Declared `@unchecked Sendable` because the synchronisation is manual: every read or write
/// of `frame` / `error` is expected to happen while holding `lock`.
private final class FrameBox: @unchecked Sendable {
    /// Frame handed to the success callback, if one arrived.
    var frame: ReadyFrame?
    /// Status handed to the error callback, if one arrived.
    var error: OSStatus?
    /// Guards `frame` and `error`.
    let lock = NSLock()
}
final class VideoToolboxRoundTripTests: XCTestCase {
private let width = 320
private let height = 240
/// End-to-end check of the decode-prep path on a real bitstream: encode one HEVC keyframe,
/// rebuild the Annex-B wire shape, then verify (1) parameter-set extraction, (2) the
/// Annex-B -> AVCC re-pack and (3) an actual VideoToolbox decode of the resulting sample.
func testEncodeAnnexBDecodeRoundTrip() throws {
    let (formatDesc, avccSample) = try encodeOneHEVCKeyframe()
    // Rebuild the host's wire format: Annex-B AU, parameter sets in-band before the VCL.
    let annexB = try annexBAU(formatDesc: formatDesc, avccSample: avccSample)

    // 1) Parameter-set extraction -> format description.
    let rebuilt = try XCTUnwrap(
        AnnexB.formatDescription(fromIDR: annexB),
        "in-band VPS/SPS/PPS should yield a format description")
    let dims = CMVideoFormatDescriptionGetDimensions(rebuilt)
    XCTAssertEqual(Int(dims.width), width)
    XCTAssertEqual(Int(dims.height), height)

    // 2) Annex-B -> AVCC re-pack must reproduce the encoder's own sample bytes.
    XCTAssertEqual(AnnexB.avcc(from: annexB), avccSample)

    // 3) Sample buffer -> real decoder -> pixels.
    let au = AccessUnit(data: annexB, ptsNs: 1_000_000, frameIndex: 0, flags: 0)
    let sample = try XCTUnwrap(AnnexB.sampleBuffer(au: au, format: rebuilt))
    var session: VTDecompressionSession?
    XCTAssertEqual(
        VTDecompressionSessionCreate(
            allocator: nil, formatDescription: rebuilt, decoderSpecification: nil,
            imageBufferAttributes: nil, outputCallback: nil,
            decompressionSessionOut: &session),
        noErr)
    let decoder = try XCTUnwrap(session)
    defer { VTDecompressionSessionInvalidate(decoder) }

    var decoded: CVImageBuffer?
    // Sentinel: stays -1 if the output handler is never invoked.
    var decodeStatus: OSStatus = -1
    // No async flag -> the handler runs before DecodeFrame returns.
    let submitStatus = VTDecompressionSessionDecodeFrame(
        decoder, sampleBuffer: sample, flags: [], infoFlagsOut: nil
    ) { status, _, imageBuffer, _, _ in
        decodeStatus = status
        decoded = imageBuffer
    }
    // Check the call's own status too, so a rejected submit reports the real error code
    // instead of only the -1 sentinel above.
    XCTAssertEqual(submitStatus, noErr, "VTDecompressionSessionDecodeFrame should accept the sample")
    XCTAssertEqual(decodeStatus, noErr)
    let pixels = try XCTUnwrap(decoded) // CVImageBuffer and CVPixelBuffer are the same CF type
    XCTAssertEqual(CVPixelBufferGetWidth(pixels), width)
    XCTAssertEqual(CVPixelBufferGetHeight(pixels), height)
}
/// Stage-2 decode half: feeds the same known IDR to `VideoDecoder` and checks that its async
/// output callback delivers a CVPixelBuffer of the expected dimensions, that the pts comes
/// back unchanged, and that a decode-completion timestamp was stamped.
func testVideoDecoderAsyncCallbackDeliversPixels() throws {
    let (encoderFormat, encoderSample) = try encodeOneHEVCKeyframe()
    let wireAU = try annexBAU(formatDesc: encoderFormat, avccSample: encoderSample)
    let format = try XCTUnwrap(AnnexB.formatDescription(fromIDR: wireAU))
    let accessUnit = AccessUnit(data: wireAU, ptsNs: 42_000_000, frameIndex: 0, flags: 0)

    // Whichever callback fires stores its payload under the lock, then wakes the test thread.
    let results = FrameBox()
    let callbackFired = DispatchSemaphore(value: 0)
    let decoder = VideoDecoder(
        onDecoded: { decodedFrame in
            results.lock.lock()
            results.frame = decodedFrame
            results.lock.unlock()
            callbackFired.signal()
        },
        onDecodeError: { failure in
            results.lock.lock()
            results.error = failure
            results.lock.unlock()
            callbackFired.signal()
        })

    let submitted = decoder.decode(au: accessUnit, format: format)
    XCTAssertTrue(submitted, "frame submit should succeed")
    let waitOutcome = callbackFired.wait(timeout: .now() + 10)
    XCTAssertEqual(waitOutcome, .success, "the decode callback must fire")
    decoder.reset()

    // Snapshot both fields inside a single critical section.
    results.lock.lock()
    let (deliveredFrame, deliveredError) = (results.frame, results.error)
    results.lock.unlock()

    XCTAssertNil(deliveredError.map { "decode error \($0)" })
    let ready = try XCTUnwrap(
        deliveredFrame, "the async output callback must deliver a ReadyFrame")
    XCTAssertEqual(CVPixelBufferGetWidth(ready.pixelBuffer), width)
    XCTAssertEqual(CVPixelBufferGetHeight(ready.pixelBuffer), height)
    XCTAssertEqual(ready.ptsNs, 42_000_000, "pts round-trips through the decoder")
    XCTAssertGreaterThan(ready.decodedNs, 0, "decode-completion is stamped")
}
// MARK: - encode helpers

/// Encodes one forced-IDR HEVC frame of `width` x `height` with VideoToolbox.
///
/// - Returns: the encoded sample's format description and its raw AVCC sample bytes, copied
///   out of the sample's block buffer.
/// - Throws: `XCTSkip` when no HEVC compression session can be created; an unwrap failure
///   when the encoder produced no usable sample; an `NSError` when the bytes cannot be copied.
private func encodeOneHEVCKeyframe() throws -> (CMVideoFormatDescription, Data) {
    var session: VTCompressionSession?
    let rc = VTCompressionSessionCreate(
        allocator: nil, width: Int32(width), height: Int32(height),
        codecType: kCMVideoCodecType_HEVC, encoderSpecification: nil,
        imageBufferAttributes: nil, compressedDataAllocator: nil,
        outputCallback: nil, refcon: nil, compressionSessionOut: &session)
    guard rc == noErr, let encoder = session else {
        throw XCTSkip("no HEVC encoder available (\(rc))")
    }
    defer { VTCompressionSessionInvalidate(encoder) }
    VTSessionSetProperty(encoder, key: kVTCompressionPropertyKey_RealTime, value: kCFBooleanTrue)
    VTSessionSetProperty(
        encoder, key: kVTCompressionPropertyKey_AllowFrameReordering, value: kCFBooleanFalse)

    // Build the input before creating the expectation, so a throw here cannot leave an
    // expectation behind that nobody waits on.
    let inputFrame = try gradientPixelBuffer()

    // `output` is written by the encoder's output handler and read below, both under `lock`.
    let lock = NSLock()
    var output: CMSampleBuffer?
    let done = expectation(description: "encoded")
    let submitStatus = VTCompressionSessionEncodeFrame(
        encoder, imageBuffer: inputFrame,
        presentationTimeStamp: CMTime(value: 0, timescale: 30),
        duration: CMTime(value: 1, timescale: 30),
        frameProperties: [kVTEncodeFrameOptionKey_ForceKeyFrame: kCFBooleanTrue] as CFDictionary,
        infoFlagsOut: nil
    ) { status, _, sample in
        XCTAssertEqual(status, noErr)
        lock.lock()
        output = sample
        lock.unlock()
        done.fulfill()
    }
    // Report a rejected submit / failed flush with its real status code; otherwise the only
    // symptom would be the expectation timing out below.
    XCTAssertEqual(submitStatus, noErr, "VTCompressionSessionEncodeFrame should accept the frame")
    let flushStatus = VTCompressionSessionCompleteFrames(
        encoder, untilPresentationTimeStamp: .invalid)
    XCTAssertEqual(flushStatus, noErr, "VTCompressionSessionCompleteFrames should flush the frame")
    wait(for: [done], timeout: 10)

    lock.lock()
    defer { lock.unlock() }
    let sample = try XCTUnwrap(output)
    let desc = try XCTUnwrap(CMSampleBufferGetFormatDescription(sample))
    let block = try XCTUnwrap(CMSampleBufferGetDataBuffer(sample))
    var bytes = Data(count: CMBlockBufferGetDataLength(block))
    try bytes.withUnsafeMutableBytes { raw in
        // `baseAddress` is nil for a zero-length buffer; fail the test instead of trapping.
        let destination = try XCTUnwrap(raw.baseAddress, "encoder produced an empty sample")
        let rc = CMBlockBufferCopyDataBytes(
            block, atOffset: 0, dataLength: raw.count,
            destination: destination)
        if rc != noErr { throw NSError(domain: "CMBlockBuffer", code: Int(rc)) }
    }
    return (desc, bytes)
}
/// The host's wire shape: 4-byte start codes, VPS/SPS/PPS in-band, then the VCL NALs.
///
/// - Parameters:
///   - formatDesc: HEVC format description whose parameter sets are emitted first.
///   - avccSample: sample bytes carrying a 4-byte big-endian length before each NAL.
/// - Returns: the Annex-B access unit.
/// - Throws: when a parameter-set pointer is missing, or when a NAL length runs past the end
///   of `avccSample`.
private func annexBAU(formatDesc: CMVideoFormatDescription, avccSample: Data) throws -> Data {
    let startCode: [UInt8] = [0, 0, 0, 1]
    var au = Data()

    // First query (no pointer/size out) just asks for the parameter-set count and the
    // NAL length-prefix size.
    var psCount = 0
    var nalHeaderLen: Int32 = 0
    XCTAssertEqual(
        CMVideoFormatDescriptionGetHEVCParameterSetAtIndex(
            formatDesc, parameterSetIndex: 0, parameterSetPointerOut: nil,
            parameterSetSizeOut: nil, parameterSetCountOut: &psCount,
            nalUnitHeaderLengthOut: &nalHeaderLen),
        noErr)
    XCTAssertEqual(nalHeaderLen, 4, "AnnexB.avcc assumes 4-byte NAL length prefixes")

    for i in 0..<psCount {
        var ptr: UnsafePointer<UInt8>?
        var size = 0
        XCTAssertEqual(
            CMVideoFormatDescriptionGetHEVCParameterSetAtIndex(
                formatDesc, parameterSetIndex: i, parameterSetPointerOut: &ptr,
                parameterSetSizeOut: &size, parameterSetCountOut: nil,
                nalUnitHeaderLengthOut: nil),
            noErr)
        au.append(contentsOf: startCode)
        au.append(Data(bytes: try XCTUnwrap(ptr), count: size))
    }

    // AVCC sample (4-byte BE length per NAL) -> start codes.
    var cursor = avccSample.startIndex
    while cursor + 4 <= avccSample.endIndex {
        let len = avccSample[cursor..<cursor + 4].reduce(0) { ($0 << 8) | Int($1) }
        let body = avccSample.index(cursor, offsetBy: 4)
        // A length that overruns the sample means the input is malformed: fail here rather
        // than silently emitting a truncated access unit.
        let end = try XCTUnwrap(
            avccSample.index(body, offsetBy: len, limitedBy: avccSample.endIndex),
            "NAL length \(len) overruns the AVCC sample")
        au.append(contentsOf: startCode)
        au.append(avccSample[body..<end])
        cursor = end
    }
    // Fewer than 4 bytes left over cannot be a length prefix; flag them instead of dropping them.
    XCTAssertEqual(cursor, avccSample.endIndex, "trailing bytes after the last NAL")
    return au
}
/// Builds a `width` x `height` 32BGRA pixel buffer (created with an empty IOSurface-properties
/// attribute) and fills it with a deterministic gradient: B = x, G = y, R = x XOR y, each
/// reduced to its low 8 bits, and A = 0xFF.
private func gradientPixelBuffer() throws -> CVPixelBuffer {
    var created: CVPixelBuffer?
    let attrs = [kCVPixelBufferIOSurfacePropertiesKey: [:]] as CFDictionary
    let createStatus = CVPixelBufferCreate(
        nil, width, height, kCVPixelFormatType_32BGRA, attrs, &created)
    XCTAssertEqual(createStatus, kCVReturnSuccess)
    let pixelBuffer = try XCTUnwrap(created)

    // Keep the base address locked for the whole fill; the defer unlocks on every exit.
    CVPixelBufferLockBaseAddress(pixelBuffer, [])
    defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, []) }
    let firstByte = try XCTUnwrap(CVPixelBufferGetBaseAddress(pixelBuffer))
    let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)

    for y in 0..<height {
        // Step rows by the buffer's reported bytes-per-row rather than assuming width * 4.
        var pixel = (firstByte + y * bytesPerRow).assumingMemoryBound(to: UInt8.self)
        for x in 0..<width {
            pixel[0] = UInt8(truncatingIfNeeded: x)      // B
            pixel[1] = UInt8(truncatingIfNeeded: y)      // G
            pixel[2] = UInt8(truncatingIfNeeded: x ^ y)  // R
            pixel[3] = 0xFF                              // A
            pixel += 4
        }
    }
    return pixelBuffer
}
}