1c04e77293
apple / screenshots (push) Has been cancelled
apple / swift (push) Has been cancelled
ci / docs-site (push) Has been cancelled
ci / bench (push) Has been cancelled
ci / web (push) Has been cancelled
ci / rust (push) Has been cancelled
android-screenshots / screenshots (push) Successful in 2m16s
deb / build-publish (push) Successful in 3m26s
decky / build-publish (push) Successful in 13s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 6s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 6s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 5s
windows-host / package (push) Successful in 6m48s
release / apple (push) Successful in 7m45s
windows-msix / package (arm64, C:\Users\Public\ffmpeg-arm64, aarch64-pc-windows-msvc, C:\t-a64) (push) Successful in 1m22s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 2m37s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 4s
android / android (push) Successful in 9m35s
windows-msix / package (x64, C:\Users\Public\ffmpeg, x86_64-pc-windows-msvc, C:\t) (push) Successful in 1m32s
linux-client-screenshots / screenshots (push) Successful in 2m31s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Successful in 8m53s
web-screenshots / screenshots (push) Successful in 2m32s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Successful in 8m37s
docker / deploy-docs (push) Failing after 1m4s
flatpak / build-publish (push) Failing after 3m44s
feat(apple): add cursor capture on iPad
231 lines
11 KiB
Swift
231 lines
11 KiB
Swift
// Stage-2 presenter, decode half: explicit VideoToolbox decode of the host's HEVC AUs.
//
// Stage-1 hands compressed samples to AVSampleBufferDisplayLayer, which decodes AND presents
// internally with no per-frame callback — so neither decode-completion nor present can be
// stamped, and frames can't be hand-paced. Here we drive VTDecompressionSession ourselves: the
// output callback delivers a decoded CVPixelBuffer, we stamp decode-completion, and push it into
// a ready ring the presenter's display link drains. See docs apple-stage2-presenter.md.
|
||
|
||
import CoreMedia
|
||
import CoreVideo
|
||
import Foundation
|
||
import VideoToolbox
|
||
|
||
/// A decoded frame queued for presentation. Keeps its `CVPixelBuffer` retained until it is shown.
public struct ReadyFrame: @unchecked Sendable {
    /// The access unit's pts on the host capture clock, in nanoseconds.
    public let ptsNs: UInt64

    /// Client `CLOCK_REALTIME` timestamp at which decode completed, in nanoseconds.
    public let decodedNs: Int64

    /// Decoded image: 8-bit NV12 biplanar for SDR, 10-bit P010 biplanar for HDR. Metal-compatible.
    public let pixelBuffer: CVPixelBuffer

    /// Whether the stream is HDR (BT.2020 PQ). When true the buffer is 10-bit P010 and the
    /// presenter has to set up EDR + BT.2020 PQ output. Derived from the buffer's pixel format.
    public let isHDR: Bool

    /// Spelled-out equivalent of the synthesized memberwise initializer (module-internal).
    init(ptsNs: UInt64, decodedNs: Int64, pixelBuffer: CVPixelBuffer, isHDR: Bool) {
        self.ptsNs = ptsNs
        self.decodedNs = decodedNs
        self.pixelBuffer = pixelBuffer
        self.isHDR = isHDR
    }
}
|
||
|
||
/// C-convention output callback shared by every decompression session. A C callback cannot
/// capture Swift context, so the owning `VideoDecoder` travels through the refcon supplied at
/// session creation; this trampoline recovers it (without taking ownership) and forwards the
/// decode result.
private let decoderOutputCallback: VTDecompressionOutputCallback = {
    owner, _, decodeStatus, _, decodedImage, presentationTime, _ in
    // No refcon means there is no decoder to deliver to.
    guard let owner else { return }
    let decoder = Unmanaged<VideoDecoder>.fromOpaque(owner).takeUnretainedValue()
    decoder.handleDecoded(status: decodeStatus, imageBuffer: decodedImage, pts: presentationTime)
}
|
||
|
||
/// Owns a `VTDecompressionSession` rebuilt whenever the format description changes (every IDR /
/// mode change, the same trigger stage-1 uses). Thread-safe: `decode` runs on the pump thread,
/// the output callback on a VT-managed thread; the only shared mutable state is the session,
/// the format and the chroma flag, all guarded by `lock`. `@unchecked Sendable` — the lock
/// enforces the contract.
///
/// Neither `onDecoded` nor `onDecodeError` is ever invoked while `lock` is held by the calling
/// thread of `decode`, so a handler may call back into `reset()` from there.
public final class VideoDecoder: @unchecked Sendable {
    private let lock = NSLock()
    private var session: VTDecompressionSession?
    private var format: CMVideoFormatDescription?

    /// Called on the VT thread for each successfully decoded frame — stamp + enqueue, don't block.
    private let onDecoded: @Sendable (ReadyFrame) -> Void
    /// Called when a frame fails to decode (bad data / decoder reset) so the pump can re-gate on
    /// the next IDR. Fires on the VT thread for asynchronous failures and on the caller's thread
    /// when `decode` fails to submit a frame.
    private let onDecodeError: @Sendable (OSStatus) -> Void

    /// Whether the negotiated stream is full-chroma 4:4:4 (`connection.isChroma444`), set once at
    /// session start before any decode. Selects the 4:4:4 decode pixel format (orthogonal to bit
    /// depth / HDR). Read inside `createSessionLocked` under `lock`.
    private var chroma444 = false

    /// - Parameters:
    ///   - onDecoded: Receives every successfully decoded frame.
    ///   - onDecodeError: Receives the status of every failed decode; defaults to a no-op.
    public init(
        onDecoded: @escaping @Sendable (ReadyFrame) -> Void,
        onDecodeError: @escaping @Sendable (OSStatus) -> Void = { _ in }
    ) {
        self.onDecoded = onDecoded
        self.onDecodeError = onDecodeError
    }

    deinit { teardown() }

    /// Select the chroma subsampling of the decode output (4:2:0 vs full-chroma 4:4:4). Call once at
    /// session start, before decoding, from `connection.isChroma444`. Takes effect on the next
    /// session (re)build. Thread-safe.
    public func setChroma444(_ on: Bool) {
        lock.lock()
        defer { lock.unlock() }
        chroma444 = on
    }

    /// Submit one AU for asynchronous decode, (re)creating the session if `format` changed. The
    /// caller resolves `format` from the IDR exactly as stage-1 does (`AnnexB.formatDescription`).
    ///
    /// - Returns: `false` if the session couldn't be created, the sample buffer couldn't be built,
    ///   or VideoToolbox rejected the frame. Only the last case also reports through
    ///   `onDecodeError`.
    @discardableResult
    public func decode(au: AccessUnit, format newFormat: CMVideoFormatDescription) -> Bool {
        // `nil` means nothing was handed to VideoToolbox (no session / no sample buffer).
        guard let status = submitLocking(au: au, format: newFormat) else { return false }
        guard status == noErr else {
            // Reported only after the lock has been released, so the handler may call `reset()`.
            onDecodeError(status)
            return false
        }
        return true
    }

    /// Drop the session — the next `decode` rebuilds it. Used on stop and to recover from a
    /// wedged decoder (re-gates on the next in-band parameter sets, like stage-1's flush).
    public func reset() {
        lock.lock()
        defer { lock.unlock() }
        teardownLocked()
    }

    /// Deinit path: same as `reset()`, so no output callback can still be pending once the
    /// unretained refcon handed to VideoToolbox goes away.
    private func teardown() {
        reset()
    }

    /// Takes `lock`, makes sure a session suitable for `newFormat` exists and submits `au` to it.
    ///
    /// Submission happens WHILE the lock is held so a concurrent `reset()` can't invalidate the
    /// session between the lookup and `DecodeFrame`. The frame is submitted with the
    /// asynchronous-decompression flag.
    ///
    /// - Returns: the `VTDecompressionSessionDecodeFrame` status, or `nil` when the frame was
    ///   never submitted (session creation or sample-buffer construction failed).
    private func submitLocking(au: AccessUnit, format newFormat: CMVideoFormatDescription) -> OSStatus? {
        lock.lock()
        defer { lock.unlock() }

        let reusable: Bool
        if let session, let format {
            if CMFormatDescriptionEqual(format, otherFormatDescription: newFormat) {
                reusable = true
            } else if VTDecompressionSessionCanAcceptFormatDescription(
                session, formatDescription: newFormat)
            {
                // A new desc that the live session can still accept (rare for HEVC) avoids a
                // rebuild. Remember it so later AUs hit the cheap equality check instead of
                // re-querying the session for every frame.
                self.format = newFormat
                reusable = true
            } else {
                reusable = false
            }
        } else {
            reusable = false
        }
        if !reusable, !createSessionLocked(format: newFormat) { return nil }

        guard let session,
            let sample = AnnexB.sampleBuffer(au: au, format: newFormat)
        else { return nil }

        let status = VTDecompressionSessionDecodeFrame(
            session,
            sampleBuffer: sample,
            flags: [._EnableAsynchronousDecompression],
            frameRefcon: nil,
            infoFlagsOut: nil)
        if status == kVTInvalidSessionErr {
            // The session is no longer usable. Drop it so the next `decode` rebuilds one even
            // when the parameter sets (and therefore `format`) are unchanged; otherwise the
            // format-equality check above would keep reusing the invalid session.
            teardownLocked()
        }
        return status
    }

    /// `lock` held. Drain in-flight frames, invalidate the session and forget session + format.
    private func teardownLocked() {
        if let session {
            VTDecompressionSessionWaitForAsynchronousFrames(session)
            VTDecompressionSessionInvalidate(session)
        }
        session = nil
        format = nil
    }

    /// True when `newFormat` carries a PQ (SMPTE ST 2084) or HLG transfer function — i.e. the host
    /// is sending HDR (BT.2020). VideoToolbox populates the transfer-function extension from the
    /// HEVC VUI, so this picks the decode bit depth (10-bit P010/x444 vs 8-bit NV12/444v) from the
    /// stream. The present-side HDR config (colorspace/EDR/shader) is latched once per session from
    /// the Welcome (`connection.isHDR`), which the host does NOT flip mid-session — so this predicate
    /// and that config agree for the session (a `#if DEBUG` assert in the presenter guards it).
    ///
    /// Returns `false` when the extension is missing or is not a string.
    static func isHDRFormat(_ format: CMVideoFormatDescription) -> Bool {
        guard
            let transfer = CMFormatDescriptionGetExtension(
                format, extensionKey: kCMFormatDescriptionExtension_TransferFunction) as? String
        else { return false }
        return transfer == (kCMFormatDescriptionTransferFunction_SMPTE_ST_2084_PQ as String)
            || transfer == (kCMFormatDescriptionTransferFunction_ITU_R_2100_HLG as String)
    }

    /// `lock` held. Replace the session with one for `newFormat`. SDR streams decode to 8-bit
    /// output; HDR streams (PQ/HLG) decode to 10-bit output so the presenter can drive EDR.
    ///
    /// - Returns: `true` when the new session is installed. On failure the decoder is left with
    ///   no session and no format.
    private func createSessionLocked(format newFormat: CMVideoFormatDescription) -> Bool {
        // Retire the previous session (if any) through the shared teardown path.
        teardownLocked()

        // Decode pixel format is a 2×2 of (chroma, depth/HDR), both biplanar so the presenter binds
        // plane 0 = luma, plane 1 = interleaved chroma uniformly — 4:4:4 just delivers a full-size
        // chroma plane. 10-bit (P010 / `x444`) for HDR (PQ/HLG), 8-bit (NV12 / `444v`) otherwise.
        let hdr = Self.isHDRFormat(newFormat)
        let pixelFormat: OSType = {
            switch (chroma444, hdr) {
            case (false, false): return kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange  // NV12
            case (false, true): return kCVPixelFormatType_420YpCbCr10BiPlanarVideoRange  // P010
            case (true, false): return kCVPixelFormatType_444YpCbCr8BiPlanarVideoRange  // 444v
            case (true, true): return kCVPixelFormatType_444YpCbCr10BiPlanarVideoRange  // x444
            }
        }()
        let imageAttrs: [CFString: Any] = [
            kCVPixelBufferMetalCompatibilityKey: true,
            kCVPixelBufferPixelFormatTypeKey: pixelFormat,
        ]
        // The refcon is unretained: `deinit` tears the session down before `self` goes away.
        var callback = VTDecompressionOutputCallbackRecord(
            decompressionOutputCallback: decoderOutputCallback,
            decompressionOutputRefCon: Unmanaged.passUnretained(self).toOpaque())
        // 4:4:4 sessions REQUIRE a hardware decoder: we only advertise 4:4:4 when the hardware probe
        // passed, so a hardware-incapable mode (e.g. a resolution past the HW 4:4:4 ceiling) must fail
        // HERE, synchronously, letting the pump's backstop end the session — rather than silently
        // falling back to a software 4:4:4 decoder far too slow for a real-time stream. 4:2:0 keeps the
        // software fallback (nil spec) as a robustness net.
        let spec: CFDictionary? =
            chroma444
            ? [kVTVideoDecoderSpecification_RequireHardwareAcceleratedVideoDecoder: true] as CFDictionary
            : nil
        var newSession: VTDecompressionSession?
        let status = VTDecompressionSessionCreate(
            allocator: kCFAllocatorDefault,
            formatDescription: newFormat,
            decoderSpecification: spec,
            imageBufferAttributes: imageAttrs as CFDictionary,
            outputCallback: &callback,
            decompressionSessionOut: &newSession)
        guard status == noErr, let newSession else { return false }
        session = newSession
        format = newFormat
        return true
    }

    /// VT thread. Stamp decode-completion and enqueue, or report the error.
    ///
    /// A callback with `noErr` but no image buffer is also routed to `onDecodeError`, so the
    /// handler can receive `noErr` as the status in that case.
    fileprivate func handleDecoded(status: OSStatus, imageBuffer: CVImageBuffer?, pts: CMTime) {
        guard status == noErr, let imageBuffer else {
            onDecodeError(status)
            return
        }
        var now = timespec()
        clock_gettime(CLOCK_REALTIME, &now)
        let decodedNs = Int64(now.tv_sec) * 1_000_000_000 + Int64(now.tv_nsec)
        // pts was stamped at timescale 1e9 (AnnexB.sampleBuffer); normalize defensively and
        // clamp negatives to zero.
        let scaled = CMTimeConvertScale(pts, timescale: 1_000_000_000, method: .default)
        let ptsNs = UInt64(max(scaled.value, 0))
        // HDR iff the decoder produced a 10-bit buffer (we only request a 10-bit format for PQ/HLG
        // streams). Covers 4:2:0 (P010) and 4:4:4 (`x444`), video- and full-range, so a 10-bit 4:4:4
        // HDR frame isn't misclassified as SDR. (The mastering metadata is applied to the presenter's
        // CAMetalLayer via CAEDRMetadata, not to this source buffer — a separate-drawable presenter
        // never composites the source buffer's attachments, so attaching them here would be dead.)
        let fmt = CVPixelBufferGetPixelFormatType(imageBuffer)
        let isHDR =
            fmt == kCVPixelFormatType_420YpCbCr10BiPlanarVideoRange
            || fmt == kCVPixelFormatType_420YpCbCr10BiPlanarFullRange
            || fmt == kCVPixelFormatType_444YpCbCr10BiPlanarVideoRange
            || fmt == kCVPixelFormatType_444YpCbCr10BiPlanarFullRange
        onDecoded(
            ReadyFrame(ptsNs: ptsNs, decodedNs: decodedNs, pixelBuffer: imageBuffer, isHDR: isHDR))
    }
}
|