Files
punktfunk/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift
T
enricobuehler 42d1c74663
apple / swift (push) Successful in 1m5s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Has been cancelled
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Has been cancelled
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Has been cancelled
android / android (push) Has been cancelled
apple / screenshots (push) Has been cancelled
ci / web (push) Has been cancelled
ci / docs-site (push) Has been cancelled
ci / bench (push) Has been cancelled
ci / rust (push) Has been cancelled
deb / build-publish (push) Has been cancelled
decky / build-publish (push) Has been cancelled
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Has been cancelled
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Has been cancelled
docker / deploy-docs (push) Has been cancelled
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Has been cancelled
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Has been cancelled
release / apple (push) Has been cancelled
fix(apple-client/audio): capture the right channel of a multi-channel mic + diagnostics
The mic uplink handed the host pure digital silence on a multi-channel
interface: AVAudioConverter's N→stereo downmix takes channels 0/1, but a
pro interface puts the mic on ONE higher discrete channel. Fold the input
to a mono bus ourselves instead — pick the mic's channel (or sum all) and
resample that to the encoder's 48 kHz stereo, so the silent 0/1 downmix
never happens.

- New "Microphone channel" setting (macOS): Auto (sum every channel — a
  lone hot mic passes at full level) or pin 1-based channel N. Picker
  appears only for multi-channel devices, driven by the device's input
  channel count.
- Diagnostics that make this class of failure self-naming next session:
  log the actual live capture device + format + fold mode, warn on a
  silent UID fallback, and a one-shot silence tripwire on the EXTRACTED
  signal (WARN on 10 s of zeros, else peak dBFS).
- foldToMono extracted as a pure, unit-tested helper (pin / sum-clamp x
  interleaved / deinterleaved / mono / out-of-range).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-04 00:36:35 +02:00

550 lines
26 KiB
Swift
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Session audio, both directions:
//
// host speaker: a drain thread pulls Opus packets (nextAudio, its own plane in the
// core), decodes via OpusDecoder, and writes PCM into a jitter ring; an
// AVAudioSourceNode pulls from the ring (silence on underrun with re-priming, so a
// network gap costs one dip, not permanent crackle).
//
// mic host: a second AVAudioEngine taps the input device, folds it to one mono bus (the
// chosen channel of a multi-channel interface, or a sum of all channels), resamples to 48 kHz
// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet the host feeds them
// into a virtual PipeWire source.
//
// Devices are chosen by UID ("" = system default: the engine is then never pinned to a
// concrete device and follows default-device changes). Two engines, not one a single
// AVAudioEngine ties input+output to one aggregate clock, separate engines keep
// arbitrary mic/speaker combinations trivial.
import AVFoundation
import os
private let log = Logger(subsystem: "io.unom.punktfunk", category: "audio")
/// Render-block-owned scratch storage: freed exactly when the closure (and thus the
/// last possible render call) is released never racing CoreAudio.
private final class ScratchBuffer {
// 8192 frames × up to 8 channels (7.1) the render block caps `frames` at 8192.
let ptr = UnsafeMutablePointer<Float>.allocate(capacity: 8192 * 8)
deinit { ptr.deallocate() }
}
public final class SessionAudio {
private let connection: PunktfunkConnection
private let flag = StopFlag()
private let drainDone = DispatchSemaphore(value: 0)
/// Owns the engine handles + drainStarted, paired with `flag`: stop() sets the flag
/// BEFORE taking the engines, every publisher re-checks the flag under this lock
/// after publishing-side work so a startCapture racing stop() (the mic-permission
/// callback arrives whenever the user clicks the prompt) can never leave a hot
/// microphone with no owner.
private let stateLock = NSLock()
private var playbackEngine: AVAudioEngine?
private var captureEngine: AVAudioEngine?
private var drainStarted = false
#if !os(macOS)
/// AVAudioSession `setCategory`/`setActive` are synchronous and block on the audio server, so
/// they must not run on the main thread (UI stall AVFoundation warns about it). PROCESS-WIDE
/// (static) so every SessionAudio shares one serial queue: the AVAudioSession is a process
/// singleton, and across a reconnect the old session's deactivate must be ordered before the
/// new session's activate (a per-instance queue would let them race and leave the new session's
/// audio deactivated). stop() enqueues its deactivate promptly so it lands before the next
/// session's activate.
private static let sessionQueue = DispatchQueue(label: "io.unom.punktfunk.audio.session")
#endif
public init(connection: PunktfunkConnection) {
self.connection = connection
}
/// Backstop for an owner dropping us without stop() unblocks the drain thread
/// (which captures the connection strongly, NOT self) within one poll timeout.
/// Engine teardown still belongs to stop().
deinit {
flag.stop()
}
/// Start playback (and, if enabled+authorized, the mic uplink). Empty UIDs = system default
/// device; on iOS the UIDs are ignored entirely (routes are AVAudioSession-managed). On macOS
/// the engines start synchronously on the caller's (main) thread. On iOS/tvOS start() is
/// ASYNCHRONOUS: it activates the AVAudioSession off the main thread, then starts the engines on
/// a later main-queue hop (gated by `!flag.isStopped`) so playback is live shortly after, not
/// on return. The mic may start later still if the permission prompt is pending.
public func start(speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool) {
#if os(macOS)
// No AVAudioSession on macOS start the engines directly (caller's thread, as before).
startEngines(
speakerUID: speakerUID, micUID: micUID, micChannel: micChannel, micEnabled: micEnabled)
#else
// Configure + activate the session OFF the main thread (it blocks on the audio server),
// then start the engines back on the main thread once it's active engine routing/format
// depend on the active session. A stop() racing in between is caught by the flag guard.
Self.sessionQueue.async { [weak self] in
guard let self else { return }
self.activateAudioSession(micEnabled: micEnabled)
DispatchQueue.main.async { [weak self] in
guard let self, !self.flag.isStopped else { return }
self.startEngines(
speakerUID: speakerUID, micUID: micUID, micChannel: micChannel,
micEnabled: micEnabled)
}
}
#endif
}
#if !os(macOS)
/// Route + policy live in the session, not per-engine: stereo playback, mic capture when
/// enabled, Bluetooth allowed. Failure is non-fatal (defaults). Runs on `sessionQueue`.
private func activateAudioSession(micEnabled: Bool) {
let session = AVAudioSession.sharedInstance()
do {
#if os(iOS)
if micEnabled {
// .defaultToSpeaker: .playAndRecord otherwise routes to the iPhone EARPIECE; only
// affects the built-in route (headphones/BT still win).
try session.setCategory(
.playAndRecord, mode: .default,
options: [.allowBluetoothA2DP, .defaultToSpeaker])
} else {
try session.setCategory(.playback, mode: .default)
}
#else // tvOS no app-accessible mic
try session.setCategory(.playback, mode: .default)
#endif
try session.setActive(true)
} catch {
log.warning("AVAudioSession setup failed: \(error.localizedDescription)")
}
}
#endif
/// Build + start the playback engine (and the mic uplink when enabled + authorized). Main
/// thread (engine setup); on iOS/tvOS the session is already active by the time this runs.
private func startEngines(
speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool
) {
startPlayback(speakerUID: speakerUID)
#if os(tvOS)
// No app-accessible microphone input on tvOS playback only.
#else
guard micEnabled else { return }
switch AVCaptureDevice.authorizationStatus(for: .audio) {
case .authorized:
startCapture(micUID: micUID, micChannel: micChannel)
case .notDetermined:
AVCaptureDevice.requestAccess(for: .audio) { [weak self] granted in
DispatchQueue.main.async {
guard let self, granted, !self.flag.isStopped else { return }
self.startCapture(micUID: micUID, micChannel: micChannel)
}
}
default:
log.warning("microphone access denied — mic uplink disabled (System Settings → Privacy)")
}
#endif
}
/// Stop both directions. Safe from any thread; waits the drain thread out ( its
/// poll timeout) so the caller can close the connection right after.
public func stop() {
flag.stop() // before taking the engines see stateLock's comment
stateLock.lock()
let capture = captureEngine
captureEngine = nil
let playback = playbackEngine
playbackEngine = nil
let wasDraining = drainStarted
drainStarted = false
stateLock.unlock()
if let capture {
capture.inputNode.removeTap(onBus: 0)
capture.stop()
}
playback?.stop()
#if !os(macOS)
// Release the session so audio we interrupted (Music, podcasts) gets its resume cue. Like
// activation, setActive is synchronous/blocking run it on the shared serial session queue
// (off the main thread). Enqueued HERE engines already stopped, and BEFORE the drain wait
// below so across a reconnect it lands ahead of the next session's activate on the shared
// queue (otherwise a deferred deactivate could deactivate the new session). Fire-and-forget.
Self.sessionQueue.async {
do {
try AVAudioSession.sharedInstance().setActive(
false, options: .notifyOthersOnDeactivation)
} catch {
log.warning("AVAudioSession deactivation failed: \(error.localizedDescription)")
}
}
#endif
if wasDraining {
_ = drainDone.wait(timeout: .now() + .milliseconds(400))
}
}
// MARK: - Playback (host speaker)
private func startPlayback(speakerUID: String) {
// Build the playback layout from the host-RESOLVED channel count (never the request):
// 2 = stereo / 6 = 5.1 / 8 = 7.1, canonical wire order FL FR FC LFE RL RR SL SR.
let channels = Int(connection.resolvedAudioChannels)
// 1 s interleaved capacity, ~20 ms prefill (four 5 ms host packets of jitter absorption
// before the first sample plays), both scaled by the channel count.
let ring = AudioRing(
capacity: 48_000 * channels, prefill: 960 * channels, channels: channels)
let engine = AVAudioEngine()
#if os(macOS)
if !speakerUID.isEmpty {
if let dev = AudioDevices.deviceID(forUID: speakerUID),
let unit = engine.outputNode.audioUnit {
if !Self.setDevice(dev, on: unit) {
log.error("could not select speaker \(speakerUID) — using default")
}
} else {
log.warning("speaker \(speakerUID) not present — using default")
}
}
#endif
// Engine-native deinterleaved float; the render block deinterleaves from the ring. Surround
// uses an explicit wire-order channel layout; the mixer downmixes to the output device when
// it has fewer speakers (e.g. an iPhone's stereo built-ins). (Explicit if/else rather than
// map/flatMap so it's correct whether the channelLayout initializer is failable or not.)
var format: AVAudioFormat?
if channels == 2 {
format = AVAudioFormat(standardFormatWithSampleRate: 48_000, channels: 2)
} else if let layout = wireChannelLayout(channels: channels) {
format = AVAudioFormat(standardFormatWithSampleRate: 48_000, channelLayout: layout)
}
guard let format else {
log.error("could not build \(channels)-channel audio format — audio disabled")
return
}
let scratch = ScratchBuffer() // block-owned; freed with the closure
let source = AVAudioSourceNode(format: format) { _, _, frameCount, abl -> OSStatus in
let frames = Int(frameCount)
guard frames <= 8192 else { return kAudioUnitErr_TooManyFramesToProcess }
ring.read(into: scratch.ptr, count: frames * channels)
let buffers = UnsafeMutableAudioBufferListPointer(abl)
// Deinterleave the wire-order interleaved ring into the engine's per-channel buses.
if buffers.count >= channels {
for ch in 0..<channels {
if let dst = buffers[ch].mData?.assumingMemoryBound(to: Float.self) {
for f in 0..<frames { dst[f] = scratch.ptr[f * channels + ch] }
}
}
}
return noErr
}
engine.attach(source)
engine.connect(source, to: engine.mainMixerNode, format: format)
engine.prepare()
do {
try engine.start()
} catch {
log.error("playback engine failed to start: \(error.localizedDescription)")
return
}
stateLock.lock()
if flag.isStopped {
stateLock.unlock()
engine.stop() // stop() already ran don't strand a started engine
return
}
playbackEngine = engine
stateLock.unlock()
startDrain(into: ring)
}
private func startDrain(into ring: AudioRing) {
stateLock.lock()
drainStarted = true
stateLock.unlock()
let thread = Thread { [connection, flag, drainDone] in
defer { drainDone.signal() }
// Decode happens IN-CORE (libopus multistream) AudioToolbox's Opus path is
// stereo-only and is handed back as interleaved f32 PCM in wire channel order.
while !flag.isStopped {
let pcm: PunktfunkConnection.AudioPCM?
do {
pcm = try connection.nextAudioPcm(timeoutMs: 100)
} catch {
break // session closed
}
guard let pcm, pcm.frameCount > 0 else { continue }
pcm.samples.withUnsafeBufferPointer { p in
if let base = p.baseAddress {
ring.write(base, count: pcm.frameCount * pcm.channels)
}
}
}
}
thread.name = "punktfunk-audio"
thread.qualityOfService = .userInteractive
thread.start()
}
// MARK: - Mic (mic host)
#if !os(tvOS)
private func startCapture(micUID: String, micChannel: Int) {
let engine = AVAudioEngine()
let input = engine.inputNode
#if os(macOS)
if !micUID.isEmpty {
if let dev = AudioDevices.deviceID(forUID: micUID), let unit = input.audioUnit {
if !Self.setDevice(dev, on: unit) {
log.error("could not select microphone \(micUID) — using default")
}
} else {
log.warning("microphone \(micUID) not present — using default")
}
}
#endif
let inFormat = input.outputFormat(forBus: 0)
guard inFormat.sampleRate > 0, inFormat.channelCount > 0 else {
log.error("no usable input device — mic uplink disabled")
return
}
// Multi-channel-interface handling. A pro interface exposes N discrete inputs with the mic
// on ONE of them, but AVAudioConverter's Nstereo downmix takes channels 0/1 dead
// silence when the mic sits higher up (the classic "host receives zeros"). So we fold the
// input to a single mono bus OURSELVES and resample that. micChannel: 0 = Auto (sum every
// channel a lone hot mic passes at full level), n1 pins 1-based input channel n.
let inChannels = Int(inFormat.channelCount)
let pinnedChannel: Int? = {
guard micChannel >= 1 else { return nil }
let idx = micChannel - 1
guard idx < inChannels else {
log.warning(
"mic channel \(micChannel) out of range (device has \(inChannels)) — mixing all")
return nil
}
return idx
}()
let channelPlan = pinnedChannel.map { "channel \($0 + 1)/\(inChannels)" }
?? (inChannels > 1 ? "mix \(inChannels)ch→mono" : "mono")
// Name the device we're ACTUALLY recording from + its format + how we fold it, once per
// session. This single line localizes the whole class of "host receives silence" failures
// that otherwise need a host-side tone injection to pin down: a UID that silently fell back
// to the default, the wrong device being live, or the wrong channel picked.
#if os(macOS)
if let unit = input.audioUnit, let live = Self.currentDevice(of: unit),
let dev = AudioDevices.describe(live) {
if !micUID.isEmpty, dev.uid != micUID {
log.warning("""
mic selection not honored — requested \(micUID) but capturing from \
\(dev.name) [\(dev.uid)]; the device's UID likely changed (replug) — \
reselect it in Settings
""")
}
log.info("""
mic capture: \(dev.name) [\(dev.uid)] — \(Int(inFormat.sampleRate)) Hz, \
\(inChannels) ch, \(channelPlan)
""")
} else {
log.info("""
mic capture: <device unavailable> — \(Int(inFormat.sampleRate)) Hz, \
\(inChannels) ch, \(channelPlan)
""")
}
#else
log.info(
"mic capture: \(Int(inFormat.sampleRate)) Hz, \(inChannels) ch, \(channelPlan)")
#endif
// Encode a single mono bus (folded from `inFormat` in the tap): the resampler goes
// mono@inputSR the encoder's 48 kHz stereo, so it handles both the rate change and the
// monostereo duplication, and the wrong-channel downmix never happens.
guard let monoFormat = AVAudioFormat(
commonFormat: .pcmFormatFloat32, sampleRate: inFormat.sampleRate,
channels: 1, interleaved: false),
let encoder = try? OpusEncoder(),
let resampler = AVAudioConverter(from: monoFormat, to: encoder.pcmFormat),
let chunk = AVAudioPCMBuffer(
pcmFormat: encoder.pcmFormat, frameCapacity: OpusEncoder.framesPerPacket)
else {
log.error("Opus encoder unavailable — mic uplink disabled")
return
}
// Tap-thread-confined state: resample into `staging`, accumulate in `fifo`,
// slice 960-frame chunks for the encoder.
var fifo: [Float] = []
fifo.reserveCapacity(48_000)
var seq: UInt32 = 0
let connection = connection
let flag = flag
// Silence tripwire (tap-confined): a "recording" app can be handed pure digital zeros
// a zeroed input-volume slider, a stale TCC grant, a muted device, OR the wrong channel
// picked and everything downstream looks alive while the host gets silence. Track the
// peak of the EXTRACTED mono bus over the first ~10 s (not the raw device a mic present
// on a channel we didn't grab must still read as silence) and emit exactly ONE verdict.
// This is the log line whose absence made the last occurrence take a host-side tone.
let silenceWindow = Int(inFormat.sampleRate * 10)
let deviceLabel = micUID.isEmpty ? "default input" : micUID
var framesInspected = 0
var inputPeak: Float = 0
var levelReported = false
input.installTap(onBus: 0, bufferSize: 2048, format: inFormat) { buffer, _ in
if flag.isStopped { return }
let frames = Int(buffer.frameLength)
guard frames > 0, let src = buffer.floatChannelData,
let mono = AVAudioPCMBuffer(
pcmFormat: monoFormat, frameCapacity: buffer.frameLength),
let dst = mono.floatChannelData?[0]
else { return }
mono.frameLength = buffer.frameLength
// Fold the multi-channel input down to the one mono bus we encode.
Self.foldToMono(
input: src, frames: frames, channels: Int(buffer.format.channelCount),
interleaved: buffer.format.isInterleaved, pinned: pinnedChannel, out: dst)
if !levelReported {
var localPeak: Float = 0
for i in 0..<frames where abs(dst[i]) > localPeak { localPeak = abs(dst[i]) }
if localPeak > inputPeak { inputPeak = localPeak }
framesInspected += frames
if framesInspected >= silenceWindow {
levelReported = true
if inputPeak == 0 {
log.warning("""
mic uplink has been pure digital SILENCE for 10 s (\(deviceLabel), \
\(channelPlan)) — check the input level (System Settings → Sound → \
Input), Privacy & Security → Microphone, and the Microphone channel in \
Settings; the host is receiving zeros
""")
} else {
let dbfs = 20 * log10(inputPeak)
log.info("""
mic uplink OK — peak \(String(format: "%.1f", dbfs)) dBFS over first \
10 s (\(deviceLabel), \(channelPlan))
""")
}
}
}
let ratio = 48_000 / inFormat.sampleRate
let outCapacity = AVAudioFrameCount((Double(frames) * ratio).rounded(.up) + 64)
guard let staging = AVAudioPCMBuffer(
pcmFormat: encoder.pcmFormat, frameCapacity: outCapacity)
else { return }
var fed = false
var convError: NSError?
let status = resampler.convert(to: staging, error: &convError) { _, outStatus in
if fed {
outStatus.pointee = .noDataNow
return nil
}
fed = true
outStatus.pointee = .haveData
return mono
}
guard status != .error, let p = staging.floatChannelData?[0] else { return }
fifo.append(contentsOf: UnsafeBufferPointer(
start: p, count: Int(staging.frameLength) * 2))
let samplesPerChunk = Int(OpusEncoder.framesPerPacket) * 2
while fifo.count >= samplesPerChunk {
chunk.frameLength = OpusEncoder.framesPerPacket
fifo.withUnsafeBufferPointer { src in
chunk.floatChannelData![0].update(
from: src.baseAddress!, count: samplesPerChunk)
}
fifo.removeFirst(samplesPerChunk)
guard let packets = try? encoder.encode(chunk) else { continue }
for packet in packets {
connection.sendMic(
packet, seq: seq, ptsNs: DispatchTime.now().uptimeNanoseconds)
seq &+= 1
}
}
}
engine.prepare()
do {
try engine.start()
} catch {
log.error("capture engine failed to start: \(error.localizedDescription)")
input.removeTap(onBus: 0)
return
}
stateLock.lock()
if flag.isStopped {
// stop() ran while we were starting (the permission prompt resolves at the
// user's leisure) tear the engine down ourselves, nobody else owns it now.
stateLock.unlock()
input.removeTap(onBus: 0)
engine.stop()
return
}
captureEngine = engine
stateLock.unlock()
log.info("mic uplink started (\(micUID.isEmpty ? "default input" : micUID))")
}
/// Fold `channels` of input (`floatChannelData` layout: `interleaved` one buffer strided by
/// channel count; else one buffer per channel) down to a single mono bus in `out` (`frames`
/// long). `pinned` (0-based, must be `< channels`) copies exactly that channel the fix for a
/// mic on one input of a multi-channel interface; `nil` sums every channel, clamped to
/// [-1, 1], so a lone hot channel still passes at full level instead of the silent 0/1 the
/// default Nstereo downmix would grab. Pure + `internal` for unit testing the index math.
static func foldToMono(
input: UnsafePointer<UnsafeMutablePointer<Float>>, frames: Int, channels: Int,
interleaved: Bool, pinned: Int?, out: UnsafeMutablePointer<Float>
) {
if let ch = pinned, ch < channels {
if interleaved {
let d = input[0]
for i in 0..<frames { out[i] = d[i * channels + ch] }
} else {
let d = input[ch]
for i in 0..<frames { out[i] = d[i] }
}
} else if interleaved {
let d = input[0]
for i in 0..<frames {
var s: Float = 0
for c in 0..<channels { s += d[i * channels + c] }
out[i] = max(-1, min(1, s))
}
} else {
let d0 = input[0]
for i in 0..<frames { out[i] = d0[i] }
for c in 1..<channels {
let d = input[c]
for i in 0..<frames { out[i] += d[i] }
}
if channels > 1 { for i in 0..<frames { out[i] = max(-1, min(1, out[i])) } }
}
}
#endif
#if os(macOS)
private static func setDevice(_ id: AudioDeviceID, on unit: AudioUnit) -> Bool {
var dev = id
return AudioUnitSetProperty(
unit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0,
&dev, UInt32(MemoryLayout<AudioDeviceID>.size)) == noErr
}
/// Read back the AUHAL's live device the definitive "what are we actually capturing
/// from", which catches a selection that succeeded on paper but silently fell back to
/// the system default (a stale/changed UID, a device that vanished between resolve and
/// start). 0 / an error means we couldn't tell.
private static func currentDevice(of unit: AudioUnit) -> AudioDeviceID? {
var dev = AudioDeviceID(0)
var size = UInt32(MemoryLayout<AudioDeviceID>.size)
let status = AudioUnitGetProperty(
unit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0, &dev, &size)
guard status == noErr, dev != 0 else { return nil }
return dev
}
#endif
}