fix(apple-client/audio): capture the right channel of a multi-channel mic + diagnostics
apple / swift (push) Successful in 1m5s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Has been cancelled
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Has been cancelled
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Has been cancelled
android / android (push) Has been cancelled
apple / screenshots (push) Has been cancelled
ci / web (push) Has been cancelled
ci / docs-site (push) Has been cancelled
ci / bench (push) Has been cancelled
ci / rust (push) Has been cancelled
deb / build-publish (push) Has been cancelled
decky / build-publish (push) Has been cancelled
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Has been cancelled
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Has been cancelled
docker / deploy-docs (push) Has been cancelled
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Has been cancelled
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Has been cancelled
release / apple (push) Has been cancelled

The mic uplink handed the host pure digital silence on a multi-channel
interface: AVAudioConverter's N→stereo downmix takes channels 0/1, but a
pro interface puts the mic on ONE higher discrete channel. Fold the input
to a mono bus ourselves instead — pick the mic's channel (or sum all) and
resample that to the encoder's 48 kHz stereo, so the silent 0/1 downmix
never happens.

- New "Microphone channel" setting (macOS): Auto (sum every channel — a
  lone hot mic passes at full level) or pin 1-based channel N. Picker
  appears only for multi-channel devices, driven by the device's input
  channel count.
- Diagnostics that make this class of failure self-naming next session:
  log the actual live capture device + format + fold mode, warn on a
  silent UID fallback, and a one-shot silence tripwire on the EXTRACTED
  signal (WARN on 10 s of zeros, else peak dBFS).
- foldToMono extracted as a pure, unit-tested helper (pin / sum-clamp x
  interleaved / deinterleaved / mono / out-of-range).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-07-04 00:36:24 +02:00
parent 136f6e8f0e
commit 42d1c74663
7 changed files with 339 additions and 16 deletions
@@ -338,6 +338,7 @@ final class SessionModel: ObservableObject {
audio.start(
speakerUID: defaults.string(forKey: DefaultsKey.speakerUID) ?? "",
micUID: defaults.string(forKey: DefaultsKey.micUID) ?? "",
micChannel: defaults.integer(forKey: DefaultsKey.micChannel),
micEnabled: defaults.object(forKey: DefaultsKey.micEnabled) as? Bool ?? true)
self.audio = audio
// Gamepads: forward GamepadManager's active controller as pad 0 and render the
@@ -208,6 +208,17 @@ extension SettingsView {
}
}
.disabled(!micEnabled)
// Multi-channel interfaces only: the mic sits on ONE discrete input, so let the user
// pick it. Auto sums every channel (a lone hot mic still passes at full level).
if micChannelCount > 1 {
Picker("Microphone channel", selection: $micChannel) {
Text("Auto (all channels)").tag(0)
ForEach(1...micChannelCount, id: \.self) { ch in
Text("Channel \(ch)").tag(ch)
}
}
.disabled(!micEnabled)
}
#endif
} header: {
Text("Audio")
@@ -61,8 +61,12 @@ struct SettingsView: View {
#if os(macOS)
@AppStorage(DefaultsKey.speakerUID) var speakerUID = ""
@AppStorage(DefaultsKey.micUID) var micUID = ""
@AppStorage(DefaultsKey.micChannel) var micChannel = 0
@State var outputDevices: [AudioDevice] = []
@State var inputDevices: [AudioDevice] = []
// Input channels of the selected mic drives the "Microphone channel" picker, which only
// appears for a multi-channel interface (>1). 0 until the Audio tab loads it.
@State var micChannelCount = 0
#endif
#if os(iOS)
@@ -115,6 +119,12 @@ struct SettingsView: View {
.onAppear {
outputDevices = AudioDevices.outputs()
inputDevices = AudioDevices.inputs()
micChannelCount = AudioDevices.inputChannelCount(forUID: micUID)
}
.onChange(of: micUID) { _, newUID in
// A different mic different channel count; drop a now-out-of-range pin to Auto.
micChannelCount = AudioDevices.inputChannelCount(forUID: newUID)
if micChannel > micChannelCount { micChannel = 0 }
}
.tabItem { Label("Audio", systemImage: "speaker.wave.2") }
@@ -33,6 +33,49 @@ public enum AudioDevices {
}
}
/// Input channel count of the mic the picker would use the device with this UID, or the
/// system default input when `uid` is empty. 0 when it can't be resolved. Drives the
/// "Microphone channel" picker (only shown for multi-channel interfaces).
public static func inputChannelCount(forUID uid: String) -> Int {
let id = uid.isEmpty ? defaultInputDevice() : deviceID(forUID: uid)
guard let id else { return 0 }
return channelCount(id, scope: kAudioObjectPropertyScopeInput)
}
private static func defaultInputDevice() -> AudioDeviceID? {
var address = AudioObjectPropertyAddress(
mSelector: kAudioHardwarePropertyDefaultInputDevice,
mScope: kAudioObjectPropertyScopeGlobal,
mElement: kAudioObjectPropertyElementMain)
var dev = AudioDeviceID(0)
var size = UInt32(MemoryLayout<AudioDeviceID>.size)
guard AudioObjectGetPropertyData(
AudioObjectID(kAudioObjectSystemObject), &address, 0, nil, &size, &dev) == noErr,
dev != 0
else { return nil }
return dev
}
/// Sum of channels across the device's streams in `scope` (its total input/output channels).
private static func channelCount(
_ id: AudioDeviceID, scope: AudioObjectPropertyScope
) -> Int {
var address = AudioObjectPropertyAddress(
mSelector: kAudioDevicePropertyStreamConfiguration,
mScope: scope,
mElement: kAudioObjectPropertyElementMain)
var size: UInt32 = 0
guard AudioObjectGetPropertyDataSize(id, &address, 0, nil, &size) == noErr, size > 0
else { return 0 }
let raw = UnsafeMutableRawPointer.allocate(
byteCount: Int(size), alignment: MemoryLayout<AudioBufferList>.alignment)
defer { raw.deallocate() }
guard AudioObjectGetPropertyData(id, &address, 0, nil, &size, raw) == noErr else { return 0 }
let abl = UnsafeMutableAudioBufferListPointer(
raw.assumingMemoryBound(to: AudioBufferList.self))
return abl.reduce(0) { $0 + Int($1.mNumberChannels) }
}
private static func all() -> [AudioDeviceID] {
var address = AudioObjectPropertyAddress(
mSelector: kAudioHardwarePropertyDevices,
@@ -62,7 +105,8 @@ public enum AudioDevices {
return AudioObjectGetPropertyDataSize(id, &address, 0, nil, &size) == noErr && size > 0
}
private static func describe(_ id: AudioDeviceID) -> AudioDevice? {
/// UID + human name for a live AudioDeviceID (nil if either property is unreadable).
static func describe(_ id: AudioDeviceID) -> AudioDevice? {
guard let uid = stringProperty(id, kAudioDevicePropertyDeviceUID),
let name = stringProperty(id, kAudioObjectPropertyName)
else { return nil }
@@ -5,9 +5,10 @@
// AVAudioSourceNode pulls from the ring (silence on underrun with re-priming, so a
// network gap costs one dip, not permanent crackle).
//
// mic host: a second AVAudioEngine taps the input device, resamples to 48 kHz
// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet the host
// feeds them into a virtual PipeWire source.
// mic host: a second AVAudioEngine taps the input device, folds it to one mono bus (the
// chosen channel of a multi-channel interface, or a sum of all channels), resamples to 48 kHz
// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet the host feeds them
// into a virtual PipeWire source.
//
// Devices are chosen by UID ("" = system default: the engine is then never pinned to a
// concrete device and follows default-device changes). Two engines, not one a single
@@ -68,10 +69,11 @@ public final class SessionAudio {
/// ASYNCHRONOUS: it activates the AVAudioSession off the main thread, then starts the engines on
/// a later main-queue hop (gated by `!flag.isStopped`) so playback is live shortly after, not
/// on return. The mic may start later still if the permission prompt is pending.
public func start(speakerUID: String, micUID: String, micEnabled: Bool) {
public func start(speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool) {
#if os(macOS)
// No AVAudioSession on macOS start the engines directly (caller's thread, as before).
startEngines(speakerUID: speakerUID, micUID: micUID, micEnabled: micEnabled)
startEngines(
speakerUID: speakerUID, micUID: micUID, micChannel: micChannel, micEnabled: micEnabled)
#else
// Configure + activate the session OFF the main thread (it blocks on the audio server),
// then start the engines back on the main thread once it's active engine routing/format
@@ -81,7 +83,9 @@ public final class SessionAudio {
self.activateAudioSession(micEnabled: micEnabled)
DispatchQueue.main.async { [weak self] in
guard let self, !self.flag.isStopped else { return }
self.startEngines(speakerUID: speakerUID, micUID: micUID, micEnabled: micEnabled)
self.startEngines(
speakerUID: speakerUID, micUID: micUID, micChannel: micChannel,
micEnabled: micEnabled)
}
}
#endif
@@ -115,7 +119,9 @@ public final class SessionAudio {
/// Build + start the playback engine (and the mic uplink when enabled + authorized). Main
/// thread (engine setup); on iOS/tvOS the session is already active by the time this runs.
private func startEngines(speakerUID: String, micUID: String, micEnabled: Bool) {
private func startEngines(
speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool
) {
startPlayback(speakerUID: speakerUID)
#if os(tvOS)
// No app-accessible microphone input on tvOS playback only.
@@ -123,12 +129,12 @@ public final class SessionAudio {
guard micEnabled else { return }
switch AVCaptureDevice.authorizationStatus(for: .audio) {
case .authorized:
startCapture(micUID: micUID)
startCapture(micUID: micUID, micChannel: micChannel)
case .notDetermined:
AVCaptureDevice.requestAccess(for: .audio) { [weak self] granted in
DispatchQueue.main.async {
guard let self, granted, !self.flag.isStopped else { return }
self.startCapture(micUID: micUID)
self.startCapture(micUID: micUID, micChannel: micChannel)
}
}
default:
@@ -280,7 +286,7 @@ public final class SessionAudio {
// MARK: - Mic (mic host)
#if !os(tvOS)
private func startCapture(micUID: String) {
private func startCapture(micUID: String, micChannel: Int) {
let engine = AVAudioEngine()
let input = engine.inputNode
#if os(macOS)
@@ -300,8 +306,63 @@ public final class SessionAudio {
log.error("no usable input device — mic uplink disabled")
return
}
guard let encoder = try? OpusEncoder(),
let resampler = AVAudioConverter(from: inFormat, to: encoder.pcmFormat),
// Multi-channel-interface handling. A pro interface exposes N discrete inputs with the mic
// on ONE of them, but AVAudioConverter's Nstereo downmix takes channels 0/1 dead
// silence when the mic sits higher up (the classic "host receives zeros"). So we fold the
// input to a single mono bus OURSELVES and resample that. micChannel: 0 = Auto (sum every
// channel a lone hot mic passes at full level), n1 pins 1-based input channel n.
let inChannels = Int(inFormat.channelCount)
let pinnedChannel: Int? = {
guard micChannel >= 1 else { return nil }
let idx = micChannel - 1
guard idx < inChannels else {
log.warning(
"mic channel \(micChannel) out of range (device has \(inChannels)) — mixing all")
return nil
}
return idx
}()
let channelPlan = pinnedChannel.map { "channel \($0 + 1)/\(inChannels)" }
?? (inChannels > 1 ? "mix \(inChannels)ch→mono" : "mono")
// Name the device we're ACTUALLY recording from + its format + how we fold it, once per
// session. This single line localizes the whole class of "host receives silence" failures
// that otherwise need a host-side tone injection to pin down: a UID that silently fell back
// to the default, the wrong device being live, or the wrong channel picked.
#if os(macOS)
if let unit = input.audioUnit, let live = Self.currentDevice(of: unit),
let dev = AudioDevices.describe(live) {
if !micUID.isEmpty, dev.uid != micUID {
log.warning("""
mic selection not honored — requested \(micUID) but capturing from \
\(dev.name) [\(dev.uid)]; the device's UID likely changed (replug) — \
reselect it in Settings
""")
}
log.info("""
mic capture: \(dev.name) [\(dev.uid)] — \(Int(inFormat.sampleRate)) Hz, \
\(inChannels) ch, \(channelPlan)
""")
} else {
log.info("""
mic capture: <device unavailable> — \(Int(inFormat.sampleRate)) Hz, \
\(inChannels) ch, \(channelPlan)
""")
}
#else
log.info(
"mic capture: \(Int(inFormat.sampleRate)) Hz, \(inChannels) ch, \(channelPlan)")
#endif
// Encode a single mono bus (folded from `inFormat` in the tap): the resampler goes
// mono@inputSR the encoder's 48 kHz stereo, so it handles both the rate change and the
// monostereo duplication, and the wrong-channel downmix never happens.
guard let monoFormat = AVAudioFormat(
commonFormat: .pcmFormatFloat32, sampleRate: inFormat.sampleRate,
channels: 1, interleaved: false),
let encoder = try? OpusEncoder(),
let resampler = AVAudioConverter(from: monoFormat, to: encoder.pcmFormat),
let chunk = AVAudioPCMBuffer(
pcmFormat: encoder.pcmFormat, frameCapacity: OpusEncoder.framesPerPacket)
else {
@@ -317,11 +378,59 @@ public final class SessionAudio {
let connection = connection
let flag = flag
// Silence tripwire (tap-confined): a "recording" app can be handed pure digital zeros
// a zeroed input-volume slider, a stale TCC grant, a muted device, OR the wrong channel
// picked and everything downstream looks alive while the host gets silence. Track the
// peak of the EXTRACTED mono bus over the first ~10 s (not the raw device a mic present
// on a channel we didn't grab must still read as silence) and emit exactly ONE verdict.
// This is the log line whose absence made the last occurrence take a host-side tone.
let silenceWindow = Int(inFormat.sampleRate * 10)
let deviceLabel = micUID.isEmpty ? "default input" : micUID
var framesInspected = 0
var inputPeak: Float = 0
var levelReported = false
input.installTap(onBus: 0, bufferSize: 2048, format: inFormat) { buffer, _ in
if flag.isStopped { return }
let frames = Int(buffer.frameLength)
guard frames > 0, let src = buffer.floatChannelData,
let mono = AVAudioPCMBuffer(
pcmFormat: monoFormat, frameCapacity: buffer.frameLength),
let dst = mono.floatChannelData?[0]
else { return }
mono.frameLength = buffer.frameLength
// Fold the multi-channel input down to the one mono bus we encode.
Self.foldToMono(
input: src, frames: frames, channels: Int(buffer.format.channelCount),
interleaved: buffer.format.isInterleaved, pinned: pinnedChannel, out: dst)
if !levelReported {
var localPeak: Float = 0
for i in 0..<frames where abs(dst[i]) > localPeak { localPeak = abs(dst[i]) }
if localPeak > inputPeak { inputPeak = localPeak }
framesInspected += frames
if framesInspected >= silenceWindow {
levelReported = true
if inputPeak == 0 {
log.warning("""
mic uplink has been pure digital SILENCE for 10 s (\(deviceLabel), \
\(channelPlan)) — check the input level (System Settings → Sound → \
Input), Privacy & Security → Microphone, and the Microphone channel in \
Settings; the host is receiving zeros
""")
} else {
let dbfs = 20 * log10(inputPeak)
log.info("""
mic uplink OK — peak \(String(format: "%.1f", dbfs)) dBFS over first \
10 s (\(deviceLabel), \(channelPlan))
""")
}
}
}
let ratio = 48_000 / inFormat.sampleRate
let outCapacity = AVAudioFrameCount(
(Double(buffer.frameLength) * ratio).rounded(.up) + 64)
let outCapacity = AVAudioFrameCount((Double(frames) * ratio).rounded(.up) + 64)
guard let staging = AVAudioPCMBuffer(
pcmFormat: encoder.pcmFormat, frameCapacity: outCapacity)
else { return }
@@ -334,7 +443,7 @@ public final class SessionAudio {
}
fed = true
outStatus.pointee = .haveData
return buffer
return mono
}
guard status != .error, let p = staging.floatChannelData?[0] else { return }
fifo.append(contentsOf: UnsafeBufferPointer(
@@ -378,6 +487,42 @@ public final class SessionAudio {
stateLock.unlock()
log.info("mic uplink started (\(micUID.isEmpty ? "default input" : micUID))")
}
/// Fold `channels` of input (`floatChannelData` layout: `interleaved` one buffer strided by
/// channel count; else one buffer per channel) down to a single mono bus in `out` (`frames`
/// long). `pinned` (0-based, must be `< channels`) copies exactly that channel the fix for a
/// mic on one input of a multi-channel interface; `nil` sums every channel, clamped to
/// [-1, 1], so a lone hot channel still passes at full level instead of the silent 0/1 the
/// default Nstereo downmix would grab. Pure + `internal` for unit testing the index math.
static func foldToMono(
input: UnsafePointer<UnsafeMutablePointer<Float>>, frames: Int, channels: Int,
interleaved: Bool, pinned: Int?, out: UnsafeMutablePointer<Float>
) {
if let ch = pinned, ch < channels {
if interleaved {
let d = input[0]
for i in 0..<frames { out[i] = d[i * channels + ch] }
} else {
let d = input[ch]
for i in 0..<frames { out[i] = d[i] }
}
} else if interleaved {
let d = input[0]
for i in 0..<frames {
var s: Float = 0
for c in 0..<channels { s += d[i * channels + c] }
out[i] = max(-1, min(1, s))
}
} else {
let d0 = input[0]
for i in 0..<frames { out[i] = d0[i] }
for c in 1..<channels {
let d = input[c]
for i in 0..<frames { out[i] += d[i] }
}
if channels > 1 { for i in 0..<frames { out[i] = max(-1, min(1, out[i])) } }
}
}
#endif
#if os(macOS)
@@ -387,5 +532,18 @@ public final class SessionAudio {
unit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0,
&dev, UInt32(MemoryLayout<AudioDeviceID>.size)) == noErr
}
/// Read back the AUHAL's live device the definitive "what are we actually capturing
/// from", which catches a selection that succeeded on paper but silently fell back to
/// the system default (a stale/changed UID, a device that vanished between resolve and
/// start). 0 / an error means we couldn't tell.
private static func currentDevice(of unit: AudioUnit) -> AudioDeviceID? {
var dev = AudioDeviceID(0)
var size = UInt32(MemoryLayout<AudioDeviceID>.size)
let status = AudioUnitGetProperty(
unit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0, &dev, &size)
guard status == noErr, dev != 0 else { return nil }
return dev
}
#endif
}
@@ -24,6 +24,12 @@ public enum DefaultsKey {
public static let micEnabled = "punktfunk.micEnabled"
public static let speakerUID = "punktfunk.speakerUID"
public static let micUID = "punktfunk.micUID"
/// macOS: which input channel of the chosen mic device feeds the host. 0 = "Auto" (sum every
/// channel to mono a mic on a single input of a multi-channel interface passes at full
/// level); n1 pins 1-based input channel n. Multi-channel interfaces expose the mic on ONE
/// discrete channel, and the default Nstereo downmix grabs channels 0/1 (silence when the mic
/// is higher up), so we fold to mono ourselves. Only meaningful for multi-channel devices.
public static let micChannel = "punktfunk.micChannel"
public static let presenter = "punktfunk.presenter"
/// Request a 10-bit BT.2020 PQ (HDR10) stream. On by default; only takes effect when the host
/// has HDR content AND this display supports HDR otherwise the stream stays 8-bit SDR.
@@ -0,0 +1,93 @@
// Multi-channel input mono fold (SessionAudio.foldToMono): the fix for a mic on one channel of
// a multi-channel interface. AVAudioConverter's default Nstereo downmix grabs channels 0/1 dead
// silence when the mic sits higher up so we fold ourselves. This pins the fiddly bits (the
// interleaved stride, channel pinning, the sum-clamp) against regressions without needing hardware.
#if !os(tvOS)
import XCTest
@testable import PunktfunkKit
final class AudioChannelFoldTests: XCTestCase {
/// Drive `foldToMono` over channel data expressed as `[[Float]]`, mirroring the two
/// `floatChannelData` layouts:
/// - deinterleaved: each inner array is one channel (all `frames` long).
/// - interleaved: a single inner array already interleaved (c0f0, c1f0, ), with the real
/// channel count passed separately.
private func fold(
_ planes: [[Float]], frames: Int, channels: Int, interleaved: Bool, pinned: Int?
) -> [Float] {
// One C buffer per plane + a table of pointers to them the shape of floatChannelData.
let buffers: [UnsafeMutablePointer<Float>] = planes.map { plane in
let p = UnsafeMutablePointer<Float>.allocate(capacity: plane.count)
for i in 0..<plane.count { p[i] = plane[i] }
return p
}
let table = UnsafeMutablePointer<UnsafeMutablePointer<Float>>.allocate(
capacity: buffers.count)
for (i, b) in buffers.enumerated() { table[i] = b }
let out = UnsafeMutablePointer<Float>.allocate(capacity: frames)
defer {
buffers.forEach { $0.deallocate() }
table.deallocate()
out.deallocate()
}
SessionAudio.foldToMono(
input: table, frames: frames, channels: channels,
interleaved: interleaved, pinned: pinned, out: out)
return (0..<frames).map { out[$0] }
}
// A pinned channel is copied verbatim the exact fix: mic on a HIGH channel, not 0/1.
func testPinsHigherChannelDeinterleaved() {
let result = fold(
[[0, 0, 0], [0, 0, 0], [0.1, 0.2, 0.3], [0, 0, 0]],
frames: 3, channels: 4, interleaved: false, pinned: 2)
XCTAssertEqual(result, [0.1, 0.2, 0.3])
}
// Same signal, interleaved layout: [c0f0,c1f0,c2f0,c3f0, c0f1,]. Guards the `i*ch + c` stride.
func testPinsHigherChannelInterleaved() {
let interleaved: [Float] = [
0, 0, 0.1, 0,
0, 0, 0.2, 0,
0, 0, 0.3, 0,
]
let result = fold([interleaved], frames: 3, channels: 4, interleaved: true, pinned: 2)
XCTAssertEqual(result, [0.1, 0.2, 0.3])
}
// Auto (pinned: nil): a lone hot channel amid silence passes at FULL level, never attenuated.
func testAutoSumsAllChannelsSoALoneMicSurvives() {
let result = fold(
[[0, 0], [0.4, -0.4], [0, 0]],
frames: 2, channels: 3, interleaved: false, pinned: nil)
XCTAssertEqual(result, [0.4, -0.4])
}
// Two simultaneously-hot channels sum past the unit range clamped, never wraps/overflows.
func testAutoSumClampsToUnitRange() {
let result = fold(
[[0.8, -0.8], [0.9, -0.9]],
frames: 2, channels: 2, interleaved: false, pinned: nil)
XCTAssertEqual(result, [1.0, -1.0])
}
// A plain mono device is passed through untouched (no clamp, no attenuation).
func testMonoIsIdentity() {
let result = fold(
[[0.25, -0.5, 0.75]], frames: 3, channels: 1, interleaved: false, pinned: nil)
XCTAssertEqual(result, [0.25, -0.5, 0.75])
}
// Belt-and-suspenders: an out-of-range pin (the tap already guards, but the setting is
// persisted) is ignored by foldToMono's own `ch < channels` guard, which sums instead of
// reading past the buffer.
func testOutOfRangePinFallsBackToSum() {
let result = fold(
[[0, 0], [0.3, 0.3]],
frames: 2, channels: 2, interleaved: false, pinned: 2)
XCTAssertEqual(result, [0.3, 0.3])
}
}
#endif