From 42d1c746635409bd5857448bd7e2cf2df16e5096 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Sat, 4 Jul 2026 00:36:24 +0200 Subject: [PATCH] fix(apple-client/audio): capture the right channel of a multi-channel mic + diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mic uplink handed the host pure digital silence on a multi-channel interface: AVAudioConverter's N→stereo downmix takes channels 0/1, but a pro interface puts the mic on ONE higher discrete channel. Fold the input to a mono bus ourselves instead — pick the mic's channel (or sum all) and resample that to the encoder's 48 kHz stereo, so the silent 0/1 downmix never happens. - New "Microphone channel" setting (macOS): Auto (sum every channel — a lone hot mic passes at full level) or pin 1-based channel N. Picker appears only for multi-channel devices, driven by the device's input channel count. - Diagnostics that make this class of failure self-naming next session: log the actual live capture device + format + fold mode, warn on a silent UID fallback, and a one-shot silence tripwire on the EXTRACTED signal (WARN on 10 s of zeros, else peak dBFS). - foldToMono extracted as a pure, unit-tested helper (pin / sum-clamp x interleaved / deinterleaved / mono / out-of-range). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Session/SessionModel.swift | 1 + .../Settings/SettingsView+Sections.swift | 11 + .../Settings/SettingsView.swift | 10 + .../PunktfunkKit/Audio/AudioDevices.swift | 46 ++++- .../PunktfunkKit/Audio/SessionAudio.swift | 188 ++++++++++++++++-- .../PunktfunkKit/Support/DefaultsKeys.swift | 6 + .../AudioChannelFoldTests.swift | 93 +++++++++ 7 files changed, 339 insertions(+), 16 deletions(-) create mode 100644 clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift diff --git a/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift b/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift index c57c158..599b2f5 100644 --- a/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift +++ b/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift @@ -338,6 +338,7 @@ final class SessionModel: ObservableObject { audio.start( speakerUID: defaults.string(forKey: DefaultsKey.speakerUID) ?? "", micUID: defaults.string(forKey: DefaultsKey.micUID) ?? "", + micChannel: defaults.integer(forKey: DefaultsKey.micChannel), micEnabled: defaults.object(forKey: DefaultsKey.micEnabled) as? Bool ?? true) self.audio = audio // Gamepads: forward GamepadManager's active controller as pad 0 and render the diff --git a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift index 088fa63..baa2ef0 100644 --- a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift +++ b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift @@ -208,6 +208,17 @@ extension SettingsView { } } .disabled(!micEnabled) + // Multi-channel interfaces only: the mic sits on ONE discrete input, so let the user + // pick it. Auto sums every channel (a lone hot mic still passes at full level). 
+ if micChannelCount > 1 { + Picker("Microphone channel", selection: $micChannel) { + Text("Auto (all channels)").tag(0) + ForEach(1...micChannelCount, id: \.self) { ch in + Text("Channel \(ch)").tag(ch) + } + } + .disabled(!micEnabled) + } #endif } header: { Text("Audio") diff --git a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift index 1f34761..d66ed1d 100644 --- a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift +++ b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift @@ -61,8 +61,12 @@ struct SettingsView: View { #if os(macOS) @AppStorage(DefaultsKey.speakerUID) var speakerUID = "" @AppStorage(DefaultsKey.micUID) var micUID = "" + @AppStorage(DefaultsKey.micChannel) var micChannel = 0 @State var outputDevices: [AudioDevice] = [] @State var inputDevices: [AudioDevice] = [] + // Input channels of the selected mic — drives the "Microphone channel" picker, which only + // appears for a multi-channel interface (>1). 0 until the Audio tab loads it. + @State var micChannelCount = 0 #endif #if os(iOS) @@ -115,6 +119,12 @@ struct SettingsView: View { .onAppear { outputDevices = AudioDevices.outputs() inputDevices = AudioDevices.inputs() + micChannelCount = AudioDevices.inputChannelCount(forUID: micUID) + } + .onChange(of: micUID) { _, newUID in + // A different mic → different channel count; drop a now-out-of-range pin to Auto. 
+ micChannelCount = AudioDevices.inputChannelCount(forUID: newUID) + if micChannel > micChannelCount { micChannel = 0 } } .tabItem { Label("Audio", systemImage: "speaker.wave.2") } diff --git a/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift b/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift index f2113af..4584fd9 100644 --- a/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift +++ b/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift @@ -33,6 +33,49 @@ public enum AudioDevices { } } + /// Input channel count of the mic the picker would use — the device with this UID, or the + /// system default input when `uid` is empty. 0 when it can't be resolved. Drives the + /// "Microphone channel" picker (only shown for multi-channel interfaces). + public static func inputChannelCount(forUID uid: String) -> Int { + let id = uid.isEmpty ? defaultInputDevice() : deviceID(forUID: uid) + guard let id else { return 0 } + return channelCount(id, scope: kAudioObjectPropertyScopeInput) + } + + private static func defaultInputDevice() -> AudioDeviceID? { + var address = AudioObjectPropertyAddress( + mSelector: kAudioHardwarePropertyDefaultInputDevice, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain) + var dev = AudioDeviceID(0) + var size = UInt32(MemoryLayout<AudioDeviceID>.size) + guard AudioObjectGetPropertyData( + AudioObjectID(kAudioObjectSystemObject), &address, 0, nil, &size, &dev) == noErr, + dev != 0 + else { return nil } + return dev + } + + /// Sum of channels across the device's streams in `scope` (its total input/output channels).
+ private static func channelCount( + _ id: AudioDeviceID, scope: AudioObjectPropertyScope + ) -> Int { + var address = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyStreamConfiguration, + mScope: scope, + mElement: kAudioObjectPropertyElementMain) + var size: UInt32 = 0 + guard AudioObjectGetPropertyDataSize(id, &address, 0, nil, &size) == noErr, size > 0 + else { return 0 } + let raw = UnsafeMutableRawPointer.allocate( + byteCount: Int(size), alignment: MemoryLayout<AudioBufferList>.alignment) + defer { raw.deallocate() } + guard AudioObjectGetPropertyData(id, &address, 0, nil, &size, raw) == noErr else { return 0 } + let abl = UnsafeMutableAudioBufferListPointer( + raw.assumingMemoryBound(to: AudioBufferList.self)) + return abl.reduce(0) { $0 + Int($1.mNumberChannels) } + } + private static func all() -> [AudioDeviceID] { var address = AudioObjectPropertyAddress( mSelector: kAudioHardwarePropertyDevices, @@ -62,7 +105,8 @@ public enum AudioDevices { return AudioObjectGetPropertyDataSize(id, &address, 0, nil, &size) == noErr && size > 0 } - private static func describe(_ id: AudioDeviceID) -> AudioDevice? { + /// UID + human name for a live AudioDeviceID (nil if either property is unreadable). + static func describe(_ id: AudioDeviceID) -> AudioDevice? { guard let uid = stringProperty(id, kAudioDevicePropertyDeviceUID), let name = stringProperty(id, kAudioObjectPropertyName) else { return nil } diff --git a/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift b/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift index 327e785..00727e3 100644 --- a/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift +++ b/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift @@ -5,9 +5,10 @@ // AVAudioSourceNode pulls from the ring (silence on underrun with re-priming, so a // network gap costs one dip, not permanent crackle).
// -// mic → host: a second AVAudioEngine taps the input device, resamples to 48 kHz -// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet — the host -// feeds them into a virtual PipeWire source. +// mic → host: a second AVAudioEngine taps the input device, folds it to one mono bus (the +// chosen channel of a multi-channel interface, or a sum of all channels), resamples to 48 kHz +// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet — the host feeds them +// into a virtual PipeWire source. // // Devices are chosen by UID ("" = system default: the engine is then never pinned to a // concrete device and follows default-device changes). Two engines, not one — a single @@ -68,10 +69,11 @@ public final class SessionAudio { /// ASYNCHRONOUS: it activates the AVAudioSession off the main thread, then starts the engines on /// a later main-queue hop (gated by `!flag.isStopped`) — so playback is live shortly after, not /// on return. The mic may start later still if the permission prompt is pending. - public func start(speakerUID: String, micUID: String, micEnabled: Bool) { + public func start(speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool) { #if os(macOS) // No AVAudioSession on macOS — start the engines directly (caller's thread, as before). 
- startEngines(speakerUID: speakerUID, micUID: micUID, micEnabled: micEnabled) + startEngines( + speakerUID: speakerUID, micUID: micUID, micChannel: micChannel, micEnabled: micEnabled) #else // Configure + activate the session OFF the main thread (it blocks on the audio server), // then start the engines back on the main thread once it's active — engine routing/format @@ -81,7 +83,9 @@ public final class SessionAudio { self.activateAudioSession(micEnabled: micEnabled) DispatchQueue.main.async { [weak self] in guard let self, !self.flag.isStopped else { return } - self.startEngines(speakerUID: speakerUID, micUID: micUID, micEnabled: micEnabled) + self.startEngines( + speakerUID: speakerUID, micUID: micUID, micChannel: micChannel, + micEnabled: micEnabled) } } #endif @@ -115,7 +119,9 @@ public final class SessionAudio { /// Build + start the playback engine (and the mic uplink when enabled + authorized). Main /// thread (engine setup); on iOS/tvOS the session is already active by the time this runs. - private func startEngines(speakerUID: String, micUID: String, micEnabled: Bool) { + private func startEngines( + speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool + ) { startPlayback(speakerUID: speakerUID) #if os(tvOS) // No app-accessible microphone input on tvOS — playback only. 
@@ -123,12 +129,12 @@ public final class SessionAudio { guard micEnabled else { return } switch AVCaptureDevice.authorizationStatus(for: .audio) { case .authorized: - startCapture(micUID: micUID) + startCapture(micUID: micUID, micChannel: micChannel) case .notDetermined: AVCaptureDevice.requestAccess(for: .audio) { [weak self] granted in DispatchQueue.main.async { guard let self, granted, !self.flag.isStopped else { return } - self.startCapture(micUID: micUID) + self.startCapture(micUID: micUID, micChannel: micChannel) } } default: @@ -280,7 +286,7 @@ public final class SessionAudio { // MARK: - Mic (mic → host) #if !os(tvOS) - private func startCapture(micUID: String) { + private func startCapture(micUID: String, micChannel: Int) { let engine = AVAudioEngine() let input = engine.inputNode #if os(macOS) @@ -300,8 +306,63 @@ public final class SessionAudio { log.error("no usable input device — mic uplink disabled") return } - guard let encoder = try? OpusEncoder(), - let resampler = AVAudioConverter(from: inFormat, to: encoder.pcmFormat), + + // Multi-channel-interface handling. A pro interface exposes N discrete inputs with the mic + // on ONE of them, but AVAudioConverter's N→stereo downmix takes channels 0/1 — dead + // silence when the mic sits higher up (the classic "host receives zeros"). So we fold the + // input to a single mono bus OURSELVES and resample that. micChannel: 0 = Auto (sum every + // channel — a lone hot mic passes at full level), n≥1 pins 1-based input channel n. + let inChannels = Int(inFormat.channelCount) + let pinnedChannel: Int? = { + guard micChannel >= 1 else { return nil } + let idx = micChannel - 1 + guard idx < inChannels else { + log.warning( + "mic channel \(micChannel) out of range (device has \(inChannels)) — mixing all") + return nil + } + return idx + }() + let channelPlan = pinnedChannel.map { "channel \($0 + 1)/\(inChannels)" } + ?? (inChannels > 1 ? 
"mix \(inChannels)ch→mono" : "mono") + + // Name the device we're ACTUALLY recording from + its format + how we fold it, once per + // session. This single line localizes the whole class of "host receives silence" failures + // that otherwise need a host-side tone injection to pin down: a UID that silently fell back + // to the default, the wrong device being live, or the wrong channel picked. + #if os(macOS) + if let unit = input.audioUnit, let live = Self.currentDevice(of: unit), + let dev = AudioDevices.describe(live) { + if !micUID.isEmpty, dev.uid != micUID { + log.warning(""" + mic selection not honored — requested \(micUID) but capturing from \ + \(dev.name) [\(dev.uid)]; the device's UID likely changed (replug) — \ + reselect it in Settings + """) + } + log.info(""" + mic capture: \(dev.name) [\(dev.uid)] — \(Int(inFormat.sampleRate)) Hz, \ + \(inChannels) ch, \(channelPlan) + """) + } else { + log.info(""" + mic capture: — \(Int(inFormat.sampleRate)) Hz, \ + \(inChannels) ch, \(channelPlan) + """) + } + #else + log.info( + "mic capture: \(Int(inFormat.sampleRate)) Hz, \(inChannels) ch, \(channelPlan)") + #endif + + // Encode a single mono bus (folded from `inFormat` in the tap): the resampler goes + // mono@inputSR → the encoder's 48 kHz stereo, so it handles both the rate change and the + // mono→stereo duplication, and the wrong-channel downmix never happens. + guard let monoFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, sampleRate: inFormat.sampleRate, + channels: 1, interleaved: false), + let encoder = try? 
OpusEncoder(), + let resampler = AVAudioConverter(from: monoFormat, to: encoder.pcmFormat), let chunk = AVAudioPCMBuffer( pcmFormat: encoder.pcmFormat, frameCapacity: OpusEncoder.framesPerPacket) else { @@ -317,11 +378,59 @@ public final class SessionAudio { let connection = connection let flag = flag + // Silence tripwire (tap-confined): a "recording" app can be handed pure digital zeros — + // a zeroed input-volume slider, a stale TCC grant, a muted device, OR the wrong channel + // picked — and everything downstream looks alive while the host gets silence. Track the + // peak of the EXTRACTED mono bus over the first ~10 s (not the raw device — a mic present + // on a channel we didn't grab must still read as silence) and emit exactly ONE verdict. + // This is the log line whose absence made the last occurrence take a host-side tone. + let silenceWindow = Int(inFormat.sampleRate * 10) + let deviceLabel = micUID.isEmpty ? "default input" : micUID + var framesInspected = 0 + var inputPeak: Float = 0 + var levelReported = false + input.installTap(onBus: 0, bufferSize: 2048, format: inFormat) { buffer, _ in if flag.isStopped { return } + let frames = Int(buffer.frameLength) + guard frames > 0, let src = buffer.floatChannelData, + let mono = AVAudioPCMBuffer( + pcmFormat: monoFormat, frameCapacity: buffer.frameLength), + let dst = mono.floatChannelData?[0] + else { return } + mono.frameLength = buffer.frameLength + + // Fold the multi-channel input down to the one mono bus we encode. + Self.foldToMono( + input: src, frames: frames, channels: Int(buffer.format.channelCount), + interleaved: buffer.format.isInterleaved, pinned: pinnedChannel, out: dst) + + if !levelReported { + var localPeak: Float = 0 + for i in 0.. 
localPeak { localPeak = abs(dst[i]) } + if localPeak > inputPeak { inputPeak = localPeak } + framesInspected += frames + if framesInspected >= silenceWindow { + levelReported = true + if inputPeak == 0 { + log.warning(""" + mic uplink has been pure digital SILENCE for 10 s (\(deviceLabel), \ + \(channelPlan)) — check the input level (System Settings → Sound → \ + Input), Privacy & Security → Microphone, and the Microphone channel in \ + Settings; the host is receiving zeros + """) + } else { + let dbfs = 20 * log10(inputPeak) + log.info(""" + mic uplink OK — peak \(String(format: "%.1f", dbfs)) dBFS over first \ + 10 s (\(deviceLabel), \(channelPlan)) + """) + } + } + } + let ratio = 48_000 / inFormat.sampleRate - let outCapacity = AVAudioFrameCount( - (Double(buffer.frameLength) * ratio).rounded(.up) + 64) + let outCapacity = AVAudioFrameCount((Double(frames) * ratio).rounded(.up) + 64) guard let staging = AVAudioPCMBuffer( pcmFormat: encoder.pcmFormat, frameCapacity: outCapacity) else { return } @@ -334,7 +443,7 @@ public final class SessionAudio { } fed = true outStatus.pointee = .haveData - return buffer + return mono } guard status != .error, let p = staging.floatChannelData?[0] else { return } fifo.append(contentsOf: UnsafeBufferPointer( @@ -378,6 +487,42 @@ public final class SessionAudio { stateLock.unlock() log.info("mic uplink started (\(micUID.isEmpty ? "default input" : micUID))") } + + /// Fold `channels` of input (`floatChannelData` layout: `interleaved` → one buffer strided by + /// channel count; else one buffer per channel) down to a single mono bus in `out` (`frames` + /// long). `pinned` (0-based, must be `< channels`) copies exactly that channel — the fix for a + /// mic on one input of a multi-channel interface; `nil` sums every channel, clamped to + /// [-1, 1], so a lone hot channel still passes at full level instead of the silent 0/1 the + /// default N→stereo downmix would grab. Pure + `internal` for unit testing the index math. 
+ static func foldToMono( + input: UnsafePointer>, frames: Int, channels: Int, + interleaved: Bool, pinned: Int?, out: UnsafeMutablePointer + ) { + if let ch = pinned, ch < channels { + if interleaved { + let d = input[0] + for i in 0.. 1 { for i in 0...size)) == noErr } + + /// Read back the AUHAL's live device — the definitive "what are we actually capturing + /// from", which catches a selection that succeeded on paper but silently fell back to + /// the system default (a stale/changed UID, a device that vanished between resolve and + /// start). 0 / an error means we couldn't tell. + private static func currentDevice(of unit: AudioUnit) -> AudioDeviceID? { + var dev = AudioDeviceID(0) + var size = UInt32(MemoryLayout.size) + let status = AudioUnitGetProperty( + unit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0, &dev, &size) + guard status == noErr, dev != 0 else { return nil } + return dev + } #endif } diff --git a/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift b/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift index 7a6ee1c..94656ae 100644 --- a/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift +++ b/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift @@ -24,6 +24,12 @@ public enum DefaultsKey { public static let micEnabled = "punktfunk.micEnabled" public static let speakerUID = "punktfunk.speakerUID" public static let micUID = "punktfunk.micUID" + /// macOS: which input channel of the chosen mic device feeds the host. 0 = "Auto" (sum every + /// channel to mono — a mic on a single input of a multi-channel interface passes at full + /// level); n≥1 pins 1-based input channel n. Multi-channel interfaces expose the mic on ONE + /// discrete channel, and the default N→stereo downmix grabs channels 0/1 (silence when the mic + /// is higher up), so we fold to mono ourselves. Only meaningful for multi-channel devices. 
+ public static let micChannel = "punktfunk.micChannel" public static let presenter = "punktfunk.presenter" /// Request a 10-bit BT.2020 PQ (HDR10) stream. On by default; only takes effect when the host /// has HDR content AND this display supports HDR — otherwise the stream stays 8-bit SDR. diff --git a/clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift b/clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift new file mode 100644 index 0000000..ec8a21f --- /dev/null +++ b/clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift @@ -0,0 +1,93 @@ +// Multi-channel input → mono fold (SessionAudio.foldToMono): the fix for a mic on one channel of +// a multi-channel interface. AVAudioConverter's default N→stereo downmix grabs channels 0/1 — dead +// silence when the mic sits higher up — so we fold ourselves. This pins the fiddly bits (the +// interleaved stride, channel pinning, the sum-clamp) against regressions without needing hardware. + +#if !os(tvOS) +import XCTest + +@testable import PunktfunkKit + +final class AudioChannelFoldTests: XCTestCase { + /// Drive `foldToMono` over channel data expressed as `[[Float]]`, mirroring the two + /// `floatChannelData` layouts: + /// - deinterleaved: each inner array is one channel (all `frames` long). + /// - interleaved: a single inner array already interleaved (c0f0, c1f0, …), with the real + /// channel count passed separately. + private func fold( + _ planes: [[Float]], frames: Int, channels: Int, interleaved: Bool, pinned: Int? + ) -> [Float] { + // One C buffer per plane + a table of pointers to them — the shape of floatChannelData. 
+ let buffers: [UnsafeMutablePointer] = planes.map { plane in + let p = UnsafeMutablePointer.allocate(capacity: plane.count) + for i in 0..>.allocate( + capacity: buffers.count) + for (i, b) in buffers.enumerated() { table[i] = b } + let out = UnsafeMutablePointer.allocate(capacity: frames) + defer { + buffers.forEach { $0.deallocate() } + table.deallocate() + out.deallocate() + } + SessionAudio.foldToMono( + input: table, frames: frames, channels: channels, + interleaved: interleaved, pinned: pinned, out: out) + return (0..