From 00acf5e44edcf277bf6f8b33194aa7bb96081ae3 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Fri, 3 Jul 2026 22:31:18 +0000 Subject: [PATCH 1/3] =?UTF-8?q?fix(host/audio):=20WASAPI=20virtual=20mic?= =?UTF-8?q?=20=E2=80=94=20port=20the=20priming=20jitter=20buffer=20(crackl?= =?UTF-8?q?ing=20fix)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mac → Windows mic passthrough crackled heavily while the identical stream was clean on the Linux host. Cause: clients push mic audio in BURSTS on their own clock (the Mac input tap yields ~two 20 ms Opus packets every ~42 ms) while the WASAPI render loop pulled a block every ~10 ms device period and greedily drained whatever was queued, padding the rest with zeros — the queue sat near-empty and most periods inserted mid-stream silence. The Linux backend has absorbed this since day one with its priming jitter buffer; the WASAPI loop had none. Port the same semantics: emit silence until ~48 ms is buffered (covers the worst inter-burst gap), then play from the cushion (zero-filling only a momentary shortfall), re-prime only after a genuine full drain (client went quiet). Queue cap raised 80 → 120 ms for burst headroom; steady-state added latency ≈ the 48 ms cushion. Diagnosed live on .173: probe tone recording from CABLE Output proved the endpoint wiring, then the burst-vs-period math explained the crackle. Build-verified on Windows; on-glass listen pending. Co-Authored-By: Claude Fable 5 --- .../src/audio/windows/wasapi_mic.rs | 48 +++++++++++++++---- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/crates/punktfunk-host/src/audio/windows/wasapi_mic.rs b/crates/punktfunk-host/src/audio/windows/wasapi_mic.rs index cfb8f8b..1bf825f 100644 --- a/crates/punktfunk-host/src/audio/windows/wasapi_mic.rs +++ b/crates/punktfunk-host/src/audio/windows/wasapi_mic.rs @@ -19,10 +19,11 @@ //! returns `false` and the pump reopens (re-planning, so endpoint churn re-resolves). Before this //! existed, the first device change silently killed mic passthrough for the rest of the host's life. //! -//! `push` enqueues decoded interleaved-f32 PCM into a bounded ring (drop-oldest beyond ~80 ms so mic -//! latency stays bounded); a dedicated COM-apartment thread renders it event-driven, filling silence -//! when the client isn't talking. WASAPI objects are `!Send`, so they live entirely on that thread -//! (mirrors `WasapiLoopbackCapturer`). +//! `push` enqueues decoded interleaved-f32 PCM into a bounded ring (drop-oldest beyond ~120 ms so +//! mic latency stays bounded); a dedicated COM-apartment thread renders it event-driven through an +//! adaptive jitter buffer (prime → hold → re-prime, see the render loop — clients arrive in bursts, +//! the device pulls per-period), filling silence when the client isn't talking. WASAPI objects are +//! `!Send`, so they live entirely on that thread (mirrors `WasapiLoopbackCapturer`). // Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it. #![deny(clippy::undocumented_unsafe_blocks)] @@ -40,8 +41,17 @@ use wasapi::{Direction, SampleType, StreamMode, WaveFormat}; const CHANNELS: u32 = 2; /// 48 kHz stereo f32: 2 channels * 4 bytes. const BLOCK_ALIGN: usize = 2 * 4; -/// Bound the inject queue at ~80 ms so the passed-through mic stays low-latency (drop oldest beyond). -const MAX_QUEUE_BYTES: usize = (SAMPLE_RATE as usize * 80 / 1000) * BLOCK_ALIGN; +/// Jitter-buffer priming depth (~48 ms): the render loop emits pure silence until this much PCM +/// is queued, then plays from the cushion. Clients deliver mic audio in BURSTS (the Mac client's +/// input tap yields ~two 20 ms Opus packets every ~42 ms) while WASAPI pulls a small block every +/// device period (~10 ms) — with no cushion the queue sits near-empty and most periods insert +/// mid-stream silence: the "crackling mic" (heard live, Mac → Windows host 2026-07-03; the Linux +/// backend's process callback primes the same way and the identical stream was clean there). The +/// depth must cover the worst inter-burst gap (~42 ms), so ~48 ms with re-prime on a full drain. +const PRIME_BYTES: usize = (SAMPLE_RATE as usize * 48 / 1000) * BLOCK_ALIGN; +/// Bound the inject queue at ~120 ms so the passed-through mic stays low-latency (drop oldest +/// beyond): the priming cushion (~48 ms) plus arrival-burst headroom. +const MAX_QUEUE_BYTES: usize = (SAMPLE_RATE as usize * 120 / 1000) * BLOCK_ALIGN; pub struct WasapiVirtualMic { queue: Arc>>, @@ -299,7 +309,17 @@ fn render_thread( // Any error below (endpoint invalidated/removed, engine restart) propagates out of the loop, // ending the thread — the `alive` flag flips in the spawn wrapper and the pump reopens. + // + // Adaptive jitter buffer (mirrors the Linux backend's process callback): clients push mic + // audio in bursts on their own clock while the device pulls a block every period from an + // independent clock, so a greedy per-period drain leaves the queue near-empty and pads most + // periods with mid-stream silence — audible as constant crackling. Instead: emit silence + // until [`PRIME_BYTES`] is buffered, then play from the cushion (zero-filling only a + // momentary shortfall), and re-prime only after a genuine FULL drain (the client went quiet — + // between talk spurts the cushion rebuilds, and [`VirtualMic::discard`] resets it across + // session gaps). let mut buf: Vec = Vec::new(); + let mut primed = false; while !stop.load(Ordering::Relaxed) { // The device signals when it wants more data; finite timeout keeps `stop` responsive. if h_event.wait_for_event(100).is_err() { @@ -315,13 +335,21 @@ fn render_thread( if buf.len() < need { buf.resize(need, 0); } - // Silence base; overwrite with queued mic PCM (zero-pad the tail when the client is quiet). + // Silence base; overwrite with queued mic PCM once the cushion is primed. buf[..need].fill(0); { let mut q = queue.lock().unwrap(); - let n = q.len().min(need); - for (i, b) in q.drain(..n).enumerate() { - buf[i] = b; + if !primed && q.len() >= PRIME_BYTES { + primed = true; + } + if primed { + let n = q.len().min(need); + for (i, b) in q.drain(..n).enumerate() { + buf[i] = b; + } + if q.is_empty() { + primed = false; // fully drained — re-prime before producing again + } } } render_client From 136f6e8f0eb402b8e3ae0c4be7d890b26219ccff Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Fri, 3 Jul 2026 22:35:49 +0000 Subject: [PATCH 2/3] =?UTF-8?q?feat(probe):=20--mic-burst=20=E2=80=94=20re?= =?UTF-8?q?al-client=20mic=20pacing=20for=20jitter-buffer=20regression=20t?= =?UTF-8?q?ests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The steady 5 ms mic-test cadence never trips host-side buffering bugs: the WASAPI crackle (fixed in the previous commit) only reproduced under a real client's bursty input tap. --mic-burst paces the tone the same way (two 20 ms Opus packets every 40 ms), so recording the host mic and counting silence gaps regression-tests the jitter buffer headlessly. Validated against the fixed Windows host on the lab box: 15 s of bursty tone, zero mid-stream gaps >=3 ms (gaps confined to the first 40 ms priming window). Co-Authored-By: Claude Fable 5 --- clients/probe/src/main.rs | 62 ++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/clients/probe/src/main.rs b/clients/probe/src/main.rs index 9b39e93..7bf17ff 100644 --- a/clients/probe/src/main.rs +++ b/clients/probe/src/main.rs @@ -41,7 +41,7 @@ //! Usage: `punktfunk-probe [--connect HOST:PORT] [--mode WxHxFPS] [--remode WxHxFPS:SECS] //! [--out FILE] [--bitrate KBPS] [--codec auto|h264|hevc|av1] [--audio-channels 2|6|8] //! [--launch APP] [--name NAME] [--speed-test KBPS:MS] -//! [--input-test | --mic-test | --touch-test | --rich-input-test] +//! [--input-test | --mic-test [--mic-burst] | --touch-test | --rich-input-test] //! [--pin HEX | --pair PIN] [--compositor NAME] [--gamepad NAME] | --discover [SECS]` //! Env: `PUNKTFUNK_CLIENT_10BIT=1` / `PUNKTFUNK_CLIENT_444=1` advertise the 10-bit / 4:4:4 caps. @@ -65,6 +65,9 @@ struct Args { input_test: bool, /// `--mic-test` — stream a synthetic 440 Hz tone as the mic uplink (proves the mic path). mic_test: bool, + /// `--mic-burst` — pace the mic-test like a real client's input tap (2× 20 ms per 40 ms), + /// the arrival shape that exercises host-side jitter buffering. + mic_burst: bool, /// `--touch-test` — drag a synthetic finger in a circle (proves the touch path). touch_test: bool, /// `--rich-input-test` — drive the DualSense touchpad + motion over 0xCC (host needs @@ -205,6 +208,7 @@ fn parse_args() -> Args { out: get("--out").map(String::from), input_test: argv.iter().any(|a| a == "--input-test"), mic_test: argv.iter().any(|a| a == "--mic-test"), + mic_burst: argv.iter().any(|a| a == "--mic-burst"), touch_test: argv.iter().any(|a| a == "--touch-test"), rich_input_test: argv.iter().any(|a| a == "--rich-input-test"), pin, @@ -740,9 +744,16 @@ async fn session(args: Args) -> Result<()> { }); } - // Mic plane: stream a synthetic 440 Hz tone as the mic uplink (0xCB), Opus-encoded 5 ms - // stereo frames — proves client→host mic passthrough end to end without a real microphone - // (the host decodes it into its virtual PipeWire source; record that source to hear the tone). + // Mic plane: stream a synthetic 440 Hz tone as the mic uplink (0xCB) — proves client→host + // mic passthrough end to end without a real microphone (the host decodes it into its virtual + // source; record that source to hear the tone). Two pacing modes: + // default — Opus 5 ms frames on a steady 5 ms tick (smooth arrival). + // --mic-burst — two 20 ms Opus frames back-to-back every 40 ms, replicating a real + // client's input-tap cadence (the Mac client's AVAudioEngine tap yields + // ~2048-frame buffers → two packets per ~42 ms). This is the arrival + // pattern that exposed the Windows host's missing jitter buffer (constant + // crackle, 2026-07-03): a steady 5 ms stream never trips it. Record the + // host mic and count silence gaps to regression-test host-side buffering. #[cfg(not(target_os = "linux"))] if args.mic_test { tracing::warn!("--mic-test requires Linux (libopus) — skipped"); @@ -750,6 +761,7 @@ async fn session(args: Args) -> Result<()> { #[cfg(target_os = "linux")] if args.mic_test { let conn2 = conn.clone(); + let burst = args.mic_burst; tokio::spawn(async move { let mut enc = match opus::Encoder::new(48_000, opus::Channels::Stereo, opus::Application::Voip) { @@ -760,28 +772,38 @@ async fn session(args: Args) -> Result<()> { } }; let _ = enc.set_bitrate(opus::Bitrate::Bits(64_000)); - tracing::info!("mic-test: streaming a 440 Hz tone as the mic uplink"); + // Frame size + tick per pacing mode; `per_tick` packets are sent back-to-back. + let (frame, tick_ms, per_tick) = if burst { + (960usize, 40u64, 2u32) // 2× 20 ms every 40 ms — the bursty real-client shape + } else { + (240usize, 5u64, 1u32) // 5 ms frames on a smooth tick + }; + tracing::info!(burst, "mic-test: streaming a 440 Hz tone as the mic uplink"); let mut phase = 0.0f32; let step = 2.0 * std::f32::consts::PI * 440.0 / 48_000.0; - let mut pcm = [0f32; 240 * 2]; // 5 ms stereo + let mut pcm = vec![0f32; frame * 2]; let mut out = [0u8; 4000]; - let mut interval = tokio::time::interval(std::time::Duration::from_millis(5)); - for seq in 0u32.. { + let mut interval = tokio::time::interval(std::time::Duration::from_millis(tick_ms)); + let mut seq = 0u32; + 'stream: loop { interval.tick().await; - for f in 0..240 { - let s = (phase.sin()) * 0.25; - phase += step; - if phase > std::f32::consts::PI * 2.0 { - phase -= std::f32::consts::PI * 2.0; + for _ in 0..per_tick { + for f in 0..frame { + let s = (phase.sin()) * 0.25; + phase += step; + if phase > std::f32::consts::PI * 2.0 { + phase -= std::f32::consts::PI * 2.0; + } + pcm[f * 2] = s; + pcm[f * 2 + 1] = s; } - pcm[f * 2] = s; - pcm[f * 2 + 1] = s; - } - if let Ok(n) = enc.encode_float(&pcm, &mut out) { - let d = punktfunk_core::quic::encode_mic_datagram(seq, now_ns(), &out[..n]); - if conn2.send_datagram(d.into()).is_err() { - break; + if let Ok(n) = enc.encode_float(&pcm, &mut out) { + let d = punktfunk_core::quic::encode_mic_datagram(seq, now_ns(), &out[..n]); + if conn2.send_datagram(d.into()).is_err() { + break 'stream; + } } + seq = seq.wrapping_add(1); } } tracing::info!("mic-test: done"); From 42d1c746635409bd5857448bd7e2cf2df16e5096 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Sat, 4 Jul 2026 00:36:24 +0200 Subject: [PATCH 3/3] fix(apple-client/audio): capture the right channel of a multi-channel mic + diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mic uplink handed the host pure digital silence on a multi-channel interface: AVAudioConverter's N→stereo downmix takes channels 0/1, but a pro interface puts the mic on ONE higher discrete channel. Fold the input to a mono bus ourselves instead — pick the mic's channel (or sum all) and resample that to the encoder's 48 kHz stereo, so the silent 0/1 downmix never happens. - New "Microphone channel" setting (macOS): Auto (sum every channel — a lone hot mic passes at full level) or pin 1-based channel N. Picker appears only for multi-channel devices, driven by the device's input channel count. - Diagnostics that make this class of failure self-naming next session: log the actual live capture device + format + fold mode, warn on a silent UID fallback, and a one-shot silence tripwire on the EXTRACTED signal (WARN on 10 s of zeros, else peak dBFS). - foldToMono extracted as a pure, unit-tested helper (pin / sum-clamp x interleaved / deinterleaved / mono / out-of-range). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Session/SessionModel.swift | 1 + .../Settings/SettingsView+Sections.swift | 11 + .../Settings/SettingsView.swift | 10 + .../PunktfunkKit/Audio/AudioDevices.swift | 46 ++++- .../PunktfunkKit/Audio/SessionAudio.swift | 188 ++++++++++++++++-- .../PunktfunkKit/Support/DefaultsKeys.swift | 6 + .../AudioChannelFoldTests.swift | 93 +++++++++ 7 files changed, 339 insertions(+), 16 deletions(-) create mode 100644 clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift diff --git a/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift b/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift index c57c158..599b2f5 100644 --- a/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift +++ b/clients/apple/Sources/PunktfunkClient/Session/SessionModel.swift @@ -338,6 +338,7 @@ final class SessionModel: ObservableObject { audio.start( speakerUID: defaults.string(forKey: DefaultsKey.speakerUID) ?? "", micUID: defaults.string(forKey: DefaultsKey.micUID) ?? "", + micChannel: defaults.integer(forKey: DefaultsKey.micChannel), micEnabled: defaults.object(forKey: DefaultsKey.micEnabled) as? Bool ?? true) self.audio = audio // Gamepads: forward GamepadManager's active controller as pad 0 and render the diff --git a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift index 088fa63..baa2ef0 100644 --- a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift +++ b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView+Sections.swift @@ -208,6 +208,17 @@ extension SettingsView { } } .disabled(!micEnabled) + // Multi-channel interfaces only: the mic sits on ONE discrete input, so let the user + // pick it. Auto sums every channel (a lone hot mic still passes at full level). + if micChannelCount > 1 { + Picker("Microphone channel", selection: $micChannel) { + Text("Auto (all channels)").tag(0) + ForEach(1...micChannelCount, id: \.self) { ch in + Text("Channel \(ch)").tag(ch) + } + } + .disabled(!micEnabled) + } #endif } header: { Text("Audio") diff --git a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift index 1f34761..d66ed1d 100644 --- a/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift +++ b/clients/apple/Sources/PunktfunkClient/Settings/SettingsView.swift @@ -61,8 +61,12 @@ struct SettingsView: View { #if os(macOS) @AppStorage(DefaultsKey.speakerUID) var speakerUID = "" @AppStorage(DefaultsKey.micUID) var micUID = "" + @AppStorage(DefaultsKey.micChannel) var micChannel = 0 @State var outputDevices: [AudioDevice] = [] @State var inputDevices: [AudioDevice] = [] + // Input channels of the selected mic — drives the "Microphone channel" picker, which only + // appears for a multi-channel interface (>1). 0 until the Audio tab loads it. + @State var micChannelCount = 0 #endif #if os(iOS) @@ -115,6 +119,12 @@ struct SettingsView: View { .onAppear { outputDevices = AudioDevices.outputs() inputDevices = AudioDevices.inputs() + micChannelCount = AudioDevices.inputChannelCount(forUID: micUID) + } + .onChange(of: micUID) { _, newUID in + // A different mic → different channel count; drop a now-out-of-range pin to Auto. + micChannelCount = AudioDevices.inputChannelCount(forUID: newUID) + if micChannel > micChannelCount { micChannel = 0 } } .tabItem { Label("Audio", systemImage: "speaker.wave.2") } diff --git a/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift b/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift index f2113af..4584fd9 100644 --- a/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift +++ b/clients/apple/Sources/PunktfunkKit/Audio/AudioDevices.swift @@ -33,6 +33,49 @@ public enum AudioDevices { } } + /// Input channel count of the mic the picker would use — the device with this UID, or the + /// system default input when `uid` is empty. 0 when it can't be resolved. Drives the + /// "Microphone channel" picker (only shown for multi-channel interfaces). + public static func inputChannelCount(forUID uid: String) -> Int { + let id = uid.isEmpty ? defaultInputDevice() : deviceID(forUID: uid) + guard let id else { return 0 } + return channelCount(id, scope: kAudioObjectPropertyScopeInput) + } + + private static func defaultInputDevice() -> AudioDeviceID? { + var address = AudioObjectPropertyAddress( + mSelector: kAudioHardwarePropertyDefaultInputDevice, + mScope: kAudioObjectPropertyScopeGlobal, + mElement: kAudioObjectPropertyElementMain) + var dev = AudioDeviceID(0) + var size = UInt32(MemoryLayout.size) + guard AudioObjectGetPropertyData( + AudioObjectID(kAudioObjectSystemObject), &address, 0, nil, &size, &dev) == noErr, + dev != 0 + else { return nil } + return dev + } + + /// Sum of channels across the device's streams in `scope` (its total input/output channels). + private static func channelCount( + _ id: AudioDeviceID, scope: AudioObjectPropertyScope + ) -> Int { + var address = AudioObjectPropertyAddress( + mSelector: kAudioDevicePropertyStreamConfiguration, + mScope: scope, + mElement: kAudioObjectPropertyElementMain) + var size: UInt32 = 0 + guard AudioObjectGetPropertyDataSize(id, &address, 0, nil, &size) == noErr, size > 0 + else { return 0 } + let raw = UnsafeMutableRawPointer.allocate( + byteCount: Int(size), alignment: MemoryLayout.alignment) + defer { raw.deallocate() } + guard AudioObjectGetPropertyData(id, &address, 0, nil, &size, raw) == noErr else { return 0 } + let abl = UnsafeMutableAudioBufferListPointer( + raw.assumingMemoryBound(to: AudioBufferList.self)) + return abl.reduce(0) { $0 + Int($1.mNumberChannels) } + } + private static func all() -> [AudioDeviceID] { var address = AudioObjectPropertyAddress( mSelector: kAudioHardwarePropertyDevices, @@ -62,7 +105,8 @@ public enum AudioDevices { return AudioObjectGetPropertyDataSize(id, &address, 0, nil, &size) == noErr && size > 0 } - private static func describe(_ id: AudioDeviceID) -> AudioDevice? { + /// UID + human name for a live AudioDeviceID (nil if either property is unreadable). + static func describe(_ id: AudioDeviceID) -> AudioDevice? { guard let uid = stringProperty(id, kAudioDevicePropertyDeviceUID), let name = stringProperty(id, kAudioObjectPropertyName) else { return nil } diff --git a/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift b/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift index 327e785..00727e3 100644 --- a/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift +++ b/clients/apple/Sources/PunktfunkKit/Audio/SessionAudio.swift @@ -5,9 +5,10 @@ // AVAudioSourceNode pulls from the ring (silence on underrun with re-priming, so a // network gap costs one dip, not permanent crackle). // -// mic → host: a second AVAudioEngine taps the input device, resamples to 48 kHz -// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet — the host -// feeds them into a virtual PipeWire source. +// mic → host: a second AVAudioEngine taps the input device, folds it to one mono bus (the +// chosen channel of a multi-channel interface, or a sum of all channels), resamples to 48 kHz +// stereo, slices 20 ms chunks, Opus-encodes, and sendMic()s each packet — the host feeds them +// into a virtual PipeWire source. // // Devices are chosen by UID ("" = system default: the engine is then never pinned to a // concrete device and follows default-device changes). Two engines, not one — a single @@ -68,10 +69,11 @@ public final class SessionAudio { /// ASYNCHRONOUS: it activates the AVAudioSession off the main thread, then starts the engines on /// a later main-queue hop (gated by `!flag.isStopped`) — so playback is live shortly after, not /// on return. The mic may start later still if the permission prompt is pending. - public func start(speakerUID: String, micUID: String, micEnabled: Bool) { + public func start(speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool) { #if os(macOS) // No AVAudioSession on macOS — start the engines directly (caller's thread, as before). - startEngines(speakerUID: speakerUID, micUID: micUID, micEnabled: micEnabled) + startEngines( + speakerUID: speakerUID, micUID: micUID, micChannel: micChannel, micEnabled: micEnabled) #else // Configure + activate the session OFF the main thread (it blocks on the audio server), // then start the engines back on the main thread once it's active — engine routing/format @@ -81,7 +83,9 @@ public final class SessionAudio { self.activateAudioSession(micEnabled: micEnabled) DispatchQueue.main.async { [weak self] in guard let self, !self.flag.isStopped else { return } - self.startEngines(speakerUID: speakerUID, micUID: micUID, micEnabled: micEnabled) + self.startEngines( + speakerUID: speakerUID, micUID: micUID, micChannel: micChannel, + micEnabled: micEnabled) } } #endif @@ -115,7 +119,9 @@ public final class SessionAudio { /// Build + start the playback engine (and the mic uplink when enabled + authorized). Main /// thread (engine setup); on iOS/tvOS the session is already active by the time this runs. - private func startEngines(speakerUID: String, micUID: String, micEnabled: Bool) { + private func startEngines( + speakerUID: String, micUID: String, micChannel: Int, micEnabled: Bool + ) { startPlayback(speakerUID: speakerUID) #if os(tvOS) // No app-accessible microphone input on tvOS — playback only. @@ -123,12 +129,12 @@ public final class SessionAudio { guard micEnabled else { return } switch AVCaptureDevice.authorizationStatus(for: .audio) { case .authorized: - startCapture(micUID: micUID) + startCapture(micUID: micUID, micChannel: micChannel) case .notDetermined: AVCaptureDevice.requestAccess(for: .audio) { [weak self] granted in DispatchQueue.main.async { guard let self, granted, !self.flag.isStopped else { return } - self.startCapture(micUID: micUID) + self.startCapture(micUID: micUID, micChannel: micChannel) } } default: @@ -280,7 +286,7 @@ public final class SessionAudio { // MARK: - Mic (mic → host) #if !os(tvOS) - private func startCapture(micUID: String) { + private func startCapture(micUID: String, micChannel: Int) { let engine = AVAudioEngine() let input = engine.inputNode #if os(macOS) @@ -300,8 +306,63 @@ public final class SessionAudio { log.error("no usable input device — mic uplink disabled") return } - guard let encoder = try? OpusEncoder(), - let resampler = AVAudioConverter(from: inFormat, to: encoder.pcmFormat), + + // Multi-channel-interface handling. A pro interface exposes N discrete inputs with the mic + // on ONE of them, but AVAudioConverter's N→stereo downmix takes channels 0/1 — dead + // silence when the mic sits higher up (the classic "host receives zeros"). So we fold the + // input to a single mono bus OURSELVES and resample that. micChannel: 0 = Auto (sum every + // channel — a lone hot mic passes at full level), n≥1 pins 1-based input channel n. + let inChannels = Int(inFormat.channelCount) + let pinnedChannel: Int? = { + guard micChannel >= 1 else { return nil } + let idx = micChannel - 1 + guard idx < inChannels else { + log.warning( + "mic channel \(micChannel) out of range (device has \(inChannels)) — mixing all") + return nil + } + return idx + }() + let channelPlan = pinnedChannel.map { "channel \($0 + 1)/\(inChannels)" } + ?? (inChannels > 1 ? "mix \(inChannels)ch→mono" : "mono") + + // Name the device we're ACTUALLY recording from + its format + how we fold it, once per + // session. This single line localizes the whole class of "host receives silence" failures + // that otherwise need a host-side tone injection to pin down: a UID that silently fell back + // to the default, the wrong device being live, or the wrong channel picked. + #if os(macOS) + if let unit = input.audioUnit, let live = Self.currentDevice(of: unit), + let dev = AudioDevices.describe(live) { + if !micUID.isEmpty, dev.uid != micUID { + log.warning(""" + mic selection not honored — requested \(micUID) but capturing from \ + \(dev.name) [\(dev.uid)]; the device's UID likely changed (replug) — \ + reselect it in Settings + """) + } + log.info(""" + mic capture: \(dev.name) [\(dev.uid)] — \(Int(inFormat.sampleRate)) Hz, \ + \(inChannels) ch, \(channelPlan) + """) + } else { + log.info(""" + mic capture: — \(Int(inFormat.sampleRate)) Hz, \ + \(inChannels) ch, \(channelPlan) + """) + } + #else + log.info( + "mic capture: \(Int(inFormat.sampleRate)) Hz, \(inChannels) ch, \(channelPlan)") + #endif + + // Encode a single mono bus (folded from `inFormat` in the tap): the resampler goes + // mono@inputSR → the encoder's 48 kHz stereo, so it handles both the rate change and the + // mono→stereo duplication, and the wrong-channel downmix never happens. + guard let monoFormat = AVAudioFormat( + commonFormat: .pcmFormatFloat32, sampleRate: inFormat.sampleRate, + channels: 1, interleaved: false), + let encoder = try? OpusEncoder(), + let resampler = AVAudioConverter(from: monoFormat, to: encoder.pcmFormat), let chunk = AVAudioPCMBuffer( pcmFormat: encoder.pcmFormat, frameCapacity: OpusEncoder.framesPerPacket) else { @@ -317,11 +378,59 @@ public final class SessionAudio { let connection = connection let flag = flag + // Silence tripwire (tap-confined): a "recording" app can be handed pure digital zeros — + // a zeroed input-volume slider, a stale TCC grant, a muted device, OR the wrong channel + // picked — and everything downstream looks alive while the host gets silence. Track the + // peak of the EXTRACTED mono bus over the first ~10 s (not the raw device — a mic present + // on a channel we didn't grab must still read as silence) and emit exactly ONE verdict. + // This is the log line whose absence made the last occurrence take a host-side tone. + let silenceWindow = Int(inFormat.sampleRate * 10) + let deviceLabel = micUID.isEmpty ? "default input" : micUID + var framesInspected = 0 + var inputPeak: Float = 0 + var levelReported = false + input.installTap(onBus: 0, bufferSize: 2048, format: inFormat) { buffer, _ in if flag.isStopped { return } + let frames = Int(buffer.frameLength) + guard frames > 0, let src = buffer.floatChannelData, + let mono = AVAudioPCMBuffer( + pcmFormat: monoFormat, frameCapacity: buffer.frameLength), + let dst = mono.floatChannelData?[0] + else { return } + mono.frameLength = buffer.frameLength + + // Fold the multi-channel input down to the one mono bus we encode. + Self.foldToMono( + input: src, frames: frames, channels: Int(buffer.format.channelCount), + interleaved: buffer.format.isInterleaved, pinned: pinnedChannel, out: dst) + + if !levelReported { + var localPeak: Float = 0 + for i in 0.. localPeak { localPeak = abs(dst[i]) } + if localPeak > inputPeak { inputPeak = localPeak } + framesInspected += frames + if framesInspected >= silenceWindow { + levelReported = true + if inputPeak == 0 { + log.warning(""" + mic uplink has been pure digital SILENCE for 10 s (\(deviceLabel), \ + \(channelPlan)) — check the input level (System Settings → Sound → \ + Input), Privacy & Security → Microphone, and the Microphone channel in \ + Settings; the host is receiving zeros + """) + } else { + let dbfs = 20 * log10(inputPeak) + log.info(""" + mic uplink OK — peak \(String(format: "%.1f", dbfs)) dBFS over first \ + 10 s (\(deviceLabel), \(channelPlan)) + """) + } + } + } + let ratio = 48_000 / inFormat.sampleRate - let outCapacity = AVAudioFrameCount( - (Double(buffer.frameLength) * ratio).rounded(.up) + 64) + let outCapacity = AVAudioFrameCount((Double(frames) * ratio).rounded(.up) + 64) guard let staging = AVAudioPCMBuffer( pcmFormat: encoder.pcmFormat, frameCapacity: outCapacity) else { return } @@ -334,7 +443,7 @@ public final class SessionAudio { } fed = true outStatus.pointee = .haveData - return buffer + return mono } guard status != .error, let p = staging.floatChannelData?[0] else { return } fifo.append(contentsOf: UnsafeBufferPointer( @@ -378,6 +487,42 @@ public final class SessionAudio { stateLock.unlock() log.info("mic uplink started (\(micUID.isEmpty ? "default input" : micUID))") } + + /// Fold `channels` of input (`floatChannelData` layout: `interleaved` → one buffer strided by + /// channel count; else one buffer per channel) down to a single mono bus in `out` (`frames` + /// long). `pinned` (0-based, must be `< channels`) copies exactly that channel — the fix for a + /// mic on one input of a multi-channel interface; `nil` sums every channel, clamped to + /// [-1, 1], so a lone hot channel still passes at full level instead of the silent 0/1 the + /// default N→stereo downmix would grab. Pure + `internal` for unit testing the index math. + static func foldToMono( + input: UnsafePointer>, frames: Int, channels: Int, + interleaved: Bool, pinned: Int?, out: UnsafeMutablePointer + ) { + if let ch = pinned, ch < channels { + if interleaved { + let d = input[0] + for i in 0.. 1 { for i in 0...size)) == noErr } + + /// Read back the AUHAL's live device — the definitive "what are we actually capturing + /// from", which catches a selection that succeeded on paper but silently fell back to + /// the system default (a stale/changed UID, a device that vanished between resolve and + /// start). 0 / an error means we couldn't tell. + private static func currentDevice(of unit: AudioUnit) -> AudioDeviceID? { + var dev = AudioDeviceID(0) + var size = UInt32(MemoryLayout.size) + let status = AudioUnitGetProperty( + unit, kAudioOutputUnitProperty_CurrentDevice, kAudioUnitScope_Global, 0, &dev, &size) + guard status == noErr, dev != 0 else { return nil } + return dev + } #endif } diff --git a/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift b/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift index 7a6ee1c..94656ae 100644 --- a/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift +++ b/clients/apple/Sources/PunktfunkKit/Support/DefaultsKeys.swift @@ -24,6 +24,12 @@ public enum DefaultsKey { public static let micEnabled = "punktfunk.micEnabled" public static let speakerUID = "punktfunk.speakerUID" public static let micUID = "punktfunk.micUID" + /// macOS: which input channel of the chosen mic device feeds the host. 0 = "Auto" (sum every + /// channel to mono — a mic on a single input of a multi-channel interface passes at full + /// level); n≥1 pins 1-based input channel n. Multi-channel interfaces expose the mic on ONE + /// discrete channel, and the default N→stereo downmix grabs channels 0/1 (silence when the mic + /// is higher up), so we fold to mono ourselves. Only meaningful for multi-channel devices. + public static let micChannel = "punktfunk.micChannel" public static let presenter = "punktfunk.presenter" /// Request a 10-bit BT.2020 PQ (HDR10) stream. On by default; only takes effect when the host /// has HDR content AND this display supports HDR — otherwise the stream stays 8-bit SDR. diff --git a/clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift b/clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift new file mode 100644 index 0000000..ec8a21f --- /dev/null +++ b/clients/apple/Tests/PunktfunkKitTests/AudioChannelFoldTests.swift @@ -0,0 +1,93 @@ +// Multi-channel input → mono fold (SessionAudio.foldToMono): the fix for a mic on one channel of +// a multi-channel interface. AVAudioConverter's default N→stereo downmix grabs channels 0/1 — dead +// silence when the mic sits higher up — so we fold ourselves. This pins the fiddly bits (the +// interleaved stride, channel pinning, the sum-clamp) against regressions without needing hardware. + +#if !os(tvOS) +import XCTest + +@testable import PunktfunkKit + +final class AudioChannelFoldTests: XCTestCase { + /// Drive `foldToMono` over channel data expressed as `[[Float]]`, mirroring the two + /// `floatChannelData` layouts: + /// - deinterleaved: each inner array is one channel (all `frames` long). + /// - interleaved: a single inner array already interleaved (c0f0, c1f0, …), with the real + /// channel count passed separately. + private func fold( + _ planes: [[Float]], frames: Int, channels: Int, interleaved: Bool, pinned: Int? + ) -> [Float] { + // One C buffer per plane + a table of pointers to them — the shape of floatChannelData. + let buffers: [UnsafeMutablePointer] = planes.map { plane in + let p = UnsafeMutablePointer.allocate(capacity: plane.count) + for i in 0..>.allocate( + capacity: buffers.count) + for (i, b) in buffers.enumerated() { table[i] = b } + let out = UnsafeMutablePointer.allocate(capacity: frames) + defer { + buffers.forEach { $0.deallocate() } + table.deallocate() + out.deallocate() + } + SessionAudio.foldToMono( + input: table, frames: frames, channels: channels, + interleaved: interleaved, pinned: pinned, out: out) + return (0..