Files
punktfunk/crates/punktfunk-host/src/audio/windows/wasapi_mic.rs
T
enricobuehler 3477cbe7ce fix(audio/windows): stop the client mic echoing back through the loopback
The Windows virtual mic fakes a capture endpoint by writing the client's
uplinked PCM into a virtual device's *render* endpoint, while the
desktop-audio plane loopback-captures the *default render* endpoint — with
no mutual exclusion between the two. WASAPI loopback captures the mixed
output of an endpoint (everything any app renders to it, including our mic
writes), so when both resolve to the same device — VB-CABLE used for both,
or the auto-installed Steam Streaming Microphone being the default render on
a headless box — the injected mic is captured straight back into the
host->client audio stream: an infinite echo.

find_device() now resolves the loopback's endpoint id (default render) and
skips any candidate matching it, scanning on to the next non-loopback match,
so the mic can never land on the device the loopback reads. The auto-install
path now provisions the full Steam pair (Streaming Microphone + Streaming
Speakers) so a bare host gets two distinct devices instead of one shared
one. Errors distinguish "no device" from "only candidate is the loopback
device". Linux was already immune (its mic is a dedicated Audio/Source node,
structurally separate from the monitored sink).

Windows-only (#[cfg(windows)]); rustfmt-clean, compile-checked in
windows-host CI, needs on-glass validation on the RTX box. Does not force
the system default playback onto Steam Streaming Speakers (IPolicyConfig) —
not required to break the echo.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 23:51:46 +00:00

392 lines
17 KiB
Rust

//! WASAPI virtual microphone (Windows) — the inverse of [`super::wasapi_cap`]. Windows has no
//! user-mode way to *create* a capture (microphone) endpoint, so we target an EXISTING virtual audio
//! device and write the client's decoded mic PCM into that device's **render** endpoint; the device's
//! **capture** endpoint then surfaces as a microphone that host apps can record from.
//!
//! Target device, by friendly-name substring (first match wins; override with `PUNKTFUNK_MIC_DEVICE`):
//! "Steam Streaming Microphone" (ships with Steam Remote Play — exactly this purpose), VB-Audio
//! "CABLE Input", VoiceMeeter, or anything with "virtual" in the name. If none is present we
//! auto-install the Steam Streaming audio pair (see [`install_steam_audio_pair`]); failing that we
//! return an error with install guidance and the host runs without mic passthrough.
//!
//! **Anti-echo guard (the whole point of this being non-trivial).** The desktop-audio plane
//! ([`super::wasapi_cap`]) loopback-captures the **default render endpoint**. WASAPI loopback
//! captures the *mixed* output of an endpoint — i.e. everything any app renders to it, including
//! what THIS module writes. So if the virtual-mic target is the same device the loopback captures,
//! the client's uplinked mic is captured straight back into the host→client audio stream: an
//! infinite echo. [`find_device`] therefore **excludes the default render endpoint** from the
//! candidates — the mic is guaranteed to land on a different device. (Linux gets this for free: its
//! mic is a dedicated `Audio/Source` node, structurally separate from the monitored sink.)
//!
//! `push` enqueues decoded interleaved-f32 PCM into a bounded ring (drop-oldest beyond ~80 ms so mic
//! latency stays bounded); a dedicated COM-apartment thread renders it event-driven, filling silence
//! when the client isn't talking. WASAPI objects are `!Send`, so they live entirely on that thread
//! (mirrors `WasapiLoopbackCapturer`).
// Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it.
#![deny(clippy::undocumented_unsafe_blocks)]
use super::{VirtualMic, SAMPLE_RATE};
use anyhow::{anyhow, Context, Result};
use std::collections::VecDeque;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::mpsc::{sync_channel, SyncSender};
use std::sync::{Arc, Mutex};
use std::thread::{self, JoinHandle};
use std::time::Duration;
use wasapi::{Direction, SampleType, StreamMode, WaveFormat};
/// Channel count of the injected stream (interleaved stereo).
const CHANNELS: u32 = 2;
/// Bytes per interleaved frame: one `f32` sample for each channel (2 * 4 = 8 for stereo).
const BLOCK_ALIGN: usize = CHANNELS as usize * std::mem::size_of::<f32>();
/// Byte cap of the inject queue: ~80 ms of audio at `SAMPLE_RATE`. Anything older is dropped so
/// the passed-through mic stays low-latency.
const MAX_QUEUE_BYTES: usize = BLOCK_ALIGN * (SAMPLE_RATE as usize * 80 / 1000);
/// Render-endpoint friendly-name substrings we can write into so the device's capture endpoint
/// becomes a host mic. Entries are lowercase because they are compared against the lowercased
/// friendly name with a substring test. Listed from the most specific device name down to the
/// generic `"virtual"` catch-all. Only consulted when `PUNKTFUNK_MIC_DEVICE` is not set.
const CANDIDATES: &[&str] = &[
"steam streaming microphone",
"cable input",
"voicemeeter input",
"voicemeeter aux input",
// Catch-all: any render endpoint whose name contains "virtual".
"virtual",
];
/// Handle to the Windows virtual microphone: owns the PCM queue shared with the render thread and
/// the means to stop and join that thread. Dropping the handle stops the thread.
pub struct WasapiVirtualMic {
/// Pending mic audio as raw bytes (each `f32` sample appended in little-endian order by `push`),
/// drained by the render thread.
queue: Arc<Mutex<VecDeque<u8>>>,
/// Raised on drop to ask the render thread to leave its loop.
stop: Arc<AtomicBool>,
/// Join handle of the render thread; taken and joined on drop.
join: Option<JoinHandle<()>>,
}
impl WasapiVirtualMic {
pub fn open(channels: u32) -> Result<Self> {
anyhow::ensure!(
channels == CHANNELS,
"virtual mic is stereo-only (got {channels})"
);
let queue = Arc::new(Mutex::new(VecDeque::<u8>::new()));
let stop = Arc::new(AtomicBool::new(false));
// Bring-up handshake: report the resolved device (or the error) before returning, so a missing
// virtual-mic device surfaces as Err (the caller retries with backoff) not a silent dead thread.
let (ready_tx, ready_rx) = sync_channel::<Result<String>>(1);
let (q, st) = (queue.clone(), stop.clone());
let join = thread::Builder::new()
.name("punktfunk-wasapi-mic".into())
.spawn(move || {
if let Err(e) = render_thread(q, st, ready_tx) {
tracing::error!(error = %format!("{e:#}"), "wasapi virtual-mic thread failed");
}
})
.context("spawn wasapi mic thread")?;
match ready_rx.recv_timeout(Duration::from_secs(3)) {
Ok(Ok(name)) => {
tracing::info!(device = %name,
"WASAPI virtual mic ready (client mic → this device's render endpoint)");
Ok(WasapiVirtualMic {
queue,
stop,
join: Some(join),
})
}
Ok(Err(e)) => Err(e),
Err(_) => Err(anyhow!("wasapi virtual-mic init timed out")),
}
}
}
impl Drop for WasapiVirtualMic {
    /// Ask the render thread to stop, then wait for it to finish (a join error is ignored).
    fn drop(&mut self) {
        self.stop.store(true, Ordering::SeqCst);
        let Some(handle) = self.join.take() else {
            return;
        };
        let _ = handle.join();
    }
}
impl VirtualMic for WasapiVirtualMic {
    /// Append `pcm` to the inject queue as little-endian `f32` bytes, then trim the oldest bytes
    /// so the queue never holds more than `MAX_QUEUE_BYTES`. Does nothing if the queue lock is
    /// poisoned.
    fn push(&self, pcm: &[f32]) {
        let mut pending = match self.queue.lock() {
            Ok(guard) => guard,
            Err(_) => return,
        };
        pending.reserve(pcm.len() * 4);
        pending.extend(pcm.iter().flat_map(|sample| sample.to_le_bytes()));
        // The mic is real-time: stale audio is worse than dropped audio, so discard from the front.
        let overflow = pending.len().saturating_sub(MAX_QUEUE_BYTES);
        if overflow > 0 {
            pending.drain(..overflow);
        }
    }

    /// Channel count this mic accepts (always stereo).
    fn channels(&self) -> u32 {
        CHANNELS
    }
}
/// Endpoint ID of the **default render endpoint** — the device the desktop-audio loopback records
/// (see [`super::wasapi_cap`]). The virtual mic must stay off this device, otherwise the client's
/// mic is echoed back into the host→client audio stream. Returns `None` when any step of the
/// lookup fails; [`find_device`] then cannot prove a candidate is safe and matches by name only.
fn default_render_id() -> Option<String> {
    let enumerator = wasapi::DeviceEnumerator::new().ok()?;
    let default_render = enumerator.get_default_device(&Direction::Render).ok()?;
    default_render.get_id().ok()
}
/// Resolve the virtual-mic target among render endpoints by friendly name, **excluding the
/// endpoint the loopback captures** (the [`default_render_id`] anti-echo guard).
///
/// Selection:
/// - If `PUNKTFUNK_MIC_DEVICE` is set to a non-blank value, the first endpoint whose lowercased
///   friendly name contains it (case-insensitive, surrounding whitespace ignored) wins. A blank
///   value is treated as unset — it would otherwise substring-match every endpoint.
/// - Otherwise the endpoint matching the EARLIEST entry of [`CANDIDATES`] wins, so a purpose-built
///   device beats the generic `"virtual"` catch-all regardless of enumeration order; ties go to
///   the endpoint enumerated first.
///
/// # Errors
/// Fails if the endpoints cannot be enumerated, or if no acceptable endpoint exists. The latter
/// error lists every endpoint name seen and distinguishes "nothing matched" from "the only match
/// is the loopback device".
fn find_device() -> Result<wasapi::Device> {
    let enumerator = wasapi::DeviceEnumerator::new().context("DeviceEnumerator")?;
    let collection = enumerator
        .get_device_collection(&Direction::Render)
        .context("render device collection")?;
    let n = collection.get_nbr_devices().context("device count")?;
    let want = std::env::var("PUNKTFUNK_MIC_DEVICE")
        .ok()
        .map(|s| s.trim().to_lowercase())
        .filter(|s| !s.is_empty());
    // The device the loopback captures — a name match on it is rejected below (would echo).
    let loopback_id = default_render_id();
    let mut names = Vec::new();
    // Best acceptable endpoint so far as (preference rank, device); a lower rank is preferred.
    let mut best: Option<(usize, wasapi::Device)> = None;
    let mut skipped_loopback = false;
    for i in 0..n {
        let Ok(dev) = collection.get_device_at_index(i) else {
            continue;
        };
        let name = dev.get_friendlyname().unwrap_or_default();
        let lname = name.to_lowercase();
        // Rank of this endpoint: 0 for an override match, else the index of the first matching
        // candidate; `None` when the name matches nothing.
        let rank = match &want {
            Some(w) => lname.contains(w.as_str()).then_some(0usize),
            None => CANDIDATES.iter().position(|c| lname.contains(c)),
        };
        if let Some(rank) = rank {
            // Only a strictly better rank can displace the current pick.
            let improves = best.as_ref().map_or(true, |(held, _)| rank < *held);
            if improves {
                // Anti-echo guard: never inject into the endpoint the loopback captures.
                let is_loopback = match (dev.get_id().ok(), loopback_id.as_deref()) {
                    (Some(id), Some(lb)) => id == lb,
                    _ => false,
                };
                if is_loopback {
                    skipped_loopback = true;
                    tracing::warn!(device = %name,
                        "virtual-mic candidate is the loopback (default render) endpoint — skipping; \
                         injecting there would echo the client's mic into the desktop-audio stream");
                } else {
                    best = Some((rank, dev));
                }
            }
        }
        names.push(name);
    }
    best.map(|(_, dev)| dev).ok_or_else(|| {
        if skipped_loopback {
            anyhow!(
                "the only virtual-mic candidate among render endpoints {names:?} is the default \
                 playback device the host loopback-captures — injecting there would echo the mic \
                 back to the client. Add a SEPARATE virtual audio device for the mic (e.g. the Steam \
                 Streaming Microphone) or set a different default playback device, then reconnect."
            )
        } else {
            anyhow!(
                "no virtual-mic device among render endpoints {names:?}. Install VB-Audio Virtual \
                 Cable or enable Steam Remote Play's microphone (Steam Streaming Microphone), or set \
                 PUNKTFUNK_MIC_DEVICE=<friendly-name substring>."
            )
        }
    })
}
/// Locate the virtual-mic device; when the lookup fails, attempt to AUTO-INSTALL the Steam audio
/// pair so mic passthrough works out of the box, then look again. If nothing could be installed,
/// the error from the first lookup (with its install guidance) is returned.
fn find_or_install_device() -> Result<wasapi::Device> {
    let first_err = match find_device() {
        Ok(device) => return Ok(device),
        Err(e) => e,
    };
    tracing::info!("no usable virtual mic device present — attempting auto-install");
    // SAFETY: `install_steam_audio_pair` is `unsafe` only because it `LoadLibraryExW`s
    // `newdev.dll` and calls `DiInstallDriverW` through a `transmute`d function pointer;
    // calling it imposes no extra precondition here (it takes no args and aliases nothing).
    // Its internal contract holds: the `DiInstall` type matches the documented
    // `BOOL DiInstallDriverW(HWND, PCWSTR, DWORD, PBOOL)` ABI, and it passes a
    // NUL-terminated UTF-16 INF path with null/zero optional args. Invoked once on the
    // dedicated mic thread.
    let installed = unsafe { install_steam_audio_pair() };
    if installed {
        find_device()
    } else {
        Err(first_err)
    }
}
/// Best-effort: install BOTH Steam Streaming audio devices (the "Steam pair") so mic passthrough
/// works out of the box and the host has a desktop-audio sink distinct from the mic. Steam Remote
/// Play ships `SteamStreamingMicrophone.inf` + `SteamStreamingSpeakers.inf`: the microphone gives the
/// virtual mic a target whose **capture** endpoint apps record from, and the speakers give a
/// **render** endpoint a headless box can loopback-capture that is NOT the mic — so the loopback and
/// the mic land on different devices and never echo (see [`find_device`]). Returns true if either
/// installed. No-op when Steam isn't installed (INFs absent), the install is denied (needs admin —
/// the host runs as SYSTEM), or `PUNKTFUNK_NO_MIC_INSTALL` is set.
unsafe fn install_steam_audio_pair() -> bool {
// Microphone first (the mic's actual target); speakers second (the distinct desktop-audio sink).
let mic = try_install_steam_audio("SteamStreamingMicrophone.inf");
let spk = try_install_steam_audio("SteamStreamingSpeakers.inf");
mic || spk
}
/// Install one Steam Streaming driver INF by filename via `DiInstallDriverW` (loaded from
/// `newdev.dll`, like Apollo, to avoid an extra windows-crate feature). See
/// [`install_steam_audio_pair`] for the contract; `inf_name` is a bare filename under Steam's
/// per-arch `drivers\Windows10\{arch}\` directory.
///
/// Returns `true` only when the `DiInstallDriverW` call returns non-zero. Returns `false` without
/// attempting an install when `PUNKTFUNK_NO_MIC_INSTALL` is set, when the INF path cannot be
/// expanded into the 1024-element UTF-16 buffer, or when `newdev.dll` / `DiInstallDriverW` cannot
/// be resolved. On success the calling thread sleeps for 5 seconds before returning.
///
/// # Safety
/// Takes no pointers from the caller. It is `unsafe` because its body performs raw FFI: it loads
/// `newdev.dll`, `transmute`s the resolved symbol to the `DiInstall` function-pointer type and
/// calls through it. Soundness rests on that type matching the real `DiInstallDriverW` ABI.
unsafe fn try_install_steam_audio(inf_name: &str) -> bool {
use windows::core::{s, w, PCWSTR};
use windows::Win32::Foundation::HWND;
use windows::Win32::System::Environment::ExpandEnvironmentStringsW;
use windows::Win32::System::LibraryLoader::{
GetProcAddress, LoadLibraryExW, LOAD_LIBRARY_SEARCH_SYSTEM32,
};
// Opt-out switch: any value (even empty) disables auto-install.
if std::env::var_os("PUNKTFUNK_NO_MIC_INSTALL").is_some() {
return false;
}
// Steam ships per-arch driver INFs under `Steam\drivers\Windows10\{arch}\`.
#[cfg(target_arch = "x86_64")]
let subdir = "x64";
#[cfg(target_arch = "aarch64")]
let subdir = "arm64";
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
let subdir = "x86";
// NUL-terminated UTF-16 path template; the `%...%` variable is expanded by the OS call below.
let template: Vec<u16> =
format!("%CommonProgramFiles(x86)%\\Steam\\drivers\\Windows10\\{subdir}\\{inf_name}")
.encode_utf16()
.chain(std::iter::once(0))
.collect();
let mut path = vec![0u16; 1024];
let n = ExpandEnvironmentStringsW(PCWSTR(template.as_ptr()), Some(path.as_mut_slice()));
// Bail out on a failed expansion (`n == 0`) or a result reported as larger than the buffer.
if n == 0 || n as usize > path.len() {
return false;
}
// Load from System32 only, so a `newdev.dll` elsewhere on the search path is never picked up.
// NOTE(review): the module handle is never released in this function (no `FreeLibrary`) —
// confirm that leaving `newdev.dll` loaded is intended.
let Ok(newdev) = LoadLibraryExW(w!("newdev.dll"), None, LOAD_LIBRARY_SEARCH_SYSTEM32) else {
tracing::warn!("could not load newdev.dll — Steam-audio auto-install unavailable");
return false;
};
let Some(addr) = GetProcAddress(newdev, s!("DiInstallDriverW")) else {
return false;
};
// BOOL DiInstallDriverW(HWND hwndParent, PCWSTR InfPath, DWORD Flags, PBOOL NeedReboot)
type DiInstall = unsafe extern "system" fn(HWND, PCWSTR, u32, *mut i32) -> i32;
let f: DiInstall = std::mem::transmute(addr);
// No parent window, the expanded INF path, no flags, and no reboot-needed out-parameter.
let ok = f(
HWND(std::ptr::null_mut()),
PCWSTR(path.as_ptr()),
0,
std::ptr::null_mut(),
) != 0;
if ok {
tracing::info!(
inf = inf_name,
"installed a Steam Streaming virtual audio device"
);
// Give the audio subsystem time to register the new endpoint before the caller re-scans.
// NOTE(review): this blocks the calling thread for 5 s, which is longer than the 3 s
// ready-handshake timeout used by `WasapiVirtualMic::open`.
std::thread::sleep(Duration::from_secs(5)); // let the audio subsystem register the endpoint
} else {
// Captured first thing in the failure branch, for the log line below.
let err = windows::Win32::Foundation::GetLastError();
tracing::info!(
inf = inf_name,
?err,
"Steam-audio device not auto-installed (Steam absent / not admin) — see install guidance"
);
}
ok
}
/// Body of the dedicated mic thread: initialise COM, open and start an event-driven shared-mode
/// render stream on the virtual-mic device, report the outcome once through `ready`, then keep
/// the device fed from `queue` (padding with silence) until `stop` is raised.
///
/// Bring-up failures are delivered through `ready` and the function returns `Ok(())`; an `Err`
/// return means the stream failed after a successful bring-up.
fn render_thread(
    queue: Arc<Mutex<VecDeque<u8>>>,
    stop: Arc<AtomicBool>,
    ready: SyncSender<Result<String>>,
) -> Result<()> {
    let com = wasapi::initialize_mta()
        .ok()
        .context("CoInitializeEx (MTA)");
    if let Err(e) = com {
        let _ = ready.send(Err(e));
        return Ok(());
    }

    // Everything the render loop needs, plus the device name for the ready report.
    type Stream = (wasapi::AudioClient, wasapi::AudioRenderClient, wasapi::Handle, String);

    // Open + start the render stream. The closure hands the WASAPI objects back to this frame so
    // they live for the whole loop below; any failure short-circuits to a single Err.
    let open_stream = || -> Result<Stream> {
        let device = find_or_install_device()?;
        let device_name = device.get_friendlyname().unwrap_or_else(|_| "virtual mic".into());
        let mut client = device.get_iaudioclient().context("IAudioClient")?;
        // 48 kHz stereo f32; autoconvert lets WASAPI shared-mode SRC match the device mix format.
        let format = WaveFormat::new(
            32,
            32,
            &SampleType::Float,
            SAMPLE_RATE as usize,
            CHANNELS as usize,
            None,
        );
        let (default_period, _min_period) = client.get_device_period().context("device period")?;
        let mode = StreamMode::EventsShared {
            autoconvert: true,
            buffer_duration_hns: default_period,
        };
        client
            .initialize_client(&format, &Direction::Render, &mode)
            .context("initialize render client")?;
        let wake = client.set_get_eventhandle().context("event handle")?;
        let renderer = client
            .get_audiorenderclient()
            .context("IAudioRenderClient")?;
        // Pre-fill the whole buffer with silence so the stream starts cleanly (best-effort: a
        // failed pre-fill is ignored).
        let total_frames = client.get_buffer_size().context("buffer size")? as usize;
        let silence = vec![0u8; total_frames * BLOCK_ALIGN];
        let _ = renderer.write_to_device(total_frames, &silence, None);
        client.start_stream().context("start render stream")?;
        Ok((client, renderer, wake, device_name))
    };

    let (client, renderer, wake, device_name) = match open_stream() {
        Ok(parts) => parts,
        Err(e) => {
            let _ = ready.send(Err(anyhow!("{e:#}")));
            return Ok(());
        }
    };
    let _ = ready.send(Ok(device_name));

    // Scratch buffer reused across iterations; it only ever grows.
    let mut scratch: Vec<u8> = Vec::new();
    loop {
        if stop.load(Ordering::Relaxed) {
            break;
        }
        // The device signals when it wants more data; the finite timeout keeps `stop` responsive.
        if wake.wait_for_event(100).is_err() {
            continue;
        }
        let frames = client
            .get_available_space_in_frames()
            .context("available space")? as usize;
        if frames == 0 {
            continue;
        }
        let bytes = frames * BLOCK_ALIGN;
        if scratch.len() < bytes {
            scratch.resize(bytes, 0);
        }
        let out = &mut scratch[..bytes];
        // Start from silence, then lay queued mic PCM over the front; whatever the queue cannot
        // cover stays zero (the client is quiet or behind).
        out.fill(0);
        {
            let mut pending = queue.lock().unwrap();
            let take = pending.len().min(bytes);
            for (src, dst) in pending.drain(..take).zip(out.iter_mut()) {
                *dst = src;
            }
        }
        renderer
            .write_to_device(frames, out, None)
            .context("write_to_device")?;
    }
    client.stop_stream().ok();
    Ok(())
}