From 6e1097da4f140fcbb48c15835180eab28c1cb623 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Thu, 11 Jun 2026 11:57:09 +0000 Subject: [PATCH] fix(inject): self-heal a stale/hung EIS connection + per-kind injection diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The host-lifetime libei injector could connect to a gamescope EIS socket whose listen socket exists but whose server never drives the EI handshake — a stale socket left by a SIGKILLed prior session, or one created early in a new gamescope's startup before its libei server is ready. `UnixStream::connect` to a socket *file* succeeds the moment the path exists, so the worker sailed past the connect and then hung forever in `handshake_tokio` (or sat connected with no device ever resumed). Because `LibeiInjector::inject` only enqueues onto a channel (the !Send worker owns the connection), the send never errors, so InjectorService never noticed the dead worker and never reopened — every input event for the whole session was silently swallowed. The 30s setup timeout didn't help: a typical session ends first, so input just died with no error logged. Reconnecting made it worse (more stale sockets to land on). Two self-heal bounds, both paths (gamescope socket + KWin/GNOME portal): - Bound the EI handshake at 8s — a non-responding EIS server now errors instead of hanging, so the worker exits and the next inject() reopens. - Watchdog: if no input device resumes within 5s of connecting, treat the connection as dead-on-arrival and exit (same reopen path). Healthy servers add+resume a device within a beat of the handshake. Verified on-box: clean gamescope + KWin paths connect/resume/emit unchanged; a stale listener that accepts-but-never-handshakes now errors in 8s; two back-to-back gamescope sessions both inject (session 2 reopens against the fresh socket). Independently confirmed end-to-end delivery on KWin — a focused wev got the injected motions/keys/buttons — i.e. injection itself was never broken, only its recovery from a bad connection. Also adds permanent low-volume diagnostics so the next "input dead" report is instantly triageable: log each EIS device's capabilities on resume, the first of each InputKind a client sends + whether it emitted, and no-resumed-device drops. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/punktfunk-host/src/inject/libei.rs | 100 ++++++++++++++++++++-- 1 file changed, 94 insertions(+), 6 deletions(-) diff --git a/crates/punktfunk-host/src/inject/libei.rs b/crates/punktfunk-host/src/inject/libei.rs index 3063003..7ad7033 100644 --- a/crates/punktfunk-host/src/inject/libei.rs +++ b/crates/punktfunk-host/src/inject/libei.rs @@ -122,10 +122,23 @@ async fn session_main(mut rx: UnboundedReceiver, source: EiSource) { tracing::info!("libei: EIS connected — awaiting devices"); let mut state = EiState::new(); + // Watchdog: a healthy EIS server adds + resumes an input device within a beat of the handshake. + // If none has resumed by this deadline, the connection is dead-on-arrival (stale/half-ready + // gamescope socket the handshake passed but no real server is behind) — exit so the next + // inject() fails and InjectorService reopens against a fresh socket, instead of silently + // swallowing every event for the whole session. + let resume_deadline = tokio::time::sleep(Duration::from_secs(5)); + tokio::pin!(resume_deadline); + let mut resumed_once = false; loop { tokio::select! { ei = events.next() => match ei { - Some(Ok(ev)) => state.handle_ei(ev, &context), + Some(Ok(ev)) => { + state.handle_ei(ev, &context); + if !resumed_once && state.devices.iter().any(|d| d.resumed) { + resumed_once = true; + } + } Some(Err(e)) => { tracing::warn!(error = %e, "libei: event stream error"); break; } None => { tracing::info!("libei: EIS disconnected"); break; } }, @@ -133,6 +146,13 @@ async fn session_main(mut rx: UnboundedReceiver, source: EiSource) { Some(input) => state.inject(&input, &context), None => { tracing::info!("libei: injector closed — ending session"); break; } }, + _ = &mut resume_deadline, if !resumed_once => { + tracing::warn!( + "libei: no input device resumed within 5s of connecting — treating the EIS \ + connection as dead and reopening (stale or half-ready compositor socket)" + ); + break; + } } } } @@ -155,10 +175,19 @@ async fn connect(source: EiSource) -> Result { EiSource::SocketPathFile(file) => (None, connect_socket_file(&file).await?), }; let context = ei::Context::new(stream).map_err(|e| anyhow!("reis EI context: {e}"))?; - let (_conn, events) = context - .handshake_tokio("punktfunk-host", ei::handshake::ContextType::Sender) - .await - .map_err(|e| anyhow!("EI handshake: {e}"))?; + // Bound the handshake. `UnixStream::connect` to a socket *file* succeeds the moment the path + // exists, but a stale/half-ready gamescope (its socket created early in startup, or left behind + // by a SIGKILLed prior session) may never drive the EI handshake — which would otherwise hang + // this worker forever. A bounded handshake lets the worker error out so InjectorService reopens. + let (_conn, events) = tokio::time::timeout( + Duration::from_secs(8), + context.handshake_tokio("punktfunk-host", ei::handshake::ContextType::Sender), + ) + .await + .map_err(|_| { + anyhow!("EI handshake timed out (EIS server not responding — stale/half-ready socket?)") + })? + .map_err(|e| anyhow!("EI handshake: {e}"))?; Ok((portal, context, events)) } @@ -268,6 +297,31 @@ struct EiState { last_serial: u32, sequence: u32, start: Instant, + /// Total inject() calls — used only to throttle diagnostic logging. + injected: u64, + /// Bitmask of [`InputKind`]s already logged once (diagnostics: surface the FIRST of each + /// kind a client sends + whether it emitted, so an unexpected client — e.g. a touch-only + /// tablet hitting a compositor without ei_touchscreen — is immediately diagnosable). + seen_kinds: u32, +} + +/// Stable small index per [`InputKind`] for the `seen_kinds` bitmask. +fn kind_bit(kind: InputKind) -> u32 { + let i = match kind { + InputKind::MouseMove => 0, + InputKind::MouseMoveAbs => 1, + InputKind::MouseButtonDown => 2, + InputKind::MouseButtonUp => 3, + InputKind::MouseScroll => 4, + InputKind::KeyDown => 5, + InputKind::KeyUp => 6, + InputKind::TouchDown => 7, + InputKind::TouchMove => 8, + InputKind::TouchUp => 9, + InputKind::GamepadButton => 10, + InputKind::GamepadAxis => 11, + }; + 1 << i } impl EiState { @@ -277,6 +331,8 @@ impl EiState { last_serial: 0, sequence: 0, start: Instant::now(), + injected: 0, + seen_kinds: 0, } } @@ -315,6 +371,16 @@ impl EiState { d.resumed = true; d.emulating = false; // must re-issue start_emulating after a resume } + let dev = &e.device; + tracing::info!( + name = ?dev.name(), + pointer = dev.has_capability(DeviceCapability::Pointer), + pointer_abs = dev.has_capability(DeviceCapability::PointerAbsolute), + keyboard = dev.has_capability(DeviceCapability::Keyboard), + button = dev.has_capability(DeviceCapability::Button), + scroll = dev.has_capability(DeviceCapability::Scroll), + "libei: device RESUMED (now emittable)" + ); } EiEvent::DevicePaused(e) => { if let Some(d) = self.devices.iter_mut().find(|d| d.device == e.device) { @@ -357,7 +423,24 @@ impl EiState { } InputKind::GamepadButton | InputKind::GamepadAxis => return, // uinput path (later) }; + self.injected += 1; + let n = self.injected; + // Log the first of each kind always (diagnostics), then occasionally. + let bit = kind_bit(ev.kind); + let first = self.seen_kinds & bit == 0; + self.seen_kinds |= bit; + let loud = first || n <= 5 || n % 600 == 0; let Some(idx) = self.device_for(cap) else { + if loud { + tracing::warn!( + n, + kind = ?ev.kind, + ?cap, + devices = self.devices.len(), + resumed = self.devices.iter().filter(|d| d.resumed).count(), + "libei: DROP — no resumed device exposes this capability" + ); + } // No resumed device with this capability yet. For touch this is usually permanent on // this compositor — the RemoteDesktop portal may grant the Touchscreen *device type* // while the EIS server never creates a touchscreen *device* (observed on headless @@ -482,6 +565,11 @@ impl EiState { if emitted { dev.frame(self.last_serial, self.now_us()); } - let _ = ctx.flush(); + if let Err(e) = ctx.flush() { + tracing::warn!(error = %e, "libei: ctx.flush failed"); + } + if loud { + tracing::info!(n, kind = ?ev.kind, idx, emitted, "libei: emitted"); + } } }