fix(inject): self-heal a stale/hung EIS connection + per-kind injection diagnostics

The host-lifetime libei injector could connect to a gamescope EIS socket whose
listen socket exists but whose server never drives the EI handshake — a stale
socket left by a SIGKILLed prior session, or one created early in a new
gamescope's startup before its libei server is ready. `UnixStream::connect` to a
socket *file* succeeds the moment the path exists, so the worker sailed past the
connect and then hung forever in `handshake_tokio` (or sat connected with no
device ever resumed). Because `LibeiInjector::inject` only enqueues onto a
channel (the !Send worker owns the connection), the send never errors, so
InjectorService never noticed the dead worker and never reopened — every input
event for the whole session was silently swallowed. The 30s setup timeout didn't
help: a typical session ends first, so input just died with no error logged.
Reconnecting made it worse (more stale sockets to land on).

Two self-heal bounds, applied on both paths (gamescope socket + KWin/GNOME portal):
- Bound the EI handshake at 8s — a non-responding EIS server now errors instead
  of hanging, so the worker exits and the next inject() reopens.
- Watchdog: if no input device resumes within 5s of connecting, treat the
  connection as dead-on-arrival and exit (same reopen path). Healthy servers
  add+resume a device within a beat of the handshake.

Verified on-box: clean gamescope + KWin paths connect/resume/emit unchanged; a
stale listener that accepts-but-never-handshakes now errors in 8s; two
back-to-back gamescope sessions both inject (session 2 reopens against the fresh
socket). Independently confirmed end-to-end delivery on KWin — a focused wev got
the injected motions/keys/buttons — i.e. injection itself was never broken, only
its recovery from a bad connection.

Also adds permanent low-volume diagnostics so the next "input dead" report is
instantly triageable: log each EIS device's capabilities on resume, the first of
each InputKind a client sends + whether it emitted, and no-resumed-device drops.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-11 11:57:09 +00:00
parent 4781933507
commit 6e1097da4f
+94 -6
View File
@@ -122,10 +122,23 @@ async fn session_main(mut rx: UnboundedReceiver<InputEvent>, source: EiSource) {
tracing::info!("libei: EIS connected — awaiting devices");
let mut state = EiState::new();
// Watchdog: a healthy EIS server adds + resumes an input device within a beat of the handshake.
// If none has resumed by this deadline, the connection is dead-on-arrival (a stale/half-ready
// gamescope socket on which the handshake passed but with no real server behind it) — exit so
// the next inject() fails and InjectorService reopens against a fresh socket, instead of
// silently swallowing every event for the whole session.
let resume_deadline = tokio::time::sleep(Duration::from_secs(5));
tokio::pin!(resume_deadline);
let mut resumed_once = false;
loop {
tokio::select! {
ei = events.next() => match ei {
Some(Ok(ev)) => state.handle_ei(ev, &context),
Some(Ok(ev)) => {
state.handle_ei(ev, &context);
if !resumed_once && state.devices.iter().any(|d| d.resumed) {
resumed_once = true;
}
}
Some(Err(e)) => { tracing::warn!(error = %e, "libei: event stream error"); break; }
None => { tracing::info!("libei: EIS disconnected"); break; }
},
@@ -133,6 +146,13 @@ async fn session_main(mut rx: UnboundedReceiver<InputEvent>, source: EiSource) {
Some(input) => state.inject(&input, &context),
None => { tracing::info!("libei: injector closed — ending session"); break; }
},
_ = &mut resume_deadline, if !resumed_once => {
tracing::warn!(
"libei: no input device resumed within 5s of connecting — treating the EIS \
connection as dead and reopening (stale or half-ready compositor socket)"
);
break;
}
}
}
}
@@ -155,10 +175,19 @@ async fn connect(source: EiSource) -> Result<Connected> {
EiSource::SocketPathFile(file) => (None, connect_socket_file(&file).await?),
};
let context = ei::Context::new(stream).map_err(|e| anyhow!("reis EI context: {e}"))?;
let (_conn, events) = context
.handshake_tokio("punktfunk-host", ei::handshake::ContextType::Sender)
.await
.map_err(|e| anyhow!("EI handshake: {e}"))?;
// Bound the handshake. `UnixStream::connect` to a socket *file* succeeds the moment the path
// exists, but a stale/half-ready gamescope (its socket created early in startup, or left behind
// by a SIGKILLed prior session) may never drive the EI handshake — which would otherwise hang
// this worker forever. A bounded handshake lets the worker error out so InjectorService reopens.
let (_conn, events) = tokio::time::timeout(
Duration::from_secs(8),
context.handshake_tokio("punktfunk-host", ei::handshake::ContextType::Sender),
)
.await
.map_err(|_| {
anyhow!("EI handshake timed out (EIS server not responding — stale/half-ready socket?)")
})?
.map_err(|e| anyhow!("EI handshake: {e}"))?;
Ok((portal, context, events))
}
@@ -268,6 +297,31 @@ struct EiState {
last_serial: u32,
sequence: u32,
start: Instant,
/// Total inject() calls — used only to throttle diagnostic logging.
injected: u64,
/// Bitmask of [`InputKind`]s already logged once (diagnostics: surface the FIRST of each
/// kind a client sends + whether it emitted, so an unexpected client — e.g. a touch-only
/// tablet hitting a compositor without ei_touchscreen — is immediately diagnosable).
seen_kinds: u32,
}
/// Returns the single-bit mask reserved for `kind` in the `seen_kinds` bitmask.
///
/// Each [`InputKind`] variant owns one fixed bit (bits 0 through 11, in the order listed below),
/// so a given kind always maps to the same mask. The `match` has no catch-all arm, so a newly
/// added variant must be assigned a bit here before the code compiles.
fn kind_bit(kind: InputKind) -> u32 {
    match kind {
        // Pointer events: bits 0-4.
        InputKind::MouseMove => 0x001,
        InputKind::MouseMoveAbs => 0x002,
        InputKind::MouseButtonDown => 0x004,
        InputKind::MouseButtonUp => 0x008,
        InputKind::MouseScroll => 0x010,
        // Keyboard events: bits 5-6.
        InputKind::KeyDown => 0x020,
        InputKind::KeyUp => 0x040,
        // Touch events: bits 7-9.
        InputKind::TouchDown => 0x080,
        InputKind::TouchMove => 0x100,
        InputKind::TouchUp => 0x200,
        // Gamepad events: bits 10-11.
        InputKind::GamepadButton => 0x400,
        InputKind::GamepadAxis => 0x800,
    }
}
impl EiState {
@@ -277,6 +331,8 @@ impl EiState {
last_serial: 0,
sequence: 0,
start: Instant::now(),
injected: 0,
seen_kinds: 0,
}
}
@@ -315,6 +371,16 @@ impl EiState {
d.resumed = true;
d.emulating = false; // must re-issue start_emulating after a resume
}
let dev = &e.device;
tracing::info!(
name = ?dev.name(),
pointer = dev.has_capability(DeviceCapability::Pointer),
pointer_abs = dev.has_capability(DeviceCapability::PointerAbsolute),
keyboard = dev.has_capability(DeviceCapability::Keyboard),
button = dev.has_capability(DeviceCapability::Button),
scroll = dev.has_capability(DeviceCapability::Scroll),
"libei: device RESUMED (now emittable)"
);
}
EiEvent::DevicePaused(e) => {
if let Some(d) = self.devices.iter_mut().find(|d| d.device == e.device) {
@@ -357,7 +423,24 @@ impl EiState {
}
InputKind::GamepadButton | InputKind::GamepadAxis => return, // uinput path (later)
};
self.injected += 1;
let n = self.injected;
// Log the first of each kind always (diagnostics), then occasionally.
let bit = kind_bit(ev.kind);
let first = self.seen_kinds & bit == 0;
self.seen_kinds |= bit;
let loud = first || n <= 5 || n % 600 == 0;
let Some(idx) = self.device_for(cap) else {
if loud {
tracing::warn!(
n,
kind = ?ev.kind,
?cap,
devices = self.devices.len(),
resumed = self.devices.iter().filter(|d| d.resumed).count(),
"libei: DROP — no resumed device exposes this capability"
);
}
// No resumed device with this capability yet. For touch this is usually permanent on
// this compositor — the RemoteDesktop portal may grant the Touchscreen *device type*
// while the EIS server never creates a touchscreen *device* (observed on headless
@@ -482,6 +565,11 @@ impl EiState {
if emitted {
dev.frame(self.last_serial, self.now_us());
}
let _ = ctx.flush();
if let Err(e) = ctx.flush() {
tracing::warn!(error = %e, "libei: ctx.flush failed");
}
if loud {
tracing::info!(n, kind = ?ev.kind, idx, emitted, "libei: emitted");
}
}
}