feat(apple): explicit input-capture state machine — no more cursor grabs on window chrome
ci / rust (push) Has been cancelled

Capture used to engage whenever the app became active, so the click that activates the
window — on the title bar (a drag) or a resize edge — got the cursor warped away
mid-gesture, and raw deltas kept streaming to the host while the user fought the window.
Reworked Moonlight-style, with capture as a deliberate, reversible state owned by
StreamLayerView:

- Engage: automatically once when the stream starts / trust is confirmed (one-shot, can
  never fire surprisingly later), or by clicking into the video (that click's
  press/release are suppressed toward the host; acceptsFirstMouse makes it one click
  from another app). NEVER on app re-activation.
- Release: ⌘⎋ (toggles, key-window-scoped), focus loss — now including same-app window
  switches (⌘, / ⌘N / ⌘M resign key without resigning the app; previously the new
  window inherited a hidden frozen cursor and its typing was double-delivered to the
  host) — and disconnect.
- While released: nothing is forwarded (InputCapture.forwarding gates the GC handlers;
  held keys/buttons are flushed host-side so nothing sticks), the cursor is free, and
  the HUD (now showing the capture state) is clickable.
- The no-beep behavior moved from the NSEvent monitor to first-responder key
  consumption — swallowing at the monitor risked starving GC's own delivery (the
  "input broken altogether" report). The monitor now only intercepts ⌘⎋.
- Adversarial-review fixes: a second session preempts the previous one cleanly instead
  of leaving it captured with dead GC handlers (onPreempted); the engage click's
  suppression latch can't outlive the click (mouseUp backstop); ⌘⎋'s physical Esc can't
  type into the host in either toggle direction (suppressedVK latch + Esc-while-⌘
  guard); capture callbacks defer out of the SwiftUI update pass.

Validated live against the box: 16185 input datagrams injected during a captured
session (gamescope EIS), title-bar drag/resize free while released, and visible
cursor + typing on a streamed KWin desktop, all user-confirmed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 22:42:44 +02:00
parent acf44eed5f
commit a4eacabecd
5 changed files with 299 additions and 67 deletions
@@ -58,7 +58,7 @@ struct ContentView: View {
return nil
}()
return ZStack {
stream(capturesCursor: pendingFingerprint == nil)
stream(captureEnabled: pendingFingerprint == nil)
.blur(radius: pendingFingerprint != nil ? 32 : 0)
.overlay {
if pendingFingerprint != nil {
@@ -257,19 +257,22 @@ struct ContentView: View {
// MARK: - Stream
private func stream(capturesCursor: Bool) -> some View {
private func stream(captureEnabled: Bool) -> some View {
Group {
if let conn = model.connection {
StreamView(
connection: conn,
capturesCursor: capturesCursor,
captureEnabled: captureEnabled,
onCaptureChange: { [weak model] captured in
model?.mouseCaptured = captured
},
onFrame: { [meter = model.meter] au in meter.note(byteCount: au.data.count) },
onSessionEnd: { [weak model] in
Task { @MainActor in model?.sessionEnded() }
}
)
.overlay(alignment: .topTrailing) {
if capturesCursor { hud(conn) }
if captureEnabled { hud(conn) }
}
}
}
@@ -279,8 +282,13 @@ struct ContentView: View {
VStack(alignment: .trailing, spacing: 4) {
Text("\(conn.width)×\(conn.height)@\(conn.refreshHz) \(model.fps) fps \(model.mbps, specifier: "%.1f") Mb/s")
.font(.system(.caption, design: .monospaced))
// ⌘D because the local cursor is hidden+frozen while streaming — the button
// can't be clicked. (Cmd+Tab away also frees the cursor.)
// While captured the cursor is hidden+frozen, so the button is keyboard-only
// (⌘⎋ or Cmd+Tab release the cursor; released, it's clickable again).
Text(model.mouseCaptured
? "⌘⎋ releases the mouse"
: "Click the stream to capture input")
.font(.caption2)
.opacity(0.8)
Button("Disconnect (⌘D)") { model.disconnect() }
.font(.caption)
.keyboardShortcut("d", modifiers: .command)
@@ -53,9 +53,11 @@ final class SessionModel: ObservableObject {
@Published var fps = 0
@Published var mbps = 0.0
@Published var totalFrames = 0
/// Mirrors StreamView's capture state (it owns the input capture; this drives the
/// HUD's "click to capture" / "⌘⎋ releases" hint).
@Published var mouseCaptured = false
let meter = FrameMeter()
private var inputCapture: InputCapture?
private var statsTimer: Timer?
var isBusy: Bool { phase != .idle }
@@ -118,8 +120,6 @@ final class SessionModel: ObservableObject {
}
func disconnect() {
inputCapture?.stop()
inputCapture = nil
statsTimer?.invalidate()
statsTimer = nil
if let conn = connection {
@@ -132,6 +132,7 @@ final class SessionModel: ObservableObject {
phase = .idle
fps = 0
mbps = 0
mouseCaptured = false
}
/// Called (via the main actor) when the pump hits end-of-session.
@@ -143,11 +144,10 @@ final class SessionModel: ObservableObject {
}
private func beginStreaming() {
guard let conn = connection else { return }
guard connection != nil else { return }
// Input capture itself is owned by StreamView (engaged by the captureEnabled
// flip this phase change causes, released/re-engaged by the user from there).
phase = .streaming
let capture = InputCapture(connection: conn)
capture.start()
inputCapture = capture
}
private func startStatsTimer() {
@@ -14,6 +14,11 @@
// didResignActive and on stop(). All GC handlers and notifications fire on the main
// queue (the framework default), so the mutable state here needs no locking.
//
// Forwarding is gated by `forwarding` (driven by StreamLayerView's capture state): the
// handlers stay attached for the whole session, but while the user has released capture
// (⌘⎋, focus loss) nothing reaches the host and key events travel the responder chain
// normally. Everything held is flushed host-side on each transition to released.
//
// GCMouse.current/GCKeyboard.coalesced are process-global singletons with one handler
// slot each: only one InputCapture can be live per process. `activeCapture` tracks
// ownership so a stale capture's stop() can't clobber a newer one's handlers.
@@ -40,14 +45,66 @@ public final class InputCapture {
private var residualScrollY: Float = 0
private var pressedVKs: Set<UInt32> = []
private var pressedButtons: Set<UInt32> = []
/// One-shot: the left click that engaged capture belongs to the local UI — GC sees
/// it at the HID layer regardless, so its press AND release are dropped here.
private var suppressedButton: UInt32?
/// One-shot twin of `suppressedButton` for the ⌘⎋ toggle: the physical Esc also
/// reaches GCKeyboard, racing the NSEvent monitor — latched here so it can't type
/// an Escape into the host in either toggle direction.
private var suppressedVK: UInt32?
/// While true, mouse/keyboard flow to the host and key NSEvents are swallowed
/// locally; while false the user is interacting with the local UI (dragging the
/// window, clicking the HUD) and nothing is forwarded. Main-queue only.
public private(set) var forwarding = false
/// Fired on ⌘⎋ (the capture toggle — detected here so it works in both states; the
/// event itself is swallowed). Main queue.
public var onToggleCapture: (() -> Void)?
/// Fired when a newer InputCapture takes the process-global GC handler slots (the
/// singletons hold ONE handler each): the preempted owner must drop its capture
/// state — its handlers are gone, so it would otherwise sit "captured" with dead
/// input. Main queue.
public var onPreempted: (() -> Void)?
public init(connection: PunktfunkConnection) {
self.connection = connection
}
/// Opens or closes the forwarding gate; the GC handlers are not detached either way.
///
/// - Parameters:
///   - on: `true` opens the gate, `false` closes it.
///   - suppressClick: Marks an open as click-driven: button 1 is latched in
///     `suppressedButton` so that click's press/release are not forwarded. Opening
///     without it clears any existing latch.
///
/// Closing a gate that is currently open calls `releaseAll()` first (the flush of held
/// keys/buttons) and clears the click latch; closing an already-closed gate is a no-op.
public func setForwarding(_ on: Bool, suppressClick: Bool = false) {
    guard on else {
        // Only a real open -> closed transition flushes and resets the latch.
        guard forwarding else { return }
        releaseAll()
        forwarding = false
        suppressedButton = nil
        return
    }
    forwarding = true
    if suppressClick {
        suppressedButton = 1
    } else {
        suppressedButton = nil
    }
}
/// Clears the engage-click latch once that click's NSEvent mouseUp has been processed.
/// Backstop for the GC-vs-NSEvent ordering in which both halves of the click landed
/// before mouseDown armed the latch — left armed, it would eat the next real click.
public func endClickSuppression() {
    suppressedButton = .none
}
/// Begin forwarding the current (and future) mouse/keyboard to the host. Steals the
/// global GC handler slots from any previous capture (one live capture per process).
/// global GC handler slots from any previous capture (one live capture per process),
/// notifying it via `onPreempted` so its owner releases its capture state.
public func start() {
if let previous = Self.activeCapture, previous !== self {
// Drop the previous owner's device lists first: its stop() must not be able
// to nil out the handler slots this capture is about to claim.
previous.mice.removeAll()
previous.keyboards.removeAll()
previous.onPreempted?()
}
Self.activeCapture = self
if let mouse = GCMouse.current { attach(mouse: mouse) }
if let keyboard = GCKeyboard.coalesced { attach(keyboard: keyboard) }
@@ -67,15 +124,21 @@ public final class InputCapture {
) { [weak self] _ in
self?.releaseAll()
})
// GC reads the HID state directly — the NSEvents still travel the responder
// chain, where every unhandled keyDown makes NSWindow beep ("invalid input").
// Swallow key events while captured, EXCEPT ⌘-combos: those stay local (the
// HUD's ⌘D disconnect, ⌘Q, …) in addition to reaching the host via GC.
// ⌘⎋ — the capture toggle — is detected here so it works in both states. ONLY
// that one combo is intercepted: swallowing keys wholesale at the monitor level
// risks starving GC's own delivery, so the no-beep behavior lives in
// StreamLayerView (first responder consumes keyDown/keyUp while captured).
keyEventMonitor = NSEvent.addLocalMonitorForEvents(
matching: [.keyDown, .keyUp]
) { event in
event.modifierFlags.intersection(.deviceIndependentFlagsMask).contains(.command)
? event : nil
matching: [.keyDown]
) { [weak self] event in
guard let self else { return event }
let flags = event.modifierFlags.intersection(.deviceIndependentFlagsMask)
if event.keyCode == 53 /* Esc */, flags == .command {
self.suppressedVK = 0x1B // the same physical Esc is en route via GC
self.onToggleCapture?()
return nil
}
return event
}
}
@@ -126,6 +189,11 @@ public final class InputCapture {
}
private func sendButton(_ button: UInt32, pressed: Bool) {
guard forwarding else { return }
if button == suppressedButton {
if !pressed { suppressedButton = nil } // capture click over stop suppressing
return
}
if pressed {
pressedButtons.insert(button)
} else {
@@ -140,7 +208,7 @@ public final class InputCapture {
else { return }
mice.append(mouse)
input.mouseMovedHandler = { [weak self] _, dx, dy in
guard let self else { return }
guard let self, self.forwarding else { return }
// GC gives +y up; the host expects screen-space (+y down).
let fx = dx + self.residualX
let fy = -dy + self.residualY
@@ -170,7 +238,7 @@ public final class InputCapture {
}
}
input.scroll.valueChangedHandler = { [weak self] _, x, y in
guard let self else { return }
guard let self, self.forwarding else { return }
// WHEEL_DELTA(120) per notch; positive = up / right (Moonlight's convention).
let fy = y * 120 + self.residualScrollY
let fx = x * 120 + self.residualScrollX
@@ -188,6 +256,18 @@ public final class InputCapture {
keyboards.append(keyboard)
keyboard.keyboardInput?.keyChangedHandler = { [weak self] _, _, keyCode, pressed in
guard let self, let vk = Self.hidToVK[keyCode.rawValue] else { return }
// The ⌘⎋ toggle's Esc — checked before the forwarding gate, because in the
// engage direction forwarding is already true when this fires.
if vk == self.suppressedVK {
if !pressed { self.suppressedVK = nil }
return
}
guard self.forwarding else { return }
// Release direction of the toggle: GC's Esc-down can beat the NSEvent
// monitor — never type Esc into the host while ⌘ is held (⌘⎋ is reserved).
if vk == 0x1B, self.pressedVKs.contains(0x5B) || self.pressedVKs.contains(0x5C) {
return
}
if pressed {
self.pressedVKs.insert(vk)
} else {
@@ -5,6 +5,13 @@
// zero-copy on Apple silicon. Stage 2 (explicit VTDecompressionSession + CAMetalLayer)
// replaces this when we start tuning frame pacing / measuring glass-to-glass.
//
// The view also owns the input-capture state machine (Moonlight-style): capture is a
// deliberate, reversible state engaged when the stream starts and when the user clicks
// into the video, released by ⌘⎋ or focus loss, and NEVER engaged by mere app
// activation (the click that activates the window may be a title-bar drag or a resize —
// warping the cursor there is exactly the intrusiveness this design removes). While
// released, nothing is forwarded to the host and the local cursor is free.
//
// macOS-first (NSViewRepresentable); the iOS variant is the same layer under
// UIViewRepresentable.
@@ -13,10 +20,10 @@ import AppKit
import AVFoundation
import SwiftUI
/// Hides the LOCAL cursor while streaming. The host renders its own cursor, and the local
/// Hides the LOCAL cursor while captured. The host renders its own cursor, and the local
/// one both diverges from it (the host applies acceleration/clamping to our raw deltas)
/// and can wander out of the window a click there would focus another app. So while the
/// stream has focus we do what Moonlight does: warp the cursor into the view, freeze it
/// and can wander out of the window — a click there would focus another app. So while
/// captured we do what Moonlight does: warp the cursor into the view, freeze it
/// (`CGAssociateMouseAndMouseCursorPosition(false)` — GCMouse still delivers raw HID
/// deltas), and hide it. hide/unhide and associate are balanced via `captured`.
private final class CursorCapture {
@@ -44,34 +51,41 @@ private final class CursorCapture {
public struct StreamView: NSViewRepresentable {
private let connection: PunktfunkConnection
private let capturesCursor: Bool
private let captureEnabled: Bool
private let onCaptureChange: ((Bool) -> Void)?
private let onFrame: (@Sendable (AccessUnit) -> Void)?
private let onSessionEnd: (@Sendable () -> Void)?
/// `onFrame`/`onSessionEnd` fire on the pump thread — hop to the main actor for UI.
/// `capturesCursor: false` keeps the local cursor usable while UI (e.g. a trust
/// prompt) is layered over the stream; flip it to true to enter capture.
/// `captureEnabled: false` disables input capture entirely while UI (e.g. a trust
/// prompt) is layered over the stream; flipping it to true auto-engages capture
/// once. `onCaptureChange` (main thread) reports engage/release — drive the HUD's
/// "click to capture" / "⌘⎋ releases" hint with it.
public init(
connection: PunktfunkConnection,
capturesCursor: Bool = true,
captureEnabled: Bool = true,
onCaptureChange: ((Bool) -> Void)? = nil,
onFrame: (@Sendable (AccessUnit) -> Void)? = nil,
onSessionEnd: (@Sendable () -> Void)? = nil
) {
self.connection = connection
self.capturesCursor = capturesCursor
self.captureEnabled = captureEnabled
self.onCaptureChange = onCaptureChange
self.onFrame = onFrame
self.onSessionEnd = onSessionEnd
}
public func makeNSView(context: Context) -> StreamLayerView {
let view = StreamLayerView()
view.capturesCursor = capturesCursor
view.onCaptureChange = onCaptureChange
view.captureEnabled = captureEnabled
view.start(connection: connection, onFrame: onFrame, onSessionEnd: onSessionEnd)
return view
}
public func updateNSView(_ view: StreamLayerView, context: Context) {
view.capturesCursor = capturesCursor
view.onCaptureChange = onCaptureChange
view.captureEnabled = captureEnabled
// SwiftUI reuses the NSView across state changes repoint the pump only when the
// connection identity actually changed.
if view.connection !== connection {
@@ -106,16 +120,30 @@ public final class StreamLayerView: NSView {
private var token: PumpToken?
public private(set) var connection: PunktfunkConnection?
private let cursorCapture = CursorCapture()
private var inputCapture: InputCapture?
private var appObservers: [NSObjectProtocol] = []
private var windowObservers: [NSObjectProtocol] = []
/// Main-thread only. False = leave the local cursor alone (UI layered over the
/// stream); switching back to true re-enters capture immediately.
public var capturesCursor = true {
/// Whether input capture is currently engaged (cursor hidden+frozen, mouse/keyboard
/// forwarded). Main-thread only.
public private(set) var captured = false
/// One-shot auto-engage request (stream start, trust confirmed) — attempted as soon
/// as the view is in a window with real bounds, then dropped, so it can never fire
/// surprisingly later (e.g. on a resize).
private var pendingAutoCapture = false
/// Reports engage/release on the main thread.
public var onCaptureChange: ((Bool) -> Void)?
/// Main-thread only. False = input capture disabled outright (UI layered over the
/// stream); flipping to true auto-engages once.
public var captureEnabled = true {
didSet {
if capturesCursor {
captureCursorIfStreaming()
guard captureEnabled != oldValue else { return }
if captureEnabled {
requestAutoCapture()
} else {
cursorCapture.release()
releaseCapture()
}
}
}
@@ -125,17 +153,13 @@ public final class StreamLayerView: NSView {
displayLayer.videoGravity = .resizeAspect
layer = displayLayer // layer-hosting: assign before wantsLayer
wantsLayer = true
// The cursor comes back whenever the app loses focus (Cmd+Tab is the escape
// hatch) and is re-captured when the stream regains it.
// Focus loss releases capture. Becoming active does NOT re-engage: the click
// that activates the window may be on the title bar (a drag) or a resize edge —
// the user clicks into the video (or hits ⌘⎋) when they want capture back.
appObservers.append(NotificationCenter.default.addObserver(
forName: NSApplication.didResignActiveNotification, object: nil, queue: .main
) { [weak self] _ in
self?.cursorCapture.release()
})
appObservers.append(NotificationCenter.default.addObserver(
forName: NSApplication.didBecomeActiveNotification, object: nil, queue: .main
) { [weak self] _ in
self?.captureCursorIfStreaming()
self?.releaseCapture()
})
}
@@ -143,18 +167,111 @@ public final class StreamLayerView: NSView {
public override func viewDidMoveToWindow() {
super.viewDidMoveToWindow()
if window == nil {
cursorCapture.release()
} else {
captureCursorIfStreaming()
windowObservers.forEach(NotificationCenter.default.removeObserver(_:))
windowObservers.removeAll()
guard let window else {
releaseCapture()
return
}
// ⌘-key-equivalents stay live while captured, so Settings (⌘,), a new window
// (⌘N), or Minimize (⌘M) can take key status without the APP resigning active —
// capture must release then too, or the new window inherits a hidden, frozen
// cursor and its local typing is double-delivered to the host.
for name in [NSWindow.didResignKeyNotification, NSWindow.didMiniaturizeNotification] {
windowObservers.append(NotificationCenter.default.addObserver(
forName: name, object: window, queue: .main
) { [weak self] _ in
self?.releaseCapture()
})
}
attemptPendingCapture()
}
private func captureCursorIfStreaming() {
guard capturesCursor, token != nil, NSApp.isActive else { return }
cursorCapture.capture(in: self)
public override func layout() {
super.layout()
attemptPendingCapture() // bounds become real here on first presentation
}
// MARK: - Capture state machine
/// A click into the video while capture is enabled but not engaged engages it. That
/// click is local (engagement), so InputCapture suppresses its press/release toward
/// the host and the event is not passed to `super`. In every other state — capture
/// disabled, or already captured (GC forwards those clicks) — the event goes to `super`.
public override func mouseDown(with event: NSEvent) {
    guard captureEnabled, !captured else {
        super.mouseDown(with: event)
        return
    }
    engageCapture(fromClick: true)
}
/// Always accepts the activating click, so a single click into the video from another
/// app captures (one click, not two).
public override func acceptsFirstMouse(for event: NSEvent?) -> Bool {
    return true
}
/// The engage click is complete — drop InputCapture's click-suppression latch (guards
/// against GC delivering both halves of the click before our mouseDown), then hand the
/// event to `super`.
public override func mouseUp(with event: NSEvent) {
    if let capture = inputCapture {
        capture.endClickSuppression()
    }
    super.mouseUp(with: event)
}
// While captured, the view is first responder and consumes key events — GC delivers
// them to the host independently, and consuming here stops the responder chain's
// "unhandled keyDown" beep without touching the event stream GC may rely on.
// ⌘-combos arrive via performKeyEquivalent instead and stay fully functional (⌘D).
public override var acceptsFirstResponder: Bool {
    return true
}
/// Consumes the key-down while captured (it is not passed up the responder chain);
/// otherwise defers to `super`.
public override func keyDown(with event: NSEvent) {
    guard !captured else { return }
    super.keyDown(with: event)
}
/// Consumes the key-up while captured (it is not passed up the responder chain);
/// otherwise defers to `super`.
public override func keyUp(with event: NSEvent) {
    guard !captured else { return }
    super.keyUp(with: event)
}
/// Arms the one-shot auto-engage request and tries to honor it immediately.
private func requestAutoCapture() {
// Order matters: attemptPendingCapture() returns early unless the flag is already set.
pendingAutoCapture = true
attemptPendingCapture()
}
/// Honors a pending auto-engage request once the view is in a window and has a
/// non-zero width. The request is consumed before the engage attempt, so it stays
/// one-shot even when `engageCapture` refuses.
private func attemptPendingCapture() {
    if !pendingAutoCapture || window == nil || !(bounds.width > 0) {
        return
    }
    pendingAutoCapture = false
    engageCapture(fromClick: false)
}
/// Enters the captured state: captures the cursor, opens InputCapture's forwarding
/// gate, marks the view captured, makes it first responder, and reports the change.
///
/// - Parameter fromClick: `true` when a click into the video triggered the engage.
///   A click is explicit intent AND may arrive mid-activation (acceptsFirstMouse:
///   NSApp.isActive / isKeyWindow are still false for a click coming in from another
///   app), so only the non-click paths require already-held key status.
///
/// Does nothing when capture is disabled, already engaged, no pump token is live, or
/// the view is not in a window.
private func engageCapture(fromClick: Bool) {
    guard captureEnabled, !captured, token != nil, window != nil else { return }
    if !fromClick {
        let holdsKeyStatus = NSApp.isActive && window?.isKeyWindow == true
        guard holdsKeyStatus else { return }
    }
    cursorCapture.capture(in: self)
    inputCapture?.setForwarding(true, suppressClick: fromClick)
    captured = true
    window?.makeFirstResponder(self)
    notifyCaptureChange(true)
}
/// Leaves the captured state: releases the cursor, closes InputCapture's forwarding
/// gate, clears `captured`, and reports the change. No-op when not captured.
private func releaseCapture() {
    if !captured { return }
    cursorCapture.release()
    inputCapture?.setForwarding(false)
    captured = false
    notifyCaptureChange(false)
}
/// Reports an engage/release to `onCaptureChange`, deferred by one main-queue hop.
/// Engage/release can run inside a SwiftUI update pass (captureEnabled flips in
/// updateNSView; release in dismantleNSView) — publishing model state synchronously
/// there is undefined behavior, so the callback is deferred a runloop turn.
/// The handler installed at call time is the one that gets invoked; with no handler
/// installed nothing is scheduled.
private func notifyCaptureChange(_ captured: Bool) {
    guard let handler = onCaptureChange else { return }
    DispatchQueue.main.async {
        handler(captured)
    }
}
// MARK: - Pump
/// Pump thread: pull AUs from the connection, wrap, enqueue. The first IDR yields the
/// format description; non-IDR AUs before it are dropped (the host opens with an IDR).
public func start(
@@ -169,6 +286,27 @@ public final class StreamLayerView: NSView {
let layer = displayLayer
layer.flush() // drop any frames a previous connection left queued
// The view owns the session's input capture: handlers attach now, but nothing is
// forwarded until capture engages (captureEnabled + auto-engage or a click).
let capture = InputCapture(connection: connection)
capture.onToggleCapture = { [weak self] in
// The monitor is app-wide — only the key window's stream owns the toggle
// (two stream windows would otherwise flip each other's capture).
guard let self, self.window?.isKeyWindow == true else { return }
if self.captured {
self.releaseCapture()
} else {
self.engageCapture(fromClick: false)
}
}
capture.onPreempted = { [weak self] in
// A newer session took the GC handler slots — staying "captured" here would
// be a cursor trap with dead input.
self?.releaseCapture()
}
capture.start()
inputCapture = capture
let thread = Thread {
var format: CMVideoFormatDescription?
while token.isLive {
@@ -203,13 +341,15 @@ public final class StreamLayerView: NSView {
thread.name = "punktfunk-pump"
thread.qualityOfService = .userInteractive
thread.start()
captureCursorIfStreaming()
requestAutoCapture() // entering a session is the deliberate "capture me" moment
}
/// Stop pumping ( one poll timeout). Does not close the connection that stays with
/// whoever owns it (PunktfunkConnection.close() is safe alongside a draining pump).
public func stop() {
cursorCapture.release()
releaseCapture()
inputCapture?.stop()
inputCapture = nil
token?.cancel()
token = nil
connection = nil
@@ -217,6 +357,7 @@ public final class StreamLayerView: NSView {
/// Tears down on deallocation: unregisters the app-level and window-level notification
/// observers, then cancels the pump token if one is still set.
deinit {
    let center = NotificationCenter.default
    for observer in appObservers {
        center.removeObserver(observer)
    }
    for observer in windowObservers {
        center.removeObserver(observer)
    }
    token?.cancel()
}
}