feat(apple): explicit input-capture state machine — no more cursor grabs on window chrome
ci / rust (push) Has been cancelled

Capture used to engage whenever the app became active, so the click that activates the
window — on the title bar (a drag) or a resize edge — got the cursor warped away
mid-gesture, and raw deltas kept streaming to the host while the user fought the window.
Reworked Moonlight-style, with capture as a deliberate, reversible state owned by
StreamLayerView:

- Engage: automatically once when the stream starts / trust is confirmed (one-shot, can
  never fire surprisingly later), or by clicking into the video (that click's
  press/release are suppressed toward the host; acceptsFirstMouse makes it one click
  from another app). NEVER on app re-activation.
- Release: ⌘⎋ (toggles, key-window-scoped), focus loss — now including same-app window
  switches (⌘, / ⌘N / ⌘M resign key without resigning the app; previously the new
  window inherited a hidden frozen cursor and its typing was double-delivered to the
  host) — and disconnect.
- While released: nothing is forwarded (InputCapture.forwarding gates the GC handlers;
  held keys/buttons are flushed host-side so nothing sticks), the cursor is free, and
  the HUD (now showing the capture state) is clickable.
- The no-beep behavior moved from the NSEvent monitor to first-responder key
  consumption — swallowing at the monitor risked starving GC's own delivery (the
  "input broken altogether" report). The monitor now only intercepts ⌘⎋.
- Adversarial-review fixes: a second session preempts the previous one cleanly instead
  of leaving it captured with dead GC handlers (onPreempted); the engage click's
  suppression latch can't outlive the click (mouseUp backstop); ⌘⎋'s physical Esc can't
  type into the host in either toggle direction (suppressedVK latch + Esc-while-⌘
  guard); capture callbacks defer out of the SwiftUI update pass.

Validated live against the box: 16185 input datagrams injected during a captured
session (gamescope EIS), title-bar drag/resize free while released, and visible
cursor + typing on a streamed KWin desktop, all user-confirmed.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 22:42:44 +02:00
parent acf44eed5f
commit a4eacabecd
5 changed files with 299 additions and 67 deletions
@@ -5,6 +5,13 @@
// zero-copy on Apple silicon. Stage 2 (explicit VTDecompressionSession + CAMetalLayer)
// replaces this when we start tuning frame pacing / measuring glass-to-glass.
//
// The view also owns the input-capture state machine (Moonlight-style): capture is a
// deliberate, reversible state engaged when the stream starts and when the user clicks
// into the video, released by or focus loss, and NEVER engaged by mere app
// activation (the click that activates the window may be a title-bar drag or a resize
// warping the cursor there is exactly the intrusiveness this design removes). While
// released, nothing is forwarded to the host and the local cursor is free.
//
// macOS-first (NSViewRepresentable); the iOS variant is the same layer under
// UIViewRepresentable.
@@ -13,10 +20,10 @@ import AppKit
import AVFoundation
import SwiftUI
/// Hides the LOCAL cursor while streaming. The host renders its own cursor, and the local
/// Hides the LOCAL cursor while captured. The host renders its own cursor, and the local
/// one both diverges from it (the host applies acceleration/clamping to our raw deltas)
/// and can wander out of the window a click there would focus another app. So while the
/// stream has focus we do what Moonlight does: warp the cursor into the view, freeze it
/// and can wander out of the window a click there would focus another app. So while
/// captured we do what Moonlight does: warp the cursor into the view, freeze it
/// (`CGAssociateMouseAndMouseCursorPosition(false)` GCMouse still delivers raw HID
/// deltas), and hide it. hide/unhide and associate are balanced via `captured`.
private final class CursorCapture {
@@ -44,34 +51,41 @@ private final class CursorCapture {
public struct StreamView: NSViewRepresentable {
private let connection: PunktfunkConnection
private let capturesCursor: Bool
private let captureEnabled: Bool
private let onCaptureChange: ((Bool) -> Void)?
private let onFrame: (@Sendable (AccessUnit) -> Void)?
private let onSessionEnd: (@Sendable () -> Void)?
/// `onFrame`/`onSessionEnd` fire on the pump thread hop to the main actor for UI.
/// `capturesCursor: false` keeps the local cursor usable while UI (e.g. a trust
/// prompt) is layered over the stream; flip it to true to enter capture.
/// `captureEnabled: false` disables input capture entirely while UI (e.g. a trust
/// prompt) is layered over the stream; flipping it to true auto-engages capture
/// once. `onCaptureChange` (main thread) reports engage/release drive the HUD's
/// "click to capture" / " releases" hint with it.
public init(
connection: PunktfunkConnection,
capturesCursor: Bool = true,
captureEnabled: Bool = true,
onCaptureChange: ((Bool) -> Void)? = nil,
onFrame: (@Sendable (AccessUnit) -> Void)? = nil,
onSessionEnd: (@Sendable () -> Void)? = nil
) {
self.connection = connection
self.capturesCursor = capturesCursor
self.captureEnabled = captureEnabled
self.onCaptureChange = onCaptureChange
self.onFrame = onFrame
self.onSessionEnd = onSessionEnd
}
public func makeNSView(context: Context) -> StreamLayerView {
let view = StreamLayerView()
view.capturesCursor = capturesCursor
view.onCaptureChange = onCaptureChange
view.captureEnabled = captureEnabled
view.start(connection: connection, onFrame: onFrame, onSessionEnd: onSessionEnd)
return view
}
public func updateNSView(_ view: StreamLayerView, context: Context) {
view.capturesCursor = capturesCursor
view.onCaptureChange = onCaptureChange
view.captureEnabled = captureEnabled
// SwiftUI reuses the NSView across state changes repoint the pump only when the
// connection identity actually changed.
if view.connection !== connection {
@@ -106,16 +120,30 @@ public final class StreamLayerView: NSView {
private var token: PumpToken?
public private(set) var connection: PunktfunkConnection?
private let cursorCapture = CursorCapture()
private var inputCapture: InputCapture?
private var appObservers: [NSObjectProtocol] = []
private var windowObservers: [NSObjectProtocol] = []
/// Main-thread only. False = leave the local cursor alone (UI layered over the
/// stream); switching back to true re-enters capture immediately.
public var capturesCursor = true {
/// Whether input capture is currently engaged (cursor hidden+frozen, mouse/keyboard
/// forwarded). Main-thread only.
public private(set) var captured = false
/// One-shot auto-engage request (stream start, trust confirmed) attempted as soon
/// as the view is in a window with real bounds, then dropped, so it can never fire
/// surprisingly later (e.g. on a resize).
private var pendingAutoCapture = false
/// Reports engage/release on the main thread.
public var onCaptureChange: ((Bool) -> Void)?
/// Main-thread only. False = input capture disabled outright (UI layered over the
/// stream); flipping to true auto-engages once.
public var captureEnabled = true {
didSet {
if capturesCursor {
captureCursorIfStreaming()
guard captureEnabled != oldValue else { return }
if captureEnabled {
requestAutoCapture()
} else {
cursorCapture.release()
releaseCapture()
}
}
}
@@ -125,17 +153,13 @@ public final class StreamLayerView: NSView {
displayLayer.videoGravity = .resizeAspect
layer = displayLayer // layer-hosting: assign before wantsLayer
wantsLayer = true
// The cursor comes back whenever the app loses focus (Cmd+Tab is the escape
// hatch) and is re-captured when the stream regains it.
// Focus loss releases capture. Becoming active does NOT re-engage: the click
// that activates the window may be on the title bar (a drag) or a resize edge
// the user clicks into the video (or hits ) when they want capture back.
appObservers.append(NotificationCenter.default.addObserver(
forName: NSApplication.didResignActiveNotification, object: nil, queue: .main
) { [weak self] _ in
self?.cursorCapture.release()
})
appObservers.append(NotificationCenter.default.addObserver(
forName: NSApplication.didBecomeActiveNotification, object: nil, queue: .main
) { [weak self] _ in
self?.captureCursorIfStreaming()
self?.releaseCapture()
})
}
@@ -143,18 +167,111 @@ public final class StreamLayerView: NSView {
public override func viewDidMoveToWindow() {
super.viewDidMoveToWindow()
if window == nil {
cursorCapture.release()
} else {
captureCursorIfStreaming()
windowObservers.forEach(NotificationCenter.default.removeObserver(_:))
windowObservers.removeAll()
guard let window else {
releaseCapture()
return
}
// -key-equivalents stay live while captured, so Settings (,), a new window
// (N), or Minimize (M) can take key status without the APP resigning active
// capture must release then too, or the new window inherits a hidden, frozen
// cursor and its local typing is double-delivered to the host.
for name in [NSWindow.didResignKeyNotification, NSWindow.didMiniaturizeNotification] {
windowObservers.append(NotificationCenter.default.addObserver(
forName: name, object: window, queue: .main
) { [weak self] _ in
self?.releaseCapture()
})
}
attemptPendingCapture()
}
private func captureCursorIfStreaming() {
guard capturesCursor, token != nil, NSApp.isActive else { return }
cursorCapture.capture(in: self)
public override func layout() {
super.layout()
attemptPendingCapture() // bounds become real here on first presentation
}
// MARK: - Capture state machine
/// Clicking into the video engages capture; that click is local (engagement), so
/// InputCapture suppresses its press/release toward the host. Clicks while captured
/// are the host's (GC forwards them) nothing to do here.
public override func mouseDown(with event: NSEvent) {
if captureEnabled, !captured {
engageCapture(fromClick: true)
return
}
super.mouseDown(with: event)
}
/// A click from another app counts (one click into the video captures, not two).
public override func acceptsFirstMouse(for event: NSEvent?) -> Bool { true }
/// The engage click is complete drop its suppression latch (see InputCapture;
/// guards against GC delivering both halves of the click before our mouseDown).
public override func mouseUp(with event: NSEvent) {
inputCapture?.endClickSuppression()
super.mouseUp(with: event)
}
// While captured, the view is first responder and consumes key events GC delivers
// them to the host independently, and consuming here stops the responder chain's
// "unhandled keyDown" beep without touching the event stream GC may rely on.
// -combos arrive via performKeyEquivalent instead and stay fully functional (D).
public override var acceptsFirstResponder: Bool { true }
public override func keyDown(with event: NSEvent) {
if captured { return }
super.keyDown(with: event)
}
public override func keyUp(with event: NSEvent) {
if captured { return }
super.keyUp(with: event)
}
private func requestAutoCapture() {
pendingAutoCapture = true
attemptPendingCapture()
}
private func attemptPendingCapture() {
guard pendingAutoCapture, window != nil, bounds.width > 0 else { return }
pendingAutoCapture = false // one shot, even if the engage below is refused
engageCapture(fromClick: false)
}
private func engageCapture(fromClick: Bool) {
// A click is explicit intent AND may arrive mid-activation (acceptsFirstMouse:
// NSApp.isActive / isKeyWindow are still false for the click coming in from
// another app) only the auto-engage paths require already-held key status.
guard captureEnabled, !captured, token != nil, window != nil,
fromClick || (NSApp.isActive && window?.isKeyWindow == true)
else { return }
cursorCapture.capture(in: self)
inputCapture?.setForwarding(true, suppressClick: fromClick)
captured = true
window?.makeFirstResponder(self)
notifyCaptureChange(true)
}
private func releaseCapture() {
guard captured else { return }
cursorCapture.release()
inputCapture?.setForwarding(false)
captured = false
notifyCaptureChange(false)
}
/// Engage/release can run inside a SwiftUI update pass (captureEnabled flips in
/// updateNSView; release in dismantleNSView) publishing model state synchronously
/// there is undefined behavior, so the callback is deferred a runloop turn.
private func notifyCaptureChange(_ captured: Bool) {
guard let onCaptureChange else { return }
DispatchQueue.main.async { onCaptureChange(captured) }
}
// MARK: - Pump
/// Pump thread: pull AUs from the connection, wrap, enqueue. The first IDR yields the
/// format description; non-IDR AUs before it are dropped (the host opens with an IDR).
public func start(
@@ -169,6 +286,27 @@ public final class StreamLayerView: NSView {
let layer = displayLayer
layer.flush() // drop any frames a previous connection left queued
// The view owns the session's input capture: handlers attach now, but nothing is
// forwarded until capture engages (captureEnabled + auto-engage or a click).
let capture = InputCapture(connection: connection)
capture.onToggleCapture = { [weak self] in
// The monitor is app-wide only the key window's stream owns the toggle
// (two stream windows would otherwise flip each other's capture).
guard let self, self.window?.isKeyWindow == true else { return }
if self.captured {
self.releaseCapture()
} else {
self.engageCapture(fromClick: false)
}
}
capture.onPreempted = { [weak self] in
// A newer session took the GC handler slots staying "captured" here would
// be a cursor trap with dead input.
self?.releaseCapture()
}
capture.start()
inputCapture = capture
let thread = Thread {
var format: CMVideoFormatDescription?
while token.isLive {
@@ -203,13 +341,15 @@ public final class StreamLayerView: NSView {
thread.name = "punktfunk-pump"
thread.qualityOfService = .userInteractive
thread.start()
captureCursorIfStreaming()
requestAutoCapture() // entering a session is the deliberate "capture me" moment
}
/// Stop pumping ( one poll timeout). Does not close the connection that stays with
/// whoever owns it (PunktfunkConnection.close() is safe alongside a draining pump).
public func stop() {
cursorCapture.release()
releaseCapture()
inputCapture?.stop()
inputCapture = nil
token?.cancel()
token = nil
connection = nil
@@ -217,6 +357,7 @@ public final class StreamLayerView: NSView {
deinit {
appObservers.forEach(NotificationCenter.default.removeObserver(_:))
windowObservers.forEach(NotificationCenter.default.removeObserver(_:))
token?.cancel()
}
}