diff --git a/crates/punktfunk-host/Cargo.toml b/crates/punktfunk-host/Cargo.toml index c46305e..e5746e2 100644 --- a/crates/punktfunk-host/Cargo.toml +++ b/crates/punktfunk-host/Cargo.toml @@ -124,6 +124,7 @@ windows = { version = "0.62", features = [ "Win32_Graphics_Dxgi_Common", "Win32_Graphics_Direct3D", "Win32_Graphics_Direct3D11", + "Win32_Graphics_Direct3D_Fxc", "Win32_Graphics_Gdi", ] } # Software H.264 encoder (GPU-less path + NVENC fallback). The default `source` feature statically diff --git a/crates/punktfunk-host/src/capture/dxgi.rs b/crates/punktfunk-host/src/capture/dxgi.rs index d2503e3..a540124 100644 --- a/crates/punktfunk-host/src/capture/dxgi.rs +++ b/crates/punktfunk-host/src/capture/dxgi.rs @@ -9,23 +9,42 @@ use super::{CapturedFrame, Capturer, FramePayload, PixelFormat}; use anyhow::{anyhow, bail, Context, Result}; +use std::ffi::c_void; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use windows::core::Interface; +use windows::core::{s, Interface, PCSTR}; use windows::Win32::Foundation::{HMODULE, LUID}; -use windows::Win32::Graphics::Direct3D::{D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0}; +use windows::Win32::Graphics::Direct3D::Fxc::D3DCompile; +use windows::Win32::Graphics::Direct3D::{ + ID3DBlob, D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0, + D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP, +}; use windows::Win32::Graphics::Direct3D11::{ - D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_FLAG, - D3D11_BIND_RENDER_TARGET, D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT, - D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC, - D3D11_USAGE_DEFAULT, D3D11_USAGE_STAGING, + D3D11CreateDevice, ID3D11BlendState, ID3D11Buffer, ID3D11Device, ID3D11DeviceContext, + ID3D11PixelShader, ID3D11RenderTargetView, ID3D11SamplerState, ID3D11ShaderResourceView, + ID3D11Texture2D, ID3D11VertexShader, D3D11_BIND_CONSTANT_BUFFER, 
D3D11_BIND_FLAG, + D3D11_BIND_RENDER_TARGET, D3D11_BIND_SHADER_RESOURCE, D3D11_BLEND_DESC, D3D11_BLEND_INV_DEST_COLOR, + D3D11_BLEND_INV_SRC_ALPHA, D3D11_BLEND_ONE, D3D11_BLEND_OP_ADD, D3D11_BLEND_SRC_ALPHA, + D3D11_BUFFER_DESC, + D3D11_COLOR_WRITE_ENABLE_ALL, D3D11_COMPARISON_NEVER, D3D11_CPU_ACCESS_READ, + D3D11_CPU_ACCESS_WRITE, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_FILTER_MIN_MAG_MIP_POINT, + D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_MAP_WRITE_DISCARD, D3D11_RENDER_TARGET_BLEND_DESC, + D3D11_SAMPLER_DESC, D3D11_SDK_VERSION, D3D11_SUBRESOURCE_DATA, D3D11_TEXTURE2D_DESC, + D3D11_TEXTURE_ADDRESS_CLAMP, D3D11_USAGE_DEFAULT, D3D11_USAGE_DYNAMIC, D3D11_USAGE_STAGING, + D3D11_VIEWPORT, }; use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC}; use windows::Win32::Graphics::Dxgi::{ CreateDXGIFactory1, IDXGIAdapter1, IDXGIFactory1, IDXGIOutput1, IDXGIOutputDuplication, - IDXGIResource, DXGI_ERROR_ACCESS_LOST, DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC, - DXGI_OUTDUPL_FRAME_INFO, + IDXGIResource, DXGI_ERROR_ACCESS_LOST, DXGI_ERROR_DEVICE_REMOVED, DXGI_ERROR_DEVICE_RESET, + DXGI_ERROR_INVALID_CALL, DXGI_ERROR_WAIT_TIMEOUT, DXGI_OUTDUPL_DESC, DXGI_OUTDUPL_FRAME_INFO, + DXGI_OUTDUPL_POINTER_SHAPE_INFO, DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR, + DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR, }; +use windows::Win32::System::StationsAndDesktops::{ + OpenInputDesktop, SetThreadDesktop, DESKTOP_ACCESS_FLAGS, DESKTOP_CONTROL_FLAGS, +}; +use windows::Win32::UI::WindowsAndMessaging::SetCursorPos; /// The Windows capture identity carried out of the SudoVDA backend in /// [`crate::vdisplay::VirtualOutput`]: which adapter + which GDI output to duplicate. @@ -33,8 +52,10 @@ use windows::Win32::Graphics::Dxgi::{ pub struct WinCaptureTarget { /// Packed DXGI adapter LUID (`(HighPart << 32) | (LowPart & 0xffff_ffff)`). pub adapter_luid: i64, - /// The output's GDI device name, e.g. `\\.\DISPLAY3`. + /// The output's GDI device name, e.g. 
`\\.\DISPLAY3`. Can CHANGE across a secure-desktop switch. pub gdi_name: String, + /// Stable SudoVDA target id — re-resolved to the current GDI name on every recovery. + pub target_id: u32, } /// A GPU-resident captured texture (future NVENC-D3D11 zero-copy path). @@ -66,11 +87,471 @@ fn depad_bgra(src: &[u8], pitch: usize, w: usize, h: usize) -> Vec { out } +/// Re-find the live `IDXGIOutput1` for a GDI name across all adapters (the SudoVDA monitor is +/// enumerated under the rendering GPU). Used to recover after ACCESS_LOST, where the cached handle +/// may be stale. +unsafe fn find_output(gdi_name: &str) -> Result<(IDXGIAdapter1, IDXGIOutput1)> { + let factory: IDXGIFactory1 = CreateDXGIFactory1().context("CreateDXGIFactory1")?; + let mut i = 0u32; + while let Ok(a) = factory.EnumAdapters1(i) { + let mut j = 0u32; + while let Ok(o) = a.EnumOutputs(j) { + let od = o.GetDesc()?; + if gdi_name_matches(&od.DeviceName, gdi_name) { + return Ok((a.clone(), o.cast::()?)); + } + j += 1; + } + i += 1; + } + bail!("no DXGI output named {gdi_name} (gone after ACCESS_LOST?)") +} + +/// Create a fresh D3D11 device + context on a specific adapter (driver_type UNKNOWN with an explicit +/// adapter). Used at open and on every ACCESS_LOST: a device created on one desktop cannot sustain a +/// duplication on a *different* desktop (perpetual ACCESS_LOST), so the secure-desktop switch needs a +/// device made while the thread is attached to that desktop. 
+unsafe fn make_device(adapter: &IDXGIAdapter1) -> Result<(ID3D11Device, ID3D11DeviceContext)> { + let mut device: Option = None; + let mut context: Option = None; + D3D11CreateDevice( + adapter, + D3D_DRIVER_TYPE_UNKNOWN, + HMODULE::default(), + D3D11_CREATE_DEVICE_BGRA_SUPPORT, + Some(&[D3D_FEATURE_LEVEL_11_0]), + D3D11_SDK_VERSION, + Some(&mut device), + None, + Some(&mut context), + ) + .context("D3D11CreateDevice")?; + Ok(( + device.context("null D3D11 device")?, + context.context("null D3D11 context")?, + )) +} + +/// Re-find the output, make a fresh device on its adapter, and duplicate it. Used by the ACCESS_LOST +/// recovery to rebuild the whole capture on the current (possibly secure) input desktop. +unsafe fn reopen_duplication( + gdi_name: &str, +) -> Result<( + ID3D11Device, + ID3D11DeviceContext, + IDXGIOutput1, + IDXGIOutputDuplication, +)> { + let (adapter, out) = find_output(gdi_name)?; + let (dev, ctx) = make_device(&adapter)?; + let dupl = out + .DuplicateOutput(&dev) + .context("re-DuplicateOutput after ACCESS_LOST")?; + Ok((dev, ctx, out, dupl)) +} + +/// Park the cursor on a duplicated output. A blank virtual display emits NO Desktop Duplication +/// frames until something changes; a pointer move IS a DDA "change", so this kicks the very first +/// `AcquireNextFrame` loose — and lands the cursor on the display the client is viewing. Two moves +/// to distinct points guarantee an actual move even if the cursor already sat at the center. +/// Follow the current input desktop so duplication spans the normal ↔ Winlogon (secure: login/UAC) +/// desktops. Opening the secure desktop requires SYSTEM; on a non-SYSTEM host this just fails on +/// Winlogon (capture freezes there) — which is why the host relaunches itself as SYSTEM. The HDESK +/// is intentionally leaked: it must stay open while it's the thread's desktop, and switches +/// (lock/unlock/UAC) are rare, so a few handles per session is fine. 
+unsafe fn attach_input_desktop() { + match OpenInputDesktop( + DESKTOP_CONTROL_FLAGS(0), + false, + DESKTOP_ACCESS_FLAGS(0x1000_0000), // GENERIC_ALL + ) { + Ok(desk) => match SetThreadDesktop(desk) { + Ok(()) => tracing::info!("attach_input_desktop: SetThreadDesktop OK"), + Err(e) => { + tracing::warn!(error = %format!("{e:?}"), "attach_input_desktop: SetThreadDesktop FAILED") + } + }, + Err(e) => { + tracing::warn!(error = %format!("{e:?}"), "attach_input_desktop: OpenInputDesktop FAILED") + } + } +} + +unsafe fn nudge_cursor_onto(output: &IDXGIOutput1) { + if let Ok(od) = output.GetDesc() { + let r = od.DesktopCoordinates; + let _ = SetCursorPos(r.left + 8, r.top + 8); + let _ = SetCursorPos((r.left + r.right) / 2, (r.top + r.bottom) / 2); + } +} + +// DXGI Desktop Duplication deliberately EXCLUDES the hardware cursor from the captured surface (the +// OS composites it separately). We capture the cursor shape/position from the frame info and blend it +// back in — on the GPU for the zero-copy path (a CPU readback would stall the 240 fps pipeline). + +const CURSOR_VS: &str = r" +cbuffer Rect : register(b0) { float4 r; }; +struct VOut { float4 pos : SV_POSITION; float2 uv : TEXCOORD0; }; +VOut main(uint vid : SV_VertexID) { + float2 uv = float2((vid == 1 || vid == 3) ? 1.0 : 0.0, (vid >= 2) ? 
1.0 : 0.0); + VOut o; + o.pos = float4(lerp(r.x, r.z, uv.x), lerp(r.y, r.w, uv.y), 0.0, 1.0); + o.uv = uv; + return o; +} +"; + +const CURSOR_PS: &str = r" +Texture2D tx : register(t0); +SamplerState sm : register(s0); +float4 main(float4 pos : SV_POSITION, float2 uv : TEXCOORD0) : SV_TARGET { + return tx.Sample(sm, uv); +} +"; + +unsafe fn compile_shader(src: &str, entry: PCSTR, target: PCSTR) -> Result> { + let mut blob: Option = None; + let mut errs: Option = None; + let r = D3DCompile( + src.as_ptr() as *const c_void, + src.len(), + PCSTR::null(), + None, + None, + entry, + target, + 0, + 0, + &mut blob, + Some(&mut errs), + ); + if r.is_err() { + let msg = errs + .as_ref() + .map(|e| { + let p = e.GetBufferPointer() as *const u8; + String::from_utf8_lossy(std::slice::from_raw_parts(p, e.GetBufferSize())).to_string() + }) + .unwrap_or_default(); + bail!("D3DCompile failed: {msg}"); + } + let blob = blob.context("no shader blob")?; + let p = blob.GetBufferPointer() as *const u8; + Ok(std::slice::from_raw_parts(p, blob.GetBufferSize()).to_vec()) +} + +/// GPU cursor overlay: a tiny shader pipeline that alpha-blends the cursor texture onto the captured +/// frame. Tied to one D3D11 device; rebuilt when the capturer recreates its device on a desktop switch. +struct CursorCompositor { + vs: ID3D11VertexShader, + ps: ID3D11PixelShader, + cbuf: ID3D11Buffer, + blend: ID3D11BlendState, + /// Inversion blend for masked-color (XOR) cursors like the text I-beam: result = white*(1-dest), + /// i.e. it inverts the screen under the cursor so it's visible on any background. 
+ blend_invert: ID3D11BlendState, + sampler: ID3D11SamplerState, + tex: Option<(ID3D11ShaderResourceView, u32, u32)>, // srv + width + height +} + +impl CursorCompositor { + unsafe fn new(device: &ID3D11Device) -> Result { + let vsb = compile_shader(CURSOR_VS, s!("main"), s!("vs_5_0"))?; + let psb = compile_shader(CURSOR_PS, s!("main"), s!("ps_5_0"))?; + let mut vs = None; + device.CreateVertexShader(&vsb, None, Some(&mut vs))?; + let mut ps = None; + device.CreatePixelShader(&psb, None, Some(&mut ps))?; + + let cbd = D3D11_BUFFER_DESC { + ByteWidth: 16, + Usage: D3D11_USAGE_DYNAMIC, + BindFlags: D3D11_BIND_CONSTANT_BUFFER.0 as u32, + CPUAccessFlags: D3D11_CPU_ACCESS_WRITE.0 as u32, + ..Default::default() + }; + let mut cbuf = None; + device.CreateBuffer(&cbd, None, Some(&mut cbuf))?; + + let mut bd = D3D11_BLEND_DESC::default(); + bd.RenderTarget[0] = D3D11_RENDER_TARGET_BLEND_DESC { + BlendEnable: true.into(), + SrcBlend: D3D11_BLEND_SRC_ALPHA, + DestBlend: D3D11_BLEND_INV_SRC_ALPHA, + BlendOp: D3D11_BLEND_OP_ADD, + SrcBlendAlpha: D3D11_BLEND_ONE, + DestBlendAlpha: D3D11_BLEND_INV_SRC_ALPHA, + BlendOpAlpha: D3D11_BLEND_OP_ADD, + RenderTargetWriteMask: D3D11_COLOR_WRITE_ENABLE_ALL.0 as u8, + }; + let mut blend = None; + device.CreateBlendState(&bd, Some(&mut blend))?; + + // Inversion blend: result.rgb = src*(1-dest) + dest*(1-src.a). A white opaque cursor pixel + // (src=1,a=1) -> 1-dest (inverted); a transparent pixel (src=0,a=0) -> dest (unchanged). 
+ let mut bdi = D3D11_BLEND_DESC::default(); + bdi.RenderTarget[0] = D3D11_RENDER_TARGET_BLEND_DESC { + BlendEnable: true.into(), + SrcBlend: D3D11_BLEND_INV_DEST_COLOR, + DestBlend: D3D11_BLEND_INV_SRC_ALPHA, + BlendOp: D3D11_BLEND_OP_ADD, + SrcBlendAlpha: D3D11_BLEND_ONE, + DestBlendAlpha: D3D11_BLEND_INV_SRC_ALPHA, + BlendOpAlpha: D3D11_BLEND_OP_ADD, + RenderTargetWriteMask: D3D11_COLOR_WRITE_ENABLE_ALL.0 as u8, + }; + let mut blend_invert = None; + device.CreateBlendState(&bdi, Some(&mut blend_invert))?; + + let sd = D3D11_SAMPLER_DESC { + Filter: D3D11_FILTER_MIN_MAG_MIP_POINT, + AddressU: D3D11_TEXTURE_ADDRESS_CLAMP, + AddressV: D3D11_TEXTURE_ADDRESS_CLAMP, + AddressW: D3D11_TEXTURE_ADDRESS_CLAMP, + ComparisonFunc: D3D11_COMPARISON_NEVER, + MaxLOD: f32::MAX, + ..Default::default() + }; + let mut sampler = None; + device.CreateSamplerState(&sd, Some(&mut sampler))?; + + Ok(Self { + vs: vs.context("vs")?, + ps: ps.context("ps")?, + cbuf: cbuf.context("cbuf")?, + blend: blend.context("blend")?, + blend_invert: blend_invert.context("blend_invert")?, + sampler: sampler.context("sampler")?, + tex: None, + }) + } + + unsafe fn set_shape(&mut self, device: &ID3D11Device, bgra: &[u8], w: u32, h: u32) -> Result<()> { + let desc = D3D11_TEXTURE2D_DESC { + Width: w, + Height: h, + MipLevels: 1, + ArraySize: 1, + Format: DXGI_FORMAT_B8G8R8A8_UNORM, + SampleDesc: DXGI_SAMPLE_DESC { + Count: 1, + Quality: 0, + }, + Usage: D3D11_USAGE_DEFAULT, + BindFlags: D3D11_BIND_SHADER_RESOURCE.0 as u32, + ..Default::default() + }; + let init = D3D11_SUBRESOURCE_DATA { + pSysMem: bgra.as_ptr() as *const c_void, + SysMemPitch: w * 4, + SysMemSlicePitch: 0, + }; + let mut tex: Option = None; + device.CreateTexture2D(&desc, Some(&init), Some(&mut tex))?; + let tex = tex.context("cursor tex")?; + let mut srv = None; + device.CreateShaderResourceView(&tex, None, Some(&mut srv))?; + self.tex = Some((srv.context("cursor srv")?, w, h)); + Ok(()) + } + + /// Blend the cursor onto `rtv` (a 
render-target view of the captured frame) at frame pixel (cx,cy). + #[allow(clippy::too_many_arguments)] + unsafe fn draw( + &self, + ctx: &ID3D11DeviceContext, + rtv: &ID3D11RenderTargetView, + fw: u32, + fh: u32, + cx: i32, + cy: i32, + invert: bool, + ) { + let (srv, cw, ch) = match &self.tex { + Some(t) => t, + None => return, + }; + let x0 = (cx as f32 / fw as f32) * 2.0 - 1.0; + let x1 = ((cx + *cw as i32) as f32 / fw as f32) * 2.0 - 1.0; + let y0 = 1.0 - (cy as f32 / fh as f32) * 2.0; + let y1 = 1.0 - ((cy + *ch as i32) as f32 / fh as f32) * 2.0; + let rect = [x0, y0, x1, y1]; + let mut mapped = D3D11_MAPPED_SUBRESOURCE::default(); + if ctx + .Map(&self.cbuf, 0, D3D11_MAP_WRITE_DISCARD, 0, Some(&mut mapped)) + .is_ok() + { + std::ptr::copy_nonoverlapping(rect.as_ptr(), mapped.pData as *mut f32, 4); + ctx.Unmap(&self.cbuf, 0); + } + let vp = D3D11_VIEWPORT { + TopLeftX: 0.0, + TopLeftY: 0.0, + Width: fw as f32, + Height: fh as f32, + MinDepth: 0.0, + MaxDepth: 1.0, + }; + ctx.RSSetViewports(Some(&[vp])); + ctx.OMSetRenderTargets(Some(&[Some(rtv.clone())]), None); + let blend = if invert { &self.blend_invert } else { &self.blend }; + ctx.OMSetBlendState(blend, Some(&[0.0; 4]), 0xffff_ffff); + ctx.VSSetShader(&self.vs, None); + ctx.PSSetShader(&self.ps, None); + ctx.VSSetConstantBuffers(0, Some(&[Some(self.cbuf.clone())])); + ctx.PSSetShaderResources(0, Some(&[Some(srv.clone())])); + ctx.PSSetSamplers(0, Some(&[Some(self.sampler.clone())])); + ctx.IASetInputLayout(None); + ctx.IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); + ctx.Draw(4, 0); + // Unbind the render target so the next frame's CopyResource into this texture is unobstructed. + ctx.OMSetRenderTargets(Some(&[None]), None); + } +} + +/// Convert a DXGI pointer shape (color / masked-color / monochrome) into top-down BGRA. 
+fn convert_pointer_shape(buf: &[u8], si: &DXGI_OUTDUPL_POINTER_SHAPE_INFO) -> Option<(Vec, u32, u32)> { + let w = si.Width as usize; + let pitch = si.Pitch as usize; + if w == 0 || pitch == 0 { + return None; + } + // Type is a u32 (newtype constants compared via .0). + if si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_COLOR.0 as u32 { + // Straight 32bpp BGRA with a real alpha channel. + let h = si.Height as usize; + if buf.len() < pitch * h { + return None; + } + let mut out = vec![0u8; w * h * 4]; + for y in 0..h { + for x in 0..w { + let s = y * pitch + x * 4; + let d = (y * w + x) * 4; + out[d] = buf[s]; + out[d + 1] = buf[s + 1]; + out[d + 2] = buf[s + 2]; + out[d + 3] = buf[s + 3]; + } + } + Some((out, w as u32, h as u32)) + } else if si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR.0 as u32 { + // 32bpp where the alpha byte is a MASK, not an alpha: 0x00 = opaque (copy RGB), 0xFF = XOR + // with the screen. The text I-beam is this type — surround = XOR-with-black (a no-op, must be + // transparent), bar = XOR-with-white (inverts the screen so it shows on any background). 
+ // Compositing uses the INVERSION blend (see CursorCompositor) when `cursor_invert` is set, so: + // mask 0x00 -> opaque RGB (rendered as a plain pixel — rare for I-beams) + // mask 0xFF, RGB == 0 -> transparent (XOR with black = unchanged) + // mask 0xFF, RGB != 0 -> WHITE opaque (the inversion blend turns this into 1-dest) + let h = si.Height as usize; + if buf.len() < pitch * h { + return None; + } + let mut out = vec![0u8; w * h * 4]; + for y in 0..h { + for x in 0..w { + let s = y * pitch + x * 4; + let d = (y * w + x) * 4; + let (b, g, r, mask) = (buf[s], buf[s + 1], buf[s + 2], buf[s + 3]); + if mask == 0 { + out[d] = b; + out[d + 1] = g; + out[d + 2] = r; + out[d + 3] = 255; + } else if b == 0 && g == 0 && r == 0 { + out[d + 3] = 0; // XOR with black = no change → transparent + } else { + out[d] = 255; // inverting pixel → white; inversion blend makes it 1-dest + out[d + 1] = 255; + out[d + 2] = 255; + out[d + 3] = 255; + } + } + } + Some((out, w as u32, h as u32)) + } else { + // Monochrome: top half = AND mask, bottom half = XOR mask, 1 bpp. + let h = (si.Height / 2) as usize; + if buf.len() < pitch * h * 2 { + return None; + } + let bit = |row: usize, x: usize| (buf[row * pitch + x / 8] >> (7 - (x % 8))) & 1; + let mut out = vec![0u8; w * h * 4]; + for y in 0..h { + for x in 0..w { + let and_bit = bit(y, x); + let xor_bit = bit(y + h, x); + let (b, g, r, a) = match (and_bit, xor_bit) { + (0, 0) => (0, 0, 0, 255), // opaque black + (0, 1) => (255, 255, 255, 255), // opaque white + (1, 0) => (0, 0, 0, 0), // transparent + _ => (0, 0, 0, 255), // invert -> approximate as black + }; + let d = (y * w + x) * 4; + out[d] = b; + out[d + 1] = g; + out[d + 2] = r; + out[d + 3] = a; + } + } + Some((out, w as u32, h as u32)) + } +} + +/// CPU src-over alpha blend of a BGRA cursor into a BGRA frame buffer (software-encode path). When +/// `invert` is set (masked-color / XOR cursor), a covered pixel inverts the frame instead (true XOR). 
+#[allow(clippy::too_many_arguments)] +fn blend_cursor_cpu( + frame: &mut [u8], + fw: u32, + fh: u32, + cur: &[u8], + cw: u32, + ch: u32, + cx: i32, + cy: i32, + invert: bool, +) { + let (fw, fh, cw, ch) = (fw as i32, fh as i32, cw as i32, ch as i32); + for y in 0..ch { + let fy = cy + y; + if fy < 0 || fy >= fh { + continue; + } + for x in 0..cw { + let fx = cx + x; + if fx < 0 || fx >= fw { + continue; + } + let s = ((y * cw + x) * 4) as usize; + let a = cur[s + 3] as u32; + if a == 0 { + continue; + } + let d = ((fy * fw + fx) * 4) as usize; + if invert { + for k in 0..3 { + frame[d + k] = 255 - frame[d + k]; + } + } else { + for k in 0..3 { + frame[d + k] = + ((cur[s + k] as u32 * a + frame[d + k] as u32 * (255 - a)) / 255) as u8; + } + } + } + } +} + pub struct DuplCapturer { device: ID3D11Device, context: ID3D11DeviceContext, output: IDXGIOutput1, dupl: IDXGIOutputDuplication, + /// The output's GDI name — re-resolved on ACCESS_LOST (a mode change can stale the cached handle). + gdi_name: String, + /// Stable SudoVDA target id, used to re-resolve `gdi_name` during recovery. + target_id: u32, width: u32, height: u32, refresh_hz: u32, @@ -78,6 +559,11 @@ pub struct DuplCapturer { holding_frame: bool, active: AtomicBool, timeout_ms: u32, + /// The first AcquireNextFrame after a (re)DuplicateOutput gets a generous timeout — the initial + /// desktop snapshot of a large surface can take longer than the per-frame budget. + first_frame: bool, + dbg_timeouts: u32, + dbg_lost: u32, last: Option>, /// GPU-output mode (zero-copy → NVENC): produce `FramePayload::D3d11` instead of CPU BGRA. /// Selected by `PUNKTFUNK_ENCODER=nvenc` so the capturer's output matches the encoder's input. @@ -86,6 +572,17 @@ pub struct DuplCapturer { /// surface is transient and released each frame). gpu_copy: Option, have_gpu_frame: bool, + /// GPU cursor overlay (rebuilt on device recreate). `None` until the first composite. 
+ cursor: Option, + /// Last cursor shape as BGRA (kept device-independent so it survives a device recreate). + cursor_shape: Option<(Vec, u32, u32)>, + cursor_pos: (i32, i32), + cursor_visible: bool, + /// Cursor shape changed → re-upload to the GPU texture before the next composite. + cursor_dirty: bool, + /// Current cursor is masked-color (XOR) → composite with the inversion blend. + cursor_invert: bool, + dbg_cursor: u64, _keepalive: Box, } // COM objects used only from the one thread that owns the capturer (the encode thread). @@ -197,10 +694,14 @@ impl DuplCapturer { .context("D3D11CreateDevice")?; let device = device.context("null D3D11 device")?; let context = context.context("null D3D11 context")?; - // 3) duplicate the output. + // 3) duplicate the output. Attach to the current input desktop first (as SYSTEM this can + // be the Winlogon secure desktop) so a session that starts at the lock/login screen works. + attach_input_desktop(); let dupl = output .DuplicateOutput(&device) .context("DuplicateOutput (already duplicated by another app?)")?; + // Kick the first frame loose: a blank virtual display is otherwise change-less. 
+ nudge_cursor_onto(&output); let dd: DXGI_OUTDUPL_DESC = dupl.GetDesc(); let (width, height) = (dd.ModeDesc.Width, dd.ModeDesc.Height); let refresh_hz = preferred @@ -236,6 +737,8 @@ impl DuplCapturer { context, output, dupl, + target_id: target.target_id, + gdi_name: target.gdi_name, width, height, refresh_hz, @@ -243,10 +746,20 @@ impl DuplCapturer { holding_frame: false, active: AtomicBool::new(false), timeout_ms, + first_frame: true, + dbg_timeouts: 0, + dbg_lost: 0, last: None, gpu_mode, gpu_copy: None, have_gpu_frame: false, + cursor: None, + cursor_shape: None, + cursor_pos: (0, 0), + cursor_visible: false, + cursor_dirty: false, + cursor_invert: false, + dbg_cursor: 0, _keepalive: keepalive, }) } @@ -306,16 +819,137 @@ impl DuplCapturer { Ok(()) } + /// Pull cursor position/visibility/shape out of the frame info (the HW cursor is NOT in the frame). + unsafe fn update_cursor(&mut self, info: &DXGI_OUTDUPL_FRAME_INFO) { + if info.LastMouseUpdateTime != 0 { + self.cursor_pos = (info.PointerPosition.Position.x, info.PointerPosition.Position.y); + self.cursor_visible = info.PointerPosition.Visible.as_bool(); + } + if info.PointerShapeBufferSize > 0 { + let mut buf = vec![0u8; info.PointerShapeBufferSize as usize]; + let mut required = 0u32; + let mut si = DXGI_OUTDUPL_POINTER_SHAPE_INFO::default(); + if self + .dupl + .GetFramePointerShape( + info.PointerShapeBufferSize, + buf.as_mut_ptr() as *mut c_void, + &mut required, + &mut si, + ) + .is_ok() + { + if let Some(shape) = convert_pointer_shape(&buf, &si) { + tracing::info!( + shape_type = si.Type, + size = format!("{}x{}", shape.1, shape.2), + "cursor shape captured" + ); + self.cursor_invert = + si.Type == DXGI_OUTDUPL_POINTER_SHAPE_TYPE_MASKED_COLOR.0 as u32; + self.cursor_shape = Some(shape); + self.cursor_dirty = true; + } + } + } + } + + /// Composite the cursor onto the GPU frame texture (zero-copy path). 
+ unsafe fn composite_cursor_gpu(&mut self, gpu: &ID3D11Texture2D) -> Result<()> { + self.dbg_cursor += 1; + if self.dbg_cursor % 240 == 1 { + tracing::debug!( + visible = self.cursor_visible, + pos = format!("{:?}", self.cursor_pos), + shape = self.cursor_shape.as_ref().map(|(_, w, h)| format!("{w}x{h}")), + "cursor state" + ); + } + if !self.cursor_visible || self.cursor_shape.is_none() { + return Ok(()); + } + if self.cursor.is_none() { + self.cursor = Some(CursorCompositor::new(&self.device)?); + self.cursor_dirty = true; // fresh device → must (re)upload the shape texture + } + if self.cursor_dirty { + if let Some((bgra, w, h)) = &self.cursor_shape { + self.cursor + .as_mut() + .unwrap() + .set_shape(&self.device, bgra, *w, *h)?; + } + self.cursor_dirty = false; + } + let mut rtv: Option = None; + self.device + .CreateRenderTargetView(gpu, None, Some(&mut rtv))?; + let rtv = rtv.context("cursor rtv")?; + let (cx, cy) = self.cursor_pos; + self.cursor.as_ref().unwrap().draw( + &self.context, + &rtv, + self.width, + self.height, + cx, + cy, + self.cursor_invert, + ); + Ok(()) + } + unsafe fn recreate_dupl(&mut self) -> Result<()> { if self.holding_frame { let _ = self.dupl.ReleaseFrame(); self.holding_frame = false; } - self.dupl = self - .output - .DuplicateOutput(&self.device) - .context("re-DuplicateOutput after ACCESS_LOST")?; - Ok(()) + // ACCESS_LOST fires on desktop switches (normal ↔ Winlogon secure: lock/login/UAC) and on the + // mode change we issue at create. Re-attach to the now-current input desktop AND recreate the + // D3D11 device on it: a device made on the previous desktop cannot sustain a duplication on the + // new one (perpetual ACCESS_LOST). The capturer hands the new device out on `FramePayload::D3d11`, + // so NVENC re-inits when it sees it. Retry while the desktop is mid-reconfigure. 
+ let deadline = Instant::now() + Duration::from_millis(12000); + loop { + // The SudoVDA virtual output's GDI name can CHANGE across a secure-desktop topology + // rebuild — the observed failure was searching for the stale \\.\DISPLAYn until the + // deadline and dying ("no DXGI output named ..."). Re-resolve it from the STABLE target + // id each retry so recovery finds the output under its current name. + if let Some(n) = crate::vdisplay::sudovda::resolve_gdi_name(self.target_id) { + self.gdi_name = n; + } + attach_input_desktop(); + match reopen_duplication(&self.gdi_name) { + Ok((dev, ctx, out, dupl)) => { + // A desktop switch can come back at a different size (e.g. the user session applies + // its own resolution on login). Adopt it: update dimensions and drop the staging/gpu + // copies so they reallocate. NVENC re-inits at the new size when it sees the frame. + let dd: DXGI_OUTDUPL_DESC = dupl.GetDesc(); + let (nw, nh) = (dd.ModeDesc.Width, dd.ModeDesc.Height); + if nw != self.width || nh != self.height { + tracing::info!( + old = format!("{}x{}", self.width, self.height), + new = format!("{nw}x{nh}"), + "DXGI duplication size changed across switch" + ); + self.width = nw; + self.height = nh; + self.staging = None; + } + self.device = dev; + self.context = ctx; + self.output = out; + self.dupl = dupl; + self.gpu_copy = None; // stale: belonged to the old device + self.cursor = None; // shaders/textures belonged to the old device; rebuilt on demand + self.have_gpu_frame = false; + self.first_frame = true; + nudge_cursor_onto(&self.output); // re-kick after recovery + return Ok(()); + } + Err(e) if Instant::now() >= deadline => return Err(e), + Err(_) => std::thread::sleep(Duration::from_millis(120)), + } + } } /// Acquire one frame: `Some` on a fresh image, `None` on timeout (no change → caller reuses last). 
@@ -326,14 +960,46 @@ impl DuplCapturer { } let mut info = DXGI_OUTDUPL_FRAME_INFO::default(); let mut res: Option = None; - match self - .dupl - .AcquireNextFrame(self.timeout_ms, &mut info, &mut res) - { - Ok(()) => {} - Err(e) if e.code() == DXGI_ERROR_WAIT_TIMEOUT => return Ok(None), - Err(e) if e.code() == DXGI_ERROR_ACCESS_LOST => { + let timeout = if self.first_frame { 2000 } else { self.timeout_ms }; + match self.dupl.AcquireNextFrame(timeout, &mut info, &mut res) { + Ok(()) => { + if self.first_frame { + tracing::info!(w = self.width, h = self.height, "DXGI first frame acquired"); + self.first_frame = false; + } + self.update_cursor(&info); + } + Err(e) if e.code() == DXGI_ERROR_WAIT_TIMEOUT => { + self.dbg_timeouts += 1; + if self.dbg_timeouts % 40 == 1 { + tracing::warn!( + timeouts = self.dbg_timeouts, + first_frame = self.first_frame, + "DXGI AcquireNextFrame timeout (no desktop change yet)" + ); + } + return Ok(None); + } + // Recoverable losses, ALL handled by rebuilding the duplication (device + re-DuplicateOutput): + // ACCESS_LOST — desktop switch (normal <-> Winlogon secure: lock/login/UAC) or mode change + // INVALID_CALL — the secure->user-desktop switch (post-login) leaves the duplication in a + // state where AcquireNextFrame returns 0x887A0001; recreating recovers it. + // Previously fatal -> the stream dropped the instant the user logged in. + // DEVICE_REMOVED/RESET — GPU TDR / driver reset. + Err(e) + if e.code() == DXGI_ERROR_ACCESS_LOST + || e.code() == DXGI_ERROR_INVALID_CALL + || e.code() == DXGI_ERROR_DEVICE_REMOVED + || e.code() == DXGI_ERROR_DEVICE_RESET => + { + self.dbg_lost += 1; + tracing::warn!( + lost = self.dbg_lost, + code = format!("{:#x}", e.code().0), + "DXGI capture lost (desktop switch?) 
— recovering" + ); self.recreate_dupl()?; + self.first_frame = true; return Ok(None); } Err(e) => return Err(e).context("AcquireNextFrame"), @@ -350,6 +1016,7 @@ impl DuplCapturer { let _ = self.dupl.ReleaseFrame(); self.holding_frame = false; self.have_gpu_frame = true; + self.composite_cursor_gpu(&gpu)?; return Ok(Some(CapturedFrame { width: self.width, height: self.height, @@ -371,10 +1038,25 @@ impl DuplCapturer { let (w, h) = (self.width as usize, self.height as usize); let pitch = map.RowPitch as usize; let src = std::slice::from_raw_parts(map.pData as *const u8, pitch * h); - let tight = depad_bgra(src, pitch, w, h); + let mut tight = depad_bgra(src, pitch, w, h); self.context.Unmap(&staging, 0); let _ = self.dupl.ReleaseFrame(); self.holding_frame = false; + if self.cursor_visible { + if let Some((bgra, cw, ch)) = &self.cursor_shape { + blend_cursor_cpu( + &mut tight, + self.width, + self.height, + bgra, + *cw, + *ch, + self.cursor_pos.0, + self.cursor_pos.1, + self.cursor_invert, + ); + } + } self.last = Some(tight.clone()); Ok(Some(CapturedFrame { width: self.width, @@ -395,7 +1077,9 @@ fn now_ns() -> u64 { impl Capturer for DuplCapturer { fn next_frame(&mut self) -> Result { - let deadline = Instant::now() + Duration::from_secs(10); + // Generous: a secure-desktop switch can take several seconds to settle (re-resolve + recreate + // the duplication up to 12 s). Better a few seconds of frozen-last-frame than dropping the stream. + let deadline = Instant::now() + Duration::from_secs(20); loop { if let Some(f) = unsafe { self.acquire() }? 
{ return Ok(f); @@ -425,7 +1109,7 @@ impl Capturer for DuplCapturer { } if Instant::now() > deadline { return Err(anyhow!( - "no DXGI frame within 10s (SudoVDA monitor not activated by a WDDM GPU?)" + "no DXGI frame within 20s (SudoVDA monitor not activated by a WDDM GPU?)" )); } } diff --git a/crates/punktfunk-host/src/encode/nvenc.rs b/crates/punktfunk-host/src/encode/nvenc.rs index e74f805..1bbf5d5 100644 --- a/crates/punktfunk-host/src/encode/nvenc.rs +++ b/crates/punktfunk-host/src/encode/nvenc.rs @@ -58,6 +58,10 @@ pub struct NvencD3d11Encoder { frame_idx: i64, force_kf: bool, inited: bool, + /// Raw ptr of the D3D11 device this session was initialized with. The capturer recreates the + /// device on a desktop switch (normal ↔ Winlogon secure); when a frame carries a new device we + /// tear down and re-init NVENC against it. + init_device: *mut c_void, } // Raw NVENC handle + COM ptrs; confined to the single encode thread (like the Linux encoder). @@ -88,9 +92,35 @@ impl NvencD3d11Encoder { frame_idx: 0, force_kf: false, inited: false, + init_device: ptr::null_mut(), }) } + /// Tear down the encode session + pooled resources. Reused on a capture-device change (desktop + /// switch) and at Drop. + unsafe fn teardown(&mut self) { + if self.encoder.is_null() { + return; + } + for p in &self.pool { + if !p.map.is_null() { + let _ = (API.unmap_input_resource)(self.encoder, p.map); + } + let _ = (API.unregister_resource)(self.encoder, p.reg); + } + for &bs in &self.bitstreams { + let _ = (API.destroy_bitstream_buffer)(self.encoder, bs); + } + let _ = (API.destroy_encoder)(self.encoder); + self.pool.clear(); + self.bitstreams.clear(); + self.pending.clear(); + self.encoder = ptr::null_mut(); + self.ctx = None; + self.inited = false; + self.next = 0; + } + /// Lazily create the session on the first frame's D3D11 device (so capture + encode share it). 
fn init_session(&mut self, device: &ID3D11Device) -> Result<()> { unsafe { @@ -100,70 +130,112 @@ impl NvencD3d11Encoder { .context("D3D11 immediate context")?, ); - // 1. open the session bound to the D3D11 device. - let mut params = nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS { - version: nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER, - deviceType: nv::NV_ENC_DEVICE_TYPE::NV_ENC_DEVICE_TYPE_DIRECTX, - device: device.as_raw(), - apiVersion: nv::NVENCAPI_VERSION, - ..Default::default() - }; - let mut enc: *mut c_void = ptr::null_mut(); - (API.open_encode_session_ex)(&mut params, &mut enc) - .result_without_string() - .map_err(|e| anyhow!("NVENC open_encode_session_ex: {e:?} (no NVIDIA GPU?)"))?; - self.encoder = enc; - - // 2. seed the P1 + ultra-low-latency preset config. - let mut preset = nv::NV_ENC_PRESET_CONFIG { - version: nv::NV_ENC_PRESET_CONFIG_VER, - presetCfg: nv::NV_ENC_CONFIG { - version: nv::NV_ENC_CONFIG_VER, + // Probe-and-step-down on the bitrate. NVENC rejects `initialize_encoder` with InvalidParam + // when `averageBitRate` exceeds what the GPU's max codec level can express (e.g. a 1.6 Gbps + // request on HEVC). Mirror the Linux host's strategy: try the requested rate, and on + // failure drop to 3/4 and retry, down to a floor — so the connection ALWAYS succeeds at the + // highest bitrate THIS GPU supports (a newer GPU that accepts the request keeps it + // untouched; only an over-asking client gets clamped). Each attempt re-opens a fresh + // session (NVENC has no re-init after a failed initialize). + const FLOOR_BPS: u64 = 10_000_000; + let requested_bps = self.bitrate_bps; + let mut bitrate = self.bitrate_bps; + let enc = loop { + // 1. open the session bound to the D3D11 device. 
+ let mut params = nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS { + version: nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER, + deviceType: nv::NV_ENC_DEVICE_TYPE::NV_ENC_DEVICE_TYPE_DIRECTX, + device: device.as_raw(), + apiVersion: nv::NVENCAPI_VERSION, ..Default::default() - }, - ..Default::default() - }; - (API.get_encode_preset_config_ex)( - enc, - self.codec_guid, - nv::NV_ENC_PRESET_P1_GUID, - nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY, - &mut preset, - ) - .result_without_string() - .map_err(|e| anyhow!("get_encode_preset_config_ex: {e:?}"))?; - let mut cfg = preset.presetCfg; + }; + let mut enc: *mut c_void = ptr::null_mut(); + (API.open_encode_session_ex)(&mut params, &mut enc) + .result_without_string() + .map_err(|e| anyhow!("NVENC open_encode_session_ex: {e:?} (no NVIDIA GPU?)"))?; - // 3. mirror the Linux RC config: CBR, infinite GOP, P-only, ~1-frame VBV. - cfg.gopLength = nv::NVENC_INFINITE_GOPLENGTH; - cfg.frameIntervalP = 1; - cfg.rcParams.rateControlMode = nv::NV_ENC_PARAMS_RC_MODE::NV_ENC_PARAMS_RC_CBR; - let bps = self.bitrate_bps.min(u32::MAX as u64) as u32; - cfg.rcParams.averageBitRate = bps; - cfg.rcParams.maxBitRate = bps; - let vbv = (self.bitrate_bps as f64 / self.fps.max(1) as f64) as u32; - cfg.rcParams.vbvBufferSize = vbv; - cfg.rcParams.vbvInitialDelay = vbv; - - // 4. initialize the encoder. - let mut init = nv::NV_ENC_INITIALIZE_PARAMS { - version: nv::NV_ENC_INITIALIZE_PARAMS_VER, - encodeGUID: self.codec_guid, - presetGUID: nv::NV_ENC_PRESET_P1_GUID, - tuningInfo: nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY, - encodeWidth: self.width, - encodeHeight: self.height, - darWidth: self.width, - darHeight: self.height, - frameRateNum: self.fps, - frameRateDen: 1, - enablePTD: 1, - encodeConfig: &mut cfg, - ..Default::default() - }; - (API.initialize_encoder)(enc, &mut init) + // 2. seed the P1 + ultra-low-latency preset config. 
+ let mut preset = nv::NV_ENC_PRESET_CONFIG { + version: nv::NV_ENC_PRESET_CONFIG_VER, + presetCfg: nv::NV_ENC_CONFIG { + version: nv::NV_ENC_CONFIG_VER, + ..Default::default() + }, + ..Default::default() + }; + if let Err(e) = (API.get_encode_preset_config_ex)( + enc, + self.codec_guid, + nv::NV_ENC_PRESET_P1_GUID, + nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY, + &mut preset, + ) .result_without_string() - .map_err(|e| anyhow!("initialize_encoder: {e:?}"))?; + { + let _ = (API.destroy_encoder)(enc); + return Err(anyhow!("get_encode_preset_config_ex: {e:?}")); + } + let mut cfg = preset.presetCfg; + + // 3. mirror the Linux RC config: CBR, infinite GOP, P-only, ~1-frame VBV. + cfg.gopLength = nv::NVENC_INFINITE_GOPLENGTH; + cfg.frameIntervalP = 1; + cfg.rcParams.rateControlMode = nv::NV_ENC_PARAMS_RC_MODE::NV_ENC_PARAMS_RC_CBR; + let bps = bitrate.min(u32::MAX as u64) as u32; + cfg.rcParams.averageBitRate = bps; + cfg.rcParams.maxBitRate = bps; + // Shrink the VBV with the bitrate — NVENC validates it against the same level ceiling. + let vbv = (bitrate as f64 / self.fps.max(1) as f64) as u32; + cfg.rcParams.vbvBufferSize = vbv; + cfg.rcParams.vbvInitialDelay = vbv; + + // 4. initialize the encoder. 
+ let mut init = nv::NV_ENC_INITIALIZE_PARAMS { + version: nv::NV_ENC_INITIALIZE_PARAMS_VER, + encodeGUID: self.codec_guid, + presetGUID: nv::NV_ENC_PRESET_P1_GUID, + tuningInfo: nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY, + encodeWidth: self.width, + encodeHeight: self.height, + darWidth: self.width, + darHeight: self.height, + frameRateNum: self.fps, + frameRateDen: 1, + enablePTD: 1, + encodeConfig: &mut cfg, + ..Default::default() + }; + match (API.initialize_encoder)(enc, &mut init).result_without_string() { + Ok(()) => { + self.bitrate_bps = bitrate; + break enc; + } + Err(e) if bitrate > FLOOR_BPS => { + let _ = (API.destroy_encoder)(enc); + let next = (bitrate * 3 / 4).max(FLOOR_BPS); + tracing::warn!( + tried_mbps = bitrate / 1_000_000, + next_mbps = next / 1_000_000, + error = ?e, + "NVENC initialize_encoder rejected bitrate — stepping down (GPU codec-level cap)" + ); + bitrate = next; + continue; + } + Err(e) => { + let _ = (API.destroy_encoder)(enc); + return Err(anyhow!("initialize_encoder: {e:?} (even at {} Mbps floor)", FLOOR_BPS / 1_000_000)); + } + } + }; + self.encoder = enc; + if self.bitrate_bps < requested_bps { + tracing::info!( + requested_mbps = requested_bps / 1_000_000, + applied_mbps = self.bitrate_bps / 1_000_000, + "NVENC bitrate capped to this GPU's max for the codec" + ); + } // 5. encoder-owned BGRA texture pool, registered once, + one bitstream per slot. let desc = D3D11_TEXTURE2D_DESC { @@ -222,7 +294,7 @@ impl NvencD3d11Encoder { self.width, self.height, self.fps, - bps / 1_000_000, + self.bitrate_bps / 1_000_000, self.codec_guid ); Ok(()) @@ -238,9 +310,27 @@ impl Encoder for NvencD3d11Encoder { bail!("NVENC D3D11 encoder needs a GPU texture frame (use the software encoder for CPU frames)") } }; + // The capturer recreates its D3D11 device on a desktop switch (secure/Winlogon) and may come + // back at a different resolution (user session applies its own mode on login). 
Re-init when the + // frame arrives on a different device OR at a different size than our session was built on. + let dev_raw = frame.device.as_raw(); + let size_changed = self.inited && (self.width != captured.width || self.height != captured.height); + if self.inited && (self.init_device != dev_raw || size_changed) { + tracing::info!( + device_changed = self.init_device != dev_raw, + size_changed, + new = format!("{}x{}", captured.width, captured.height), + "NVENC: capture device/size changed (desktop switch) — re-initializing session" + ); + unsafe { self.teardown() }; + } if !self.inited { + // Adopt the current frame size so the encoder always matches what the capturer produces. + self.width = captured.width; + self.height = captured.height; let device = frame.device.clone(); self.init_session(&device)?; + self.init_device = dev_raw; } let slot = self.next % POOL; self.next += 1; @@ -336,20 +426,6 @@ impl Encoder for NvencD3d11Encoder { impl Drop for NvencD3d11Encoder { fn drop(&mut self) { - if self.encoder.is_null() { - return; - } - unsafe { - for p in &self.pool { - if !p.map.is_null() { - let _ = (API.unmap_input_resource)(self.encoder, p.map); - } - let _ = (API.unregister_resource)(self.encoder, p.reg); - } - for &bs in &self.bitstreams { - let _ = (API.destroy_bitstream_buffer)(self.encoder, bs); - } - let _ = (API.destroy_encoder)(self.encoder); - } + unsafe { self.teardown() }; } } diff --git a/crates/punktfunk-host/src/vdisplay.rs b/crates/punktfunk-host/src/vdisplay.rs index 56b21a0..9558774 100644 --- a/crates/punktfunk-host/src/vdisplay.rs +++ b/crates/punktfunk-host/src/vdisplay.rs @@ -603,7 +603,7 @@ mod kwin; #[cfg(target_os = "linux")] mod mutter; #[cfg(target_os = "windows")] -mod sudovda; +pub(crate) mod sudovda; #[cfg(target_os = "linux")] mod wlroots; diff --git a/crates/punktfunk-host/src/vdisplay/sudovda.rs b/crates/punktfunk-host/src/vdisplay/sudovda.rs index d6fd296..653092f 100644 --- a/crates/punktfunk-host/src/vdisplay/sudovda.rs 
+++ b/crates/punktfunk-host/src/vdisplay/sudovda.rs @@ -1,4 +1,4 @@ -//! Windows virtual-display backend driving **SudoVDA** (the SudoMaker Virtual Display Adapter — +//! Windows virtual-display backend driving **SudoVDA** (the SudoMaker Virtual Display Adapter — //! the Indirect Display Driver the Apollo Sunshine-fork ships). The Windows analogue of the //! Linux per-compositor backends: [`create`](VirtualDisplay::create) adds a virtual monitor at the //! client's exact `WxH@Hz` (the mode is baked into the ADD IOCTL — no EDID seeding), starts the @@ -27,6 +27,12 @@ use windows::Win32::Devices::Display::{ DISPLAYCONFIG_SOURCE_DEVICE_NAME, QDC_ONLY_ACTIVE_PATHS, }; use windows::Win32::Foundation::{CloseHandle, HANDLE, LUID}; +use windows::Win32::Graphics::Gdi::{ + ChangeDisplaySettingsExW, EnumDisplayDevicesW, EnumDisplaySettingsW, CDS_GLOBAL, CDS_NORESET, + CDS_SET_PRIMARY, CDS_TEST, CDS_TYPE, CDS_UPDATEREGISTRY, DEVMODEW, DISPLAY_DEVICEW, + DISPLAY_DEVICE_ATTACHED_TO_DESKTOP, DISP_CHANGE_SUCCESSFUL, DM_BITSPERPEL, DM_DISPLAYFREQUENCY, + DM_PELSHEIGHT, DM_PELSWIDTH, DM_POSITION, ENUM_CURRENT_SETTINGS, ENUM_DISPLAY_SETTINGS_MODE, +}; use windows::Win32::Storage::FileSystem::{ CreateFileW, FILE_FLAGS_AND_ATTRIBUTES, FILE_SHARE_READ, FILE_SHARE_WRITE, OPEN_EXISTING, }; @@ -97,7 +103,7 @@ unsafe fn ioctl(h: HANDLE, code: u32, input: &[u8], output: &mut [u8]) -> Result /// Resolve the `\\.\DisplayN` GDI name for a SudoVDA target id via the CCD API. Returns `None` /// until the OS activates the target into the desktop topology (needs a real WDDM GPU; on a /// GPU-less box this stays `None` even though ADD succeeded). 
-unsafe fn resolve_gdi_name(target_id: u32) -> Option { +pub(crate) unsafe fn resolve_gdi_name(target_id: u32) -> Option { let mut np = 0u32; let mut nm = 0u32; if GetDisplayConfigBufferSizes(QDC_ONLY_ACTIVE_PATHS, &mut np, &mut nm).is_err() { @@ -133,6 +139,204 @@ unsafe fn resolve_gdi_name(target_id: u32) -> Option { None } +/// Force the freshly-added SudoVDA monitor to the client's exact `WxH@Hz`. The ADD IOCTL only +/// ADVERTISES the mode; Windows otherwise activates an IDD target at a 1280x720 default, so the +/// ACTIVE mode (what DXGI Desktop Duplication captures) must be set explicitly. CDS_TEST first so a +/// mode the driver didn't advertise just leaves the default instead of erroring the session. +fn set_active_mode(gdi_name: &str, mode: Mode) { + let wname: Vec = gdi_name.encode_utf16().chain(std::iter::once(0)).collect(); + + // Enumerate the modes the driver actually advertises for this output and pick the best match for + // the requested RESOLUTION: the exact refresh if present, else the highest advertised refresh + // <= requested, else the highest available at that resolution. The SudoVDA ADD IOCTL advertises + // the client mode, but a very high pixel rate (e.g. 5120x1440@240 = 1.77 Gpix/s) can be clamped + // or absent — falling back to a lower refresh AT THE SAME RESOLUTION keeps the client's + // resolution (what the user sees) instead of collapsing to the 1280x720/1920x1080 OS default. 
+ let mut at_res: Vec = Vec::new(); + let mut res_set: std::collections::BTreeSet<(u32, u32)> = std::collections::BTreeSet::new(); + let mut i = 0u32; + loop { + let mut dm = DEVMODEW { + dmSize: size_of::() as u16, + ..Default::default() + }; + let ok = unsafe { + EnumDisplaySettingsW(PCWSTR(wname.as_ptr()), ENUM_DISPLAY_SETTINGS_MODE(i), &mut dm) + } + .as_bool(); + if !ok { + break; + } + i += 1; + res_set.insert((dm.dmPelsWidth, dm.dmPelsHeight)); + if dm.dmPelsWidth == mode.width && dm.dmPelsHeight == mode.height { + at_res.push(dm.dmDisplayFrequency); + } + } + let chosen_hz = if at_res.contains(&mode.refresh_hz) { + mode.refresh_hz + } else if let Some(hz) = at_res.iter().copied().filter(|&hz| hz <= mode.refresh_hz).max() { + hz + } else if let Some(hz) = at_res.iter().copied().max() { + hz + } else { + mode.refresh_hz // resolution not advertised at all; attempt anyway (likely -> OS default) + }; + if at_res.is_empty() { + tracing::warn!( + "{gdi_name}: driver advertises no {}x{} mode (top advertised: {:?}); attempting @{} anyway", + mode.width, + mode.height, + res_set.iter().rev().take(8).collect::>(), + mode.refresh_hz + ); + } else if chosen_hz != mode.refresh_hz { + tracing::info!( + "{gdi_name}: {}x{}@{} not advertised; using {}x{}@{} (advertised refreshes here: {:?})", + mode.width, + mode.height, + mode.refresh_hz, + mode.width, + mode.height, + chosen_hz, + at_res + ); + } + + let dm = DEVMODEW { + dmSize: size_of::() as u16, + dmFields: DM_PELSWIDTH | DM_PELSHEIGHT | DM_DISPLAYFREQUENCY | DM_BITSPERPEL | DM_POSITION, + dmBitsPerPel: 32, + dmPelsWidth: mode.width, + dmPelsHeight: mode.height, + dmDisplayFrequency: chosen_hz, + ..Default::default() + }; + let test = + unsafe { ChangeDisplaySettingsExW(PCWSTR(wname.as_ptr()), Some(&dm), None, CDS_TEST, None) }; + if test != DISP_CHANGE_SUCCESSFUL { + tracing::warn!( + result = test.0, + "{gdi_name}: driver rejected {}x{}@{} (mode not advertised?) 
— leaving OS default", + mode.width, + mode.height, + chosen_hz + ); + return; + } + let apply = unsafe { + ChangeDisplaySettingsExW( + PCWSTR(wname.as_ptr()), + Some(&dm), + None, + // Make it the PRIMARY display: a blank *extended* IDD output isn't composited by the DWM, + // so it produces no duplication frames. As primary it carries the shell/cursor → frames + // flow (this is what Apollo does). Position is (0,0) via DM_POSITION (zeroed by default). + CDS_UPDATEREGISTRY | CDS_GLOBAL | CDS_SET_PRIMARY, + None, + ) + }; + if apply == DISP_CHANGE_SUCCESSFUL { + tracing::info!( + "{gdi_name}: active mode set to {}x{}@{}", + mode.width, + mode.height, + chosen_hz + ); + } else { + tracing::warn!( + result = apply.0, + "{gdi_name}: failed to apply {}x{}@{}", + mode.width, + mode.height, + chosen_hz + ); + } +} + +/// Detach every display except `keep_gdi_name`, leaving the SudoVDA virtual output as the ONLY +/// display. This is the SudoVDA/Apollo "isolate the virtual display" move and the key to capturing +/// the secure desktop: Windows renders the login / UAC (Winlogon) desktop on the physical/primary +/// display and resets the topology when it switches there — with a physical monitor still attached +/// (e.g. an LG TV), the login lands on it and our virtual output goes perpetually ACCESS_LOST. With +/// the physical detached and the change PERSISTED to the registry, Winlogon reads "only the virtual +/// is attached" and the secure desktop has nowhere to render but the output we capture. +/// +/// Returns the displays we detached plus their saved modes so teardown can restore them. 
+unsafe fn isolate_displays(keep_gdi_name: &str) -> Vec<(String, DEVMODEW)> { + let mut saved = Vec::new(); + let mut idx = 0u32; + loop { + let mut dd = DISPLAY_DEVICEW { + cb: size_of::() as u32, + ..Default::default() + }; + if !EnumDisplayDevicesW(PCWSTR::null(), idx, &mut dd, 0).as_bool() { + break; + } + idx += 1; + if (dd.StateFlags & DISPLAY_DEVICE_ATTACHED_TO_DESKTOP).0 == 0 { + continue; // not part of the desktop — nothing to detach + } + let name = String::from_utf16_lossy(&dd.DeviceName); + let name = name.trim_end_matches('\u{0}').to_string(); + if name == keep_gdi_name { + continue; // the virtual output we want to keep + } + // Save the current mode so the teardown can re-attach this display where it was. + let mut cur = DEVMODEW { + dmSize: size_of::() as u16, + ..Default::default() + }; + let wname: Vec = name.encode_utf16().chain(std::iter::once(0)).collect(); + if EnumDisplaySettingsW(PCWSTR(wname.as_ptr()), ENUM_CURRENT_SETTINGS, &mut cur).as_bool() { + saved.push((name.clone(), cur)); + } + // A 0x0 mode removes the display from the desktop. NORESET batches; we commit once below. + let off = DEVMODEW { + dmSize: size_of::() as u16, + dmFields: DM_POSITION | DM_PELSWIDTH | DM_PELSHEIGHT, + ..Default::default() + }; + let r = ChangeDisplaySettingsExW( + PCWSTR(wname.as_ptr()), + Some(&off), + None, + CDS_UPDATEREGISTRY | CDS_NORESET | CDS_GLOBAL, + None, + ); + tracing::info!("display isolate: detaching {name} (result={})", r.0); + } + if !saved.is_empty() { + // Commit the batched detaches (NULL device + 0 flags applies the pending registry changes). + let _ = ChangeDisplaySettingsExW(PCWSTR::null(), None, None, CDS_TYPE(0), None); + tracing::info!( + "display isolate: {} display(s) detached — only {keep_gdi_name} remains", + saved.len() + ); + } + saved +} + +/// Re-attach the displays [`isolate_displays`] detached, restoring each to its saved mode. 
Called on +/// teardown BEFORE the virtual output is removed, so there is always at least one display. +unsafe fn restore_displays(saved: &[(String, DEVMODEW)]) { + for (name, dm) in saved { + let wname: Vec = name.encode_utf16().chain(std::iter::once(0)).collect(); + let _ = ChangeDisplaySettingsExW( + PCWSTR(wname.as_ptr()), + Some(dm), + None, + CDS_UPDATEREGISTRY | CDS_NORESET | CDS_GLOBAL, + None, + ); + } + if !saved.is_empty() { + let _ = ChangeDisplaySettingsExW(PCWSTR::null(), None, None, CDS_TYPE(0), None); + tracing::info!("display isolate: restored {} display(s)", saved.len()); + } +} + unsafe fn open_device() -> Result { let hdev = SetupDiGetClassDevsW( Some(&SUVDA_INTERFACE), @@ -275,8 +479,16 @@ impl VirtualDisplay for SudoVdaDisplay { break; } } + let mut isolated: Vec<(String, DEVMODEW)> = Vec::new(); match &gdi_name { - Some(n) => tracing::info!("SudoVDA target {} -> {n}", ao.target_id), + Some(n) => { + tracing::info!("SudoVDA target {} -> {n}", ao.target_id); + // ADD only advertises the mode; force it active so DXGI captures the requested size. + set_active_mode(n, mode); + // Detach every other display so the secure desktop (Winlogon/UAC) renders here too. + isolated = unsafe { isolate_displays(n) }; + thread::sleep(Duration::from_millis(1500)); // let the topology settle before capture opens + } None => tracing::warn!( "SudoVDA target {} not yet an active display path (needs a WDDM GPU to activate)", ao.target_id @@ -291,6 +503,9 @@ impl VirtualDisplay for SudoVdaDisplay { .map(|n| crate::capture::dxgi::WinCaptureTarget { adapter_luid: crate::capture::dxgi::pack_luid(ao.luid), gdi_name: n, + // The SudoVDA target id is stable across secure-desktop topology rebuilds; the + // GDI name is NOT, so capture re-resolves the name from this on every recovery. 
+ target_id: ao.target_id, }), keepalive: Box::new(SudoVdaKeepalive { device: device_raw, @@ -298,6 +513,7 @@ impl VirtualDisplay for SudoVdaDisplay { stop, pinger: Some(pinger), gdi_name, + isolated, }), }) } @@ -312,6 +528,8 @@ struct SudoVdaKeepalive { pinger: Option>, #[allow(dead_code)] // consumed by the Windows capture backend (not yet wired) gdi_name: Option, + /// Displays detached by [`isolate_displays`], restored here on teardown. + isolated: Vec<(String, DEVMODEW)>, } impl Drop for SudoVdaKeepalive { @@ -320,6 +538,9 @@ impl Drop for SudoVdaKeepalive { if let Some(j) = self.pinger.take() { let _ = j.join(); } + // Re-attach the physical display(s) we detached BEFORE removing the virtual output, so the + // box is never left with zero displays. + unsafe { restore_displays(&self.isolated) }; let rp = RemoveParams { guid: self.guid }; let rp_bytes = unsafe { std::slice::from_raw_parts(&rp as *const _ as *const u8, size_of::()) diff --git a/docs/windows-host.md b/docs/windows-host.md index af2aa0d..90ffbe5 100644 --- a/docs/windows-host.md +++ b/docs/windows-host.md @@ -47,6 +47,42 @@ coexisting with a running Apollo (two concurrent NVENC sessions). - **Frame pacing on static content** — DXGI duplication is change-driven, so a blank/idle virtual display delivers only ~12 fps (181/177 frames over ~15 s); a rendering app drives the full rate. +### Live UX hardening (2026-06-15, validated Mac ↔ RTX 4090) + +Driven by live testing with the native macOS client at the display's native **5120×1440@240**: + +- **Native resolution, not 1080p.** `sudovda::set_active_mode` enumerates the modes the IDD actually + advertises (`EnumDisplaySettingsW`) and sets the requested **resolution** at the best supported + refresh — keeping 5120×1440@240, never silently collapsing to the 1280×720/1920×1080 OS default + when an exact mode is briefly unavailable. 
+- **Bitrate auto-cap.** NVENC `init_session` probes and steps the average bitrate down (×3/4 to a + floor) when the requested rate exceeds the GPU's codec-level max, so a high client bitrate connects + instead of failing (matches the Linux host; we do NOT split NVENC sessions). +- **Mouse cursor.** DXGI duplication excludes the hardware cursor; we read the pointer + position from the frame info and its shape via `GetFramePointerShape`, then GPU-composite it onto the captured + texture before NVENC (a CPU read-back would stall the pipeline). Color cursors alpha-blend; + **masked-color** cursors (the text I-beam) use an `INV_DEST_COLOR` blend for true screen inversion, + so the caret is visible on any background (no black box). Monochrome cursors are handled too. +- **Secure desktop (lock / login / UAC).** The host runs as **SYSTEM in the interactive session**; + the capturer `SetThreadDesktop`s onto the current input desktop and, on a desktop switch, + **recreates the D3D11 device** and **re-resolves the virtual output's GDI name from the stable + SudoVDA target id** (the name changes across the topology rebuild — the old failure mode was to keep looking for the + stale `\\.\DISPLAYn` and drop the stream). `ACCESS_LOST` / `INVALID_CALL` / device-removed are all treated + as recoverable, and a mid-stream resolution change is followed (capturer and NVENC re-init at the new + size). Validated: logging in / locking through the stream stays connected (one real session + recovered 1012 desktop switches and completed cleanly). *Display isolation* (`isolate_displays` + detaches other monitors so Winlogon renders to the virtual output) covers the case where a physical + monitor is also attached. + +### Running as SYSTEM, windowless (deployment) + +To capture the secure desktop the host must run as **SYSTEM in the interactive Session 1** (a Session +0 service can't duplicate Session 1).
Launch chain: a scheduled task (Interactive, Highest) → +`PsExec64 -s -i 1 -d wscript.exe launch.vbs` → `launch.vbs` runs `host-run.cmd` with a **hidden +window** (`WScript.Shell.Run …, 0`). This keeps the host off the captured desktop — no `cmd` windows +the user can see or accidentally close (which would kill the stream). `host-run.cmd` sets +`APPDATA=C:\Users\Public` (shared identity/pairing) + `PUNKTFUNK_ENCODER=nvenc` and runs `m3-host`. + ### Real-GPU test box (RTX 4090, `ssh "Enrico Bühler"@192.168.1.174`) Windows 11, RTX 4090 (driver 596.36) + AMD iGPU, SudoVDA + Apollo (sunshine) installed. SSH lands in