Files
punktfunk/clients/windows/src/video.rs
T
enricobuehler a58b6b8e76
apple / swift (push) Successful in 53s
windows-msix / package (push) Successful in 1m4s
windows / build (push) Successful in 57s
ci / bench (push) Failing after 2s
android / android (push) Failing after 2m46s
ci / web (push) Successful in 32s
ci / docs-site (push) Failing after 16s
deb / build-publish (push) Failing after 1s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Failing after 0s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Failing after 0s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Failing after 1s
docker / deploy-docs (push) Has been skipped
decky / build-publish (push) Failing after 0s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Failing after 0s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Failing after 1s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 0s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 0s
ci / rust (push) Failing after 2m16s
fix(windows-client): clear clippy -D warnings on MSVC
The cfg(windows) code can't be lint-checked on the Linux dev box, so three
-D warnings slipped through (caught by windows.yml; the FFI + shaders compiled
fine):
- gpu.rs: SetMultithreadProtected returns a must-use BOOL -> `let _ =`.
- video.rs: drop the unused GpuFrame::ten_bit field (present keys off `hdr`;
  the value is still computed locally for the first-frame log).
- present.rs: GpuView::frame is an RAII keep-alive (its Drop returns the decoder
  surface to the pool), never read -> #[allow(dead_code)].

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-18 23:21:18 +00:00

509 lines
21 KiB
Rust

//! Video decode: reassembled HEVC access units → frames for the D3D11 presenter.
//!
//! Two backends, picked at session start (override via [`DecoderPref`] / the Settings UI):
//!
//! * **D3D11VA** (any GPU): libavcodec decodes on the GPU straight into `ID3D11Texture2D`s that
//! carry `D3D11_BIND_SHADER_RESOURCE`, so the presenter samples the decoded NV12/P010 surface
//! directly — **zero copy** (no swscale, no CPU readback, no per-frame upload). The textures are
//! created by the process-wide shared device ([`crate::gpu`]) the presenter also draws with, which
//! is what makes them bindable there. This is the big latency/throughput win over software decode.
//! * **Software**: libavcodec on the CPU + swscale to a packed 4-byte format the presenter uploads
//! (`RGBA` for SDR, `X2BGR10` for HDR). The fallback on a GPU-less box (WARP), when D3D11VA init
//! fails, or when a mid-session hardware error demotes us — the host's IDR/RFI recovery
//! resynchronizes on the next keyframe either way.
//!
//! Both run `AV_CODEC_FLAG_LOW_DELAY`; the host encodes zero-reorder streams (no B-frames, in-band
//! parameter sets on every IDR), so decode is strictly one-in/one-out.
//!
//! HDR is detected in-band from the decoded frame's transfer characteristic (`SMPTE2084` / PQ in the
//! HEVC VUI) — the same signal every other punktfunk client keys off — not from a protocol field.
use anyhow::{anyhow, bail, Context as _, Result};
use ffmpeg::format::Pixel;
use ffmpeg::software::scaling;
use ffmpeg::util::frame::Video as AvFrame;
use ffmpeg_next as ffmpeg;
use std::ffi::c_void;
use std::ptr;
use windows::core::Interface; // ID3D11Device::clone().into_raw() for the FFmpeg hwdevice ctx
/// Decode backend selection. The Settings UI stores the choice as a string, which
/// [`DecoderPref::from_name`] maps back to a variant.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)]
pub enum DecoderPref {
    /// Prefer D3D11VA and drop to software decode if it is unavailable.
    #[default]
    Auto,
    /// D3D11VA only — unavailability is an error (a debugging aid).
    Hardware,
    /// Software decode only.
    Software,
}

impl DecoderPref {
    /// Map a persisted setting string to a preference.
    ///
    /// Exactly `"hardware"` and `"software"` select the forced modes; every other string
    /// (including the empty string and differently-cased spellings) yields [`DecoderPref::Auto`].
    pub fn from_name(s: &str) -> DecoderPref {
        if s == "hardware" {
            Self::Hardware
        } else if s == "software" {
            Self::Software
        } else {
            Self::Auto
        }
    }
}
/// One decoded video frame, tagged by the backend that produced it.
pub enum DecodedFrame {
/// Produced by the software backend: packed pixels in system memory (see [`CpuFrame`]).
Cpu(CpuFrame),
/// Produced by the D3D11VA backend: a surface that is still on the GPU (see [`GpuFrame`]).
Gpu(GpuFrame),
}
impl DecodedFrame {
pub fn dims(&self) -> (u32, u32) {
match self {
DecodedFrame::Cpu(c) => (c.width, c.height),
DecodedFrame::Gpu(g) => (g.width, g.height),
}
}
pub fn hdr(&self) -> bool {
match self {
DecodedFrame::Cpu(c) => c.hdr,
DecodedFrame::Gpu(g) => g.hdr,
}
}
}
/// Packed 4-byte-per-pixel frame for a D3D11 dynamic-texture upload (which takes a row pitch). The
/// bytes are `R8G8B8A8` for SDR and `X2BGR10` (== DXGI `R10G10B10A2`, R in the low 10 bits) for HDR.
pub struct CpuFrame {
/// Frame width as reported by the decoded frame.
pub width: u32,
/// Frame height as reported by the decoded frame.
pub height: u32,
/// Row stride in bytes (≥ width*4 — swscale pads rows for SIMD).
pub stride: usize,
/// Copy of plane 0 of the swscale output; rows are `stride` bytes apart.
// NOTE(review): presumably `stride * height` bytes long — confirm against ffmpeg-next's `data(0)`.
pub pixels: Vec<u8>,
/// BT.2020 PQ HDR10 frame: `pixels` is `X2BGR10` and the presenter switches to a 10-bit
/// R10G10B10A2 + ST.2084 swapchain. `false` = ordinary 8-bit BT.709 SDR.
pub hdr: bool,
}
/// A decoded frame still on the GPU: a D3D11 texture **array** plus the slice index the decoder
/// wrote this frame into. The presenter creates per-plane shader-resource views over the slice and
/// converts YUV→RGB in a pixel shader. The underlying surface stays alive — and out of the decoder's
/// reuse pool — for exactly as long as `guard` (an `av_frame_clone` of the decoded frame) lives.
pub struct GpuFrame {
/// Frame width copied from the decoded `AVFrame`.
pub width: u32,
/// Frame height copied from the decoded `AVFrame`.
pub height: u32,
/// Texture-array slice this frame occupies (`AVFrame::data[1]`).
pub index: u32,
/// BT.2020 PQ HDR10 (P010, ST.2084) vs ordinary 8-bit BT.709 SDR (NV12). The present path keys
/// SRV format + shader off this (the host couples 10-bit ⟺ HDR).
pub hdr: bool,
/// Keep-alive for the surface: the cloned `AVFrame`, freed when this frame is dropped. Also the
/// source of the texture pointer handed out by `texture_ptr`.
guard: D3d11FrameGuard,
}
impl GpuFrame {
    /// Raw pointer to the decoder's D3D11 texture array that contains this frame's slice, read
    /// from `data[0]` of the cloned `AVFrame` held by the guard.
    ///
    /// COM interfaces are `!Send`, so wrap the pointer in a windows-rs interface on the thread
    /// that will use it (the presenter / UI thread); carrying the raw pointer between threads is
    /// fine.
    pub fn texture_ptr(&self) -> *mut c_void {
        let cloned = self.guard.0;
        // SAFETY: the guard owns `cloned` and frees it only in its own `Drop`, so the frame is
        // live for as long as `self` is borrowed.
        let texture = unsafe { (*cloned).data[0] };
        texture as *mut c_void
    }
}
/// Owns a cloned decoded `AVFrame` (which refs the D3D11 surface in the decoder pool). Dropping it
/// releases the surface back for reuse. The clone is plain refcounted data; freeing it from the
/// presenter thread is fine.
pub struct D3d11FrameGuard(*mut ffmpeg::ffi::AVFrame);
// SAFETY: relies on the type-level note above — the guard is the sole owner of the cloned frame,
// and freeing that clone from another thread is stated to be fine.
unsafe impl Send for D3d11FrameGuard {}
impl Drop for D3d11FrameGuard {
/// Frees the cloned frame, which drops its reference on the decoder surface.
fn drop(&mut self) {
// `av_frame_free` takes the pointer by address.
// NOTE(review): upstream FFmpeg also nulls the pointer afterwards — confirm.
unsafe { ffmpeg::ffi::av_frame_free(&mut self.0) };
}
}
/// The decode backend a [`Decoder`] is currently running.
enum Backend {
/// Hardware decode through D3D11VA; yields [`GpuFrame`]s.
D3d11va(D3d11vaDecoder),
/// CPU decode plus swscale conversion; yields [`CpuFrame`]s.
Software(SoftwareDecoder),
}
/// HEVC decoder front-end: owns one backend and forwards access units to it.
pub struct Decoder {
/// Active backend. `Decoder::decode` replaces a failed D3D11VA backend with a software one.
backend: Backend,
}
impl Decoder {
pub fn new(pref: DecoderPref) -> Result<Decoder> {
ffmpeg::init().context("ffmpeg init")?;
if pref != DecoderPref::Software {
match D3d11vaDecoder::new() {
Ok(d) => {
tracing::info!("D3D11VA hardware decode active (zero-copy)");
return Ok(Decoder {
backend: Backend::D3d11va(d),
});
}
Err(e) => {
if pref == DecoderPref::Hardware {
return Err(e.context("decoder=hardware but D3D11VA failed"));
}
tracing::info!(reason = %e, "D3D11VA unavailable — software decode");
}
}
}
Ok(Decoder {
backend: Backend::Software(SoftwareDecoder::new()?),
})
}
/// True for the zero-copy hardware backend (shown in the stream HUD).
pub fn is_hardware(&self) -> bool {
matches!(self.backend, Backend::D3d11va(_))
}
/// Feed one access unit; returns the decoded frame (the host's streams are one-in/one-out). A
/// software decode error after packet loss is survivable — keep feeding. A D3D11VA error demotes
/// to software for the rest of the session (the next IDR resynchronizes).
pub fn decode(&mut self, au: &[u8]) -> Result<Option<DecodedFrame>> {
match &mut self.backend {
Backend::D3d11va(d) => match d.decode(au) {
Ok(f) => Ok(f.map(DecodedFrame::Gpu)),
Err(e) => {
tracing::warn!(error = %e, "D3D11VA decode failed — falling back to software");
self.backend = Backend::Software(SoftwareDecoder::new()?);
Ok(None)
}
},
Backend::Software(s) => Ok(s.decode(au)?.map(DecodedFrame::Cpu)),
}
}
}
// --- software backend ---------------------------------------------------------------
/// CPU decode backend: a libavcodec HEVC decoder plus a cached swscale context that converts each
/// decoded frame to the packed format the presenter uploads.
struct SoftwareDecoder {
/// The opened HEVC video decoder.
decoder: ffmpeg::decoder::Video,
/// Rebuilt whenever the decoded format/size **or output format** changes (mid-stream
/// `Reconfigure`, or an SDR↔HDR flip): `(ctx, src_fmt, w, h, dst_fmt)`.
sws: Option<(scaling::Context, Pixel, u32, u32, Pixel)>,
}
impl SoftwareDecoder {
fn new() -> Result<SoftwareDecoder> {
let codec =
ffmpeg::decoder::find(ffmpeg::codec::Id::HEVC).ok_or(anyhow!("no HEVC decoder"))?;
let mut ctx = ffmpeg::codec::Context::new_with_codec(codec);
unsafe {
let raw = ctx.as_mut_ptr();
(*raw).flags |= ffmpeg::ffi::AV_CODEC_FLAG_LOW_DELAY as i32;
// Slice threading adds no frame delay (frame threading adds thread_count-1).
(*raw).thread_type = ffmpeg::ffi::FF_THREAD_SLICE;
(*raw).thread_count = 0; // auto
}
let decoder = ctx.decoder().video().context("open HEVC decoder")?;
Ok(SoftwareDecoder { decoder, sws: None })
}
fn decode(&mut self, au: &[u8]) -> Result<Option<CpuFrame>> {
let packet = ffmpeg::Packet::copy(au);
self.decoder
.send_packet(&packet)
.map_err(|e| anyhow!("send_packet: {e}"))?;
let mut frame = AvFrame::empty();
let mut out = None;
while self.decoder.receive_frame(&mut frame).is_ok() {
out = Some(self.convert(&frame)?);
}
Ok(out)
}
/// Convert the decoded YUV frame to a packed 4-byte format the presenter uploads directly:
/// SDR → `RGBA` (BT.709), HDR (SMPTE ST.2084 / PQ transfer) → `X2BGR10` (== DXGI R10G10B10A2)
/// using the BT.2020 matrix. For HDR the PQ-encoded values pass through unchanged (swscale only
/// applies the YUV→RGB matrix + range, never the transfer) — exactly what an HDR10 swapchain wants.
fn convert(&mut self, frame: &AvFrame) -> Result<CpuFrame> {
use ffmpeg::color::TransferCharacteristic;
let (fmt, w, h) = (frame.format(), frame.width(), frame.height());
let hdr = frame.color_transfer_characteristic() == TransferCharacteristic::SMPTE2084;
let dst = if hdr { Pixel::X2BGR10LE } else { Pixel::RGBA };
let rebuild = !matches!(&self.sws, Some((_, f, sw, sh, d)) if *f == fmt && *sw == w && *sh == h && *d == dst);
if rebuild {
let mut ctx = scaling::Context::get(fmt, w, h, dst, w, h, scaling::Flags::POINT)
.context("swscale context")?;
if hdr {
// BT.2020 non-constant-luminance YUV (limited range) → full-range RGB. swscale
// applies only the matrix + range here, so the samples stay PQ-encoded.
unsafe {
let coef = ffmpeg::ffi::sws_getCoefficients(ffmpeg::ffi::SWS_CS_BT2020);
ffmpeg::ffi::sws_setColorspaceDetails(
ctx.as_mut_ptr(),
coef,
0, // src range: limited (video)
coef,
1, // dst range: full
0,
1 << 16,
1 << 16, // brightness / contrast / saturation defaults (16.16)
);
}
}
self.sws = Some((ctx, fmt, w, h, dst));
}
let (sws, ..) = self.sws.as_mut().unwrap();
let mut conv = AvFrame::empty();
sws.run(frame, &mut conv).map_err(|e| anyhow!("sws: {e}"))?;
Ok(CpuFrame {
width: w,
height: h,
stride: conv.stride(0),
pixels: conv.data(0).to_vec(),
hdr,
})
}
}
// --- D3D11VA backend ------------------------------------------------------------------
//
// Raw FFI: ffmpeg-next has no hwaccel wrappers. The COM-typed hwcontext structs are declared here
// (stable FFmpeg public ABI) rather than relied on from ffmpeg-sys bindgen — the generic
// AVHWDeviceContext / AVHWFramesContext (whose payload is an opaque `void *hwctx`) come from
// ffmpeg-sys, and we cast `hwctx` to the structs below. All owned pointers are freed in Drop;
// decoded surfaces transfer out through D3d11FrameGuard.
/// Hard-coded value of FFmpeg's `AVERROR(EAGAIN)`. `D3d11vaDecoder::decode` treats this return
/// from `avcodec_receive_frame` as "no further frame available for now" and stops draining.
// NOTE(review): assumes `EAGAIN == 11` in the C runtime FFmpeg was built against — TODO confirm.
const AVERROR_EAGAIN: i32 = -11; // -EAGAIN
/// Bind flag requested for the decoder's texture pool in `get_format_d3d11` so the presenter can
/// sample decoded surfaces.
const D3D11_BIND_SHADER_RESOURCE: u32 = 0x8; // <d3d11.h>; FFmpeg ORs D3D11_BIND_DECODER itself
/// `hwcontext_d3d11va.h` — `AVHWDeviceContext::hwctx`. Leaving `lock` null makes FFmpeg install an
/// `ID3D11Multithread` default lock + set multithread protection on `device_context` during init,
/// which is what lets the presenter share this device's immediate context from the UI thread.
///
/// Only `device` and `device_context` are written by this file (in `D3d11vaDecoder::new`); the
/// remaining fields are left as FFmpeg allocated them. The struct is only ever accessed through a
/// pointer into FFmpeg-allocated memory, never constructed on the Rust side.
// NOTE(review): the statement about what FFmpeg does with a null `lock` is not verifiable from this
// file and may differ between FFmpeg versions — confirm against the hwcontext_d3d11va.c in use.
#[repr(C)]
struct AVD3D11VADeviceContext {
device: *mut c_void, // ID3D11Device*
device_context: *mut c_void, // ID3D11DeviceContext*
video_device: *mut c_void, // ID3D11VideoDevice*
video_context: *mut c_void, // ID3D11VideoContext*
lock: *mut c_void, // void (*)(void*)
unlock: *mut c_void, // void (*)(void*)
lock_ctx: *mut c_void,
}
/// `hwcontext_d3d11va.h` — `AVHWFramesContext::hwctx`. `BindFlags` lets us add
/// `D3D11_BIND_SHADER_RESOURCE` so the decoded array texture is sampleable (zero copy).
///
/// Only `bind_flags` is written by this file (in `get_format_d3d11`), through a pointer into the
/// frames context FFmpeg allocated.
// NOTE(review): presumably the upstream struct has further trailing fields that are not mirrored
// here; that is harmless only while this type is never allocated or copied by value — confirm.
#[repr(C)]
struct AVD3D11VAFramesContext {
texture: *mut c_void, // ID3D11Texture2D* (null → FFmpeg allocates the pool)
bind_flags: u32, // UINT BindFlags
misc_flags: u32, // UINT MiscFlags
}
fn averr(what: &str, code: i32) -> anyhow::Error {
anyhow!("{what}: {}", ffmpeg::Error::from(code))
}
/// libavcodec's `get_format` callback: accept the D3D11 hw surface, building a frames context whose
/// textures carry `BIND_SHADER_RESOURCE` (so the presenter can sample them). Returning anything but
/// `AV_PIX_FMT_D3D11` aborts hardware decode → the session demotes to software.
///
/// The pool's `sw_format` is chosen as NV12 (8-bit) or P010 (10-bit). `avctx.sw_pix_fmt` cannot be
/// used verbatim: libavcodec sets it to the decoder's nominal *planar* software format
/// (`yuv420p` / `yuv420p10le`), which the D3D11VA frames context does not accept, so
/// `av_hwframe_ctx_init` would fail and hardware decode would never start.
///
/// # Safety
/// Must only be invoked by libavcodec as a `get_format` callback: `avctx` must point to a live
/// codec context and `list` to an array terminated by `AV_PIX_FMT_NONE`.
unsafe extern "C" fn get_format_d3d11(
    avctx: *mut ffmpeg::ffi::AVCodecContext,
    mut list: *const ffmpeg::ffi::AVPixelFormat,
) -> ffmpeg::ffi::AVPixelFormat {
    use ffmpeg::ffi::*;
    // SAFETY: see the function-level contract; every pointer dereferenced below is either one of
    // the callback arguments or was just returned non-null by FFmpeg.
    unsafe {
        // Only proceed if the decoder actually offers the D3D11 surface format.
        let mut found = false;
        while *list != AVPixelFormat::AV_PIX_FMT_NONE {
            if *list == AVPixelFormat::AV_PIX_FMT_D3D11 {
                found = true;
                break;
            }
            list = list.add(1);
        }
        if !found {
            return AVPixelFormat::AV_PIX_FMT_NONE;
        }
        let device_ref = (*avctx).hw_device_ctx;
        if device_ref.is_null() {
            return AVPixelFormat::AV_PIX_FMT_NONE;
        }
        let frames_ref = av_hwframe_ctx_alloc(device_ref);
        if frames_ref.is_null() {
            return AVPixelFormat::AV_PIX_FMT_NONE;
        }
        let frames = (*frames_ref).data as *mut AVHWFramesContext;
        (*frames).format = AVPixelFormat::AV_PIX_FMT_D3D11;
        // Map the nominal software format to the matching D3D11 surface layout: 10-bit 4:2:0
        // → P010, everything else → NV12 (also the fallback when `sw_pix_fmt` is unset).
        let nominal = (*avctx).sw_pix_fmt;
        let ten_bit = nominal == AVPixelFormat::AV_PIX_FMT_YUV420P10LE
            || nominal == AVPixelFormat::AV_PIX_FMT_P010LE;
        (*frames).sw_format = if ten_bit {
            AVPixelFormat::AV_PIX_FMT_P010LE
        } else {
            AVPixelFormat::AV_PIX_FMT_NV12
        };
        (*frames).width = (*avctx).coded_width;
        (*frames).height = (*avctx).coded_height;
        // DPB + a few in-flight (decoded channel + the presenter's held frame); the host's
        // zero-reorder stream needs only a small DPB, so 20 is comfortable headroom.
        (*frames).initial_pool_size = 20;
        let fhw = (*frames).hwctx as *mut AVD3D11VAFramesContext;
        (*fhw).bind_flags = D3D11_BIND_SHADER_RESOURCE;
        let r = av_hwframe_ctx_init(frames_ref);
        if r < 0 {
            // Init failed: release our only reference before bailing out of hardware decode.
            let mut fr = frames_ref;
            av_buffer_unref(&mut fr);
            return AVPixelFormat::AV_PIX_FMT_NONE;
        }
        (*avctx).hw_frames_ctx = frames_ref; // decoder takes ownership
        AVPixelFormat::AV_PIX_FMT_D3D11
    }
}
/// D3D11VA decode backend: raw FFmpeg handles, all released in this type's `Drop`.
struct D3d11vaDecoder {
/// The opened HEVC codec context (holds its own reference to `hw_device`).
ctx: *mut ffmpeg::ffi::AVCodecContext,
/// This decoder's reference to the D3D11VA hardware device context.
hw_device: *mut ffmpeg::ffi::AVBufferRef,
/// Scratch packet, refilled for every access unit.
packet: *mut ffmpeg::ffi::AVPacket,
/// Scratch frame that `avcodec_receive_frame` decodes into.
frame: *mut ffmpeg::ffi::AVFrame,
}
// Single-owner pointers, only touched from the session pump thread.
unsafe impl Send for D3d11vaDecoder {}
impl D3d11vaDecoder {
    /// Open an HEVC decoder that decodes through D3D11VA on the process-wide shared device.
    ///
    /// # Errors
    /// No shared device, the shared device is WARP, or any FFmpeg allocation / init / open step
    /// fails. Everything allocated up to the failure point is released.
    fn new() -> Result<D3d11vaDecoder> {
        use ffmpeg::ffi;
        let shared = crate::gpu::shared().ok_or_else(|| anyhow!("no shared D3D11 device"))?;
        if !shared.hardware {
            bail!("shared device is WARP (no hardware video decode)");
        }
        // SAFETY: every pointer dereferenced below was just returned by FFmpeg and checked for
        // null first; ownership of each is held by `this`, whose `Drop` frees them.
        unsafe {
            // Build a D3D11VA hwdevice context around the *shared* device, so decoded textures live
            // on the same device the presenter samples + draws with.
            let hw_device =
                ffi::av_hwdevice_ctx_alloc(ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_D3D11VA);
            if hw_device.is_null() {
                bail!("av_hwdevice_ctx_alloc(D3D11VA) failed");
            }
            // From here on `this` owns every handle. Any early return drops it, and `Drop` frees
            // whatever has been filled in so far (the FFmpeg free functions accept null).
            let mut this = D3d11vaDecoder {
                ctx: ptr::null_mut(),
                hw_device,
                packet: ptr::null_mut(),
                frame: ptr::null_mut(),
            };
            let devctx = (*this.hw_device).data as *mut ffi::AVHWDeviceContext;
            let d3dctx = (*devctx).hwctx as *mut AVD3D11VADeviceContext;
            // Hand FFmpeg an owned ref to the device + immediate context (it Releases them when the
            // hwdevice ctx is freed). `into_raw()` transfers a +1 ref without releasing.
            (*d3dctx).device = shared.device.clone().into_raw();
            (*d3dctx).device_context = shared.context.clone().into_raw();
            // lock left null → FFmpeg installs its default lock in init.
            let r = ffi::av_hwdevice_ctx_init(this.hw_device);
            if r < 0 {
                bail!("av_hwdevice_ctx_init: {}", ffmpeg::Error::from(r));
            }
            let codec = ffi::avcodec_find_decoder(ffi::AVCodecID::AV_CODEC_ID_HEVC);
            if codec.is_null() {
                bail!("no HEVC decoder");
            }
            this.ctx = ffi::avcodec_alloc_context3(codec);
            if this.ctx.is_null() {
                bail!("avcodec_alloc_context3 failed");
            }
            (*this.ctx).hw_device_ctx = ffi::av_buffer_ref(this.hw_device);
            if (*this.ctx).hw_device_ctx.is_null() {
                bail!("av_buffer_ref(hw_device) failed");
            }
            (*this.ctx).get_format = Some(get_format_d3d11);
            (*this.ctx).flags |= ffi::AV_CODEC_FLAG_LOW_DELAY as i32;
            (*this.ctx).thread_count = 1; // hwaccel: threads only add latency
            let r = ffi::avcodec_open2(this.ctx, codec, ptr::null_mut());
            if r < 0 {
                bail!("avcodec_open2 (D3D11VA): {}", ffmpeg::Error::from(r));
            }
            this.packet = ffi::av_packet_alloc();
            this.frame = ffi::av_frame_alloc();
            if this.packet.is_null() || this.frame.is_null() {
                bail!("av_packet_alloc / av_frame_alloc failed");
            }
            Ok(this)
        }
    }

    /// Decode one access unit and return the newest GPU frame it produced, if any.
    ///
    /// An empty access unit is a no-op (`Ok(None)`): libavcodec rejects zero-size packets, and an
    /// error from this backend makes the caller abandon hardware decode for the whole session.
    ///
    /// # Errors
    /// The access unit does not fit a packet, or `av_new_packet` / `avcodec_send_packet` /
    /// `avcodec_receive_frame` / surface lifting fails.
    fn decode(&mut self, au: &[u8]) -> Result<Option<GpuFrame>> {
        use ffmpeg::ffi;
        if au.is_empty() {
            return Ok(None);
        }
        // Checked conversion: a wrapping `as i32` could allocate a packet smaller than `au` and
        // make the copy below write out of bounds.
        let size = i32::try_from(au.len())
            .map_err(|_| anyhow!("access unit too large for a packet: {} bytes", au.len()))?;
        // SAFETY: `self.packet`, `self.frame` and `self.ctx` are the live, non-null handles
        // created in `new`; the packet buffer is `size == au.len()` bytes once `av_new_packet`
        // succeeds.
        unsafe {
            let r = ffi::av_new_packet(self.packet, size);
            if r < 0 {
                return Err(averr("av_new_packet", r));
            }
            ptr::copy_nonoverlapping(au.as_ptr(), (*self.packet).data, au.len());
            let r = ffi::avcodec_send_packet(self.ctx, self.packet);
            // The decoder keeps its own reference to the data; release ours either way.
            ffi::av_packet_unref(self.packet);
            if r < 0 {
                return Err(averr("send_packet", r));
            }
            let mut out = None;
            loop {
                let r = ffi::avcodec_receive_frame(self.ctx, self.frame);
                if r == AVERROR_EAGAIN {
                    break;
                }
                if r < 0 {
                    return Err(averr("receive_frame", r));
                }
                out = Some(self.lift()?); // newest wins; older guards drop here
                ffi::av_frame_unref(self.frame);
            }
            Ok(out)
        }
    }

    /// Lift the decoded D3D11 surface into a `GpuFrame`. `data[0]` is the texture array, `data[1]`
    /// the slice index. We `av_frame_clone` so the surface stays referenced (kept out of the reuse
    /// pool) until the presenter drops the guard.
    ///
    /// # Safety
    /// `self.frame` must hold a frame just returned by a successful `avcodec_receive_frame`.
    ///
    /// # Errors
    /// The frame is not a D3D11 surface, or cloning it fails.
    unsafe fn lift(&mut self) -> Result<GpuFrame> {
        use ffmpeg::ffi;
        // SAFETY: guaranteed by this function's contract — `self.frame` is a live decoded frame.
        unsafe {
            if (*self.frame).format != ffi::AVPixelFormat::AV_PIX_FMT_D3D11 as i32 {
                bail!("decoder returned a software frame (no D3D11 surface)");
            }
            let hdr =
                (*self.frame).color_trc == ffi::AVColorTransferCharacteristic::AVCOL_TRC_SMPTE2084;
            // Bit depth is only used for the first-frame log; presentation keys off `hdr`.
            let ten_bit = {
                let hwfc = (*self.frame).hw_frames_ctx;
                !hwfc.is_null()
                    && (*((*hwfc).data as *const ffi::AVHWFramesContext)).sw_format
                        == ffi::AVPixelFormat::AV_PIX_FMT_P010LE
            };
            let cloned = ffi::av_frame_clone(self.frame);
            if cloned.is_null() {
                bail!("av_frame_clone failed");
            }
            let frame = GpuFrame {
                width: (*self.frame).width as u32,
                height: (*self.frame).height as u32,
                // `data[1]` carries the slice index encoded in a pointer-sized value.
                index: (*self.frame).data[1] as usize as u32,
                hdr,
                guard: D3d11FrameGuard(cloned),
            };
            log_layout_once(frame.width, frame.height, frame.index, hdr, ten_bit);
            Ok(frame)
        }
    }
}
/// Releases every FFmpeg handle the decoder owns.
impl Drop for D3d11vaDecoder {
fn drop(&mut self) {
use ffmpeg::ffi;
// The scratch packet and frame go first, then the codec context, and finally this decoder's
// own reference to the hardware device context. Surfaces already handed out as `GpuFrame`s
// are owned by their `D3d11FrameGuard`s and are not touched here.
unsafe {
ffi::av_packet_free(&mut self.packet);
ffi::av_frame_free(&mut self.frame);
ffi::avcodec_free_context(&mut self.ctx);
ffi::av_buffer_unref(&mut self.hw_device);
}
}
}
/// Log the layout of the first decoded surface, once per process — so the real format a new
/// GPU/driver combination produces (slice index, HDR flag, bit depth) shows up in the logs without
/// a debugger. Every call after the first is a no-op.
fn log_layout_once(width: u32, height: u32, index: u32, hdr: bool, ten_bit: bool) {
    use std::sync::atomic::{AtomicBool, Ordering};
    static ALREADY_LOGGED: AtomicBool = AtomicBool::new(false);
    // `swap` returns the previous value: only the very first caller sees `false`.
    if ALREADY_LOGGED.swap(true, Ordering::Relaxed) {
        return;
    }
    tracing::info!(
        width,
        height,
        slice = index,
        hdr,
        ten_bit,
        "D3D11VA first frame (zero-copy)"
    );
}