feat(host/windows): NVENC D3D11 hardware encoder (--features nvenc)
android / android (push) Failing after 36s
ci / rust (push) Failing after 45s
apple / swift (push) Successful in 55s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 29s
ci / bench (push) Successful in 1m35s
decky / build-publish (push) Successful in 12s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 4s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 4s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 3s
flatpak / build-publish (push) Failing after 2s
deb / build-publish (push) Successful in 3m13s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 1m17s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 1m32s
docker / deploy-docs (push) Successful in 17s

Zero-copy capture->encode on the GPU via the raw NVENC API (nvidia_video_codec_sdk sys + ENCODE_API; the safe wrapper is CUDA-only). Opens an NV_ENC_DEVICE_TYPE_DIRECTX session on the SAME ID3D11Device as the DXGI capturer (carried on the new FramePayload::D3d11), registers a pool of BGRA textures once, then per frame CopyResources the captured texture into a pooled one and calls encode_picture on it. The encoder is configured for CBR/ULL, infinite GOP, P-only, with forced-IDR for RFI. The DXGI capturer gains a D3D11 zero-copy output (selected, like the encoder, by PUNKTFUNK_ENCODER=nvenc) so capture+encode share textures.

OFF by default (the nvenc feature pulls the NVENC SDK + cudarc): the default Windows host links without it (openh264 path). cudarc builds toolkit-less via the SDK ci-check feature (dynamic-loading). At link time --features nvenc needs nvencodeapi.lib (NVENC SDK, or an import lib generated from the driver's nvEncodeAPI64.dll) on PUNKTFUNK_NVENC_LIB_DIR. Both default and --features nvenc builds validated to compile+link GPU-less on the VM (import lib generated from the driver DLL). Runtime needs a real NVIDIA GPU.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-15 01:39:46 +00:00
parent 04b76ebfc7
commit 69ba6ec45d
6 changed files with 498 additions and 9 deletions
Generated
+20
View File
@@ -806,6 +806,15 @@ dependencies = [
"cipher", "cipher",
] ]
[[package]]
name = "cudarc"
version = "0.16.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17200eb07e7d85a243aa1bf4569a7aa998385ba98d14833973a817a63cc86e92"
dependencies = [
"libloading",
]
[[package]] [[package]]
name = "curve25519-dalek" name = "curve25519-dalek"
version = "4.1.3" version = "4.1.3"
@@ -2222,6 +2231,16 @@ dependencies = [
"syn", "syn",
] ]
[[package]]
name = "nvidia-video-codec-sdk"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b150dfc88653e761947906dfc0ea14af16ae366cfc55122caab94381761605a"
dependencies = [
"cudarc",
"lazy_static",
]
[[package]] [[package]]
name = "oid-registry" name = "oid-registry"
version = "0.7.1" version = "0.7.1"
@@ -2615,6 +2634,7 @@ dependencies = [
"khronos-egl", "khronos-egl",
"libc", "libc",
"mdns-sd", "mdns-sd",
"nvidia-video-codec-sdk",
"openh264", "openh264",
"opus", "opus",
"pipewire", "pipewire",
+13
View File
@@ -127,3 +127,16 @@ windows = { version = "0.62", features = [
openh264 = "0.9" openh264 = "0.9"
# WASAPI loopback audio capture (default render endpoint -> 48 kHz stereo f32 for the Opus path). # WASAPI loopback audio capture (default render endpoint -> 48 kHz stereo f32 for the Opus path).
wasapi = "0.23" wasapi = "0.23"
# NVENC hardware encoder (NVENC SDK, D3D11 input). The SDK pins `cudarc` with
# `cuda-version-from-build-system` (a build-time CUDA-toolkit probe); its `ci-check` feature switches
# cudarc to `dynamic-loading` (loads nvcuda.dll at runtime — nothing needed at build), which is how
# the crate builds on docs.rs/CI. We enable it so the GPU-less VM/CI compiles; the DirectX NVENC path
# never calls CUDA at runtime, so the pinned CUDA bindings version is irrelevant.
nvidia-video-codec-sdk = { version = "0.4", features = ["ci-check"], optional = true }
[features]
# NVENC hardware encode (Windows). OFF by default: it pulls the NVENC SDK, and the host then needs
# the NVENC entry points (NvEncodeAPICreateInstance / NvEncodeAPIGetMaxSupportedVersion) at link
# time — i.e. `nvencodeapi.lib` from the NVIDIA Video Codec SDK (or an import lib generated from
# nvEncodeAPI64.dll) on the linker path. Build the GPU host with `--features nvenc`.
nvenc = ["dep:nvidia-video-codec-sdk"]
+16
View File
@@ -0,0 +1,16 @@
//! Build script. The only thing it does: with the `nvenc` feature (Windows GPU host), tell the
//! linker to pull the NVENC import library. The NVENC entry points
//! (`NvEncodeAPICreateInstance` / `NvEncodeAPIGetMaxSupportedVersion`) live in `nvEncodeAPI64.dll`
//! (shipped with the NVIDIA driver), so the host links against `nvencodeapi.lib`. Point
//! `PUNKTFUNK_NVENC_LIB_DIR` at a directory containing `nvencodeapi.lib` — from the NVIDIA Video
//! Codec SDK, or an import lib generated from the driver's `nvEncodeAPI64.dll`
//! (`lib /def:nvenc.def /machine:x64 /out:nvencodeapi.lib` with the two exports above).
/// Emits the cargo link directives for the NVENC import library when the `nvenc` feature is
/// enabled; prints nothing otherwise.
fn main() {
    // Cargo exposes every enabled feature to build scripts as `CARGO_FEATURE_<NAME>`.
    if std::env::var_os("CARGO_FEATURE_NVENC").is_none() {
        return;
    }

    // Optional extra native search directory that holds `nvencodeapi.lib`.
    let lib_dir = std::env::var_os("PUNKTFUNK_NVENC_LIB_DIR");
    if let Some(dir) = lib_dir.as_deref() {
        println!("cargo:rustc-link-search=native={}", dir.to_string_lossy());
    }

    println!("cargo:rustc-link-lib=dylib=nvencodeapi");
    // Re-run the script when the library directory override changes.
    println!("cargo:rerun-if-env-changed=PUNKTFUNK_NVENC_LIB_DIR");
}
+80 -4
View File
@@ -16,8 +16,9 @@ use windows::Win32::Foundation::{HMODULE, LUID};
use windows::Win32::Graphics::Direct3D::{D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0}; use windows::Win32::Graphics::Direct3D::{D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0};
use windows::Win32::Graphics::Direct3D11::{ use windows::Win32::Graphics::Direct3D11::{
D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_FLAG, D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_FLAG,
D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_BIND_RENDER_TARGET, D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT,
D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC, D3D11_USAGE_STAGING, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC,
D3D11_USAGE_DEFAULT, D3D11_USAGE_STAGING,
}; };
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC}; use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
use windows::Win32::Graphics::Dxgi::{ use windows::Win32::Graphics::Dxgi::{
@@ -78,6 +79,13 @@ pub struct DuplCapturer {
active: AtomicBool, active: AtomicBool,
timeout_ms: u32, timeout_ms: u32,
last: Option<Vec<u8>>, last: Option<Vec<u8>>,
/// GPU-output mode (zero-copy → NVENC): produce `FramePayload::D3d11` instead of CPU BGRA.
/// Selected by `PUNKTFUNK_ENCODER=nvenc` so the capturer's output matches the encoder's input.
gpu_mode: bool,
/// Reused owned texture the duplication frame is copied into for the D3D11 path (the duplication
/// surface is transient and released each frame).
gpu_copy: Option<ID3D11Texture2D>,
have_gpu_frame: bool,
_keepalive: Box<dyn Send>, _keepalive: Box<dyn Send>,
} }
// COM objects used only from the one thread that owns the capturer (the encode thread). // COM objects used only from the one thread that owns the capturer (the encode thread).
@@ -154,12 +162,16 @@ impl DuplCapturer {
.ok() .ok()
.and_then(|s| s.parse().ok()) .and_then(|s| s.parse().ok())
.unwrap_or((2000 / refresh_hz.max(1)).max(100)); .unwrap_or((2000 / refresh_hz.max(1)).max(100));
let gpu_mode = std::env::var("PUNKTFUNK_ENCODER")
.map(|v| matches!(v.to_ascii_lowercase().as_str(), "nvenc" | "hw" | "nvidia"))
.unwrap_or(false);
tracing::info!( tracing::info!(
"DXGI duplication: {}x{}@{} on {}", "DXGI duplication: {}x{}@{} on {} ({})",
width, width,
height, height,
refresh_hz, refresh_hz,
target.gdi_name target.gdi_name,
if gpu_mode { "D3D11 zero-copy" } else { "CPU staging" }
); );
Ok(Self { Ok(Self {
device, device,
@@ -174,6 +186,9 @@ impl DuplCapturer {
active: AtomicBool::new(false), active: AtomicBool::new(false),
timeout_ms, timeout_ms,
last: None, last: None,
gpu_mode,
gpu_copy: None,
have_gpu_frame: false,
_keepalive: keepalive, _keepalive: keepalive,
}) })
} }
@@ -206,6 +221,33 @@ impl DuplCapturer {
Ok(()) Ok(())
} }
/// Creates `self.gpu_copy` on first use: a `width` x `height` BGRA default-usage texture on
/// `self.device`. Later calls are no-ops once the texture exists.
///
/// # Errors
/// Returns the `CreateTexture2D` failure, tagged with `CreateTexture2D(gpu copy)`.
unsafe fn ensure_gpu_copy(&mut self) -> Result<()> {
    if self.gpu_copy.is_none() {
        let single_sample = DXGI_SAMPLE_DESC {
            Count: 1,
            Quality: 0,
        };
        let desc = D3D11_TEXTURE2D_DESC {
            Width: self.width,
            Height: self.height,
            MipLevels: 1,
            ArraySize: 1,
            Format: DXGI_FORMAT_B8G8R8A8_UNORM,
            SampleDesc: single_sample,
            Usage: D3D11_USAGE_DEFAULT,
            BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
            CPUAccessFlags: 0,
            MiscFlags: 0,
        };
        let mut created: Option<ID3D11Texture2D> = None;
        self.device
            .CreateTexture2D(&desc, None, Some(&mut created))
            .context("CreateTexture2D(gpu copy)")?;
        self.gpu_copy = created;
    }
    Ok(())
}
unsafe fn recreate_dupl(&mut self) -> Result<()> { unsafe fn recreate_dupl(&mut self) -> Result<()> {
if self.holding_frame { if self.holding_frame {
let _ = self.dupl.ReleaseFrame(); let _ = self.dupl.ReleaseFrame();
@@ -238,6 +280,26 @@ impl DuplCapturer {
self.holding_frame = true; self.holding_frame = true;
let res = res.context("AcquireNextFrame: null resource")?; let res = res.context("AcquireNextFrame: null resource")?;
let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?; let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?;
if self.gpu_mode {
// Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication
// surface into a reused owned texture, release the duplication frame, hand off the texture.
self.ensure_gpu_copy()?;
let gpu = self.gpu_copy.clone().context("gpu copy texture")?;
self.context.CopyResource(&gpu, &tex);
let _ = self.dupl.ReleaseFrame();
self.holding_frame = false;
self.have_gpu_frame = true;
return Ok(Some(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Bgra,
payload: FramePayload::D3d11(D3d11Frame {
texture: gpu,
device: self.device.clone(),
}),
}));
}
self.ensure_staging()?; self.ensure_staging()?;
let staging = self.staging.clone().context("staging texture")?; let staging = self.staging.clone().context("staging texture")?;
self.context.CopyResource(&staging, &tex); self.context.CopyResource(&staging, &tex);
@@ -277,6 +339,20 @@ impl Capturer for DuplCapturer {
if let Some(f) = unsafe { self.acquire() }? { if let Some(f) = unsafe { self.acquire() }? {
return Ok(f); return Ok(f);
} }
if self.gpu_mode && self.have_gpu_frame {
if let Some(gpu) = &self.gpu_copy {
return Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Bgra,
payload: FramePayload::D3d11(D3d11Frame {
texture: gpu.clone(),
device: self.device.clone(),
}),
});
}
}
if let Some(b) = &self.last { if let Some(b) = &self.last {
return Ok(CapturedFrame { return Ok(CapturedFrame {
width: self.width, width: self.width,
+18 -5
View File
@@ -162,15 +162,26 @@ pub fn open_video(
.unwrap_or_default() .unwrap_or_default()
.to_ascii_lowercase(); .to_ascii_lowercase();
if matches!(pref.as_str(), "nvenc" | "hw" | "nvidia") { if matches!(pref.as_str(), "nvenc" | "hw" | "nvidia") {
anyhow::bail!( // Hardware path: NVENC over D3D11. The DXGI capturer switches to its zero-copy
"NVENC hardware encode is not yet implemented on Windows — omit PUNKTFUNK_ENCODER \ // FramePayload::D3d11 output under the same env var so capture + encode share textures.
or set it to 'software' to use the openh264 encoder" #[cfg(feature = "nvenc")]
); {
let enc =
nvenc::NvencD3d11Encoder::open(codec, format, width, height, fps, bitrate_bps)?;
return Ok(Box::new(enc) as Box<dyn Encoder>);
}
#[cfg(not(feature = "nvenc"))]
{
anyhow::bail!(
"NVENC requested but this host was built without it — rebuild with \
`--features nvenc` (needs the NVENC SDK's nvencodeapi.lib at link time)"
);
}
} }
anyhow::ensure!( anyhow::ensure!(
codec == Codec::H264, codec == Codec::H264,
"the Windows software encoder supports H.264 only; client negotiated {codec:?} \ "the Windows software encoder supports H.264 only; client negotiated {codec:?} \
(request H264, or use a GPU host once NVENC lands)" (set PUNKTFUNK_ENCODER=nvenc for a GPU host, or request H264)"
); );
// Software H.264 realistically caps far below the negotiated hardware rates. // Software H.264 realistically caps far below the negotiated hardware rates.
const SW_BITRATE_CEIL: u64 = 100_000_000; const SW_BITRATE_CEIL: u64 = 100_000_000;
@@ -189,6 +200,8 @@ pub fn open_video(
mod linux; mod linux;
#[cfg(target_os = "windows")] #[cfg(target_os = "windows")]
mod sw; mod sw;
#[cfg(all(target_os = "windows", feature = "nvenc"))]
mod nvenc;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
+351
View File
@@ -0,0 +1,351 @@
//! NVENC hardware encoder (Windows, D3D11 input) — zero-copy capture→encode on the GPU.
//!
//! Drives the raw NVENC API via `nvidia_video_codec_sdk::{sys, ENCODE_API}` (the safe `Encoder`
//! wrapper is CUDA-only). Opens an encode session bound to the **same** `ID3D11Device` as the DXGI
//! capturer (the device is carried on `FramePayload::D3d11`), registers a small pool of encoder-owned
//! BGRA textures once, and per frame `CopyResource`s the captured texture into a pooled one and
//! `encode_picture`s it. Mirrors the Linux NVENC config: CBR + ultra-low-latency, infinite GOP,
//! P-frames only, forced-IDR for RFI, in-band SPS/PPS each keyframe.
//!
//! Needs a real NVIDIA GPU at runtime (session creation fails otherwise) — compiles GPU-less, but
//! `open`/`submit` only succeed on a GPU box. The software encoder (`super::sw`) is the fallback.
use super::{Codec, EncodedFrame, Encoder};
use crate::capture::{CapturedFrame, FramePayload, PixelFormat};
use anyhow::{anyhow, bail, Context, Result};
use std::collections::VecDeque;
use std::ffi::c_void;
use std::ptr;
use windows::core::Interface;
use windows::Win32::Graphics::Direct3D11::{
ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_TEXTURE2D_DESC,
D3D11_USAGE_DEFAULT,
};
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
use nvidia_video_codec_sdk::sys::nvEncodeAPI as nv;
use nvidia_video_codec_sdk::ENCODE_API as API;
const POOL: usize = 4;
fn codec_guid(codec: Codec) -> nv::GUID {
match codec {
Codec::H264 => nv::NV_ENC_CODEC_H264_GUID,
Codec::H265 => nv::NV_ENC_CODEC_HEVC_GUID,
Codec::Av1 => nv::NV_ENC_CODEC_AV1_GUID,
}
}
/// One slot of the encoder-owned input pool: a D3D11 texture plus the NVENC handles that refer to it.
struct PooledTex {
/// Pool texture; each submitted frame is `CopyResource`d into it before being encoded.
tex: ID3D11Texture2D,
/// Handle returned by `register_resource` for `tex`; released with `unregister_resource` on drop.
reg: nv::NV_ENC_REGISTERED_PTR,
/// Pointer returned by `map_input_resource` for the frame in flight; null while the slot is unmapped.
map: nv::NV_ENC_INPUT_PTR,
}
/// NVENC encoder fed with D3D11 textures. The session is created lazily on the first submitted
/// frame (see `init_session`), using the `ID3D11Device` carried on that frame.
pub struct NvencD3d11Encoder {
/// Immediate context of the frame's device, used for `CopyResource`; `None` until the session is created.
ctx: Option<ID3D11DeviceContext>,
/// Raw NVENC session handle; null until `open_encode_session_ex` has succeeded.
encoder: *mut c_void,
/// NVENC GUID of the negotiated codec.
codec_guid: nv::GUID,
/// Encode width in pixels (also the size of the pool textures).
width: u32,
/// Encode height in pixels (also the size of the pool textures).
height: u32,
/// Frame rate passed to NVENC as `frameRateNum` (denominator 1).
fps: u32,
/// Target bitrate in bits per second; clamped to `u32::MAX` when handed to NVENC.
bitrate_bps: u64,
/// Buffer format used when registering the pool textures (set to ARGB in `open`).
buffer_fmt: nv::NV_ENC_BUFFER_FORMAT,
/// Registered input textures; filled with `POOL` entries by `init_session`.
pool: Vec<PooledTex>,
/// Running submit counter; the pool slot for a frame is `next % POOL`.
next: usize,
/// One NVENC output bitstream buffer per pool slot, indexed like `pool`.
bitstreams: Vec<nv::NV_ENC_OUTPUT_PTR>,
/// Submitted-but-not-yet-polled frames as `(bitstream, pool slot, capture pts in ns)`, oldest first.
pending: VecDeque<(nv::NV_ENC_OUTPUT_PTR, usize, u64)>,
/// Running frame index, passed to NVENC as `inputTimeStamp`.
frame_idx: i64,
/// Set by `request_keyframe`; consumed by the next `submit`, which then forces an IDR with SPS/PPS.
force_kf: bool,
/// True once `init_session` has completed successfully.
inited: bool,
}
// SAFETY: the struct holds a raw NVENC session handle and D3D11 COM pointers, none of which are
// `Send` on their own. The impl relies on the encoder being used from a single encode thread (like
// the Linux encoder). NOTE(review): nothing in this file enforces that confinement — confirm that
// the owner never touches the encoder from two threads.
unsafe impl Send for NvencD3d11Encoder {}
impl NvencD3d11Encoder {
    /// Records the encode parameters without touching NVENC or D3D11.
    ///
    /// The session itself is created on the first `submit`, because it has to be bound to the
    /// `ID3D11Device` that arrives with the captured frame. `_format` is currently unused: the
    /// pool textures are always registered as `NV_ENC_BUFFER_FORMAT_ARGB`.
    ///
    /// # Errors
    /// Never fails today; the `Result` is kept for the caller's signature.
    pub fn open(
        codec: Codec,
        _format: PixelFormat,
        width: u32,
        height: u32,
        fps: u32,
        bitrate_bps: u64,
    ) -> Result<Self> {
        Ok(Self {
            ctx: None,
            encoder: ptr::null_mut(),
            codec_guid: codec_guid(codec),
            width,
            height,
            fps,
            bitrate_bps,
            buffer_fmt: nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
            pool: Vec::new(),
            next: 0,
            bitstreams: Vec::new(),
            pending: VecDeque::new(),
            frame_idx: 0,
            force_kf: false,
            inited: false,
        })
    }

    /// Lazily create the session on the first frame's D3D11 device (so capture + encode share it).
    ///
    /// If any step fails, everything created so far (session, registered textures, bitstream
    /// buffers) is released again before the error is returned. Without that, a retry from the
    /// next `submit` would open a second session on top of the first, leak the old handle and
    /// leave stale registrations at the front of the pool.
    ///
    /// # Errors
    /// Returns the first failing NVENC or D3D11 call, tagged with the call's name.
    fn init_session(&mut self, device: &ID3D11Device) -> Result<()> {
        let created = self.create_session(device);
        if created.is_err() {
            self.release_session();
        }
        created
    }

    /// Opens the session, configures it and builds the texture/bitstream pool. On error the
    /// partially built state is left in `self` for `release_session` to clean up.
    fn create_session(&mut self, device: &ID3D11Device) -> Result<()> {
        // SAFETY: every NVENC call receives a correctly versioned parameter struct that outlives
        // the call, `device` is a live COM interface for the duration of this function, and the
        // session handle is only used after `open_encode_session_ex` reported success.
        unsafe {
            self.ctx = Some(device.GetImmediateContext().context("D3D11 immediate context")?);

            // 1. open the session bound to the D3D11 device.
            let mut params = nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS {
                version: nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER,
                deviceType: nv::NV_ENC_DEVICE_TYPE::NV_ENC_DEVICE_TYPE_DIRECTX,
                device: device.as_raw(),
                apiVersion: nv::NVENCAPI_VERSION,
                ..Default::default()
            };
            let mut enc: *mut c_void = ptr::null_mut();
            (API.open_encode_session_ex)(&mut params, &mut enc)
                .result_without_string()
                .map_err(|e| anyhow!("NVENC open_encode_session_ex: {e:?} (no NVIDIA GPU?)"))?;
            // Stored immediately so that a later failure can still destroy the session.
            self.encoder = enc;

            // 2. seed the P1 + ultra-low-latency preset config.
            let mut preset = nv::NV_ENC_PRESET_CONFIG {
                version: nv::NV_ENC_PRESET_CONFIG_VER,
                presetCfg: nv::NV_ENC_CONFIG {
                    version: nv::NV_ENC_CONFIG_VER,
                    ..Default::default()
                },
                ..Default::default()
            };
            (API.get_encode_preset_config_ex)(
                enc,
                self.codec_guid,
                nv::NV_ENC_PRESET_P1_GUID,
                nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY,
                &mut preset,
            )
            .result_without_string()
            .map_err(|e| anyhow!("get_encode_preset_config_ex: {e:?}"))?;
            let mut cfg = preset.presetCfg;

            // 3. mirror the Linux RC config: CBR, infinite GOP, P-only, ~1-frame VBV.
            cfg.gopLength = nv::NVENC_INFINITE_GOPLENGTH;
            cfg.frameIntervalP = 1;
            cfg.rcParams.rateControlMode = nv::NV_ENC_PARAMS_RC_MODE::NV_ENC_PARAMS_RC_CBR;
            // NVENC takes 32-bit bitrates; clamp instead of wrapping.
            let bps = self.bitrate_bps.min(u32::MAX as u64) as u32;
            cfg.rcParams.averageBitRate = bps;
            cfg.rcParams.maxBitRate = bps;
            // One frame's worth of bits; the float-to-int cast saturates rather than wraps.
            let vbv = (self.bitrate_bps as f64 / self.fps.max(1) as f64) as u32;
            cfg.rcParams.vbvBufferSize = vbv;
            cfg.rcParams.vbvInitialDelay = vbv;

            // 4. initialize the encoder.
            let mut init = nv::NV_ENC_INITIALIZE_PARAMS {
                version: nv::NV_ENC_INITIALIZE_PARAMS_VER,
                encodeGUID: self.codec_guid,
                presetGUID: nv::NV_ENC_PRESET_P1_GUID,
                tuningInfo: nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY,
                encodeWidth: self.width,
                encodeHeight: self.height,
                darWidth: self.width,
                darHeight: self.height,
                frameRateNum: self.fps,
                frameRateDen: 1,
                enablePTD: 1,
                encodeConfig: &mut cfg,
                ..Default::default()
            };
            (API.initialize_encoder)(enc, &mut init)
                .result_without_string()
                .map_err(|e| anyhow!("initialize_encoder: {e:?}"))?;

            // 5. encoder-owned BGRA texture pool, registered once, + one bitstream per slot.
            let desc = D3D11_TEXTURE2D_DESC {
                Width: self.width,
                Height: self.height,
                MipLevels: 1,
                ArraySize: 1,
                Format: DXGI_FORMAT_B8G8R8A8_UNORM,
                SampleDesc: DXGI_SAMPLE_DESC {
                    Count: 1,
                    Quality: 0,
                },
                Usage: D3D11_USAGE_DEFAULT,
                BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
                CPUAccessFlags: 0,
                MiscFlags: 0,
            };
            for _ in 0..POOL {
                let mut tex: Option<ID3D11Texture2D> = None;
                device
                    .CreateTexture2D(&desc, None, Some(&mut tex))
                    .context("CreateTexture2D(nvenc pool)")?;
                let tex = tex.context("null pool texture")?;
                let mut rr = nv::NV_ENC_REGISTER_RESOURCE {
                    version: nv::NV_ENC_REGISTER_RESOURCE_VER,
                    resourceType:
                        nv::NV_ENC_INPUT_RESOURCE_TYPE::NV_ENC_INPUT_RESOURCE_TYPE_DIRECTX,
                    width: self.width,
                    height: self.height,
                    pitch: 0,
                    resourceToRegister: tex.as_raw(),
                    bufferFormat: self.buffer_fmt,
                    bufferUsage: nv::NV_ENC_BUFFER_USAGE::NV_ENC_INPUT_IMAGE,
                    ..Default::default()
                };
                (API.register_resource)(enc, &mut rr)
                    .result_without_string()
                    .map_err(|e| anyhow!("register_resource: {e:?}"))?;
                self.pool.push(PooledTex {
                    tex,
                    reg: rr.registeredResource,
                    map: ptr::null_mut(),
                });
                let mut cb = nv::NV_ENC_CREATE_BITSTREAM_BUFFER {
                    version: nv::NV_ENC_CREATE_BITSTREAM_BUFFER_VER,
                    ..Default::default()
                };
                (API.create_bitstream_buffer)(enc, &mut cb)
                    .result_without_string()
                    .map_err(|e| anyhow!("create_bitstream_buffer: {e:?}"))?;
                self.bitstreams.push(cb.bitstreamBuffer);
            }

            self.inited = true;
            tracing::info!(
                "NVENC D3D11 session: {}x{}@{} {} Mbps {:?}",
                self.width,
                self.height,
                self.fps,
                bps / 1_000_000,
                self.codec_guid
            );
            Ok(())
        }
    }

    /// Best-effort teardown of whatever part of the session exists, returning the encoder to
    /// its freshly `open`ed state so that `init_session` can be retried from scratch.
    fn release_session(&mut self) {
        if !self.encoder.is_null() {
            // SAFETY: `self.encoder` is the live session that every handle in `pool` and
            // `bitstreams` was created against; each handle is released exactly once because
            // the lists are cleared right below.
            unsafe {
                for slot in &self.pool {
                    if !slot.map.is_null() {
                        let _ = (API.unmap_input_resource)(self.encoder, slot.map);
                    }
                    let _ = (API.unregister_resource)(self.encoder, slot.reg);
                }
                for &bs in &self.bitstreams {
                    let _ = (API.destroy_bitstream_buffer)(self.encoder, bs);
                }
                let _ = (API.destroy_encoder)(self.encoder);
            }
        }
        self.encoder = ptr::null_mut();
        self.pool.clear();
        self.bitstreams.clear();
        self.pending.clear();
        self.next = 0;
        self.ctx = None;
        self.inited = false;
    }
}
impl NvencD3d11Encoder {
    /// Unmaps the NVENC input currently mapped for pool slot `slot`, if there is one, and marks
    /// the slot as unmapped. Failures are ignored (best effort, as in `Drop`).
    fn unmap_slot(&mut self, slot: usize) {
        let Some(entry) = self.pool.get_mut(slot) else {
            return;
        };
        if entry.map.is_null() {
            return;
        }
        // SAFETY: `entry.map` was returned by `map_input_resource` on this same session and has
        // not been unmapped yet (it is reset to null right after).
        unsafe {
            let _ = (API.unmap_input_resource)(self.encoder, entry.map);
        }
        entry.map = ptr::null_mut();
    }
}

impl Encoder for NvencD3d11Encoder {
    /// Copies the captured D3D11 texture into the next pool slot and submits it to NVENC.
    ///
    /// The session is created on the first call, on the device carried by the frame. The
    /// encoded output is collected later through `poll`.
    ///
    /// # Errors
    /// - the frame is a CPU frame, or its size differs from the session's size (`CopyResource`
    ///   requires identical dimensions);
    /// - all `POOL` slots still hold frames that have not been `poll`ed;
    /// - session creation, `map_input_resource` or `encode_picture` fails.
    ///
    /// On an `encode_picture` failure the input is unmapped again and a pending keyframe
    /// request is kept for the next frame.
    fn submit(&mut self, captured: &CapturedFrame) -> Result<()> {
        let frame = match &captured.payload {
            FramePayload::D3d11(f) => f,
            FramePayload::Cpu(_) => {
                bail!("NVENC D3D11 encoder needs a GPU texture frame (use the software encoder for CPU frames)")
            }
        };
        if captured.width != self.width || captured.height != self.height {
            bail!(
                "NVENC D3D11 encoder: captured frame is {}x{} but the session is {}x{}",
                captured.width,
                captured.height,
                self.width,
                self.height
            );
        }
        if !self.inited {
            let device = frame.device.clone();
            self.init_session(&device)?;
        }

        let slot = self.next % POOL;
        // Reusing a slot whose output has not been polled would overwrite both its input
        // mapping and its bitstream buffer.
        if self.pending.iter().any(|&(_, s, _)| s == slot) {
            bail!(
                "NVENC D3D11 encoder: all {POOL} pool slots are in flight (poll the encoded frames before submitting more)"
            );
        }
        // A mapping left behind by an earlier failure must not be overwritten (it would leak).
        self.unmap_slot(slot);

        let ctx = self.ctx.as_ref().context("no D3D11 context")?;
        // SAFETY: both textures are live COM objects created on the device this context belongs
        // to, and the size check above guarantees matching dimensions.
        unsafe {
            ctx.CopyResource(&self.pool[slot].tex, &frame.texture);
        }

        let mut mp = nv::NV_ENC_MAP_INPUT_RESOURCE {
            version: nv::NV_ENC_MAP_INPUT_RESOURCE_VER,
            registeredResource: self.pool[slot].reg,
            ..Default::default()
        };
        // SAFETY: `self.encoder` is the initialized session and `reg` was registered on it.
        unsafe { (API.map_input_resource)(self.encoder, &mut mp) }
            .result_without_string()
            .map_err(|e| anyhow!("map_input_resource: {e:?}"))?;
        self.pool[slot].map = mp.mappedResource;

        let pts = self.frame_idx as u64;
        self.frame_idx += 1;
        let want_idr = std::mem::take(&mut self.force_kf);
        let flags: u32 = if want_idr {
            nv::NV_ENC_PIC_FLAGS::NV_ENC_PIC_FLAG_FORCEIDR as u32
                | nv::NV_ENC_PIC_FLAGS::NV_ENC_PIC_FLAG_OUTPUT_SPSPPS as u32
        } else {
            0
        };
        let mut pic = nv::NV_ENC_PIC_PARAMS {
            version: nv::NV_ENC_PIC_PARAMS_VER,
            inputWidth: self.width,
            inputHeight: self.height,
            inputPitch: 0,
            inputBuffer: mp.mappedResource,
            bufferFmt: mp.mappedBufferFmt,
            outputBitstream: self.bitstreams[slot],
            pictureStruct: nv::NV_ENC_PIC_STRUCT::NV_ENC_PIC_STRUCT_FRAME,
            inputTimeStamp: pts,
            encodePicFlags: flags,
            ..Default::default()
        };
        // SAFETY: `pic` references the input mapped above and this slot's bitstream buffer,
        // both owned by `self.encoder`.
        let encoded = unsafe { (API.encode_picture)(self.encoder, &mut pic) }.result_without_string();
        if let Err(e) = encoded {
            // Nothing was queued: free the mapping and keep the keyframe request alive.
            self.unmap_slot(slot);
            self.force_kf |= want_idr;
            return Err(anyhow!("encode_picture: {e:?}"));
        }

        self.next += 1;
        self.pending
            .push_back((self.bitstreams[slot], slot, captured.pts_ns));
        Ok(())
    }

    /// Requests that the next submitted frame be encoded as an IDR with in-band SPS/PPS.
    fn request_keyframe(&mut self) {
        self.force_kf = true;
    }

    /// Retrieves the oldest submitted frame's bitstream, or `Ok(None)` when nothing is pending.
    ///
    /// The slot's input mapping is released on every path, including failures, so the slot can
    /// be reused by `submit`.
    ///
    /// # Errors
    /// Returns the `lock_bitstream` / `unlock_bitstream` failure; the frame is dropped then.
    fn poll(&mut self) -> Result<Option<EncodedFrame>> {
        let Some((bs, slot, pts_ns)) = self.pending.pop_front() else {
            return Ok(None);
        };

        let mut lock = nv::NV_ENC_LOCK_BITSTREAM {
            version: nv::NV_ENC_LOCK_BITSTREAM_VER,
            outputBitstream: bs,
            ..Default::default()
        };
        // SAFETY: `bs` is a bitstream buffer created on `self.encoder` that was handed to
        // `encode_picture` by `submit`.
        let locked = unsafe { (API.lock_bitstream)(self.encoder, &mut lock) }.result_without_string();
        if let Err(e) = locked {
            self.unmap_slot(slot);
            return Err(anyhow!("lock_bitstream: {e:?}"));
        }

        let data = if lock.bitstreamBufferPtr.is_null() || lock.bitstreamSizeInBytes == 0 {
            // `from_raw_parts` must never see a null pointer, even for an empty slice.
            Vec::new()
        } else {
            // SAFETY: the lock succeeded, so NVENC guarantees `bitstreamSizeInBytes` readable
            // bytes at `bitstreamBufferPtr` until the buffer is unlocked below.
            unsafe {
                std::slice::from_raw_parts(
                    lock.bitstreamBufferPtr as *const u8,
                    lock.bitstreamSizeInBytes as usize,
                )
            }
            .to_vec()
        };
        let keyframe = matches!(
            lock.pictureType,
            nv::NV_ENC_PIC_TYPE::NV_ENC_PIC_TYPE_IDR | nv::NV_ENC_PIC_TYPE::NV_ENC_PIC_TYPE_I
        );

        // SAFETY: `bs` is the buffer locked above.
        let unlocked = unsafe { (API.unlock_bitstream)(self.encoder, bs) }.result_without_string();
        // Release the input before reporting an unlock failure, so the slot is never stranded.
        self.unmap_slot(slot);
        unlocked.map_err(|e| anyhow!("unlock_bitstream: {e:?}"))?;

        Ok(Some(EncodedFrame {
            data,
            pts_ns,
            keyframe,
        }))
    }

    /// No-op: with P1/ULL and `frameIntervalP = 1` each submit yields its access unit, so there
    /// is no internal queue to drain.
    fn flush(&mut self) -> Result<()> {
        Ok(())
    }
}
impl Drop for NvencD3d11Encoder {
    /// Releases the NVENC session and everything created on it. Does nothing when no session
    /// was ever opened. All calls are best effort; their status codes are ignored.
    fn drop(&mut self) {
        let session = self.encoder;
        if !session.is_null() {
            // SAFETY: `session` is the handle every pooled resource and bitstream buffer was
            // created against, and nothing uses them after this point.
            unsafe {
                // Inputs first: unmap whatever is still mapped, then drop the registration.
                for slot in self.pool.iter() {
                    if !slot.map.is_null() {
                        let _ = (API.unmap_input_resource)(session, slot.map);
                    }
                    let _ = (API.unregister_resource)(session, slot.reg);
                }
                // Then the output buffers, and finally the session itself.
                for bitstream in self.bitstreams.iter().copied() {
                    let _ = (API.destroy_bitstream_buffer)(session, bitstream);
                }
                let _ = (API.destroy_encoder)(session);
            }
        }
    }
}