feat(host/windows): NVENC D3D11 hardware encoder (--features nvenc)
android / android (push) Failing after 36s
ci / rust (push) Failing after 45s
apple / swift (push) Successful in 55s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 29s
ci / bench (push) Successful in 1m35s
decky / build-publish (push) Successful in 12s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 4s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 4s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 3s
flatpak / build-publish (push) Failing after 2s
deb / build-publish (push) Successful in 3m13s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 1m17s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 1m32s
docker / deploy-docs (push) Successful in 17s
android / android (push) Failing after 36s
ci / rust (push) Failing after 45s
apple / swift (push) Successful in 55s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 29s
ci / bench (push) Successful in 1m35s
decky / build-publish (push) Successful in 12s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 4s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 4s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 3s
flatpak / build-publish (push) Failing after 2s
deb / build-publish (push) Successful in 3m13s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 1m17s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 1m32s
docker / deploy-docs (push) Successful in 17s
Zero-copy capture->encode on the GPU via the raw NVENC API (nvidia_video_codec_sdk sys + ENCODE_API; the safe wrapper is CUDA-only). Opens an NV_ENC_DEVICE_TYPE_DIRECTX session on the SAME ID3D11Device as the DXGI capturer (carried on the new FramePayload::D3d11), registers a pool of BGRA textures once, CopyResources each captured texture in and encode_picture; CBR/ULL, infinite GOP, P-only, forced-IDR for RFI. The DXGI capturer gains a D3D11 zero-copy output (selected, like the encoder, by PUNKTFUNK_ENCODER=nvenc) so capture+encode share textures. OFF by default (the nvenc feature pulls the NVENC SDK + cudarc): the default Windows host links without it (openh264 path). cudarc builds toolkit-less via the SDK ci-check feature (dynamic-loading). At link time --features nvenc needs nvencodeapi.lib (NVENC SDK, or an import lib generated from the driver's nvEncodeAPI64.dll) on PUNKTFUNK_NVENC_LIB_DIR. Both default and --features nvenc builds validated to compile+link GPU-less on the VM (import lib generated from the driver DLL). Runtime needs a real NVIDIA GPU. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Generated
+20
@@ -806,6 +806,15 @@ dependencies = [
|
|||||||
"cipher",
|
"cipher",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cudarc"
|
||||||
|
version = "0.16.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "17200eb07e7d85a243aa1bf4569a7aa998385ba98d14833973a817a63cc86e92"
|
||||||
|
dependencies = [
|
||||||
|
"libloading",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "curve25519-dalek"
|
name = "curve25519-dalek"
|
||||||
version = "4.1.3"
|
version = "4.1.3"
|
||||||
@@ -2222,6 +2231,16 @@ dependencies = [
|
|||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nvidia-video-codec-sdk"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1b150dfc88653e761947906dfc0ea14af16ae366cfc55122caab94381761605a"
|
||||||
|
dependencies = [
|
||||||
|
"cudarc",
|
||||||
|
"lazy_static",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "oid-registry"
|
name = "oid-registry"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@@ -2615,6 +2634,7 @@ dependencies = [
|
|||||||
"khronos-egl",
|
"khronos-egl",
|
||||||
"libc",
|
"libc",
|
||||||
"mdns-sd",
|
"mdns-sd",
|
||||||
|
"nvidia-video-codec-sdk",
|
||||||
"openh264",
|
"openh264",
|
||||||
"opus",
|
"opus",
|
||||||
"pipewire",
|
"pipewire",
|
||||||
|
|||||||
@@ -127,3 +127,16 @@ windows = { version = "0.62", features = [
|
|||||||
openh264 = "0.9"
|
openh264 = "0.9"
|
||||||
# WASAPI loopback audio capture (default render endpoint -> 48 kHz stereo f32 for the Opus path).
|
# WASAPI loopback audio capture (default render endpoint -> 48 kHz stereo f32 for the Opus path).
|
||||||
wasapi = "0.23"
|
wasapi = "0.23"
|
||||||
|
# NVENC hardware encoder (NVENC SDK, D3D11 input). The SDK pins `cudarc` with
|
||||||
|
# `cuda-version-from-build-system` (a build-time CUDA-toolkit probe); its `ci-check` feature switches
|
||||||
|
# cudarc to `dynamic-loading` (loads nvcuda.dll at runtime — nothing needed at build), which is how
|
||||||
|
# the crate builds on docs.rs/CI. We enable it so the GPU-less VM/CI compiles; the DirectX NVENC path
|
||||||
|
# never calls CUDA at runtime, so the pinned CUDA bindings version is irrelevant.
|
||||||
|
nvidia-video-codec-sdk = { version = "0.4", features = ["ci-check"], optional = true }
|
||||||
|
|
||||||
|
[features]
|
||||||
|
# NVENC hardware encode (Windows). OFF by default: it pulls the NVENC SDK, and the host then needs
|
||||||
|
# the NVENC entry points (NvEncodeAPICreateInstance / NvEncodeAPIGetMaxSupportedVersion) at link
|
||||||
|
# time — i.e. `nvencodeapi.lib` from the NVIDIA Video Codec SDK (or an import lib generated from
|
||||||
|
# nvEncodeAPI64.dll) on the linker path. Build the GPU host with `--features nvenc`.
|
||||||
|
nvenc = ["dep:nvidia-video-codec-sdk"]
|
||||||
|
|||||||
@@ -0,0 +1,16 @@
|
|||||||
|
//! Build script. The only thing it does: with the `nvenc` feature (Windows GPU host), tell the
|
||||||
|
//! linker to pull the NVENC import library. The NVENC entry points
|
||||||
|
//! (`NvEncodeAPICreateInstance` / `NvEncodeAPIGetMaxSupportedVersion`) live in `nvEncodeAPI64.dll`
|
||||||
|
//! (shipped with the NVIDIA driver), so the host links against `nvencodeapi.lib`. Point
|
||||||
|
//! `PUNKTFUNK_NVENC_LIB_DIR` at a directory containing `nvencodeapi.lib` — from the NVIDIA Video
|
||||||
|
//! Codec SDK, or an import lib generated from the driver's `nvEncodeAPI64.dll`
|
||||||
|
//! (`lib /def:nvenc.def /machine:x64 /out:nvencodeapi.lib` with the two exports above).
|
||||||
|
/// Emits the Cargo directives that link the NVENC import library.
///
/// Linking is requested only when the `nvenc` feature is enabled **and** the build targets
/// Windows: `nvencodeapi.lib` is a Windows import library, so emitting the directive for any
/// other target (for example an `--all-features` build elsewhere) would only break the link step.
fn main() {
    // Emitted unconditionally: as soon as a build script prints any `rerun-if-*` directive Cargo
    // stops re-running it on every package file change, which is what we want for the (default)
    // feature-off build as well.
    println!("cargo:rerun-if-env-changed=PUNKTFUNK_NVENC_LIB_DIR");

    if std::env::var_os("CARGO_FEATURE_NVENC").is_none() {
        return;
    }
    // `CARGO_CFG_TARGET_OS` describes the *target*, so cross-compiles are handled correctly.
    if !matches!(std::env::var("CARGO_CFG_TARGET_OS").as_deref(), Ok("windows")) {
        return;
    }

    // Optional extra search directory holding `nvencodeapi.lib`.
    if let Some(dir) = std::env::var_os("PUNKTFUNK_NVENC_LIB_DIR") {
        println!("cargo:rustc-link-search=native={}", dir.to_string_lossy());
    }
    println!("cargo:rustc-link-lib=dylib=nvencodeapi");
}
|
||||||
@@ -16,8 +16,9 @@ use windows::Win32::Foundation::{HMODULE, LUID};
|
|||||||
use windows::Win32::Graphics::Direct3D::{D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0};
|
use windows::Win32::Graphics::Direct3D::{D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0};
|
||||||
use windows::Win32::Graphics::Direct3D11::{
|
use windows::Win32::Graphics::Direct3D11::{
|
||||||
D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_FLAG,
|
D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_FLAG,
|
||||||
D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ,
|
D3D11_BIND_RENDER_TARGET, D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT,
|
||||||
D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC, D3D11_USAGE_STAGING,
|
D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC,
|
||||||
|
D3D11_USAGE_DEFAULT, D3D11_USAGE_STAGING,
|
||||||
};
|
};
|
||||||
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
|
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
|
||||||
use windows::Win32::Graphics::Dxgi::{
|
use windows::Win32::Graphics::Dxgi::{
|
||||||
@@ -78,6 +79,13 @@ pub struct DuplCapturer {
|
|||||||
active: AtomicBool,
|
active: AtomicBool,
|
||||||
timeout_ms: u32,
|
timeout_ms: u32,
|
||||||
last: Option<Vec<u8>>,
|
last: Option<Vec<u8>>,
|
||||||
|
/// GPU-output mode (zero-copy → NVENC): produce `FramePayload::D3d11` instead of CPU BGRA.
|
||||||
|
/// Selected by `PUNKTFUNK_ENCODER=nvenc` so the capturer's output matches the encoder's input.
|
||||||
|
gpu_mode: bool,
|
||||||
|
/// Reused owned texture the duplication frame is copied into for the D3D11 path (the duplication
|
||||||
|
/// surface is transient and released each frame).
|
||||||
|
gpu_copy: Option<ID3D11Texture2D>,
|
||||||
|
have_gpu_frame: bool,
|
||||||
_keepalive: Box<dyn Send>,
|
_keepalive: Box<dyn Send>,
|
||||||
}
|
}
|
||||||
// COM objects used only from the one thread that owns the capturer (the encode thread).
|
// COM objects used only from the one thread that owns the capturer (the encode thread).
|
||||||
@@ -154,12 +162,16 @@ impl DuplCapturer {
|
|||||||
.ok()
|
.ok()
|
||||||
.and_then(|s| s.parse().ok())
|
.and_then(|s| s.parse().ok())
|
||||||
.unwrap_or((2000 / refresh_hz.max(1)).max(100));
|
.unwrap_or((2000 / refresh_hz.max(1)).max(100));
|
||||||
|
let gpu_mode = std::env::var("PUNKTFUNK_ENCODER")
|
||||||
|
.map(|v| matches!(v.to_ascii_lowercase().as_str(), "nvenc" | "hw" | "nvidia"))
|
||||||
|
.unwrap_or(false);
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"DXGI duplication: {}x{}@{} on {}",
|
"DXGI duplication: {}x{}@{} on {} ({})",
|
||||||
width,
|
width,
|
||||||
height,
|
height,
|
||||||
refresh_hz,
|
refresh_hz,
|
||||||
target.gdi_name
|
target.gdi_name,
|
||||||
|
if gpu_mode { "D3D11 zero-copy" } else { "CPU staging" }
|
||||||
);
|
);
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
device,
|
device,
|
||||||
@@ -174,6 +186,9 @@ impl DuplCapturer {
|
|||||||
active: AtomicBool::new(false),
|
active: AtomicBool::new(false),
|
||||||
timeout_ms,
|
timeout_ms,
|
||||||
last: None,
|
last: None,
|
||||||
|
gpu_mode,
|
||||||
|
gpu_copy: None,
|
||||||
|
have_gpu_frame: false,
|
||||||
_keepalive: keepalive,
|
_keepalive: keepalive,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -206,6 +221,33 @@ impl DuplCapturer {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsafe fn ensure_gpu_copy(&mut self) -> Result<()> {
|
||||||
|
if self.gpu_copy.is_some() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let desc = D3D11_TEXTURE2D_DESC {
|
||||||
|
Width: self.width,
|
||||||
|
Height: self.height,
|
||||||
|
MipLevels: 1,
|
||||||
|
ArraySize: 1,
|
||||||
|
Format: DXGI_FORMAT_B8G8R8A8_UNORM,
|
||||||
|
SampleDesc: DXGI_SAMPLE_DESC {
|
||||||
|
Count: 1,
|
||||||
|
Quality: 0,
|
||||||
|
},
|
||||||
|
Usage: D3D11_USAGE_DEFAULT,
|
||||||
|
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||||
|
CPUAccessFlags: 0,
|
||||||
|
MiscFlags: 0,
|
||||||
|
};
|
||||||
|
let mut t: Option<ID3D11Texture2D> = None;
|
||||||
|
self.device
|
||||||
|
.CreateTexture2D(&desc, None, Some(&mut t))
|
||||||
|
.context("CreateTexture2D(gpu copy)")?;
|
||||||
|
self.gpu_copy = t;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
unsafe fn recreate_dupl(&mut self) -> Result<()> {
|
unsafe fn recreate_dupl(&mut self) -> Result<()> {
|
||||||
if self.holding_frame {
|
if self.holding_frame {
|
||||||
let _ = self.dupl.ReleaseFrame();
|
let _ = self.dupl.ReleaseFrame();
|
||||||
@@ -238,6 +280,26 @@ impl DuplCapturer {
|
|||||||
self.holding_frame = true;
|
self.holding_frame = true;
|
||||||
let res = res.context("AcquireNextFrame: null resource")?;
|
let res = res.context("AcquireNextFrame: null resource")?;
|
||||||
let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?;
|
let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?;
|
||||||
|
if self.gpu_mode {
|
||||||
|
// Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication
|
||||||
|
// surface into a reused owned texture, release the duplication frame, hand off the texture.
|
||||||
|
self.ensure_gpu_copy()?;
|
||||||
|
let gpu = self.gpu_copy.clone().context("gpu copy texture")?;
|
||||||
|
self.context.CopyResource(&gpu, &tex);
|
||||||
|
let _ = self.dupl.ReleaseFrame();
|
||||||
|
self.holding_frame = false;
|
||||||
|
self.have_gpu_frame = true;
|
||||||
|
return Ok(Some(CapturedFrame {
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
pts_ns: now_ns(),
|
||||||
|
format: PixelFormat::Bgra,
|
||||||
|
payload: FramePayload::D3d11(D3d11Frame {
|
||||||
|
texture: gpu,
|
||||||
|
device: self.device.clone(),
|
||||||
|
}),
|
||||||
|
}));
|
||||||
|
}
|
||||||
self.ensure_staging()?;
|
self.ensure_staging()?;
|
||||||
let staging = self.staging.clone().context("staging texture")?;
|
let staging = self.staging.clone().context("staging texture")?;
|
||||||
self.context.CopyResource(&staging, &tex);
|
self.context.CopyResource(&staging, &tex);
|
||||||
@@ -277,6 +339,20 @@ impl Capturer for DuplCapturer {
|
|||||||
if let Some(f) = unsafe { self.acquire() }? {
|
if let Some(f) = unsafe { self.acquire() }? {
|
||||||
return Ok(f);
|
return Ok(f);
|
||||||
}
|
}
|
||||||
|
if self.gpu_mode && self.have_gpu_frame {
|
||||||
|
if let Some(gpu) = &self.gpu_copy {
|
||||||
|
return Ok(CapturedFrame {
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
pts_ns: now_ns(),
|
||||||
|
format: PixelFormat::Bgra,
|
||||||
|
payload: FramePayload::D3d11(D3d11Frame {
|
||||||
|
texture: gpu.clone(),
|
||||||
|
device: self.device.clone(),
|
||||||
|
}),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
if let Some(b) = &self.last {
|
if let Some(b) = &self.last {
|
||||||
return Ok(CapturedFrame {
|
return Ok(CapturedFrame {
|
||||||
width: self.width,
|
width: self.width,
|
||||||
|
|||||||
@@ -162,15 +162,26 @@ pub fn open_video(
|
|||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
.to_ascii_lowercase();
|
.to_ascii_lowercase();
|
||||||
if matches!(pref.as_str(), "nvenc" | "hw" | "nvidia") {
|
if matches!(pref.as_str(), "nvenc" | "hw" | "nvidia") {
|
||||||
anyhow::bail!(
|
// Hardware path: NVENC over D3D11. The DXGI capturer switches to its zero-copy
|
||||||
"NVENC hardware encode is not yet implemented on Windows — omit PUNKTFUNK_ENCODER \
|
// FramePayload::D3d11 output under the same env var so capture + encode share textures.
|
||||||
or set it to 'software' to use the openh264 encoder"
|
#[cfg(feature = "nvenc")]
|
||||||
);
|
{
|
||||||
|
let enc =
|
||||||
|
nvenc::NvencD3d11Encoder::open(codec, format, width, height, fps, bitrate_bps)?;
|
||||||
|
return Ok(Box::new(enc) as Box<dyn Encoder>);
|
||||||
|
}
|
||||||
|
#[cfg(not(feature = "nvenc"))]
|
||||||
|
{
|
||||||
|
anyhow::bail!(
|
||||||
|
"NVENC requested but this host was built without it — rebuild with \
|
||||||
|
`--features nvenc` (needs the NVENC SDK's nvencodeapi.lib at link time)"
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
codec == Codec::H264,
|
codec == Codec::H264,
|
||||||
"the Windows software encoder supports H.264 only; client negotiated {codec:?} \
|
"the Windows software encoder supports H.264 only; client negotiated {codec:?} \
|
||||||
(request H264, or use a GPU host once NVENC lands)"
|
(set PUNKTFUNK_ENCODER=nvenc for a GPU host, or request H264)"
|
||||||
);
|
);
|
||||||
// Software H.264 realistically caps far below the negotiated hardware rates.
|
// Software H.264 realistically caps far below the negotiated hardware rates.
|
||||||
const SW_BITRATE_CEIL: u64 = 100_000_000;
|
const SW_BITRATE_CEIL: u64 = 100_000_000;
|
||||||
@@ -189,6 +200,8 @@ pub fn open_video(
|
|||||||
mod linux;
|
mod linux;
|
||||||
#[cfg(target_os = "windows")]
|
#[cfg(target_os = "windows")]
|
||||||
mod sw;
|
mod sw;
|
||||||
|
#[cfg(all(target_os = "windows", feature = "nvenc"))]
|
||||||
|
mod nvenc;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|||||||
@@ -0,0 +1,351 @@
|
|||||||
|
//! NVENC hardware encoder (Windows, D3D11 input) — zero-copy capture→encode on the GPU.
|
||||||
|
//!
|
||||||
|
//! Drives the raw NVENC API via `nvidia_video_codec_sdk::{sys, ENCODE_API}` (the safe `Encoder`
|
||||||
|
//! wrapper is CUDA-only). Opens an encode session bound to the **same** `ID3D11Device` as the DXGI
|
||||||
|
//! capturer (the device is carried on `FramePayload::D3d11`), registers a small pool of encoder-owned
|
||||||
|
//! BGRA textures once, and per frame `CopyResource`s the captured texture into a pooled one and
|
||||||
|
//! `encode_picture`s it. Mirrors the Linux NVENC config: CBR + ultra-low-latency, infinite GOP,
|
||||||
|
//! P-frames only, forced-IDR for RFI, in-band SPS/PPS each keyframe.
|
||||||
|
//!
|
||||||
|
//! Needs a real NVIDIA GPU at runtime (session creation fails otherwise) — compiles GPU-less, but
|
||||||
|
//! `open`/`submit` only succeed on a GPU box. The software encoder (`super::sw`) is the fallback.
|
||||||
|
|
||||||
|
use super::{Codec, EncodedFrame, Encoder};
|
||||||
|
use crate::capture::{CapturedFrame, FramePayload, PixelFormat};
|
||||||
|
use anyhow::{anyhow, bail, Context, Result};
|
||||||
|
use std::collections::VecDeque;
|
||||||
|
use std::ffi::c_void;
|
||||||
|
use std::ptr;
|
||||||
|
use windows::core::Interface;
|
||||||
|
use windows::Win32::Graphics::Direct3D11::{
|
||||||
|
ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_RENDER_TARGET, D3D11_TEXTURE2D_DESC,
|
||||||
|
D3D11_USAGE_DEFAULT,
|
||||||
|
};
|
||||||
|
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
|
||||||
|
|
||||||
|
use nvidia_video_codec_sdk::sys::nvEncodeAPI as nv;
|
||||||
|
use nvidia_video_codec_sdk::ENCODE_API as API;
|
||||||
|
|
||||||
|
const POOL: usize = 4;
|
||||||
|
|
||||||
|
fn codec_guid(codec: Codec) -> nv::GUID {
|
||||||
|
match codec {
|
||||||
|
Codec::H264 => nv::NV_ENC_CODEC_H264_GUID,
|
||||||
|
Codec::H265 => nv::NV_ENC_CODEC_HEVC_GUID,
|
||||||
|
Codec::Av1 => nv::NV_ENC_CODEC_AV1_GUID,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct PooledTex {
|
||||||
|
tex: ID3D11Texture2D,
|
||||||
|
reg: nv::NV_ENC_REGISTERED_PTR,
|
||||||
|
map: nv::NV_ENC_INPUT_PTR,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct NvencD3d11Encoder {
|
||||||
|
ctx: Option<ID3D11DeviceContext>,
|
||||||
|
encoder: *mut c_void,
|
||||||
|
codec_guid: nv::GUID,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
fps: u32,
|
||||||
|
bitrate_bps: u64,
|
||||||
|
buffer_fmt: nv::NV_ENC_BUFFER_FORMAT,
|
||||||
|
pool: Vec<PooledTex>,
|
||||||
|
next: usize,
|
||||||
|
bitstreams: Vec<nv::NV_ENC_OUTPUT_PTR>,
|
||||||
|
pending: VecDeque<(nv::NV_ENC_OUTPUT_PTR, usize, u64)>,
|
||||||
|
frame_idx: i64,
|
||||||
|
force_kf: bool,
|
||||||
|
inited: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Raw NVENC handle + COM ptrs; confined to the single encode thread (like the Linux encoder).
|
||||||
|
unsafe impl Send for NvencD3d11Encoder {}
|
||||||
|
|
||||||
|
impl NvencD3d11Encoder {
|
||||||
|
pub fn open(
|
||||||
|
codec: Codec,
|
||||||
|
_format: PixelFormat,
|
||||||
|
width: u32,
|
||||||
|
height: u32,
|
||||||
|
fps: u32,
|
||||||
|
bitrate_bps: u64,
|
||||||
|
) -> Result<Self> {
|
||||||
|
Ok(Self {
|
||||||
|
ctx: None,
|
||||||
|
encoder: ptr::null_mut(),
|
||||||
|
codec_guid: codec_guid(codec),
|
||||||
|
width,
|
||||||
|
height,
|
||||||
|
fps,
|
||||||
|
bitrate_bps,
|
||||||
|
buffer_fmt: nv::NV_ENC_BUFFER_FORMAT::NV_ENC_BUFFER_FORMAT_ARGB,
|
||||||
|
pool: Vec::new(),
|
||||||
|
next: 0,
|
||||||
|
bitstreams: Vec::new(),
|
||||||
|
pending: VecDeque::new(),
|
||||||
|
frame_idx: 0,
|
||||||
|
force_kf: false,
|
||||||
|
inited: false,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Lazily create the session on the first frame's D3D11 device (so capture + encode share it).
|
||||||
|
fn init_session(&mut self, device: &ID3D11Device) -> Result<()> {
|
||||||
|
unsafe {
|
||||||
|
self.ctx = Some(device.GetImmediateContext().context("D3D11 immediate context")?);
|
||||||
|
|
||||||
|
// 1. open the session bound to the D3D11 device.
|
||||||
|
let mut params = nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS {
|
||||||
|
version: nv::NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER,
|
||||||
|
deviceType: nv::NV_ENC_DEVICE_TYPE::NV_ENC_DEVICE_TYPE_DIRECTX,
|
||||||
|
device: device.as_raw(),
|
||||||
|
apiVersion: nv::NVENCAPI_VERSION,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
let mut enc: *mut c_void = ptr::null_mut();
|
||||||
|
(API.open_encode_session_ex)(&mut params, &mut enc)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("NVENC open_encode_session_ex: {e:?} (no NVIDIA GPU?)"))?;
|
||||||
|
self.encoder = enc;
|
||||||
|
|
||||||
|
// 2. seed the P1 + ultra-low-latency preset config.
|
||||||
|
let mut preset = nv::NV_ENC_PRESET_CONFIG {
|
||||||
|
version: nv::NV_ENC_PRESET_CONFIG_VER,
|
||||||
|
presetCfg: nv::NV_ENC_CONFIG {
|
||||||
|
version: nv::NV_ENC_CONFIG_VER,
|
||||||
|
..Default::default()
|
||||||
|
},
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.get_encode_preset_config_ex)(
|
||||||
|
enc,
|
||||||
|
self.codec_guid,
|
||||||
|
nv::NV_ENC_PRESET_P1_GUID,
|
||||||
|
nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY,
|
||||||
|
&mut preset,
|
||||||
|
)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("get_encode_preset_config_ex: {e:?}"))?;
|
||||||
|
let mut cfg = preset.presetCfg;
|
||||||
|
|
||||||
|
// 3. mirror the Linux RC config: CBR, infinite GOP, P-only, ~1-frame VBV.
|
||||||
|
cfg.gopLength = nv::NVENC_INFINITE_GOPLENGTH;
|
||||||
|
cfg.frameIntervalP = 1;
|
||||||
|
cfg.rcParams.rateControlMode = nv::NV_ENC_PARAMS_RC_MODE::NV_ENC_PARAMS_RC_CBR;
|
||||||
|
let bps = self.bitrate_bps.min(u32::MAX as u64) as u32;
|
||||||
|
cfg.rcParams.averageBitRate = bps;
|
||||||
|
cfg.rcParams.maxBitRate = bps;
|
||||||
|
let vbv = (self.bitrate_bps as f64 / self.fps.max(1) as f64) as u32;
|
||||||
|
cfg.rcParams.vbvBufferSize = vbv;
|
||||||
|
cfg.rcParams.vbvInitialDelay = vbv;
|
||||||
|
|
||||||
|
// 4. initialize the encoder.
|
||||||
|
let mut init = nv::NV_ENC_INITIALIZE_PARAMS {
|
||||||
|
version: nv::NV_ENC_INITIALIZE_PARAMS_VER,
|
||||||
|
encodeGUID: self.codec_guid,
|
||||||
|
presetGUID: nv::NV_ENC_PRESET_P1_GUID,
|
||||||
|
tuningInfo: nv::NV_ENC_TUNING_INFO::NV_ENC_TUNING_INFO_ULTRA_LOW_LATENCY,
|
||||||
|
encodeWidth: self.width,
|
||||||
|
encodeHeight: self.height,
|
||||||
|
darWidth: self.width,
|
||||||
|
darHeight: self.height,
|
||||||
|
frameRateNum: self.fps,
|
||||||
|
frameRateDen: 1,
|
||||||
|
enablePTD: 1,
|
||||||
|
encodeConfig: &mut cfg,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.initialize_encoder)(enc, &mut init)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("initialize_encoder: {e:?}"))?;
|
||||||
|
|
||||||
|
// 5. encoder-owned BGRA texture pool, registered once, + one bitstream per slot.
|
||||||
|
let desc = D3D11_TEXTURE2D_DESC {
|
||||||
|
Width: self.width,
|
||||||
|
Height: self.height,
|
||||||
|
MipLevels: 1,
|
||||||
|
ArraySize: 1,
|
||||||
|
Format: DXGI_FORMAT_B8G8R8A8_UNORM,
|
||||||
|
SampleDesc: DXGI_SAMPLE_DESC {
|
||||||
|
Count: 1,
|
||||||
|
Quality: 0,
|
||||||
|
},
|
||||||
|
Usage: D3D11_USAGE_DEFAULT,
|
||||||
|
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
|
||||||
|
CPUAccessFlags: 0,
|
||||||
|
MiscFlags: 0,
|
||||||
|
};
|
||||||
|
for _ in 0..POOL {
|
||||||
|
let mut tex: Option<ID3D11Texture2D> = None;
|
||||||
|
device
|
||||||
|
.CreateTexture2D(&desc, None, Some(&mut tex))
|
||||||
|
.context("CreateTexture2D(nvenc pool)")?;
|
||||||
|
let tex = tex.context("null pool texture")?;
|
||||||
|
let mut rr = nv::NV_ENC_REGISTER_RESOURCE {
|
||||||
|
version: nv::NV_ENC_REGISTER_RESOURCE_VER,
|
||||||
|
resourceType:
|
||||||
|
nv::NV_ENC_INPUT_RESOURCE_TYPE::NV_ENC_INPUT_RESOURCE_TYPE_DIRECTX,
|
||||||
|
width: self.width,
|
||||||
|
height: self.height,
|
||||||
|
pitch: 0,
|
||||||
|
resourceToRegister: tex.as_raw(),
|
||||||
|
bufferFormat: self.buffer_fmt,
|
||||||
|
bufferUsage: nv::NV_ENC_BUFFER_USAGE::NV_ENC_INPUT_IMAGE,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.register_resource)(enc, &mut rr)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("register_resource: {e:?}"))?;
|
||||||
|
self.pool.push(PooledTex {
|
||||||
|
tex,
|
||||||
|
reg: rr.registeredResource,
|
||||||
|
map: ptr::null_mut(),
|
||||||
|
});
|
||||||
|
let mut cb = nv::NV_ENC_CREATE_BITSTREAM_BUFFER {
|
||||||
|
version: nv::NV_ENC_CREATE_BITSTREAM_BUFFER_VER,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.create_bitstream_buffer)(enc, &mut cb)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("create_bitstream_buffer: {e:?}"))?;
|
||||||
|
self.bitstreams.push(cb.bitstreamBuffer);
|
||||||
|
}
|
||||||
|
self.inited = true;
|
||||||
|
tracing::info!(
|
||||||
|
"NVENC D3D11 session: {}x{}@{} {} Mbps {:?}",
|
||||||
|
self.width,
|
||||||
|
self.height,
|
||||||
|
self.fps,
|
||||||
|
bps / 1_000_000,
|
||||||
|
self.codec_guid
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Encoder for NvencD3d11Encoder {
|
||||||
|
fn submit(&mut self, captured: &CapturedFrame) -> Result<()> {
|
||||||
|
let frame = match &captured.payload {
|
||||||
|
FramePayload::D3d11(f) => f,
|
||||||
|
FramePayload::Cpu(_) => {
|
||||||
|
bail!("NVENC D3D11 encoder needs a GPU texture frame (use the software encoder for CPU frames)")
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if !self.inited {
|
||||||
|
let device = frame.device.clone();
|
||||||
|
self.init_session(&device)?;
|
||||||
|
}
|
||||||
|
let slot = self.next % POOL;
|
||||||
|
self.next += 1;
|
||||||
|
unsafe {
|
||||||
|
let ctx = self.ctx.as_ref().context("no D3D11 context")?;
|
||||||
|
ctx.CopyResource(&self.pool[slot].tex, &frame.texture);
|
||||||
|
|
||||||
|
let mut mp = nv::NV_ENC_MAP_INPUT_RESOURCE {
|
||||||
|
version: nv::NV_ENC_MAP_INPUT_RESOURCE_VER,
|
||||||
|
registeredResource: self.pool[slot].reg,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.map_input_resource)(self.encoder, &mut mp)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("map_input_resource: {e:?}"))?;
|
||||||
|
self.pool[slot].map = mp.mappedResource;
|
||||||
|
|
||||||
|
let pts = self.frame_idx as u64;
|
||||||
|
self.frame_idx += 1;
|
||||||
|
let flags = if std::mem::take(&mut self.force_kf) {
|
||||||
|
nv::NV_ENC_PIC_FLAGS::NV_ENC_PIC_FLAG_FORCEIDR as u32
|
||||||
|
| nv::NV_ENC_PIC_FLAGS::NV_ENC_PIC_FLAG_OUTPUT_SPSPPS as u32
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
};
|
||||||
|
let mut pic = nv::NV_ENC_PIC_PARAMS {
|
||||||
|
version: nv::NV_ENC_PIC_PARAMS_VER,
|
||||||
|
inputWidth: self.width,
|
||||||
|
inputHeight: self.height,
|
||||||
|
inputPitch: 0,
|
||||||
|
inputBuffer: mp.mappedResource,
|
||||||
|
bufferFmt: mp.mappedBufferFmt,
|
||||||
|
outputBitstream: self.bitstreams[slot],
|
||||||
|
pictureStruct: nv::NV_ENC_PIC_STRUCT::NV_ENC_PIC_STRUCT_FRAME,
|
||||||
|
inputTimeStamp: pts,
|
||||||
|
encodePicFlags: flags as u32,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.encode_picture)(self.encoder, &mut pic)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("encode_picture: {e:?}"))?;
|
||||||
|
self.pending
|
||||||
|
.push_back((self.bitstreams[slot], slot, captured.pts_ns));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn request_keyframe(&mut self) {
|
||||||
|
self.force_kf = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn poll(&mut self) -> Result<Option<EncodedFrame>> {
|
||||||
|
let Some((bs, slot, pts_ns)) = self.pending.pop_front() else {
|
||||||
|
return Ok(None);
|
||||||
|
};
|
||||||
|
unsafe {
|
||||||
|
let mut lock = nv::NV_ENC_LOCK_BITSTREAM {
|
||||||
|
version: nv::NV_ENC_LOCK_BITSTREAM_VER,
|
||||||
|
outputBitstream: bs,
|
||||||
|
..Default::default()
|
||||||
|
};
|
||||||
|
(API.lock_bitstream)(self.encoder, &mut lock)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("lock_bitstream: {e:?}"))?;
|
||||||
|
let data = std::slice::from_raw_parts(
|
||||||
|
lock.bitstreamBufferPtr as *const u8,
|
||||||
|
lock.bitstreamSizeInBytes as usize,
|
||||||
|
)
|
||||||
|
.to_vec();
|
||||||
|
let keyframe = matches!(
|
||||||
|
lock.pictureType,
|
||||||
|
nv::NV_ENC_PIC_TYPE::NV_ENC_PIC_TYPE_IDR | nv::NV_ENC_PIC_TYPE::NV_ENC_PIC_TYPE_I
|
||||||
|
);
|
||||||
|
(API.unlock_bitstream)(self.encoder, bs)
|
||||||
|
.result_without_string()
|
||||||
|
.map_err(|e| anyhow!("unlock_bitstream: {e:?}"))?;
|
||||||
|
if !self.pool[slot].map.is_null() {
|
||||||
|
let _ = (API.unmap_input_resource)(self.encoder, self.pool[slot].map);
|
||||||
|
self.pool[slot].map = ptr::null_mut();
|
||||||
|
}
|
||||||
|
Ok(Some(EncodedFrame {
|
||||||
|
data,
|
||||||
|
pts_ns,
|
||||||
|
keyframe,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flush(&mut self) -> Result<()> {
|
||||||
|
Ok(()) // P1/ULL + frameIntervalP=1: each submit yields its AU; no internal queue to drain.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for NvencD3d11Encoder {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if self.encoder.is_null() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
unsafe {
|
||||||
|
for p in &self.pool {
|
||||||
|
if !p.map.is_null() {
|
||||||
|
let _ = (API.unmap_input_resource)(self.encoder, p.map);
|
||||||
|
}
|
||||||
|
let _ = (API.unregister_resource)(self.encoder, p.reg);
|
||||||
|
}
|
||||||
|
for &bs in &self.bitstreams {
|
||||||
|
let _ = (API.destroy_bitstream_buffer)(self.encoder, bs);
|
||||||
|
}
|
||||||
|
let _ = (API.destroy_encoder)(self.encoder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user