feat(host/windows): NVENC D3D11 hardware encoder (--features nvenc)
android / android (push) Failing after 36s
ci / rust (push) Failing after 45s
apple / swift (push) Successful in 55s
ci / web (push) Successful in 27s
ci / docs-site (push) Successful in 29s
ci / bench (push) Successful in 1m35s
decky / build-publish (push) Successful in 12s
docker / build-push (--build-arg FEDORA_VERSION=44, ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora44-rpm) (push) Successful in 5s
docker / build-push (., web/Dockerfile, punktfunk-web) (push) Successful in 4s
docker / build-push (ci, ci/fedora-rpm.Dockerfile, punktfunk-fedora-rpm) (push) Successful in 4s
docker / build-push (ci, ci/rust-ci.Dockerfile, punktfunk-rust-ci) (push) Successful in 4s
docker / build-push (docs-site, docs-site/Dockerfile, punktfunk-docs) (push) Successful in 3s
flatpak / build-publish (push) Failing after 2s
deb / build-publish (push) Successful in 3m13s
rpm / build-publish (bazzite, punktfunk-fedora-rpm) (push) Failing after 1m17s
rpm / build-publish (fedora-44, punktfunk-fedora44-rpm) (push) Failing after 1m32s
docker / deploy-docs (push) Successful in 17s

Zero-copy capture->encode on the GPU via the raw NVENC API (nvidia_video_codec_sdk sys + ENCODE_API; the safe wrapper is CUDA-only). Opens an NV_ENC_DEVICE_TYPE_DIRECTX session on the SAME ID3D11Device as the DXGI capturer (carried on the new FramePayload::D3d11), registers a pool of BGRA textures once, CopyResources each captured texture in and encode_picture; CBR/ULL, infinite GOP, P-only, forced-IDR for RFI. The DXGI capturer gains a D3D11 zero-copy output (selected, like the encoder, by PUNKTFUNK_ENCODER=nvenc) so capture+encode share textures.

OFF by default (the nvenc feature pulls the NVENC SDK + cudarc): the default Windows host links without it (openh264 path). cudarc builds toolkit-less via the SDK ci-check feature (dynamic-loading). At link time --features nvenc needs nvencodeapi.lib (NVENC SDK, or an import lib generated from the driver's nvEncodeAPI64.dll) on PUNKTFUNK_NVENC_LIB_DIR. Both default and --features nvenc builds validated to compile+link GPU-less on the VM (import lib generated from the driver DLL). Runtime needs a real NVIDIA GPU.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-15 01:39:46 +00:00
parent 04b76ebfc7
commit 69ba6ec45d
6 changed files with 498 additions and 9 deletions
+80 -4
View File
@@ -16,8 +16,9 @@ use windows::Win32::Foundation::{HMODULE, LUID};
use windows::Win32::Graphics::Direct3D::{D3D_DRIVER_TYPE_UNKNOWN, D3D_FEATURE_LEVEL_11_0};
use windows::Win32::Graphics::Direct3D11::{
D3D11CreateDevice, ID3D11Device, ID3D11DeviceContext, ID3D11Texture2D, D3D11_BIND_FLAG,
D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT, D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ,
D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC, D3D11_USAGE_STAGING,
D3D11_BIND_RENDER_TARGET, D3D11_CPU_ACCESS_READ, D3D11_CREATE_DEVICE_BGRA_SUPPORT,
D3D11_MAPPED_SUBRESOURCE, D3D11_MAP_READ, D3D11_SDK_VERSION, D3D11_TEXTURE2D_DESC,
D3D11_USAGE_DEFAULT, D3D11_USAGE_STAGING,
};
use windows::Win32::Graphics::Dxgi::Common::{DXGI_FORMAT_B8G8R8A8_UNORM, DXGI_SAMPLE_DESC};
use windows::Win32::Graphics::Dxgi::{
@@ -78,6 +79,13 @@ pub struct DuplCapturer {
active: AtomicBool,
timeout_ms: u32,
last: Option<Vec<u8>>,
/// GPU-output mode (zero-copy → NVENC): produce `FramePayload::D3d11` instead of CPU BGRA.
/// Selected by `PUNKTFUNK_ENCODER=nvenc` so the capturer's output matches the encoder's input.
gpu_mode: bool,
/// Reused owned texture the duplication frame is copied into for the D3D11 path (the duplication
/// surface is transient and released each frame).
gpu_copy: Option<ID3D11Texture2D>,
have_gpu_frame: bool,
_keepalive: Box<dyn Send>,
}
// COM objects used only from the one thread that owns the capturer (the encode thread).
@@ -154,12 +162,16 @@ impl DuplCapturer {
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or((2000 / refresh_hz.max(1)).max(100));
let gpu_mode = std::env::var("PUNKTFUNK_ENCODER")
.map(|v| matches!(v.to_ascii_lowercase().as_str(), "nvenc" | "hw" | "nvidia"))
.unwrap_or(false);
tracing::info!(
"DXGI duplication: {}x{}@{} on {}",
"DXGI duplication: {}x{}@{} on {} ({})",
width,
height,
refresh_hz,
target.gdi_name
target.gdi_name,
if gpu_mode { "D3D11 zero-copy" } else { "CPU staging" }
);
Ok(Self {
device,
@@ -174,6 +186,9 @@ impl DuplCapturer {
active: AtomicBool::new(false),
timeout_ms,
last: None,
gpu_mode,
gpu_copy: None,
have_gpu_frame: false,
_keepalive: keepalive,
})
}
@@ -206,6 +221,33 @@ impl DuplCapturer {
Ok(())
}
unsafe fn ensure_gpu_copy(&mut self) -> Result<()> {
if self.gpu_copy.is_some() {
return Ok(());
}
let desc = D3D11_TEXTURE2D_DESC {
Width: self.width,
Height: self.height,
MipLevels: 1,
ArraySize: 1,
Format: DXGI_FORMAT_B8G8R8A8_UNORM,
SampleDesc: DXGI_SAMPLE_DESC {
Count: 1,
Quality: 0,
},
Usage: D3D11_USAGE_DEFAULT,
BindFlags: D3D11_BIND_RENDER_TARGET.0 as u32,
CPUAccessFlags: 0,
MiscFlags: 0,
};
let mut t: Option<ID3D11Texture2D> = None;
self.device
.CreateTexture2D(&desc, None, Some(&mut t))
.context("CreateTexture2D(gpu copy)")?;
self.gpu_copy = t;
Ok(())
}
unsafe fn recreate_dupl(&mut self) -> Result<()> {
if self.holding_frame {
let _ = self.dupl.ReleaseFrame();
@@ -238,6 +280,26 @@ impl DuplCapturer {
self.holding_frame = true;
let res = res.context("AcquireNextFrame: null resource")?;
let tex: ID3D11Texture2D = res.cast().context("resource -> Texture2D")?;
if self.gpu_mode {
// Zero-copy path: keep the frame on the GPU for NVENC. Copy the transient duplication
// surface into a reused owned texture, release the duplication frame, hand off the texture.
self.ensure_gpu_copy()?;
let gpu = self.gpu_copy.clone().context("gpu copy texture")?;
self.context.CopyResource(&gpu, &tex);
let _ = self.dupl.ReleaseFrame();
self.holding_frame = false;
self.have_gpu_frame = true;
return Ok(Some(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Bgra,
payload: FramePayload::D3d11(D3d11Frame {
texture: gpu,
device: self.device.clone(),
}),
}));
}
self.ensure_staging()?;
let staging = self.staging.clone().context("staging texture")?;
self.context.CopyResource(&staging, &tex);
@@ -277,6 +339,20 @@ impl Capturer for DuplCapturer {
if let Some(f) = unsafe { self.acquire() }? {
return Ok(f);
}
if self.gpu_mode && self.have_gpu_frame {
if let Some(gpu) = &self.gpu_copy {
return Ok(CapturedFrame {
width: self.width,
height: self.height,
pts_ns: now_ns(),
format: PixelFormat::Bgra,
payload: FramePayload::D3d11(D3d11Frame {
texture: gpu.clone(),
device: self.device.clone(),
}),
});
}
}
if let Some(b) = &self.last {
return Ok(CapturedFrame {
width: self.width,