//! NVENC encoder via `ffmpeg-next` (binds the system FFmpeg — `ffmpeg-sys-next` auto-detects the //! installed version, so this builds against FFmpeg 7.x/libavcodec 61 *or* 8.x/libavcodec 62; //! validated live on Ubuntu 26.04 (FFmpeg 8) and Bazzite F43 (FFmpeg 7.1)). //! //! Input is a packed RGB/BGR CPU frame; `*_nvenc` accepts `rgb0`/`bgr0`/`rgba`/`bgra` //! directly and does the RGB→YUV conversion on the GPU, so the host stays off the //! colour-conversion path. The portal commonly negotiates packed 24-bit `RGB`, which NVENC //! does *not* accept — we expand it to `rgb0` (one padding byte/pixel, no colour math). //! The encoder is opened *without* a global header so VPS/SPS/PPS are emitted in-band on //! every IDR — the output is both a playable raw Annex-B stream and self-contained AUs. // Every `unsafe` block in this file carries a `// SAFETY:` proof; enforce it (unsafe-proof program). #![deny(clippy::undocumented_unsafe_blocks)] use super::{ChromaFormat, Codec, EncodedFrame, Encoder}; use crate::capture::{CapturedFrame, FramePayload, PixelFormat}; use anyhow::{anyhow, bail, Context, Result}; use ffmpeg::format::Pixel; use ffmpeg::util::frame::Video as VideoFrame; use ffmpeg::{codec, encoder, Dictionary, Packet, Rational}; use ffmpeg_next as ffmpeg; use std::os::raw::c_int; use std::ptr; use ffmpeg::ffi; // = ffmpeg_sys_next /// swscale: nearest-neighbour scaler flag (`SWS_POINT`). We never rescale (src dims == dst dims), so /// the resampler choice only governs the colour-conversion path; POINT is the cheapest. const SWS_POINT: c_int = 0x10; /// swscale colorspace id for ITU-R BT.709 (`SWS_CS_ITU709`) — the CSC coefficients for our RGB→YUV. const SWS_CS_ITU709: c_int = 1; /// The swscale *source* pixel format for a captured packed RGB/BGR layout (the real byte order, not /// the NVENC-padded `*0` form). Used by the 4:4:4 RGB→YUV444P conversion path. Mirrors the VAAPI /// CPU-input mapping; YUV/10-bit inputs can't feed this path (the 4:4:4 session forces packed RGB). fn sws_src_pixel(format: PixelFormat) -> Result { Ok(match format { PixelFormat::Bgrx => Pixel::BGRZ, // bgr0 PixelFormat::Rgbx => Pixel::RGBZ, // rgb0 PixelFormat::Bgra => Pixel::BGRA, PixelFormat::Rgba => Pixel::RGBA, PixelFormat::Rgb => Pixel::RGB24, PixelFormat::Bgr => Pixel::BGR24, PixelFormat::Nv12 | PixelFormat::P010 | PixelFormat::Rgb10a2 => { bail!("NVENC 4:4:4 CPU-input path supports packed RGB/BGR only; got {format:?}") } }) } /// `AVCUDADeviceContext` (libavutil/hwcontext_cuda.h) — not in the ffmpeg-sys bindings (the /// crate doesn't allowlist that header), so mirror its stable 3-pointer layout. We set the /// first field to *our* `CUcontext` so NVENC shares the context the EGL importer maps into. #[repr(C)] struct AVCUDADeviceContext { cuda_ctx: *mut std::ffi::c_void, // CUcontext stream: *mut std::ffi::c_void, // CUstream (null = default) internal: *mut std::ffi::c_void, // filled by ctx_init } /// CUDA hardware-frame contexts that wrap our shared `CUcontext`, so `hevc_nvenc` reads the /// imported device buffer directly. Owns two `AVBufferRef`s, unref'd on drop. struct CudaHw { device_ref: *mut ffi::AVBufferRef, frames_ref: *mut ffi::AVBufferRef, } impl CudaHw { /// Build a CUDA hwdevice wrapping `cu_ctx` and a frames pool (`sw_format` = `pixel`). unsafe fn new(cu_ctx: *mut std::ffi::c_void, sw_format: Pixel, w: u32, h: u32) -> Result { let mut device_ref = ffi::av_hwdevice_ctx_alloc(ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA); if device_ref.is_null() { bail!("av_hwdevice_ctx_alloc(CUDA) failed"); } let dev_ctx = (*device_ref).data as *mut ffi::AVHWDeviceContext; let cu = (*dev_ctx).hwctx as *mut AVCUDADeviceContext; (*cu).cuda_ctx = cu_ctx; // share the importer's context let r = ffi::av_hwdevice_ctx_init(device_ref); if r < 0 { ffi::av_buffer_unref(&mut device_ref); bail!("av_hwdevice_ctx_init failed ({r})"); } let mut frames_ref = ffi::av_hwframe_ctx_alloc(device_ref); if frames_ref.is_null() { ffi::av_buffer_unref(&mut device_ref); bail!("av_hwframe_ctx_alloc failed"); } let fc = (*frames_ref).data as *mut ffi::AVHWFramesContext; (*fc).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA; (*fc).sw_format = pixel_to_av(sw_format); (*fc).width = w as c_int; (*fc).height = h as c_int; (*fc).initial_pool_size = 0; // we supply the device pointers let r = ffi::av_hwframe_ctx_init(frames_ref); if r < 0 { ffi::av_buffer_unref(&mut frames_ref); ffi::av_buffer_unref(&mut device_ref); bail!("av_hwframe_ctx_init failed ({r})"); } Ok(CudaHw { device_ref, frames_ref, }) } } impl Drop for CudaHw { fn drop(&mut self) { // SAFETY: `frames_ref`/`device_ref` are the two non-null `AVBufferRef`s `CudaHw::new` created // (it bails before returning `Self` if either alloc fails, so a live `CudaHw` always holds // both). `av_buffer_unref` drops one reference and nulls the pointer through the `&mut`. This // `Drop` runs exactly once and `CudaHw` owns these refs exclusively → no double-free / // use-after-free. Frames are unref'd before the device (the frames ctx internally refs the // device; refcounted, so the order is sound regardless). unsafe { ffi::av_buffer_unref(&mut self.frames_ref); ffi::av_buffer_unref(&mut self.device_ref); } } } /// `ffmpeg::format::Pixel` → raw `AVPixelFormat`. fn pixel_to_av(p: Pixel) -> ffi::AVPixelFormat { // `Pixel` is `#[repr(i32)]`-compatible with `AVPixelFormat` (the bindgen enum) via this // documented conversion in ffmpeg-next. ffi::AVPixelFormat::from(p) } /// Map a captured layout to the NVENC input pixel format, and whether a 3→4 byte expand is /// needed (packed RGB/BGR have no padding byte; the NVENC `*0` formats do). fn nvenc_input(format: PixelFormat) -> (Pixel, bool) { match format { PixelFormat::Bgrx => (Pixel::BGRZ, false), // bgr0 PixelFormat::Rgbx => (Pixel::RGBZ, false), // rgb0 PixelFormat::Bgra => (Pixel::BGRA, false), PixelFormat::Rgba => (Pixel::RGBA, false), PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0 PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0 // NV12 is native YUV: NVENC encodes it with NO internal RGB→YUV CSC (the Tier 2A win). On // Linux it's produced by the GPU convert on the zero-copy tiled path (`PUNKTFUNK_NV12`); on // Windows by the D3D11 video processor. PixelFormat::Nv12 => (Pixel::NV12, false), // Rgb10a2 (HDR) and P010 (the Windows 10-bit video-processor output) are produced only by // the Windows paths; the Linux capturer never emits them. Map to BGRA so the match is // exhaustive — unreachable here. PixelFormat::Rgb10a2 | PixelFormat::P010 => (Pixel::BGRA, false), } } pub struct NvencEncoder { enc: encoder::video::Encoder, /// Reusable 4-bpp CPU input frame (CPU path only; `None` for the zero-copy/CUDA path). /// Mutating it in place across frames is sound only because the encoder is opened with /// `delay=0`/`bf=0`/`max_b_frames=0` and the caller drains `poll()` after each `submit`, /// so libavcodec holds no reference to the previous frame's buffer when we overwrite it. frame: Option, /// Zero-copy path: CUDA hwdevice/hwframes contexts (the encoder takes `AV_PIX_FMT_CUDA`). cuda: Option, /// 4:4:4 path only: swscale context converting the captured packed RGB/BGR → planar YUV444P /// (BT.709 limited) into [`Self::frame`], because `hevc_nvenc` only emits 4:4:4 from a YUV444 /// *input* (RGB-in is always 4:2:0). `None` on the ordinary 4:2:0 RGB path. Freed in `Drop`. sws_444: Option<*mut ffi::SwsContext>, src_format: PixelFormat, expand: bool, width: u32, height: u32, fps: u32, /// Monotonic presentation index, in `1/fps` time-base units. frame_idx: i64, /// Force the next submitted frame to be an IDR (set by [`request_keyframe`]). force_kf: bool, } // `CudaHw` holds raw `AVBufferRef`s and `sws_444` a raw `SwsContext`; the encoder lives on a single // thread. The CPU encoder is already `Send` via ffmpeg-next; assert it for the raw fields too. // SAFETY: `NvencEncoder` owns an ffmpeg-next `Encoder`/`VideoFrame` (already `Send`) plus a `CudaHw` // holding raw `AVBufferRef`s and an optional raw `SwsContext`, none of which are `Send` by default. // The `SwsContext` is a self-contained swscale state object with no thread affinity, touched only // through `&mut self` on the one encode thread. The encoder is owned and driven by // exactly ONE thread — the per-session encode thread it is moved to — and is only touched through // `&mut self` methods, so it is never aliased or accessed concurrently. The wrapped libav contexts // (and the shared `CUcontext` the `CudaHw` references) have no thread affinity, so transferring // ownership across threads is sound. This asserts `Send` (transfer) only, extending ffmpeg-next's // existing `Send` to the raw CUDA fields; `Sync` (shared `&`) is deliberately NOT implemented. unsafe impl Send for NvencEncoder {} impl NvencEncoder { #[allow(clippy::too_many_arguments)] pub fn open( codec: Codec, format: PixelFormat, width: u32, height: u32, fps: u32, bitrate_bps: u64, cuda: bool, bit_depth: u8, chroma: ChromaFormat, ) -> Result { // TODO(hdr): Linux 10-bit parity. Unlike the Windows raw-SDK path (which upconverts 8-bit // ARGB → Main10 via pixelBitDepthMinus8), libavcodec hevc_nvenc needs a 10-bit input pixel // format (p010) for Main10, so it's a bigger change; deferred until a Linux GPU box is // available to validate. The Linux host stays 8-bit for now. if bit_depth != 8 { tracing::warn!( bit_depth, "Linux NVENC 10-bit not yet wired — encoding 8-bit" ); } // Full-chroma 4:4:4 (HEVC Range Extensions). `hevc_nvenc` only emits 4:4:4 from a YUV444 // *input* frame — feeding RGB always subsamples to 4:2:0 regardless of profile (verified on // the RTX 5070 Ti). So a 4:4:4 session swscales the captured RGB → YUV444P (BT.709 limited) // and feeds that with `profile=rext`. The negotiator gates this to HEVC + the single-process // CPU-capture topology, so `cuda` must be false here; defend the contract. let want_444 = chroma.is_444() && codec == Codec::H265; if want_444 && cuda { bail!( "NVENC 4:4:4 needs CPU RGB frames (the session forces non-zero-copy capture for \ 4:4:4); got a CUDA frame — capture/encoder negotiation mismatch" ); } ffmpeg::init().context("ffmpeg init")?; if std::env::var_os("PUNKTFUNK_FFMPEG_DEBUG").is_some() { // SAFETY: `av_log_set_level` sets libav's global integer log level; `48` (= AV_LOG_DEBUG) // is a valid level with no pointer args, and libav was just initialized by `ffmpeg::init()` // above — always sound. unsafe { ffi::av_log_set_level(48) }; // AV_LOG_DEBUG — surface NVENC hw-frame rejects } let name = codec.nvenc_name(); let av_codec = encoder::find_by_name(name) .ok_or_else(|| anyhow!("{name} not built into libavcodec"))?; let (rgb_pixel, rgb_expand) = nvenc_input(format); // 4:4:4 feeds NVENC a planar YUV444P frame we produce by swscale; the ordinary path feeds the // captured RGB straight in and lets NVENC's internal CSC subsample to 4:2:0. let (nvenc_pixel, expand) = if want_444 { (Pixel::YUV444P, false) } else { (rgb_pixel, rgb_expand) }; let mut video = codec::context::Context::new_with_codec(av_codec) .encoder() .video() .context("alloc video encoder")?; video.set_width(width); video.set_height(height); video.set_format(nvenc_pixel); // NVENC converts RGB→YUV internally video.set_time_base(Rational(1, fps as i32)); video.set_frame_rate(Some(Rational(fps as i32, 1))); video.set_bit_rate(bitrate_bps as usize); video.set_max_bit_rate(bitrate_bps as usize); // VBV/HRD buffer — bound the SIZE of any single frame. Under CBR with no buffer set, NVENC // uses a loose default VBV, so a high-motion P-frame is allowed to balloon to many times the // average; those extra packets overflow the bounded send queue + kernel socket buffer and // get dropped, which the client sees as framedrops/jitter (and, on the infinite-GOP path, as // old/stale frames flashing until the next RFI). A tight ~1-frame buffer makes the encoder // hold frame size roughly constant and absorb motion as a momentary QP (quality) dip instead // — the trade we want. Default = 1 frame of bits (bitrate/fps); PUNKTFUNK_VBV_FRAMES tunes it // (larger = better motion quality but bigger per-frame bursts). let vbv_frames = std::env::var("PUNKTFUNK_VBV_FRAMES") .ok() .and_then(|s| s.parse::().ok()) .filter(|v| v.is_finite() && *v > 0.0) .unwrap_or(1.0); let vbv_bits = ((bitrate_bps as f64 / fps.max(1) as f64) * vbv_frames as f64) .clamp(1.0, i32::MAX as f64); // SAFETY: `video` is the ffmpeg-next encoder builder wrapping a freshly-allocated // `AVCodecContext` that we hold by value and have not opened yet; `video.as_mut_ptr()` returns // that non-null, properly-aligned, exclusively-owned context. Writing the plain `rc_buffer_size` // int field before `open_with` is the supported way to set a field ffmpeg-next exposes no // setter for. Sole owner → no aliasing; synchronous in-bounds scalar write. unsafe { (*video.as_mut_ptr()).rc_buffer_size = vbv_bits as i32; } video.set_max_b_frames(0); // Infinite GOP — NO periodic IDR. A keyframe at 5120x1440 is ~20-40x a P-frame, so a // periodic IDR is a recurring multi-millisecond encode+packetize+send spike — the ~2s // "freeze". NVENC emits one IDR at stream start, then P-frames only; `forced-idr` (below) // turns a client recovery request (RFI, via `request_keyframe`) into an IDR on demand. // This is the Moonlight/Sunshine low-latency model. // SAFETY: same `video` builder as above — a non-null, properly-aligned, sole-owned, not-yet- // opened `AVCodecContext`. We write the plain `gop_size` int field (= -1, infinite GOP) before // `open_with`, which ffmpeg-next has no setter for. No aliasing; synchronous scalar write. unsafe { (*video.as_mut_ptr()).gop_size = -1; } // NV12 / 4:4:4 paths: we do the RGB→YUV conversion ourselves as BT.709 *limited* range // (swscale), so signal that in the bitstream VUI (colorspace/range/primaries/transfer) — // otherwise the client decoder assumes a default and the picture comes out washed-out / // wrong-contrast. The RGB-input 4:2:0 path leaves these unset (NVENC's internal CSC writes // its own VUI). Matches the Windows NV12 path's BT.709 limited-range signalling. if matches!(format, PixelFormat::Nv12) || want_444 { // SAFETY: same `video` builder — `raw = video.as_mut_ptr()` is the non-null, properly- // aligned, sole-owned, not-yet-opened `AVCodecContext`. We set its four VUI colour enum // fields to valid `AVColorSpace`/`AVColorRange`/`AVColorPrimaries`/`AVColorTransfer- // Characteristic` variants before `open_with`. Sole owner → no aliasing; synchronous writes. unsafe { let raw = video.as_mut_ptr(); (*raw).colorspace = ffi::AVColorSpace::AVCOL_SPC_BT709; (*raw).color_range = ffi::AVColorRange::AVCOL_RANGE_MPEG; // limited/studio (*raw).color_primaries = ffi::AVColorPrimaries::AVCOL_PRI_BT709; (*raw).color_trc = ffi::AVColorTransferCharacteristic::AVCOL_TRC_BT709; } } // For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA // hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context // *before* open (NVENC derives the device from `hw_frames_ctx`). let cuda_hw = if cuda { let cu_ctx = crate::zerocopy::cuda::context().context("shared CUDA context")?; // SAFETY: `CudaHw::new` (an `unsafe fn`) requires libav initialized (the `ffmpeg::init()` // above ran) and a valid `CUcontext`; `cu_ctx` is the shared importer context from // `zerocopy::cuda::context()?`, non-null on the `Ok` path. `nvenc_pixel` is a valid `Pixel` // and `width`/`height` are the validated positive dims. It returns a RAII `CudaHw` wrapping // (not owning) `cu_ctx` and owning two `AVBufferRef`s freed on drop. let hw = unsafe { CudaHw::new(cu_ctx, nvenc_pixel, width, height)? }; // SAFETY: `raw = video.as_mut_ptr()` is the non-null, sole-owned, not-yet-opened // `AVCodecContext`. We set `pix_fmt = CUDA` and attach NEW refs (`av_buffer_ref`) of // `hw.device_ref`/`hw.frames_ref` — both non-null (`CudaHw::new` guarantees) and from the // live `hw`, which is moved into `NvencEncoder.cuda` next to `enc` and so outlives the // encoder. The context owns its own refs (freed when the context closes). No aliasing. unsafe { let raw = video.as_mut_ptr(); (*raw).pix_fmt = ffi::AVPixelFormat::AV_PIX_FMT_CUDA; (*raw).hw_device_ctx = ffi::av_buffer_ref(hw.device_ref); (*raw).hw_frames_ctx = ffi::av_buffer_ref(hw.frames_ref); } Some(hw) } else { None }; // 4:4:4: build the RGB→YUV444P swscale (BT.709 limited, no rescale). Mirrors the VAAPI CPU // path's RGB→NV12 scaler, but the dst is full-chroma planar 4:4:4. let sws_444 = if want_444 { let src_av = pixel_to_av(sws_src_pixel(format)?); // SAFETY: `sws_getContext` allocates a swscale context for the given src/dst dims + pixel // formats. Both dims are the encoder's positive `width`/`height` as `c_int`; `src_av` is a // valid `AVPixelFormat` (from the `sws_src_pixel`-validated, packed-RGB-only source), the // dst is YUV444P. The trailing filter/param pointers are null = "use defaults" (documented // as accepted). No Rust memory is borrowed; the returned pointer is null-checked below. let sws = unsafe { ffi::sws_getContext( width as c_int, height as c_int, src_av, width as c_int, height as c_int, ffi::AVPixelFormat::AV_PIX_FMT_YUV444P, SWS_POINT, ptr::null_mut(), ptr::null_mut(), ptr::null(), ) }; if sws.is_null() { bail!("sws_getContext(RGB→YUV444P) failed"); } // SAFETY: `sws` is the non-null context from the call above (null-checked). The ITU-709 // coefficient table from `sws_getCoefficients` is a process-lifetime libswscale static, // reused for src+dst matrices; `sws_setColorspaceDetails` only reads it and writes scalar // CSC settings into `sws` (limited-range dst: dstRange = 0). No Rust memory is passed. unsafe { let cs709 = ffi::sws_getCoefficients(SWS_CS_ITU709); ffi::sws_setColorspaceDetails(sws, cs709, 1, cs709, 0, 0, 1 << 16, 1 << 16); } Some(sws) } else { None }; // Low-latency NVENC tuning (plan §7 / linux-setup doc). let mut opts = Dictionary::new(); opts.set("preset", "p1"); // fastest opts.set("tune", "ull"); // ultra-low-latency opts.set("rc", "cbr"); opts.set("bf", "0"); opts.set("delay", "0"); opts.set("forced-idr", "1"); // RFI/request_keyframe → real IDR under the infinite GOP if want_444 { // HEVC Range Extensions — the profile that carries chroma_format_idc=3. With a YUV444P // input `hevc_nvenc` auto-selects it, but pin it explicitly so the chroma is never silently // dropped on a future libavcodec. opts.set("profile", "rext"); } // Split-frame encode across both NVENC engines (GB203 has 2) when the pixel rate exceeds // a single engine's HEVC capacity (~1 Gpix/s); e.g. 5120x1440@240 = 1.77 Gpix/s needs it, // @120 = 0.88 Gpix/s does not. HEVC/AV1 only (not H.264). AUTO won't engage below ~2112px // height, so we force `2`; below the threshold we leave it AUTO (split costs ~2% BD-rate). // Output is standard HEVC — transparent to the client. Override with PUNKTFUNK_SPLIT_ENCODE. let pix_rate = width as u64 * height as u64 * fps as u64; let split = std::env::var("PUNKTFUNK_SPLIT_ENCODE").ok(); match split.as_deref() { Some(mode) => opts.set("split_encode_mode", mode), None if matches!(codec, Codec::H265 | Codec::Av1) && pix_rate > 1_000_000_000 => { opts.set("split_encode_mode", "2"); tracing::info!( pix_rate, "NVENC: forcing 2-way split encode (high pixel rate)" ); } None => {} } let enc = video .open_with(opts) .with_context(|| format!("open {name} ({width}x{height}@{fps}, {bitrate_bps} bps)"))?; let frame = if cuda { None } else { Some(VideoFrame::new(nvenc_pixel, width, height)) }; Ok(NvencEncoder { enc, frame, cuda: cuda_hw, sws_444, src_format: format, expand, width, height, fps, frame_idx: 0, force_kf: false, }) } } impl Encoder for NvencEncoder { fn caps(&self) -> super::EncoderCaps { super::EncoderCaps { // 4:4:4 iff this session opened the RGB→YUV444P swscale path (FREXT). RFI/HDR-SEI stay // unsupported on libavcodec NVENC (the trait defaults). chroma_444: self.sws_444.is_some(), ..super::EncoderCaps::default() } } fn submit(&mut self, captured: &CapturedFrame) -> Result<()> { anyhow::ensure!( captured.width == self.width && captured.height == self.height, "captured frame {}x{} != encoder {}x{}", captured.width, captured.height, self.width, self.height ); let pts = self.frame_idx; self.frame_idx += 1; // Force an IDR when requested (client RFI); otherwise let NVENC pick (GOP/P-frame). let idr = self.force_kf; self.force_kf = false; match &captured.payload { FramePayload::Cuda(buf) => self.submit_cuda(buf, pts, idr), FramePayload::Cpu(bytes) => self.submit_cpu(bytes, captured.format, pts, idr), FramePayload::Dmabuf(_) => { bail!("NVENC got a VAAPI dmabuf frame — capture/encoder backend mismatch") } } } fn request_keyframe(&mut self) { self.force_kf = true; } fn poll(&mut self) -> Result> { let mut pkt = Packet::empty(); match self.enc.receive_packet(&mut pkt) { Ok(()) => { let data = pkt.data().map(|d| d.to_vec()).unwrap_or_default(); let pts = pkt.pts().unwrap_or(0).max(0) as u64; let pts_ns = pts * 1_000_000_000 / self.fps as u64; Ok(Some(EncodedFrame { data, pts_ns, keyframe: pkt.is_key(), })) } // No packet ready yet (need another input frame). Err(ffmpeg::Error::Other { errno }) if errno == ffmpeg::util::error::EAGAIN || errno == ffmpeg::util::error::EWOULDBLOCK => { Ok(None) } // Fully drained after flush(). Err(ffmpeg::Error::Eof) => Ok(None), Err(e) => Err(e).context("receive_packet"), } } fn flush(&mut self) -> Result<()> { self.enc.send_eof().context("send_eof")?; Ok(()) } } impl NvencEncoder { /// CPU path: expand/copy the packed RGB/BGR bytes into the reusable 4-bpp frame, then send. fn submit_cpu(&mut self, bytes: &[u8], format: PixelFormat, pts: i64, idr: bool) -> Result<()> { anyhow::ensure!( format == self.src_format, "captured format {:?} != encoder source {:?}", format, self.src_format ); let w = self.width as usize; let h = self.height as usize; let src_bpp = self.src_format.bytes_per_pixel(); let src_row = w * src_bpp; anyhow::ensure!( bytes.len() >= src_row * h, "captured buffer {} bytes < required {}", bytes.len(), src_row * h ); // 4:4:4: swscale the packed RGB straight into the planar YUV444P input frame (BT.709 limited), // then send it — no byte-expand. The 4:2:0 RGB path (below) feeds NVENC packed RGB directly. if let Some(sws) = self.sws_444 { let frame = self .frame .as_mut() .context("CPU frame missing (encoder opened in CUDA mode)")?; // SAFETY: `format == self.src_format` and `bytes.len() >= src_row * h` (the `ensure!`s // above), so `sws_scale` reads `h` rows of `src_row` bytes from `src_data[0] = bytes` // (packed RGB is single-plane; the other src planes are null/0) — all in bounds. `sws` is // the non-null context built in `open`. The dst is `frame`'s underlying `AVFrame`: its // `data`/`linesize` in-struct arrays were sized for YUV444P by `VideoFrame::new`, and the // 3 planes are each `width`×`height`. All pointers are live locals for this synchronous // call; the encoder runs only on this thread (`unsafe impl Send`), so no aliasing/race. unsafe { let dst_av = frame.as_mut_ptr(); let src_data: [*const u8; 4] = [bytes.as_ptr(), ptr::null(), ptr::null(), ptr::null()]; let src_stride: [c_int; 4] = [src_row as c_int, 0, 0, 0]; let r = ffi::sws_scale( sws, src_data.as_ptr(), src_stride.as_ptr(), 0, h as c_int, (*dst_av).data.as_ptr(), (*dst_av).linesize.as_ptr(), ); if r < 0 { bail!("sws_scale(RGB→YUV444P) failed ({r})"); } } frame.set_pts(Some(pts)); frame.set_kind(if idr { ffmpeg::picture::Type::I } else { ffmpeg::picture::Type::None }); self.enc.send_frame(frame).context("send_frame(444)")?; return Ok(()); } let frame = self .frame .as_mut() .context("CPU frame missing (encoder opened in CUDA mode)")?; let stride = frame.stride(0); // dst is 4-bpp, aligned let dst = frame.data_mut(0); if self.expand { // packed 3-bpp RGB/BGR → 4-bpp *0 (copy 3 bytes, zero the pad byte) for y in 0..h { let s = &bytes[y * src_row..y * src_row + src_row]; let drow = &mut dst[y * stride..y * stride + w * 4]; for x in 0..w { drow[x * 4..x * 4 + 3].copy_from_slice(&s[x * 3..x * 3 + 3]); drow[x * 4 + 3] = 0; } } } else { // 4-bpp → 4-bpp, honoring the (possibly larger) dst stride for y in 0..h { dst[y * stride..y * stride + src_row] .copy_from_slice(&bytes[y * src_row..y * src_row + src_row]); } } frame.set_pts(Some(pts)); frame.set_kind(if idr { ffmpeg::picture::Type::I } else { ffmpeg::picture::Type::None }); self.enc.send_frame(frame).context("send_frame")?; Ok(()) } /// Zero-copy path: hand the imported CUDA device buffer to NVENC with no CPU touch. /// /// We take a *pooled* surface from the CUDA hwframes context (`av_hwframe_get_buffer`) and /// device→device-copy our imported buffer into it, rather than wrapping our own pointer in a /// bare frame. Two reasons: (1) NVENC's `nvenc_send_frame` ignores frames whose `buf[0]` is /// null and the generic encode path's `av_frame_ref` needs a refcounted buffer — a bare /// frame is rejected with `EINVAL`; (2) NVENC caches CUDA-resource *registrations* keyed by /// device pointer with a bounded table, so a fresh pointer every frame would thrash/overflow /// it — the pool recycles a small set of pointers. The extra copy is device-local (~8 MB at /// 1080p, sub-millisecond on the GPU) and keeps the host fully off the pixel path. fn submit_cuda( &mut self, buf: &crate::zerocopy::DeviceBuffer, pts: i64, idr: bool, ) -> Result<()> { let frames_ref = self .cuda .as_ref() .context("CUDA hw context missing (encoder opened in CPU mode)")? .frames_ref; // The device→device copy below uses our shared context directly; make it current on the // encode thread (ffmpeg pushes its own around the pool alloc, so order is fine). crate::zerocopy::cuda::make_current().context("CUDA context current (encode thread)")?; // SAFETY: `frames_ref` is the non-null CUDA frames ctx from `self.cuda` (unwrapped via // `.context(..)?` above), and the shared CUDA context was just made current on THIS thread // (`make_current()?`), the precondition for the device-pointer copies below. // * `av_frame_alloc` → `f` (null-checked). `av_hwframe_get_buffer(frames_ref, f, 0)` fills `f` // with a pooled CUDA surface (sets `data[]`/`linesize[]`/`buf[0]`/`hw_frames_ctx`); on // failure we free `f` and bail. // * For NV12 we read `(*f).data[0..2]` / `linesize[0..2]` (Y + interleaved UV), else // `data[0]`/`linesize[0]` — in-struct fields of the non-null `f`, valid for the surface dims // ffmpeg allocated — and pass them to the cuda copy helpers, which device→device copy `buf` // (the imported `DeviceBuffer`, owned by the caller and live for this call) into the surface. // * On copy error we free `f` and return. Otherwise we write `pts`/`pict_type` through `f` and // `avcodec_send_frame` it into the live owned `self.enc` context (which takes its own ref of // the pooled surface), then free our `f` ref exactly once. Single-threaded encoder → no race. unsafe { let mut f = ffi::av_frame_alloc(); if f.is_null() { bail!("av_frame_alloc failed"); } // Pooled CUDA surface: sets format, width/height, data[0]/linesize[0], buf[0] and // hw_frames_ctx. Reused across frames (the pool recycles), keeping NVENC's // registration cache warm. let r = ffi::av_hwframe_get_buffer(frames_ref, f, 0); if r < 0 { ffi::av_frame_free(&mut f); bail!("av_hwframe_get_buffer(CUDA) failed ({r})"); } // NV12 surfaces are two-plane (Y in data[0], interleaved UV in data[1]); the RGB // surfaces are single-plane. Copy the matching layout into NVENC's pooled surface. let copy_res = if buf.is_nv12() { let y_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr; let y_pitch = (*f).linesize[0] as usize; let uv_ptr = (*f).data[1] as crate::zerocopy::cuda::CUdeviceptr; let uv_pitch = (*f).linesize[1] as usize; crate::zerocopy::cuda::copy_nv12_to_device(buf, y_ptr, y_pitch, uv_ptr, uv_pitch) } else { let dst_ptr = (*f).data[0] as crate::zerocopy::cuda::CUdeviceptr; let dst_pitch = (*f).linesize[0] as usize; crate::zerocopy::cuda::copy_device_to_device(buf, dst_ptr, dst_pitch) }; if let Err(e) = copy_res { ffi::av_frame_free(&mut f); return Err(e).context("copy imported buffer into NVENC surface"); } (*f).pts = pts; (*f).pict_type = if idr { ffi::AVPictureType::AV_PICTURE_TYPE_I } else { ffi::AVPictureType::AV_PICTURE_TYPE_NONE }; let r = ffi::avcodec_send_frame(self.enc.as_mut_ptr(), f); ffi::av_frame_free(&mut f); if r < 0 { bail!("avcodec_send_frame(CUDA) failed ({r})"); } } Ok(()) } } impl Drop for NvencEncoder { fn drop(&mut self) { if let Some(sws) = self.sws_444.take() { // SAFETY: `sws` is the non-null `SwsContext` allocated by `sws_getContext` in `open` and // owned exclusively by this encoder (taken out of the field so it can't be freed twice). // `sws_freeContext` frees it; nothing else references it after this single-threaded drop. unsafe { ffi::sws_freeContext(sws) }; } } } /// Probe whether this NVIDIA GPU + driver + libavcodec can actually encode HEVC **4:4:4** (Range /// Extensions). Opens a tiny real `hevc_nvenc` 4:4:4 session — the exact path [`NvencEncoder::open`] /// takes for a live 4:4:4 stream — and reports whether it succeeded. HEVC-only; the result is cached /// by the caller ([`crate::encode::can_encode_444`]). A GPU/driver/ffmpeg without RExt 4:4:4 fails /// the open here, so the host resolves the session to 4:2:0 before the Welcome (honest downgrade). pub fn probe_can_encode_444(codec: Codec) -> bool { if codec != Codec::H265 { return false; } if ffmpeg::init().is_err() { return false; } // Quiet ffmpeg's open error on a GPU that lacks 4:4:4 — the probe failing is an expected outcome. // SAFETY: libav initialized above; `av_log_{get,set}_level` only read/write the global int level // (no pointer args) and are always sound post-init. let prev = unsafe { let p = ffi::av_log_get_level(); ffi::av_log_set_level(ffi::AV_LOG_FATAL); p }; let ok = NvencEncoder::open( codec, PixelFormat::Bgra, 640, 480, 30, 2_000_000, false, // CPU input (the 4:4:4 path never uses CUDA) 8, ChromaFormat::Yuv444, ) .is_ok(); // SAFETY: restore the saved global log level (scalar arg, no pointers). unsafe { ffi::av_log_set_level(prev) }; ok }