//! NVENC encoder via `ffmpeg-next` (binds the system FFmpeg 7.x / libavcodec 61). //! //! Input is a packed RGB/BGR CPU frame; `*_nvenc` accepts `rgb0`/`bgr0`/`rgba`/`bgra` //! directly and does the RGB→YUV conversion on the GPU, so the host stays off the //! colour-conversion path. The portal commonly negotiates packed 24-bit `RGB`, which NVENC //! does *not* accept — we expand it to `rgb0` (one padding byte/pixel, no colour math). //! The encoder is opened *without* a global header so VPS/SPS/PPS are emitted in-band on //! every IDR — the output is both a playable raw Annex-B stream and self-contained AUs. use super::{Codec, EncodedFrame, Encoder}; use crate::capture::{CapturedFrame, FramePayload, PixelFormat}; use anyhow::{anyhow, bail, Context, Result}; use ffmpeg::format::Pixel; use ffmpeg::util::frame::Video as VideoFrame; use ffmpeg::{codec, encoder, Dictionary, Packet, Rational}; use ffmpeg_next as ffmpeg; use std::os::raw::c_int; use ffmpeg::ffi; // = ffmpeg_sys_next /// `AVCUDADeviceContext` (libavutil/hwcontext_cuda.h) — not in the ffmpeg-sys bindings (the /// crate doesn't allowlist that header), so mirror its stable 3-pointer layout. We set the /// first field to *our* `CUcontext` so NVENC shares the context the EGL importer maps into. #[repr(C)] struct AVCUDADeviceContext { cuda_ctx: *mut std::ffi::c_void, // CUcontext stream: *mut std::ffi::c_void, // CUstream (null = default) internal: *mut std::ffi::c_void, // filled by ctx_init } /// CUDA hardware-frame contexts that wrap our shared `CUcontext`, so `hevc_nvenc` reads the /// imported device buffer directly. Owns two `AVBufferRef`s, unref'd on drop. struct CudaHw { device_ref: *mut ffi::AVBufferRef, frames_ref: *mut ffi::AVBufferRef, } impl CudaHw { /// Build a CUDA hwdevice wrapping `cu_ctx` and a frames pool (`sw_format` = `pixel`). unsafe fn new(cu_ctx: *mut std::ffi::c_void, sw_format: Pixel, w: u32, h: u32) -> Result { let mut device_ref = ffi::av_hwdevice_ctx_alloc(ffi::AVHWDeviceType::AV_HWDEVICE_TYPE_CUDA); if device_ref.is_null() { bail!("av_hwdevice_ctx_alloc(CUDA) failed"); } let dev_ctx = (*device_ref).data as *mut ffi::AVHWDeviceContext; let cu = (*dev_ctx).hwctx as *mut AVCUDADeviceContext; (*cu).cuda_ctx = cu_ctx; // share the importer's context let r = ffi::av_hwdevice_ctx_init(device_ref); if r < 0 { ffi::av_buffer_unref(&mut device_ref); bail!("av_hwdevice_ctx_init failed ({r})"); } let mut frames_ref = ffi::av_hwframe_ctx_alloc(device_ref); if frames_ref.is_null() { ffi::av_buffer_unref(&mut device_ref); bail!("av_hwframe_ctx_alloc failed"); } let fc = (*frames_ref).data as *mut ffi::AVHWFramesContext; (*fc).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA; (*fc).sw_format = pixel_to_av(sw_format); (*fc).width = w as c_int; (*fc).height = h as c_int; (*fc).initial_pool_size = 0; // we supply the device pointers let r = ffi::av_hwframe_ctx_init(frames_ref); if r < 0 { ffi::av_buffer_unref(&mut frames_ref); ffi::av_buffer_unref(&mut device_ref); bail!("av_hwframe_ctx_init failed ({r})"); } Ok(CudaHw { device_ref, frames_ref, }) } } impl Drop for CudaHw { fn drop(&mut self) { unsafe { ffi::av_buffer_unref(&mut self.frames_ref); ffi::av_buffer_unref(&mut self.device_ref); } } } /// `ffmpeg::format::Pixel` → raw `AVPixelFormat`. fn pixel_to_av(p: Pixel) -> ffi::AVPixelFormat { // `Pixel` is `#[repr(i32)]`-compatible with `AVPixelFormat` (the bindgen enum) via this // documented conversion in ffmpeg-next. ffi::AVPixelFormat::from(p) } /// Map a captured layout to the NVENC input pixel format, and whether a 3→4 byte expand is /// needed (packed RGB/BGR have no padding byte; the NVENC `*0` formats do). fn nvenc_input(format: PixelFormat) -> (Pixel, bool) { match format { PixelFormat::Bgrx => (Pixel::BGRZ, false), // bgr0 PixelFormat::Rgbx => (Pixel::RGBZ, false), // rgb0 PixelFormat::Bgra => (Pixel::BGRA, false), PixelFormat::Rgba => (Pixel::RGBA, false), PixelFormat::Rgb => (Pixel::RGBZ, true), // RGB -> rgb0 PixelFormat::Bgr => (Pixel::BGRZ, true), // BGR -> bgr0 } } pub struct NvencEncoder { enc: encoder::video::Encoder, /// Reusable 4-bpp CPU input frame (CPU path only; `None` for the zero-copy/CUDA path). /// Mutating it in place across frames is sound only because the encoder is opened with /// `delay=0`/`bf=0`/`max_b_frames=0` and the caller drains `poll()` after each `submit`, /// so libavcodec holds no reference to the previous frame's buffer when we overwrite it. frame: Option, /// Zero-copy path: CUDA hwdevice/hwframes contexts (the encoder takes `AV_PIX_FMT_CUDA`). cuda: Option, src_format: PixelFormat, expand: bool, width: u32, height: u32, fps: u32, /// Monotonic presentation index, in `1/fps` time-base units. frame_idx: i64, /// Force the next submitted frame to be an IDR (set by [`request_keyframe`]). force_kf: bool, } // `CudaHw` holds raw `AVBufferRef`s; the encoder lives on a single thread. The CPU encoder is // already `Send` via ffmpeg-next; assert it for the CUDA fields too. unsafe impl Send for NvencEncoder {} impl NvencEncoder { pub fn open( codec: Codec, format: PixelFormat, width: u32, height: u32, fps: u32, bitrate_bps: u64, cuda: bool, ) -> Result { ffmpeg::init().context("ffmpeg init")?; let name = codec.nvenc_name(); let av_codec = encoder::find_by_name(name) .ok_or_else(|| anyhow!("{name} not built into libavcodec"))?; let (nvenc_pixel, expand) = nvenc_input(format); let mut video = codec::context::Context::new_with_codec(av_codec) .encoder() .video() .context("alloc video encoder")?; video.set_width(width); video.set_height(height); video.set_format(nvenc_pixel); // NVENC converts RGB→YUV internally video.set_time_base(Rational(1, fps as i32)); video.set_frame_rate(Some(Rational(fps as i32, 1))); video.set_bit_rate(bitrate_bps as usize); video.set_max_bit_rate(bitrate_bps as usize); video.set_gop(fps.saturating_mul(2).max(1)); // ~2s keyframe interval video.set_max_b_frames(0); // For the zero-copy path, take CUDA surfaces: wrap the shared CUcontext in CUDA // hwdevice/hwframes contexts and set `pix_fmt = CUDA` on the raw encoder context // *before* open (NVENC derives the device from `hw_frames_ctx`). let cuda_hw = if cuda { let cu_ctx = crate::zerocopy::cuda::context().context("shared CUDA context")?; let hw = unsafe { CudaHw::new(cu_ctx, nvenc_pixel, width, height)? }; unsafe { let raw = video.as_mut_ptr(); (*raw).pix_fmt = ffi::AVPixelFormat::AV_PIX_FMT_CUDA; (*raw).hw_device_ctx = ffi::av_buffer_ref(hw.device_ref); (*raw).hw_frames_ctx = ffi::av_buffer_ref(hw.frames_ref); } Some(hw) } else { None }; // Low-latency NVENC tuning (plan §7 / linux-setup doc). let mut opts = Dictionary::new(); opts.set("preset", "p1"); // fastest opts.set("tune", "ull"); // ultra-low-latency opts.set("rc", "cbr"); opts.set("bf", "0"); opts.set("delay", "0"); let enc = video .open_with(opts) .with_context(|| format!("open {name} ({width}x{height}@{fps}, {bitrate_bps} bps)"))?; let frame = if cuda { None } else { Some(VideoFrame::new(nvenc_pixel, width, height)) }; Ok(NvencEncoder { enc, frame, cuda: cuda_hw, src_format: format, expand, width, height, fps, frame_idx: 0, force_kf: false, }) } } impl Encoder for NvencEncoder { fn submit(&mut self, captured: &CapturedFrame) -> Result<()> { anyhow::ensure!( captured.width == self.width && captured.height == self.height, "captured frame {}x{} != encoder {}x{}", captured.width, captured.height, self.width, self.height ); let pts = self.frame_idx; self.frame_idx += 1; // Force an IDR when requested (client RFI); otherwise let NVENC pick (GOP/P-frame). let idr = self.force_kf; self.force_kf = false; match &captured.payload { FramePayload::Cuda(buf) => self.submit_cuda(buf, pts, idr), FramePayload::Cpu(bytes) => self.submit_cpu(bytes, captured.format, pts, idr), } } fn request_keyframe(&mut self) { self.force_kf = true; } fn poll(&mut self) -> Result> { let mut pkt = Packet::empty(); match self.enc.receive_packet(&mut pkt) { Ok(()) => { let data = pkt.data().map(|d| d.to_vec()).unwrap_or_default(); let pts = pkt.pts().unwrap_or(0).max(0) as u64; let pts_ns = pts * 1_000_000_000 / self.fps as u64; Ok(Some(EncodedFrame { data, pts_ns, keyframe: pkt.is_key(), })) } // No packet ready yet (need another input frame). Err(ffmpeg::Error::Other { errno }) if errno == ffmpeg::util::error::EAGAIN || errno == ffmpeg::util::error::EWOULDBLOCK => { Ok(None) } // Fully drained after flush(). Err(ffmpeg::Error::Eof) => Ok(None), Err(e) => Err(e).context("receive_packet"), } } fn flush(&mut self) -> Result<()> { self.enc.send_eof().context("send_eof")?; Ok(()) } } impl NvencEncoder { /// CPU path: expand/copy the packed RGB/BGR bytes into the reusable 4-bpp frame, then send. fn submit_cpu(&mut self, bytes: &[u8], format: PixelFormat, pts: i64, idr: bool) -> Result<()> { anyhow::ensure!( format == self.src_format, "captured format {:?} != encoder source {:?}", format, self.src_format ); let w = self.width as usize; let h = self.height as usize; let src_bpp = self.src_format.bytes_per_pixel(); let src_row = w * src_bpp; anyhow::ensure!( bytes.len() >= src_row * h, "captured buffer {} bytes < required {}", bytes.len(), src_row * h ); let frame = self .frame .as_mut() .context("CPU frame missing (encoder opened in CUDA mode)")?; let stride = frame.stride(0); // dst is 4-bpp, aligned let dst = frame.data_mut(0); if self.expand { // packed 3-bpp RGB/BGR → 4-bpp *0 (copy 3 bytes, zero the pad byte) for y in 0..h { let s = &bytes[y * src_row..y * src_row + src_row]; let drow = &mut dst[y * stride..y * stride + w * 4]; for x in 0..w { drow[x * 4..x * 4 + 3].copy_from_slice(&s[x * 3..x * 3 + 3]); drow[x * 4 + 3] = 0; } } } else { // 4-bpp → 4-bpp, honoring the (possibly larger) dst stride for y in 0..h { dst[y * stride..y * stride + src_row] .copy_from_slice(&bytes[y * src_row..y * src_row + src_row]); } } frame.set_pts(Some(pts)); frame.set_kind(if idr { ffmpeg::picture::Type::I } else { ffmpeg::picture::Type::None }); self.enc.send_frame(frame).context("send_frame")?; Ok(()) } /// Zero-copy path: wrap the imported CUDA device buffer in an `AV_PIX_FMT_CUDA` frame and /// send it straight to NVENC (no CPU touch). `buf.ptr` aliases device memory we own, so /// `buf[0]` is left null (ffmpeg must not free it); the frame shell is freed after send. fn submit_cuda( &mut self, buf: &crate::zerocopy::DeviceBuffer, pts: i64, idr: bool, ) -> Result<()> { let frames_ref = self .cuda .as_ref() .context("CUDA hw context missing (encoder opened in CPU mode)")? .frames_ref; unsafe { let mut f = ffi::av_frame_alloc(); if f.is_null() { bail!("av_frame_alloc failed"); } (*f).format = ffi::AVPixelFormat::AV_PIX_FMT_CUDA as c_int; (*f).width = self.width as c_int; (*f).height = self.height as c_int; (*f).hw_frames_ctx = ffi::av_buffer_ref(frames_ref); (*f).data[0] = buf.ptr as *mut u8; (*f).linesize[0] = buf.pitch as c_int; (*f).pts = pts; (*f).pict_type = if idr { ffi::AVPictureType::AV_PICTURE_TYPE_I } else { ffi::AVPictureType::AV_PICTURE_TYPE_NONE }; let r = ffi::avcodec_send_frame(self.enc.as_mut_ptr(), f); ffi::av_frame_free(&mut f); if r < 0 { bail!("avcodec_send_frame(CUDA) failed ({r})"); } } Ok(()) } }