From ec617f9c6b348204cfd961c6ec72d7ce71f3c4e8 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Sat, 13 Jun 2026 19:24:52 +0000 Subject: [PATCH] =?UTF-8?q?bench(ci):=20report-only=20regression=20harness?= =?UTF-8?q?=20=E2=80=94=20Tier-1/2=20in=20CI=20+=20Tier-3=20GPU=20runner?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - scripts/bench/compare.py: diff criterion medians (target/criterion/**/estimates.json) vs a committed baseline, print a markdown table to the job summary, flag >threshold regressions, always exit 0 (shared CI hardware is too noisy to gate on). --update rewrites the baseline. - ci.yml `bench` job: runs Tier-1 (criterion) + Tier-2 (loss-harness FEC recovery) GPU-free in the rust-ci container, then compare.py — report-only visibility per push/PR. - scripts/bench/gpu-stream.sh + bench-gpu.yml: Tier-3 real pipeline (virtual output → zero-copy → NVENC → punktfunk/1 → reassemble) on a self-hosted GPU runner; captures encode_us/tx_mbps/ send_dropped + client capture→reassembled latency, compares to gpu-baseline.json (20% threshold). Needs the dev box registered as a `[self-hosted, gpu]` act_runner (one-time, see the workflow header) — the dedicated hardware makes its absolute baseline meaningful, unlike shared CI. - baseline.json: dev-box Tier-1 numbers. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/bench-gpu.yml | 32 +++++++++++ .gitea/workflows/ci.yml | 22 +++++++ scripts/bench/baseline.json | 9 +++ scripts/bench/compare.py | 102 +++++++++++++++++++++++++++++++++ scripts/bench/gpu-stream.sh | 94 ++++++++++++++++++++++++++++++ 5 files changed, 259 insertions(+) create mode 100644 .gitea/workflows/bench-gpu.yml create mode 100644 scripts/bench/baseline.json create mode 100755 scripts/bench/compare.py create mode 100755 scripts/bench/gpu-stream.sh diff --git a/.gitea/workflows/bench-gpu.yml b/.gitea/workflows/bench-gpu.yml new file mode 100644 index 0000000..3bb1274 --- /dev/null +++ b/.gitea/workflows/bench-gpu.yml @@ -0,0 +1,32 @@ +# Tier-3 real-world GPU benchmark — the actual capture → zero-copy → NVENC → punktfunk/1 → reassemble +# pipeline, measuring encode time / throughput / end-to-end latency. The GPU-less CI containers +# (ci.yml `bench` job) can only run the Tier-1/2 GPU-free benchmarks; this runs on a SELF-HOSTED GPU +# runner — a dev box with an NVIDIA GPU + a KWin session. +# +# Runner setup (one-time, on the GPU box): register a Gitea act_runner with the labels below, e.g. +# act_runner register --instance https://git.unom.io --token REGISTRATION_TOKEN \ +# --labels self-hosted:host,gpu:host --name HOSTNAME-gpu +# It runs jobs directly on the host (no container) so it can reach the GPU, PipeWire and the +# compositor. A persistent KWin session helps (else the script brings up a headless one). +# +# Report-only: the script flags regressions vs scripts/bench/gpu-baseline.json but never fails the +# job. Refresh the baseline on the runner with `scripts/bench/gpu-stream.sh 1920x1080x120 12 --update`. 
+name: bench-gpu + +on: + workflow_dispatch: + inputs: + mode: + description: "stream mode WxHxHz" + default: "1920x1080x120" + schedule: + - cron: "0 6 * * *" # nightly + +jobs: + gpu-stream: + runs-on: [self-hosted, gpu] + timeout-minutes: 20 + steps: + - uses: actions/checkout@v4 + - name: Tier-3 GPU stream benchmark + run: bash scripts/bench/gpu-stream.sh "${{ inputs.mode || '1920x1080x120' }}" 12 diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 6a78d06..9c3978c 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -115,3 +115,25 @@ jobs: run: bun run build - name: Typecheck run: bun run lint + + bench: + # Tier-1 (criterion microbenchmarks) + Tier-2 (FEC loss recovery) — GPU-free, so they run here. + # Report-only: prints the numbers + a diff vs the committed baseline to the job summary and never + # fails the build (shared CI hardware is too noisy to gate on). The tighter 20% comparison + the + # real encode/stream path run on the self-hosted GPU runner (Tier 3, bench-gpu.yml; also report-only). 
+    runs-on: ubuntu-24.04
+    container:
+      image: git.unom.io/unom/punktfunk-rust-ci:latest
+    timeout-minutes: 30
+    steps:
+      - uses: actions/checkout@v4
+      - name: Prep
+        run: |
+          git config --global --add safe.directory "$PWD"
+          command -v python3 >/dev/null || { apt-get update && apt-get install -y --no-install-recommends python3; }
+      - name: Tier-1 microbenchmarks (criterion)
+        run: cargo bench -p punktfunk-core --bench pipeline -- --warm-up-time 1 --measurement-time 3
+      - name: Tier-2 FEC loss recovery (loss-harness)
+        run: cargo run -q -p loss-harness
+      - name: Compare vs baseline (report-only)
+        run: python3 scripts/bench/compare.py --threshold 0.5
diff --git a/scripts/bench/baseline.json b/scripts/bench/baseline.json
new file mode 100644
index 0000000..5b36c4f
--- /dev/null
+++ b/scripts/bench/baseline.json
@@ -0,0 +1,9 @@
+{
+  "crypto/open": 883.5727126417917,
+  "crypto/seal": 852.9653926763301,
+  "crypto/seal_in_place": 829.5838139351247,
+  "pipeline/gf16/1048576": 2384684.9285714286,
+  "pipeline/gf16/65536": 136898.88634950249,
+  "pipeline/gf8/1048576": 43770210.0,
+  "pipeline/gf8/65536": 541486.272459893
+}
diff --git a/scripts/bench/compare.py b/scripts/bench/compare.py
new file mode 100755
index 0000000..53cfe75
--- /dev/null
+++ b/scripts/bench/compare.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""Compare criterion benchmark results against a committed baseline and report (soft-warn).
+
+Reads the median time of every benchmark from target/criterion/**/new/estimates.json (written by
+`cargo bench -p punktfunk-core`), diffs each against scripts/bench/baseline.json, and prints a
+markdown table (also appended to $GITHUB_STEP_SUMMARY in CI). A metric slower than the baseline by
+more than the regression threshold is flagged ⚠, but the script ALWAYS exits 0 — CI timing is noisy
+(shared hardware), so this is report-only by design; the Tier-3 run on the dedicated GPU runner
+compares against a tighter 20% threshold but is report-only as well. Use --update to (re)write
+the baseline from the current run.
+
+    python3 scripts/bench/compare.py [--threshold 0.5] [--update]
+"""
+import argparse
+import glob
+import json
+import os
+import sys
+
+ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+BASELINE = os.path.join(ROOT, "scripts", "bench", "baseline.json")
+
+
+def collect():
+    """Return {benchmark id: median nanoseconds} for every criterion result under target/criterion.
+
+    Unreadable or malformed estimates files are skipped with a warning on stderr instead of
+    raising, so a single bad file cannot break the report-only (always exit 0) contract.
+    """
+    crit_dir = os.path.join(ROOT, "target", "criterion")
+    out = {}
+    for est in glob.glob(os.path.join(crit_dir, "**", "new", "estimates.json"), recursive=True):
+        # The id is the path between target/criterion/ and /new/estimates.json,
+        # e.g. "crypto/open" or "pipeline/gf16/65536" (the keys used in baseline.json).
+        name = os.path.dirname(os.path.dirname(os.path.relpath(est, crit_dir)))
+        if name == "report":
+            continue
+        try:
+            with open(est, encoding="utf-8") as f:
+                median = json.load(f)["median"]["point_estimate"]
+        except (OSError, ValueError, KeyError, TypeError) as err:
+            print(f"skipping {est}: {err!r}", file=sys.stderr)
+            continue
+        out[name.replace(os.sep, "/")] = median
+    return out
+
+
+def human(ns):
+    """Format a nanosecond duration using the largest of s/ms/µs/ns that keeps the value >= 1."""
+    for unit, scale in (("s", 1e9), ("ms", 1e6), ("µs", 1e3), ("ns", 1.0)):
+        if ns >= scale:
+            return f"{ns / scale:.2f} {unit}"
+    return f"{ns:.2f} ns"
+
+
+def main():
+    """Print the comparison table (or rewrite the baseline with --update); always returns 0."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--threshold", type=float, default=0.5,
+                    help="fractional slowdown vs baseline that flags a regression (0.5 = plus 50 pct)")
+    ap.add_argument("--update", action="store_true", help="overwrite the baseline from this run")
+    args = ap.parse_args()
+
+    current = collect()
+    if not current:
+        print("no criterion results found — run `cargo bench -p punktfunk-core` first", file=sys.stderr)
+        return 0  # soft
+
+    if args.update:
+        os.makedirs(os.path.dirname(BASELINE), exist_ok=True)
+        with open(BASELINE, "w", encoding="utf-8") as f:
+            json.dump({k: current[k] for k in sorted(current)}, f, indent=2)
+            f.write("\n")
+        print(f"wrote baseline ({len(current)} benchmarks) -> {BASELINE}")
+        return 0
+
+    baseline = {}
+    if os.path.exists(BASELINE):
+        try:
+            with open(BASELINE, encoding="utf-8") as f:
+                baseline = json.load(f)
+        except (OSError, ValueError) as err:
+            # Report-only: an unreadable baseline degrades to "everything is new", not a failed job.
+            print(f"ignoring unreadable baseline {BASELINE}: {err!r}", file=sys.stderr)
+
+    lines = ["| benchmark | baseline | current | Δ |", "|---|---:|---:|---:|"]
+    regressions = []
+    for name in sorted(current):
+        cur = current[name]
+        base = baseline.get(name)
+        if base is None:
+            lines.append(f"| {name} | — | {human(cur)} | _new_ |")
+            continue
+        # A zero baseline has no meaningful ratio; report 0 % (the same rule gpu-stream.sh applies)
+        # instead of dying with ZeroDivisionError.
+        delta = (cur - base) / base if base else 0.0
+        flag = " ⚠" if delta > args.threshold else (" ✅" if delta < -args.threshold else "")
+        lines.append(f"| {name} | {human(base)} | {human(cur)} | {delta:+.1%}{flag} |")
+        if delta > args.threshold:
+            regressions.append((name, delta))
+
+    table = "\n".join(lines)
+    header = "## Benchmark vs baseline (report-only)\n"
+    print(header + table)
+    summary = os.environ.get("GITHUB_STEP_SUMMARY")
+    if summary:
+        with open(summary, "a", encoding="utf-8") as f:
+            f.write(header + table + "\n")
+
+    if regressions:
+        print(f"\n⚠ {len(regressions)} benchmark(s) regressed > {args.threshold:+.0%} vs baseline "
+              f"(report-only; verify on the dedicated runner before acting):")
+        for name, d in regressions:
+            print(f" {name}: {d:+.1%}")
+    return 0  # always soft
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/bench/gpu-stream.sh b/scripts/bench/gpu-stream.sh
new file mode 100755
index 0000000..b56ef70
--- /dev/null
+++ b/scripts/bench/gpu-stream.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+# Tier-3 GPU stream benchmark — the REAL pipeline: virtual output → zero-copy dmabuf→CUDA → NVENC →
+# punktfunk/1 over loopback UDP → FEC/decrypt/reassemble, with the client measuring end-to-end
+# latency. This is the "real-world" regression test the GPU-less CI can't run; it runs on a
+# self-hosted GPU runner (a dev box with an NVIDIA GPU + a KWin session). Report-only by default.
+#
+#   scripts/bench/gpu-stream.sh [WxHxHz] [seconds]          # measure + compare to the baseline
+#   scripts/bench/gpu-stream.sh 1920x1080x120 12 --update   # (re)write scripts/bench/gpu-baseline.json
+#
+# Metrics (host PUNKTFUNK_PERF + client report): encode_us_p50/p99, tx_mbps, send_dropped, and the
+# client's capture→reassembled lat_p50/p95/p99_us. 
Lower is better for latency/encode/drops, higher +# for throughput. Regressions are flagged ⚠ but the script exits 0 (gate decisions stay human). +set -uo pipefail + +MODE="${1:-1920x1080x120}" +SECS="${2:-12}" +UPDATE="" +[[ "${3:-}" == "--update" || "${2:-}" == "--update" ]] && UPDATE=1 +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +cd "$ROOT" +BASELINE="scripts/bench/gpu-baseline.json" + +# Compositor session: reuse one if present, else bring up a headless KWin (dev-box KDE pattern). +export XDG_RUNTIME_DIR="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}" +export WAYLAND_DISPLAY="${WAYLAND_DISPLAY:-wayland-kde}" +export XDG_CURRENT_DESKTOP="${XDG_CURRENT_DESKTOP:-KDE}" +export PUNKTFUNK_COMPOSITOR="${PUNKTFUNK_COMPOSITOR:-kwin}" +export PUNKTFUNK_VIDEO_SOURCE=virtual PUNKTFUNK_ZEROCOPY=1 PUNKTFUNK_PERF=1 +OWN_KWIN="" +if [[ ! -S "$XDG_RUNTIME_DIR/$WAYLAND_DISPLAY" ]]; then + echo "==> no $WAYLAND_DISPLAY — bringing up a headless KWin session" + setsid bash scripts/headless/run-headless-kde.sh "${MODE%x*}" /tmp/bench-kwin.log 2>&1 & + OWN_KWIN=$! + for _ in $(seq 1 30); do [[ -S "$XDG_RUNTIME_DIR/$WAYLAND_DISPLAY" ]] && break; sleep 1; done +fi + +echo "==> building host + client (release)" +cargo build -rq -p punktfunk-host -p punktfunk-client-rs + +HOST_LOG="$(mktemp)"; CLI_LOG="$(mktemp)" +trap 'kill "$HOST_PID" 2>/dev/null; [[ -n "$OWN_KWIN" ]] && pkill -f "kwin_wayland --virtual" 2>/dev/null; rm -f "$HOST_LOG" "$CLI_LOG"' EXIT + +echo "==> host: m3-host --source virtual ($MODE, ${SECS}s)" +target/release/punktfunk-host m3-host --source virtual --seconds "$SECS" --max-sessions 1 \ + >"$HOST_LOG" 2>&1 & +HOST_PID=$! 
+sleep 3 +echo "==> client: streaming + measuring latency" +target/release/punktfunk-client-rs --connect 127.0.0.1:9777 --mode "$MODE" --out /dev/null \ + >"$CLI_LOG" 2>&1 || true +wait "$HOST_PID" 2>/dev/null || true + +# --- extract metrics --------------------------------------------------------- +field() { grep -oE "$1=\"?[0-9]+" "$2" | tail -1 | grep -oE "[0-9]+$"; } +ENC_P50=$(field "encode_us_p50" "$HOST_LOG"); ENC_P99=$(field "encode_us_p99" "$HOST_LOG") +TX_MBPS=$(field "tx_mbps" "$HOST_LOG"); DROPPED=$(field "send_dropped_total" "$HOST_LOG") +LAT_P50=$(field "lat_p50_us" "$CLI_LOG"); LAT_P95=$(field "lat_p95_us" "$CLI_LOG") +LAT_P99=$(field "lat_p99_us" "$CLI_LOG") +if [[ -z "$LAT_P50" || -z "$ENC_P50" ]]; then + echo "!! incomplete metrics (host/client did not stream). host log tail:"; tail -8 "$HOST_LOG" + exit 0 +fi + +python3 - "$BASELINE" "${UPDATE:-}" <", baseline_path); sys.exit(0) +base = json.load(open(baseline_path)) if os.path.exists(baseline_path) else {} +THRESH = 0.20 # 20% on a dedicated runner +rows = ["## Tier-3 GPU stream benchmark ($MODE)", "", "| metric | baseline | current | Δ |", "|---|---:|---:|---:|"] +regr = [] +for k, (v, lower) in cur.items(): + b = base.get(k) + if b is None: rows.append(f"| {k} | — | {v} | _new_ |"); continue + d = (v - b) / b if b else 0.0 + worse = (d > THRESH) if lower else (d < -THRESH) + flag = " ⚠" if worse else "" + rows.append(f"| {k} | {b} | {v} | {d:+.1%}{flag} |") + if worse: regr.append(k) +out = "\n".join(rows) +print(out) +s = os.environ.get("GITHUB_STEP_SUMMARY") +if s: open(s, "a").write(out + "\n") +if regr: print("\n⚠ regressed:", ", ".join(regr)) +PY