From bf65d264fd2444214971be5c4a49e4b2c043c107 Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Mon, 15 Jun 2026 09:10:56 +0000 Subject: [PATCH] ci: bound runner disk + bust the disk-full-corrupted cargo target cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The self-hosted runner filled its disk (95%, builds failing on ENOSPC): every CI push builds a sha-tagged Docker image per pipeline, and since those tags are never dangling, a plain `docker image prune` skips them — they piled up to 589 images / ~85 GB plus 18 GB of build cache. Two parts: - scripts/ci/docker-prune.{service,timer}: a host-level systemd timer (every 6h, Persistent) that prunes images/build-cache/containers older than 24h — in-use images stay protected. Checked in (the runner is hand-provisioned and shared across orgs) and already installed live; disk usage dropped 89 GB -> 39 GB (95% -> 42%). - ci.yml / deb.yml: bump the `cargo-target-*` cache key to `-v2-`. The disk-full build let actions/cache save a truncated target/ (a dep's .rmeta went missing -> "error[E0463]: can't find crate for pem_rfc7468" while compiling der). A suffix bump is useless here — restore-keys would fall back to the poisoned prefix — so the prefix is versioned to force one clean rebuild. cargo-home is untouched (sources were intact; the failure was a missing build artifact). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/ci.yml | 8 ++++++-- .gitea/workflows/deb.yml | 6 ++++-- scripts/ci/docker-prune.service | 28 ++++++++++++++++++++++++++++ scripts/ci/docker-prune.timer | 13 +++++++++++++ 4 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 scripts/ci/docker-prune.service create mode 100644 scripts/ci/docker-prune.timer diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 9c3978c..15c4e10 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -42,8 +42,12 @@ jobs: - uses: actions/cache@v4 with: path: target - key: cargo-target-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} - restore-keys: cargo-target-${{ env.rustc }}- + # -v2-: the prior `cargo-target-*` cache was poisoned when the runner ran + # out of disk mid-build and actions/cache saved a truncated target/ (a dep's .rmeta + # went missing -> E0463 "can't find crate"). A suffix bump wouldn't help — restore-keys + # would fall back to the poisoned prefix — so the prefix itself is versioned. + key: cargo-target-v2-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} + restore-keys: cargo-target-v2-${{ env.rustc }}- - name: Format run: cargo fmt --all --check diff --git a/.gitea/workflows/deb.yml b/.gitea/workflows/deb.yml index b7b284c..65402a2 100644 --- a/.gitea/workflows/deb.yml +++ b/.gitea/workflows/deb.yml @@ -54,8 +54,10 @@ jobs: - uses: actions/cache@v4 with: path: target - key: cargo-target-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} - restore-keys: cargo-target-${{ env.rustc }}- + # -v2-: bypass a target cache poisoned by a disk-full build (see ci.yml). Shares the + # key with ci.yml so the release build reuses its clean artifacts. 
+ key: cargo-target-v2-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} + restore-keys: cargo-target-v2-${{ env.rustc }}- - name: Build release host + client run: | diff --git a/scripts/ci/docker-prune.service b/scripts/ci/docker-prune.service new file mode 100644 index 0000000..70d88b2 --- /dev/null +++ b/scripts/ci/docker-prune.service @@ -0,0 +1,28 @@ +# Docker disk hygiene for the self-hosted Gitea Actions runner (home-runner-1, 192.168.1.52). +# +# Why this exists: every CI push builds and sha-tags a Docker image per pipeline +# (rust-ci, web, docs, fedora-rpm, fedora44-rpm, ...). Those tags are never dangling, so a +# plain `docker image prune` SKIPS them and they accumulate forever — that is what filled the +# disk (589 images / ~85 GB, builds failing on ENOSPC). This trims everything older than 24h; +# images referenced by any container (running or stopped) are always protected regardless of age. +# +# Host-level, not per-repo CI, because the runner is shared (punktfunk + other orgs all benefit). +# +# Install on the runner host (root): +# cp scripts/ci/docker-prune.{service,timer} /etc/systemd/system/ +# systemctl daemon-reload && systemctl enable --now docker-prune.timer +# See also scripts/ci/setup-macos-runner.sh for the macOS runner. + +[Unit] +Description=Prune aged Docker images / build cache (CI runner disk hygiene) +Documentation=https://git.unom.io/unom/punktfunk +Wants=docker.service +After=docker.service + +[Service] +Type=oneshot +# '-' prefix: each step is independent — a no-op/failure never blocks the others. 
+ExecStart=-/usr/bin/docker image prune -af --filter until=24h +ExecStart=-/usr/bin/docker builder prune -af --filter until=24h +ExecStart=-/usr/bin/docker buildx prune -af --filter until=24h +ExecStart=-/usr/bin/docker container prune -f --filter until=24h diff --git a/scripts/ci/docker-prune.timer b/scripts/ci/docker-prune.timer new file mode 100644 index 0000000..5a74efe --- /dev/null +++ b/scripts/ci/docker-prune.timer @@ -0,0 +1,13 @@ +# Runs docker-prune.service every 6h. Persistent=true catches up after downtime. +# Install: see the header of docker-prune.service. + +[Unit] +Description=Run docker-prune every 6h (CI runner disk hygiene) + +[Timer] +OnCalendar=*-*-* 00/6:00:00 +RandomizedDelaySec=600 +Persistent=true + +[Install] +WantedBy=timers.target