From 8265742e74c5781a2247f049f84f99cf5d38f1fb Mon Sep 17 00:00:00 2001 From: enricobuehler Date: Mon, 15 Jun 2026 14:25:40 +0000 Subject: [PATCH] ci: bust the re-poisoned cargo cache (v3) + burst-guard the runner prune MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This session's push storm refilled the runner to 100% WITHIN the prune timer's 24h window (it only trims >24h), so a build hit ENOSPC and actions/cache saved a truncated target/ -> `error[E0463]: can't find crate for shlex` in ci.yml's clippy. Two fixes: - Bump cargo-target-v2- -> v3- in ci.yml + deb.yml so the poisoned tarball is bypassed (a suffix bump can't — restore-keys falls back to the old prefix; same as the v1->v2 fix). - Harden scripts/ci/docker-prune: run HOURLY (was 6h) with a burst guard — if the disk is still >85% after the normal until=12h trim, prune ALL idle images + build cache (in-use protected). A fast push-burst can fill 99 GB inside any time window, so the disk-pressure trigger, not the age filter, is the real backstop. Applied live on home-runner-1 (reclaimed 95%->66%) and checked in. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitea/workflows/ci.yml | 6 +++--- .gitea/workflows/deb.yml | 6 +++--- scripts/ci/docker-prune.service | 21 +++++++++++++-------- scripts/ci/docker-prune.timer | 10 +++++----- 4 files changed, 24 insertions(+), 19 deletions(-) diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 15c4e10..0f4d451 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -42,12 +42,12 @@ jobs: - uses: actions/cache@v4 with: path: target - # -v2-: the prior `cargo-target--*` cache was poisoned when the runner ran + # -v3-: the prior `cargo-target--*` cache was poisoned when the runner ran # out of disk mid-build and actions/cache saved a truncated target/ (a dep's .rmeta # went missing -> E0463 "can't find crate"). 
A suffix bump wouldn't help — restore-keys # would fall back to the poisoned prefix — so the prefix itself is versioned. - key: cargo-target-v2-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} - restore-keys: cargo-target-v2-${{ env.rustc }}- + key: cargo-target-v3-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} + restore-keys: cargo-target-v3-${{ env.rustc }}- - name: Format run: cargo fmt --all --check diff --git a/.gitea/workflows/deb.yml b/.gitea/workflows/deb.yml index b22e1f5..73cdc22 100644 --- a/.gitea/workflows/deb.yml +++ b/.gitea/workflows/deb.yml @@ -71,10 +71,10 @@ jobs: - uses: actions/cache@v4 with: path: target - # -v2-: bypass a target cache poisoned by a disk-full build (see ci.yml). Shares the + # -v3-: bypass a target cache poisoned by a disk-full build (see ci.yml). Shares the # key with ci.yml so the release build reuses its clean artifacts. - key: cargo-target-v2-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} - restore-keys: cargo-target-v2-${{ env.rustc }}- + key: cargo-target-v3-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }} + restore-keys: cargo-target-v3-${{ env.rustc }}- - name: Build release host + client env: diff --git a/scripts/ci/docker-prune.service b/scripts/ci/docker-prune.service index 70d88b2..372117c 100644 --- a/scripts/ci/docker-prune.service +++ b/scripts/ci/docker-prune.service @@ -2,12 +2,14 @@ # # Why this exists: every CI push builds and sha--tags a Docker image per pipeline # (rust-ci, web, docs, fedora-rpm, fedora44-rpm, ...). Those tags are never dangling, so a -# plain `docker image prune` SKIPS them and they accumulate forever — that is what filled the -# disk (589 images / ~85 GB, builds failing on ENOSPC). This trims everything older than 24h; -# images IN USE by a running container are always protected regardless of age. -# +# plain `docker image prune` SKIPS them and they accumulate — that is what filled the disk. # Host-level, not per-repo CI, because the runner is shared (punktfunk + other orgs all benefit). 
# +# Two tiers: trim anything older than 12h normally, AND — because a push-burst can fill 99 GB +# WITHIN that 12h window (a fast iteration session hit 100% and poisoned the cargo cache with a +# truncated, half-saved target/) — a burst guard that prunes ALL idle images + cache once the +# disk is >=85% full. Images IN USE by a running container are always protected. +# # Install on the runner host (root): # cp scripts/ci/docker-prune.{service,timer} /etc/systemd/system/ # systemctl daemon-reload && systemctl enable --now docker-prune.timer @@ -22,7 +24,10 @@ After=docker.service [Service] Type=oneshot # '-' prefix: each step is independent — a no-op/failure never blocks the others. -ExecStart=-/usr/bin/docker image prune -af --filter until=24h -ExecStart=-/usr/bin/docker builder prune -af --filter until=24h -ExecStart=-/usr/bin/docker buildx prune -af --filter until=24h -ExecStart=-/usr/bin/docker container prune -f --filter until=24h +ExecStart=-/usr/bin/docker image prune -af --filter until=12h +ExecStart=-/usr/bin/docker builder prune -af --filter until=12h +ExecStart=-/usr/bin/docker buildx prune -af --filter until=12h +ExecStart=-/usr/bin/docker container prune -f --filter until=12h +# Burst guard: if STILL >=85% full, prune every idle image + all build cache (in-use protected), +# so a push-storm can't drive CI into ENOSPC (which truncates and poisons the actions/cargo cache). +ExecStart=-/bin/sh -c 'P=$(df --output=pcent / | tr -dc 0-9); [ "$P" -ge 85 ] && { docker image prune -af; docker builder prune -af; docker buildx prune -af; } || true' diff --git a/scripts/ci/docker-prune.timer b/scripts/ci/docker-prune.timer index 5a74efe..104d8d0 100644 --- a/scripts/ci/docker-prune.timer +++ b/scripts/ci/docker-prune.timer @@ -1,12 +1,12 @@ -# Runs docker-prune.service every 6h. Persistent=true catches up after downtime. -# Install: see the header of docker-prune.service. 
+# Runs docker-prune.service hourly (the burst guard needs to react within the hour, not every 6h). +# Persistent=true catches up after downtime. Install: see the header of docker-prune.service. [Unit] -Description=Run docker-prune every 6h (CI runner disk hygiene) +Description=Run docker-prune hourly (CI runner disk hygiene + burst guard) [Timer] -OnCalendar=*-*-* 00/6:00:00 -RandomizedDelaySec=600 +OnCalendar=hourly +RandomizedDelaySec=300 Persistent=true [Install]