From bf65d264fd2444214971be5c4a49e4b2c043c107 Mon Sep 17 00:00:00 2001
From: enricobuehler <buehler@unom.io>
Date: Mon, 15 Jun 2026 09:10:56 +0000
Subject: [PATCH] ci: bound runner disk + bust the disk-full-corrupted cargo
 target cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The self-hosted runner filled its disk (95%, builds failing on ENOSPC): every CI
push builds a sha-<commit>-tagged Docker image per pipeline, and since those tags
are never dangling a plain `docker image prune` skips them — they piled up to 589
images / ~85 GB plus 18 GB of build cache. Two parts:

- scripts/ci/docker-prune.{service,timer}: a host-level systemd timer (every 6h,
  Persistent) that prunes images/build-cache/containers older than 24h — in-use
  images stay protected. Checked in (the runner is hand-provisioned and shared
  across orgs) and already installed live; disk usage dropped 89 GB -> 39 GB (95% -> 42%).

- ci.yml / deb.yml: bump the `cargo-target-<rustc>-*` cache key to `-v2-`. The
  disk-full build let actions/cache save a truncated target/ (a dep's .rmeta went
  missing -> "error[E0463]: can't find crate for pem_rfc7468" while compiling der).
  A suffix bump is useless here — restore-keys would fall back to the poisoned
  prefix — so the prefix is versioned to force one clean rebuild. cargo-home is
  untouched (sources were intact; the failure was a missing build artifact).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .gitea/workflows/ci.yml         |  8 ++++++--
 .gitea/workflows/deb.yml        |  6 ++++--
 scripts/ci/docker-prune.service | 28 ++++++++++++++++++++++++++++
 scripts/ci/docker-prune.timer   | 13 +++++++++++++
 4 files changed, 51 insertions(+), 4 deletions(-)
 create mode 100644 scripts/ci/docker-prune.service
 create mode 100644 scripts/ci/docker-prune.timer
diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml
index 9c3978c..15c4e10 100644
--- a/.gitea/workflows/ci.yml
+++ b/.gitea/workflows/ci.yml
@@ -42,8 +42,12 @@ jobs:
       - uses: actions/cache@v4
         with:
           path: target
-          key: cargo-target-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }}
-          restore-keys: cargo-target-${{ env.rustc }}-
+          # -v2-: the prior `cargo-target-<rustc>-*` cache was poisoned when the runner ran
+          # out of disk mid-build and actions/cache saved a truncated target/ (a dep's .rmeta
+          # went missing -> E0463 "can't find crate"). A suffix bump wouldn't help — restore-keys
+          # would fall back to the poisoned prefix — so the prefix itself is versioned.
+          key: cargo-target-v2-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }}
+          restore-keys: cargo-target-v2-${{ env.rustc }}-
 
       - name: Format
         run: cargo fmt --all --check
diff --git a/.gitea/workflows/deb.yml b/.gitea/workflows/deb.yml
index b7b284c..65402a2 100644
--- a/.gitea/workflows/deb.yml
+++ b/.gitea/workflows/deb.yml
@@ -54,8 +54,10 @@ jobs:
       - uses: actions/cache@v4
         with:
           path: target
-          key: cargo-target-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }}
-          restore-keys: cargo-target-${{ env.rustc }}-
+          # -v2-: bypass a target cache poisoned by a disk-full build (see ci.yml). Shares the
+          # key with ci.yml so the release build reuses its clean artifacts.
+          key: cargo-target-v2-${{ env.rustc }}-${{ hashFiles('Cargo.lock') }}
+          restore-keys: cargo-target-v2-${{ env.rustc }}-
 
       - name: Build release host + client
         run: |
diff --git a/scripts/ci/docker-prune.service b/scripts/ci/docker-prune.service
new file mode 100644
index 0000000..70d88b2
--- /dev/null
+++ b/scripts/ci/docker-prune.service
@@ -0,0 +1,28 @@
+# Docker disk hygiene for the self-hosted Gitea Actions runner (home-runner-1, 192.168.1.52).
+#
+# Why this exists: every CI push builds and sha-<commit>-tags a Docker image per pipeline
+# (rust-ci, web, docs, fedora-rpm, fedora44-rpm, ...). Those tags are never dangling, so a
+# plain `docker image prune` SKIPS them and they accumulate forever — that is what filled the
+# disk (589 images / ~85 GB, builds failing on ENOSPC). This trims everything older than 24h;
+# images referenced by any container (running OR stopped) are protected regardless of age.
+#
+# Host-level, not per-repo CI, because the runner is shared (punktfunk + other orgs all benefit).
+#
+# Install on the runner host (root):
+#   cp scripts/ci/docker-prune.{service,timer} /etc/systemd/system/
+#   systemctl daemon-reload && systemctl enable --now docker-prune.timer
+# See also scripts/ci/setup-macos-runner.sh for the macOS runner.
+
+[Unit]
+Description=Prune aged Docker images / build cache (CI runner disk hygiene)
+Documentation=https://git.unom.io/unom/punktfunk
+Wants=docker.service
+After=docker.service
+
+[Service]
+Type=oneshot
+# '-' prefix: a failed step never blocks the rest. Containers first: stopped ones pin images.
+ExecStart=-/usr/bin/docker container prune -f --filter until=24h
+ExecStart=-/usr/bin/docker image prune -af --filter until=24h
+ExecStart=-/usr/bin/docker builder prune -af --filter until=24h
+ExecStart=-/usr/bin/docker buildx prune -af --filter until=24h
diff --git a/scripts/ci/docker-prune.timer b/scripts/ci/docker-prune.timer
new file mode 100644
index 0000000..5a74efe
--- /dev/null
+++ b/scripts/ci/docker-prune.timer
@@ -0,0 +1,13 @@
+# Fires docker-prune.service at 00:00/06:00/12:00/18:00, plus up to 10 min of random delay.
+# Persistent=true catches up after downtime. Install: see the header of docker-prune.service.
+
+[Unit]
+Description=Run docker-prune every 6h (CI runner disk hygiene)
+
+[Timer]
+OnCalendar=*-*-* 00/6:00:00
+RandomizedDelaySec=600
+Persistent=true
+
+[Install]
+WantedBy=timers.target