diff --git a/scripts/ci/docker-prune.service b/scripts/ci/docker-prune.service
index 372117c..560a9c0 100644
--- a/scripts/ci/docker-prune.service
+++ b/scripts/ci/docker-prune.service
@@ -2,32 +2,31 @@
 #
 # Why this exists: every CI push builds and sha--tags a Docker image per pipeline
 # (rust-ci, web, docs, fedora-rpm, fedora44-rpm, ...). Those tags are never dangling, so a
-# plain `docker image prune` SKIPS them and they accumulate — that is what filled the disk.
-# Host-level, not per-repo CI, because the runner is shared (punktfunk + other orgs all benefit).
+# plain `docker image prune` SKIPS them and they accumulate. Host-level, not per-repo CI,
+# because the runner is shared (punktfunk + other orgs all benefit).
 #
-# Two tiers: trim anything older than 12h normally, AND — because a push-burst can fill 99 GB
-# WITHIN that 12h window (a fast iteration session hit 100% and poisoned the cargo cache with a
-# truncated, half-saved target/) — a burst guard that prunes ALL idle images + cache once the
-# disk is >85% full. Images IN USE by a running container are always protected.
+# THE BIG ONE (2026-06-19): the act_runner CACHE SERVER store lives in the long-running runner
+# container's WRITABLE LAYER (HOME/.cache/actcache/cache inside gitea-runner-runner-1,
+# `cache.dir: ""` -> defaults under /root). `docker prune` can NEVER see it — only stopped
+# containers + unused images/cache are prunable, not a 13-day-up container's layer. That store
+# grew to ~66 GB and filled a 125 GB disk on its own. docker-prune.sh caps it by clearing the
+# blobs in-place (act_runner repopulates; keys are content-hashed).
+#
+# The logic is in docker-prune.sh, NOT inline ExecStart=, because systemd does its own
+# $-expansion on ExecStart and would empty the shell vars / $(...) before sh runs them.
 #
 # Install on the runner host (root):
+# install -m755 scripts/ci/docker-prune.sh /usr/local/bin/ci-docker-prune.sh
 # cp scripts/ci/docker-prune.{service,timer} /etc/systemd/system/
 # systemctl daemon-reload && systemctl enable --now docker-prune.timer
 # See also scripts/ci/setup-macos-runner.sh for the macOS runner.
 
 [Unit]
-Description=Prune aged Docker images / build cache (CI runner disk hygiene)
+Description=Prune aged Docker images/cache + cap the act_runner cache (CI runner disk hygiene)
 Documentation=https://git.unom.io/unom/punktfunk
 Wants=docker.service
 After=docker.service
 
 [Service]
 Type=oneshot
-# '-' prefix: each step is independent — a no-op/failure never blocks the others.
-ExecStart=-/usr/bin/docker image prune -af --filter until=12h
-ExecStart=-/usr/bin/docker builder prune -af --filter until=12h
-ExecStart=-/usr/bin/docker buildx prune -af --filter until=12h
-ExecStart=-/usr/bin/docker container prune -f --filter until=12h
-# Burst guard: if STILL >85% full, prune every idle image + all build cache (in-use protected),
-# so a push-storm can't drive CI into ENOSPC (which truncates and poisons the actions/cargo cache).
-ExecStart=-/bin/sh -c 'P=$(df --output=pcent / | tr -dc 0-9); [ "$P" -ge 85 ] && { docker image prune -af; docker builder prune -af; docker buildx prune -af; } || true'
+ExecStart=/usr/local/bin/ci-docker-prune.sh
diff --git a/scripts/ci/docker-prune.sh b/scripts/ci/docker-prune.sh
new file mode 100644
index 0000000..05b80aa
--- /dev/null
+++ b/scripts/ci/docker-prune.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# CI runner disk hygiene — invoked by docker-prune.service (every 30 min). Lives in a real script
+# rather than inline ExecStart= lines because systemd does its OWN $-expansion on ExecStart and
+# empties shell vars / $(...) before /bin/sh sees them (silently breaking the logic under `|| true`).
+#
+# See docker-prune.service for the full why. The headline: the act_runner cache server's blob store
+# lives INSIDE the long-running runner container's writable layer, where `docker prune` can't reach
+# it — left alone it grows to tens of GB and fills the disk on its own.
+#
+# Deliberately best-effort: there is no `set -e`, so a failing step never blocks the later ones
+# (same contract as the '-'-prefixed ExecStart= lines this script replaces).
+set -u
+export PATH=/usr/bin:/bin:/usr/local/bin:$PATH
+
+RUNNER=$(docker ps -q -f name=gitea-runner-runner | head -1)
+readonly RUNNER
+readonly ACTCACHE=/root/.cache/actcache/cache   # path INSIDE the runner container (HOME=/root there)
+readonly CAP_MB=20000                           # clear the actcache once its blob dir exceeds ~20 GB
+readonly BURST_PCT=80                           # full clear once the disk is this % full
+
+# Empty the actcache blob dir inside the runner container; a no-op (status 0) when no runner is up.
+# The path travels as a positional argument instead of being spliced into the `sh -c` string, and
+# `${1:?}` aborts on an empty value, so this can never degrade into `rm -rf /*`.
+clear_actcache() {
+  [ -n "$RUNNER" ] || return 0
+  docker exec "$RUNNER" sh -c 'rm -rf -- "${1:?}"/*' sh "$ACTCACHE"
+}
+
+# 1) Routine: trim aged images / build cache / stopped containers. sha-tagged images aren't
+#    dangling, so -a is required; until=6h keeps very recent ones for quick re-runs.
+docker image prune -af --filter until=6h || true
+docker builder prune -af --filter until=6h || true
+docker buildx prune -af --filter until=6h || true
+docker container prune -f --filter until=6h || true
+
+# 2) Cap the act_runner cache server store (the real disk filler). Clearing the blobs is safe —
+#    act_runner repopulates it and cache keys are content-hashed, so this only drops stale entries.
+#    SZ is compared only when it is purely numeric (it is empty when du fails or the dir is missing).
+if [ -n "$RUNNER" ]; then
+  SZ=$(docker exec "$RUNNER" du -sm "$ACTCACHE" 2>/dev/null | cut -f1)
+  if [[ "${SZ:-}" =~ ^[0-9]+$ ]] && [ "$SZ" -ge "$CAP_MB" ]; then
+    clear_actcache && echo "actcache cleared (was ${SZ} MB)"
+  fi
+fi
+
+# 3) Burst guard: a push-storm can fill the disk within one interval. Once >=BURST_PCT% full, prune
+#    ALL idle images/cache AND clear the actcache, regardless of age. In-use images are protected.
+PCT=$(df --output=pcent / | tr -dc '0-9')
+if [ -n "$PCT" ] && [ "$PCT" -ge "$BURST_PCT" ]; then
+  echo "disk ${PCT}% >= ${BURST_PCT}% — burst clear"
+  docker image prune -af || true
+  docker builder prune -af || true
+  docker buildx prune -af || true
+  clear_actcache || true
+fi
diff --git a/scripts/ci/docker-prune.timer b/scripts/ci/docker-prune.timer
index 104d8d0..dacb784 100644
--- a/scripts/ci/docker-prune.timer
+++ b/scripts/ci/docker-prune.timer
@@ -1,12 +1,13 @@
-# Runs docker-prune.service hourly (the burst guard needs to react within the hour, not every 6h).
+# Runs docker-prune.service every 30 min. The runner is hammered with build bursts that can refill
+# the disk fast (and the actcache cap needs to react well within an hour), so 30 min beats hourly.
 # Persistent=true catches up after downtime. Install: see the header of docker-prune.service.
 
 [Unit]
-Description=Run docker-prune hourly (CI runner disk hygiene + burst guard)
+Description=Run docker-prune every 30 min (CI runner disk hygiene + actcache cap + burst guard)
 
 [Timer]
-OnCalendar=hourly
-RandomizedDelaySec=300
+OnCalendar=*:0/30
+RandomizedDelaySec=120
 Persistent=true
 
 [Install]