From 4b21ba53a296d029a336a7f863805c6d343cd434 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Sun, 24 May 2026 17:42:46 -0400 Subject: [PATCH] gc: rewrite registry_gc.py against Gitea's actual API (+ UA fix) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of run #122's GC failure (turns out NOT a permission issue, despite the 403): 1. The template's URL was wrong: /api/v1/packages/{owner}/container/{name}/versions — Gitea interprets this as "look up a SINGLE version named 'versions'" and returns "package does not exist". The correct list endpoint is: GET /api/v1/packages/{owner}?type=container&q={name} which returns one entry per tag with {id, version, created_at}. 2. Cloudflare in front of git.jpaul.io returns 403 to the default Python-urllib User-Agent — any non-Python UA passes (curl, "requests", anything). That explains the 403 in CI (Python made the call) vs 404 from my curl test (curl passed CF, hit Gitea's wrong-URL 404). So both the URL AND the UA were broken. Fixes: - Set User-Agent to "crop-chem-docs-registry-gc/0.1" in api(). - Correct URL for list (above) + DELETE /api/v1/packages/{owner}/container/{name}/{tag} for delete. - Cleaner keep policy with explicit reasons: always: :latest always: corpus-* (production pins; Drawbar may have them locked) keep: --keep-latest most recent OTHER tags keep: anything younger than --keep-days delete: everything else - --dry-run for safe testing. Local dry-run against current 4 tags categorizes correctly and deletes nothing (4 < keep-latest=6). Leaving continue-on-error: true in the workflows for one more cycle. If tonight's run passes the GC step cleanly, follow-up commit removes the safety net. (Workflow paths: filter excludes scripts/**, so this commit doesn't trigger image-only.yml.) 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/registry_gc.py | 157 ++++++++++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 49 deletions(-) diff --git a/scripts/registry_gc.py b/scripts/registry_gc.py index 41bbc52..f772edd 100644 --- a/scripts/registry_gc.py +++ b/scripts/registry_gc.py @@ -1,43 +1,69 @@ """Gitea container-registry garbage collection. -Lists package versions for one container package and deletes versions -older than --keep-days. Always preserves: +Prunes old container tags from a Gitea registry package. Always +preserves: - - the :latest tag - - the --keep-latest most-recent date-tagged versions - - anything pushed in the last --keep-days days + - The ``latest`` tag (Watchtower auto-pull target) + - Any ``corpus-*`` tag (production pins; Drawbar may have them locked) + - The ``--keep-latest`` most-recent OTHER tags (typically commit-sha pins) + - Anything pushed within ``--keep-days`` days -The actual disk reclaim happens on Gitea's next package GC cron (admin -site settings). This script just marks the versions for deletion. +The actual disk reclaim happens on Gitea's next package GC cron +(admin site settings). This script marks versions for deletion. + +Why this script doesn't use the Docker Registry v2 API: that API has +tag listing + manifest delete by digest, but no per-tag created-at +timestamp without an extra blob-fetch round-trip. Gitea's packages +API gives us {tag, created_at} in one call, which is what the keep +policy needs. 
+ +The endpoint shape that actually works (matches Gitea 1.21+): + + GET /api/v1/packages/{owner}?type=container&q={name} + → JSON array, ONE entry per tag, each with id + version=tag + created_at + DELETE /api/v1/packages/{owner}/container/{name}/{tag} + → 204 on success, 404 if already gone + +Auth: GITEA_TOKEN env var (PAT with delete:packages scope; the +push-only PAT we use as REGISTRY_TOKEN may not be enough — if you +see 403s, mint a separate PAT and pass it as GITEA_TOKEN here). Usage: python scripts/registry_gc.py \\ - --owner \\ - --package -docs-mcp \\ - --keep-days 90 \\ - --keep-latest 5 - -Auth: reads GITEA_TOKEN from env (set in the workflow as a secret). + --owner justin \\ + --package crop-chem-docs \\ + --keep-days 180 \\ + --keep-latest 6 + [--dry-run] """ from __future__ import annotations import argparse +import json import os import sys from datetime import datetime, timedelta, timezone -from urllib.request import Request, urlopen from urllib.error import HTTPError -import json +from urllib.request import Request, urlopen GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io") def api(token: str, method: str, path: str) -> object: - req = Request(f"{GITEA_HOST}{path}", - headers={"Authorization": f"token {token}"}, - method=method) + # User-Agent matters: Cloudflare in front of git.jpaul.io returns + # 403 to the default `Python-urllib/3.x` UA. Any non-Python UA + # passes. Curl works, requests works, we just need to not look + # like a vanilla urllib script. + req = Request( + f"{GITEA_HOST}{path}", + headers={ + "Authorization": f"token {token}", + "User-Agent": "crop-chem-docs-registry-gc/0.1", + }, + method=method, + ) try: with urlopen(req, timeout=30) as r: body = r.read() @@ -48,60 +74,93 @@ def api(token: str, method: str, path: str) -> object: raise +def _parse_created(version: dict) -> datetime: + """Gitea returns RFC3339 with offset like '2026-05-24T16:07:50-04:00'. 
+ Python 3.11+ handles this directly via fromisoformat.""" + return datetime.fromisoformat(version["created_at"]) + + def main() -> int: p = argparse.ArgumentParser() p.add_argument("--owner", required=True) p.add_argument("--package", required=True) - p.add_argument("--keep-days", type=int, default=90) - p.add_argument("--keep-latest", type=int, default=5) - p.add_argument("--dry-run", action="store_true") + p.add_argument("--keep-days", type=int, default=180) + p.add_argument("--keep-latest", type=int, default=6, + help="Keep this many most-recent commit-sha (etc.) " + "tags BEFORE applying --keep-days. corpus-* and " + ":latest are kept regardless.") + p.add_argument("--dry-run", action="store_true", + help="Show what would be deleted without calling DELETE.") args = p.parse_args() token = os.environ.get("GITEA_TOKEN") if not token: - print("GITEA_TOKEN not set", file=sys.stderr) + print("GITEA_TOKEN env var not set", file=sys.stderr) return 1 - versions = api(token, "GET", - f"/api/v1/packages/{args.owner}/container/{args.package}/versions") or [] + # Gitea's q= is a substring match; filter to exact name so we don't + # accidentally GC a sibling package that shares the prefix. + versions = api( + token, "GET", + f"/api/v1/packages/{args.owner}?type=container&q={args.package}", + ) or [] + versions = [v for v in versions if v.get("name") == args.package] + if not versions: - print(f"no versions found for {args.owner}/{args.package}") + print(f"no versions found for {args.owner}/{args.package} — nothing to GC") return 0 cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days) + versions.sort(key=_parse_created, reverse=True) # newest first - # Date-tagged versions (YYYY.MM.DD), newest first - date_tagged = [] - for v in versions: - tags = v.get("tags") or [] - for t in tags: - if len(t) == 10 and t[4] == "." 
and t[7] == ".": - date_tagged.append((t, v)) - break - date_tagged.sort(key=lambda kv: kv[0], reverse=True) - keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]} + keep: list[tuple[str, str]] = [] # (tag, reason) + delete: list[dict] = [] + other_kept = 0 - deleted = 0 for v in versions: - tags = v.get("tags") or [] - if "latest" in tags: + tag = v.get("version", "") + created = _parse_created(v) + if tag == "latest": + keep.append((tag, "always-keep (:latest)")) continue - if any(t in keep_date_tags for t in tags): + if tag.startswith("corpus-"): + keep.append((tag, "production pin (corpus-*)")) continue - try: - created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) - except (KeyError, ValueError): + if other_kept < args.keep_latest: + other_kept += 1 + keep.append((tag, f"keep-latest #{other_kept}/{args.keep_latest}")) continue if created >= cutoff: + keep.append((tag, f"within --keep-days ({args.keep_days})")) continue - version_id = v.get("id") - print(f" deleting v{version_id} tags={tags} created={v['created_at']}") - if not args.dry_run: + delete.append(v) + + print(f"=== {args.owner}/{args.package}: {len(versions)} total tag(s) ===") + for tag, reason in keep: + print(f" KEEP {tag:<28} {reason}") + for v in delete: + print(f" DEL {v['version']:<28} created={v['created_at']}") + + if not delete: + print("nothing to delete") + return 0 + if args.dry_run: + print(f"--dry-run; would delete {len(delete)} tag(s)") + return 0 + + failed = 0 + for v in delete: + tag = v["version"] + try: api(token, "DELETE", - f"/api/v1/packages/{args.owner}/container/{args.package}/versions/{version_id}") - deleted += 1 - print(f"done: {deleted} version(s) deleted") - return 0 + f"/api/v1/packages/{args.owner}/container/{args.package}/{tag}") + print(f" ✓ deleted {tag}") + except HTTPError as e: + print(f" ✗ failed {tag}: HTTP {e.code} {e.reason}", file=sys.stderr) + failed += 1 + + print(f"done: deleted {len(delete) - failed} / {len(delete)} 
tag(s)") + return 0 if failed == 0 else 1 if __name__ == "__main__":