From 761552fe69686257336ba20bf6946229aa22e728 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Fri, 22 May 2026 13:44:43 -0400 Subject: [PATCH] fix(registry_gc): correct Gitea packages API + Cloudflare-friendly UA (#2) --- scripts/registry_gc.py | 111 ++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 41 deletions(-) diff --git a/scripts/registry_gc.py b/scripts/registry_gc.py index 41bbc52..0e900ca 100644 --- a/scripts/registry_gc.py +++ b/scripts/registry_gc.py @@ -1,42 +1,58 @@ """Gitea container-registry garbage collection. -Lists package versions for one container package and deletes versions -older than --keep-days. Always preserves: +Lists tagged versions of one container package and deletes old ones. +Always preserves: - - the :latest tag - - the --keep-latest most-recent date-tagged versions - - anything pushed in the last --keep-days days + - the `latest` tag (Watchtower's auto-deploy target) + - the `--keep-latest` most-recent date-tagged versions (YYYY.MM.DD) + - the `--keep-latest` most-recent short-SHA tags (rollback pins) + - anything pushed within `--keep-days` days -The actual disk reclaim happens on Gitea's next package GC cron (admin -site settings). This script just marks the versions for deletion. +OCI blob-level versions (`sha256:...`) are never touched directly — those +are managed by Gitea's internal package GC cron when their last tag +goes away. Usage: - python scripts/registry_gc.py \\ - --owner \\ - --package -docs-mcp \\ + GITEA_TOKEN=... python scripts/registry_gc.py \\ + --owner justin \\ + --package hvm-docs \\ --keep-days 90 \\ --keep-latest 5 -Auth: reads GITEA_TOKEN from env (set in the workflow as a secret). +The Gitea endpoint shape (confirmed 2026-05-22 against git.jpaul.io): + + GET /api/v1/packages/{owner}/container/{package} + -> [{id, version, created_at, ...}, ...] + DELETE /api/v1/packages/{owner}/container/{package}/{version} """ from __future__ import annotations import argparse +import json import os +import re import sys from datetime import datetime, timedelta, timezone -from urllib.request import Request, urlopen from urllib.error import HTTPError -import json - +from urllib.parse import quote +from urllib.request import Request, urlopen GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io") +DATE_TAG = re.compile(r"^\d{4}\.\d{2}\.\d{2}$") +SHA_TAG = re.compile(r"^[0-9a-f]{7,40}$") # short or full git SHA +BLOB_VER = re.compile(r"^sha256:") # OCI blob versions — skip def api(token: str, method: str, path: str) -> object: + # Explicit User-Agent: git.jpaul.io is behind Cloudflare, whose default + # Bot Fight Mode 403s `Python-urllib/X.Y` with error 1010. Any + # recognizable browser/curl-style UA passes. req = Request(f"{GITEA_HOST}{path}", - headers={"Authorization": f"token {token}"}, + headers={ + "Authorization": f"token {token}", + "User-Agent": "hvm-docs-registry-gc/1.0", + }, method=method) try: with urlopen(req, timeout=30) as r: @@ -63,44 +79,57 @@ def main() -> int: return 1 versions = api(token, "GET", - f"/api/v1/packages/{args.owner}/container/{args.package}/versions") or [] + f"/api/v1/packages/{args.owner}/container/{args.package}") or [] if not versions: - print(f"no versions found for {args.owner}/{args.package}") + print(f"no versions found for {args.owner}/container/{args.package}") return 0 cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days) + print(f" {len(versions)} version(s); cutoff={cutoff.isoformat()} " + f"keep_days={args.keep_days} keep_latest={args.keep_latest}") - # Date-tagged versions (YYYY.MM.DD), newest first - date_tagged = [] - for v in versions: - tags = v.get("tags") or [] - for t in tags: - if len(t) == 10 and t[4] == "." and t[7] == ".": - date_tagged.append((t, v)) - break - date_tagged.sort(key=lambda kv: kv[0], reverse=True) - keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]} - - deleted = 0 - for v in versions: - tags = v.get("tags") or [] - if "latest" in tags: - continue - if any(t in keep_date_tags for t in tags): - continue + # Sort newest first by created_at. + def parsed_ts(v: dict) -> datetime: try: - created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) + return datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) except (KeyError, ValueError): + return datetime.min.replace(tzinfo=timezone.utc) + + versions.sort(key=parsed_ts, reverse=True) + + # Compute the keep-set: top-N date tags + top-N sha tags + always latest. + keep_dates: list[str] = [] + keep_shas: list[str] = [] + for v in versions: + ver = v.get("version") or "" + if DATE_TAG.match(ver) and len(keep_dates) < args.keep_latest: + keep_dates.append(ver) + elif SHA_TAG.match(ver) and len(keep_shas) < args.keep_latest: + keep_shas.append(ver) + keep = {"latest", *keep_dates, *keep_shas} + print(f" keep tags: {sorted(keep)}") + + deleted = skipped_blob = skipped_age = skipped_keep = 0 + for v in versions: + ver = v.get("version") or "" + ts = parsed_ts(v) + if BLOB_VER.match(ver): + skipped_blob += 1 continue - if created >= cutoff: + if ver in keep: + skipped_keep += 1 continue - version_id = v.get("id") - print(f" deleting v{version_id} tags={tags} created={v['created_at']}") + if ts >= cutoff: + skipped_age += 1 + continue + print(f" deleting {ver!r} id={v.get('id')} created={v.get('created_at')}") if not args.dry_run: api(token, "DELETE", - f"/api/v1/packages/{args.owner}/container/{args.package}/versions/{version_id}") + f"/api/v1/packages/{args.owner}/container/{args.package}/{quote(ver, safe='')}") deleted += 1 - print(f"done: {deleted} version(s) deleted") + + print(f"done: deleted={deleted} kept_named={skipped_keep} " + f"kept_recent={skipped_age} skipped_blobs={skipped_blob}") return 0