Files
hvm-docs/scripts/registry_gc.py
T
justin 3b9cb4f874 fix(registry_gc): correct Gitea package API + add Cloudflare-friendly UA
The script was hitting /api/v1/packages/{owner}/container/{name}/versions
which doesn't exist (Gitea returns 404 — interpreting the request as a
version named "versions"). Replaced with the actual endpoint
/api/v1/packages/{owner}/container/{name} which returns the array of
version rows directly. Delete path is now
/api/v1/packages/{owner}/container/{name}/{version} (URL-encoded version
string, not numeric ID).

Refactored the keep-set: always preserve `latest`, top --keep-latest
YYYY.MM.DD date tags AND top --keep-latest short-SHA tags (the rollback
pins) by created_at desc. Anything within --keep-days is kept; older
date/sha tags are deleted. sha256:* blob versions are skipped — Gitea's
internal package GC reclaims them when their last tag goes away.

Also added an explicit User-Agent header because git.jpaul.io sits
behind Cloudflare, whose Bot Fight Mode 403s the default
"Python-urllib/X.Y" UA with error code 1010. Affected run 104's GC
step (curl was fine; urllib was blocked).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 13:44:42 -04:00

138 lines
4.7 KiB
Python

"""Gitea container-registry garbage collection.
Lists tagged versions of one container package and deletes old ones.
Always preserves:
- the `latest` tag (Watchtower's auto-deploy target)
- the `--keep-latest` most-recent date-tagged versions (YYYY.MM.DD)
- the `--keep-latest` most-recent short-SHA tags (rollback pins)
- anything pushed within `--keep-days` days
OCI blob-level versions (`sha256:...`) are never touched directly — those
are managed by Gitea's internal package GC cron when their last tag
goes away.
Usage:
GITEA_TOKEN=... python scripts/registry_gc.py \\
--owner justin \\
--package hvm-docs \\
--keep-days 90 \\
--keep-latest 5
The Gitea endpoint shape (confirmed 2026-05-22 against git.jpaul.io):
GET /api/v1/packages/{owner}/container/{package}
-> [{id, version, created_at, ...}, ...]
DELETE /api/v1/packages/{owner}/container/{package}/{version}
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from datetime import datetime, timedelta, timezone
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.request import Request, urlopen
# Base URL of the Gitea instance; override with the GITEA_HOST environment
# variable. Trailing slashes are stripped because request paths are appended
# to this value verbatim and already begin with "/", so "https://host/" would
# otherwise produce "https://host//api/...".
GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io").rstrip("/")
# Version-name classifiers used when deciding what to keep or delete.
DATE_TAG = re.compile(r"^\d{4}\.\d{2}\.\d{2}$")  # YYYY.MM.DD date tags
SHA_TAG = re.compile(r"^[0-9a-f]{7,40}$")  # short or full git SHA
BLOB_VER = re.compile(r"^sha256:")  # OCI blob versions — skip
def api(token: str, method: str, path: str) -> object:
    """Send one authenticated request to the Gitea API and decode the reply.

    Args:
        token: Gitea access token, sent as ``Authorization: token <token>``.
        method: HTTP verb for the request (``"GET"``, ``"DELETE"``, ...).
        path: Request path, appended directly to ``GITEA_HOST``.

    Returns:
        The JSON-decoded response body, or ``None`` when the body is empty
        or the server answers with HTTP 404.

    Raises:
        HTTPError: For any HTTP error status other than 404.
    """
    # An explicit User-Agent is required: git.jpaul.io sits behind
    # Cloudflare, whose default Bot Fight Mode rejects the stock
    # `Python-urllib/X.Y` agent with a 403 (error 1010). Any recognizable
    # browser/curl-style UA passes.
    request_headers = {
        "Authorization": f"token {token}",
        "User-Agent": "hvm-docs-registry-gc/1.0",
    }
    request = Request(GITEA_HOST + path, headers=request_headers, method=method)

    try:
        with urlopen(request, timeout=30) as response:
            payload = response.read()
    except HTTPError as err:
        # A 404 is reported to the caller as "nothing there"; every other
        # HTTP error propagates unchanged.
        if err.code != 404:
            raise
        return None

    if not payload:
        return None
    return json.loads(payload)
def main() -> int:
    """Run one garbage-collection pass over a Gitea container package.

    Reads ``--owner``, ``--package``, ``--keep-days``, ``--keep-latest`` and
    ``--dry-run`` from the command line and the access token from the
    ``GITEA_TOKEN`` environment variable, lists the package's versions and
    deletes every version that is not protected by one of these rules:

    - versions whose name starts with ``sha256:`` are never touched;
    - ``latest`` plus the ``--keep-latest`` newest date tags and the
      ``--keep-latest`` newest SHA tags are kept;
    - versions created within the last ``--keep-days`` days are kept;
    - versions with no name, or with a missing/unparseable ``created_at``,
      are kept and reported on stderr, because their age cannot be
      established and deletion is irreversible.

    With ``--dry-run`` no DELETE request is sent; the ``deleted`` figure in
    the summary then counts the versions that would have been deleted.

    Returns:
        Process exit code: 1 when ``GITEA_TOKEN`` is not set, otherwise 0.

    Raises:
        HTTPError: Propagated from ``api`` for any non-404 HTTP failure.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--owner", required=True,
                   help="owner (user or organisation) of the package")
    p.add_argument("--package", required=True,
                   help="name of the container package")
    p.add_argument("--keep-days", type=int, default=90,
                   help="never delete versions newer than this many days")
    p.add_argument("--keep-latest", type=int, default=5,
                   help="number of newest date tags and of newest SHA tags to keep")
    p.add_argument("--dry-run", action="store_true",
                   help="report what would be deleted without deleting")
    args = p.parse_args()

    token = os.environ.get("GITEA_TOKEN")
    if not token:
        print("GITEA_TOKEN not set", file=sys.stderr)
        return 1

    # NOTE(review): only a single response is consumed here. If this endpoint
    # paginates, versions beyond the first page are never considered —
    # confirm against the Gitea API reference.
    versions = api(token, "GET",
                   f"/api/v1/packages/{args.owner}/container/{args.package}") or []
    if not versions:
        print(f"no versions found for {args.owner}/container/{args.package}")
        return 0

    cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days)
    print(f" {len(versions)} version(s); cutoff={cutoff.isoformat()} "
          f"keep_days={args.keep_days} keep_latest={args.keep_latest}")

    def parsed_ts(v: dict) -> "datetime | None":
        """Return ``created_at`` as an aware datetime, or None if unusable."""
        raw = v.get("created_at")
        if not isinstance(raw, str):
            return None
        try:
            ts = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            return None
        if ts.tzinfo is None:
            # A timestamp without an offset cannot be compared with the
            # aware cutoff; assume it is expressed in UTC.
            ts = ts.replace(tzinfo=timezone.utc)
        return ts

    # Sort newest first by created_at. Entries without a usable timestamp
    # rank last, so they never displace a dated entry from the keep-set.
    oldest = datetime.min.replace(tzinfo=timezone.utc)
    versions.sort(key=lambda v: parsed_ts(v) or oldest, reverse=True)

    # Compute the keep-set: top-N date tags + top-N sha tags + always latest.
    keep_dates: list[str] = []
    keep_shas: list[str] = []
    for v in versions:
        ver = v.get("version") or ""
        if DATE_TAG.match(ver) and len(keep_dates) < args.keep_latest:
            keep_dates.append(ver)
        elif SHA_TAG.match(ver) and len(keep_shas) < args.keep_latest:
            keep_shas.append(ver)
    keep = {"latest", *keep_dates, *keep_shas}
    print(f" keep tags: {sorted(keep)}")

    deleted = skipped_blob = skipped_age = skipped_keep = skipped_invalid = 0
    for v in versions:
        ver = v.get("version") or ""
        if not ver:
            # An empty name would turn the DELETE path into the bare package
            # URL; never issue that request.
            print(f" skipping id={v.get('id')}: entry has no version name",
                  file=sys.stderr)
            skipped_invalid += 1
            continue
        if BLOB_VER.match(ver):
            skipped_blob += 1
            continue
        if ver in keep:
            skipped_keep += 1
            continue
        ts = parsed_ts(v)
        if ts is None:
            # Fail safe: unknown age must not be treated as "infinitely old".
            print(f" skipping {ver!r} id={v.get('id')}: missing or unparseable "
                  f"created_at={v.get('created_at')!r}", file=sys.stderr)
            skipped_invalid += 1
            continue
        if ts >= cutoff:
            skipped_age += 1
            continue
        print(f" deleting {ver!r} id={v.get('id')} created={v.get('created_at')}")
        if not args.dry_run:
            api(token, "DELETE",
                f"/api/v1/packages/{args.owner}/container/{args.package}/{quote(ver, safe='')}")
        deleted += 1

    print(f"done: deleted={deleted} kept_named={skipped_keep} "
          f"kept_recent={skipped_age} skipped_blobs={skipped_blob}")
    if skipped_invalid:
        print(f" warning: kept {skipped_invalid} version(s) with no name or no "
              f"usable created_at", file=sys.stderr)
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())