fix(registry_gc): correct Gitea packages API + Cloudflare-friendly UA (#2)

This commit was merged in pull request #2.
This commit is contained in:
2026-05-22 13:44:43 -04:00
parent 8743fff510
commit 761552fe69
+70 -41
View File
@@ -1,42 +1,58 @@
"""Gitea container-registry garbage collection. """Gitea container-registry garbage collection.
Lists package versions for one container package and deletes versions Lists tagged versions of one container package and deletes old ones.
older than --keep-days. Always preserves: Always preserves:
- the :latest tag - the `latest` tag (Watchtower's auto-deploy target)
- the --keep-latest most-recent date-tagged versions - the `--keep-latest` most-recent date-tagged versions (YYYY.MM.DD)
- anything pushed in the last --keep-days days - the `--keep-latest` most-recent short-SHA tags (rollback pins)
- anything pushed within `--keep-days` days
The actual disk reclaim happens on Gitea's next package GC cron (admin OCI blob-level versions (`sha256:...`) are never touched directly — those
site settings). This script just marks the versions for deletion. are managed by Gitea's internal package GC cron when their last tag
goes away.
Usage: Usage:
python scripts/registry_gc.py \\ GITEA_TOKEN=... python scripts/registry_gc.py \\
--owner <user> \\ --owner justin \\
--package <product>-docs-mcp \\ --package hvm-docs \\
--keep-days 90 \\ --keep-days 90 \\
--keep-latest 5 --keep-latest 5
Auth: reads GITEA_TOKEN from env (set in the workflow as a secret). The Gitea endpoint shape (confirmed 2026-05-22 against git.jpaul.io):
GET /api/v1/packages/{owner}/container/{package}
-> [{id, version, created_at, ...}, ...]
DELETE /api/v1/packages/{owner}/container/{package}/{version}
""" """
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json
import os import os
import re
import sys import sys
from datetime import datetime, timedelta, timezone from datetime import datetime, timedelta, timezone
from urllib.request import Request, urlopen
from urllib.error import HTTPError from urllib.error import HTTPError
import json from urllib.parse import quote
from urllib.request import Request, urlopen
GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io") GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io")
DATE_TAG = re.compile(r"^\d{4}\.\d{2}\.\d{2}$")
SHA_TAG = re.compile(r"^[0-9a-f]{7,40}$") # short or full git SHA
BLOB_VER = re.compile(r"^sha256:") # OCI blob versions — skip
def api(token: str, method: str, path: str) -> object: def api(token: str, method: str, path: str) -> object:
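# Explicit User-Agent: git.jpaul.io is behind Cloudflare, whose default
# Bot Fight Mode rejects urllib's default `Python-urllib/X.Y` UA with
# HTTP 403 (Cloudflare error 1010). Any explicit, recognizable UA passes:
# browser/curl-style, or a descriptive one like the one sent below.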
req = Request(f"{GITEA_HOST}{path}", req = Request(f"{GITEA_HOST}{path}",
headers={"Authorization": f"token {token}"}, headers={
"Authorization": f"token {token}",
"User-Agent": "hvm-docs-registry-gc/1.0",
},
method=method) method=method)
try: try:
with urlopen(req, timeout=30) as r: with urlopen(req, timeout=30) as r:
@@ -63,44 +79,57 @@ def main() -> int:
return 1 return 1
versions = api(token, "GET", versions = api(token, "GET",
f"/api/v1/packages/{args.owner}/container/{args.package}/versions") or [] f"/api/v1/packages/{args.owner}/container/{args.package}") or []
if not versions: if not versions:
print(f"no versions found for {args.owner}/{args.package}") print(f"no versions found for {args.owner}/container/{args.package}")
return 0 return 0
cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days) cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days)
print(f" {len(versions)} version(s); cutoff={cutoff.isoformat()} "
f"keep_days={args.keep_days} keep_latest={args.keep_latest}")
# Date-tagged versions (YYYY.MM.DD), newest first # Sort newest first by created_at.
date_tagged = [] def parsed_ts(v: dict) -> datetime:
for v in versions:
tags = v.get("tags") or []
for t in tags:
if len(t) == 10 and t[4] == "." and t[7] == ".":
date_tagged.append((t, v))
break
date_tagged.sort(key=lambda kv: kv[0], reverse=True)
keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]}
deleted = 0
for v in versions:
tags = v.get("tags") or []
if "latest" in tags:
continue
if any(t in keep_date_tags for t in tags):
continue
try: try:
created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) return datetime.fromisoformat(v["created_at"].replace("Z", "+00:00"))
except (KeyError, ValueError): except (KeyError, ValueError):
return datetime.min.replace(tzinfo=timezone.utc)
versions.sort(key=parsed_ts, reverse=True)
# Compute the keep-set: top-N date tags + top-N sha tags + always latest.
keep_dates: list[str] = []
keep_shas: list[str] = []
for v in versions:
ver = v.get("version") or ""
if DATE_TAG.match(ver) and len(keep_dates) < args.keep_latest:
keep_dates.append(ver)
elif SHA_TAG.match(ver) and len(keep_shas) < args.keep_latest:
keep_shas.append(ver)
keep = {"latest", *keep_dates, *keep_shas}
print(f" keep tags: {sorted(keep)}")
deleted = skipped_blob = skipped_age = skipped_keep = 0
for v in versions:
ver = v.get("version") or ""
ts = parsed_ts(v)
if BLOB_VER.match(ver):
skipped_blob += 1
continue continue
if created >= cutoff: if ver in keep:
skipped_keep += 1
continue continue
version_id = v.get("id") if ts >= cutoff:
print(f" deleting v{version_id} tags={tags} created={v['created_at']}") skipped_age += 1
continue
print(f" deleting {ver!r} id={v.get('id')} created={v.get('created_at')}")
if not args.dry_run: if not args.dry_run:
api(token, "DELETE", api(token, "DELETE",
f"/api/v1/packages/{args.owner}/container/{args.package}/versions/{version_id}") f"/api/v1/packages/{args.owner}/container/{args.package}/{quote(ver, safe='')}")
deleted += 1 deleted += 1
print(f"done: {deleted} version(s) deleted")
print(f"done: deleted={deleted} kept_named={skipped_keep} "
f"kept_recent={skipped_age} skipped_blobs={skipped_blob}")
return 0 return 0