name: Weekly docs refresh

# Runs the full pipeline: scrape upstream → rebuild indexes → push
# image. Cron'd weekly (Mondays). The reindex + image-push steps are
# skipped when the scrape produced no diff against the committed corpus
# (unless force_build is set on a manual dispatch).
#
# IMPORTANT: actions/checkout@v4 fetch-depth: 0 is required because
# the digest-history step walks git log up to --history-days back.
# With a shallow checkout the history file ships empty.

on:
  schedule:
    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
  workflow_dispatch:
    inputs:
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false

env:
  # NOTE(review): several values below are blank or partial (":", "",
  # "/-docs-mcp", "http://:11434") and look like unfilled template
  # placeholders — confirm and set them for your deployment before
  # enabling the schedule. They are quoted so the file parses as-is.
  #
  # If your registry sits behind Cloudflare with its 100 MB body cap,
  # use a LAN endpoint for pushes (bypasses CF) and the public hostname
  # for pulls (response bodies aren't capped).
  REGISTRY_PUSH: ":"
  REGISTRY_PULL: ""
  IMAGE: "/-docs-mcp"
  # Embedder. One URL per GPU; the indexer round-robins.
  OLLAMA_URL: "http://:11434"
  EMBED_MODEL: nomic-embed-text
  PRODUCT_NAME: ""

jobs:
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history — required for the digest-history step to
          # walk git log. Default fetch-depth: 1 silently produces a
          # 0-byte history file.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Phase 1: scrape ---------------------------------------
      - name: Refresh bundle catalog
        run: python -m scrape.bundles

      - name: Re-scrape all bundles
        # --force re-fetches every page so we actually see upstream
        # edits. Without it the runner skips pages already on disk.
        run: python -m scrape.runner --all --force --concurrency 6

      # ---- Build the digest history BEFORE committing ------------
      # See PLAN.md Phase 13. Walks recent corpus-touching commits
      # and writes corpus/.digest/history.jsonl. The current refresh
      # gets added on the NEXT run's history (one-week lag is fine).
      - name: Build digest history
        run: |
          mkdir -p corpus/.digest
          python -m scrape.changelog \
            --history-out corpus/.digest/history.jsonl \
            --history-days 120

      # ---- Commit + retry-on-race --------------------------------
      # Stages bundles.json + corpus, publishes `changed=true|false` as
      # a step output (consumed by the `if:` guards further down), and
      # commits + pushes only when something is staged.
      # NOTE(review): the git user.name / user.email values below look
      # truncated (leading "-", nothing after "@") — confirm.
      - name: Commit corpus changes (if any)
        id: commit
        run: |
          git config user.name "-docs-refresh"
          git config user.email "actions@"
          git add bundles.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"

          # Commit message: first changelog line as the subject
          # summary, then the full changelog as the body.
          python -m scrape.changelog --cached > /tmp/changelog.txt
          summary=$(head -1 /tmp/changelog.txt)
          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          {
            echo "weekly refresh: ${ts} — ${summary}"
            echo ""
            cat /tmp/changelog.txt
          } > /tmp/commitmsg.txt
          git commit -F /tmp/commitmsg.txt

          # Retry on race: if main moved while we were scraping (a
          # human merged a PR during the run), `git push` rejects
          # with "fetch first". Rebase our corpus commit onto new
          # main and retry. Corpus + code paths are disjoint, so
          # the rebase is trivially clean.
          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts — bailing"
              exit 1
            fi
            git fetch origin main
            git rebase origin/main || { echo "rebase conflict — bailing"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Reindex Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image ------------------------------------
      - name: Log in to registry (LAN endpoint)
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          # Hand the secret to the shell as an environment variable
          # instead of expanding it into the script text, so quotes,
          # `$` or backticks inside the token cannot alter the command.
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        # NOTE(review): `-u` is not followed by a username here, so
        # docker would read `--password-stdin` as the username. This
        # looks like a stripped placeholder — supply the registry user.
        run: |
          echo "${REGISTRY_TOKEN}" \
            | docker login "${REGISTRY_PUSH}" -u --password-stdin

      - name: Build & push image
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        # Runner shell is /bin/sh — use cut instead of ${VAR::N}.
        # Three tags: :latest (Watchtower target), :${SHA_TAG}
        # (rollback pin), :${DATE_TAG} (human-readable).
        # NOTE(review): GITHUB_SHA is the commit the run was triggered
        # on; the corpus commit created earlier in this job is newer.
        # Confirm that pinning to the trigger commit is intended.
        run: |
          SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12)
          DATE_TAG=$(date -u +%Y.%m.%d)
          docker build \
            -t "${REGISTRY_PUSH}/${IMAGE}:latest" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \
            .
          docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}"

      # ---- Registry GC -------------------------------------------
      - name: Prune old container versions
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        # NOTE(review): `--owner` has no value and the package name
        # starts with "-"; both look like stripped placeholders —
        # fill in the real owner / package before relying on this step.
        run: |
          python scripts/registry_gc.py \
            --owner \
            --package -docs-mcp \
            --keep-days 90 \
            --keep-latest 5