name: Weekly docs refresh

# Runs the full pipeline: scrape upstream → rebuild indexes → push
# image. Cron'd weekly (Mondays). Skip the reindex + image-push if the
# scrape produced no diff against the committed corpus.
#
# IMPORTANT: actions/checkout@v4 fetch-depth: 0 is required because
# the digest-history step walks git log up to --history-days back.
# With a shallow checkout the history file ships empty.

on:
  schedule:
    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
  workflow_dispatch:
    inputs:
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false

env:
  # If your registry sits behind Cloudflare with its 100 MB body cap,
  # use a LAN endpoint for pushes (bypasses CF) and the public hostname
  # for pulls (response bodies aren't capped).
  #
  # NOTE(review): both registry values look like unfilled template
  # placeholders (no host in either) — presumably host:port for pushes
  # and a hostname for pulls. Set them before enabling this workflow;
  # they are quoted here only so the file parses.
  REGISTRY_PUSH: ":"
  REGISTRY_PULL: ""
  # Image name derives from the actual repo at runtime, so a clone
  # doesn't need to find/replace anything. e.g. justin/my-product-docs.
  # github.* context is Gitea Actions' inherited GitHub-Actions namespace
  # — values come from the Gitea server, not github.com.
  IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
  # Embedder. One URL per GPU; the indexer round-robins.
  # NOTE(review): the URL has no host — looks like a lost placeholder.
  OLLAMA_URL: "http://:11434"
  EMBED_MODEL: nomic-embed-text
  # NOTE(review): empty — presumably meant to be filled in per product.
  PRODUCT_NAME: ""

jobs:
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history — required for the digest-history step to
          # walk git log. Default fetch-depth: 1 silently produces a
          # 0-byte history file.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Phase 1: scrape ---------------------------------------
      - name: Refresh bundle catalog
        run: python -m scrape.bundles

      - name: Re-scrape all bundles
        # --force re-fetches every page so we actually see upstream
        # edits. Without it the runner skips pages already on disk.
        run: python -m scrape.runner --all --force --concurrency 6

      # ---- Build the digest history BEFORE committing ------------
      # See PLAN.md Phase 13. Walks recent corpus-touching commits
      # and writes corpus/.digest/history.jsonl. The current refresh
      # gets added on the NEXT run's history (one-week lag is fine).
      - name: Build digest history
        run: |
          mkdir -p corpus/.digest
          python -m scrape.changelog \
            --history-out corpus/.digest/history.jsonl \
            --history-days 120

      # ---- Commit + retry-on-race --------------------------------
      # Sets the step output `changed` to 'true' or 'false'; every
      # later step is gated on it (or on the force_build input).
      - name: Commit corpus changes (if any)
        id: commit
        run: |
          # NOTE(review): both identity values look truncated (leading
          # "-" in the name, nothing after "@" in the email) — confirm
          # the intended values.
          git config user.name "-docs-refresh"
          git config user.email "actions@"
          git add bundles.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"

          # Commit message: one summary line built from the first line
          # of the changelog, a blank line, then the full changelog.
          python -m scrape.changelog --cached > /tmp/changelog.txt
          summary=$(head -1 /tmp/changelog.txt)
          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          {
            echo "weekly refresh: ${ts} — ${summary}"
            echo ""
            cat /tmp/changelog.txt
          } > /tmp/commitmsg.txt
          git commit -F /tmp/commitmsg.txt

          # Retry on race: if main moved while we were scraping (a
          # human merged a PR during the run), `git push` rejects
          # with "fetch first". Rebase our corpus commit onto new
          # main and retry. Corpus + code paths are disjoint, so
          # the rebase is trivially clean.
          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts — bailing"
              exit 1
            fi
            git fetch origin main
            git rebase origin/main || { echo "rebase conflict — bailing"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Reindex Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image ------------------------------------
      - name: Log in to registry (LAN endpoint)
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          # Passed through the environment instead of being expanded
          # inline in the script, so shell metacharacters in the token
          # cannot break or alter the command.
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          printf '%s\n' "${REGISTRY_TOKEN}" \
            | docker login "${REGISTRY_PUSH}" -u "${{ github.repository_owner }}" --password-stdin

      - name: Build & push image
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        # Runner shell is /bin/sh — use cut instead of ${VAR::N}.
        # Three tags: :latest (Watchtower target), :${SHA_TAG}
        # (12-char commit SHA, rollback pin), :${DATE_TAG}
        # (UTC date, human-readable).
        run: |
          # Tag with the commit that is actually checked out. After the
          # commit step HEAD is the new (possibly rebased) corpus commit,
          # which GITHUB_SHA — the commit that triggered the run — does
          # not include. When nothing was committed the two are equal.
          HEAD_SHA=$(git rev-parse HEAD)
          SHA_TAG=$(echo "$HEAD_SHA" | cut -c1-12)
          DATE_TAG=$(date -u +%Y.%m.%d)
          docker build \
            -t "${REGISTRY_PUSH}/${IMAGE}:latest" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \
            .
          docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}"

      - name: Link container package to this repo
        # Gitea container packages are owned by a USER, not a repo —
        # they don't auto-appear under the repo's Packages tab.
        # This API call creates the association. One-time-effective:
        # re-running returns 400 once linked, which we swallow.
        # Endpoint requires Gitea 1.21+.
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          OWNER="${{ github.repository_owner }}"
          PKG="${{ github.event.repository.name }}"
          BODY=$(mktemp)
          CODE=$(curl -sS -o "$BODY" -w "%{http_code}" -X POST \
            -H "Authorization: token ${GITEA_TOKEN}" \
            "https://${REGISTRY_PULL}/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
          echo "link http=$CODE body=$(cat "$BODY")"
          # The response body has been logged; drop the temp file.
          rm -f "$BODY"
          case "$CODE" in
            201) echo "linked package to ${OWNER}/${PKG}" ;;
            400) echo "already linked (re-link returns 400) — ok" ;;
            *)   echo "unexpected status $CODE"; exit 1 ;;
          esac

      # ---- Registry GC -------------------------------------------
      - name: Prune old container versions
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          python scripts/registry_gc.py \
            --owner "${{ github.repository_owner }}" \
            --package "${{ github.event.repository.name }}" \
            --keep-days 90 \
            --keep-latest 5