name: Weekly docs refresh

# Runs the full pipeline: scrape upstream → rebuild indexes → push
# image. Cron'd weekly (Mondays). Skip the reindex + image-push if the
# scrape produced no diff against the committed corpus.
#
# IMPORTANT: actions/checkout@v4 fetch-depth: 0 is required because
# the digest-history step walks git log up to --history-days back.
# With a shallow checkout the history file ships empty.

on:
  schedule:
    - cron: "0 6 * * 1"  # Mondays 06:00 UTC
  workflow_dispatch:
    inputs:
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false

env:
  # PUSH goes to the LAN endpoint (HTTP) to bypass Cloudflare Tunnel's
  # 100 MB body cap. PULL uses the public hostname (HTTPS). Same Gitea
  # registry either way — package lands under the same owner/repo.
  REGISTRY_PUSH: 192.168.0.2:1234
  REGISTRY_PULL: git.jpaul.io
  # Image name derives from the repo at runtime — clones don't need to
  # edit this. github.* is the Gitea-Actions inherited namespace.
  IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
  # Two GPU-pinned Ollama containers on the Gitea host — same infra
  # zerto-docs uses (deploy/ollama-rag.docker-compose.yml over there).
  # :11435 owns the Titan X, :11436 owns the 1080 Ti; the indexer
  # round-robins per batch so both cards run in parallel. The host's
  # primary Ollama on :11434 is left alone for OpenWebUI etc.
  OLLAMA_URLS: http://192.168.0.2:11435,http://192.168.0.2:11436
  EMBED_MODEL: nomic-embed-text
  PRODUCT_NAME: morpheus

jobs:
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history — required for digest-history. Default depth 1
          # silently produces a 0-byte history file.
          fetch-depth: 0
          # Set the credentials Gitea injects so we can push corpus
          # commits back. Persist them across the run.
          token: ${{ secrets.GITEA_TOKEN }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Phase 1: scrape ---------------------------------------
      - name: Refresh bundle catalog
        run: python -m scrape.bundles

      - name: Re-scrape all bundles
        # --force re-fetches every page so we actually see upstream
        # edits. Without it the runner skips pages already on disk.
        run: python -m scrape.runner --all --force --concurrency 6

      # ---- Build the digest history BEFORE committing ------------
      # See PLAN.md Phase 13. Walks recent corpus-touching commits
      # and writes corpus/.digest/history.jsonl. The current refresh
      # gets added on the NEXT run's history (one-week lag is fine).
      - name: Build digest history
        run: |
          mkdir -p corpus/.digest
          python -m scrape.changelog \
            --history-out corpus/.digest/history.jsonl \
            --history-days 120

      # ---- Commit + retry-on-race --------------------------------
      - name: Commit corpus changes (if any)
        # Exposes the step output `changed` (true/false); the later
        # reindex / image steps are gated on it.
        id: commit
        run: |
          git config user.name "hvm-docs-refresh"
          git config user.email "actions@jpaul.io"
          git add bundles.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"
          python -m scrape.changelog --cached > /tmp/changelog.txt
          summary=$(head -1 /tmp/changelog.txt)
          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          {
            echo "weekly refresh: ${ts} — ${summary}"
            echo ""
            cat /tmp/changelog.txt
          } > /tmp/commitmsg.txt
          git commit -F /tmp/commitmsg.txt
          # Retry on race: if main moved while we were scraping (a
          # human merged a PR during the run), `git push` rejects
          # with "fetch first". Rebase our corpus commit onto new
          # main and retry. Corpus + code paths are disjoint, so
          # the rebase is trivially clean.
          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts — bailing"
              exit 1
            fi
            git fetch origin main
            git rebase origin/main || { echo "rebase conflict — bailing"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Reindex Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image (LAN endpoint, buildx) -------------
      - name: Set up Docker Buildx
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        uses: docker/setup-buildx-action@v3
        with:
          # LAN registry is HTTP only. Buildkit needs an explicit
          # insecure-registry config or it tries to upgrade to HTTPS.
          config-inline: |
            [registry."192.168.0.2:1234"]
              http = true
              insecure = true

      - name: Configure registry credentials for buildx
        # Can't use docker/login-action against the LAN endpoint —
        # the host docker daemon errors on HTTP-vs-HTTPS. Buildx reads
        # ~/.docker/config.json directly, so write the auth ourselves.
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          REGISTRY_USER: ${{ github.actor }}
        # NOTE(review): this copy of the workflow is damaged inside the
        # script below. Everything between the `<` redirect on the `cat`
        # line and `${PKG}` is missing — presumably the heredoc body for
        # config.json plus the steps that followed it (image build/push
        # and the head of a package-link step, judging by the section
        # header above and the surviving /tmp/link.out handling). The
        # surviving text is kept verbatim and is NOT runnable as-is;
        # restore the lost span from version control before use.
        run: |
          mkdir -p ~/.docker
          AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0)
          cat > ~/.docker/config.json < ${PKG}: HTTP ${code}"
          body=$(cat /tmp/link.out)
          case "$code" in
            201) echo "OK — newly linked" ;;
            400|409) echo "OK — already linked: ${body}" ;;
            *) echo "unexpected: ${body}"; exit 1 ;;
          esac

      # ---- Registry GC -------------------------------------------
      - name: Prune old container versions
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          python scripts/registry_gc.py \
            --owner "${{ github.repository_owner }}" \
            --package "${{ github.event.repository.name }}" \
            --keep-days 90 \
            --keep-latest 5