name: Monthly corpus refresh

# Runs the full pipeline: scrape all sources → rebuild indexes →
# push image. Cron'd once a month (1st @ 06:00 UTC). Skip the
# reindex + image-push if the scrape produced no diff against the
# committed corpus.
#
# Bayer takes ~30 min; EPA PPLS takes ~7 h with row-crop +
# registrant filters. The whole monthly job is ~8-9 h end-to-end.
# If that's too long for the runner you can:
#   - Run just one source: workflow_dispatch with sources="bayer"
#   - Limit EPA at the scraper: edit the step to add "--limit 5000"

on:
  schedule:
    - cron: "0 6 1 * *"  # 1st of each month, 06:00 UTC
  workflow_dispatch:
    inputs:
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false
      sources:
        description: "Sources to scrape (comma-separated, blank = all)"
        type: string
        default: ""

env:
  # Self-hosted Gitea registry on the same LAN as the runner.
  # REGISTRY_PUSH is the address used for `docker login` / `docker push`;
  # REGISTRY_PULL is the hostname used for the Gitea HTTPS API call below.
  REGISTRY_PUSH: "192.168.0.2:1234"
  REGISTRY_PULL: git.jpaul.io
  IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
  # Embedder pool for the reindex step. Two Ollama instances on the
  # Gitea/runner host (one per GPU) + the Windows Ollama. Trashpanda's
  # Ollama is production-shared; CI doesn't hit it.
  OLLAMA_URL: http://192.168.0.2:11434,http://192.168.0.2:11435,http://192.168.0.125:11434
  EMBED_MODEL: nomic-embed-text
  PRODUCT_NAME: crop_chem

jobs:
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history rather than a shallow clone — presumably so the
          # fetch + rebase in the commit step has the ancestry it needs.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Phase 1: scrape ---------------------------------------
      # Each scraper runs when `sources` is blank (scheduled runs, or a
      # manual run with the default) or when its name appears in the list.
      - name: Scrape Bayer
        if: ${{ inputs.sources == '' || contains(inputs.sources, 'bayer') }}
        run: python -m scrape.runner --source bayer --force

      - name: Scrape EPA PPLS
        if: ${{ inputs.sources == '' || contains(inputs.sources, 'epa_ppls') }}
        # Row-crop + registrant filters keep this to ~16K PDFs / ~7h.
        # Pass --no-row-crop-filter or --no-registrant-filter to broaden.
        run: python -m scrape.runner --source epa_ppls --force

      # ---- Commit corpus changes + retry-on-race -----------------
      # Sets the step output `changed` to 'true' or 'false'; the later
      # steps gate on it. A failed push is retried up to 3 times, rebasing
      # onto origin/main between attempts.
      - name: Commit corpus changes (if any)
        id: commit
        run: |
          git config user.name "crop-chem-docs-refresh"
          git config user.email "actions@jpaul.io"
          git add sources.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"
          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          n_bayer=$(find corpus/bayer -name '*.json' 2>/dev/null | wc -l)
          n_epa=$(find corpus/epa_ppls -name '*.json' 2>/dev/null | wc -l)
          git commit -m "monthly refresh: ${ts} — bayer=${n_bayer} epa_ppls=${n_epa}"
          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts"; exit 1
            fi
            git fetch origin main
            git rebase origin/main || { echo "rebase conflict"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Rebuild Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image ------------------------------------
      - name: Log in to Gitea container registry
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          # Hand the secret to the shell through the environment instead of
          # expanding it into the script text, matching the link and prune
          # steps below.
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          printf '%s' "${REGISTRY_TOKEN}" \
            | docker login "${REGISTRY_PUSH}" \
                -u "${{ github.repository_owner }}" --password-stdin

      - name: Build & push image
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        # Tags: :latest (Watchtower target), :<sha12> (rollback pin),
        # :corpus-<YYYY.MM.DD> (links image to corpus version so
        # Drawbar can pin to a specific corpus snapshot).
        run: |
          # Tag with the commit that is actually checked out and being built.
          # GITHUB_SHA is the commit that triggered the run; if the commit
          # step above created (and possibly rebased) a corpus commit, HEAD
          # is ahead of it and is the commit that contains this corpus.
          SHA_TAG=$(git rev-parse HEAD | cut -c1-12)
          CORPUS_TAG="corpus-$(date -u +%Y.%m.%d)"
          docker build \
            -t "${REGISTRY_PUSH}/${IMAGE}:latest" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${CORPUS_TAG}" \
            .
          docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${CORPUS_TAG}"

      - name: Link container package to this repo
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          OWNER="${{ github.repository_owner }}"
          PKG="${{ github.event.repository.name }}"
          BODY=$(mktemp)
          # Capture the response body in a temp file and the status code on
          # stdout so both can be logged and the status inspected.
          CODE=$(curl -sS -o "$BODY" -w "%{http_code}" -X POST \
            -H "Authorization: token ${GITEA_TOKEN}" \
            "https://${REGISTRY_PULL}/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
          echo "link http=$CODE body=$(cat "$BODY")"
          # 201 = newly linked; 400 is treated as "already linked"; anything
          # else fails the step.
          case "$CODE" in
            201) echo "linked package to ${OWNER}/${PKG}" ;;
            400) echo "already linked — ok" ;;
            *) echo "unexpected status $CODE"; exit 1 ;;
          esac

      - name: Prune old container versions
        # GC requires broader scope than REGISTRY_TOKEN's push perms
        # (HTTP 403 on /packages/.../versions). Non-critical housekeeping.
        # TODO: issue separate PAT with admin:package scope.
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        # Best-effort: a failure here must not fail the job.
        continue-on-error: true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
        run: |
          python scripts/registry_gc.py \
            --owner "${{ github.repository_owner }}" \
            --package "${{ github.event.repository.name }}" \
            --keep-days 180 \
            --keep-latest 6