name: Monthly seed catalog refresh

# Runs the full pipeline: scrape all GREEN sources → rebuild indexes
# → push image. Cron'd once a month (1st @ 06:00 UTC). The reindex and
# image push are skipped when the scrape produces no diff against the
# committed corpus (unless `force_build` is set on a manual dispatch).
#
# Seed catalogs move slowly (vendors release new hybrids 1-2x/year
# at field-day timing); monthly cadence is plenty.
#
# Total runtime budget: ~2-3 h end-to-end across all GREEN sources.
#   - Bayer: ~475 varieties, ~45 min.
#   - Golden Harvest plot reports: heaviest single source
#     (~4,600 docs at 1 req/sec, ~70 min).
#   - Beck's PFR (~2,089 docs via Sanity GROQ pagination): currently a
#     stub that exits 0; implementation pending.

on:
  schedule:
    - cron: "0 6 1 * *"  # 1st of each month, 06:00 UTC
  workflow_dispatch:
    inputs:
      force_build:
        description: "Rebuild indexes + push image even if corpus is unchanged"
        type: boolean
        default: false
      sources:
        description: "Sources to scrape (comma-separated, blank = all GREEN)"
        type: string
        default: ""

env:
  # Self-hosted Gitea registry on the same LAN as the runner.
  # CF caps push body at 100 MB, so push via the LAN endpoint; pull
  # through the public hostname (response bodies aren't capped).
  REGISTRY_PUSH: "192.168.0.2:1234"
  REGISTRY_PULL: git.jpaul.io
  IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
  # Embedder pool — 3 GPU-pinned endpoints reachable from the runner
  # container on .0.2. Measured throughput (50-chunk batches on
  # nomic-embed-text):
  #   .0.125:11434 (4090)        242 embeds/sec  ← weighted ×4
  #   .0.2:11436   (GPU-pinned)  108 embeds/sec  ← weighted ×2
  #   .0.2:11435   (GPU-pinned)   72 embeds/sec  ← weight 1
  # The weighting is expressed by repeating an endpoint in OLLAMA_URL
  # (4x / 2x / 1x below).
  # NOTE: .0.2:11434 is NOT GPU-pinned — exclude.
  # NOTE: `localhost:11434` works locally during dev but resolves to the
  #       runner CONTAINER's own localhost in CI (no Ollama there → 111
  #       connection refused). Use only LAN endpoints from CI.
  OLLAMA_URL: http://192.168.0.125:11434,http://192.168.0.125:11434,http://192.168.0.125:11434,http://192.168.0.125:11434,http://192.168.0.2:11436,http://192.168.0.2:11436,http://192.168.0.2:11435
  EMBED_MODEL: nomic-embed-text
  PRODUCT_NAME: crop_seed

jobs:
  refresh:
    runs-on: docker
    container:
      image: catthehacker/ubuntu:act-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Full history — required for the digest-history step
          # to walk git log. Default fetch-depth: 1 silently
          # produces a 0-byte history file.
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install -q --upgrade pip
          python -m pip install -q -r requirements.txt

      # ---- Source selection --------------------------------------
      # Emits one `<source>=true|false` output per scraper. A blank
      # `sources` input (always the case on the cron trigger) selects
      # everything. Names are matched as whole comma-separated tokens,
      # so requesting `agripro_trials` does NOT also select `agripro`
      # (a plain substring test would). Whitespace and letter case in
      # the input are ignored.
      - name: Resolve requested sources
        id: select
        env:
          # Passed through env rather than interpolated into the script
          # so free-form dispatch input is never parsed as shell.
          REQUESTED: ${{ inputs.sources }}
        run: |
          requested=$(printf '%s' "${REQUESTED}" \
            | tr -d '[:space:]' | tr '[:upper:]' '[:lower:]')
          for src in bayer_seeds golden_harvest nk agripro agripro_trials \
                     gh_plot_reports becks_pfr; do
            case ",${requested}," in
              ,, | *",${src},"*) wanted=true ;;
              *) wanted=false ;;
            esac
            echo "${src}=${wanted}" >> "$GITHUB_OUTPUT"
          done

      # ---- Phase 1: scrape ---------------------------------------
      - name: Scrape Bayer seeds (DEKALB + Asgrow + WestBred)
        if: steps.select.outputs.bayer_seeds == 'true'
        run: python -m scrape.runner --source bayer_seeds --force

      - name: Scrape Golden Harvest
        if: steps.select.outputs.golden_harvest == 'true'
        run: python -m scrape.runner --source golden_harvest --force

      - name: Scrape NK
        if: steps.select.outputs.nk == 'true'
        run: python -m scrape.runner --source nk --force

      - name: Scrape AgriPro
        if: steps.select.outputs.agripro == 'true'
        run: python -m scrape.runner --source agripro --force

      - name: Scrape AgriPro regional trial PDFs
        if: steps.select.outputs.agripro_trials == 'true'
        run: python -m scrape.runner --source agripro_trials --force

      # Heaviest single source — ~4,600 docs at 1 req/sec ≈ 70 min.
      # Accounts for the bulk of CI time; runs late so that a failure
      # in an earlier, cheaper scrape aborts the job before these
      # 70 min are spent.
      - name: Scrape Golden Harvest plot reports (cross-vendor yield trials)
        if: steps.select.outputs.gh_plot_reports == 'true'
        run: python -m scrape.runner --source gh_plot_reports --force

      # Deferred (returns 0 cleanly from a stub) — implementation
      # pending. Public Sanity GROQ at mc8v24rf.api.sanity.io.
      - name: Scrape Beck's PFR research corpus
        if: steps.select.outputs.becks_pfr == 'true'
        run: python -m scrape.runner --source becks_pfr --force

      # ---- Commit corpus changes + retry-on-race -----------------
      # Sets output `changed` to true/false; later steps key off it.
      # On a rejected push the local commit is rebased onto
      # origin/main and the push retried, up to 3 attempts in total.
      - name: Commit corpus changes (if any)
        id: commit
        run: |
          git config user.name "seed-mcp-refresh"
          git config user.email "actions@jpaul.io"
          git add sources.json corpus
          if git diff --cached --quiet; then
            echo "no corpus changes — skipping reindex and image build"
            echo "changed=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          echo "changed=true" >> "$GITHUB_OUTPUT"

          # Number of *.json documents under corpus/<source>; prints 0
          # when the directory does not exist.
          count_docs() {
            find "corpus/$1" -name '*.json' 2>/dev/null | wc -l
          }

          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
          n_bayer=$(count_docs bayer_seeds)
          n_gh=$(count_docs golden_harvest)
          n_nk=$(count_docs nk)
          n_ag=$(count_docs agripro)
          n_agt=$(count_docs agripro_trials)
          n_ghpr=$(count_docs gh_plot_reports)
          n_pfr=$(count_docs becks_pfr)
          git commit -m "monthly refresh: ${ts} — bayer=${n_bayer} gh=${n_gh} nk=${n_nk} agripro=${n_ag} ag_trials=${n_agt} gh_plot_reports=${n_ghpr} pfr=${n_pfr}"

          attempt=1
          while [ $attempt -le 3 ]; do
            if git push; then
              echo "pushed corpus changes (attempt $attempt)"
              break
            fi
            if [ $attempt -eq 3 ]; then
              echo "push still failing after 3 attempts"; exit 1
            fi
            git fetch origin main
            git rebase origin/main || { echo "rebase conflict"; exit 1; }
            attempt=$((attempt + 1))
          done

      # ---- Rebuild Chroma + BM25 ---------------------------------
      - name: Rebuild indexes
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: python -m rag.index --rebuild

      # ---- Build & push image ------------------------------------
      - name: Log in to Gitea container registry
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          # Handed over via env so the secret is never spliced into the
          # script text itself.
          REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          REGISTRY_USER: ${{ github.repository_owner }}
        run: |
          printf '%s' "${REGISTRY_TOKEN}" \
            | docker login "${REGISTRY_PUSH}" -u "${REGISTRY_USER}" --password-stdin

      # Tags:
      #   :latest                (Watchtower target)
      #   :<sha12>               (rollback pin)
      #   :corpus-<YYYY.MM.DD>   (links image to corpus version so
      #                           Drawbar can pin to a specific
      #                           seed-catalog snapshot)
      - name: Build & push image
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        run: |
          # Use the checked-out HEAD, not $GITHUB_SHA: when the commit
          # step above created (or rebased) a corpus commit, HEAD is the
          # commit whose tree is actually baked into this image, while
          # $GITHUB_SHA still names the commit that triggered the run.
          SHA_TAG=$(git rev-parse HEAD | cut -c1-12)
          CORPUS_TAG="corpus-$(date -u +%Y.%m.%d)"
          docker build \
            -t "${REGISTRY_PUSH}/${IMAGE}:latest" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
            -t "${REGISTRY_PUSH}/${IMAGE}:${CORPUS_TAG}" \
            .
          docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
          docker push "${REGISTRY_PUSH}/${IMAGE}:${CORPUS_TAG}"

      # POSTs to the Gitea package-link endpoint. 201 = newly linked,
      # 400 = treated as "already linked"; any other status fails the
      # step.
      - name: Link container package to this repo
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          PKG: ${{ github.event.repository.name }}
        run: |
          BODY=$(mktemp)
          # Remove the response-body temp file however the script exits.
          trap 'rm -f "$BODY"' EXIT
          CODE=$(curl -sS -o "$BODY" -w "%{http_code}" -X POST \
            -H "Authorization: token ${GITEA_TOKEN}" \
            "https://${REGISTRY_PULL}/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
          echo "link http=$CODE body=$(cat "$BODY")"
          case "$CODE" in
            201) echo "linked package to ${OWNER}/${PKG}" ;;
            400) echo "already linked — ok" ;;
            *) echo "unexpected status $CODE"; exit 1 ;;
          esac

      # GC requires broader scope than REGISTRY_TOKEN's push perms
      # (HTTP 403 on /packages/.../versions). Non-critical
      # housekeeping. TODO: issue separate PAT with admin:package
      # scope. Until then continue-on-error keeps a failed prune
      # from breaking the whole refresh.
      - name: Prune old container versions
        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
        continue-on-error: true
        env:
          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
          OWNER: ${{ github.repository_owner }}
          PKG: ${{ github.event.repository.name }}
        run: |
          python scripts/registry_gc.py \
            --owner "${OWNER}" \
            --package "${PKG}" \
            --keep-days 180 \
            --keep-latest 6