From 717426f873d527f202af70be82a72dc5bff36cf4 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Sat, 23 May 2026 20:41:56 -0400 Subject: [PATCH] scrape: route corpus via PPLS_CORPUS_ROOT env var Both scrapers now honor PPLS_CORPUS_ROOT so the corpus can land on external storage (USB drive, NAS mount, secondary partition) without editing the repo. Default behavior unchanged: corpus/ at repo root when the env var is unset. Per-source subdirectory layout preserved: ${PPLS_CORPUS_ROOT}/bayer/, ${PPLS_CORPUS_ROOT}/epa_ppls/, etc. Live-verified against /run/media/justin/USB (vfat, 59GB free): PPLS_CORPUS_ROOT=/run/media/justin/USB/ppls-corpus \ python -m scrape.runner --source epa_ppls --reg-no 524-475 -> wrote to USB, root disk untouched Co-Authored-By: Claude Opus 4.7 (1M context) --- scrape/README.md | 17 +++++++++++++++++ scrape/sources/bayer.py | 6 +++++- scrape/sources/epa_ppls.py | 7 ++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/scrape/README.md b/scrape/README.md index e1f371c..219b67f 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -41,6 +41,23 @@ Every scraper is **idempotent** by default — re-running with the same arguments skips records already on disk. Use `--force` to re-fetch. +### Corpus location + +Default: `corpus/` at the repo root. Override with the +`PPLS_CORPUS_ROOT` env var to route the corpus to external storage +(USB drive, NAS mount, secondary partition): + +```bash +export PPLS_CORPUS_ROOT=/mnt/big-disk/ppls-corpus +python -m scrape.runner --source bayer --limit 20 +# writes to /mnt/big-disk/ppls-corpus/bayer/... +``` + +All sources honor the same env var; each creates its own +`<source>/` subdirectory beneath it. Per-source code paths +still resolve `CORPUS_DIR` correctly whether the env var is set +or not. 
+ ## Scope: corn / soybeans / wheat The corpus is scoped to the three crops the consumer app focuses on: diff --git a/scrape/sources/bayer.py b/scrape/sources/bayer.py index ed0da6f..e21b968 100644 --- a/scrape/sources/bayer.py +++ b/scrape/sources/bayer.py @@ -63,8 +63,12 @@ PRODUCT_CLASS = { } # Repo root: scrape/sources/bayer.py -> repo root is 3 parents up. +# Corpus root is overridable via PPLS_CORPUS_ROOT for routing the +# corpus to external storage (USB drive, NAS mount, etc.) without +# editing the repo. REPO_ROOT = Path(__file__).resolve().parents[2] -CORPUS_DIR = REPO_ROOT / "corpus" / "bayer" +CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "bayer" # Politeness: target ~1 req/sec to Bayer. Each HTTP method goes through # a tiny token-bucket sleeper to enforce this without per-call asyncio. diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index 39bfbe6..e0613cf 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -48,6 +48,7 @@ import argparse import io import json import logging +import os import re import sys import time @@ -72,7 +73,11 @@ PPLS_INDEX_URL_TEMPLATE = ( ) REPO_ROOT = Path(__file__).resolve().parents[2] -CORPUS_DIR = REPO_ROOT / "corpus" / "epa_ppls" +# Corpus root is overridable via PPLS_CORPUS_ROOT for routing the +# corpus to external storage (USB drive, NAS mount, etc.) without +# editing the repo. +CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "epa_ppls" REQUEST_DELAY_SECONDS = 1.1 # polite: ~1 req/sec HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0)