From 717426f873d527f202af70be82a72dc5bff36cf4 Mon Sep 17 00:00:00 2001
From: Justin Paul <justin@jpaul.me>
Date: Sat, 23 May 2026 20:41:56 -0400
Subject: [PATCH] scrape: route corpus via PPLS_CORPUS_ROOT env var

Both scrapers now honor PPLS_CORPUS_ROOT so the corpus can land on
external storage (USB drive, NAS mount, secondary partition) without
editing the repo. Default behavior unchanged: corpus/ at repo root
when the env var is unset.

Per-source subdirectory layout preserved: ${PPLS_CORPUS_ROOT}/bayer/,
${PPLS_CORPUS_ROOT}/epa_ppls/, etc.

Live-verified against /run/media/justin/USB (vfat, 59GB free):
  PPLS_CORPUS_ROOT=/run/media/justin/USB/ppls-corpus \
    python -m scrape.runner --source epa_ppls --reg-no 524-475
  -> wrote to USB, root disk untouched

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scrape/README.md           | 17 +++++++++++++++++
 scrape/sources/bayer.py    |  6 +++++-
 scrape/sources/epa_ppls.py |  7 ++++++-
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/scrape/README.md b/scrape/README.md
index e1f371c..219b67f 100644
--- a/scrape/README.md
+++ b/scrape/README.md
@@ -41,6 +41,23 @@ Every scraper is **idempotent** by default — re-running with the
 same arguments skips records already on disk. Use `--force` to
 re-fetch.
 
+### Corpus location
+
+Default: `corpus/` at the repo root. Override with the
+`PPLS_CORPUS_ROOT` env var to route the corpus to external storage
+(USB drive, NAS mount, secondary partition):
+
+```bash
+export PPLS_CORPUS_ROOT=/mnt/big-disk/ppls-corpus
+python -m scrape.runner --source bayer --limit 20
+# writes to /mnt/big-disk/ppls-corpus/bayer/...
+```
+
+All sources honor the same env var; each creates its own
+`<source_id>/` subdirectory beneath it (e.g. `bayer/`, `epa_ppls/`).
+When `PPLS_CORPUS_ROOT` is unset or empty, each source falls back
+to `corpus/<source_id>/` at the repo root.
+
 ## Scope: corn / soybeans / wheat
 
 The corpus is scoped to the three crops the consumer app focuses on:
diff --git a/scrape/sources/bayer.py b/scrape/sources/bayer.py
index ed0da6f..e21b968 100644
--- a/scrape/sources/bayer.py
+++ b/scrape/sources/bayer.py
@@ -63,8 +63,12 @@ PRODUCT_CLASS = {
 }
 
 # Repo root: scrape/sources/bayer.py -> repo root is 3 parents up.
+# Corpus root is overridable via PPLS_CORPUS_ROOT to route the corpus
+# to external storage (USB drive, NAS mount, etc.) without editing the
+# repo. An unset or empty value falls back to REPO_ROOT / "corpus".
 REPO_ROOT = Path(__file__).resolve().parents[2]
-CORPUS_DIR = REPO_ROOT / "corpus" / "bayer"
+CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_DIR = CORPUS_ROOT / "bayer"
 
 # Politeness: target ~1 req/sec to Bayer. Each HTTP method goes through
 # a tiny token-bucket sleeper to enforce this without per-call asyncio.
diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py
index 39bfbe6..e0613cf 100644
--- a/scrape/sources/epa_ppls.py
+++ b/scrape/sources/epa_ppls.py
@@ -48,6 +48,7 @@ import argparse
 import io
 import json
 import logging
+import os
 import re
 import sys
 import time
@@ -72,7 +73,11 @@ PPLS_INDEX_URL_TEMPLATE = (
 )
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
-CORPUS_DIR = REPO_ROOT / "corpus" / "epa_ppls"
+# Corpus root is overridable via PPLS_CORPUS_ROOT to route the corpus
+# to external storage (USB drive, NAS mount, etc.) without editing the
+# repo. An unset or empty value falls back to REPO_ROOT / "corpus".
+CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_DIR = CORPUS_ROOT / "epa_ppls"
 
 REQUEST_DELAY_SECONDS = 1.1  # polite: ~1 req/sec
 HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0)