From 60657aa6dfcc4b302adcce23e8ab6cfd8841e65f Mon Sep 17 00:00:00 2001
From: Justin Paul <justin@jpaul.me>
Date: Sat, 23 May 2026 19:05:26 -0400
Subject: [PATCH] epa_ppls: filter PPLS enumeration to row-crop products
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The farmer-advisor consumer only cares about US row crops, so the EPA
scraper now drops products without at least one row-crop site in the
PPLS API response. The filter is on by default; --no-row-crop-filter
disables it for one-off broader pulls.

Filter shape:
  - Word-boundary regex match against each entry in the API's `sites`
    array (e.g., "SOYBEANS (FOLIAR TREATMENT)" → keep, "SHIPS, BOATS,
    SHIPHOLDS" → drop even though it contains "OATS" as a substring).
  - Allowlist covers the major US row + small-grain + oilseed + sugar/
    fiber crops, plus alfalfa as a common rotation crop. See
    ROW_CROP_KEYWORDS in scrape/sources/epa_ppls.py for the full list.

Cost model:
  - 102K PPIS rows still need one API call each (no bulk filter
    available upstream), so enumeration still takes ~28h at 1 req/sec.
  - But PDF downloads drop from ~67K to ~5-10K (estimated row-crop
    hit rate), saving ~17h wall time and ~60GB disk on a full backfill.

Smoke test (4 mixed reg nos):
  524-475 Roundup Ultra        → kept (CORN/SOYBEANS/COTTON sites)
  524-591 Warrant              → kept (CORN/SOYBEANS/SORGHUM sites)
  100-1486 Advion Cockroach    → filtered (building/transport sites only)
  432-1276 (Bayer pet flea)    → filtered (no row crops)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scrape/README.md           | 21 +++++++++++++
 scrape/sources/epa_ppls.py | 62 ++++++++++++++++++++++++++++++++++++--
 sources.json               |  3 +-
 3 files changed, 82 insertions(+), 4 deletions(-)
diff --git a/scrape/README.md b/scrape/README.md
index 99d8ae7..2dd1fd0 100644
--- a/scrape/README.md
+++ b/scrape/README.md
@@ -41,6 +41,27 @@ Every scraper is **idempotent** by default — re-running with the
 same arguments skips records already on disk. Use `--force` to
 re-fetch.
 
+## Scope: row crops only
+
+The corpus is scoped to **US row crops** — corn, soybeans, cotton,
+wheat, rice, sorghum/milo, barley, oats, rye, sunflowers, peanuts,
+sugar beets, dry/field beans, canola/rapeseed, and alfalfa. The
+EPA PPLS scraper enforces this by inspecting the `sites` array on
+each product's PPLS API response and dropping anything without a
+row-crop site (word-boundary match).
+
+The Bayer scraper doesn't filter — its catalog is implicitly
+ag-focused, and dropping fungicide/insecticide/seed-treatment
+products there would lose row-crop-relevant chemistry. Add
+per-source filters as needed if other manufacturer sources cover
+non-ag products.
+
+Override the EPA filter for a one-off broader pull:
+
+```bash
+python -m scrape.sources.epa_ppls --no-row-crop-filter --reg-no 100-1486
+```
+
 ## Canonical sidecar schema
 
 Every `corpus/<source>/<key>.json` conforms to this shape. Fields
diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py
index e8f3938..77af57f 100644
--- a/scrape/sources/epa_ppls.py
+++ b/scrape/sources/epa_ppls.py
@@ -78,6 +78,46 @@ REQUEST_DELAY_SECONDS = 1.1  # polite: ~1 req/sec
 HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0)
 MAX_RETRIES = 4
 
+# Row-crop scoping. Each pattern is matched case-insensitively against every
+# entry of a product's "sites" array from the PPLS API. Word boundaries matter:
+# a plain substring test for "OATS" would hit "SHIPS, BOATS, SHIPHOLDS", "RICE"
+# would hit "LICORICE", and "RYE" would hit "FRYER".
+#
+# Scope = the major US row + small-grain + oilseed + sugar/fiber crops the
+# farmer-advisor consumer cares about. Alfalfa included as a common rotation
+# crop; sweet/seed corn included alongside field corn.
+ROW_CROP_KEYWORDS = (
+    "CORN", "MAIZE", "POPCORN",
+    "SOYBEAN", "SOYBEANS",
+    "COTTON",
+    "WHEAT",
+    "RICE",
+    "SORGHUM", "MILO",
+    "BARLEY", "OATS", "RYE",
+    "SUNFLOWER", "SUNFLOWERS",
+    "PEANUT", "PEANUTS",
+    "SUGAR BEET", "SUGAR BEETS",
+    "DRY BEAN", "DRY BEANS", "FIELD BEAN", "FIELD BEANS",
+    "CANOLA", "RAPESEED",
+    "ALFALFA",
+)
+_ROW_CROP_PATTERNS = tuple(
+    re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)
+    for kw in ROW_CROP_KEYWORDS
+)
+
+
+def matches_row_crop(record: "ProductRecord") -> bool:
+    """True if the product's PPLS API sites array contains at least one
+    row-crop site (CORN, SOYBEANS, COTTON, etc., with word boundaries)."""
+    item = record.raw_api_item or {}
+    sites = item.get("sites") or []
+    for s in sites:
+        site = (s.get("site") or "") if isinstance(s, dict) else str(s)
+        if any(p.search(site) for p in _ROW_CROP_PATTERNS):
+            return True
+    return False
+
 log = logging.getLogger("epa_ppls")
 
 
@@ -382,8 +422,10 @@ def process_one(
     regno: str,
     *,
     force: bool = False,
+    row_crop_filter: bool = True,
 ) -> str:
-    """Fetch + extract one product. Returns 'skipped'|'wrote'|'no-pdf'|'error'."""
+    """Fetch + extract one product. Returns
+    'skipped'|'wrote'|'no-pdf'|'error'|'filtered'."""
     md_path = _md_path(regno)
     json_path = _json_path(regno)
     if not force and md_path.exists() and json_path.exists():
@@ -397,6 +439,10 @@ def process_one(
         return "error"
     time.sleep(REQUEST_DELAY_SECONDS)
 
+    if row_crop_filter and not matches_row_crop(record):
+        log.info("[%s] filtered (not row-crop)", regno)
+        return "filtered"
+
     def _build_sidecar(
         *,
         label_url: str | None,
@@ -570,6 +616,12 @@ def main(argv: list[str] | None = None) -> int:
         "--seed-file", metavar="PATH",
         help="Text file with one EPA Reg No per line (# comments OK).",
     )
+    parser.add_argument(
+        "--row-crop-filter", action=argparse.BooleanOptionalAction, default=True,
+        help="Keep only products with row-crop sites (corn, soy, cotton, "
+             "wheat, rice, sorghum, etc.). Default on; use --no-row-crop-filter "
+             "to scrape every PPLS product regardless of crop.",
+    )
     parser.add_argument(
         "--log-level", default="INFO",
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
@@ -584,10 +636,14 @@ def main(argv: list[str] | None = None) -> int:
 
     CORPUS_DIR.mkdir(parents=True, exist_ok=True)
 
-    summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "error": 0}
+    summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "filtered": 0, "error": 0}
     with _client() as client:
         for regno in _iter_regnos(args, client):
-            result = process_one(client, regno, force=args.force)
+            result = process_one(
+                client, regno,
+                force=args.force,
+                row_crop_filter=args.row_crop_filter,
+            )
             summary[result] = summary.get(result, 0) + 1
 
     log.info("done: %s", summary)
diff --git a/sources.json b/sources.json
index 0c92a42..29d304d 100644
--- a/sources.json
+++ b/sources.json
@@ -15,6 +15,7 @@
     "homepage": "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1",
     "scraper": "scrape.sources.epa_ppls",
     "scraper_version": "0.1.0",
-    "license_note": "US federal government — public domain (no ToS restriction)"
+    "license_note": "US federal government — public domain (no ToS restriction)",
+    "scope_filter": "row-crop only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), COTTON, WHEAT, RICE, SORGHUM, MILO, BARLEY, OATS, RYE, SUNFLOWER(S), PEANUT(S), SUGAR BEET(S), DRY/FIELD BEAN(S), CANOLA, RAPESEED, or ALFALFA (word-boundary match). Pass --no-row-crop-filter to scrape the full PPLS universe."
   }
 ]