From 60657aa6dfcc4b302adcce23e8ab6cfd8841e65f Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Sat, 23 May 2026 19:05:26 -0400 Subject: [PATCH] epa_ppls: filter PPLS enumeration to row-crop products MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The farmer-advisor consumer only cares about US row crops, so the EPA scraper now drops products without at least one row-crop site in the PPLS API response. Filter is on by default; --no-row-crop-filter overrides for one-off broader pulls. Filter shape: - Word-boundary regex match against each entry in the API's `sites` array (e.g., "SOYBEANS (FOLIAR TREATMENT)" → keep, "SHIPS, BOATS, SHIPHOLDS" → drop even though it contains "OATS" as substring). - Allowlist covers the major US row + small-grain + oilseed + sugar/ fiber crops, plus alfalfa as a common rotation crop. See ROW_CROP_KEYWORDS in scrape/sources/epa_ppls.py for the full list. Cost model: - 102K PPIS rows still need one API call each (no bulk filter available upstream), so enumeration still takes ~28h at 1 req/sec. - But PDF downloads drop from ~67K → ~5-10K (estimated row-crop hit rate), saving ~17h wall time and ~60GB disk on a full backfill. Smoke test (4 mixed reg nos): 524-475 Roundup Ultra → kept (CORN/SOYBEANS/COTTON sites) 524-591 Warrant → kept (CORN/SOYBEANS/SORGHUM sites) 100-1486 Advion Cockroach → filtered (building/transport sites only) 432-1276 (Bayer pet flea) → filtered (no row crops) Co-Authored-By: Claude Opus 4.7 (1M context) --- scrape/README.md | 21 +++++++++++++ scrape/sources/epa_ppls.py | 62 ++++++++++++++++++++++++++++++++++++-- sources.json | 3 +- 3 files changed, 82 insertions(+), 4 deletions(-) diff --git a/scrape/README.md b/scrape/README.md index 99d8ae7..2dd1fd0 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -41,6 +41,27 @@ Every scraper is **idempotent** by default — re-running with the same arguments skips records already on disk. Use `--force` to re-fetch. +## Scope: row crops only + +The corpus is scoped to **US row crops** — corn, soybeans, cotton, +wheat, rice, sorghum/milo, barley, oats, rye, sunflowers, peanuts, +sugar beets, dry/field beans, canola/rapeseed, and alfalfa. The +EPA PPLS scraper enforces this by inspecting the `sites` array on +each product's PPLS API response and dropping anything without a +row-crop site (word-boundary match). + +The Bayer scraper doesn't filter — its catalog is implicitly +ag-focused, and dropping fungicide/insecticide/seed-treatment +products there would lose row-crop-relevant chemistry. Add +per-source filters as needed if other manufacturer sources cover +non-ag products. + +Override the EPA filter for a one-off broader pull: + +```bash +python -m scrape.sources.epa_ppls --no-row-crop-filter --reg-no 100-1486 +``` + ## Canonical sidecar schema Every `corpus//.json` conforms to this shape. Fields diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index e8f3938..77af57f 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -78,6 +78,46 @@ REQUEST_DELAY_SECONDS = 1.1 # polite: ~1 req/sec HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0) MAX_RETRIES = 4 +# Row-crop scoping. Each pattern is matched case-insensitively against a +# product's "sites" array from the PPLS API. Word boundaries matter — bare +# "OATS" naively matches "SHIPS, BOATS, SHIPHOLDS"; bare "RICE" matches +# "LICORICE"; bare "RYE" matches "FRYER". +# +# Scope = the major US row + small-grain + oilseed + sugar/fiber crops the +# farmer-advisor consumer cares about. Alfalfa included as a common rotation +# crop; sweet/seed corn included alongside field corn. +ROW_CROP_KEYWORDS = ( + "CORN", "MAIZE", "POPCORN", + "SOYBEAN", "SOYBEANS", + "COTTON", + "WHEAT", + "RICE", + "SORGHUM", "MILO", + "BARLEY", "OATS", "RYE", + "SUNFLOWER", "SUNFLOWERS", + "PEANUT", "PEANUTS", + "SUGAR BEET", "SUGAR BEETS", + "DRY BEAN", "DRY BEANS", "FIELD BEAN", "FIELD BEANS", + "CANOLA", "RAPESEED", + "ALFALFA", +) +_ROW_CROP_PATTERNS = tuple( + re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE) + for kw in ROW_CROP_KEYWORDS +) + + +def matches_row_crop(record: "ProductRecord") -> bool: + """True if the product's PPLS API sites array contains at least one + row-crop site (CORN, SOYBEANS, COTTON, etc., with word boundaries).""" + item = record.raw_api_item or {} + sites = item.get("sites") or [] + for s in sites: + site = (s.get("site") or "") if isinstance(s, dict) else str(s) + if any(p.search(site) for p in _ROW_CROP_PATTERNS): + return True + return False + log = logging.getLogger("epa_ppls") @@ -382,8 +422,10 @@ def process_one( regno: str, *, force: bool = False, + row_crop_filter: bool = True, ) -> str: - """Fetch + extract one product. Returns 'skipped'|'wrote'|'no-pdf'|'error'.""" + """Fetch + extract one product. Returns + 'skipped'|'wrote'|'no-pdf'|'error'|'filtered'.""" md_path = _md_path(regno) json_path = _json_path(regno) if not force and md_path.exists() and json_path.exists(): @@ -397,6 +439,10 @@ def process_one( return "error" time.sleep(REQUEST_DELAY_SECONDS) + if row_crop_filter and not matches_row_crop(record): + log.info("[%s] filtered (not row-crop)", regno) + return "filtered" + def _build_sidecar( *, label_url: str | None, @@ -570,6 +616,12 @@ def main(argv: list[str] | None = None) -> int: "--seed-file", metavar="PATH", help="Text file with one EPA Reg No per line (# comments OK).", ) + parser.add_argument( + "--row-crop-filter", action=argparse.BooleanOptionalAction, default=True, + help="Keep only products with row-crop sites (corn, soy, cotton, " + "wheat, rice, sorghum, etc.). Default on; use --no-row-crop-filter " + "to scrape every PPLS product regardless of crop.", + ) parser.add_argument( "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], @@ -584,10 +636,14 @@ def main(argv: list[str] | None = None) -> int: CORPUS_DIR.mkdir(parents=True, exist_ok=True) - summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "error": 0} + summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "filtered": 0, "error": 0} with _client() as client: for regno in _iter_regnos(args, client): - result = process_one(client, regno, force=args.force) + result = process_one( + client, regno, + force=args.force, + row_crop_filter=args.row_crop_filter, + ) summary[result] = summary.get(result, 0) + 1 log.info("done: %s", summary) diff --git a/sources.json b/sources.json index 0c92a42..29d304d 100644 --- a/sources.json +++ b/sources.json @@ -15,6 +15,7 @@ "homepage": "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1", "scraper": "scrape.sources.epa_ppls", "scraper_version": "0.1.0", - "license_note": "US federal government — public domain (no ToS restriction)" + "license_note": "US federal government — public domain (no ToS restriction)", + "scope_filter": "row-crop only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), COTTON, WHEAT, RICE, SORGHUM, MILO, BARLEY, OATS, RYE, SUNFLOWER(S), PEANUT(S), SUGAR BEET(S), DRY/FIELD BEAN(S), CANOLA, RAPESEED, or ALFALFA (word-boundary match). Pass --no-row-crop-filter to scrape the full PPLS universe." } ]