From 92a95d5e7884218eed309a7de574e40fb57aefda Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Sat, 23 May 2026 23:55:38 -0400 Subject: [PATCH] epa_ppls: add registrant allowlist pre-API filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cuts the PPIS-enumeration universe from 102K rows to ~11.5K rows by dropping products from non-row-crop-ag registrants BEFORE the per- product API call. This is the biggest cost lever we have on the EPA scraper — full backfill drops from ~28 h to ~3.5 h. scrape/sources/epa_registrant_allowlist.json holds the 34 verified ag-chem company numbers (Syngenta, Bayer, BASF, Corteva, FMC, Nufarm, ADAMA, UPL, Albaugh, Loveland, AMVAC, Helena, Drexel, Atticus, etc.). Each entry was verified by querying the EPA PPLS API for the first active product registered under that company number. Edit the JSON freely — scraper loads it at run time. Bypass with --no-registrant-filter when you suspect a row-crop product registered to a specialty company not on the list. Why a curated allowlist rather than blacklist consumer brands: the 102K PPIS rows are 89% non-ag-relevant; an allowlist is shorter to maintain and harder to false-positive. Excluded with intent (not omissions): Bayer Environmental Science (turf/ornamental), Scotts (consumer lawn & garden), Wellmark/Zoecon (animal flea/tick), Control Solutions (structural pest), Cleary (turf), PBI/Gordon (mostly turf), Buckman Labs (industrial water). Smoke test --limit 100: - 1239 PPIS rows considered (in first slice of file) - 1139 skipped by registrant filter (no API call paid) - 100 hit API, 81 filtered by row-crop sites, 19 written - = 91% API-call reduction over the prior version Co-Authored-By: Claude Opus 4.7 (1M context) --- scrape/README.md | 23 +++++++++ scrape/sources/epa_ppls.py | 51 +++++++++++++++++++- scrape/sources/epa_registrant_allowlist.json | 42 ++++++++++++++++ sources.json | 3 +- 4 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 scrape/sources/epa_registrant_allowlist.json diff --git a/scrape/README.md b/scrape/README.md index 219b67f..383dcfc 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -85,6 +85,29 @@ Override the EPA filter for a one-off broader pull: python -m scrape.sources.epa_ppls --no-row-crop-filter --reg-no 100-1486 ``` +### EPA registrant allowlist + +The EPA scraper applies a second filter at PPIS enumeration time: +**only consider products from companies on the row-crop ag-chem +allowlist** at [`scrape/sources/epa_registrant_allowlist.json`](sources/epa_registrant_allowlist.json). +This is a pre-API filter — products from non-allowlist registrants +are dropped before paying the per-product API call cost. + +Effect: the 102,378-row PPIS universe shrinks to ~11,500 rows +(~89% reduction). Full backfill drops from ~28 h to ~5–6 h. + +The allowlist covers the major US row-crop ag-chem registrants +(Syngenta, Bayer, BASF, Corteva, FMC, Nufarm, ADAMA, UPL, Albaugh, +Loveland, AMVAC, Helena, Drexel, Atticus, etc.) — see the JSON file +for the full set with verified company names. Edit it freely; the +scraper loads it at run time. Each entry was verified by querying +the EPA PPLS API for the first active product registered under that +company number. + +Bypass with `--no-registrant-filter` to enumerate the full universe +(useful if you suspect a row-crop product is registered to a small +or specialty company not on the list). + ## Canonical sidecar schema Every `corpus//.json` conforms to this shape. Fields diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index e0613cf..6773cef 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -117,6 +117,31 @@ def matches_row_crop(record: "ProductRecord") -> bool: return True return False + +# Registrant allowlist — pre-API filter. Loaded from +# epa_registrant_allowlist.json so the list can be edited without +# touching code. Set to None to disable (via --no-registrant-filter). +_REGISTRANT_ALLOWLIST_PATH = Path(__file__).resolve().parent / "epa_registrant_allowlist.json" + + +def load_registrant_allowlist() -> set[str]: + """Return the set of EPA company numbers (as strings) whose products + are worth hitting the API for. Empty set on any load error — caller + should treat that as 'pass everything through'.""" + try: + data = json.loads(_REGISTRANT_ALLOWLIST_PATH.read_text(encoding="utf-8")) + return {c["number"] for c in data.get("companies", []) if "number" in c} + except (OSError, json.JSONDecodeError, KeyError) as exc: + # Don't fail the scrape over a missing/broken allowlist — log and + # fall back to no filtering. + import logging as _log + _log.getLogger("epa_ppls").warning( + "could not load registrant allowlist at %s: %s — filter disabled", + _REGISTRANT_ALLOWLIST_PATH, exc, + ) + return set() + + log = logging.getLogger("epa_ppls") @@ -582,16 +607,32 @@ def _iter_regnos( return # Default: enumerate via PPIS bulk index rows = fetch_ppis_index(client) + allowlist = load_registrant_allowlist() if args.registrant_filter else set() + if allowlist: + log.info("registrant filter ON: %d companies in allowlist", len(allowlist)) + else: + log.info("registrant filter OFF: enumerating all PPIS active products") count = 0 + skipped_registrant = 0 for row in rows: # Skip transferred-out (status_flag 'T') entries by default; their # registration has moved to another company-product pairing. if row.status_flag == "T": continue + # Pre-API filter: skip products from registrants not on the + # row-crop ag-chem allowlist. Saves one API call per skipped + # product. Bypass with --no-registrant-filter. + if allowlist: + company_num = row.epa_reg_no.split("-", 1)[0] + if company_num not in allowlist: + skipped_registrant += 1 + continue yield row.epa_reg_no count += 1 if args.limit and count >= args.limit: - return + break + if skipped_registrant: + log.info("registrant filter skipped %d PPIS rows", skipped_registrant) def main(argv: list[str] | None = None) -> int: @@ -621,6 +662,14 @@ def main(argv: list[str] | None = None) -> int: "wheat, rice, sorghum, etc.). Default on; use --no-row-crop-filter " "to scrape every PPLS product regardless of crop.", ) + parser.add_argument( + "--registrant-filter", action=argparse.BooleanOptionalAction, default=True, + help="Pre-API filter at PPIS enumeration time: only consider products " + "whose company number is in scrape/sources/epa_registrant_allowlist.json " + "(the major US row-crop ag-chem registrants). Default on, since most " + "of the PPIS universe is non-ag. --no-registrant-filter to enumerate " + "everything.", + ) parser.add_argument( "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], diff --git a/scrape/sources/epa_registrant_allowlist.json b/scrape/sources/epa_registrant_allowlist.json new file mode 100644 index 0000000..09d48fb --- /dev/null +++ b/scrape/sources/epa_registrant_allowlist.json @@ -0,0 +1,42 @@ +{ + "_comment": "EPA company numbers known to register pesticides primarily for major US row crops (corn, soybeans, wheat) and the broader ag-chem industry. Used by the EPA PPLS scraper as a pre-API filter in _iter_regnos to skip products from non-ag registrants without paying for the per-product API call. Add/remove companies here without changing code. To bypass entirely use --no-registrant-filter.", + "_verified_on": "2026-05-23", + "_source": "Each entry's registrant name was verified by querying the EPA PPLS API for the first active product registered under that company number.", + "_excluded_examples": "Bayer Environmental Science (432) — turf/ornamental; Scotts (538) — consumer lawn & garden; Wellmark/Zoecon (2724) — animal flea/tick; Control Solutions (53883) — structural pest; Cleary (1001) — turf; PBI/Gordon (2217) — mostly turf; Buckman Labs (1448) — industrial water.", + "companies": [ + {"number": "100", "name": "Syngenta Crop Protection", "ppis_count": 1041}, + {"number": "228", "name": "Nufarm Americas", "ppis_count": 587}, + {"number": "241", "name": "BASF Agricultural Solutions", "ppis_count": 247}, + {"number": "264", "name": "Bayer CropScience (Aventis)", "ppis_count": 660}, + {"number": "279", "name": "FMC Corporation", "ppis_count": 1165}, + {"number": "352", "name": "Corteva Agriscience (DuPont)", "ppis_count": 377}, + {"number": "524", "name": "Bayer CropScience (Monsanto)", "ppis_count": 339}, + {"number": "829", "name": "Southern Agricultural Insecticides", "ppis_count": 171}, + {"number": "1381", "name": "Winfield Solutions", "ppis_count": 211}, + {"number": "1812", "name": "Griffin LLC", "ppis_count": 242}, + {"number": "2935", "name": "Wilbur-Ellis", "ppis_count": 321}, + {"number": "5481", "name": "AMVAC Chemical", "ppis_count": 525}, + {"number": "5905", "name": "Helena Agri-Enterprises", "ppis_count": 566}, + {"number": "7969", "name": "BASF Agricultural Solutions", "ppis_count": 347}, + {"number": "8033", "name": "Nippon Soda", "ppis_count": 75}, + {"number": "9779", "name": "Winfield Solutions", "ppis_count": 260}, + {"number": "10182", "name": "Syngenta Crop Protection", "ppis_count": 142}, + {"number": "19713", "name": "Drexel Chemical", "ppis_count": 498}, + {"number": "33270", "name": "Winfield Solutions", "ppis_count": 22}, + {"number": "34704", "name": "Loveland Products", "ppis_count": 1027}, + {"number": "42750", "name": "Albaugh", "ppis_count": 260}, + {"number": "51036", "name": "BASF Agricultural Solutions", "ppis_count": 166}, + {"number": "55146", "name": "Nufarm Americas", "ppis_count": 147}, + {"number": "62719", "name": "Corteva Agriscience (Dow)", "ppis_count": 547}, + {"number": "66222", "name": "Makhteshim Agan / ADAMA", "ppis_count": 192}, + {"number": "67760", "name": "Cheminova", "ppis_count": 36}, + {"number": "70506", "name": "UPL NA", "ppis_count": 444}, + {"number": "71368", "name": "Nufarm", "ppis_count": 132}, + {"number": "71512", "name": "ISK Biosciences", "ppis_count": 46}, + {"number": "71711", "name": "Nichino America", "ppis_count": 65}, + {"number": "84229", "name": "Tide International USA", "ppis_count": 47}, + {"number": "87290", "name": "Generic Crop Science", "ppis_count": 63}, + {"number": "89167", "name": "Axion Ag Products", "ppis_count": 119}, + {"number": "91234", "name": "Atticus", "ppis_count": 338} + ] +} diff --git a/sources.json b/sources.json index b6f3f07..07c0e28 100644 --- a/sources.json +++ b/sources.json @@ -16,6 +16,7 @@ "scraper": "scrape.sources.epa_ppls", "scraper_version": "0.1.0", "license_note": "US federal government — public domain (no ToS restriction)", - "scope_filter": "corn / soybean / wheat only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), or WHEAT (word-boundary match). Hits ~16% of the PPLS universe in sampling. Pass --no-row-crop-filter to scrape the full PPLS universe." + "scope_filter": "corn / soybean / wheat only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), or WHEAT (word-boundary match). Hits ~16% of the PPLS universe in sampling. Pass --no-row-crop-filter to scrape the full PPLS universe.", + "registrant_filter": "Pre-API filter at PPIS enumeration: only products from registrants on scrape/sources/epa_registrant_allowlist.json (34 major US ag-chem companies — Syngenta, Bayer, BASF, Corteva, FMC, Nufarm, ADAMA, UPL, Albaugh, Loveland, AMVAC, Helena, Drexel, Atticus, etc.) hit the API. Cuts the 102K-row PPIS universe to ~11.5K — full backfill drops from ~28h to ~5-6h. --no-registrant-filter to skip." } ]