diff --git a/scrape/README.md b/scrape/README.md index 219b67f..383dcfc 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -85,6 +85,29 @@ Override the EPA filter for a one-off broader pull: python -m scrape.sources.epa_ppls --no-row-crop-filter --reg-no 100-1486 ``` +### EPA registrant allowlist + +The EPA scraper applies a second filter at PPIS enumeration time: +**only consider products from companies on the row-crop ag-chem +allowlist** at [`scrape/sources/epa_registrant_allowlist.json`](sources/epa_registrant_allowlist.json). +This is a pre-API filter — products from non-allowlist registrants +are dropped before paying the per-product API call cost. + +Effect: the 102,378-row PPIS universe shrinks to ~11,500 rows +(~89% reduction). Full backfill drops from ~28 h to ~5–6 h. + +The allowlist covers the major US row-crop ag-chem registrants +(Syngenta, Bayer, BASF, Corteva, FMC, Nufarm, ADAMA, UPL, Albaugh, +Loveland, AMVAC, Helena, Drexel, Atticus, etc.) — see the JSON file +for the full set with verified company names. Edit it freely; the +scraper loads it at run time. Each entry was verified by querying +the EPA PPLS API for the first active product registered under that +company number. + +Bypass with `--no-registrant-filter` to enumerate the full universe +(useful if you suspect a row-crop product is registered to a small +or specialty company not on the list). + ## Canonical sidecar schema Every `corpus//.json` conforms to this shape. Fields diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index e0613cf..6773cef 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -117,6 +117,31 @@ def matches_row_crop(record: "ProductRecord") -> bool: return True return False + +# Registrant allowlist — pre-API filter. Loaded from +# epa_registrant_allowlist.json so the list can be edited without +# touching code. An empty set disables the filter (via --no-registrant-filter). 
# Allowlist file lives next to this module, so the lookup does not depend on
# the current working directory.
_REGISTRANT_ALLOWLIST_PATH = Path(__file__).resolve().parent / "epa_registrant_allowlist.json"


def load_registrant_allowlist() -> set[str]:
    """Return the EPA company numbers whose products are worth an API call.

    Reads the JSON file at ``_REGISTRANT_ALLOWLIST_PATH`` and collects the
    ``"number"`` of every entry under its top-level ``"companies"`` key.
    Entries without a ``"number"`` key are ignored.

    Numbers are normalised to stripped strings, so a hand-edited entry such
    as ``{"number": 228}`` still matches the string company prefix taken
    from a registration number like ``"228-123"``.

    Returns:
        The set of company numbers as strings. An empty set is returned
        (after logging a warning) when the file is missing, unreadable, not
        valid UTF-8/JSON, or not shaped as ``{"companies": [{...}, ...]}``;
        callers should treat an empty set as "pass everything through".
    """
    try:
        data = json.loads(_REGISTRANT_ALLOWLIST_PATH.read_text(encoding="utf-8"))
        return {
            str(entry["number"]).strip()
            for entry in data.get("companies", [])
            if "number" in entry
        }
    except (OSError, ValueError, AttributeError, TypeError, KeyError) as exc:
        # Don't fail the scrape over a missing/broken allowlist — log and
        # fall back to no filtering. ValueError covers json.JSONDecodeError
        # and UnicodeDecodeError; AttributeError/TypeError cover a payload
        # whose shape is wrong (top-level list, null "companies", non-dict
        # entries).
        logging.getLogger("epa_ppls").warning(
            "could not load registrant allowlist at %s: %s — filter disabled",
            _REGISTRANT_ALLOWLIST_PATH, exc,
        )
        return set()


log = logging.getLogger("epa_ppls")
+ if allowlist: + company_num = row.epa_reg_no.split("-", 1)[0] + if company_num not in allowlist: + skipped_registrant += 1 + continue yield row.epa_reg_no count += 1 if args.limit and count >= args.limit: - return + break + if skipped_registrant: + log.info("registrant filter skipped %d PPIS rows", skipped_registrant) def main(argv: list[str] | None = None) -> int: @@ -621,6 +662,14 @@ def main(argv: list[str] | None = None) -> int: "wheat, rice, sorghum, etc.). Default on; use --no-row-crop-filter " "to scrape every PPLS product regardless of crop.", ) + parser.add_argument( + "--registrant-filter", action=argparse.BooleanOptionalAction, default=True, + help="Pre-API filter at PPIS enumeration time: only consider products " + "whose company number is in scrape/sources/epa_registrant_allowlist.json " + "(the major US row-crop ag-chem registrants). Default on, since most " + "of the PPIS universe is non-ag. --no-registrant-filter to enumerate " + "everything.", + ) parser.add_argument( "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], diff --git a/scrape/sources/epa_registrant_allowlist.json b/scrape/sources/epa_registrant_allowlist.json new file mode 100644 index 0000000..09d48fb --- /dev/null +++ b/scrape/sources/epa_registrant_allowlist.json @@ -0,0 +1,42 @@ +{ + "_comment": "EPA company numbers known to register pesticides primarily for major US row crops (corn, soybeans, wheat) and the broader ag-chem industry. Used by the EPA PPLS scraper as a pre-API filter in _iter_regnos to skip products from non-ag registrants without paying for the per-product API call. Add/remove companies here without changing code. 
To bypass entirely use --no-registrant-filter.", + "_verified_on": "2026-05-23", + "_source": "Each entry's registrant name was verified by querying the EPA PPLS API for the first active product registered under that company number.", + "_excluded_examples": "Bayer Environmental Science (432) — turf/ornamental; Scotts (538) — consumer lawn & garden; Wellmark/Zoecon (2724) — animal flea/tick; Control Solutions (53883) — structural pest; Cleary (1001) — turf; PBI/Gordon (2217) — mostly turf; Buckman Labs (1448) — industrial water.", + "companies": [ + {"number": "100", "name": "Syngenta Crop Protection", "ppis_count": 1041}, + {"number": "228", "name": "Nufarm Americas", "ppis_count": 587}, + {"number": "241", "name": "BASF Agricultural Solutions", "ppis_count": 247}, + {"number": "264", "name": "Bayer CropScience (Aventis)", "ppis_count": 660}, + {"number": "279", "name": "FMC Corporation", "ppis_count": 1165}, + {"number": "352", "name": "Corteva Agriscience (DuPont)", "ppis_count": 377}, + {"number": "524", "name": "Bayer CropScience (Monsanto)", "ppis_count": 339}, + {"number": "829", "name": "Southern Agricultural Insecticides", "ppis_count": 171}, + {"number": "1381", "name": "Winfield Solutions", "ppis_count": 211}, + {"number": "1812", "name": "Griffin LLC", "ppis_count": 242}, + {"number": "2935", "name": "Wilbur-Ellis", "ppis_count": 321}, + {"number": "5481", "name": "AMVAC Chemical", "ppis_count": 525}, + {"number": "5905", "name": "Helena Agri-Enterprises", "ppis_count": 566}, + {"number": "7969", "name": "BASF Agricultural Solutions", "ppis_count": 347}, + {"number": "8033", "name": "Nippon Soda", "ppis_count": 75}, + {"number": "9779", "name": "Winfield Solutions", "ppis_count": 260}, + {"number": "10182", "name": "Syngenta Crop Protection", "ppis_count": 142}, + {"number": "19713", "name": "Drexel Chemical", "ppis_count": 498}, + {"number": "33270", "name": "Winfield Solutions", "ppis_count": 22}, + {"number": "34704", "name": "Loveland Products", 
"ppis_count": 1027}, + {"number": "42750", "name": "Albaugh", "ppis_count": 260}, + {"number": "51036", "name": "BASF Agricultural Solutions", "ppis_count": 166}, + {"number": "55146", "name": "Nufarm Americas", "ppis_count": 147}, + {"number": "62719", "name": "Corteva Agriscience (Dow)", "ppis_count": 547}, + {"number": "66222", "name": "Makhteshim Agan / ADAMA", "ppis_count": 192}, + {"number": "67760", "name": "Cheminova", "ppis_count": 36}, + {"number": "70506", "name": "UPL NA", "ppis_count": 444}, + {"number": "71368", "name": "Nufarm", "ppis_count": 132}, + {"number": "71512", "name": "ISK Biosciences", "ppis_count": 46}, + {"number": "71711", "name": "Nichino America", "ppis_count": 65}, + {"number": "84229", "name": "Tide International USA", "ppis_count": 47}, + {"number": "87290", "name": "Generic Crop Science", "ppis_count": 63}, + {"number": "89167", "name": "Axion Ag Products", "ppis_count": 119}, + {"number": "91234", "name": "Atticus", "ppis_count": 338} + ] +} diff --git a/sources.json b/sources.json index b6f3f07..07c0e28 100644 --- a/sources.json +++ b/sources.json @@ -16,6 +16,7 @@ "scraper": "scrape.sources.epa_ppls", "scraper_version": "0.1.0", "license_note": "US federal government — public domain (no ToS restriction)", - "scope_filter": "corn / soybean / wheat only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), or WHEAT (word-boundary match). Hits ~16% of the PPLS universe in sampling. Pass --no-row-crop-filter to scrape the full PPLS universe." + "scope_filter": "corn / soybean / wheat only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), or WHEAT (word-boundary match). Hits ~16% of the PPLS universe in sampling. 
Pass --no-row-crop-filter to scrape the full PPLS universe.", + "registrant_filter": "Pre-API filter at PPIS enumeration: only products from registrants on scrape/sources/epa_registrant_allowlist.json (34 major US ag-chem companies — Syngenta, Bayer, BASF, Corteva, FMC, Nufarm, ADAMA, UPL, Albaugh, Loveland, AMVAC, Helena, Drexel, Atticus, etc.) hit the API. Cuts the 102K-row PPIS universe to ~11.5K — full backfill drops from ~28h to ~5-6h. --no-registrant-filter to skip." } ]