diff --git a/scrape/README.md b/scrape/README.md index 2dd1fd0..e1f371c 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -41,20 +41,26 @@ Every scraper is **idempotent** by default — re-running with the same arguments skips records already on disk. Use `--force` to re-fetch. -## Scope: row crops only +## Scope: corn / soybeans / wheat -The corpus is scoped to **US row crops** — corn, soybeans, cotton, -wheat, rice, sorghum/milo, barley, oats, rye, sunflowers, peanuts, -sugar beets, dry/field beans, canola/rapeseed, and alfalfa. The -EPA PPLS scraper enforces this by inspecting the `sites` array on -each product's PPLS API response and dropping anything without a -row-crop site (word-boundary match). +The corpus is scoped to the three crops the consumer app focuses on: +**corn (incl. maize, popcorn), soybeans, and wheat.** The EPA PPLS +scraper enforces this by inspecting the `sites` array on each +product's PPLS API response and dropping anything without a matching +site (word-boundary match against `ROW_CROP_KEYWORDS`). + +Empirically (random N=100 sample): this narrow allowlist matches +~16% of all PPLS products and only loses ~6% of the broader +"all US row crops" hit set, because corn/soy/wheat dominate ag +chemistry registrations — products registered for cotton, sorghum, +rice, etc. are almost always *also* registered for one of corn, +soy, or wheat. The Bayer scraper doesn't filter — its catalog is implicitly -ag-focused, and dropping fungicide/insecticide/seed-treatment -products there would lose row-crop-relevant chemistry. Add -per-source filters as needed if other manufacturer sources cover -non-ag products. +ag-focused, and the catalog product names + descriptions don't +expose enough crop metadata for a pre-API filter to be reliable. +Add per-source filters as needed if other manufacturer sources +turn up non-ag products. 
Override the EPA filter for a one-off broader pull: diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index 77af57f..39bfbe6 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -83,23 +83,17 @@ MAX_RETRIES = 4 # "OATS" naively matches "SHIPS, BOATS, SHIPHOLDS"; bare "RICE" matches # "LICORICE"; bare "RYE" matches "FRYER". # -# Scope = the major US row + small-grain + oilseed + sugar/fiber crops the -# farmer-advisor consumer cares about. Alfalfa included as a common rotation -# crop; sweet/seed corn included alongside field corn. +# Scope = the three crops the farmer-advisor consumer focuses on: corn, +# soybeans, and wheat. Sweet/seed/pop corn included alongside field corn. +# Empirically (random N=100 sample, 2026-05-23): this narrow allowlist +# matches ~16% of all PPLS products and only loses ~6% of the broader +# "all US row crops" hit set, because corn/soy/wheat dominate ag chemistry +# registrations — almost every product registered for e.g. cotton or +# sorghum is co-registered for at least one of corn/soy/wheat. 
ROW_CROP_KEYWORDS = ( "CORN", "MAIZE", "POPCORN", "SOYBEAN", "SOYBEANS", - "COTTON", "WHEAT", - "RICE", - "SORGHUM", "MILO", - "BARLEY", "OATS", "RYE", - "SUNFLOWER", "SUNFLOWERS", - "PEANUT", "PEANUTS", - "SUGAR BEET", "SUGAR BEETS", - "DRY BEAN", "DRY BEANS", "FIELD BEAN", "FIELD BEANS", - "CANOLA", "RAPESEED", - "ALFALFA", ) _ROW_CROP_PATTERNS = tuple( re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE) diff --git a/sources.json b/sources.json index 29d304d..b6f3f07 100644 --- a/sources.json +++ b/sources.json @@ -16,6 +16,6 @@ "scraper": "scrape.sources.epa_ppls", "scraper_version": "0.1.0", "license_note": "US federal government — public domain (no ToS restriction)", - "scope_filter": "row-crop only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), COTTON, WHEAT, RICE, SORGHUM, MILO, BARLEY, OATS, RYE, SUNFLOWER(S), PEANUT(S), SUGAR BEET(S), DRY/FIELD BEAN(S), CANOLA, RAPESEED, or ALFALFA (word-boundary match). Pass --no-row-crop-filter to scrape the full PPLS universe." + "scope_filter": "corn / soybean / wheat only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), or WHEAT (word-boundary match). Matches ~16% of the PPLS universe in a random N=100 sample. Pass --no-row-crop-filter to scrape the full PPLS universe." } ]