diff --git a/scrape/README.md b/scrape/README.md index 99d8ae7..2dd1fd0 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -41,6 +41,27 @@ Every scraper is **idempotent** by default — re-running with the same arguments skips records already on disk. Use `--force` to re-fetch. +## Scope: row crops only + +The corpus is scoped to **US row crops** — corn, soybeans, cotton, +wheat, rice, sorghum/milo, barley, oats, rye, sunflowers, peanuts, +sugar beets, dry/field beans, canola/rapeseed, and alfalfa. The +EPA PPLS scraper enforces this by inspecting the `sites` array on +each product's PPLS API response and dropping anything without a +row-crop site (word-boundary match). + +The Bayer scraper doesn't filter — its catalog is implicitly +ag-focused, and dropping fungicide/insecticide/seed-treatment +products there would lose row-crop-relevant chemistry. Add +per-source filters as needed if other manufacturer sources cover +non-ag products. + +Override the EPA filter for a one-off broader pull: + +```bash +python -m scrape.sources.epa_ppls --no-row-crop-filter --reg-no 100-1486 +``` + ## Canonical sidecar schema Every `corpus//.json` conforms to this shape. Fields diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index e8f3938..77af57f 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -78,6 +78,46 @@ REQUEST_DELAY_SECONDS = 1.1 # polite: ~1 req/sec HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0) MAX_RETRIES = 4 +# Row-crop scoping. Each pattern is matched case-insensitively against a +# product's "sites" array from the PPLS API. Word boundaries matter — bare +# "OATS" naively matches "SHIPS, BOATS, SHIPHOLDS"; bare "RICE" matches +# "LICORICE"; bare "RYE" matches "FRYER". +# +# Scope = the major US row + small-grain + oilseed + sugar/fiber crops the +# farmer-advisor consumer cares about. Alfalfa included as a common rotation +# crop; sweet/seed corn included alongside field corn. +ROW_CROP_KEYWORDS = ( + "CORN", "MAIZE", "POPCORN", + "SOYBEAN", "SOYBEANS", + "COTTON", + "WHEAT", + "RICE", + "SORGHUM", "MILO", + "BARLEY", "OATS", "RYE", + "SUNFLOWER", "SUNFLOWERS", + "PEANUT", "PEANUTS", + "SUGAR BEET", "SUGAR BEETS", + "DRY BEAN", "DRY BEANS", "FIELD BEAN", "FIELD BEANS", + "CANOLA", "RAPESEED", + "ALFALFA", +) +_ROW_CROP_PATTERNS = tuple( + re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE) + for kw in ROW_CROP_KEYWORDS +) + + +def matches_row_crop(record: "ProductRecord") -> bool: + """True if the product's PPLS API sites array contains at least one + row-crop site (CORN, SOYBEANS, COTTON, etc., with word boundaries).""" + item = record.raw_api_item or {} + sites = item.get("sites") or [] + for s in sites: + site = (s.get("site") or "") if isinstance(s, dict) else str(s) + if any(p.search(site) for p in _ROW_CROP_PATTERNS): + return True + return False + log = logging.getLogger("epa_ppls") @@ -382,8 +422,10 @@ def process_one( regno: str, *, force: bool = False, + row_crop_filter: bool = True, ) -> str: - """Fetch + extract one product. Returns 'skipped'|'wrote'|'no-pdf'|'error'.""" + """Fetch + extract one product. Returns + 'skipped'|'wrote'|'no-pdf'|'error'|'filtered'.""" md_path = _md_path(regno) json_path = _json_path(regno) if not force and md_path.exists() and json_path.exists(): @@ -397,6 +439,10 @@ def process_one( return "error" time.sleep(REQUEST_DELAY_SECONDS) + if row_crop_filter and not matches_row_crop(record): + log.info("[%s] filtered (not row-crop)", regno) + return "filtered" + def _build_sidecar( *, label_url: str | None, @@ -570,6 +616,12 @@ def main(argv: list[str] | None = None) -> int: "--seed-file", metavar="PATH", help="Text file with one EPA Reg No per line (# comments OK).", ) + parser.add_argument( + "--row-crop-filter", action=argparse.BooleanOptionalAction, default=True, + help="Keep only products with row-crop sites (corn, soy, cotton, " + "wheat, rice, sorghum, etc.). Default on; use --no-row-crop-filter " + "to scrape every PPLS product regardless of crop.", + ) parser.add_argument( "--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], @@ -584,10 +636,14 @@ def main(argv: list[str] | None = None) -> int: CORPUS_DIR.mkdir(parents=True, exist_ok=True) - summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "error": 0} + summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "filtered": 0, "error": 0} with _client() as client: for regno in _iter_regnos(args, client): - result = process_one(client, regno, force=args.force) + result = process_one( + client, regno, + force=args.force, + row_crop_filter=args.row_crop_filter, + ) summary[result] = summary.get(result, 0) + 1 log.info("done: %s", summary) diff --git a/sources.json b/sources.json index 0c92a42..29d304d 100644 --- a/sources.json +++ b/sources.json @@ -15,6 +15,7 @@ "homepage": "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1", "scraper": "scrape.sources.epa_ppls", "scraper_version": "0.1.0", - "license_note": "US federal government — public domain (no ToS restriction)" + "license_note": "US federal government — public domain (no ToS restriction)", + "scope_filter": "row-crop only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), COTTON, WHEAT, RICE, SORGHUM, MILO, BARLEY, OATS, RYE, SUNFLOWER(S), PEANUT(S), SUGAR BEET(S), DRY/FIELD BEAN(S), CANOLA, RAPESEED, or ALFALFA (word-boundary match). Pass --no-row-crop-filter to scrape the full PPLS universe." } ]