epa_ppls: filter PPLS enumeration to row-crop products
The farmer-advisor consumer only cares about US row crops, so the EPA
scraper now drops products without at least one row-crop site in the
PPLS API response. Filter is on by default; --no-row-crop-filter
overrides for one-off broader pulls.
Filter shape:
- Word-boundary regex match against each entry in the API's `sites`
array (e.g., "SOYBEANS (FOLIAR TREATMENT)" → keep; "SHIPS, BOATS,
SHIPHOLDS" → drop, even though it contains "OATS" as a substring).
- Allowlist covers the major US row + small-grain + oilseed + sugar/
fiber crops, plus alfalfa as a common rotation crop. See
ROW_CROP_KEYWORDS in scrape/sources/epa_ppls.py for the full list.
Cost model:
- 102K PPIS rows still need one API call each (no bulk filter
available upstream), so enumeration still takes ~28h at 1 req/sec.
- But PDF downloads drop from ~67K → ~5-10K (estimated row-crop
hit rate), saving ~17h wall time and ~60GB disk on a full backfill.
Smoke test (4 mixed reg nos):
524-475 Roundup Ultra → kept (CORN/SOYBEANS/COTTON sites)
524-591 Warrant → kept (CORN/SOYBEANS/SORGHUM sites)
100-1486 Advion Cockroach → filtered (building/transport sites only)
432-1276 (Bayer pet flea) → filtered (no row crops)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -41,6 +41,27 @@ Every scraper is **idempotent** by default — re-running with the
|
|||||||
same arguments skips records already on disk. Use `--force` to
|
same arguments skips records already on disk. Use `--force` to
|
||||||
re-fetch.
|
re-fetch.
|
||||||
|
|
||||||
|
## Scope: row crops only
|
||||||
|
|
||||||
|
The corpus is scoped to **US row crops** — corn, soybeans, cotton,
|
||||||
|
wheat, rice, sorghum/milo, barley, oats, rye, sunflowers, peanuts,
|
||||||
|
sugar beets, dry/field beans, canola/rapeseed, and alfalfa. The
|
||||||
|
EPA PPLS scraper enforces this by inspecting the `sites` array on
|
||||||
|
each product's PPLS API response and dropping anything without a
|
||||||
|
row-crop site (word-boundary match).
|
||||||
|
|
||||||
|
The Bayer scraper doesn't filter — its catalog is implicitly
|
||||||
|
ag-focused, and dropping fungicide/insecticide/seed-treatment
|
||||||
|
products there would lose row-crop-relevant chemistry. Add
|
||||||
|
per-source filters as needed if other manufacturer sources cover
|
||||||
|
non-ag products.
|
||||||
|
|
||||||
|
Override the EPA filter for a one-off broader pull:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m scrape.sources.epa_ppls --no-row-crop-filter --reg-no 100-1486
|
||||||
|
```
|
||||||
|
|
||||||
## Canonical sidecar schema
|
## Canonical sidecar schema
|
||||||
|
|
||||||
Every `corpus/<source>/<key>.json` conforms to this shape. Fields
|
Every `corpus/<source>/<key>.json` conforms to this shape. Fields
|
||||||
|
|||||||
@@ -78,6 +78,46 @@ REQUEST_DELAY_SECONDS = 1.1 # polite: ~1 req/sec
|
|||||||
HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0)
|
HTTP_TIMEOUT = httpx.Timeout(60.0, connect=15.0)
|
||||||
MAX_RETRIES = 4
|
MAX_RETRIES = 4
|
||||||
|
|
||||||
|
# Row-crop scoping. Each keyword is matched case-insensitively, with word
# boundaries, against the entries of a product's "sites" array from the PPLS
# API. The boundaries matter: a bare substring test for "OATS" would hit
# "SHIPS, BOATS, SHIPHOLDS", "RICE" would hit "LICORICE", and "RYE" would hit
# "FRYER".
#
# Scope = the major US row + small-grain + oilseed + sugar/fiber crops the
# farmer-advisor consumer cares about. Alfalfa is included as a common
# rotation crop; sweet/seed corn is included alongside field corn.
#
# Singular and plural forms are both listed because the trailing \b prevents
# e.g. "SOYBEAN" from matching inside "SOYBEANS".
ROW_CROP_KEYWORDS = (
    "CORN", "MAIZE", "POPCORN",
    "SOYBEAN", "SOYBEANS",
    "COTTON",
    "WHEAT",
    "RICE",
    "SORGHUM", "MILO",
    "BARLEY", "OATS", "RYE",
    "SUNFLOWER", "SUNFLOWERS",
    "PEANUT", "PEANUTS",
    "SUGAR BEET", "SUGAR BEETS",
    "DRY BEAN", "DRY BEANS", "FIELD BEAN", "FIELD BEANS",
    "CANOLA", "RAPESEED",
    "ALFALFA",
)

# One compiled, case-insensitive, word-bounded pattern per keyword. Compiled
# once at import time so the per-product check does no regex compilation.
_ROW_CROP_PATTERNS = tuple(
    re.compile(rf"\b{re.escape(kw)}\b", re.IGNORECASE)
    for kw in ROW_CROP_KEYWORDS
)


def _site_text(entry: object) -> str:
    """Return the site name carried by one ``sites`` entry as a string.

    An entry is either a dict with a ``"site"`` key or a bare value. Missing
    or ``None`` names become ``""``; any other non-string value is stringified
    so the regex search never receives a non-``str`` argument.
    """
    if isinstance(entry, dict):
        entry = entry.get("site")
    if entry is None:
        return ""
    return entry if isinstance(entry, str) else str(entry)


def matches_row_crop(record: "ProductRecord") -> bool:
    """Return True if the product lists at least one row-crop site.

    Reads ``record.raw_api_item["sites"]`` and tests every entry against
    ``_ROW_CROP_PATTERNS`` (CORN, SOYBEANS, COTTON, etc., word-bounded and
    case-insensitive).

    The payload is treated defensively so one malformed response neither
    aborts the run nor silently drops a row-crop product:

    * a missing, ``None`` or non-dict ``raw_api_item`` yields ``False``;
    * a missing, ``None`` or empty ``sites`` yields ``False``;
    * a ``sites`` value that is not a list/tuple (for example a single bare
      string) is treated as a one-element list instead of being iterated
      character by character;
    * entries whose site name is missing or not a string are tolerated.

    Args:
        record: Object exposing a ``raw_api_item`` attribute.

    Returns:
        ``True`` when any site entry matches a row-crop keyword, else
        ``False``.
    """
    item = record.raw_api_item
    if not isinstance(item, dict):
        return False

    sites = item.get("sites") or []
    if not isinstance(sites, (list, tuple)):
        # A scalar/dict here means a single site, not a sequence of them.
        sites = [sites]

    return any(
        pattern.search(text)
        for text in map(_site_text, sites)
        for pattern in _ROW_CROP_PATTERNS
    )
|
||||||
|
|
||||||
log = logging.getLogger("epa_ppls")
|
log = logging.getLogger("epa_ppls")
|
||||||
|
|
||||||
|
|
||||||
@@ -382,8 +422,10 @@ def process_one(
|
|||||||
regno: str,
|
regno: str,
|
||||||
*,
|
*,
|
||||||
force: bool = False,
|
force: bool = False,
|
||||||
|
row_crop_filter: bool = True,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Fetch + extract one product. Returns 'skipped'|'wrote'|'no-pdf'|'error'."""
|
"""Fetch + extract one product. Returns
|
||||||
|
'skipped'|'wrote'|'no-pdf'|'error'|'filtered'."""
|
||||||
md_path = _md_path(regno)
|
md_path = _md_path(regno)
|
||||||
json_path = _json_path(regno)
|
json_path = _json_path(regno)
|
||||||
if not force and md_path.exists() and json_path.exists():
|
if not force and md_path.exists() and json_path.exists():
|
||||||
@@ -397,6 +439,10 @@ def process_one(
|
|||||||
return "error"
|
return "error"
|
||||||
time.sleep(REQUEST_DELAY_SECONDS)
|
time.sleep(REQUEST_DELAY_SECONDS)
|
||||||
|
|
||||||
|
if row_crop_filter and not matches_row_crop(record):
|
||||||
|
log.info("[%s] filtered (not row-crop)", regno)
|
||||||
|
return "filtered"
|
||||||
|
|
||||||
def _build_sidecar(
|
def _build_sidecar(
|
||||||
*,
|
*,
|
||||||
label_url: str | None,
|
label_url: str | None,
|
||||||
@@ -570,6 +616,12 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
"--seed-file", metavar="PATH",
|
"--seed-file", metavar="PATH",
|
||||||
help="Text file with one EPA Reg No per line (# comments OK).",
|
help="Text file with one EPA Reg No per line (# comments OK).",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--row-crop-filter", action=argparse.BooleanOptionalAction, default=True,
|
||||||
|
help="Keep only products with row-crop sites (corn, soy, cotton, "
|
||||||
|
"wheat, rice, sorghum, etc.). Default on; use --no-row-crop-filter "
|
||||||
|
"to scrape every PPLS product regardless of crop.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--log-level", default="INFO",
|
"--log-level", default="INFO",
|
||||||
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
||||||
@@ -584,10 +636,14 @@ def main(argv: list[str] | None = None) -> int:
|
|||||||
|
|
||||||
CORPUS_DIR.mkdir(parents=True, exist_ok=True)
|
CORPUS_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "error": 0}
|
summary = {"wrote": 0, "skipped": 0, "no-pdf": 0, "filtered": 0, "error": 0}
|
||||||
with _client() as client:
|
with _client() as client:
|
||||||
for regno in _iter_regnos(args, client):
|
for regno in _iter_regnos(args, client):
|
||||||
result = process_one(client, regno, force=args.force)
|
result = process_one(
|
||||||
|
client, regno,
|
||||||
|
force=args.force,
|
||||||
|
row_crop_filter=args.row_crop_filter,
|
||||||
|
)
|
||||||
summary[result] = summary.get(result, 0) + 1
|
summary[result] = summary.get(result, 0) + 1
|
||||||
|
|
||||||
log.info("done: %s", summary)
|
log.info("done: %s", summary)
|
||||||
|
|||||||
+2
-1
@@ -15,6 +15,7 @@
|
|||||||
"homepage": "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1",
|
"homepage": "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1",
|
||||||
"scraper": "scrape.sources.epa_ppls",
|
"scraper": "scrape.sources.epa_ppls",
|
||||||
"scraper_version": "0.1.0",
|
"scraper_version": "0.1.0",
|
||||||
"license_note": "US federal government — public domain (no ToS restriction)"
|
"license_note": "US federal government — public domain (no ToS restriction)",
|
||||||
|
"scope_filter": "row-crop only — products with at least one site matching CORN, MAIZE, POPCORN, SOYBEAN(S), COTTON, WHEAT, RICE, SORGHUM, MILO, BARLEY, OATS, RYE, SUNFLOWER(S), PEANUT(S), SUGAR BEET(S), DRY/FIELD BEAN(S), CANOLA, RAPESEED, or ALFALFA (word-boundary match). Pass --no-row-crop-filter to scrape the full PPLS universe."
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user