bayer: dedup by EPA reg no across catalog product-type queries
Bayer's seed-treatment catalog query re-serves products from the herbicide/fungicide/insecticide queries that have seed-treatment use sites listed. safe_slug() correctly strips the class suffix when the catalog product type matches, but does not strip it when querying as seed-treatment, so the same product gets written twice — once as "<base>" (canonical class) and once as "<base>-<class>" (class=seed-treatment). The first full scrape produced 159 files for 87 unique EPA reg nos — ~45% redundant.

Fix:
- process_product accepts an optional seen_regs set and returns "dup-skip" when the product's EPA reg no is already in it.
- run() seeds seen_regs from the existing sidecars on disk via _load_seen_regs() so dedup survives re-runs (with force, the set starts empty instead of being loaded from disk).
- run() adds the product's EPA reg no to seen_regs after each "written" or "no-pdf" result, so within-run dedup works for the seed-treatment query (which iterates last).

Important nuance preserved: when two genuinely different brand-name products share the same EPA reg no (e.g., Absolute Maxx + Adament Flow both = 264-849), they are NOT treated as dups — they are different catalog entries with different slugs and the same canonical class. Only the seed-treatment-clone pattern (slug = <canonical>-<class> AND class=seed-treatment AND a sibling at the same reg no with matching class) is the bug being fixed.

One-off cleanup of the existing USB corpus removed 68 dup pairs; 159 → 91 files (73 canonical-class + 18 true seed-treatments).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+48
-6
@@ -551,10 +551,18 @@ def process_product(
|
||||
prod: BayerProduct,
|
||||
*,
|
||||
force: bool,
|
||||
seen_regs: set[str] | None = None,
|
||||
) -> str:
|
||||
"""Fetch detail + PDF and write to disk. Returns a status string
|
||||
suitable for logging: ``written``, ``skipped``, ``no-pdf``,
|
||||
``failed``."""
|
||||
suitable for logging: ``written``, ``skipped``, ``dup-skip``,
|
||||
``no-pdf``, ``failed``.
|
||||
|
||||
``seen_regs``, if provided, is mutated: EPA reg nos written by this
|
||||
call are added so subsequent calls within the same run can dedup
|
||||
against products served under multiple catalog product-type queries
|
||||
(the seed-treatment query in particular re-serves herbicide /
|
||||
fungicide / insecticide products that have seed-treatment use sites).
|
||||
"""
|
||||
md_path = CORPUS_DIR / f"{prod.slug}.md"
|
||||
if md_path.exists() and not force:
|
||||
return "skipped"
|
||||
@@ -564,6 +572,13 @@ def process_product(
|
||||
log.error("detail fetch failed for %s: %s", prod.slug, exc)
|
||||
return "failed"
|
||||
|
||||
# Dedup: same EPA reg no already written in this run under a
|
||||
# different catalog product-type (and thus a different slug).
|
||||
if seen_regs is not None and prod.epa_reg_no and prod.epa_reg_no in seen_regs:
|
||||
log.info("dup-skip %s (epa=%s already processed under canonical class)",
|
||||
prod.slug, prod.epa_reg_no)
|
||||
return "dup-skip"
|
||||
|
||||
# Resolve Last-Modified for label + supplementals (HEAD only, cheap).
|
||||
if prod.label_url:
|
||||
prod.label_last_modified = head_last_modified(http, prod.label_url)
|
||||
@@ -597,6 +612,25 @@ def process_product(
|
||||
return "written"
|
||||
|
||||
|
||||
def _load_seen_regs() -> set[str]:
    """Collect the EPA reg nos recorded in the JSON sidecars under CORPUS_DIR.

    The result seeds the dedup set so that dedup survives across runs
    (e.g., a re-run with the seed-treatment query won't re-write products
    already on disk under their canonical slug).

    Loading is best-effort: a sidecar that cannot be read, is not valid
    UTF-8, is not valid JSON, is not a JSON object, or has no non-empty
    string ``epa_reg_no`` is skipped instead of aborting the run.

    Returns:
        The set of ``epa_reg_no`` strings found; empty when CORPUS_DIR does
        not exist or holds no usable sidecars.
    """
    seen: set[str] = set()
    if not CORPUS_DIR.exists():
        return seen
    for sidecar in CORPUS_DIR.glob("*.json"):
        try:
            data = json.loads(sidecar.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and the
            # UnicodeDecodeError that read_text raises on non-UTF-8 bytes.
            continue
        if not isinstance(data, dict):
            # A top-level list/str/number has no .get(); ignore the file.
            continue
        reg = data.get("epa_reg_no")
        # Only non-empty strings are meaningful (and hashable) dedup keys.
        if isinstance(reg, str) and reg:
            seen.add(reg)
    return seen
|
||||
|
||||
|
||||
def run(
|
||||
*,
|
||||
limit: int | None,
|
||||
@@ -622,14 +656,21 @@ def run(
|
||||
|
||||
log.info("catalog yielded %d candidate product(s)", len(products))
|
||||
|
||||
counts = {"written": 0, "skipped": 0, "no-pdf": 0, "failed": 0}
|
||||
# Seed the dedup set from disk so re-runs and force-runs both behave.
|
||||
seen_regs: set[str] = set() if force else _load_seen_regs()
|
||||
if seen_regs:
|
||||
log.info("dedup: %d EPA reg nos pre-loaded from existing corpus", len(seen_regs))
|
||||
|
||||
counts = {"written": 0, "skipped": 0, "dup-skip": 0, "no-pdf": 0, "failed": 0}
|
||||
processed = 0
|
||||
for prod in products:
|
||||
if limit is not None and processed >= limit:
|
||||
break
|
||||
processed += 1
|
||||
status = process_product(http, prod, force=force)
|
||||
status = process_product(http, prod, force=force, seen_regs=seen_regs)
|
||||
counts[status] = counts.get(status, 0) + 1
|
||||
if status in ("written", "no-pdf") and prod.epa_reg_no:
|
||||
seen_regs.add(prod.epa_reg_no)
|
||||
log.info(
|
||||
"[%d/%s] %s %s | class=%s epa=%s ai=%s label=%s",
|
||||
processed, str(limit) if limit else "all",
|
||||
@@ -641,9 +682,10 @@ def run(
|
||||
)
|
||||
|
||||
log.info(
|
||||
"done: processed=%d written=%d skipped=%d no-pdf=%d failed=%d",
|
||||
"done: processed=%d written=%d skipped=%d dup-skip=%d no-pdf=%d failed=%d",
|
||||
processed,
|
||||
counts["written"], counts["skipped"], counts["no-pdf"], counts["failed"],
|
||||
counts["written"], counts["skipped"], counts["dup-skip"],
|
||||
counts["no-pdf"], counts["failed"],
|
||||
)
|
||||
return 0 if counts["failed"] == 0 else 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user