bayer: dedup by EPA reg no across catalog product-type queries

Bayer's seed-treatment catalog query re-serves products from
herbicide/fungicide/insecticide queries that have seed-treatment use
sites listed. safe_slug() correctly strips the class suffix when the
catalog product type matches, but doesn't strip when querying as
seed-treatment, so the same product gets written twice — once as
"<base>" (canonical class) and once as "<base>-<class>"
(class=seed-treatment).

First full scrape produced 159 files for 87 unique EPA reg nos —
~45% redundant. Fix:

- process_product accepts an optional seen_regs set and returns
  "dup-skip" when the product's EPA reg no is already in it.
- run() seeds seen_regs from existing sidecars on disk via
  _load_seen_regs() so dedup survives re-runs (with force, the set
  starts empty, so only within-run dedup applies).
- run() adds the product's EPA reg no to seen_regs after each
  "written" or "no-pdf" result, so within-run dedup works for the
  seed-treatment query (which iterates last).

Important nuance preserved: when two genuinely-different brand-name
products share the same EPA reg (e.g., Absolute Maxx + Adament Flow
both = 264-849), they are NOT treated as dups — they're different
catalog entries with different slugs and same canonical class. Only
the seed-treatment-clone pattern (slug = <canonical>-<class> AND
class=seed-treatment AND sibling at same reg with matching class) is
the bug we're fixing.

One-off cleanup of the existing USB corpus removed 68 dup pairs;
159 → 91 files (73 canonical-class + 18 true seed-treatments).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-23 21:27:45 -04:00
parent 717426f873
commit 420e00b44b
+48 -6
View File
@@ -551,10 +551,18 @@ def process_product(
prod: BayerProduct,
*,
force: bool,
seen_regs: set[str] | None = None,
) -> str:
"""Fetch detail + PDF and write to disk. Returns a status string
suitable for logging: ``written``, ``skipped``, ``no-pdf``,
``failed``."""
suitable for logging: ``written``, ``skipped``, ``dup-skip``,
``no-pdf``, ``failed``.
``seen_regs``, if provided, is mutated: EPA reg nos written by this
call are added so subsequent calls within the same run can dedup
against products served under multiple catalog product-type queries
(the seed-treatment query in particular re-serves herbicide /
fungicide / insecticide products that have seed-treatment use sites).
"""
md_path = CORPUS_DIR / f"{prod.slug}.md"
if md_path.exists() and not force:
return "skipped"
@@ -564,6 +572,13 @@ def process_product(
log.error("detail fetch failed for %s: %s", prod.slug, exc)
return "failed"
# Dedup: same EPA reg no already written in this run under a
# different catalog product-type (and thus a different slug).
if seen_regs is not None and prod.epa_reg_no and prod.epa_reg_no in seen_regs:
log.info("dup-skip %s (epa=%s already processed under canonical class)",
prod.slug, prod.epa_reg_no)
return "dup-skip"
# Resolve Last-Modified for label + supplementals (HEAD only, cheap).
if prod.label_url:
prod.label_last_modified = head_last_modified(http, prod.label_url)
@@ -597,6 +612,25 @@ def process_product(
return "written"
def _load_seen_regs() -> set[str]:
    """Collect the EPA reg nos recorded in the JSON sidecars on disk.

    Scans every ``*.json`` file directly under ``CORPUS_DIR`` and gathers
    the value stored under its ``"epa_reg_no"`` key, so that dedup
    survives across runs (e.g., a re-run with the seed-treatment query
    won't re-write products already on disk under their canonical slug).

    The scan is best-effort: a sidecar that cannot be read, is not valid
    UTF-8, is not valid JSON, or does not hold a JSON object is skipped
    rather than aborting the run.

    Returns:
        The set of non-empty string reg nos found. Empty when
        ``CORPUS_DIR`` does not exist or no sidecar carries a reg no.
    """
    seen: set[str] = set()
    if not CORPUS_DIR.exists():
        return seen
    for sidecar in CORPUS_DIR.glob("*.json"):
        try:
            data = json.loads(sidecar.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError (malformed JSON)
            # and UnicodeDecodeError (file is not valid UTF-8).
            continue
        # Valid JSON need not be an object (e.g. ``[]`` or ``null``);
        # only objects can carry the key we are looking for.
        if not isinstance(data, dict):
            continue
        reg = data.get("epa_reg_no")
        # Keep only non-empty strings: that matches the declared
        # ``set[str]`` and avoids a TypeError on unhashable JSON values.
        if isinstance(reg, str) and reg:
            seen.add(reg)
    return seen
def run(
*,
limit: int | None,
@@ -622,14 +656,21 @@ def run(
log.info("catalog yielded %d candidate product(s)", len(products))
counts = {"written": 0, "skipped": 0, "no-pdf": 0, "failed": 0}
# Seed the dedup set from disk so re-runs and force-runs both behave.
seen_regs: set[str] = set() if force else _load_seen_regs()
if seen_regs:
log.info("dedup: %d EPA reg nos pre-loaded from existing corpus", len(seen_regs))
counts = {"written": 0, "skipped": 0, "dup-skip": 0, "no-pdf": 0, "failed": 0}
processed = 0
for prod in products:
if limit is not None and processed >= limit:
break
processed += 1
status = process_product(http, prod, force=force)
status = process_product(http, prod, force=force, seen_regs=seen_regs)
counts[status] = counts.get(status, 0) + 1
if status in ("written", "no-pdf") and prod.epa_reg_no:
seen_regs.add(prod.epa_reg_no)
log.info(
"[%d/%s] %s %s | class=%s epa=%s ai=%s label=%s",
processed, str(limit) if limit else "all",
@@ -641,9 +682,10 @@ def run(
)
log.info(
"done: processed=%d written=%d skipped=%d no-pdf=%d failed=%d",
"done: processed=%d written=%d skipped=%d dup-skip=%d no-pdf=%d failed=%d",
processed,
counts["written"], counts["skipped"], counts["no-pdf"], counts["failed"],
counts["written"], counts["skipped"], counts["dup-skip"],
counts["no-pdf"], counts["failed"],
)
return 0 if counts["failed"] == 0 else 1