bayer: dedup by EPA reg no across catalog product-type queries

Bayer's seed-treatment catalog query re-serves products from herbicide/fungicide/insecticide queries that have seed-treatment use sites listed. safe_slug() correctly strips the class suffix when the catalog product type matches, but doesn't strip when querying as seed-treatment, so the same product gets written twice — once as "<base>" (canonical class) and once as "<base>-<class>" (class=seed-treatment). First full scrape produced 159 files for 87 unique EPA reg nos — ~45% redundant.

Fix:
- process_product accepts an optional seen_regs set and returns "dup-skip" when the product's EPA reg no is already in it.
- run() seeds seen_regs from existing sidecars on disk via _load_seen_regs() so dedup survives re-runs (force overrides).
- run() updates seen_regs after each successful write, so within-run dedup works for the seed-treatment query (which iterates last).

Important nuance preserved: when two genuinely-different brand-name products share the same EPA reg (e.g., Absolute Maxx + Adament Flow both = 264-849), they are NOT treated as dups — they're different catalog entries with different slugs and same canonical class. Only the seed-treatment-clone pattern (slug = <canonical>-<class> AND class=seed-treatment AND sibling at same reg with matching class) is the bug we're fixing.

One-off cleanup of the existing USB corpus removed 68 dup pairs; 159 → 91 files (73 canonical-class + 18 true seed-treatments).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 21:27:45 -04:00
parent 717426f873
commit 420e00b44b
1 changed files with 48 additions and 6 deletions
@@ -551,10 +551,18 @@ def process_product(
    prod: BayerProduct,
    *,
    force: bool,
+    seen_regs: set[str] | None = None,
 ) -> str:
    """Fetch detail + PDF and write to disk. Returns a status string
-    suitable for logging: ``written``, ``skipped``, ``no-pdf``,
-    ``failed``."""
+    suitable for logging: ``written``, ``skipped``, ``dup-skip``,
+    ``no-pdf``, ``failed``.
+
+    ``seen_regs``, if provided, is only read here: a product whose EPA reg
+    no is already in it returns ``dup-skip``. The caller (``run``) adds reg
+    nos after each write so later calls can dedup products served under
+    multiple catalog product-type queries (the seed-treatment query re-serves
+    herbicide / fungicide / insecticide products with seed-treatment use sites).
+    """
    md_path = CORPUS_DIR / f"{prod.slug}.md"
    if md_path.exists() and not force:
        return "skipped"
@@ -564,6 +572,13 @@ def process_product(
        log.error("detail fetch failed for %s: %s", prod.slug, exc)
        return "failed"

+    # Dedup: same EPA reg no already written in this run under a
+    # different catalog product-type (and thus a different slug).
+    if seen_regs is not None and prod.epa_reg_no and prod.epa_reg_no in seen_regs:
+        log.info("dup-skip %s (epa=%s already processed under canonical class)",
+                 prod.slug, prod.epa_reg_no)
+        return "dup-skip"
+
    # Resolve Last-Modified for label + supplementals (HEAD only, cheap).
    if prod.label_url:
        prod.label_last_modified = head_last_modified(http, prod.label_url)
@@ -597,6 +612,25 @@ def process_product(
    return "written"


+def _load_seen_regs() -> set[str]:
+    """Hydrate the seen-EPA-reg-no set from existing sidecars on disk
+    so dedup survives across runs. Best effort: unreadable, non-UTF-8,
+    malformed, or non-object sidecars (and non-string reg values) are
+    skipped rather than aborting the run. Returns the set of reg nos."""
+    seen: set[str] = set()
+    if not CORPUS_DIR.exists():
+        return seen
+    for f in CORPUS_DIR.glob("*.json"):
+        try:
+            data = json.loads(f.read_text(encoding="utf-8"))
+        except (OSError, ValueError):  # ValueError: bad JSON or bad UTF-8
+            continue
+        reg = data.get("epa_reg_no") if isinstance(data, dict) else None
+        if isinstance(reg, str) and reg:
+            seen.add(reg)
+    return seen
+
+
 def run(
    *,
    limit: int | None,
@@ -622,14 +656,21 @@ def run(

    log.info("catalog yielded %d candidate product(s)", len(products))

-    counts = {"written": 0, "skipped": 0, "no-pdf": 0, "failed": 0}
+    # Seed the dedup set from disk so re-runs and force-runs both behave.
+    seen_regs: set[str] = set() if force else _load_seen_regs()
+    if seen_regs:
+        log.info("dedup: %d EPA reg nos pre-loaded from existing corpus", len(seen_regs))
+
+    counts = {"written": 0, "skipped": 0, "dup-skip": 0, "no-pdf": 0, "failed": 0}
    processed = 0
    for prod in products:
        if limit is not None and processed >= limit:
            break
        processed += 1
-        status = process_product(http, prod, force=force)
+        status = process_product(http, prod, force=force, seen_regs=seen_regs)
        counts[status] = counts.get(status, 0) + 1
+        if status in ("written", "no-pdf") and prod.epa_reg_no:
+            seen_regs.add(prod.epa_reg_no)
        log.info(
            "[%d/%s] %s %s | class=%s epa=%s ai=%s label=%s",
            processed, str(limit) if limit else "all",
@@ -641,9 +682,10 @@ def run(
        )

    log.info(
-        "done: processed=%d written=%d skipped=%d no-pdf=%d failed=%d",
+        "done: processed=%d written=%d skipped=%d dup-skip=%d no-pdf=%d failed=%d",
        processed,
-        counts["written"], counts["skipped"], counts["no-pdf"], counts["failed"],
+        counts["written"], counts["skipped"], counts["dup-skip"],
+        counts["no-pdf"], counts["failed"],
    )
    return 0 if counts["failed"] == 0 else 1