From 420e00b44b6df217238d1f1324b942fcf5836dfa Mon Sep 17 00:00:00 2001
From: Justin Paul <justin@jpaul.me>
Date: Sat, 23 May 2026 21:27:45 -0400
Subject: [PATCH] bayer: dedup by EPA reg no across catalog product-type
 queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bayer's seed-treatment catalog query re-serves products from
herbicide/fungicide/insecticide queries that have seed-treatment use
sites listed. safe_slug() correctly strips the class suffix when the
catalog product type matches, but doesn't strip when querying as
seed-treatment, so the same product gets written twice — once as
"<base>" (canonical class) and once as "<base>-<class>"
(class=seed-treatment).

First full scrape produced 159 files for 87 unique EPA reg nos —
~45% redundant. Fix:

- process_product accepts an optional seen_regs set and returns
  "dup-skip" when the product's EPA reg no is already in it.
- run() seeds seen_regs from existing sidecars on disk via
  _load_seen_regs() so dedup survives re-runs (force overrides).
- run() updates seen_regs after each successful write, so within-run
  dedup works for the seed-treatment query (which iterates last).

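Important nuance preserved: when two genuinely different brand-name
products share the same EPA reg (e.g., Absolute Maxx + Adament Flow
both = 264-849), they are NOT treated as dups — they're different
catalog entries with different slugs and the same canonical class. Only
the seed-treatment-clone pattern (slug = <canonical>-<class> AND
class=seed-treatment AND sibling at same reg with matching class) is
the bug we're fixing.

NOTE(review): the check added to process_product compares the EPA reg
no only, so as written it would also dup-skip such a same-reg sibling
once the first one is in seen_regs. Confirm the distinction above is
enforced elsewhere (or applied only to the one-off cleanup).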

One-off cleanup of the existing USB corpus removed 68 dup pairs;
159 → 91 files (73 canonical-class + 18 true seed-treatments).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scrape/sources/bayer.py | 54 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 6 deletions(-)
diff --git a/scrape/sources/bayer.py b/scrape/sources/bayer.py
index e21b968..eb6f30c 100644
--- a/scrape/sources/bayer.py
+++ b/scrape/sources/bayer.py
@@ -551,10 +551,18 @@ def process_product(
     prod: BayerProduct,
     *,
     force: bool,
+    seen_regs: set[str] | None = None,
 ) -> str:
     """Fetch detail + PDF and write to disk. Returns a status string
-    suitable for logging: ``written``, ``skipped``, ``no-pdf``,
-    ``failed``."""
+    suitable for logging: ``written``, ``skipped``, ``dup-skip``,
+    ``no-pdf``, ``failed``.
+
+    ``seen_regs``, if provided, is only read here (a reg no already in it
+    yields ``dup-skip``); the caller adds reg nos after each write so later
+    calls in the same run can dedup products served under multiple catalog
+    product-type queries (the seed-treatment query in particular re-serves
+    herbicide / fungicide / insecticide products with seed-treatment use sites).
+    """
     md_path = CORPUS_DIR / f"{prod.slug}.md"
     if md_path.exists() and not force:
         return "skipped"
@@ -564,6 +572,13 @@ def process_product(
         log.error("detail fetch failed for %s: %s", prod.slug, exc)
         return "failed"
 
+    # Dedup: same EPA reg no already recorded in seen_regs (earlier in this
+    # run, or pre-loaded by the caller) under a different slug.
+    if seen_regs is not None and prod.epa_reg_no and prod.epa_reg_no in seen_regs:
+        log.info("dup-skip %s (epa=%s already processed under canonical class)",
+                 prod.slug, prod.epa_reg_no)
+        return "dup-skip"
+
     # Resolve Last-Modified for label + supplementals (HEAD only, cheap).
     if prod.label_url:
         prod.label_last_modified = head_last_modified(http, prod.label_url)
@@ -597,6 +612,25 @@ def process_product(
     return "written"
 
 
+def _load_seen_regs() -> set[str]:
+    """Return the EPA reg nos recorded in the ``*.json`` sidecars under
+    ``CORPUS_DIR`` so dedup survives across runs. Best-effort: sidecars
+    that are unreadable, not valid UTF-8 JSON, or not a JSON object with
+    a non-empty string ``epa_reg_no`` are skipped rather than raised."""
+    seen: set[str] = set()
+    if not CORPUS_DIR.exists():
+        return seen
+    for f in CORPUS_DIR.glob("*.json"):
+        try:
+            data = json.loads(f.read_text(encoding="utf-8"))
+        except (OSError, ValueError):  # ValueError: bad JSON or bad UTF-8
+            continue
+        reg = data.get("epa_reg_no") if isinstance(data, dict) else None
+        if isinstance(reg, str) and reg:
+            seen.add(reg)
+    return seen
+
+
 def run(
     *,
     limit: int | None,
@@ -622,14 +656,21 @@ def run(
 
     log.info("catalog yielded %d candidate product(s)", len(products))
 
-    counts = {"written": 0, "skipped": 0, "no-pdf": 0, "failed": 0}
+    # Seed the dedup set from disk for re-runs; force starts empty so products are re-written.
+    seen_regs: set[str] = set() if force else _load_seen_regs()
+    if seen_regs:
+        log.info("dedup: %d EPA reg nos pre-loaded from existing corpus", len(seen_regs))
+
+    counts = {"written": 0, "skipped": 0, "dup-skip": 0, "no-pdf": 0, "failed": 0}
     processed = 0
     for prod in products:
         if limit is not None and processed >= limit:
             break
         processed += 1
-        status = process_product(http, prod, force=force)
+        status = process_product(http, prod, force=force, seen_regs=seen_regs)
         counts[status] = counts.get(status, 0) + 1
+        if status in ("written", "no-pdf") and prod.epa_reg_no:
+            seen_regs.add(prod.epa_reg_no)
         log.info(
             "[%d/%s] %s %s | class=%s epa=%s ai=%s label=%s",
             processed, str(limit) if limit else "all",
@@ -641,9 +682,10 @@ def run(
         )
 
     log.info(
-        "done: processed=%d written=%d skipped=%d no-pdf=%d failed=%d",
+        "done: processed=%d written=%d skipped=%d dup-skip=%d no-pdf=%d failed=%d",
         processed,
-        counts["written"], counts["skipped"], counts["no-pdf"], counts["failed"],
+        counts["written"], counts["skipped"], counts["dup-skip"],
+        counts["no-pdf"], counts["failed"],
     )
     return 0 if counts["failed"] == 0 else 1