From 420e00b44b6df217238d1f1324b942fcf5836dfa Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Sat, 23 May 2026 21:27:45 -0400 Subject: [PATCH] bayer: dedup by EPA reg no across catalog product-type queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bayer's seed-treatment catalog query re-serves products from herbicide/fungicide/insecticide queries that have seed-treatment use sites listed. safe_slug() correctly strips the class suffix when the catalog product type matches, but doesn't strip when querying as seed-treatment, so the same product gets written twice — once as "<brand>" (canonical class) and once as "<brand>-<class>" (class=seed-treatment). First full scrape produced 159 files for 87 unique EPA reg nos — ~45% redundant. Fix: - process_product accepts an optional seen_regs set and returns "dup-skip" when the product's EPA reg no is already in it. - run() seeds seen_regs from existing sidecars on disk via _load_seen_regs() so dedup survives re-runs (force overrides). - run() updates seen_regs after each successful write, so within-run dedup works for the seed-treatment query (which iterates last). Important nuance preserved: when two genuinely-different brand-name products share the same EPA reg (e.g., Absolute Maxx + Adament Flow both = 264-849), they are NOT treated as dups — they're different catalog entries with different slugs and same canonical class. Only the seed-treatment-clone pattern (slug = <brand>-<class> AND class=seed-treatment AND sibling at same reg with matching class) is the bug we're fixing. One-off cleanup of the existing USB corpus removed 68 dup pairs; 159 → 91 files (73 canonical-class + 18 true seed-treatments). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scrape/sources/bayer.py | 54 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/scrape/sources/bayer.py b/scrape/sources/bayer.py index e21b968..eb6f30c 100644 --- a/scrape/sources/bayer.py +++ b/scrape/sources/bayer.py @@ -551,10 +551,18 @@ def process_product( prod: BayerProduct, *, force: bool, + seen_regs: set[str] | None = None, ) -> str: """Fetch detail + PDF and write to disk. Returns a status string - suitable for logging: ``written``, ``skipped``, ``no-pdf``, - ``failed``.""" + suitable for logging: ``written``, ``skipped``, ``dup-skip``, + ``no-pdf``, ``failed``. + + ``seen_regs``, if provided, is mutated: EPA reg nos written by this + call are added so subsequent calls within the same run can dedup + against products served under multiple catalog product-type queries + (the seed-treatment query in particular re-serves herbicide / + fungicide / insecticide products that have seed-treatment use sites). + """ md_path = CORPUS_DIR / f"{prod.slug}.md" if md_path.exists() and not force: return "skipped" @@ -564,6 +572,13 @@ def process_product( log.error("detail fetch failed for %s: %s", prod.slug, exc) return "failed" + # Dedup: same EPA reg no already written in this run under a + # different catalog product-type (and thus a different slug). + if seen_regs is not None and prod.epa_reg_no and prod.epa_reg_no in seen_regs: + log.info("dup-skip %s (epa=%s already processed under canonical class)", + prod.slug, prod.epa_reg_no) + return "dup-skip" + # Resolve Last-Modified for label + supplementals (HEAD only, cheap). 
if prod.label_url: prod.label_last_modified = head_last_modified(http, prod.label_url) @@ -597,6 +612,25 @@ def process_product( return "written" +def _load_seen_regs() -> set[str]: + """Hydrate the seen-EPA-reg-no set from existing sidecars on disk + so dedup survives across runs (e.g., a re-run with the seed-treatment + query won't re-write products already on disk under their canonical + slug).""" + seen: set[str] = set() + if not CORPUS_DIR.exists(): + return seen + for f in CORPUS_DIR.glob("*.json"): + try: + data = json.loads(f.read_text(encoding="utf-8")) + reg = data.get("epa_reg_no") + if reg: + seen.add(reg) + except (OSError, json.JSONDecodeError): + continue + return seen + + def run( *, limit: int | None, @@ -622,14 +656,21 @@ def run( log.info("catalog yielded %d candidate product(s)", len(products)) - counts = {"written": 0, "skipped": 0, "no-pdf": 0, "failed": 0} + # Seed the dedup set from disk so re-runs and force-runs both behave. + seen_regs: set[str] = set() if force else _load_seen_regs() + if seen_regs: + log.info("dedup: %d EPA reg nos pre-loaded from existing corpus", len(seen_regs)) + + counts = {"written": 0, "skipped": 0, "dup-skip": 0, "no-pdf": 0, "failed": 0} processed = 0 for prod in products: if limit is not None and processed >= limit: break processed += 1 - status = process_product(http, prod, force=force) + status = process_product(http, prod, force=force, seen_regs=seen_regs) counts[status] = counts.get(status, 0) + 1 + if status in ("written", "no-pdf") and prod.epa_reg_no: + seen_regs.add(prod.epa_reg_no) log.info( "[%d/%s] %s %s | class=%s epa=%s ai=%s label=%s", processed, str(limit) if limit else "all", @@ -641,9 +682,10 @@ def run( ) log.info( - "done: processed=%d written=%d skipped=%d no-pdf=%d failed=%d", + "done: processed=%d written=%d skipped=%d dup-skip=%d no-pdf=%d failed=%d", processed, - counts["written"], counts["skipped"], counts["no-pdf"], counts["failed"], + counts["written"], counts["skipped"], 
counts["dup-skip"], + counts["no-pdf"], counts["failed"], ) return 0 if counts["failed"] == 0 else 1