"""Bayer Crop Science US label scraper.

Pulls herbicide / fungicide / insecticide / seed-treatment product metadata
and label PDFs from https://www.cropscience.bayer.us, extracts each PDF to
markdown, and writes a metadata sidecar JSON per product.

Output:
    corpus/bayer/<slug>.md     extracted label text
    corpus/bayer/<slug>.json   metadata sidecar (see SIDECAR_SCHEMA in
                               PLAN.md / this repo's CLAUDE.md)

The scraper resolves Bayer's rotating Next.js ``buildId`` from the homepage
at runtime, then walks the catalog JSON API for each product class. It
extracts the rest of the label/MSDS/supplemental download URLs from each
product page's ``__NEXT_DATA__`` JSON island — this is strictly cheaper and
more stable than scraping rendered HTML.

robots.txt for cropscience.bayer.us explicitly allows scraping for "search
engine indexing or artificial intelligence retrieval augmented generation"
use cases, which is what this corpus feeds.

CLI:
    python -m scrape.sources.bayer --limit 20
    python -m scrape.sources.bayer --limit 20 --force
    python -m scrape.sources.bayer --product warrant
    python -m scrape.sources.bayer --class herbicide --limit 5
"""

from __future__ import annotations

import argparse
import io
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable

import requests
from pypdf import PdfReader

# Version string stamped into every sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by the scraper.
USER_AGENT = "crop-chem-docs-scraper/0.1 (+https://drawbar.example/contact)"
# Site origin; catalog API URLs and product page URLs are built from it.
BASE = "https://www.cropscience.bayer.us"

# Catalog product-type values used in the Next.js data API.
PRODUCT_TYPES = ("Herbicide", "Fungicide", "Insecticide", "Seed_Treatment")

# Map product-type filter -> the canonical "product_class" we record
# in the sidecar (matches the legacy URL segments).
PRODUCT_CLASS = {
    "Herbicide": "herbicide",
    "Fungicide": "fungicide",
    "Insecticide": "insecticide",
    "Seed_Treatment": "seed-treatment",
}

# Repo root: scrape/sources/bayer.py -> repo root is 3 parents up.
# Corpus root is overridable via CORPUS_ROOT for routing the
# corpus to external storage (USB drive, NAS mount, etc.) without
# editing the repo.
_THIS_FILE = Path(__file__).resolve()
# Fall back to the containing directory when the module sits fewer than three
# levels deep, so importing it from a shallow path cannot raise IndexError.
REPO_ROOT = (
    _THIS_FILE.parents[2] if len(_THIS_FILE.parents) > 2 else _THIS_FILE.parent
)
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "bayer"

# Politeness: target ~1 req/sec to Bayer. Each HTTP method goes through
# a tiny minimum-interval sleeper to enforce this without per-call asyncio.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.bayer")


# --------------------------------------------------------------------- HTTP

class RateLimitedSession:
    """``requests.Session`` wrapper with sleep-based rate limiting and
    polite retries on 429/5xx.

    Attributes:
        s: The underlying ``requests.Session`` (User-Agent preset).
        interval: Minimum number of seconds between two outgoing requests.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 means "never",
        # so the very first request is not delayed.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between
        consecutive requests, then stamp the current time."""
        elapsed = time.monotonic() - self._last
        if elapsed < self.interval:
            time.sleep(self.interval - elapsed)
        self._last = time.monotonic()

    @staticmethod
    def _exp_backoff(attempt: int) -> float:
        """Exponential backoff with jitter for a 0-based attempt index,
        capped at 30 seconds."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send one rate-limited request, retrying transient failures.

        Network errors (``requests.RequestException``) and HTTP 429/5xx
        responses are retried with a backoff sleep in between.

        Args:
            method: HTTP verb passed to ``Session.request``.
            url: Target URL.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so that a response is always produced.
            timeout: Per-request timeout in seconds.
            **kw: Forwarded to ``Session.request``.

        Returns:
            The first non-retryable response, or — when every attempt got
            429/5xx — the response of the final attempt, left for the
            caller to handle.

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        final_attempt = attempts - 1
        last_exc: Exception | None = None
        for attempt in range(attempts):
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if attempt == final_attempt:
                    # Nothing left to retry: do not waste a backoff sleep.
                    log.warning("network error on %s %s: %s — giving up",
                                method, url, exc)
                    break
                backoff = self._exp_backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            # A response arrived, so an error from an earlier attempt must
            # not be raised once the loop ends.
            last_exc = None
            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if attempt == final_attempt:
                log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                            resp.status_code, method, url, attempts)
                break
            # Honor Retry-After if present, else exponential backoff. Only
            # the delta-seconds form is recognized; an HTTP-date value
            # falls through to the exponential schedule.
            ra = resp.headers.get("Retry-After")
            if ra and ra.isdigit():
                backoff = float(ra)
            else:
                backoff = self._exp_backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)

        if last_exc is not None:
            raise last_exc
        # Final response (still bad) returned for caller to handle.
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Rate-limited GET; see :meth:`request`."""
        return self.request("GET", url, **kw)

    def head(self, url: str, **kw: Any) -> requests.Response:
        """Rate-limited HEAD that follows redirects unless the caller
        passes ``allow_redirects`` explicitly; see :meth:`request`."""
        kw.setdefault("allow_redirects", True)
        return self.request("HEAD", url, **kw)


# --------------------------------------------------------------------- model

@dataclass
class SupplementalDoc:
    """One non-primary PDF linked from a product page."""

    kind: str  # tag produced by classify_supplemental()
    title: str  # document title as listed on the page
    url: str  # download URL
    last_modified: str | None = None  # ISO 8601, filled in from a HEAD request


@dataclass
class BayerProduct:
    """A catalog entry, progressively filled in by the pipeline.

    The first four fields come from the catalog walk; the remaining ones
    are populated by the detail fetch and the PDF download.
    """

    slug: str  # filesystem-safe slug, e.g. "warrant"
    catalog_slug: str  # bayer's seoSlug, e.g. "warrant-herbicide"
    product_url_path: str  # e.g. "/crop-protection/herbicide/warrant-herbicide"
    product_class: str  # "herbicide" | "fungicide" | ...
    product_name: str = ""
    epa_reg_no: str | None = None
    active_ingredients: list[dict] = field(default_factory=list)  # [{name, cas, percent}]
    label_url: str | None = None
    label_filename: str | None = None
    label_last_modified: str | None = None
    label_page_count: int | None = None
    label_text_layer: bool | None = None
    supplemental_pdfs: list[SupplementalDoc] = field(default_factory=list)
    source_page_url: str = ""


# --------------------------------------------------------------------- helpers

# Captures the body of the <script id="__NEXT_DATA__" ...> element. The
# attribute list is matched loosely so extra attributes do not break the
# match.
_NEXT_DATA_RE = re.compile(
    r'<script\b[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.S,
)

# Two-letter suffix following "24c" in an upper-cased URL (e.g. "_24C_CA").
_STATE_24C_RE = re.compile(r"24[_-]?C[_-]([A-Z]{2})\b")


def parse_next_data(html: str) -> dict[str, Any]:
    """Pull the ``__NEXT_DATA__`` JSON blob out of a Next.js page.

    Raises:
        RuntimeError: If the page has no ``__NEXT_DATA__`` script tag.
        json.JSONDecodeError: If the tag's content is not valid JSON.
    """
    m = _NEXT_DATA_RE.search(html)
    if not m:
        raise RuntimeError("no __NEXT_DATA__ script tag found")
    return json.loads(m.group(1))


def fetch_build_id(http: RateLimitedSession) -> str:
    """Grab the rotating ``buildId`` from the Bayer homepage.

    Raises:
        requests.HTTPError: If the homepage request is not successful.
        RuntimeError: If the page carries no ``buildId``.
    """
    r = http.get(BASE + "/")
    r.raise_for_status()
    bid = parse_next_data(r.text).get("buildId")
    if not bid:
        raise RuntimeError("buildId missing from homepage __NEXT_DATA__")
    log.info("resolved Bayer buildId=%s", bid)
    return bid


def normalize_epa_reg(raw: str | None) -> str | None:
    """Convert Bayer's padded reg number to canonical EPA form.

    Example: ``0000524-00591-AA-0000000`` -> ``524-591``.
    The trailing ``-AA-0000000`` is a Bayer-internal qualifier we don't
    surface. A third segment other than ``AA`` is kept as a suffix, e.g.
    ``0000524-00591-ZA-0000000`` -> ``524-591-ZA`` (rare).

    Returns ``None`` for empty or whitespace-only input; input without a
    ``-`` separator is returned stripped but otherwise unchanged.
    """
    if not raw:
        return None
    cleaned = raw.strip()
    if not cleaned:
        return None
    # Strip each segment so stray whitespace cannot shield leading zeros.
    parts = [seg.strip() for seg in cleaned.split("-")]
    if len(parts) < 2:
        return cleaned
    company = parts[0].lstrip("0") or "0"
    product = parts[1].lstrip("0") or "0"
    epa = f"{company}-{product}"
    # If the third segment is something other than the default "AA",
    # it's likely a distributor sub-reg. Preserve it.
    if len(parts) >= 3 and parts[2] and parts[2] != "AA":
        epa += f"-{parts[2]}"
    return epa


def classify_supplemental(title: str, url: str) -> str:
    """Classify a supplemental/auxiliary doc by its title or URL.

    Returns a short kind tag: ``24C``, ``24C-<ST>``, ``2EE``, ``MSDS``,
    ``Bulletin``, ``Supplemental``, ``Label``, or ``Other``. The exact tag
    isn't load-bearing for the scraper — it's metadata to help the
    chunker/agent later. Matching is case-insensitive substring search,
    first hit wins in the order listed; ambiguous = ``Other``.
    """
    upper_url = (url or "").upper()
    blob = f"{(title or '').upper()} {upper_url}"

    # State-specific 24c labels usually carry a two-letter state code,
    # but Bayer's titles rarely encode it. Best we can do is flag 24c.
    # ("SECTION 24C" / "SECTION_24C" contain "24C", so one test suffices.)
    if "24C" in blob:
        # Try to spot a state suffix in the URL (e.g. "_24c_ca").
        state = _STATE_24C_RE.search(upper_url)
        return f"24C-{state.group(1)}" if state else "24C"

    # Ordered (needles, tag) table; "SDS" also covers "MSDS".
    rules = (
        (("2EE", "2_EE"), "2EE"),
        (("SDS", "SAFETY DATA"), "MSDS"),
        (("BULLETIN",), "Bulletin"),
        (("SUPPLEMENTAL",), "Supplemental"),
        (("LABEL",), "Label"),
    )
    for needles, tag in rules:
        if any(needle in blob for needle in needles):
            return tag
    return "Other"


def safe_slug(catalog_slug: str, product_class: str) -> str:
    """Strip the trailing class suffix so ``warrant-herbicide`` becomes
    ``warrant``; falls back to the full slug for slugs that don't end with
    the class word."""
    suffix = f"-{product_class}"
    if not catalog_slug.endswith(suffix):
        # seed-treatment is sometimes split or omitted; just return as-is.
        return catalog_slug
    return catalog_slug[: -len(suffix)]


def iso_from_http_date(http_date: str | None) -> str | None:
    """RFC1123 -> ISO 8601 UTC. Returns None if unparseable."""
    if not http_date:
        return None
    try:
        from email.utils import parsedate_to_datetime

        dt = parsedate_to_datetime(http_date)
        if dt.tzinfo is None:
            # A naive result is taken to be UTC.
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(timezone.utc).isoformat()
    except Exception:  # noqa: BLE001 - deliberate best-effort parse
        return None


# --------------------------------------------------------------------- catalog

def walk_catalog(
    http: RateLimitedSession, build_id: str
) -> Iterable[BayerProduct]:
    """Yield ``BayerProduct`` stubs for every product across all classes.

    Stubs carry only catalog-level info (slug, URL, class). The detail
    fetch (EPA reg, ingredients, PDFs) happens later via
    :func:`fetch_product_detail`.

    For each product type the catalog is paged until one of these holds:
    a non-200 response, an empty page, the reported ``total`` has been
    reached, or — when no ``total`` is reported — a page contributes no
    slug that was not already seen (guards against endless paging).
    """
    for ptype in PRODUCT_TYPES:
        product_class = PRODUCT_CLASS[ptype]
        page = 1
        seen = 0
        known_slugs: set[str] = set()
        while True:
            url = (
                f"{BASE}/_next/data/{build_id}/crop-protection/catalog.json"
                f"?productType={ptype}&p={page}"
            )
            r = http.get(url)
            if r.status_code != 200:
                log.warning("catalog %s p=%d -> HTTP %d, stopping class",
                            ptype, page, r.status_code)
                break
            # "pageProps" may be absent or null; treat both as empty.
            data = r.json().get("pageProps") or {}
            products = data.get("serverProducts") or []
            total = data.get("total") or 0
            if not products:
                break

            slugs_before = len(known_slugs)
            for p in products:
                slug = p.get("seoSlug") or ""
                product_url = p.get("productURL") or ""
                if not slug or not product_url:
                    continue
                known_slugs.add(slug)
                yield BayerProduct(
                    slug=safe_slug(slug, product_class),
                    catalog_slug=slug,
                    product_url_path=product_url,
                    product_class=product_class,
                )

            seen += len(products)
            if total:
                if seen >= total:
                    break
            elif len(known_slugs) == slugs_before:
                # No total reported and nothing new on this page: stop
                # rather than truncating at page 1 or looping forever.
                break
            page += 1


# --------------------------------------------------------------------- detail

def _pick_label_url(
    download_url: Any,
    important: list[dict],
    additional: list[dict],
) -> str | None:
    """Choose the primary label PDF URL for a product page.

    Preference order: ``downloadLabelUrl`` if it is a PDF; else the first
    PDF in ``important`` whose title mentions "label" but not "sds"; else
    the first PDF found in ``important`` followed by ``additional``.
    """
    if download_url and looks_like_pdf(download_url):
        return download_url
    for doc in important:
        candidate = doc.get("url") or ""
        title = (doc.get("title") or "").lower()
        # "sds" also excludes "msds" titles.
        if looks_like_pdf(candidate) and "label" in title and "sds" not in title:
            return candidate
    for doc in important + additional:
        candidate = doc.get("url") or ""
        if looks_like_pdf(candidate):
            return candidate
    return None


def fetch_product_detail(
    http: RateLimitedSession, prod: BayerProduct
) -> BayerProduct:
    """Populate EPA reg, active ingredients, and the full PDF list on a
    catalog stub by fetching its product page __NEXT_DATA__.

    ``prod`` is mutated in place and also returned.

    Raises:
        requests.HTTPError: If the product page request is not successful.
        RuntimeError: If the page has no ``__NEXT_DATA__`` script tag.
    """
    page_url = BASE + prod.product_url_path
    prod.source_page_url = page_url
    r = http.get(page_url)
    r.raise_for_status()

    page_props = (parse_next_data(r.text).get("props") or {}).get("pageProps") or {}
    details = page_props.get("productDetails") or {}

    prod.product_name = (
        details.get("productLabel") or details.get("productName") or prod.slug
    )
    prod.epa_reg_no = normalize_epa_reg(details.get("registrationNumber"))
    # Bayer's product page exposes ingredient names only — no CAS or percent.
    # Conform to the canonical schema by emitting objects with name set and
    # the other fields null; downstream consumers can hydrate from EPA PPLS.
    prod.active_ingredients = [
        {"name": a.get("ingredient"), "cas": None, "percent": None}
        for a in (details.get("activeIngredients") or [])
        if a.get("ingredient")
    ]

    # Primary label: prefer downloadLabelUrl, then importantDocuments.
    important = (page_props.get("importantDocuments") or {}).get("labelData") or []
    additional = (page_props.get("additionalDownloads") or {}).get("labelData") or []
    label_url = _pick_label_url(
        page_props.get("downloadLabelUrl"), important, additional
    )
    prod.label_url = label_url
    if label_url:
        # Last URL segment is the Scene7 asset id (e.g. "Warrant_2025pdf").
        prod.label_filename = label_url.rsplit("/", 1)[-1]

    # Collect ALL other PDFs as supplementals (label/MSDS/24c/2EE/bulletin
    # /etc.). The kind tag is best-effort; the chunker can refine later.
    seen_urls: set[str] = {label_url} if label_url else set()
    supplementals: list[SupplementalDoc] = []
    for doc in important + additional:
        doc_url = doc.get("url") or ""
        doc_title = doc.get("title") or ""
        # Skip blanks, the primary label, repeats, and non-PDF links.
        if not doc_url or doc_url in seen_urls or not looks_like_pdf(doc_url):
            continue
        seen_urls.add(doc_url)
        supplementals.append(
            SupplementalDoc(
                kind=classify_supplemental(doc_title, doc_url),
                title=doc_title,
                url=doc_url,
            )
        )
    prod.supplemental_pdfs = supplementals
    return prod


def looks_like_pdf(url: str) -> bool:
    """True if the URL is one of Bayer's PDF endpoints.

    Bayer serves PDFs via Adobe Scene7 with the literal ``pdf`` (no dot)
    appended to the asset ID, plus some assets on cs-contentapi with a
    real ``.pdf`` extension. Both end in ``pdf``, so one case-insensitive
    suffix test covers them. The test is applied to the whole URL and also
    to the URL with any query string / fragment removed, so
    ``.../label.pdf?v=2`` is recognized too.
    """
    lowered = (url or "").lower()
    if lowered.endswith("pdf"):
        return True
    without_suffixes = lowered.split("#", 1)[0].split("?", 1)[0]
    return without_suffixes.endswith("pdf")


# --------------------------------------------------------------------- PDF

def head_last_modified(http: RateLimitedSession, url: str) -> str | None:
    """Resolve Last-Modified for a PDF URL. Returns ISO 8601 or None.

    Network failures and non-200 responses are logged and yield ``None``.
    """
    try:
        r = http.head(url)
    except requests.RequestException as exc:
        log.warning("HEAD failed for %s: %s", url, exc)
        return None
    if r.status_code != 200:
        log.warning("HEAD %s -> HTTP %d", url, r.status_code)
        return None
    return iso_from_http_date(r.headers.get("Last-Modified"))


def fetch_pdf_text(http: RateLimitedSession, url: str) -> tuple[str, int, bool]:
    """Download a PDF and return ``(text, page_count, has_text_layer)``.

    Concatenates all pages, normalizes whitespace, and collapses runs of
    blank lines so the resulting markdown diffs cleanly. ``has_text_layer``
    is False for scanned PDFs whose pypdf extract produced no text.

    Raises:
        requests.HTTPError: If the download is not successful.
    """
    r = http.get(url)
    r.raise_for_status()
    content_type = r.headers.get("Content-Type")
    if "pdf" not in (content_type or "").lower():
        # Only a warning: parsing is still attempted below.
        log.warning("expected PDF Content-Type at %s, got %s", url, content_type)

    reader = PdfReader(io.BytesIO(r.content))
    page_count = len(reader.pages)
    chunks: list[str] = []
    for page in reader.pages:
        try:
            text = page.extract_text() or ""
        except Exception as exc:  # noqa: BLE001
            # One unreadable page must not sink the whole document.
            log.warning("pypdf extract_text failed on a page of %s: %s", url, exc)
            text = ""
        chunks.append(text)

    normalized = normalize_text("\n\n".join(chunks))
    # normalize_text always appends a newline, so strip before testing.
    return normalized, page_count, bool(normalized.strip())


def normalize_text(s: str) -> str:
    """Tidy extracted PDF text; the result always ends in one newline."""
    # Replace the NBSPs that pypdf often leaves behind with plain spaces,
    # strip trailing spaces per line, and collapse 3+ newlines to 2.
    s = s.replace("\u00a0", " ")
    s = re.sub(r"[ \t]+\n", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip() + "\n"


# --------------------------------------------------------------------- write

def write_product(prod: BayerProduct, body_md: str) -> None:
    """Write the canonical sidecar + markdown body.

    Creates ``CORPUS_DIR`` if needed, then writes ``<slug>.md`` (a short
    header followed by ``body_md``) and ``<slug>.json``. See
    scrape/README.md for the schema.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.slug}.md"
    json_path = CORPUS_DIR / f"{prod.slug}.json"

    # Lightweight markdown frontmatter for human eyeballing — canonical
    # metadata lives in the sidecar.
    title = prod.product_name or prod.slug
    ingredient_names = [a["name"] for a in prod.active_ingredients if a.get("name")]
    ai_summary = ", ".join(ingredient_names) or "(unknown)"
    header = (
        f"# {title}\n\n"
        f"- **Product class:** {prod.product_class}\n"
        f"- **EPA Reg No:** {prod.epa_reg_no or '(unknown)'}\n"
        f"- **Active ingredients:** {ai_summary}\n"
        f"- **Source:** {prod.source_page_url}\n"
        f"- **Label PDF:** {prod.label_url or '(none on page)'}\n\n"
        "---\n\n"
    )
    md_path.write_text(header + body_md, encoding="utf-8")

    label_meta = {
        "url": prod.label_url,
        "filename": prod.label_filename,
        "accepted_date": None,
        "last_modified": prod.label_last_modified,
        "page_count": prod.label_page_count,
        "text_layer": prod.label_text_layer,
    }
    supplemental_meta = [
        {
            "kind": s.kind,
            "title": s.title,
            "url": s.url,
            "last_modified": s.last_modified,
        }
        for s in prod.supplemental_pdfs
    ]
    # Keys this source cannot populate are written as explicit nulls.
    sidecar = {
        "source": "bayer",
        "source_key": prod.slug,
        "epa_reg_no": prod.epa_reg_no,
        "product_name": prod.product_name,
        "product_class": prod.product_class,
        "registrant": None,
        "active_ingredients": prod.active_ingredients,
        "signal_word": None,
        "label": label_meta,
        "supplemental_documents": supplemental_meta,
        "source_urls": {
            "product_page": prod.source_page_url,
            "label_api": None,
            "label_index": None,
        },
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
# --------------------------------------------------------------------- pipeline

def process_product(
    http: RateLimitedSession,
    prod: BayerProduct,
    *,
    force: bool,
    seen_regs: set[str] | None = None,
) -> str:
    """Fetch detail + PDF and write to disk.

    Returns a status string suitable for logging: ``written``, ``skipped``,
    ``dup-skip``, ``no-pdf``, ``failed``.

    ``seen_regs``, if provided, is mutated: the EPA reg no of a product
    written by this call (status ``written`` or ``no-pdf``) is added so
    subsequent calls within the same run can dedup against products served
    under multiple catalog product-type queries (the seed-treatment query
    in particular re-serves herbicide / fungicide / insecticide products
    that have seed-treatment use sites).

    Args:
        http: Session used for every request.
        prod: Catalog stub; filled in place as the pipeline progresses.
        force: When true, reprocess even if ``<slug>.md`` already exists.
        seen_regs: Optional set of EPA reg nos already handled.
    """
    md_path = CORPUS_DIR / f"{prod.slug}.md"
    if md_path.exists() and not force:
        return "skipped"

    try:
        fetch_product_detail(http, prod)
    except Exception as exc:  # noqa: BLE001
        log.error("detail fetch failed for %s: %s", prod.slug, exc)
        return "failed"

    # Dedup: same EPA reg no already written in this run under a
    # different catalog product-type (and thus a different slug).
    if seen_regs is not None and prod.epa_reg_no and prod.epa_reg_no in seen_regs:
        log.info("dup-skip %s (epa=%s already processed under canonical class)",
                 prod.slug, prod.epa_reg_no)
        return "dup-skip"

    # Resolve Last-Modified for label + supplementals (HEAD only, cheap).
    if prod.label_url:
        prod.label_last_modified = head_last_modified(http, prod.label_url)
    for s in prod.supplemental_pdfs:
        s.last_modified = head_last_modified(http, s.url)

    if not prod.label_url:
        # Some Bayer products have no public label PDF (e.g. product was
        # discontinued or the page only carries a Product Bulletin). We
        # still record the metadata sidecar so the catalog is complete,
        # but write a stub body so the file count reflects reality.
        log.info("%s — no label PDF; writing metadata only", prod.slug)
        prod.label_text_layer = False
        write_product(prod, "_(No label PDF was found on the product page.)_\n")
        _remember_reg(seen_regs, prod)
        return "no-pdf"

    try:
        body, page_count, text_layer = fetch_pdf_text(http, prod.label_url)
    except Exception as exc:  # noqa: BLE001
        log.error("PDF fetch/extract failed for %s (%s): %s",
                  prod.slug, prod.label_url, exc)
        return "failed"

    prod.label_page_count = page_count
    prod.label_text_layer = text_layer
    if not body.strip():
        log.warning("%s — extracted PDF was empty (scanned?)", prod.slug)
        body = "[SCANNED PDF — OCR REQUIRED]\n"

    write_product(prod, body)
    _remember_reg(seen_regs, prod)
    return "written"


def _remember_reg(seen_regs: set[str] | None, prod: BayerProduct) -> None:
    """Add the product's EPA reg no to ``seen_regs`` when both are present."""
    if seen_regs is not None and prod.epa_reg_no:
        seen_regs.add(prod.epa_reg_no)


def _load_seen_regs() -> set[str]:
    """Hydrate the seen-EPA-reg-no set from existing sidecars on disk so
    dedup survives across runs (e.g., a re-run with the seed-treatment query
    won't re-write products already on disk under their canonical slug).

    Sidecars that cannot be read, are not valid UTF-8 JSON, or do not hold
    a JSON object with a non-empty string ``epa_reg_no`` are skipped.
    """
    seen: set[str] = set()
    if not CORPUS_DIR.exists():
        return seen
    for sidecar_path in CORPUS_DIR.glob("*.json"):
        try:
            data = json.loads(sidecar_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError, so one corrupt file cannot abort the run.
            continue
        if not isinstance(data, dict):
            continue
        reg = data.get("epa_reg_no")
        if isinstance(reg, str) and reg:
            seen.add(reg)
    return seen


def run(
    *,
    limit: int | None,
    force: bool,
    only_product: str | None,
    only_class: str | None,
) -> int:
    """Walk the catalog, process the selected products, and log a summary.

    Args:
        limit: Maximum number of products to process (``None`` = all).
            Products reported as ``skipped`` count toward the limit.
        force: Reprocess products already on disk and start with an empty
            dedup set.
        only_product: Keep only the product whose slug or catalog slug
            equals this value.
        only_class: Keep only products of this product class.

    Returns:
        Process exit code: 0 on success, 1 if any product failed, 2 if
        ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    build_id = fetch_build_id(http)

    products: list[BayerProduct] = []
    for prod in walk_catalog(http, build_id):
        if only_class and prod.product_class != only_class:
            continue
        if only_product and only_product not in (prod.slug, prod.catalog_slug):
            continue
        products.append(prod)

    if only_product and not products:
        log.error("no product matched --product=%s", only_product)
        return 2

    log.info("catalog yielded %d candidate product(s)", len(products))

    # Seed the dedup set from disk so re-runs and force-runs both behave.
    seen_regs: set[str] = set() if force else _load_seen_regs()
    if seen_regs:
        log.info("dedup: %d EPA reg nos pre-loaded from existing corpus", len(seen_regs))

    counts = {"written": 0, "skipped": 0, "dup-skip": 0, "no-pdf": 0, "failed": 0}
    # Test against None so that an explicit limit of 0 is not shown as "all".
    limit_label = "all" if limit is None else str(limit)
    processed = 0
    for prod in products:
        if limit is not None and processed >= limit:
            break
        processed += 1
        # process_product records the reg no in seen_regs on success.
        status = process_product(http, prod, force=force, seen_regs=seen_regs)
        counts[status] = counts.get(status, 0) + 1
        log.info(
            "[%d/%s] %s %s | class=%s epa=%s ai=%s label=%s",
            processed,
            limit_label,
            prod.slug,
            status,
            prod.product_class,
            prod.epa_reg_no or "-",
            ",".join(a["name"] for a in prod.active_ingredients if a.get("name")) or "-",
            prod.label_url or "-",
        )

    log.info(
        "done: processed=%d written=%d skipped=%d dup-skip=%d no-pdf=%d failed=%d",
        processed,
        counts["written"],
        counts["skipped"],
        counts["dup-skip"],
        counts["no-pdf"],
        counts["failed"],
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI

def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.bayer",
        description="Scrape Bayer Crop Science US product labels.",
    )
    p.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N products (default: all).",
    )
    p.add_argument(
        "--force",
        action="store_true",
        help="Re-download even if the markdown file already exists.",
    )
    p.add_argument(
        "--product",
        default=None,
        help="Process a single product by slug (e.g. 'warrant' or "
             "'warrant-herbicide').",
    )
    p.add_argument(
        "--class",
        dest="product_class",  # "class" is a Python keyword
        default=None,
        choices=sorted(set(PRODUCT_CLASS.values())),
        help="Limit to one product class.",
    )
    p.add_argument(
        "--log-level",
        default=os.environ.get("LOG_LEVEL", "INFO"),
        help="Python logging level (default INFO).",
    )
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, configure logging to stderr, and
    return the exit code produced by :func:`run`."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_product=args.product,
        only_class=args.product_class,
    )


if __name__ == "__main__":
    sys.exit(main())