"""Bayer seeds scraper — DEKALB (corn) + Asgrow (soy) + WestBred (wheat).

The brand table (``BRAND_PATHS``) additionally covers DEKALB silage /
sorghum / canola, Channel (corn / soybeans / silage / sorghum) and
Deltapine (cotton).

Source: ``www.cropscience.bayer.us`` — the same Next.js + ``__NEXT_DATA__``
infrastructure that powers Bayer's crop-protection catalog (which
``crop-chem-docs`` already scrapes). robots.txt explicitly whitelists
*"artificial intelligence retrieval augmented generation"* use of the
content, which is what this corpus feeds.

Discovery: ``/sitemap-dynamic.xml`` enumerates every variety URL under
``/corn/dekalb/``, ``/soybeans/asgrow/``, ``/wheat/westbred/`` — counts on
2026-05-25: 288 / 102 / 85 = 475 total, matching recon. The seed catalog
landing pages SSR only the first 12 of N products via React Query state
hydration; we sidestep that entirely by walking the sitemap.

Per-variety detail comes from the product page itself. Each page embeds a
full ``__NEXT_DATA__`` JSON island whose ``props.pageProps.productDetails``
carries:

- Identity: ``brand``, ``crop``, ``productId``, ``hybridLabel``,
  ``hybridPrefix``, ``hybridSuffix``, ``releaseYear``
- Maturity: ``relativeMaturity`` (corn = RM days, soy = MG, wheat =
  qualitative early/medium/late)
- Traits: ``traits[]`` of ``{trait, traitFullName}``
- Narrative: ``positioningStatement``, ``strengthsAndManagement[]``
- Ratings: ``characteristics[]`` of
  ``{label, type, items: [{characteristic, value}]}`` — groups vary by crop:
    corn:  DISEASE RATINGS / GROWTH / MANAGEMENT / HARVEST / HERBICIDE /
           PLANT DESCRIPTION
    soy:   DISEASE RATINGS / SENSITIVITY / MANAGEMENT / PLANT DESCRIPTION /
           PRODUCTION
    wheat: KEY CHARACTERISTICS / MANAGEMENT / PRODUCTION / QUALITY /
           PEST AND DISEASE RESISTANCE
- Regional: ``localProfiles[]`` of regional seed-guide listings incl.
  agronomist name + email

Bayer ratings are on the canonical **1-9 (9 = best)** scale already, so no
flip is needed (unlike Golden Harvest, which is documented in CLAUDE.md).
Non-numeric ratings (S/R for soy disease resistance, gene names like Rps1c,
sensitivity letters A/B/C) are preserved verbatim — the chunker (Phase 2)
handles surfacing.

Output:
    corpus/bayer_seeds/<source_key>.md    LLM-visible body
    corpus/bayer_seeds/<source_key>.json  sidecar metadata

source_key convention: ``<brand>-<product>`` lowercased — derived from the
URL terminal slug minus the trailing crop suffix
(``-corn``/``-soybeans``/``-wheat``). E.g. ``dekalb-dkc075-70rib`` or
``asgrow-ag005xf3``. Secondary crops of a multi-crop brand keep their crop
suffix (see ``source_key_from_url``).

CLI:
    python -m scrape.sources.bayer_seeds --limit 5
    python -m scrape.sources.bayer_seeds --brand dekalb --limit 20
    python -m scrape.sources.bayer_seeds --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests

SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

BASE = "https://www.cropscience.bayer.us"
SITEMAP_URL = f"{BASE}/sitemap-dynamic.xml"

# All Bayer brand × crop paths in the cropscience.bayer.us sitemap.
# Each entry: (brand_key, url_path_prefix, crop_label, is_primary_for_brand).
#
# `is_primary` controls source_key derivation: for a brand's primary
# crop we STRIP the trailing crop suffix from the URL tail (so
# DEKALB corn `dekalb-dkc62-08rib-corn` → source_key
# `dekalb-dkc62-08rib`, matching the corpus we deployed 2026-05-25).
# For non-primary crops we KEEP the suffix (so DEKALB silage
# `dekalb-dkc093-05rib-silage` → source_key
# `dekalb-dkc093-05rib-silage`, distinct from the corn key and
# collision-safe when the same SKU is marketed as both grain and
# silage).
#
# Counts as of 2026-05-25 sitemap walk:
#   DEKALB    corn=288  silage=82  sorghum=18  canola=6
#   Asgrow    soy=102
#   WestBred  wheat=85
#   Channel   corn=181  soy=67  silage=54  sorghum=18
#   Deltapine cotton=30
BRAND_PATHS: list[tuple[str, str, str, bool]] = [
    ("dekalb", "/corn/dekalb/", "corn", True),
    ("dekalb", "/silage/dekalb/", "silage", False),
    ("dekalb", "/sorghum/dekalb/", "sorghum", False),
    ("dekalb", "/canola/dekalb/", "canola", False),
    ("asgrow", "/soybeans/asgrow/", "soybeans", True),
    ("westbred", "/wheat/westbred/", "wheat", True),
    ("channel", "/corn/channel/", "corn", True),
    ("channel", "/soybeans/channel/", "soybeans", False),
    ("channel", "/silage/channel/", "silage", False),
    ("channel", "/sorghum/channel/", "sorghum", False),
    ("deltapine", "/cotton/deltapine/", "cotton", True),
]

# Distinct brand-key list for the --brand CLI choices.
BRANDS = sorted({b for b, _p, _c, _pri in BRAND_PATHS})

# Catalog/landing pages that live under the brand path but are NOT
# individual varieties. Skip these during discovery.
NON_VARIETY_PATH_TAILS = {
    "seed-catalog",
    "product-compare",
    "find-a-dealer",
    "find-a-rep",
    "saved-products",
}

# Bayer publishes seed ratings on the canonical 1-9 scale (9 = best),
# unlike Golden Harvest. This goes into the sidecar so the chunker
# knows not to flip.
RATING_SCALE_DIRECTION = "1-9 (9 = best)"

# Repo root: scrape/sources/bayer_seeds.py -> 3 parents up.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "bayer_seeds"

REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.bayer_seeds")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper with sleep-based rate limiting and
    polite retries on 429/5xx.

    Lifted from crop-chem-docs' Bayer scraper — same host, same
    politeness story.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request start.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until at least ``self.interval`` seconds separate requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Issue one rate-limited request, retrying transient failures.

        Network errors and HTTP 429/5xx responses are retried with
        exponential backoff plus jitter (capped at 30s); a numeric
        ``Retry-After`` header takes precedence for HTTP retries.

        Returns the first non-retryable response, or — when every attempt
        was retryable — the response of the final attempt so the caller's
        ``raise_for_status()`` reports the real HTTP status.

        Raises:
            ValueError: if ``max_retries`` is less than 1.
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(max_retries):
            final_attempt = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Forget any earlier response: the outcome reported to the
                # caller must reflect the most recent attempt.
                resp, last_exc = None, exc
                if final_attempt:
                    break  # no point backing off when nothing follows
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning(
                    "network error on %s %s: %s — retry in %.1fs",
                    method, url, exc, backoff,
                )
                time.sleep(backoff)
                continue

            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final_attempt:
                    break
                ra = resp.headers.get("Retry-After")
                backoff = (
                    float(ra)
                    if (ra and ra.isdigit())
                    else min(30.0, (2 ** attempt) + random.random())
                )
                log.warning(
                    "HTTP %d on %s %s — retry in %.1fs",
                    resp.status_code, method, url, backoff,
                )
                time.sleep(backoff)
                continue
            return resp

        log.warning("giving up on %s %s after %d attempts", method, url, max_retries)
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class BayerSeedProduct:
    """Everything scraped for one variety page."""

    # Identity
    source_key: str  # e.g. "dekalb-dkc075-70rib"
    source_url: str  # full product page URL
    brand: str  # upper-cased, e.g. "DEKALB" | "ASGROW" | "WESTBRED"
    crop: str  # lower-cased, e.g. "corn" | "soybeans" | "wheat"
    product_name: str = ""  # hybridLabel, e.g. "DKC075-70RIB BRAND BLEND"
    product_id: str | None = None  # full Bayer productId
    hybrid_prefix: str | None = None  # e.g. "DKC075-70RIB"
    hybrid_suffix: str | None = None  # e.g. "BRAND BLEND"
    release_year: int | None = None

    # Maturity — semantics vary by crop, value preserved as-published.
    relative_maturity: str | None = None  # corn: RM days as string; wheat: qualitative
    maturity_group: str | None = None  # soy MG as string
    wheat_class: str | None = None  # not exposed in productDetails — left null

    # Traits
    trait_codes: list[str] = field(default_factory=list)  # ["VT2PRIB"]
    trait_descriptions: list[str] = field(default_factory=list)  # full names

    # Narrative
    positioning_statement: str | None = None
    strengths: list[str] = field(default_factory=list)

    # Ratings — preserved as the source's grouped form. The chunker
    # re-buckets into the canonical disease/agronomic flats from
    # seed-mcp/CLAUDE.md.
    characteristics_groups: list[dict] = field(default_factory=list)

    # Regional recommendations (Bayer's "local profiles").
    regional_recommendations: list[dict] = field(default_factory=list)

    # Media
    image_url: str | None = None

    # Discovery
    sitemap_last_modified: str | None = None


# --------------------------------------------------------------------- helpers

# The JSON island Next.js embeds in every server-rendered page. Attribute
# order is not assumed, so extra attributes (type, nonce, ...) are tolerated.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>', re.S
)

# One sitemap entry: <url><loc>…</loc>[<lastmod>…</lastmod>].
_SITEMAP_ENTRY_RE = re.compile(
    r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?"
)


def parse_next_data(html: str) -> dict[str, Any]:
    """Pull the ``__NEXT_DATA__`` JSON blob out of a Next.js page.

    Raises:
        RuntimeError: if the page has no ``__NEXT_DATA__`` script tag.
        json.JSONDecodeError: if the tag's content is not valid JSON.
    """
    m = _NEXT_DATA_RE.search(html)
    if not m:
        raise RuntimeError("no __NEXT_DATA__ script tag found")
    return json.loads(m.group(1))


def source_key_from_url(url: str, brand: str, crop: str, is_primary: bool) -> str:
    """Derive the ``<brand>-<product>`` slug from the product URL.

    For a brand's PRIMARY crop (DEKALB/corn, Asgrow/soybeans,
    WestBred/wheat, Channel/corn, Deltapine/cotton): strip the trailing
    ``-<crop>`` suffix Bayer puts on every URL — so
    ``dekalb-dkc075-70rib-corn`` becomes ``dekalb-dkc075-70rib``.

    For SECONDARY crops on a multi-crop brand (DEKALB silage / sorghum /
    canola; Channel soybeans / silage / sorghum): KEEP the crop suffix so
    the same SKU marketed under multiple crops gets distinct source_keys
    and `lookup_variety(...)` stays unambiguous.

    ``brand`` is accepted for signature symmetry with the discovery tuple;
    the key is taken entirely from the URL's terminal path segment.
    """
    tail = url.rstrip("/").rsplit("/", 1)[-1].lower()
    if is_primary:
        suffix = f"-{crop}"
        if tail.endswith(suffix):
            tail = tail[: -len(suffix)]
    return tail


def looks_like_variety_url(url: str, brand_path: str) -> bool:
    """True if ``url`` is a per-variety product page under ``brand_path``
    (not a catalog/landing page or sub-tool)."""
    rest = url.split(brand_path, 1)[-1].strip("/")
    if not rest or "/" in rest:
        return False  # empty (the brand index) or a sub-path tool
    if rest in NON_VARIETY_PATH_TAILS:
        return False
    return True


def _md_cell(text: str) -> str:
    """Make ``text`` safe for a single-line markdown table cell."""
    return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")


# --------------------------------------------------------------------- discovery


def discover_varieties(
    http: RateLimitedSession,
    *,
    only_brand: str | None = None,
    only_crop: str | None = None,
) -> list[tuple[str, str, str, bool, str]]:
    """Return ``[(url, brand, crop, is_primary, lastmod), ...]`` for every
    Bayer seed variety found in the dynamic sitemap.

    ``brand`` is the lowercase brand key (one of ``BRANDS``).
    ``crop`` is the crop label (corn/soybeans/wheat/silage/sorghum/
    canola/cotton) determined by the URL path segment.
    ``is_primary`` is True when this is the brand's primary crop —
    drives the source_key suffix-stripping rule.
    ``lastmod`` is the timestamp string from the sitemap entry, or ``""``
    when the entry has none.

    Raises:
        requests.HTTPError: if the sitemap fetch returns an error status.
    """
    log.info("fetching sitemap %s", SITEMAP_URL)
    r = http.get(SITEMAP_URL)
    r.raise_for_status()
    entries = _SITEMAP_ENTRY_RE.findall(r.text)
    log.info("sitemap parsed: %d total URLs", len(entries))

    # Apply the CLI filters once instead of per sitemap entry.
    wanted_paths = [
        (brand, brand_path, crop, is_primary)
        for brand, brand_path, crop, is_primary in BRAND_PATHS
        if (not only_brand or brand == only_brand)
        and (not only_crop or crop == only_crop)
    ]

    out: list[tuple[str, str, str, bool, str]] = []
    for raw_url, raw_lastmod in entries:
        url = raw_url.strip()
        lastmod = raw_lastmod.strip()
        for brand, brand_path, crop, is_primary in wanted_paths:
            if brand_path in url and looks_like_variety_url(url, brand_path):
                out.append((url, brand, crop, is_primary, lastmod))
                break

    by_brand_crop: dict[tuple[str, str], int] = {}
    for _, b, c, _, _ in out:
        by_brand_crop[(b, c)] = by_brand_crop.get((b, c), 0) + 1
    log.info(
        "variety URLs found: %s (total=%d)",
        ", ".join(f"{b}/{c}={n}" for (b, c), n in sorted(by_brand_crop.items())),
        len(out),
    )
    return out


# --------------------------------------------------------------------- detail


def fetch_product_detail(
    http: RateLimitedSession,
    url: str,
    brand: str,
    crop: str,
    is_primary: bool,
    lastmod: str,
) -> BayerSeedProduct:
    """Fetch + parse one product page into a ``BayerSeedProduct``.

    Raises:
        requests.HTTPError: if the page fetch returns an error status.
        RuntimeError: if the page has no ``__NEXT_DATA__`` island or the
            island carries no ``productDetails``.
    """
    r = http.get(url)
    r.raise_for_status()
    data = parse_next_data(r.text)
    pp = (data.get("props") or {}).get("pageProps") or {}
    details = pp.get("productDetails") or {}
    if not details:
        # Without productDetails we would write an empty stub that later
        # runs skip as "already scraped"; report a failure instead.
        raise RuntimeError("no productDetails in __NEXT_DATA__")

    prod = BayerSeedProduct(
        source_key=source_key_from_url(url, brand, crop, is_primary),
        source_url=url,
        brand=(details.get("brand") or brand).upper(),
        crop=(details.get("crop") or crop).lower(),
        sitemap_last_modified=lastmod or None,
    )
    prod.product_name = (
        details.get("hybridLabel") or details.get("productName") or prod.source_key
    )
    prod.product_id = details.get("productId")
    prod.hybrid_prefix = details.get("hybridPrefix")
    prod.hybrid_suffix = details.get("hybridSuffix")

    ry = details.get("releaseYear")
    if isinstance(ry, int):
        prod.release_year = ry
    elif isinstance(ry, str) and ry.isdigit():
        prod.release_year = int(ry)

    # Maturity routing per crop. Source stores every flavour in
    # `relativeMaturity`; we split by crop semantics.
    rm = details.get("relativeMaturity")
    if rm is not None:
        rm_str = str(rm)
        if prod.crop == "soybeans":
            prod.maturity_group = rm_str
        else:
            # Corn RM days, WestBred's qualitative Early/Medium/Late, and
            # whatever the secondary crops (silage, sorghum, canola,
            # cotton) publish — all preserved as-published.
            #
            # For wheat the class (HRW/HRS/SWW/...) is not in
            # productDetails — it's only in the marketing narrative. We
            # surface what we have; a future enrichment step can parse
            # the narrative if needed, so `wheat_class` stays None.
            prod.relative_maturity = rm_str

    # Traits
    for t in details.get("traits") or []:
        if not isinstance(t, dict):
            continue
        code = t.get("trait")
        full = t.get("traitFullName")
        if code:
            prod.trait_codes.append(code)
        if full:
            prod.trait_descriptions.append(full)

    # Narrative
    prod.positioning_statement = details.get("positioningStatement")
    sm = details.get("strengthsAndManagement") or details.get("strengths") or []
    if isinstance(sm, list):
        prod.strengths = [str(s).strip() for s in sm if s]

    # Ratings groups — preserved verbatim (label / type / items).
    chars = details.get("characteristics") or []
    cleaned_groups: list[dict] = []
    for g in chars:
        if not isinstance(g, dict):
            continue
        items = [
            {
                "characteristic": (it.get("characteristic") or "").strip(),
                "value": ("" if it.get("value") is None else str(it.get("value"))).strip(),
            }
            for it in (g.get("items") or [])
            if isinstance(it, dict) and it.get("characteristic")
        ]
        if not items:
            continue
        cleaned_groups.append({
            "label": (g.get("label") or "").strip(),
            "type": (g.get("type") or "").strip(),
            "items": items,
        })
    prod.characteristics_groups = cleaned_groups

    # Regional recommendations.
    lp = details.get("localProfiles") or []
    if isinstance(lp, list):
        for p in lp:
            if not isinstance(p, dict):
                continue
            prod.regional_recommendations.append({
                "product_list_name": p.get("productListName"),
                "agronomist": p.get("agronomist"),
                "agronomist_email": p.get("agronomistEmailAddress"),
                "variant_id": p.get("variantId"),
            })

    # Image (just the first one)
    imgs = pp.get("images") or []
    if isinstance(imgs, list) and imgs and isinstance(imgs[0], dict):
        prod.image_url = imgs[0].get("url")

    return prod


# --------------------------------------------------------------------- render


def render_markdown(p: BayerSeedProduct) -> str:
    """Build the markdown body for the variety.

    The Phase 2 chunker will rewrite chunk_0 with a tighter preamble;
    this is the readable copy for today, and it already covers
    everything searchable.
    """
    title = p.product_name or p.source_key
    crop_label = p.crop.capitalize()

    maturity_lines: list[str] = []
    if p.crop == "soybeans":
        if p.maturity_group is not None:
            maturity_lines.append(f"- **Maturity group:** {p.maturity_group}")
    elif p.relative_maturity is not None:
        # Wheat maturity is qualitative, so it gets the plainer label;
        # every other crop publishes a relative-maturity figure.
        label = "Maturity" if p.crop == "wheat" else "Relative maturity"
        maturity_lines.append(f"- **{label}:** {p.relative_maturity}")
    if p.wheat_class:
        maturity_lines.append(f"- **Wheat class:** {p.wheat_class}")

    trait_line = ""
    if p.trait_codes:
        codes = ", ".join(p.trait_codes)
        if p.trait_descriptions:
            descs = "; ".join(p.trait_descriptions)
            trait_line = f"- **Traits:** {codes} ({descs})"
        else:
            trait_line = f"- **Traits:** {codes}"

    header_lines = [
        f"# {title}",
        "",
        "- **Vendor:** Bayer",
        f"- **Brand:** {p.brand.title() if p.brand else '(unknown)'}",
        f"- **Crop:** {crop_label}",
        *maturity_lines,
    ]
    if trait_line:
        header_lines.append(trait_line)
    if p.release_year:
        header_lines.append(f"- **Release year:** {p.release_year}")
    header_lines.append(f"- **Source:** {p.source_url}")
    header_lines.append(f"- **Rating scale (Bayer):** {RATING_SCALE_DIRECTION}")
    header_lines.append("")
    header_lines.append("---")
    header_lines.append("")

    sections: list[str] = []
    if p.positioning_statement:
        sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n")

    if p.strengths:
        bullets = "\n".join(f"- {s}" for s in p.strengths)
        sections.append("## Strengths & management\n\n" + bullets + "\n")

    # Render each characteristics group as its own table for readability.
    for g in p.characteristics_groups:
        label = g.get("label") or "Characteristics"
        items = g.get("items") or []
        if not items:
            continue
        table_rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label.title()}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{table_rows}\n"
        )

    if p.regional_recommendations:
        # The same listing can repeat across local profiles; emit each
        # (name, agronomist) pair once, in first-seen order.
        seen: set[str] = set()
        listing_lines: list[str] = []
        for r in p.regional_recommendations:
            name = (r.get("product_list_name") or "").strip()
            agronomist = (r.get("agronomist") or "").strip()
            key = f"{name}||{agronomist}"
            if key in seen or not name:
                continue
            seen.add(key)
            listing_lines.append(
                f"- **{name}** — agronomist: {agronomist or '(unlisted)'}"
            )
        if listing_lines:
            sections.append(
                "## Regional seed-guide listings\n\n" + "\n".join(listing_lines) + "\n"
            )

    return "\n".join(header_lines) + "\n".join(sections)


# --------------------------------------------------------------------- write


def write_product(prod: BayerSeedProduct, body_md: str) -> None:
    """Write the markdown body + sidecar JSON.

    Schema documented in seed-mcp/CLAUDE.md.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "bayer_seeds",
        "source_key": prod.source_key,
        "vendor": "Bayer",
        "brand": prod.brand,
        "product_name": prod.product_name,
        "product_id": prod.product_id,
        "hybrid_prefix": prod.hybrid_prefix,
        "hybrid_suffix": prod.hybrid_suffix,
        "crop": prod.crop,
        "release_year": prod.release_year,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": prod.wheat_class,
        "trait_stack": prod.trait_codes,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": prod.positioning_statement,
        "strengths": prod.strengths,
        # Raw grouped ratings preserved as published. Chunker re-buckets
        # into canonical disease/agronomic flats per CLAUDE.md schema.
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": prod.regional_recommendations,
        "image_url": prod.image_url,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": prod.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    # Sidecar first, markdown last: `process_product` treats an existing
    # .md as "already scraped", so the .md must never exist without its
    # sidecar if the run dies between the two writes.
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    md_path.write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    brand: str,
    crop: str,
    is_primary: bool,
    lastmod: str,
    force: bool,
) -> tuple[str, BayerSeedProduct | None]:
    """Fetch, render and write one variety.

    Returns ``(status, prod or None)`` where status is one of
    ``written`` / ``skipped`` / ``failed``. A variety is skipped when its
    markdown file already exists and ``force`` is false.
    """
    source_key = source_key_from_url(url, brand, crop, is_primary)
    md_path = CORPUS_DIR / f"{source_key}.md"
    if md_path.exists() and not force:
        return "skipped", None
    try:
        prod = fetch_product_detail(http, url, brand, crop, is_primary, lastmod)
    except Exception as exc:  # noqa: BLE001
        # One bad page must not abort the whole crawl; record and move on.
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None
    body = render_markdown(prod)
    write_product(prod, body)
    return "written", prod


def run(
    *,
    limit: int | None,
    force: bool,
    only_brand: str | None,
    only_crop: str | None,
    only_product: str | None,
) -> int:
    """Discover and process varieties; return the process exit code.

    Exit codes: 0 = no failures, 1 = at least one variety failed,
    2 = ``only_product`` matched nothing in the sitemap.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_varieties(http, only_brand=only_brand, only_crop=only_crop)
    if only_product:
        # Accept either the derived source_key or the raw URL slug.
        targets = [
            (u, b, c, p, lm)
            for (u, b, c, p, lm) in targets
            if source_key_from_url(u, b, c, p) == only_product
            or u.rstrip("/").rsplit("/", 1)[-1].lower() == only_product
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    limit_label = str(limit) if limit is not None else "all"
    counts = {"written": 0, "skipped": 0, "failed": 0}
    processed = 0
    for url, brand, crop, is_primary, lastmod in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(
            http,
            url=url,
            brand=brand,
            crop=crop,
            is_primary=is_primary,
            lastmod=lastmod,
            force=force,
        )
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | brand=%s crop=%s rm/mg=%s traits=%s groups=%d",
                processed,
                limit_label,
                prod.source_key,
                status,
                prod.brand,
                prod.crop,
                prod.relative_maturity or prod.maturity_group or "-",
                ",".join(prod.trait_codes) or "-",
                len(prod.characteristics_groups),
            )
        else:
            log.info(
                "[%d/%s] %s %s",
                processed,
                limit_label,
                source_key_from_url(url, brand, crop, is_primary),
                status,
            )

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (out of %d candidates)",
        processed,
        counts["written"],
        counts["skipped"],
        counts["failed"],
        len(targets),
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI

_ALL_CROPS = sorted({c for _b, _p, c, _pri in BRAND_PATHS})


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.bayer_seeds",
        description="Scrape Bayer seed varieties — DEKALB / Asgrow / "
                    "WestBred / Channel / Deltapine across corn, "
                    "soybeans, wheat, silage, sorghum, canola, cotton.",
    )
    p.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).",
    )
    p.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    p.add_argument(
        "--brand", default=None, choices=BRANDS,
        help="Limit to one Bayer seed brand.",
    )
    p.add_argument(
        "--crop", default=None, choices=_ALL_CROPS,
        help="Limit to one crop. Useful for incrementally backfilling "
             "(e.g. `--crop sorghum` to grab just the sorghum lines).",
    )
    p.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key "
             "(e.g. 'dekalb-dkc62-08rib') or terminal URL slug.",
    )
    p.add_argument(
        "--log-level", default=os.environ.get("LOG_LEVEL", "INFO"),
        help="Python logging level (default INFO).",
    )
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, configure logging, run the scrape."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_brand=args.brand,
        only_crop=args.crop,
        only_product=args.product,
    )


if __name__ == "__main__":
    sys.exit(main())