Files
seed-mcp/scrape/sources/bayer_seeds.py
T
justin 2a4c0d4aba bayer_seeds: implement Phase 1 scraper for DEKALB + Asgrow + WestBred
Replace stub with working scraper for all three Bayer seed brands.
Discovery uses the public sitemap-dynamic.xml (475 varieties:
288 DEKALB corn + 102 Asgrow soy + 85 WestBred wheat — matches recon).
Per-variety detail comes from the page's __NEXT_DATA__ JSON island.

Each variety writes corpus/bayer_seeds/<source_key>.{md,json} with:
- Identity (brand, crop, hybridLabel, productId, releaseYear)
- Maturity routed per crop (RM for corn, MG for soy, qualitative for wheat)
- Trait stack (code + full name)
- Positioning + strengths narrative
- Characteristics groups (DISEASE RATINGS, GROWTH, MANAGEMENT, HARVEST,
  etc.) preserved verbatim from source so the chunker can re-bucket
  into canonical disease/agronomic flats per CLAUDE.md schema
- Regional seed-guide listings with agronomist contacts
- _scale_direction tag (Bayer = "1-9 (9 = best)") for chunker

Smoke-tested all three brands (--limit 2 each, plus --product, --force,
and scrape.runner dispatch). Politeness: 1 req/sec, retries on 429/5xx
with Retry-After honored.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 12:53:46 -04:00

687 lines
24 KiB
Python

"""Bayer seeds scraper — DEKALB (corn) + Asgrow (soy) + WestBred (wheat).
Source: ``www.cropscience.bayer.us`` — the same Next.js + ``__NEXT_DATA__``
infrastructure that powers Bayer's crop-protection catalog (which
``crop-chem-docs`` already scrapes). robots.txt explicitly whitelists
*"artificial intelligence retrieval augmented generation"* use of the
content, which is what this corpus feeds.
Discovery: ``/sitemap-dynamic.xml`` enumerates every variety URL under
``/corn/dekalb/``, ``/soybeans/asgrow/``, ``/wheat/westbred/`` — counts
on 2026-05-25: 288 / 102 / 85 = 475 total, matching recon. The seed
catalog landing pages SSR only the first 12 of N products via React
Query state hydration; we sidestep that entirely by walking the
sitemap.
Per-variety detail comes from the product page itself. Each page
embeds a full ``__NEXT_DATA__`` JSON island whose
``props.pageProps.productDetails`` carries:
- Identity: ``brand``, ``crop``, ``productId``,
``hybridLabel``, ``hybridPrefix``, ``hybridSuffix``,
``releaseYear``
- Maturity: ``relativeMaturity`` (corn = RM days, soy = MG,
wheat = qualitative early/medium/late)
- Traits: ``traits[]`` of ``{trait, traitFullName}``
- Narrative: ``positioningStatement``, ``strengthsAndManagement[]``
- Ratings: ``characteristics[]`` of
``{label, type, items: [{characteristic, value}]}`` —
groups vary by crop:
corn: DISEASE RATINGS / GROWTH / MANAGEMENT / HARVEST /
HERBICIDE / PLANT DESCRIPTION
soy: DISEASE RATINGS / SENSITIVITY / MANAGEMENT /
PLANT DESCRIPTION / PRODUCTION
wheat: KEY CHARACTERISTICS / MANAGEMENT / PRODUCTION /
QUALITY / PEST AND DISEASE RESISTANCE
- Regional: ``localProfiles[]`` of regional seed-guide listings
incl. agronomist name + email
Bayer ratings are on the canonical **1-9 (9 = best)** scale already,
so no flip is needed (unlike Golden Harvest, which is documented in
CLAUDE.md). Non-numeric ratings (S/R for soy disease resistance,
gene names like Rps1c, sensitivity letters A/B/C) are preserved
verbatim — the chunker (Phase 2) handles surfacing.
Output:
corpus/bayer_seeds/<source_key>.md LLM-visible body
corpus/bayer_seeds/<source_key>.json sidecar metadata
source_key convention: ``<brand>-<sku>`` lowercased — derived from the
URL terminal slug minus the trailing crop suffix
(``-corn``/``-soybeans``/``-wheat``). E.g.
``dekalb-dkc075-70rib`` or ``asgrow-ag005xf3``.
CLI:
python -m scrape.sources.bayer_seeds --limit 5
python -m scrape.sources.bayer_seeds --brand dekalb --limit 20
python -m scrape.sources.bayer_seeds --product dekalb-dkc62-08rib
python -m scrape.sources.bayer_seeds --force
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
# Recorded in every sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

BASE = "https://www.cropscience.bayer.us"
SITEMAP_URL = f"{BASE}/sitemap-dynamic.xml"

# Lowercase brand key -> (URL path segment, crop label). Discovery tests
# each sitemap URL against these paths in insertion order; the CLI
# exposes the keys (sorted) as the `--brand` choices.
BRANDS: dict[str, tuple[str, str]] = {
    "dekalb": ("/corn/dekalb/", "corn"),
    "asgrow": ("/soybeans/asgrow/", "soybeans"),
    "westbred": ("/wheat/westbred/", "wheat"),
}

# Trailing crop suffix, per brand, removed from the URL's last path
# segment to form the source_key
# (``dekalb-dkc075-70rib-corn`` -> ``dekalb-dkc075-70rib``).
CROP_SUFFIX = {
    "dekalb": "-corn",
    "asgrow": "-soybeans",
    "westbred": "-wheat",
}

# Single-segment pages under a brand path that are catalog/landing
# pages or tools rather than varieties; discovery ignores them.
NON_VARIETY_PATH_TAILS = {
    "seed-catalog",
    "product-compare",
    "find-a-dealer",
    "find-a-rep",
    "saved-products",
}

# Bayer already publishes seed ratings on the canonical 1-9 scale
# (9 = best), unlike Golden Harvest. Stored in the sidecar so the
# chunker knows no flip is required.
RATING_SCALE_DIRECTION = "1-9 (9 = best)"

# scrape/sources/bayer_seeds.py -> the third ancestor is the repo root.
REPO_ROOT = Path(__file__).resolve().parents[2]
# A non-empty CORPUS_ROOT environment variable overrides the default
# <repo>/corpus location.
CORPUS_ROOT = Path(os.getenv("CORPUS_ROOT") or REPO_ROOT.joinpath("corpus"))
CORPUS_DIR = CORPUS_ROOT / "bayer_seeds"

# Minimum spacing, in seconds, between consecutive HTTP requests.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.bayer_seeds")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """``requests.Session`` wrapper with sleep-based rate limiting and
    polite retries on 429/5xx. Lifted from crop-chem-docs' Bayer scraper
    — same host, same politeness story.

    Every attempt, retries included, first waits until at least
    ``interval`` seconds have elapsed since the previous attempt was
    released.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() value at which the previous attempt was released.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between attempts."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff with jitter for a 0-based attempt, capped at 30s."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send one request, retrying on network errors and on 429/5xx.

        Args:
            method: HTTP method name passed to ``requests``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1 so at least one attempt is always made.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``Session.request``.

        Returns:
            The first response that is not 429/5xx, or the final 429/5xx
            response once attempts are exhausted (callers decide whether
            to ``raise_for_status()``).

        Raises:
            requests.RequestException: when the final attempt fails at
                the network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final:
                    # Nothing left to retry: fail now instead of sleeping
                    # through a backoff that no attempt would follow.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if final:
                # Hand the caller the real HTTP outcome of the last attempt
                # rather than an exception left over from an earlier one.
                log.warning("HTTP %d on %s %s — giving up after %d attempts",
                            resp.status_code, method, url, attempts)
                return resp
            # Honor an integer-seconds Retry-After; any other form (e.g. an
            # HTTP date) falls back to exponential backoff.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # Unreachable: the loop runs at least once and its final iteration
        # always returns or raises.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
# --------------------------------------------------------------------- model
@dataclass
class BayerSeedProduct:
    """One Bayer seed variety as parsed from its product page.

    The four identity fields (``source_key``, ``source_url``, ``brand``,
    ``crop``) are required; every other field defaults to empty/None.
    """

    # ---- Identity -------------------------------------------------------
    source_key: str  # e.g. "dekalb-dkc075-70rib"
    source_url: str  # full product page URL
    brand: str  # "DEKALB" | "ASGROW" | "WESTBRED"
    crop: str  # "corn" | "soybeans" | "wheat"
    product_name: str = ""  # hybridLabel, e.g. "DKC075-70RIB BRAND BLEND"
    product_id: str | None = None  # full Bayer productId
    hybrid_prefix: str | None = None  # e.g. "DKC075-70RIB"
    hybrid_suffix: str | None = None  # e.g. "BRAND BLEND"
    release_year: int | None = None

    # ---- Maturity -------------------------------------------------------
    # Meaning depends on the crop; values are kept exactly as published.
    relative_maturity: str | None = None  # corn: RM days as string; wheat: qualitative
    maturity_group: str | None = None  # soy MG as string
    wheat_class: str | None = None  # not exposed in productDetails — left null

    # ---- Traits ---------------------------------------------------------
    trait_codes: list[str] = field(default_factory=list)  # ["VT2PRIB"]
    trait_descriptions: list[str] = field(default_factory=list)  # full names

    # ---- Narrative ------------------------------------------------------
    positioning_statement: str | None = None
    strengths: list[str] = field(default_factory=list)

    # ---- Ratings --------------------------------------------------------
    # Kept in the source's grouped form; the chunker re-buckets them into
    # the canonical disease/agronomic flats from seed-mcp/CLAUDE.md.
    characteristics_groups: list[dict] = field(default_factory=list)

    # ---- Regional recommendations (Bayer's "local profiles") ------------
    regional_recommendations: list[dict] = field(default_factory=list)

    # ---- Media ----------------------------------------------------------
    image_url: str | None = None

    # ---- Discovery ------------------------------------------------------
    sitemap_last_modified: str | None = None
# --------------------------------------------------------------------- helpers
# Captures the body of the ``<script id="__NEXT_DATA__" ...>`` element;
# DOTALL lets the capture span newlines.
_NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)


def parse_next_data(html: str) -> dict[str, Any]:
    """Extract and decode the ``__NEXT_DATA__`` JSON island of a Next.js page.

    Raises:
        RuntimeError: if the page has no ``__NEXT_DATA__`` script tag.
    """
    found = _NEXT_DATA_RE.search(html)
    if found is None:
        raise RuntimeError("no __NEXT_DATA__ script tag found")
    payload = found.group(1)
    return json.loads(payload)
def source_key_from_url(url: str, brand: str) -> str:
    """Build the ``<brand>-<sku>`` source key from a product URL.

    The key is the URL's last path segment, lowercased, with the brand's
    trailing ``-<crop>`` suffix removed when present
    (``dekalb-dkc075-70rib-corn`` → ``dekalb-dkc075-70rib``). A brand
    with no entry in ``CROP_SUFFIX`` leaves the segment unchanged.
    """
    slug = url.rstrip("/").rpartition("/")[2].lower()
    crop_suffix = CROP_SUFFIX.get(brand, "")
    if not crop_suffix or not slug.endswith(crop_suffix):
        return slug
    return slug[: len(slug) - len(crop_suffix)]
def looks_like_variety_url(url: str, brand_path: str) -> bool:
    """Report whether ``url`` is a single variety page under ``brand_path``.

    A variety page has exactly one non-empty path segment after
    ``brand_path``, and that segment is not one of the known
    catalog/landing/tool pages in ``NON_VARIETY_PATH_TAILS``.
    """
    remainder = url.split(brand_path, 1)[-1].strip("/")
    # Empty means the brand index itself; a slash means a nested sub-path.
    is_single_segment = bool(remainder) and "/" not in remainder
    return is_single_segment and remainder not in NON_VARIETY_PATH_TAILS
# --------------------------------------------------------------------- discovery
def discover_varieties(
    http: RateLimitedSession,
    *,
    only_brand: str | None = None,
) -> list[tuple[str, str, str, str]]:
    """List every Bayer seed variety URL present in the dynamic sitemap.

    Returns ``[(url, brand, crop, lastmod), ...]`` in sitemap order.
    ``brand`` is the lowercase key from ``BRANDS``; ``lastmod`` is the
    sitemap entry's ``<lastmod>`` text, or ``""`` when the entry has
    none. When ``only_brand`` is given, only that brand is considered.
    """
    log.info("fetching sitemap %s", SITEMAP_URL)
    response = http.get(SITEMAP_URL)
    response.raise_for_status()

    # The sitemap is flat, so a small regex is enough and avoids an XML
    # parser dependency.
    sitemap_entries = re.findall(
        r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?",
        response.text,
    )
    log.info("sitemap parsed: %d total URLs", len(sitemap_entries))

    # Brands eligible for matching, in BRANDS order.
    eligible = [
        (key, path, crop_label)
        for key, (path, crop_label) in BRANDS.items()
        if not only_brand or key == only_brand
    ]

    found: list[tuple[str, str, str, str]] = []
    for loc, modified in sitemap_entries:
        # The first eligible brand whose path matches claims the URL.
        owner = next(
            (
                (key, crop_label)
                for key, path, crop_label in eligible
                if path in loc and looks_like_variety_url(loc, path)
            ),
            None,
        )
        if owner is not None:
            found.append((loc, owner[0], owner[1], modified or ""))

    brand_keys = [entry[1] for entry in found]
    tally = {key: brand_keys.count(key) for key in sorted(set(brand_keys))}
    log.info("variety URLs found: %s (total=%d)",
             ", ".join(f"{k}={v}" for k, v in tally.items()),
             len(found))
    return found
# --------------------------------------------------------------------- detail
def _coerce_release_year(raw: Any) -> int | None:
    """Return ``raw`` as an int if it is an int or all-digit string, else None."""
    if isinstance(raw, int):
        return raw
    if isinstance(raw, str) and raw.isdigit():
        return int(raw)
    return None


def _clean_characteristic_groups(raw_groups: Any) -> list[dict]:
    """Normalize ``characteristics[]`` into ``{label, type, items}`` dicts.

    Non-dict groups/items, items without a characteristic name, and
    groups left with no items are dropped. Values are stringified and
    stripped but otherwise kept as published.
    """
    cleaned: list[dict] = []
    for group in raw_groups or []:
        if not isinstance(group, dict):
            continue
        items = [
            {
                "characteristic": (it.get("characteristic") or "").strip(),
                "value": ("" if it.get("value") is None else str(it.get("value"))).strip(),
            }
            for it in (group.get("items") or [])
            if isinstance(it, dict) and it.get("characteristic")
        ]
        if not items:
            continue
        cleaned.append({
            "label": (group.get("label") or "").strip(),
            "type": (group.get("type") or "").strip(),
            "items": items,
        })
    return cleaned


def fetch_product_detail(
    http: RateLimitedSession, url: str, brand: str, crop: str, lastmod: str
) -> BayerSeedProduct:
    """Fetch + parse one product page into a ``BayerSeedProduct``.

    Args:
        http: Session used to download the page.
        url: Product page URL.
        brand: Lowercase brand key; fallback when the page omits ``brand``.
        crop: Crop label; fallback when the page omits ``crop``.
        lastmod: Sitemap ``lastmod`` value (``""`` is stored as None).

    Raises:
        RuntimeError: if the page has no ``__NEXT_DATA__`` island or the
            island carries no ``productDetails`` object.
        requests.HTTPError: if the page responds with an error status.
    """
    r = http.get(url)
    r.raise_for_status()
    data = parse_next_data(r.text)
    pp = (data.get("props") or {}).get("pageProps") or {}
    pd = pp.get("productDetails")
    if not isinstance(pd, dict) or not pd:
        # Without productDetails there is nothing to record. Failing here
        # keeps an empty stub out of the corpus, where its markdown file
        # would cause the variety to be skipped on every later run.
        raise RuntimeError(f"no productDetails in __NEXT_DATA__ for {url}")

    prod = BayerSeedProduct(
        source_key=source_key_from_url(url, brand),
        source_url=url,
        brand=(pd.get("brand") or brand).upper(),
        crop=(pd.get("crop") or crop).lower(),
        sitemap_last_modified=lastmod or None,
    )
    prod.product_name = pd.get("hybridLabel") or pd.get("productName") or prod.source_key
    prod.product_id = pd.get("productId")
    prod.hybrid_prefix = pd.get("hybridPrefix")
    prod.hybrid_suffix = pd.get("hybridSuffix")
    prod.release_year = _coerce_release_year(pd.get("releaseYear"))

    # Maturity routing per crop. The source keeps all three flavors in
    # `relativeMaturity`; we split by crop semantics.
    rm = pd.get("relativeMaturity")
    if rm is not None:
        rm_str = str(rm)
        if prod.crop == "corn":
            prod.relative_maturity = rm_str
        elif prod.crop == "soybeans":
            prod.maturity_group = rm_str
        elif prod.crop == "wheat":
            # WestBred encodes Early/Medium/Late as the qualitative
            # maturity. The class (HRW/HRS/SWW/...) is not in
            # productDetails — only in the marketing narrative — so
            # wheat_class stays at its None default.
            prod.relative_maturity = rm_str

    # Traits. Non-dict entries are skipped, matching how the
    # characteristics and localProfiles entries are guarded.
    for t in pd.get("traits") or []:
        if not isinstance(t, dict):
            continue
        code = t.get("trait")
        full = t.get("traitFullName")
        if code:
            prod.trait_codes.append(code)
        if full:
            prod.trait_descriptions.append(full)

    # Narrative
    prod.positioning_statement = pd.get("positioningStatement")
    sm = pd.get("strengthsAndManagement") or pd.get("strengths") or []
    if isinstance(sm, list):
        prod.strengths = [str(s).strip() for s in sm if s]

    # Ratings groups — preserved in the source's grouped form.
    prod.characteristics_groups = _clean_characteristic_groups(pd.get("characteristics"))

    # Regional recommendations.
    lp = pd.get("localProfiles") or []
    if isinstance(lp, list):
        for profile in lp:
            if not isinstance(profile, dict):
                continue
            prod.regional_recommendations.append({
                "product_list_name": profile.get("productListName"),
                "agronomist": profile.get("agronomist"),
                "agronomist_email": profile.get("agronomistEmailAddress"),
                "variant_id": profile.get("variantId"),
            })

    # Image (just the first one); read from pageProps, not productDetails.
    imgs = pp.get("images") or []
    if isinstance(imgs, list) and imgs and isinstance(imgs[0], dict):
        prod.image_url = imgs[0].get("url")
    return prod
# --------------------------------------------------------------------- render
def _md_table_cell(value: Any) -> str:
    """Make ``value`` safe for one Markdown table cell.

    Line breaks collapse to a single space and ``|`` is backslash-escaped,
    so the text cannot end the cell or the row early. Text containing
    neither is returned unchanged.
    """
    flattened = re.sub(r"\s*[\r\n]+\s*", " ", str(value))
    return flattened.replace("|", "\\|")


def render_markdown(p: BayerSeedProduct) -> str:
    """Build the markdown body for the variety. The Phase 2 chunker will
    rewrite chunk_0 with a tighter preamble; this is the readable today
    copy that already covers everything searchable.

    Layout: a title and bullet header (vendor, brand, crop, maturity,
    traits, release year, source, rating scale), a ``---`` rule, then
    optional sections for positioning, strengths, one table per
    characteristics group, and de-duplicated regional listings.
    """
    title = p.product_name or p.source_key
    crop_label = p.crop.capitalize()

    # Only the maturity field that matches the crop is shown.
    maturity_lines: list[str] = []
    if p.relative_maturity is not None and p.crop == "corn":
        maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group is not None and p.crop == "soybeans":
        maturity_lines.append(f"- **Maturity group:** {p.maturity_group}")
    if p.relative_maturity is not None and p.crop == "wheat":
        maturity_lines.append(f"- **Maturity:** {p.relative_maturity}")
    if p.wheat_class:
        maturity_lines.append(f"- **Wheat class:** {p.wheat_class}")

    trait_line = ""
    if p.trait_codes:
        codes = ", ".join(p.trait_codes)
        if p.trait_descriptions:
            descs = "; ".join(p.trait_descriptions)
            trait_line = f"- **Traits:** {codes} ({descs})"
        else:
            trait_line = f"- **Traits:** {codes}"

    header_lines = [
        f"# {title}",
        "",
        "- **Vendor:** Bayer",
        f"- **Brand:** {p.brand.title() if p.brand else '(unknown)'}",
        f"- **Crop:** {crop_label}",
        *maturity_lines,
    ]
    if trait_line:
        header_lines.append(trait_line)
    if p.release_year:
        header_lines.append(f"- **Release year:** {p.release_year}")
    header_lines.append(f"- **Source:** {p.source_url}")
    header_lines.append(f"- **Rating scale (Bayer):** {RATING_SCALE_DIRECTION}")
    header_lines.append("")
    header_lines.append("---")
    header_lines.append("")

    sections: list[str] = []
    if p.positioning_statement:
        sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n")
    if p.strengths:
        bullets = "\n".join(f"- {s}" for s in p.strengths)
        sections.append("## Strengths & management\n\n" + bullets + "\n")

    # Render each characteristics group as its own table for readability.
    for g in p.characteristics_groups:
        label = g.get("label") or "Characteristics"
        items = g.get("items") or []
        if not items:
            continue
        # Cells are escaped so a stray "|" or newline in the source text
        # cannot break the table structure.
        table_rows = "\n".join(
            f"| {_md_table_cell(it['characteristic'])} | {_md_table_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label.title()}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{table_rows}\n"
        )

    if p.regional_recommendations:
        # One bullet per distinct (listing name, agronomist) pair;
        # unnamed listings are dropped.
        seen: set[str] = set()
        listing_lines: list[str] = []
        for rec in p.regional_recommendations:
            name = (rec.get("product_list_name") or "").strip()
            agronomist = (rec.get("agronomist") or "").strip()
            key = f"{name}||{agronomist}"
            if key in seen or not name:
                continue
            seen.add(key)
            listing_lines.append(f"- **{name}** — agronomist: {agronomist or '(unlisted)'}")
        if listing_lines:
            sections.append(
                "## Regional seed-guide listings\n\n" + "\n".join(listing_lines) + "\n"
            )

    return "\n".join(header_lines) + "\n".join(sections)
# --------------------------------------------------------------------- write
def _atomic_write_text(path: Path, text: str) -> None:
    """Write ``text`` (UTF-8) to ``path`` via a sibling temp file + rename,
    so ``path`` never holds a partially written file."""
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(text, encoding="utf-8")
    tmp_path.replace(path)


def write_product(prod: BayerSeedProduct, body_md: str) -> None:
    """Write the markdown body + sidecar JSON. Schema documented in
    seed-mcp/CLAUDE.md.

    The sidecar is serialized and written first and the markdown last.
    The markdown file is what marks a variety as already scraped, so it
    must only appear once the sidecar is safely on disk; otherwise an
    interrupted run would leave a variety without a sidecar that is
    skipped on every later run.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"
    sidecar = {
        "source": "bayer_seeds",
        "source_key": prod.source_key,
        "vendor": "Bayer",
        "brand": prod.brand,
        "product_name": prod.product_name,
        "product_id": prod.product_id,
        "hybrid_prefix": prod.hybrid_prefix,
        "hybrid_suffix": prod.hybrid_suffix,
        "crop": prod.crop,
        "release_year": prod.release_year,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": prod.wheat_class,
        "trait_stack": prod.trait_codes,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": prod.positioning_statement,
        "strengths": prod.strengths,
        # Raw grouped ratings preserved as published. Chunker re-buckets
        # into canonical disease/agronomic flats per CLAUDE.md schema.
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": prod.regional_recommendations,
        "image_url": prod.image_url,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": prod.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    # Serialize before touching the disk so an encoding error writes nothing.
    sidecar_text = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    _atomic_write_text(json_path, sidecar_text)
    _atomic_write_text(md_path, body_md)
# --------------------------------------------------------------------- pipeline
def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    brand: str,
    crop: str,
    lastmod: str,
    force: bool,
) -> tuple[str, BayerSeedProduct | None]:
    """Fetch, render and write a single variety.

    Returns ``(status, product)``. ``status`` is ``"skipped"`` when the
    markdown file already exists and ``force`` is false, ``"failed"``
    when the detail fetch raised (the error is logged), and
    ``"written"`` otherwise. ``product`` is None unless written.
    """
    key = source_key_from_url(url, brand)
    already_on_disk = (CORPUS_DIR / f"{key}.md").exists()
    if already_on_disk and not force:
        return "skipped", None

    try:
        product = fetch_product_detail(http, url, brand, crop, lastmod)
    except Exception as exc:  # noqa: BLE001
        # One bad page must not abort the whole crawl.
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None
    else:
        write_product(product, render_markdown(product))
        return "written", product
def run(
    *,
    limit: int | None,
    force: bool,
    only_brand: str | None,
    only_product: str | None,
) -> int:
    """Discover varieties and scrape each one into the corpus.

    Args:
        limit: Maximum number of candidates to visit; skipped and failed
            ones count toward it. None visits every candidate.
        force: Re-fetch varieties whose markdown file already exists.
        only_brand: Restrict discovery to this lowercase brand key.
        only_product: Restrict to the variety whose source_key or
            terminal URL slug equals this value (case-insensitive).

    Returns:
        0 when nothing failed, 1 when at least one variety failed, and
        2 when ``only_product`` matched no discovered variety.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    targets = discover_varieties(http, only_brand=only_brand)

    if only_product:
        # Source keys and slugs are compared in lowercase, so normalize
        # the requested value the same way; otherwise mixed-case input
        # could never match.
        wanted = only_product.strip().lower()
        targets = [
            (u, b, c, lm) for (u, b, c, lm) in targets
            if source_key_from_url(u, b) == wanted
            or u.rstrip("/").rsplit("/", 1)[-1].lower() == wanted
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "failed": 0}
    limit_label = str(limit) if limit else "all"
    processed = 0
    for url, brand, crop, lastmod in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(
            http, url=url, brand=brand, crop=crop, lastmod=lastmod, force=force,
        )
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s ratings_groups=%d",
                processed, limit_label,
                prod.source_key, status, prod.crop,
                prod.relative_maturity or prod.maturity_group or "-",
                ",".join(prod.trait_codes) or "-",
                len(prod.characteristics_groups),
            )
        else:
            log.info("[%d/%s] %s %s",
                     processed, limit_label,
                     source_key_from_url(url, brand), status)

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (out of %d candidates)",
        processed, counts["written"], counts["skipped"], counts["failed"], len(targets),
    )
    return 0 if counts["failed"] == 0 else 1
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Bayer seeds scraper.

    Options: ``--limit``, ``--force``, ``--brand``, ``--product`` and
    ``--log-level`` (which defaults to the ``LOG_LEVEL`` environment
    variable, or ``INFO``).
    """
    parser = argparse.ArgumentParser(
        description="Scrape Bayer DEKALB / Asgrow / WestBred seed varieties.",
        prog="scrape.sources.bayer_seeds",
    )
    parser.add_argument(
        "--limit",
        default=None,
        type=int,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--brand",
        choices=sorted(BRANDS),
        default=None,
        help="Limit to one Bayer seed brand.",
    )
    parser.add_argument(
        "--product",
        default=None,
        help="Process a single variety by source_key "
             "(e.g. 'dekalb-dkc62-08rib') or terminal URL slug.",
    )
    parser.add_argument(
        "--log-level",
        default=os.environ.get("LOG_LEVEL", "INFO"),
        help="Python logging level (default INFO).",
    )
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging on stderr, and
    run the scraper. Returns the exit code produced by ``run``."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=args.log_level.upper(),
    )
    exit_code = run(
        only_product=args.product,
        only_brand=args.brand,
        force=args.force,
        limit=args.limit,
    )
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())