# seed-mcp/scrape/sources/bayer_seeds.py

"""Bayer seeds scraper — DEKALB (corn) + Asgrow (soy) + WestBred (wheat).

Source: ``www.cropscience.bayer.us`` — the same Next.js + ``__NEXT_DATA__``
infrastructure that powers Bayer's crop-protection catalog (which
``crop-chem-docs`` already scrapes). robots.txt explicitly whitelists
*"artificial intelligence retrieval augmented generation"* use of the
content, which is what this corpus feeds.

Discovery: ``/sitemap-dynamic.xml`` enumerates every variety URL under
``/corn/dekalb/``, ``/soybeans/asgrow/``, ``/wheat/westbred/`` — counts
on 2026-05-25: 288 / 102 / 85 = 475 total, matching recon. The seed
catalog landing pages SSR only the first 12 of N products via React
Query state hydration; we sidestep that entirely by walking the
sitemap.

Per-variety detail comes from the product page itself. Each page
embeds a full ``__NEXT_DATA__`` JSON island whose
``props.pageProps.productDetails`` carries:

  - Identity:  ``brand``, ``crop``, ``productId``,
    ``hybridLabel``, ``hybridPrefix``, ``hybridSuffix``,
    ``releaseYear``
  - Maturity:  ``relativeMaturity`` (corn = RM days, soy = MG,
    wheat = qualitative early/medium/late)
  - Traits:    ``traits[]`` of ``{trait, traitFullName}``
  - Narrative: ``positioningStatement``, ``strengthsAndManagement[]``
  - Ratings:   ``characteristics[]`` of
    ``{label, type, items: [{characteristic, value}]}`` —
    groups vary by crop:
      corn: DISEASE RATINGS / GROWTH / MANAGEMENT / HARVEST /
            HERBICIDE / PLANT DESCRIPTION
      soy:  DISEASE RATINGS / SENSITIVITY / MANAGEMENT /
            PLANT DESCRIPTION / PRODUCTION
      wheat: KEY CHARACTERISTICS / MANAGEMENT / PRODUCTION /
             QUALITY / PEST AND DISEASE RESISTANCE
  - Regional:  ``localProfiles[]`` of regional seed-guide listings
    incl. agronomist name + email

Bayer ratings are on the canonical **1-9 (9 = best)** scale already,
so no flip is needed (unlike Golden Harvest, which is documented in
CLAUDE.md). Non-numeric ratings (S/R for soy disease resistance,
gene names like Rps1c, sensitivity letters A/B/C) are preserved
verbatim — the chunker (Phase 2) handles surfacing.

Output:
  corpus/bayer_seeds/<source_key>.md     LLM-visible body
  corpus/bayer_seeds/<source_key>.json   sidecar metadata

source_key convention: ``<brand>-<sku>`` lowercased — derived from the
URL terminal slug minus the trailing crop suffix
(``-corn``/``-soybeans``/``-wheat``). E.g.
``dekalb-dkc075-70rib`` or ``asgrow-ag005xf3``.

CLI:
  python -m scrape.sources.bayer_seeds --limit 5
  python -m scrape.sources.bayer_seeds --brand dekalb --limit 20
  python -m scrape.sources.bayer_seeds --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests

# Version stamp recorded in every sidecar as ``scraper_version``.
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://www.cropscience.bayer.us"
SITEMAP_URL = f"{BASE}/sitemap-dynamic.xml"

# Brand key → (URL path segment, crop label). The keys are offered (sorted)
# as the `--brand` CLI choices. During discovery each sitemap URL is tested
# against these path segments in dict order and the first match wins; the
# overall walk order is the sitemap's own order, not this dict's.
BRANDS: dict[str, tuple[str, str]] = {
    "dekalb": ("/corn/dekalb/", "corn"),
    "asgrow": ("/soybeans/asgrow/", "soybeans"),
    "westbred": ("/wheat/westbred/", "wheat"),
}

# Per-brand crop-suffix to strip off the URL's terminal slug when
# computing source_key (so ``dekalb-dkc075-70rib-corn`` → ``dekalb-dkc075-70rib``).
CROP_SUFFIX = {
    "dekalb": "-corn",
    "asgrow": "-soybeans",
    "westbred": "-wheat",
}

# Catalog/landing pages that live under the brand path but are NOT
# individual varieties. Skip these during discovery.
NON_VARIETY_PATH_TAILS = {
    "seed-catalog",
    "product-compare",
    "find-a-dealer",
    "find-a-rep",
    "saved-products",
}

# Bayer publishes seed ratings on the canonical 1-9 scale (9 = best),
# unlike Golden Harvest. This goes into the sidecar (``_scale_direction``)
# and the markdown header so the chunker knows not to flip.
RATING_SCALE_DIRECTION = "1-9 (9 = best)"

# Repo root: this file lives at <repo>/scrape/sources/bayer_seeds.py, so
# parents[2] of the resolved path is <repo>.
REPO_ROOT = Path(__file__).resolve().parents[2]
# A non-empty CORPUS_ROOT environment variable overrides the default
# <repo>/corpus location.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "bayer_seeds"

# Default minimum spacing, in seconds, between consecutive HTTP requests
# issued through RateLimitedSession.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.bayer_seeds")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper with sleep-based rate limiting and
    polite retries on 429/5xx. Lifted from crop-chem-docs' Bayer scraper
    — same host, same politeness story.

    Every attempt (including retries) first waits until at least
    ``interval`` seconds have passed since the previous attempt started.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # time.monotonic() stamp of the previous attempt

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between attempts."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff with jitter for 0-based ``attempt``, capped at 30s."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Issue one HTTP request, retrying on network errors, 429 and 5xx.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as a single attempt.
            timeout: Per-attempt timeout in seconds.
            **kw: Extra keyword arguments forwarded to ``requests``.

        Returns:
            The first non-retryable response, or — when every attempt was
            answered with 429/5xx — the response from the final attempt, so
            the caller can inspect it or call ``raise_for_status()``.

        Raises:
            requests.RequestException: if the final attempt fails at the
                network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final:
                    # Report the outcome of the *last* attempt; no point
                    # sleeping when there is nothing left to retry.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if final:
                log.warning("HTTP %d on %s %s — giving up after %d attempts",
                            resp.status_code, method, url, attempts)
                return resp

            # Honour a delta-seconds Retry-After; anything else (absent or
            # an HTTP-date) falls back to exponential backoff.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)

        # The final loop iteration always returns or raises.
        raise AssertionError("unreachable: retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class BayerSeedProduct:
    """One Bayer seed variety as parsed from its product page.

    Only the four identity fields (``source_key``, ``source_url``,
    ``brand``, ``crop``) are required; everything else defaults to
    empty/None and is filled in by ``fetch_product_detail``.
    """

    # Identity
    source_key: str                                  # e.g. "dekalb-dkc075-70rib"
    source_url: str                                  # full product page URL
    brand: str                                       # upper-cased: "DEKALB" | "ASGROW" | "WESTBRED"
    crop: str                                        # lower-cased: "corn" | "soybeans" | "wheat"
    product_name: str = ""                           # hybridLabel, e.g. "DKC075-70RIB BRAND BLEND"
    product_id: str | None = None                    # full Bayer productId
    hybrid_prefix: str | None = None                 # e.g. "DKC075-70RIB"
    hybrid_suffix: str | None = None                 # e.g. "BRAND BLEND"
    release_year: int | None = None

    # Maturity — semantics vary by crop, value preserved as-published.
    # Exactly one of relative_maturity / maturity_group is populated,
    # depending on the crop.
    relative_maturity: str | None = None             # corn: RM days as string; wheat: qualitative
    maturity_group: str | None = None                # soy MG as string
    wheat_class: str | None = None                   # not exposed in productDetails — left null

    # Traits — codes and full names are collected independently, so the
    # two lists are not guaranteed to line up index-for-index.
    trait_codes: list[str] = field(default_factory=list)          # ["VT2PRIB"]
    trait_descriptions: list[str] = field(default_factory=list)   # full names

    # Narrative
    positioning_statement: str | None = None
    strengths: list[str] = field(default_factory=list)

    # Ratings — preserved as the source's grouped form. The chunker
    # re-buckets into the canonical disease/agronomic flats from
    # seed-mcp/CLAUDE.md. Each group is
    # {"label": str, "type": str, "items": [{"characteristic", "value"}]}.
    characteristics_groups: list[dict] = field(default_factory=list)

    # Regional recommendations (Bayer's "local profiles"). Each entry has
    # product_list_name / agronomist / agronomist_email / variant_id.
    regional_recommendations: list[dict] = field(default_factory=list)

    # Media
    image_url: str | None = None

    # Discovery — <lastmod> from the sitemap entry, None when absent.
    sitemap_last_modified: str | None = None


# --------------------------------------------------------------------- helpers


# Captures the body of the Next.js hydration <script> tag; DOTALL because
# the JSON payload may span several lines.
_NEXT_DATA_RE = re.compile(
    r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)


def parse_next_data(html: str) -> dict[str, Any]:
    """Extract and decode the ``__NEXT_DATA__`` JSON island of a Next.js page.

    Raises ``RuntimeError`` when the page has no such script tag; malformed
    JSON surfaces as whatever ``json.loads`` raises.
    """
    found = _NEXT_DATA_RE.search(html)
    if found is None:
        raise RuntimeError("no __NEXT_DATA__ script tag found")
    payload = found.group(1)
    return json.loads(payload)


def source_key_from_url(url: str, brand: str) -> str:
    """Build the ``<brand>-<sku>`` source key from a product page URL.

    The key is the lower-cased last path segment of ``url`` with the
    brand's trailing ``-<crop>`` marker removed, e.g.
    ``dekalb-dkc075-70rib-corn`` becomes ``dekalb-dkc075-70rib``. Brands
    without a registered suffix, or slugs that do not end in it, are
    returned unchanged.
    """
    slug = url.rstrip("/").rpartition("/")[2].lower()
    crop_suffix = CROP_SUFFIX.get(brand, "")
    if not crop_suffix or not slug.endswith(crop_suffix):
        return slug
    return slug[: len(slug) - len(crop_suffix)]


def looks_like_variety_url(url: str, brand_path: str) -> bool:
    """Tell whether ``url`` points at a single variety under ``brand_path``.

    The part of the URL after ``brand_path`` must be exactly one non-empty
    path segment (so neither the brand index nor a nested tool page) and
    must not be one of the known catalog/landing slugs.
    """
    remainder = url.split(brand_path, 1)[-1].strip("/")
    is_single_segment = bool(remainder) and "/" not in remainder
    return is_single_segment and remainder not in NON_VARIETY_PATH_TAILS


# --------------------------------------------------------------------- discovery


def discover_varieties(
    http: RateLimitedSession,
    *,
    only_brand: str | None = None,
) -> list[tuple[str, str, str, str]]:
    """Return ``[(url, brand, crop, lastmod), ...]`` for every Bayer
    seed variety found in the dynamic sitemap.

    ``brand`` is the lowercase brand key (matches ``BRANDS``).
    ``lastmod`` is the timestamp text from the sitemap entry, or ``""``
    when the entry has none. Results keep sitemap order; a URL listed
    more than once is returned only once. When ``only_brand`` is given,
    only that brand's URLs are returned.

    Raises whatever ``raise_for_status()`` raises if the sitemap fetch
    does not succeed.
    """
    # Local import keeps this block self-contained; sitemap <loc> values
    # are XML text, so entities such as ``&amp;`` must be decoded before
    # the value is used as a URL.
    from html import unescape

    log.info("fetching sitemap %s", SITEMAP_URL)
    r = http.get(SITEMAP_URL)
    r.raise_for_status()
    xml = r.text

    # Tiny regex parse — sitemap is flat and well-formed; no need for
    # the lxml dependency on a single 600KB file.
    entries = re.findall(
        r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?",
        xml,
    )
    log.info("sitemap parsed: %d total URLs", len(entries))

    # Resolve the brand filter once instead of per (URL, brand) pair.
    wanted = {
        brand: spec for brand, spec in BRANDS.items()
        if not only_brand or brand == only_brand
    }

    out: list[tuple[str, str, str, str]] = []
    seen: set[str] = set()
    for raw_url, raw_lastmod in entries:
        # Pretty-printed sitemaps may pad <loc> with whitespace.
        url = unescape(raw_url.strip())
        if url in seen:
            continue
        for brand, (brand_path, crop) in wanted.items():
            if brand_path in url and looks_like_variety_url(url, brand_path):
                seen.add(url)
                # An absent <lastmod> comes back from findall as "".
                out.append((url, brand, crop, raw_lastmod.strip()))
                break

    by_brand: dict[str, int] = {}
    for _, b, _, _ in out:
        by_brand[b] = by_brand.get(b, 0) + 1
    log.info("variety URLs found: %s (total=%d)",
             ", ".join(f"{k}={v}" for k, v in sorted(by_brand.items())),
             len(out))
    return out


# --------------------------------------------------------------------- detail


def fetch_product_detail(
    http: RateLimitedSession, url: str, brand: str, crop: str, lastmod: str
) -> BayerSeedProduct:
    """Fetch + parse one product page into a ``BayerSeedProduct``.

    Args:
        http: Session used for the page request.
        url: Product page URL.
        brand: Lowercase brand key; fallback when the page has no ``brand``.
        crop: Crop label; fallback when the page has no ``crop``.
        lastmod: Sitemap ``<lastmod>`` text, ``""`` when unknown.

    Raises:
        RuntimeError: if the page has no ``__NEXT_DATA__`` island or the
            island carries no ``productDetails`` object. Failing here keeps
            an empty placeholder record from being written to the corpus
            (where it would then be skipped on every later run).
        Exception: anything ``raise_for_status()`` or JSON decoding raises.
    """
    r = http.get(url)
    r.raise_for_status()
    data = parse_next_data(r.text)
    pp = (data.get("props") or {}).get("pageProps") or {}
    pd = pp.get("productDetails")
    if not isinstance(pd, dict) or not pd:
        raise RuntimeError(f"no productDetails in __NEXT_DATA__ for {url}")

    prod = BayerSeedProduct(
        source_key=source_key_from_url(url, brand),
        source_url=url,
        brand=(pd.get("brand") or brand).upper(),
        crop=(pd.get("crop") or crop).lower(),
        sitemap_last_modified=lastmod or None,
    )

    prod.product_name = pd.get("hybridLabel") or pd.get("productName") or prod.source_key
    prod.product_id = pd.get("productId")
    prod.hybrid_prefix = pd.get("hybridPrefix")
    prod.hybrid_suffix = pd.get("hybridSuffix")

    # Release year may arrive as an int or a digit string; anything else
    # is left as None.
    ry = pd.get("releaseYear")
    if isinstance(ry, int):
        prod.release_year = ry
    elif isinstance(ry, str) and ry.isdigit():
        prod.release_year = int(ry)

    # Maturity routing per crop. Source stores all three in
    # `relativeMaturity`; we split by crop semantics.
    rm = pd.get("relativeMaturity")
    if rm is not None:
        rm_str = str(rm)
        if prod.crop == "soybeans":
            prod.maturity_group = rm_str
        elif prod.crop in ("corn", "wheat"):
            # Corn: RM days. Wheat: WestBred encodes Early/Medium/Late as
            # the qualitative maturity. The wheat class (HRW/HRS/SWW/...)
            # is not in productDetails — it's only in the marketing
            # narrative — so wheat_class stays None; a future enrichment
            # step can parse the narrative if needed.
            prod.relative_maturity = rm_str

    # Traits — skip anything that is not a {trait, traitFullName} object.
    for t in pd.get("traits") or []:
        if not isinstance(t, dict):
            continue
        code = t.get("trait")
        full = t.get("traitFullName")
        if code:
            prod.trait_codes.append(code)
        if full:
            prod.trait_descriptions.append(full)

    # Narrative
    prod.positioning_statement = pd.get("positioningStatement")
    sm = pd.get("strengthsAndManagement") or pd.get("strengths") or []
    if isinstance(sm, list):
        prod.strengths = [str(s).strip() for s in sm if s]

    # Ratings groups — preserved as published (label / type / items),
    # with values stringified and whitespace trimmed. Items without a
    # characteristic name and groups left with no items are dropped.
    cleaned_groups: list[dict] = []
    for g in pd.get("characteristics") or []:
        if not isinstance(g, dict):
            continue
        items = [
            {"characteristic": str(it.get("characteristic")).strip(),
             "value": ("" if it.get("value") is None else str(it.get("value"))).strip()}
            for it in (g.get("items") or [])
            if isinstance(it, dict) and it.get("characteristic")
        ]
        if not items:
            continue
        cleaned_groups.append({
            "label": (g.get("label") or "").strip(),
            "type": (g.get("type") or "").strip(),
            "items": items,
        })
    prod.characteristics_groups = cleaned_groups

    # Regional recommendations.
    lp = pd.get("localProfiles") or []
    if isinstance(lp, list):
        prod.regional_recommendations = [
            {
                "product_list_name": p.get("productListName"),
                "agronomist": p.get("agronomist"),
                "agronomist_email": p.get("agronomistEmailAddress"),
                "variant_id": p.get("variantId"),
            }
            for p in lp
            if isinstance(p, dict)
        ]

    # Image (just the first one) — read from pageProps, not productDetails.
    imgs = pp.get("images") or []
    if isinstance(imgs, list) and imgs and isinstance(imgs[0], dict):
        prod.image_url = imgs[0].get("url")

    return prod


# --------------------------------------------------------------------- render


def _md_table_cell(text: str) -> str:
    """Make ``text`` safe inside one Markdown table cell: line breaks
    become spaces and ``|`` is backslash-escaped so it cannot split the row."""
    return " ".join(str(text).splitlines()).replace("|", "\\|")


def render_markdown(p: BayerSeedProduct) -> str:
    """Build the markdown body for the variety.

    Layout: an H1 title, a bullet list of identity facts (vendor, brand,
    crop, maturity, traits, release year, source URL, rating scale), a
    horizontal rule, then one section each for positioning, strengths,
    every characteristics group (as a two-column table) and the
    de-duplicated regional seed-guide listings. Empty sections are omitted.

    The Phase 2 chunker will rewrite chunk_0 with a tighter preamble; this
    is the human-readable copy we have today, and it already covers
    everything searchable.
    """
    title = p.product_name or p.source_key
    crop_label = p.crop.capitalize()

    # Which maturity field is shown depends on the crop.
    maturity_lines: list[str] = []
    if p.relative_maturity is not None and p.crop == "corn":
        maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group is not None and p.crop == "soybeans":
        maturity_lines.append(f"- **Maturity group:** {p.maturity_group}")
    if p.relative_maturity is not None and p.crop == "wheat":
        maturity_lines.append(f"- **Maturity:** {p.relative_maturity}")
    if p.wheat_class:
        maturity_lines.append(f"- **Wheat class:** {p.wheat_class}")

    trait_line = ""
    if p.trait_codes:
        codes = ", ".join(p.trait_codes)
        if p.trait_descriptions:
            descs = "; ".join(p.trait_descriptions)
            trait_line = f"- **Traits:** {codes} ({descs})"
        else:
            trait_line = f"- **Traits:** {codes}"

    header_lines = [
        f"# {title}",
        "",
        "- **Vendor:** Bayer",
        f"- **Brand:** {p.brand.title() if p.brand else '(unknown)'}",
        f"- **Crop:** {crop_label}",
        *maturity_lines,
    ]
    if trait_line:
        header_lines.append(trait_line)
    if p.release_year:
        header_lines.append(f"- **Release year:** {p.release_year}")
    header_lines.append(f"- **Source:** {p.source_url}")
    header_lines.append(f"- **Rating scale (Bayer):** {RATING_SCALE_DIRECTION}")
    # The trailing "" is what terminates the "---" line once joined.
    header_lines.extend(["", "---", ""])

    sections: list[str] = []

    if p.positioning_statement:
        sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n")

    if p.strengths:
        bullets = "\n".join(f"- {s}" for s in p.strengths)
        sections.append("## Strengths & management\n\n" + bullets + "\n")

    # Render each characteristics group as its own table for readability.
    for g in p.characteristics_groups:
        label = g.get("label") or "Characteristics"
        items = g.get("items") or []
        if not items:
            continue
        table_rows = "\n".join(
            f"| {_md_table_cell(it['characteristic'])} | {_md_table_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label.title()}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{table_rows}\n"
        )

    if p.regional_recommendations:
        # One bullet per distinct (listing name, agronomist) pair; entries
        # without a listing name are skipped.
        seen: set[str] = set()
        listing_lines: list[str] = []
        for r in p.regional_recommendations:
            name = (r.get("product_list_name") or "").strip()
            agronomist = (r.get("agronomist") or "").strip()
            key = f"{name}||{agronomist}"
            if key in seen or not name:
                continue
            seen.add(key)
            listing_lines.append(f"- **{name}** — agronomist: {agronomist or '(unlisted)'}")
        if listing_lines:
            sections.append(
                "## Regional seed-guide listings\n\n" + "\n".join(listing_lines) + "\n"
            )

    return "\n".join(header_lines) + "\n".join(sections)


# --------------------------------------------------------------------- write


def _write_text_atomic(path: Path, text: str) -> None:
    """Write ``text`` (UTF-8) to a sibling ``.tmp`` file, then rename it
    over ``path`` so readers never observe a half-written file."""
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(text, encoding="utf-8")
    tmp_path.replace(path)


def write_product(prod: BayerSeedProduct, body_md: str) -> None:
    """Write the markdown body + sidecar JSON. Schema documented in
    seed-mcp/CLAUDE.md.

    Outputs are ``<CORPUS_DIR>/<source_key>.json`` and
    ``<CORPUS_DIR>/<source_key>.md``. The sidecar is written first and the
    markdown last: the pipeline treats an existing ``.md`` as "already
    scraped", so the ``.md`` must only appear once its sidecar is complete.
    Each file is written via temp-file + rename so an interrupted run does
    not leave a truncated file behind.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "bayer_seeds",
        "source_key": prod.source_key,
        "vendor": "Bayer",
        "brand": prod.brand,
        "product_name": prod.product_name,
        "product_id": prod.product_id,
        "hybrid_prefix": prod.hybrid_prefix,
        "hybrid_suffix": prod.hybrid_suffix,
        "crop": prod.crop,
        "release_year": prod.release_year,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": prod.wheat_class,
        "trait_stack": prod.trait_codes,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": prod.positioning_statement,
        "strengths": prod.strengths,
        # Raw grouped ratings preserved as published. Chunker re-buckets
        # into canonical disease/agronomic flats per CLAUDE.md schema.
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": prod.regional_recommendations,
        "image_url": prod.image_url,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": prod.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    _write_text_atomic(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    brand: str,
    crop: str,
    lastmod: str,
    force: bool,
) -> tuple[str, BayerSeedProduct | None]:
    """Scrape one variety unless its outputs are already on disk.

    Returns ``(status, prod or None)`` where status is one of
    ``written`` / ``skipped`` / ``failed``. ``prod`` is only set for
    ``written``.

    A variety counts as already scraped only when BOTH the markdown body
    and the JSON sidecar exist; a lone ``.md`` (e.g. left by an interrupted
    run) is re-fetched so the pair is made whole again. ``force`` bypasses
    the check entirely. Fetch/parse errors are logged and reported as
    ``failed``; errors while rendering or writing propagate to the caller.
    """
    source_key = source_key_from_url(url, brand)
    md_path = CORPUS_DIR / f"{source_key}.md"
    json_path = CORPUS_DIR / f"{source_key}.json"
    if not force and md_path.exists() and json_path.exists():
        return "skipped", None

    try:
        prod = fetch_product_detail(http, url, brand, crop, lastmod)
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: one bad page must not abort the whole crawl.
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None

    body = render_markdown(prod)
    write_product(prod, body)
    return "written", prod


def run(
    *,
    limit: int | None,
    force: bool,
    only_brand: str | None,
    only_product: str | None,
) -> int:
    """Discover varieties from the sitemap and scrape them one by one.

    ``only_brand`` narrows discovery to one brand; ``only_product`` keeps
    just the variety whose source key or raw URL slug equals it.
    ``limit`` caps how many candidates are visited — skipped and failed
    ones count towards it. ``force`` re-fetches varieties already on disk.

    Returns a process exit code: 0 when nothing failed, 1 when at least
    one variety failed, 2 when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_varieties(http, only_brand=only_brand)
    if only_product:

        def _is_requested(candidate_url: str, candidate_brand: str) -> bool:
            if source_key_from_url(candidate_url, candidate_brand) == only_product:
                return True
            raw_slug = candidate_url.rstrip("/").rsplit("/", 1)[-1].lower()
            return raw_slug == only_product

        targets = [t for t in targets if _is_requested(t[0], t[1])]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    # Denominator shown in the progress prefix of each log line.
    total_label = str(limit) if limit else "all"
    counts = {"written": 0, "skipped": 0, "failed": 0}
    processed = 0
    for url, brand, crop, lastmod in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(
            http, url=url, brand=brand, crop=crop, lastmod=lastmod, force=force,
        )
        counts[status] = counts.get(status, 0) + 1

        if prod is None:
            # Skipped or failed: no parsed product to summarise.
            log.info("[%d/%s] %s %s",
                     processed, total_label,
                     source_key_from_url(url, brand), status)
            continue

        log.info(
            "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s ratings_groups=%d",
            processed, total_label,
            prod.source_key, status, prod.crop,
            prod.relative_maturity or prod.maturity_group or "-",
            ",".join(prod.trait_codes) or "-",
            len(prod.characteristics_groups),
        )

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (out of %d candidates)",
        processed, counts["written"], counts["skipped"], counts["failed"], len(targets),
    )
    return 1 if counts["failed"] else 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--brand`` (one of the ``BRANDS``
    keys), ``--product`` and ``--log-level`` (defaults to the ``LOG_LEVEL``
    environment variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        description="Scrape Bayer DEKALB / Asgrow / WestBred seed varieties.",
        prog="scrape.sources.bayer_seeds",
    )

    parser.add_argument(
        "--limit",
        default=None,
        type=int,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--brand",
        choices=sorted(BRANDS),
        default=None,
        help="Limit to one Bayer seed brand.",
    )

    product_help = (
        "Process a single variety by source_key "
        "(e.g. 'dekalb-dkc62-08rib') or terminal URL slug."
    )
    parser.add_argument("--product", default=None, help=product_help)

    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument(
        "--log-level",
        default=default_level,
        help="Python logging level (default INFO).",
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` (``sys.argv[1:]`` when None), set up
    logging to stderr at the requested level, run the scraper and return
    its exit code."""
    options = _build_argparser().parse_args(argv)

    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=options.log_level.upper(),
    )

    exit_code = run(
        limit=options.limit,
        force=options.force,
        only_brand=options.brand,
        only_product=options.product,
    )
    return exit_code


if __name__ == "__main__":
    sys.exit(main())