From 2a4c0d4abaf0fbc0f5df843ead27bc9c1ab6ce88 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Mon, 25 May 2026 12:53:46 -0400 Subject: [PATCH] bayer_seeds: implement Phase 1 scraper for DEKALB + Asgrow + WestBred MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace stub with working scraper for all three Bayer seed brands. Discovery uses the public sitemap-dynamic.xml (475 varieties: 288 DEKALB corn + 102 Asgrow soy + 85 WestBred wheat — matches recon). Per-variety detail comes from the page's __NEXT_DATA__ JSON island. Each variety writes corpus/bayer_seeds/<source_key>.{md,json} with: - Identity (brand, crop, hybridLabel, productId, releaseYear) - Maturity routed per crop (RM for corn, MG for soy, qualitative for wheat) - Trait stack (code + full name) - Positioning + strengths narrative - Characteristics groups (DISEASE RATINGS, GROWTH, MANAGEMENT, HARVEST, etc.) preserved verbatim from source so the chunker can re-bucket into canonical disease/agronomic flats per CLAUDE.md schema - Regional seed-guide listings with agronomist contacts - _scale_direction tag (Bayer = "1-9 (9 = best)") for chunker Smoke-tested all three brands (--limit 2 each, plus --product, --force, and scrape.runner dispatch). Politeness: 1 req/sec, retries on 429/5xx with Retry-After honored. Co-Authored-By: Claude Opus 4.7 (1M context) --- scrape/sources/bayer_seeds.py | 706 ++++++++++++++++++++++++++++++++-- 1 file changed, 668 insertions(+), 38 deletions(-) diff --git a/scrape/sources/bayer_seeds.py b/scrape/sources/bayer_seeds.py index 7d972fe0..ed332dc1 100644 --- a/scrape/sources/bayer_seeds.py +++ b/scrape/sources/bayer_seeds.py @@ -1,56 +1,686 @@ """Bayer seeds scraper — DEKALB (corn) + Asgrow (soy) + WestBred (wheat). -Source: ``cropscience.bayer.us`` — same Next.js + ``__NEXT_DATA__`` -infrastructure used by crop-chem-docs' Bayer crop-protection scraper. 
-That scraper is the reference; this one lifts ~80% of its plumbing -and adapts the per-product field mapping for seed schema. +Source: ``www.cropscience.bayer.us`` — the same Next.js + ``__NEXT_DATA__`` +infrastructure that powers Bayer's crop-protection catalog (which +``crop-chem-docs`` already scrapes). robots.txt explicitly whitelists +*"artificial intelligence retrieval augmented generation"* use of the +content, which is what this corpus feeds. -Catalog index pages: - /corn/dekalb/seed-catalog - /soybeans/asgrow/seed-catalog - /wheat/westbred/seed-catalog +Discovery: ``/sitemap-dynamic.xml`` enumerates every variety URL under +``/corn/dekalb/``, ``/soybeans/asgrow/``, ``/wheat/westbred/`` — counts +on 2026-05-25: 288 / 102 / 85 = 475 total, matching recon. The seed +catalog landing pages SSR only the first 12 of N products via React +Query state hydration; we sidestep that entirely by walking the +sitemap. -Each catalog page is a Next.js route; the per-variety data lives in -``__NEXT_DATA__.props.pageProps.{whatever}``. The buildId in the -script tag rotates — fetch the index page first, extract the -buildId, then fetch the per-variety JSON. +Per-variety detail comes from the product page itself. 
Each page +embeds a full ``__NEXT_DATA__`` JSON island whose +``props.pageProps.productDetails`` carries: -Output layout: - corpus/bayer_seeds/.md LLM-visible body - corpus/bayer_seeds/.json Sidecar metadata + - Identity: ``brand``, ``crop``, ``productId``, + ``hybridLabel``, ``hybridPrefix``, ``hybridSuffix``, + ``releaseYear`` + - Maturity: ``relativeMaturity`` (corn = RM days, soy = MG, + wheat = qualitative early/medium/late) + - Traits: ``traits[]`` of ``{trait, traitFullName}`` + - Narrative: ``positioningStatement``, ``strengthsAndManagement[]`` + - Ratings: ``characteristics[]`` of + ``{label, type, items: [{characteristic, value}]}`` — + groups vary by crop: + corn: DISEASE RATINGS / GROWTH / MANAGEMENT / HARVEST / + HERBICIDE / PLANT DESCRIPTION + soy: DISEASE RATINGS / SENSITIVITY / MANAGEMENT / + PLANT DESCRIPTION / PRODUCTION + wheat: KEY CHARACTERISTICS / MANAGEMENT / PRODUCTION / + QUALITY / PEST AND DISEASE RESISTANCE + - Regional: ``localProfiles[]`` of regional seed-guide listings + incl. agronomist name + email -source_key convention: ``-`` lowercased, e.g. -``dekalb-dkc62-08rib`` or ``asgrow-ag34xf2``. +Bayer ratings are on the canonical **1-9 (9 = best)** scale already, +so no flip is needed (unlike Golden Harvest, which is documented in +CLAUDE.md). Non-numeric ratings (S/R for soy disease resistance, +gene names like Rps1c, sensitivity letters A/B/C) are preserved +verbatim — the chunker (Phase 2) handles surfacing. 
-Sidecar schema (per CLAUDE.md): - source: "bayer_seeds" - source_key: str - vendor: "Bayer" - brand: "DEKALB" | "Asgrow" | "WestBred" - product_name: str - crop: "corn" | "soybeans" | "wheat" - relative_maturity: int | null # corn only - maturity_group: float | null # soy only - wheat_class: str | null # wheat only - trait_stack: list[str] - agronomic_ratings: dict[str, int] # normalized 1-9 (9 = best) - disease_ratings: dict[str, int] # normalized 1-9 (9 = best) - regional_recommendation: list[str] - source_urls: list[str] - fetched_at: str (ISO 8601 UTC) +Output: + corpus/bayer_seeds/<source_key>.md LLM-visible body + corpus/bayer_seeds/<source_key>.json sidecar metadata -TODO: implement. Reference: ~/github/crop-chem-docs/scrape/sources/bayer.py +source_key convention: ``<brand>-<product_name>`` lowercased — derived from the +URL terminal slug minus the trailing crop suffix +(``-corn``/``-soybeans``/``-wheat``). E.g. +``dekalb-dkc075-70rib`` or ``asgrow-ag005xf3``. + +CLI: + python -m scrape.sources.bayer_seeds --limit 5 + python -m scrape.sources.bayer_seeds --brand dekalb --limit 20 + python -m scrape.sources.bayer_seeds --force """ + from __future__ import annotations +import argparse +import json +import logging +import os +import random +import re import sys +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import requests + +SCRAPER_VERSION = "0.1.0" +USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" +BASE = "https://www.cropscience.bayer.us" +SITEMAP_URL = f"{BASE}/sitemap-dynamic.xml" + +# Brand → (URL path segment, crop label). Keys are the `--brand` choices; +# ordering only sets match precedence (walk order follows the sitemap). 
+BRANDS: dict[str, tuple[str, str]] = { + "dekalb": ("/corn/dekalb/", "corn"), + "asgrow": ("/soybeans/asgrow/", "soybeans"), + "westbred": ("/wheat/westbred/", "wheat"), +} + +# Per-brand crop-suffix to strip off the URL's terminal slug when +# computing source_key (so ``dekalb-dkc075-70rib-corn`` → ``dekalb-dkc075-70rib``). +CROP_SUFFIX = { + "dekalb": "-corn", + "asgrow": "-soybeans", + "westbred": "-wheat", +} + +# Catalog/landing pages that live under the brand path but are NOT +# individual varieties. Skip these during discovery. +NON_VARIETY_PATH_TAILS = { + "seed-catalog", + "product-compare", + "find-a-dealer", + "find-a-rep", + "saved-products", +} + +# Bayer publishes seed ratings on the canonical 1-9 scale (9 = best), +# unlike Golden Harvest. This goes into the sidecar so the chunker +# knows not to flip. +RATING_SCALE_DIRECTION = "1-9 (9 = best)" + +# Repo root: scrape/sources/bayer_seeds.py -> 3 parents up. +REPO_ROOT = Path(__file__).resolve().parents[2] +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "bayer_seeds" + +REQ_INTERVAL_SEC = 1.0 + +log = logging.getLogger("scrape.bayer_seeds") + + +# --------------------------------------------------------------------- HTTP + + +class RateLimitedSession: + """``requests.Session`` wrapper with sleep-based rate limiting and + polite retries on 429/5xx. 
Lifted from crop-chem-docs' Bayer scraper + — same host, same politeness story.""" + + def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: + self.s = requests.Session() + self.s.headers["User-Agent"] = USER_AGENT + self.interval = interval + self._last = 0.0 + + def _wait(self) -> None: + delta = time.monotonic() - self._last + if delta < self.interval: + time.sleep(self.interval - delta) + self._last = time.monotonic() + + def request( + self, + method: str, + url: str, + *, + max_retries: int = 4, + timeout: float = 30.0, + **kw: Any, + ) -> requests.Response: + last_exc: Exception | None = None + for attempt in range(max_retries): + self._wait() + try: + resp = self.s.request(method, url, timeout=timeout, **kw) + except requests.RequestException as exc: + last_exc = exc + backoff = min(30.0, (2 ** attempt) + random.random()) + log.warning("network error on %s %s: %s — retry in %.1fs", + method, url, exc, backoff) + time.sleep(backoff) + continue + if resp.status_code == 429 or 500 <= resp.status_code < 600: + ra = resp.headers.get("Retry-After") + backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) + log.warning("HTTP %d on %s %s — retry in %.1fs", + resp.status_code, method, url, backoff) + time.sleep(backoff) + continue + return resp + if last_exc: + raise last_exc + return resp # type: ignore[return-value] + + def get(self, url: str, **kw: Any) -> requests.Response: + return self.request("GET", url, **kw) + + +# --------------------------------------------------------------------- model + + +@dataclass +class BayerSeedProduct: + # Identity + source_key: str # e.g. "dekalb-dkc075-70rib" + source_url: str # full product page URL + brand: str # "DEKALB" | "ASGROW" | "WESTBRED" + crop: str # "corn" | "soybeans" | "wheat" + product_name: str = "" # hybridLabel, e.g. "DKC075-70RIB BRAND BLEND" + product_id: str | None = None # full Bayer productId + hybrid_prefix: str | None = None # e.g. 
"DKC075-70RIB" + hybrid_suffix: str | None = None # e.g. "BRAND BLEND" + release_year: int | None = None + + # Maturity — semantics vary by crop, value preserved as-published. + relative_maturity: str | None = None # corn: RM days as string; wheat: qualitative + maturity_group: str | None = None # soy MG as string + wheat_class: str | None = None # not exposed in productDetails — left null + + # Traits + trait_codes: list[str] = field(default_factory=list) # ["VT2PRIB"] + trait_descriptions: list[str] = field(default_factory=list) # full names + + # Narrative + positioning_statement: str | None = None + strengths: list[str] = field(default_factory=list) + + # Ratings — preserved as the source's grouped form. The chunker + # re-buckets into the canonical disease/agronomic flats from + # seed-mcp/CLAUDE.md. + characteristics_groups: list[dict] = field(default_factory=list) + + # Regional recommendations (Bayer's "local profiles"). + regional_recommendations: list[dict] = field(default_factory=list) + + # Media + image_url: str | None = None + + # Discovery + sitemap_last_modified: str | None = None + + +# --------------------------------------------------------------------- helpers + + +_NEXT_DATA_RE = re.compile( + r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', re.S +) + + +def parse_next_data(html: str) -> dict[str, Any]: + """Pull the ``__NEXT_DATA__`` JSON blob out of a Next.js page.""" + m = _NEXT_DATA_RE.search(html) + if not m: + raise RuntimeError("no __NEXT_DATA__ script tag found") + return json.loads(m.group(1)) + + +def source_key_from_url(url: str, brand: str) -> str: + """Derive ``<brand>-<product_name>`` slug from the product URL. + + Drops the trailing ``-<crop>`` suffix Bayer puts on every product + URL terminal segment (``dekalb-dkc075-70rib-corn`` → + ``dekalb-dkc075-70rib``). 
+ """ + tail = url.rstrip("/").rsplit("/", 1)[-1].lower() + suffix = CROP_SUFFIX.get(brand, "") + if suffix and tail.endswith(suffix): + tail = tail[: -len(suffix)] + return tail + + +def looks_like_variety_url(url: str, brand_path: str) -> bool: + """True if ``url`` is a per-variety product page under ``brand_path`` + (not a catalog/landing page or sub-tool).""" + rest = url.split(brand_path, 1)[-1].strip("/") + if not rest or "/" in rest: + return False # empty (the brand index) or a sub-path tool + if rest in NON_VARIETY_PATH_TAILS: + return False + return True + + +# --------------------------------------------------------------------- discovery + + +def discover_varieties( + http: RateLimitedSession, + *, + only_brand: str | None = None, +) -> list[tuple[str, str, str, str]]: + """Return ``[(url, brand, crop, lastmod), ...]`` for every Bayer + seed variety found in the dynamic sitemap. + + ``brand`` is the lowercase brand key (matches ``BRANDS``). + ``lastmod`` is the ISO 8601 timestamp from the sitemap entry. + """ + log.info("fetching sitemap %s", SITEMAP_URL) + r = http.get(SITEMAP_URL) + r.raise_for_status() + xml = r.text + + # Tiny regex parse — sitemap is flat and well-formed; no need for + # the lxml dependency on a single 600KB file. 
+ entries = re.findall( + r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?", + xml, + ) + log.info("sitemap parsed: %d total URLs", len(entries)) + + out: list[tuple[str, str, str, str]] = [] + for url, lastmod in entries: + for brand, (brand_path, crop) in BRANDS.items(): + if only_brand and brand != only_brand: + continue + if brand_path in url and looks_like_variety_url(url, brand_path): + out.append((url, brand, crop, lastmod or "")) + break + + by_brand: dict[str, int] = {} + for _, b, _, _ in out: + by_brand[b] = by_brand.get(b, 0) + 1 + log.info("variety URLs found: %s (total=%d)", + ", ".join(f"{k}={v}" for k, v in sorted(by_brand.items())), + len(out)) + return out + + +# --------------------------------------------------------------------- detail + + +def fetch_product_detail( + http: RateLimitedSession, url: str, brand: str, crop: str, lastmod: str +) -> BayerSeedProduct: + """Fetch + parse one product page into a ``BayerSeedProduct``.""" + r = http.get(url) + r.raise_for_status() + data = parse_next_data(r.text) + pp = (data.get("props") or {}).get("pageProps") or {} + pd = pp.get("productDetails") or {} + + prod = BayerSeedProduct( + source_key=source_key_from_url(url, brand), + source_url=url, + brand=(pd.get("brand") or brand).upper(), + crop=(pd.get("crop") or crop).lower(), + sitemap_last_modified=lastmod or None, + ) + + prod.product_name = pd.get("hybridLabel") or pd.get("productName") or prod.source_key + prod.product_id = pd.get("productId") + prod.hybrid_prefix = pd.get("hybridPrefix") + prod.hybrid_suffix = pd.get("hybridSuffix") + + ry = pd.get("releaseYear") + if isinstance(ry, int): + prod.release_year = ry + elif isinstance(ry, str) and ry.isdigit(): + prod.release_year = int(ry) + + # Maturity routing per crop. Source stores all three in + # `relativeMaturity` as a string; we split by crop semantics. 
+ rm = pd.get("relativeMaturity") + if rm is not None: + rm_str = str(rm) + if prod.crop == "corn": + prod.relative_maturity = rm_str + elif prod.crop == "soybeans": + prod.maturity_group = rm_str + elif prod.crop == "wheat": + # WestBred encodes Early/Medium/Late as the qualitative + # maturity. The class (HRW/HRS/SWW/...) is not in + # productDetails — it's only in the marketing narrative. + # We surface what we have; a future enrichment step can + # parse the narrative if needed. + prod.wheat_class = None # explicit: not exposed in this JSON + prod.relative_maturity = rm_str + + # Traits + for t in pd.get("traits") or []: + code = (t or {}).get("trait") + full = (t or {}).get("traitFullName") + if code: + prod.trait_codes.append(code) + if full: + prod.trait_descriptions.append(full) + + # Narrative + prod.positioning_statement = pd.get("positioningStatement") + sm = pd.get("strengthsAndManagement") or pd.get("strengths") or [] + if isinstance(sm, list): + prod.strengths = [str(s).strip() for s in sm if s] + + # Ratings groups — preserved verbatim (label / type / items). + chars = pd.get("characteristics") or [] + cleaned_groups: list[dict] = [] + for g in chars: + if not isinstance(g, dict): + continue + items = [ + {"characteristic": (it.get("characteristic") or "").strip(), + "value": ("" if it.get("value") is None else str(it.get("value"))).strip()} + for it in (g.get("items") or []) + if isinstance(it, dict) and it.get("characteristic") + ] + if not items: + continue + cleaned_groups.append({ + "label": (g.get("label") or "").strip(), + "type": (g.get("type") or "").strip(), + "items": items, + }) + prod.characteristics_groups = cleaned_groups + + # Regional recommendations. 
+ lp = pd.get("localProfiles") or [] + if isinstance(lp, list): + for p in lp: + if not isinstance(p, dict): + continue + prod.regional_recommendations.append({ + "product_list_name": p.get("productListName"), + "agronomist": p.get("agronomist"), + "agronomist_email": p.get("agronomistEmailAddress"), + "variant_id": p.get("variantId"), + }) + + # Image (just the first one) + imgs = pp.get("images") or [] + if isinstance(imgs, list) and imgs and isinstance(imgs[0], dict): + prod.image_url = imgs[0].get("url") + + return prod + + +# --------------------------------------------------------------------- render + + +def render_markdown(p: BayerSeedProduct) -> str: + """Build the markdown body for the variety. The Phase 2 chunker will + rewrite chunk_0 with a tighter preamble; this is the readable today + copy that already covers everything searchable. + """ + title = p.product_name or p.source_key + crop_label = p.crop.capitalize() + + maturity_lines: list[str] = [] + if p.relative_maturity is not None and p.crop == "corn": + maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}") + if p.maturity_group is not None and p.crop == "soybeans": + maturity_lines.append(f"- **Maturity group:** {p.maturity_group}") + if p.relative_maturity is not None and p.crop == "wheat": + maturity_lines.append(f"- **Maturity:** {p.relative_maturity}") + if p.wheat_class: + maturity_lines.append(f"- **Wheat class:** {p.wheat_class}") + + trait_line = "" + if p.trait_codes: + codes = ", ".join(p.trait_codes) + if p.trait_descriptions: + descs = "; ".join(p.trait_descriptions) + trait_line = f"- **Traits:** {codes} ({descs})" + else: + trait_line = f"- **Traits:** {codes}" + + header_lines = [ + f"# {title}", + "", + "- **Vendor:** Bayer", + f"- **Brand:** {p.brand.title() if p.brand else '(unknown)'}", + f"- **Crop:** {crop_label}", + *maturity_lines, + ] + if trait_line: + header_lines.append(trait_line) + if p.release_year: + header_lines.append(f"- **Release year:** 
{p.release_year}") + header_lines.append(f"- **Source:** {p.source_url}") + header_lines.append(f"- **Rating scale (Bayer):** {RATING_SCALE_DIRECTION}") + header_lines.append("") + header_lines.append("---") + header_lines.append("") + + sections: list[str] = [] + + if p.positioning_statement: + sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n") + + if p.strengths: + bullets = "\n".join(f"- {s}" for s in p.strengths) + sections.append("## Strengths & management\n\n" + bullets + "\n") + + # Render each characteristics group as its own table for readability. + for g in p.characteristics_groups: + label = g.get("label") or "Characteristics" + items = g.get("items") or [] + if not items: + continue + rows = "\n".join( + f"| {it['characteristic']} | {it['value']} |" + for it in items + ) + sections.append( + f"## {label.title()}\n\n" + "| Characteristic | Value |\n" + "|---|---|\n" + f"{rows}\n" + ) + + if p.regional_recommendations: + seen: set[str] = set() + rows: list[str] = [] + for r in p.regional_recommendations: + name = (r.get("product_list_name") or "").strip() + agronomist = (r.get("agronomist") or "").strip() + key = f"{name}||{agronomist}" + if key in seen or not name: + continue + seen.add(key) + rows.append(f"- **{name}** — agronomist: {agronomist or '(unlisted)'}") + if rows: + sections.append("## Regional seed-guide listings\n\n" + "\n".join(rows) + "\n") + + return "\n".join(header_lines) + "\n".join(sections) + + +# --------------------------------------------------------------------- write + + +def write_product(prod: BayerSeedProduct, body_md: str) -> None: + """Write the markdown body + sidecar JSON. 
Schema documented in + seed-mcp/CLAUDE.md.""" + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + md_path = CORPUS_DIR / f"{prod.source_key}.md" + json_path = CORPUS_DIR / f"{prod.source_key}.json" + + md_path.write_text(body_md, encoding="utf-8") + + sidecar = { + "source": "bayer_seeds", + "source_key": prod.source_key, + "vendor": "Bayer", + "brand": prod.brand, + "product_name": prod.product_name, + "product_id": prod.product_id, + "hybrid_prefix": prod.hybrid_prefix, + "hybrid_suffix": prod.hybrid_suffix, + "crop": prod.crop, + "release_year": prod.release_year, + "relative_maturity": prod.relative_maturity, + "maturity_group": prod.maturity_group, + "wheat_class": prod.wheat_class, + "trait_stack": prod.trait_codes, + "trait_descriptions": prod.trait_descriptions, + "positioning_statement": prod.positioning_statement, + "strengths": prod.strengths, + # Raw grouped ratings preserved as published. Chunker re-buckets + # into canonical disease/agronomic flats per CLAUDE.md schema. + "characteristics_groups": prod.characteristics_groups, + "_scale_direction": RATING_SCALE_DIRECTION, + "regional_recommendations": prod.regional_recommendations, + "image_url": prod.image_url, + "source_urls": [prod.source_url], + "sitemap_last_modified": prod.sitemap_last_modified, + "fetched_at": datetime.now(timezone.utc).isoformat(), + "scraper_version": SCRAPER_VERSION, + } + json_path.write_text( + json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +# --------------------------------------------------------------------- pipeline + + +def process_product( + http: RateLimitedSession, + *, + url: str, + brand: str, + crop: str, + lastmod: str, + force: bool, +) -> tuple[str, BayerSeedProduct | None]: + """Returns ``(status, prod or None)`` where status is one of + ``written`` / ``skipped`` / ``failed``.""" + source_key = source_key_from_url(url, brand) + md_path = CORPUS_DIR / f"{source_key}.md" + if md_path.exists() and not force: + return 
"skipped", None + + try: + prod = fetch_product_detail(http, url, brand, crop, lastmod) + except Exception as exc: # noqa: BLE001 + log.error("detail fetch failed for %s: %s", url, exc) + return "failed", None + + body = render_markdown(prod) + write_product(prod, body) + return "written", prod + + +def run( + *, + limit: int | None, + force: bool, + only_brand: str | None, + only_product: str | None, +) -> int: + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + http = RateLimitedSession() + + targets = discover_varieties(http, only_brand=only_brand) + if only_product: + targets = [ + (u, b, c, lm) for (u, b, c, lm) in targets + if source_key_from_url(u, b) == only_product + or u.rstrip("/").rsplit("/", 1)[-1].lower() == only_product + ] + if not targets: + log.error("no variety matched --product=%s", only_product) + return 2 + + counts = {"written": 0, "skipped": 0, "failed": 0} + processed = 0 + for url, brand, crop, lastmod in targets: + if limit is not None and processed >= limit: + break + processed += 1 + status, prod = process_product( + http, url=url, brand=brand, crop=crop, lastmod=lastmod, force=force, + ) + counts[status] = counts.get(status, 0) + 1 + + if prod is not None: + log.info( + "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s ratings_groups=%d", + processed, str(limit) if limit else "all", + prod.source_key, status, prod.crop, + prod.relative_maturity or prod.maturity_group or "-", + ",".join(prod.trait_codes) or "-", + len(prod.characteristics_groups), + ) + else: + log.info("[%d/%s] %s %s", + processed, str(limit) if limit else "all", + source_key_from_url(url, brand), status) + + log.info( + "done: processed=%d written=%d skipped=%d failed=%d (out of %d candidates)", + processed, counts["written"], counts["skipped"], counts["failed"], len(targets), + ) + return 0 if counts["failed"] == 0 else 1 + + +# --------------------------------------------------------------------- CLI + + +def _build_argparser() -> argparse.ArgumentParser: + p = 
argparse.ArgumentParser( + prog="scrape.sources.bayer_seeds", + description="Scrape Bayer DEKALB / Asgrow / WestBred seed varieties.", + ) + p.add_argument( + "--limit", type=int, default=None, + help="Stop after processing N varieties (default: all).", + ) + p.add_argument( + "--force", action="store_true", + help="Re-fetch even if the markdown file already exists.", + ) + p.add_argument( + "--brand", default=None, choices=sorted(BRANDS), + help="Limit to one Bayer seed brand.", + ) + p.add_argument( + "--product", default=None, + help="Process a single variety by source_key " + "(e.g. 'dekalb-dkc62-08rib') or terminal URL slug.", + ) + p.add_argument( + "--log-level", default=os.environ.get("LOG_LEVEL", "INFO"), + help="Python logging level (default INFO).", + ) + return p def main(argv: list[str] | None = None) -> int: - print("bayer_seeds: not implemented yet — see ~/github/crop-chem-docs/scrape/sources/bayer.py for the reference Next.js extraction pattern", - file=sys.stderr) - return 2 + args = _build_argparser().parse_args(argv) + logging.basicConfig( + level=args.log_level.upper(), + format="%(asctime)s %(levelname)s %(name)s %(message)s", + stream=sys.stderr, + ) + return run( + limit=args.limit, + force=args.force, + only_brand=args.brand, + only_product=args.product, + ) if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) + sys.exit(main())