Add 4 independent seed brands: Latham + Stine + 1st Choice + Burrus (+623 varieties) (#17)

Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
2026-06-04 21:58:07 -04:00
parent 22e8092faf
commit 84ad2b1de6
1254 changed files with 103589 additions and 4 deletions
@@ -0,0 +1,561 @@
+"""Burrus Seed scraper — independent family-owned company (Arenzville, IL).
+
+Source: Burrus Hybrids ("Burrus Seed"), an independent family company
+founded **1935** in Arenzville, Illinois — NOT owned by any of the
+multinationals (Bayer / Corteva / Syngenta / BASF). It markets corn under
+the **Burrus** and **Power Plus** brands and soybeans under the **Burrus**
+and **DONMARIO** brands, sold through a dealer network across IL / IN / IA
+/ MO / WI.
+
+Unlike the ProHarvest scraper (which parses HTML detail pages), Burrus
+publishes its full agronomic dataset through the **Seedware** catalog
+widget's JSON-over-JSONP API (the backend for the product finder on
+``burrusseed.com/products/{corn,soybeans}``). So this scraper does TWO
+list calls and maps JSON fields straight into ``characteristics_groups``;
+there is no per-variety page fetch.
+
+Seedware API
+------------
+``GET https://burrus25.seedware.net/app/_queries/crop_varieties.php
+    ?crop_pkey=101&callback=cb``  -> CORN  (JSONP)
+``crop_pkey=102``                 -> SOYBEANS
+
+Both require:
+  * a ``callback`` query param (WITHOUT it the endpoint returns ``[]``),
+  * a ``Referer: https://burrusseed.com/`` header.
+The response is ``cb([...]);`` — strip the JSONP wrapper to get a JSON
+array of ~38 corn + ~26 soy records. Each record has ~44 fields:
+``id`` (variety code, e.g. ``8J697AM``), ``description`` (brand + code,
+e.g. ``Power Plus 8J697AM``), ``pkey`` (Seedware row id), ``maturity``
+(RM for corn / MG for soy, as a string like ``"97.00"`` / ``"2.00"``),
+``released`` (year int), ``trait`` / ``trait_platform``, a per-record
+brand in ``stat_corn_brand`` / ``stat_soybean_brand``, and many
+``stat_*`` agronomic / disease / herbicide-tolerance ratings.
+
+Rating scales (confirmed from the live data, Jun 2026)
+------------------------------------------------------
+  * **Numeric agronomic + disease ratings: 1-10, 10 = best / most
+    tolerant** (observed values 4-10; standard Seedware/seed-industry
+    high-is-better scale). Soy agronomic stats arrive as ``"8.000"`` —
+    the trailing zeros are stripped to ``"8"``. ``NR`` / ``None`` /
+    blank / ``-`` / ``0`` = not rated and are SKIPPED (never coerced to
+    a value).
+  * **Herbicide tolerance + insect-protection packages: Yes / No**
+    (verbatim). ``glyphosate`` / ``glufosinate`` / ``2,4-D choline`` /
+    ``FOPs`` / ``dicamba`` tolerances and the Bt insect packages
+    (corn borer / rootworm / etc.) are categorical Yes/No, not numeric.
+  * **Categorical agronomic notes** (corn-on-corn suitability, refuge
+    structure) pass through verbatim.
+
+Output:
+  corpus/burrus/<source_key>.md
+  corpus/burrus/<source_key>.json
+
+source_key: ``burrus-<id>`` lowercased + slugified, e.g.
+``burrus-8j697am``. The variety ``id`` (the catalog code) is stable.
+
+CLI:
+  python -m scrape.sources.burrus --crop corn --limit 2 --force
+  python -m scrape.sources.burrus --crop soybeans
+  python -m scrape.sources.burrus --force
+  python -m scrape.sources.burrus --product burrus-8j697am
+
+ROBOTS / UA: burrusseed.com robots.txt blocks ~33 NAMED AI/scraper bots
+(Scrapy, CCBot, Bytespider, Diffbot, ...) and declares ``Crawl-delay: 10``
+and ``Content-signal: ai-train=no``; ``User-agent: *`` is allowed. The
+operator has chosen to include this source. We use a non-blacklisted UA
+and honour the 10-second crawl delay (the API call count is tiny — two
+list calls — so this is cheap).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import random
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import requests
+
+# Version stamp written into every JSON sidecar as "scraper_version".
+SCRAPER_VERSION = "0.1.0"
+# NOT any blacklisted bot name — robots.txt allows User-agent: *.
+USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
+# Seedware host that serves the catalog API; API is the variety-list endpoint.
+SEEDWARE = "https://burrus25.seedware.net"
+API = f"{SEEDWARE}/app/_queries/crop_varieties.php"
+# SITE builds each record's source_url; REFERER is sent as the Referer
+# header on every request made through RateLimitedSession.
+SITE = "https://burrusseed.com"
+REFERER = "https://burrusseed.com/"
+
+# chunker crop value -> (Seedware crop_pkey, public product page slug).
+CROP_PKEYS = {
+    "corn": (101, "corn"),
+    "soybeans": (102, "soybeans"),
+}
+
+# robots.txt declares Crawl-delay: 10 for burrusseed.com / seedware.net.
+# Honour it — the catalog is only two list calls so this is cheap.
+REQ_INTERVAL_SEC = 10.0
+
+# Human-readable scale note; copied verbatim into the markdown header
+# ("Rating scale") and into the sidecar's "_scale_direction" field.
+RATING_SCALE_DIRECTION = (
+    "numeric agronomic + disease ratings 1-10, 10=best/most-tolerant "
+    "(observed 4-10; higher is better); NR/blank/0/'-' = not rated (omitted). "
+    "Herbicide tolerances and Bt insect-protection packages are Yes/No "
+    "(verbatim, not numeric). Corn-on-corn suitability and refuge structure "
+    "are categorical."
+)
+
+# ----- stat_* field -> (group label, human characteristic name) -----------
+#
+# Group labels match the chunker's buckets in rag/chunk.py:
+#   "DISEASE RATINGS"           -> disease framing
+#   "AGRONOMIC CHARACTERISTICS" -> agronomic framing
+#   "HERBICIDE TOLERANCE"       -> falls into the chunker's MANAGEMENT
+#                                  bucket ("HERBICIDE" is a recognised label),
+#                                  so it renders as "Management notes".
+# Fields intentionally NOT mapped: stat_corn_brand / stat_soybean_brand
+# (used for the per-record brand), stat_herbicide_tolerance (always blank
+# in the live data — the per-chemistry stats carry the real signal).
+
+# Each map below is: Seedware stat_* record key -> display name used as the
+# "characteristic" of an item. Dict order is the order items are emitted in.
+DISEASE_FIELDS = {
+    # corn
+    "stat_gray_leaf_spot_tolerance": "Gray leaf spot tolerance",
+    "stat_tar_spot_tolerance": "Tar spot tolerance",
+    # soy
+    "stat_brown_stem_rot": "Brown stem rot (BSR) tolerance",
+    "stat_sds": "Sudden death syndrome (SDS) tolerance",
+    # Two differently named phytophthora keys are both mapped.
+    "stat_phytophthora_root_rot": "Phytophthora root rot tolerance",
+    "stat_prr_phytophthora_root_rot": "Phytophthora root rot (PRR) tolerance",
+}
+
+# Agronomic ratings — numeric 1-10 (corn) and "8.000"-style (soy).
+AGRONOMIC_NUMERIC_FIELDS = {
+    # corn
+    "stat_drought_tolerance": "Drought tolerance",
+    "stat_greensnap_tolerance": "Greensnap tolerance",
+    "stat_root_strength": "Root strength",
+    "stat_stalk_strength": "Stalk strength",
+    "stat_standability": "Standability",
+    "stat_black_cutworm": "Black cutworm tolerance",
+    # soy
+    "stat_emergence": "Emergence",
+    "stat_canopy_width": "Canopy width",
+    "stat_plant_height": "Plant height",
+}
+
+# Agronomic categorical / Yes-No notes (insect protection + placement).
+AGRONOMIC_CATEGORICAL_FIELDS = {
+    "stat_corn_corn": "Corn-on-corn suitability",
+    "stat_refuge": "Refuge structure",
+    "stat_corn_borer": "Corn borer protection (Bt)",
+    "stat_corn_rootworm": "Corn rootworm protection (Bt)",
+    "stat_corn_earworm": "Corn earworm protection (Bt)",
+    "stat_nematode": "Nematode protection",
+    "stat_wireworm": "Wireworm protection",
+}
+
+# Herbicide tolerance — Yes/No per chemistry.
+HERBICIDE_FIELDS = {
+    "stat_glyphosate_tolerance": "Glyphosate tolerance",
+    "stat_glufosinate_tolerance": "Glufosinate tolerance",
+    "stat_24d_choline_tolerance": "2,4-D choline tolerance",
+    "stat_dicamba_tolerance": "Dicamba tolerance",
+    "stat_fops_tolerance": "FOPs (fop herbicide) tolerance",
+}
+
+# (group label, field map) pairs in output order; _build_groups walks this
+# list and drops any group that ends up with no rated items. The numeric and
+# categorical agronomic maps are merged into a single group.
+GROUP_ORDER = [
+    ("DISEASE RATINGS", DISEASE_FIELDS),
+    ("AGRONOMIC CHARACTERISTICS", {**AGRONOMIC_NUMERIC_FIELDS,
+                                   **AGRONOMIC_CATEGORICAL_FIELDS}),
+    ("HERBICIDE TOLERANCE", HERBICIDE_FIELDS),
+]
+
+# Values that mean "not rated" — never coerced into a chunk.
+# Compared against the stripped, lower-cased string form of a raw value.
+_NOT_RATED = {"", "-", "--", "n/a", "na", "nr", "none", "0", "0.000", "0.00"}
+
+# Output location: $CORPUS_ROOT when set (and non-empty), otherwise the
+# "corpus" directory two levels above this file's directory.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_DIR = CORPUS_ROOT / "burrus"
+
+log = logging.getLogger("scrape.burrus")
+
+
+# --------------------------------------------------------------------- HTTP
+
+
+class RateLimitedSession:
+    """Polite ``requests`` session with request pacing and retry/backoff.
+
+    Honours burrusseed.com's Crawl-delay: 10 (>=10 s between requests to
+    seedware.net / burrusseed.com). The Burrus catalog is two list calls
+    total, so the delay is cheap.
+    """
+
+    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
+        """Create the session.
+
+        ``interval`` is the minimum gap, in seconds, enforced between two
+        consecutive requests.
+        """
+        self.s = requests.Session()
+        self.s.headers["User-Agent"] = USER_AGENT
+        self.s.headers["Referer"] = REFERER
+        self.s.headers["Accept"] = "*/*"
+        self.interval = interval
+        self._last = 0.0
+
+    def _wait(self) -> None:
+        """Sleep until ``interval`` seconds have passed since the previous
+        request. The very first request (``_last == 0``) never waits."""
+        delta = time.monotonic() - self._last
+        if self._last and delta < self.interval:
+            time.sleep(self.interval - delta)
+        self._last = time.monotonic()
+
+    def request(self, method: str, url: str, *, max_retries: int = 4,
+                timeout: float = 30.0, **kw: Any) -> requests.Response:
+        """Send one request, retrying on network errors, 429 and 5xx.
+
+        Every attempt is paced through ``_wait``. Between attempts the
+        method sleeps for the server's integer ``Retry-After`` (when
+        present) or an exponential backoff capped at 30 s. No backoff is
+        slept after the final attempt — there is nothing left to wait for.
+
+        Returns the first response that is not 429/5xx. When retries are
+        exhausted on a retryable status, that last response is returned so
+        the caller can inspect it or call ``raise_for_status()``.
+
+        Raises:
+            ValueError: ``max_retries`` is smaller than 1.
+            requests.RequestException: retries were exhausted and at least
+                one attempt failed at the network level (the most recent
+                such error is raised).
+        """
+        if max_retries < 1:
+            raise ValueError("max_retries must be at least 1")
+        last_exc: Exception | None = None
+        resp: requests.Response | None = None
+        for attempt in range(max_retries):
+            final_attempt = attempt == max_retries - 1
+            self._wait()
+            try:
+                resp = self.s.request(method, url, timeout=timeout, **kw)
+            except requests.RequestException as exc:
+                last_exc = exc
+                if final_attempt:
+                    log.warning("network error on %s %s: %s — giving up",
+                                method, url, exc)
+                    break
+                backoff = min(30.0, (2 ** attempt) + random.random())
+                log.warning("network error on %s %s: %s — retry in %.1fs",
+                            method, url, exc, backoff)
+                time.sleep(backoff)
+                continue
+            if resp.status_code == 429 or 500 <= resp.status_code < 600:
+                if final_attempt:
+                    log.warning("HTTP %d on %s %s — giving up",
+                                resp.status_code, method, url)
+                    break
+                ra = resp.headers.get("Retry-After")
+                # Only the delta-seconds form of Retry-After is honoured;
+                # an HTTP-date value falls back to exponential backoff.
+                backoff = float(ra) if (ra and ra.isdigit()) else min(
+                    30.0, (2 ** attempt) + random.random())
+                log.warning("HTTP %d on %s %s — retry in %.1fs",
+                            resp.status_code, method, url, backoff)
+                time.sleep(backoff)
+                continue
+            return resp
+        if last_exc is not None:
+            raise last_exc
+        # The loop ran at least once without raising, so resp is set.
+        assert resp is not None
+        return resp
+
+    def get(self, url: str, **kw: Any) -> requests.Response:
+        """Shorthand for ``request("GET", url, **kw)``."""
+        return self.request("GET", url, **kw)
+
+
+def _strip_jsonp(text: str) -> Any:
+    """Decode a JSONP response such as ``cb([...]);``.
+
+    When the trimmed text looks like ``name( ... )`` with an optional
+    trailing semicolon, only the part between the outer parentheses is
+    parsed; otherwise the whole text is parsed as plain JSON. Malformed
+    JSON propagates as ``json.JSONDecodeError``.
+    """
+    payload = text.strip()
+    wrapped = re.match(r"^[^(]*\((.*)\)\s*;?\s*$", payload, re.S)
+    if wrapped is not None:
+        payload = wrapped.group(1)
+    return json.loads(payload)
+
+
+# --------------------------------------------------------------------- model
+
+
+@dataclass
+class BurrusVariety:
+    """One catalog variety, as produced by ``map_record`` and consumed by
+    ``render_markdown`` / ``write_variety``."""
+
+    # "burrus-" + slug; also the stem of the output .md / .json file names.
+    source_key: str
+    crop: str                          # chunker value: corn / soybeans
+    product_name: str                  # "Power Plus 8J697AM"
+    product_id: str                    # "8J697AM"
+    brand: str                         # "Burrus" | "Power Plus" | "DONMARIO"
+    # Exactly one of the two maturity fields is filled by _maturity:
+    # relative_maturity for corn, maturity_group for every other crop.
+    relative_maturity: int | None = None
+    maturity_group: float | None = None
+    release_year: int | None = None
+    # De-duplicated values of the record's trait / trait_platform fields.
+    trait_stack: list[str] = field(default_factory=list)
+    # Always None for this source (map_record never sets a blurb).
+    positioning: str | None = None
+    # [{"label": ..., "items": [{"characteristic": ..., "value": ...}]}]
+    groups: list[dict] = field(default_factory=list)
+    source_url: str = ""
+
+
+# --------------------------------------------------------------------- fetch
+
+
+def fetch_crop(http: RateLimitedSession, crop_pkey: int) -> list[dict]:
+    """Download and decode the JSONP variety list for one ``crop_pkey``.
+
+    Raises ``requests.HTTPError`` for a non-success status (via
+    ``raise_for_status``) and ``ValueError`` when the decoded payload is
+    not a JSON array.
+    """
+    response = http.get(f"{API}?crop_pkey={crop_pkey}&callback=cb")
+    response.raise_for_status()
+    payload = _strip_jsonp(response.text)
+    if isinstance(payload, list):
+        return payload
+    raise ValueError(f"unexpected payload for crop_pkey={crop_pkey}: "
+                     f"{type(payload).__name__}")
+
+
+# --------------------------------------------------------------------- mapping
+
+
+def _slug(s: str) -> str:
+    """Lower-case ``s`` and reduce it to hyphen-separated ``[a-z0-9]`` runs.
+
+    ``None`` or empty input yields an empty string; leading and trailing
+    hyphens are removed.
+    """
+    lowered = (s or "").strip().lower()
+    hyphenated = re.sub(r"[^a-z0-9]+", "-", lowered)
+    collapsed = re.sub(r"-+", "-", hyphenated)
+    return collapsed.strip("-")
+
+
+def _is_rated(v: Any) -> bool:
+    """Return True when ``v`` carries a real rating.
+
+    ``None`` and every token in ``_NOT_RATED`` (compared after stripping
+    and lower-casing) count as not rated. Zero is a not-rated marker too,
+    so any numeric spelling of zero is rejected — ``"0.0"`` or a JSON
+    number ``0.0`` is not listed literally in ``_NOT_RATED`` and would
+    otherwise be emitted as a rating of 0.
+    """
+    if v is None:
+        return False
+    text = str(v).strip().lower()
+    if text in _NOT_RATED:
+        return False
+    try:
+        return float(text) != 0.0
+    except ValueError:
+        # Non-numeric text (Yes / No / Suitable / ...) is a real value.
+        return True
+
+
+def _clean_value(v: Any) -> str:
+    """Normalise a stat value for display.
+
+    Plain decimal numbers lose insignificant zeros ('8.000' -> '8',
+    '2.50' -> '2.5', '007' -> '7'); any spelling of zero becomes '0'.
+    Everything else passes through verbatim apart from surrounding
+    whitespace (Yes / No / Suitable / Integrated refuge / ...).
+
+    The number is handled as a ``Decimal`` rather than a float so that no
+    digits are lost: float ``:g`` formatting keeps only 6 significant
+    digits ('1234567.5' would become '1.23457e+06'), and a very long digit
+    string would overflow on the float -> int conversion.
+    """
+    from decimal import Decimal
+
+    text = str(v).strip()
+    if not re.fullmatch(r"-?\d+(?:\.\d+)?", text):
+        return text
+    number = Decimal(text)
+    if number == 0:
+        # Avoid rendering '-0' for inputs such as '-0.000'.
+        return "0"
+    # normalize() drops trailing zeros; the 'f' format keeps fixed-point
+    # notation (so 100.00 renders as '100', not '1E+2').
+    return format(number.normalize(), "f")
+
+
+def _maturity(rec: dict, crop: str) -> tuple[int | None, float | None]:
+    """Split the record's ``maturity`` into (relative_maturity, maturity_group).
+
+    Corn yields ``(rounded int, None)``; every other crop yields
+    ``(None, value rounded to one decimal)``. A missing, blank,
+    unparseable or non-finite value yields ``(None, None)``.
+    """
+    import math
+
+    raw = rec.get("maturity")
+    if raw is None or str(raw).strip() == "":
+        return None, None
+    try:
+        value = float(str(raw).strip())
+    except ValueError:
+        return None, None
+    # float() accepts "nan" / "inf"; int(round(...)) would raise on them.
+    if not math.isfinite(value):
+        return None, None
+    if crop == "corn":
+        return int(round(value)), None
+    return None, round(value, 1)
+
+
+def _brand(rec: dict) -> str:
+    """Resolve the brand of one record.
+
+    Preference order: ``stat_corn_brand`` (Burrus / Power Plus), then
+    ``stat_soybean_brand`` (Burrus / DONMARIO), then whatever precedes the
+    variety code at the end of ``description``, and finally 'Burrus'.
+    """
+    explicit = rec.get("stat_corn_brand") or rec.get("stat_soybean_brand")
+    explicit_text = str(explicit).strip() if explicit else ""
+    if explicit_text:
+        return explicit_text
+
+    description = (rec.get("description") or "").strip()
+    variety_code = (rec.get("id") or "").strip()
+    if not (description and variety_code):
+        return "Burrus"
+    if not description.lower().endswith(variety_code.lower()):
+        return "Burrus"
+    # Whatever is left once the trailing code is cut off is the brand.
+    prefix = description[: len(description) - len(variety_code)].strip()
+    return prefix or "Burrus"
+
+
+def _traits(rec: dict) -> list[str]:
+    """Collect the distinct, cleaned ``trait`` / ``trait_platform`` values
+    of a record, in that order."""
+    collected: list[str] = []
+    for field_name in ("trait", "trait_platform"):
+        raw = rec.get(field_name)
+        if not raw:
+            continue
+        text = str(raw).strip()
+        if not text:
+            continue
+        # strip stray trailing punctuation seen in the data
+        # ("Conventional." / "AM`")
+        cleaned = text.rstrip(".`")
+        if cleaned and cleaned not in collected:
+            collected.append(cleaned)
+    return collected
+
+
+def _build_groups(rec: dict) -> list[dict]:
+    """Turn a record's ``stat_*`` fields into characteristics groups.
+
+    Groups follow ``GROUP_ORDER``; a field is kept only when ``_is_rated``
+    accepts its value, and a group with no rated fields is left out.
+    """
+    built: list[dict] = []
+    for label, field_map in GROUP_ORDER:
+        rated = [
+            {"characteristic": human, "value": _clean_value(rec.get(key))}
+            for key, human in field_map.items()
+            if _is_rated(rec.get(key))
+        ]
+        if rated:
+            built.append({"label": label, "items": rated})
+    return built
+
+
+def map_record(rec: dict, crop: str) -> BurrusVariety:
+    """Convert one raw Seedware record into a ``BurrusVariety``.
+
+    The source key is seeded from the variety ``id``, falling back to
+    ``pkey-<pkey>`` and then to the description. ``released`` is kept only
+    when it is already an int.
+    """
+    code = (rec.get("id") or "").strip()
+    description = rec.get("description")
+    pkey = rec.get("pkey")
+    if code:
+        key_seed = code
+    elif pkey:
+        key_seed = f"pkey-{pkey}"
+    else:
+        key_seed = description or ""
+    name = (description or code or key_seed).strip()
+
+    rm, mg = _maturity(rec, crop)
+    page_slug = CROP_PKEYS[crop][1]
+    released = rec.get("released")
+    release_year = released if isinstance(released, int) else None
+
+    return BurrusVariety(
+        source_key=f"burrus-{_slug(key_seed)}",
+        crop=crop,
+        product_name=name,
+        product_id=code or name,
+        brand=_brand(rec),
+        relative_maturity=rm,
+        maturity_group=mg,
+        release_year=release_year,
+        trait_stack=_traits(rec),
+        # The Seedware records carry no marketing blurb; leave positioning
+        # null rather than fabricate one.
+        positioning=None,
+        groups=_build_groups(rec),
+        source_url=f"{SITE}/products/{page_slug}",
+    )
+
+
+# --------------------------------------------------------------------- render
+
+
+def render_markdown(v: BurrusVariety) -> str:
+    """Render a variety as markdown: an identity bullet list, a ``---``
+    rule, then one ``##`` section per characteristics group."""
+    crop_names = {"corn": "Corn", "soybeans": "Soybeans"}
+    crop_label = crop_names.get(v.crop, v.crop.title())
+
+    lines: list[str] = [
+        f"# {v.product_name}",
+        "",
+        "- **Vendor:** Burrus Seed (Burrus Hybrids — independent family "
+        "company, Arenzville, IL, since 1935)",
+        f"- **Brand:** {v.brand}",
+        f"- **Crop:** {crop_label}",
+    ]
+
+    # Optional identity bullets, emitted only when the value is present.
+    if v.crop == "corn" and v.relative_maturity is not None:
+        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
+    if v.crop == "soybeans" and v.maturity_group is not None:
+        lines.append(f"- **Maturity group:** {v.maturity_group}")
+    if v.trait_stack:
+        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
+    if v.release_year:
+        lines.append(f"- **Released:** {v.release_year}")
+
+    lines.extend([
+        f"- **Source:** {v.source_url}",
+        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
+        "- **Service area:** Burrus dealer network "
+        "(IL / IN / IA / MO / WI)",
+        "",
+        "---",
+        "",
+    ])
+
+    for group in v.groups:
+        lines.append(f"## {group['label'].title()}")
+        lines.append("")
+        lines.extend(
+            f"- **{item['characteristic']}:** {item['value'] or '—'}"
+            for item in group["items"]
+        )
+        lines.append("")
+    return "\n".join(lines)
+
+
+def write_variety(v: BurrusVariety, body_md: str) -> None:
+    """Write the JSON sidecar and the markdown body for one variety.
+
+    Both files go to ``CORPUS_DIR`` as ``<source_key>.json`` and
+    ``<source_key>.md`` (the directory is created when missing).
+
+    The sidecar is written FIRST and the markdown LAST: ``run`` treats an
+    existing ``.md`` file as "already scraped" and skips the variety, so
+    the markdown must only appear once the sidecar is safely on disk.
+    Otherwise an interrupted write would leave a variety that is skipped
+    forever (without ``--force``) yet has no sidecar.
+    """
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    sidecar = {
+        "source": "burrus",
+        "source_key": v.source_key,
+        "vendor": "Burrus Seed",
+        "brand": v.brand,
+        "product_name": v.product_name,
+        "product_id": v.product_id,
+        "crop": v.crop,
+        "release_year": v.release_year,
+        "relative_maturity": v.relative_maturity,
+        "maturity_group": v.maturity_group,
+        "wheat_class": None,
+        "trait_stack": v.trait_stack,
+        "trait_descriptions": [],
+        "positioning_statement": v.positioning,
+        "strengths": [],
+        "characteristics_groups": v.groups,
+        "_scale_direction": RATING_SCALE_DIRECTION,
+        "regional_recommendations": [
+            {"product_list_name": "Burrus dealer network (IL/IN/IA/MO/WI)",
+             "agronomist": None, "agronomist_email": None, "variant_id": None},
+        ],
+        "image_url": None,
+        "source_urls": [v.source_url],
+        "sitemap_last_modified": None,
+        "fetched_at": datetime.now(timezone.utc).isoformat(),
+        "scraper_version": SCRAPER_VERSION,
+    }
+    (CORPUS_DIR / f"{v.source_key}.json").write_text(
+        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8")
+    # Markdown last — its existence is the "done" marker checked by run().
+    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")
+
+
+# --------------------------------------------------------------------- pipeline
+
+
+def run(*, limit: int | None, force: bool,
+        only_crop: str | None, only_product: str | None) -> int:
+    """Fetch the catalog, map every record and write the corpus files.
+
+    Args:
+        limit: stop after processing this many varieties (None = all).
+            Skipped varieties count towards the limit.
+        force: rewrite a variety even when its markdown file exists.
+        only_crop: restrict the run to one key of ``CROP_PKEYS``.
+        only_product: restrict the run to one variety, matched against the
+            source key, the product id, or the slug of the product id.
+
+    Returns:
+        0 on success, 1 when no crop catalog could be fetched at all, and
+        2 when ``only_product`` matched nothing.
+    """
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    http = RateLimitedSession()
+
+    crops = [only_crop] if only_crop else list(CROP_PKEYS.keys())
+    records: list[tuple[str, dict]] = []
+    fetched_any = False
+    for crop in crops:
+        crop_pkey = CROP_PKEYS[crop][0]
+        try:
+            raw = fetch_crop(http, crop_pkey)
+        except (requests.RequestException, ValueError) as exc:
+            # RequestException (not just HTTPError): the session re-raises
+            # connection errors / timeouts once its retries are exhausted,
+            # and one unreachable crop must not abort the other.
+            log.error("fetch failed for crop=%s (pkey=%d): %s",
+                      crop, crop_pkey, exc)
+            continue
+        fetched_any = True
+        log.info("crop=%-9s pkey=%d: %d records", crop, crop_pkey, len(raw))
+        for rec in raw:
+            if not isinstance(rec, dict):
+                # map_record needs dict access; skip anything else.
+                log.warning("crop=%s: skipping non-object record %r",
+                            crop, rec)
+                continue
+            records.append((crop, rec))
+
+    if not fetched_any:
+        # Every fetch failed — report it instead of a silent "success".
+        log.error("no crop catalog could be fetched")
+        return 1
+
+    varieties = [map_record(rec, crop) for crop, rec in records]
+
+    if only_product:
+        key = only_product.lower()
+        varieties = [v for v in varieties
+                     if v.source_key == key or v.product_id.lower() == key
+                     or _slug(v.product_id) == _slug(key)]
+        if not varieties:
+            log.error("no variety matched --product=%s", only_product)
+            return 2
+
+    counts = {"written": 0, "skipped": 0, "empty": 0}
+    processed = 0
+    total = len(varieties)
+    for v in varieties:
+        if limit is not None and processed >= limit:
+            break
+        processed += 1
+        md_path = CORPUS_DIR / f"{v.source_key}.md"
+        if md_path.exists() and not force:
+            counts["skipped"] += 1
+            log.info("[%d/%d] %s skipped", processed, total, v.source_key)
+            continue
+        if not v.groups:
+            counts["empty"] += 1
+            log.warning("[%d/%d] %s — no rating groups (still writing identity)",
+                        processed, total, v.source_key)
+        write_variety(v, render_markdown(v))
+        counts["written"] += 1
+        # Explicit None checks: maturity group 0 is a real value and must
+        # not be logged as "-".
+        if v.relative_maturity is not None:
+            maturity: Any = v.relative_maturity
+        elif v.maturity_group is not None:
+            maturity = v.maturity_group
+        else:
+            maturity = "-"
+        log.info("[%d/%d] %s written | brand=%s crop=%s rm/mg=%s groups=%d "
+                 "traits=%s", processed, total, v.source_key, v.brand, v.crop,
+                 maturity, len(v.groups), ",".join(v.trait_stack) or "-")
+
+    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)",
+             processed, counts["written"], counts["skipped"], counts["empty"],
+             total)
+    return 0
+
+
+# --------------------------------------------------------------------- CLI
+
+
+def _build_argparser() -> argparse.ArgumentParser:
+    """Build the command-line parser (``--limit``, ``--force``, ``--crop``,
+    ``--product``, ``--log-level``)."""
+    parser = argparse.ArgumentParser(
+        prog="scrape.sources.burrus",
+        description="Scrape Burrus Seed (independent family company, "
+                    "Arenzville IL) — corn / soybeans via the Seedware "
+                    "JSON-over-JSONP catalog API.",
+    )
+    parser.add_argument(
+        "--limit", type=int, default=None,
+        help="Stop after processing N varieties (default: all).",
+    )
+    parser.add_argument(
+        "--force", action="store_true",
+        help="Re-write even if the markdown file already exists.",
+    )
+    crop_choices = sorted(CROP_PKEYS.keys())
+    parser.add_argument(
+        "--crop", default=None, choices=crop_choices,
+        help="Limit to one crop (corn / soybeans).",
+    )
+    parser.add_argument(
+        "--product", default=None,
+        help="Process a single variety by source_key or id.",
+    )
+    # The LOG_LEVEL environment variable supplies the default level.
+    default_level = os.environ.get("LOG_LEVEL", "INFO")
+    parser.add_argument("--log-level", default=default_level)
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse ``argv``, set up stderr logging and return
+    the exit code produced by ``run``."""
+    parser = _build_argparser()
+    opts = parser.parse_args(argv)
+    logging.basicConfig(
+        stream=sys.stderr,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+        level=opts.log_level.upper(),
+    )
+    return run(
+        limit=opts.limit,
+        force=opts.force,
+        only_crop=opts.crop,
+        only_product=opts.product,
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,671 @@
+"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).
+
+Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
+(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
+seed company in Rushville, Indiana, serving the Eastern Corn Belt
+(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
+that is out of scope for the row-crop advisor).
+
+Discovery is by **sitemap**, NOT the WP REST API: the catalog custom
+post types (corn-hybrids / soybeans / wheat) are NOT exposed to
+``/wp-json/`` (every variety route returns ``rest_no_route``). Instead we
+fetch ``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
+per-crop child sitemaps:
+
+  - ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/``  (~52 URLs)
+  - ``/soybeans-sitemap.xml``     -> ``/soybeans/<slug>/``      (~22 URLs)
+  - ``/wheat-sitemap.xml``        -> ``/wheat/<slug>/``         (~4 URLs)
+
+robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
+``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
+no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
+between requests.
+
+Detail-page DOM (server-rendered, no JS needed for the text):
+  * Product name: the second ``<h1>`` inside ``article.content`` (the
+    first is the site logo "1st Choice Seeds").
+  * Corn — three ``<h2>`` sections + a side table:
+      - "Hybrid Characteristics": a single ``<p>`` of ``label • value``
+        lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
+        Placement, Root Rating, Stalk Rating, Foliar Health, Drydown,
+        Ear Length/Girth/Flex, Test Weight). Some hybrids only publish
+        Seedling Vigor (genuinely thin pages — still written).
+      - "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
+        (the numeric 0-10 bars are drawn client-side by d3 and are NOT
+        in the HTML). The legend IS the scale: 0-4 Below Average … 9-10
+        Superior, so higher = better.
+      - "Management Tips": ``label: value`` lines (Corn-On-Corn,
+        Productivity / soil guidance, Silage Rating).
+      - A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
+        the Low/Medium/High recommended planting populations.
+  * Soybeans — three ``<h2>`` sections:
+      - "Field Notes": a ``<ul>`` of strengths (often includes SCN
+        source / PRR gene call-outs).
+      - "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
+      - "Variety Description": ``div`` blocks of ``<b>Label:</b> value``
+        pairs (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower
+        Color, Pubescence, Pod, Hilum).
+  * Wheat — thin (title + date only; wheat is private-label). We still
+    write an identity record so the variety is discoverable.
+
+Rating scale: the published legend is **0-10, higher = better**
+("Below Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8,
+Superior 9-10"). 1st Choice publishes the *qualitative* word
+(Excellent / Very Good / …) in the HTML — those map directly onto that
+legend — while the numeric bar is d3-rendered and absent from the
+markup. NA / blank = not rated.
+
+Output:
+  corpus/first_choice/<source_key>.md
+  corpus/first_choice/<source_key>.json
+
+source_key: ``firstchoice-<slug>`` lowercased, e.g.
+``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.
+
+CLI:
+  python -m scrape.sources.first_choice --crop corn --limit 5
+  python -m scrape.sources.first_choice --force
+  python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import random
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup, NavigableString, Tag
+
+SCRAPER_VERSION = "0.1.0"
+# Descriptive User-Agent sent on every request by RateLimitedSession.
+USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
+BASE = "https://www.1stchoiceseeds.com"
+# The sitemap *index*; the per-crop child sitemaps hang off BASE as well.
+SITEMAP_INDEX = f"{BASE}/sitemap.xml"
+
+# Chunker crop value -> per-crop child sitemap file name. The chunker keys
+# on "soybeans" (plural) for the MG branch, so map accordingly. The
+# cover-crops sitemap is intentionally omitted (out of scope for the
+# row-crop advisor).
+CROP_SITEMAPS = {
+    "corn": "corn-hybrids-sitemap.xml",
+    "soybeans": "soybeans-sitemap.xml",
+    "wheat": "wheat-sitemap.xml",
+}
+
+# URL path prefix that confirms a sitemap entry is a variety detail page
+# (vs. a category/archive page that can sneak into a child sitemap).
+CROP_PATH = {
+    "corn": "/corn-hybrids/",
+    "soybeans": "/soybeans/",
+    "wheat": "/wheat/",
+}
+
+# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
+# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple of minutes.
+REQ_INTERVAL_SEC = 1.2
+
+# Human-readable description of the published rating legend.
+RATING_SCALE_DIRECTION = (
+    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
+    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
+    "publishes the qualitative word in HTML (the numeric bar is "
+    "d3-rendered, not in markup); blank/NA = not rated"
+)
+
+# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
+# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
+# Entries are lower-case.
+_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}
+
+# Trait-suffix -> human label, derived from the slug tail. Best-effort;
+# an unmapped suffix is title-cased verbatim so nothing is dropped.
+# Keys are lower-case, matching the lower-cased URL slugs.
+TRAIT_LABELS = {
+    # corn
+    "vt2p": "VT Double PRO (VT2P)",
+    "gt": "Glyphosate Tolerant (GT)",
+    "c": "Conventional",
+    "pc": "PowerCore (PC)",
+    "tre": "Trecepta (TRE)",
+    "ss": "SmartStax (SS)",
+    "v": "VT (V)",
+    "dv": "Double VT (DV)",
+    "aa": "Agrisure Artesian (AA)",
+    # soybeans
+    "en": "Enlist E3 (EN)",
+    "xf": "XtendFlex (XF)",
+    "sts": "STS",
+    # wheat
+    "b": "Bin-run / branded (B)",
+    "s": "Soft (S)",
+}
+
+# Output location: $CORPUS_ROOT when set (and non-empty), otherwise the
+# "corpus" directory two levels above this file's directory.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_DIR = CORPUS_ROOT / "first_choice"
+
+log = logging.getLogger("scrape.first_choice")
+
+
+# --------------------------------------------------------------------- HTTP
+
+
+class RateLimitedSession:
+    """Polite ``requests`` session with request pacing and retry/backoff.
+
+    The 1st Choice row-crop catalog is small (~78 detail pages + 4
+    sitemaps) so 1.2 s/req still finishes in a couple of minutes.
+    """
+
+    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
+        """Create the session.
+
+        ``interval`` is the minimum gap, in seconds, enforced between two
+        consecutive requests.
+        """
+        self.s = requests.Session()
+        self.s.headers["User-Agent"] = USER_AGENT
+        self.interval = interval
+        self._last = 0.0
+
+    def _wait(self) -> None:
+        """Sleep until ``interval`` seconds have passed since the previous
+        request, then record the current time as the new reference."""
+        delta = time.monotonic() - self._last
+        if delta < self.interval:
+            time.sleep(self.interval - delta)
+        self._last = time.monotonic()
+
+    def request(self, method: str, url: str, *, max_retries: int = 4,
+                timeout: float = 30.0, **kw: Any) -> requests.Response:
+        """Send one request, retrying on network errors, 429 and 5xx.
+
+        Every attempt is paced through ``_wait``. Between attempts the
+        method sleeps for the server's integer ``Retry-After`` (when
+        present) or an exponential backoff capped at 30 s. No backoff is
+        slept after the final attempt — there is nothing left to wait for.
+
+        Returns the first response that is not 429/5xx. When retries are
+        exhausted on a retryable status, that last response is returned so
+        the caller can inspect it or call ``raise_for_status()``.
+
+        Raises:
+            ValueError: ``max_retries`` is smaller than 1.
+            requests.RequestException: retries were exhausted and at least
+                one attempt failed at the network level (the most recent
+                such error is raised).
+        """
+        if max_retries < 1:
+            raise ValueError("max_retries must be at least 1")
+        last_exc: Exception | None = None
+        resp: requests.Response | None = None
+        for attempt in range(max_retries):
+            final_attempt = attempt == max_retries - 1
+            self._wait()
+            try:
+                resp = self.s.request(method, url, timeout=timeout, **kw)
+            except requests.RequestException as exc:
+                last_exc = exc
+                if final_attempt:
+                    log.warning("network error on %s %s: %s — giving up",
+                                method, url, exc)
+                    break
+                backoff = min(30.0, (2 ** attempt) + random.random())
+                log.warning("network error on %s %s: %s — retry in %.1fs",
+                            method, url, exc, backoff)
+                time.sleep(backoff)
+                continue
+            if resp.status_code == 429 or 500 <= resp.status_code < 600:
+                if final_attempt:
+                    log.warning("HTTP %d on %s %s — giving up",
+                                resp.status_code, method, url)
+                    break
+                ra = resp.headers.get("Retry-After")
+                # Only the delta-seconds form of Retry-After is honoured;
+                # an HTTP-date value falls back to exponential backoff.
+                backoff = float(ra) if (ra and ra.isdigit()) else min(
+                    30.0, (2 ** attempt) + random.random())
+                log.warning("HTTP %d on %s %s — retry in %.1fs",
+                            resp.status_code, method, url, backoff)
+                time.sleep(backoff)
+                continue
+            return resp
+        if last_exc is not None:
+            raise last_exc
+        # The loop ran at least once without raising, so resp is set.
+        assert resp is not None
+        return resp
+
+    def get(self, url: str, **kw: Any) -> requests.Response:
+        """Shorthand for ``request("GET", url, **kw)``."""
+        return self.request("GET", url, **kw)
+
+
+# --------------------------------------------------------------------- model
+
+
+@dataclass
+class FCVariety:
+    """Record for one 1st Choice Seeds variety.
+
+    Only ``source_key``, ``source_url`` and ``crop`` are required; every
+    other field has an empty / ``None`` default.
+    """
+
+    source_key: str
+    source_url: str
+    crop: str                          # chunker value: corn / soybeans / wheat
+    product_name: str = ""             # "FC 8455 VT2P"
+    relative_maturity: int | None = None      # corn (days)
+    maturity_group: float | None = None       # soy
+    wheat_class: str | None = None             # wheat
+    trait_stack: list[str] = field(default_factory=list)
+    positioning: str | None = None
+    strengths: list[str] = field(default_factory=list)
+    # [{label, items:[{characteristic, value}]}] — chunker source of truth
+    groups: list[dict] = field(default_factory=list)
+    # NOTE(review): presumably the <lastmod> text from the sitemap entry —
+    # confirm against discover().
+    sitemap_last_modified: str | None = None
+
+
+# --------------------------------------------------------------------- discovery (sitemaps)
+
+
+# Sitemap XML is scanned with regexes rather than an XML parser. All three
+# patterns are case-insensitive and let "." span newlines.
+# _LOC_RE captures the text of a <loc> element, with or without a
+# <![CDATA[ ... ]]> wrapper and surrounding whitespace.
+_LOC_RE = re.compile(r"<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</loc>",
+                     re.IGNORECASE | re.DOTALL)
+# _URL_BLOCK_RE captures everything between <url> and the next </url>.
+_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
+# _LASTMOD_RE captures the text of a <lastmod> element (same CDATA /
+# whitespace tolerance as _LOC_RE).
+_LASTMOD_RE = re.compile(r"<lastmod>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</lastmod>",
+                         re.IGNORECASE | re.DOTALL)
+
+
+def _slug_from_url(url: str) -> str:
+    """Return the last path segment of ``url``, lower-cased.
+
+    Trailing slashes are ignored; a string without any ``/`` is returned
+    lower-cased as a whole.
+    """
+    trimmed = url.rstrip("/")
+    _, _, tail = trimmed.rpartition("/")
+    return tail.lower()
+
+
+def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
+    """Enumerate variety detail pages from the per-crop child sitemaps.
+
+    Every entry of ``CROP_SITEMAPS`` (narrowed to ``only_crop`` when given) is
+    fetched from ``BASE``. The sitemap index is read first, but only so a
+    warning can be logged when a child sitemap is not listed there; the child
+    is requested either way. A child answering 404 is skipped with a warning;
+    any other error status raises through ``raise_for_status``.
+
+    Returns a list of ``{"crop", "url", "slug", "lastmod"}`` dicts: one per
+    distinct slug within a crop, restricted to URLs that contain that crop's
+    ``CROP_PATH`` fragment.
+    """
+    # The index is advisory only: failing to read it must not stop discovery.
+    listed: set[str] = set()
+    try:
+        index_resp = http.get(SITEMAP_INDEX)
+        index_resp.raise_for_status()
+        listed = {loc.strip() for loc in _LOC_RE.findall(index_resp.text)}
+    except requests.RequestException as exc:
+        log.warning("could not read sitemap index %s: %s (continuing with "
+                    "known child sitemap names)", SITEMAP_INDEX, exc)
+
+    found: list[dict] = []
+    for crop, child in CROP_SITEMAPS.items():
+        if only_crop and crop != only_crop:
+            continue
+        child_url = f"{BASE}/{child}"
+        if listed and child_url not in listed:
+            log.warning("crop sitemap %s not listed in the index — site may "
+                        "have renamed it; trying anyway", child_url)
+        child_resp = http.get(child_url)
+        if child_resp.status_code == 404:
+            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
+            continue
+        child_resp.raise_for_status()
+        path_marker = CROP_PATH[crop]
+        slugs_seen: set[str] = set()
+        already = len(found)
+        for entry in _URL_BLOCK_RE.findall(child_resp.text):
+            loc_match = _LOC_RE.search(entry)
+            if loc_match is None:
+                continue
+            url = loc_match.group(1).strip()
+            if path_marker not in url:
+                # category/archive page leaked into the sitemap
+                continue
+            slug = _slug_from_url(url)
+            if not slug or slug in slugs_seen:
+                continue
+            slugs_seen.add(slug)
+            lastmod_match = _LASTMOD_RE.search(entry)
+            lastmod = lastmod_match.group(1).strip() if lastmod_match else None
+            found.append({
+                "crop": crop,
+                "url": url,
+                "slug": slug,
+                "lastmod": lastmod,
+            })
+        log.info("crop sitemap %-22s (%s): %d varieties",
+                 child, crop, len(found) - already)
+    log.info("total varieties discovered: %d", len(found))
+    return found
+
+
+# --------------------------------------------------------------------- detail parse
+
+
+def _clean(s: str) -> str:
+    """Collapse each whitespace run in ``s`` to a single space and trim the ends.
+
+    A falsy ``s`` (``None`` or ``""``) yields ``""``.
+    """
+    text = s if s else ""
+    collapsed = re.sub(r"\s+", " ", text)
+    return collapsed.strip()
+
+
+def _direct_text(el: Tag) -> str:
+    """Return the cleaned text that sits directly inside ``el``.
+
+    Only immediate ``NavigableString`` children contribute; text nested in
+    child tags is left out.
+    """
+    own_strings = [child for child in el.children
+                   if isinstance(child, NavigableString)]
+    return _clean("".join(own_strings))
+
+
+def _br_lines(el: Tag) -> list[str]:
+    """Return the non-empty, stripped text lines of ``el``, breaking at ``<br>``.
+
+    ``el`` is never modified: the ``<br>`` -> newline substitution is done on
+    a detached copy, so other parsers still see the original tree with its
+    ``<br>`` tags in place.
+    """
+    import copy  # local import: only this helper needs it
+
+    # copy.copy() on a Beautiful Soup tag yields a copy detached from the tree.
+    clone = copy.copy(el)
+    for br in clone.find_all("br"):
+        br.replace_with("\n")
+    stripped = (ln.strip() for ln in clone.get_text("\n").split("\n"))
+    return [ln for ln in stripped if ln]
+
+
+def _product_name(article: Tag, slug: str) -> str:
+    """Pick the variety name out of the ``<h1>`` headings under ``article``.
+
+    The first ``<h1>`` whose cleaned text is non-empty and is not the
+    site-logo heading "1st Choice Seeds" (compared case-insensitively) wins.
+    When no heading qualifies, the slug is upper-cased with hyphens turned
+    into spaces.
+    """
+    heading_texts = (_clean(h1.get_text(" ", strip=True))
+                     for h1 in article.find_all("h1"))
+    name = next((text for text in heading_texts
+                 if text and text.lower() != "1st choice seeds"), None)
+    if name is not None:
+        return name
+    return slug.upper().replace("-", " ")
+
+
+def _trait_stack(slug: str, crop: str) -> list[str]:
+    """Map the hyphen-separated tokens of ``slug`` to trait labels.
+
+    Each token is lower-cased and looked up in ``TRAIT_LABELS``; tokens with
+    no entry there (such as the model identifier) contribute nothing. Labels
+    are returned in slug order (left to right) with repeats removed.
+    ``crop`` is accepted but not consulted.
+    """
+    tokens = (piece.lower() for piece in slug.split("-"))
+    labels = (TRAIT_LABELS[tok] for tok in tokens if tok in TRAIT_LABELS)
+    # dict.fromkeys keeps first-seen order while dropping duplicate labels.
+    return list(dict.fromkeys(labels))
+
+
+def _parse_corn(article: Tag, v: FCVariety) -> None:
+    """Populate corn ratings from Hybrid Characteristics + Management Tips
+    + the Relative Maturity / Degree Days side table.
+
+    Mutates ``v`` in place: sets ``v.relative_maturity`` when a number is
+    found in the "Relative Maturity" table row, and appends up to three
+    groups to ``v.groups`` (AGRONOMIC CHARACTERISTICS, DISEASE RATINGS,
+    MANAGEMENT), each only when it has at least one item. Returns nothing.
+    """
+    agronomic: list[dict] = []
+    disease: list[dict] = []
+    management: list[dict] = []
+
+    # Hybrid Characteristics: a <p> of "label • value" lines.
+    hc = next((h for h in article.find_all("h2")
+               if _clean(h.get_text()) == "Hybrid Characteristics"), None)
+    if hc is not None:
+        # Only the element immediately after the heading is considered.
+        sib = hc.find_next_sibling()
+        if sib is not None and sib.name == "p":
+            for ln in _br_lines(sib):
+                # split on bullet (•) or fall back to first colon
+                if "•" in ln:
+                    k, _, val = ln.partition("•")
+                elif ":" in ln:
+                    k, _, val = ln.partition(":")
+                else:
+                    # No separator at all: keep the line as a label with an
+                    # empty value.
+                    k, val = ln, ""
+                k, val = _clean(k), _clean(val)
+                if not k:
+                    continue
+                item = {"characteristic": k, "value": val}
+                # Labels listed in _CORN_DISEASE_LABELS go to the disease
+                # bucket; everything else is treated as agronomic.
+                if k.lower() in _CORN_DISEASE_LABELS:
+                    disease.append(item)
+                else:
+                    agronomic.append(item)
+
+    # Management Tips: "label: value" lines (Corn-On-Corn / Productivity /
+    # Silage Rating). Lines without a colon, and footer-looking lines, are
+    # dropped by the filters below.
+    mt = next((h for h in article.find_all("h2")
+               if _clean(h.get_text()) == "Management Tips"), None)
+    if mt is not None:
+        sib = mt.find_next_sibling()
+        if sib is not None and sib.name == "p":
+            for ln in _br_lines(sib):
+                if ":" not in ln:
+                    continue
+                k, _, val = ln.partition(":")
+                k, val = _clean(k), _clean(val)
+                # Footer noise (address / © line) has no useful colon form.
+                if k and val and not k.startswith("©") and "rights reserved" not in ln.lower():
+                    management.append({"characteristic": k, "value": val})
+
+    # Side table: Relative Maturity / Degree Days + planting populations.
+    pop_rows: list[str] = []
+    for tbl in article.find_all("table"):
+        for tr in tbl.find_all("tr"):
+            cells = [_clean(c.get_text(" ", strip=True))
+                     for c in tr.find_all(["td", "th"])]
+            # Drop empty cells so cells[0] / cells[1] are real text.
+            cells = [c for c in cells if c]
+            if not cells:
+                continue
+            joined = " ".join(cells).lower()
+            if cells[0].lower().startswith("relative maturity") and len(cells) >= 2:
+                m = re.search(r"(\d+)", cells[1])
+                if m:
+                    v.relative_maturity = int(m.group(1))
+                # The raw cell text is kept (and listed first) even when no
+                # number could be extracted from it.
+                agronomic.insert(0, {"characteristic": "Relative Maturity",
+                                     "value": cells[1]})
+            elif cells[0].lower().startswith("degree days") and len(cells) >= 2:
+                agronomic.append({"characteristic": "Degree Days (GDU)",
+                                  "value": cells[1]})
+            elif joined.startswith("low") and ("medium" in joined or "high" in joined):
+                # A row starting with "low" that also mentions medium/high is
+                # taken to be a planting-population row.
+                pop_rows.append(" / ".join(cells))
+    if pop_rows:
+        management.append({"characteristic": "Recommended Planting Population",
+                           "value": "; ".join(pop_rows)})
+
+    # Emit only the non-empty buckets, in a fixed order.
+    if agronomic:
+        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": agronomic})
+    if disease:
+        v.groups.append({"label": "DISEASE RATINGS", "items": disease})
+    if management:
+        v.groups.append({"label": "MANAGEMENT", "items": management})
+
+
+def _parse_soy(article: Tag, v: FCVariety) -> None:
+    """Fill ``v`` from a soybean page's "Field Notes" and "Variety Description".
+
+    * Field Notes: when the heading is directly followed by a ``<ul>``, its
+      non-empty ``<li>`` texts become ``v.strengths``; the first one also
+      becomes ``v.positioning`` if that is still unset.
+    * Variety Description: every following ``<div>`` that has a direct
+      ``<b>Label:</b>`` child yields a ``{characteristic, value}`` item, up to
+      the next ``<h2>`` or a ``<div>`` carrying one of the stop classes. A
+      "Maturity" row additionally sets ``v.maturity_group`` when a number can
+      be read from it.
+
+    Mutates ``v`` in place and returns nothing.
+    """
+    headings = article.find_all("h2")
+
+    def heading(title: str):
+        # First <h2> whose cleaned text equals ``title`` exactly, else None.
+        return next((h for h in headings if _clean(h.get_text()) == title), None)
+
+    # Field Notes -> strengths (and positioning from the first one).
+    notes_heading = heading("Field Notes")
+    notes_list = notes_heading.find_next_sibling() if notes_heading is not None else None
+    if notes_list is not None and notes_list.name == "ul":
+        cleaned = (_clean(li.get_text(" ", strip=True))
+                   for li in notes_list.find_all("li"))
+        v.strengths = [note for note in cleaned if note]
+        if v.strengths and not v.positioning:
+            v.positioning = v.strengths[0]
+
+    # Variety Description -> [{characteristic, value}] from <b>Label:</b> value.
+    rows: list[dict] = []
+    desc_heading = heading("Variety Description")
+    if desc_heading is not None:
+        # Classes marking the action buttons / right-nav / footer region.
+        stop_classes = ("btn", "right-bar", "right-navigation",
+                        "address", "wrapper")
+        for el in desc_heading.find_all_next():
+            if el.name == "h2" and el is not desc_heading:
+                break
+            if not isinstance(el, Tag):
+                continue
+            if el.name != "div":
+                # Only <div> elements can stop the walk or carry a row.
+                continue
+            classes = el.get("class") or []
+            if any(name in classes for name in stop_classes):
+                break
+            bold = el.find("b", recursive=False)
+            if bold is None:
+                continue
+            label = _clean(bold.get_text(" ", strip=True)).rstrip(":")
+            value = _direct_text(el)
+            if not label:
+                continue
+            if label.lower() == "maturity":
+                number = re.search(r"[\d.]+", value)
+                if number is not None:
+                    try:
+                        v.maturity_group = float(number.group(0))
+                    except ValueError:
+                        # e.g. a lone "." matched; leave the group unset.
+                        pass
+                rows.append({"characteristic": "Maturity Group", "value": value})
+            else:
+                rows.append({"characteristic": label, "value": value})
+    if rows:
+        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": rows})
+
+
+def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
+    """Fetch one variety page and turn it into an ``FCVariety``.
+
+    ``rec`` is a discovery record with ``crop``, ``slug`` and ``url`` keys
+    (``lastmod`` is optional). The page is downloaded through ``http``; an
+    error status raises via ``raise_for_status``. Parsing is scoped to
+    ``<article class="content">`` when present, otherwise the whole document.
+    Corn and soybean pages get their crop-specific parser; any other crop
+    keeps identity fields only.
+    """
+    crop, slug, url = rec["crop"], rec["slug"], rec["url"]
+    variety = FCVariety(
+        source_key=f"firstchoice-{slug}",
+        source_url=url,
+        crop=crop,
+        trait_stack=_trait_stack(slug, crop),
+        sitemap_last_modified=rec.get("lastmod"),
+    )
+    page = http.get(url)
+    page.raise_for_status()
+    soup = BeautifulSoup(page.text, "html.parser")
+    article = soup.find("article", class_="content")
+    if not article:
+        article = soup
+    variety.product_name = _product_name(article, slug)
+
+    # wheat: thin pages — identity only (no spec sections to parse).
+    crop_parsers = {"corn": _parse_corn, "soybeans": _parse_soy}
+    crop_parser = crop_parsers.get(crop)
+    if crop_parser is not None:
+        crop_parser(article, variety)
+    return variety
+
+
+# --------------------------------------------------------------------- render
+
+
+def render_markdown(v: FCVariety) -> str:
+    """Render ``v`` as the markdown body stored in the corpus.
+
+    Layout: title, a bullet list of identity facts (maturity / wheat class /
+    traits appear only when set for the matching crop), an optional
+    positioning line, an optional "Field Notes" list, then one section per
+    entry in ``v.groups``. Empty item values are shown as an em dash. A wheat
+    record without groups gets a closing identity-only note. Lines are joined
+    with ``\\n``.
+    """
+    known_labels = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
+    crop_label = known_labels[v.crop] if v.crop in known_labels else v.crop.title()
+
+    out: list[str] = []
+    out.append(f"# {v.product_name}")
+    out.append("")
+    out.append("- **Vendor:** 1st Choice Seeds (independent, employee-owned)")
+    out.append("- **Brand:** 1st Choice Seeds")
+    out.append(f"- **Crop:** {crop_label}")
+
+    # Crop-specific maturity / class line.
+    if v.crop == "corn" and v.relative_maturity is not None:
+        out.append(f"- **Relative maturity:** {v.relative_maturity} day")
+    if v.crop == "soybeans" and v.maturity_group is not None:
+        out.append(f"- **Maturity group:** {v.maturity_group}")
+    if v.crop == "wheat" and v.wheat_class:
+        out.append(f"- **Wheat class:** {v.wheat_class}")
+    if v.trait_stack:
+        out.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
+    out.extend((
+        f"- **Source:** {v.source_url}",
+        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
+        "- **Service area:** 1st Choice Seeds dealer network — "
+        "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN",
+        "",
+    ))
+
+    if v.positioning:
+        out.extend(("---", "", f"_{v.positioning}_", ""))
+    if v.strengths:
+        out.extend(("---", "", "## Field Notes", ""))
+        out.extend(f"- {strength}" for strength in v.strengths)
+        out.append("")
+    out.extend(("---", ""))
+
+    for group in v.groups:
+        out.append(f"## {group['label'].title()}")
+        out.append("")
+        out.extend(f"- **{item['characteristic']}:** {item['value'] or '—'}"
+                   for item in group["items"])
+        out.append("")
+
+    if v.crop == "wheat" and not v.groups:
+        out.append("_Identity record only — 1st Choice wheat is private-label "
+                   "and the catalog page carries no agronomic spec block._")
+        out.append("")
+    return "\n".join(out)
+
+
+def write_variety(v: FCVariety, body_md: str) -> None:
+    """Write ``<source_key>.md`` and its ``<source_key>.json`` sidecar.
+
+    ``CORPUS_DIR`` is created if missing. The markdown body is written as
+    given; the sidecar holds the structured fields of ``v`` plus fixed vendor
+    metadata, a UTC ``fetched_at`` timestamp and the scraper version, dumped
+    as indented UTF-8 JSON with a trailing newline.
+    """
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    markdown_path = CORPUS_DIR / f"{v.source_key}.md"
+    markdown_path.write_text(body_md, encoding="utf-8")
+
+    # Single, fixed "recommendation": the vendor's dealer network.
+    dealer_network = {
+        "product_list_name": "1st Choice Seeds dealer network "
+                             "(Eastern Corn Belt — IN/OH/KY/TN)",
+        "agronomist": None,
+        "agronomist_email": None,
+        "variant_id": None,
+    }
+    sidecar = {
+        "source": "first_choice",
+        "source_key": v.source_key,
+        "vendor": "1st Choice Seeds",
+        "brand": "1st Choice Seeds",
+        "product_name": v.product_name,
+        "product_id": v.product_name,
+        "crop": v.crop,
+        "release_year": None,
+        "relative_maturity": v.relative_maturity,
+        "maturity_group": v.maturity_group,
+        "wheat_class": v.wheat_class,
+        "trait_stack": v.trait_stack,
+        "trait_descriptions": [],
+        "positioning_statement": v.positioning,
+        "strengths": v.strengths,
+        "characteristics_groups": v.groups,
+        "_scale_direction": RATING_SCALE_DIRECTION,
+        "regional_recommendations": [dealer_network],
+        "image_url": None,
+        "source_urls": [v.source_url],
+        "sitemap_last_modified": v.sitemap_last_modified,
+        "fetched_at": datetime.now(timezone.utc).isoformat(),
+        "scraper_version": SCRAPER_VERSION,
+    }
+    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
+    sidecar_path = CORPUS_DIR / f"{v.source_key}.json"
+    sidecar_path.write_text(payload, encoding="utf-8")
+
+
+# --------------------------------------------------------------------- pipeline
+
+
+def run(*, limit: int | None, force: bool,
+        only_crop: str | None, only_product: str | None) -> int:
+    """Discover varieties, then fetch, parse and write each one.
+
+    Args:
+        limit: stop after this many discovered records have been visited
+            (records skipped because they already exist count too);
+            ``None`` visits all of them.
+        force: re-fetch a variety even if its markdown file already exists.
+        only_crop: restrict discovery to this crop.
+        only_product: keep only the record whose slug or
+            ``firstchoice-<slug>`` key equals this value (the value is
+            lower-cased before comparing).
+
+    Returns:
+        ``2`` when ``only_product`` matches nothing, otherwise ``0``. A page
+        whose fetch fails is logged and counted as failed, not fatal.
+    """
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    http = RateLimitedSession()
+    records = discover(http, only_crop=only_crop)
+
+    if only_product:
+        key = only_product.lower()
+        records = [r for r in records
+                   if f"firstchoice-{r['slug']}" == key or r["slug"] == key]
+        if not records:
+            log.error("no variety matched --product=%s", only_product)
+            return 2
+
+    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
+    processed = 0
+    for rec in records:
+        if limit is not None and processed >= limit:
+            break
+        processed += 1
+        source_key = f"firstchoice-{rec['slug']}"
+        md_path = CORPUS_DIR / f"{source_key}.md"
+        if md_path.exists() and not force:
+            counts["skipped"] += 1
+            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
+            continue
+        try:
+            v = parse_detail(http, rec)
+        except requests.RequestException as exc:
+            # RequestException covers HTTPError (bad status) as well as the
+            # connection/timeout errors the session re-raises once its retries
+            # are used up; one unreachable page must not abort the whole run.
+            counts["failed"] += 1
+            log.error("[%d/%d] %s detail fetch failed: %s",
+                      processed, len(records), source_key, exc)
+            continue
+        if not v.groups:
+            counts["empty"] += 1
+            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
+                        processed, len(records), source_key,
+                        "; thin wheat page" if v.crop == "wheat" else "")
+        write_variety(v, render_markdown(v))
+        counts["written"] += 1
+        # Pick the maturity by "is not None" so a legitimate 0 is still shown.
+        maturity = (v.relative_maturity if v.relative_maturity is not None
+                    else v.maturity_group)
+        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
+                 processed, len(records), source_key, v.crop,
+                 "-" if maturity is None else maturity,
+                 len(v.groups), ",".join(v.trait_stack) or "-")
+
+    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
+             processed, counts["written"], counts["skipped"], counts["empty"],
+             counts["failed"], len(records))
+    return 0
+
+
+# --------------------------------------------------------------------- CLI
+
+
+def _build_argparser() -> argparse.ArgumentParser:
+    """Build the command-line parser for this scraper.
+
+    Options: ``--limit``, ``--force``, ``--crop`` (one of the
+    ``CROP_SITEMAPS`` keys), ``--product`` and ``--log-level`` (defaulting to
+    the ``LOG_LEVEL`` environment variable, else ``INFO``).
+    """
+    parser = argparse.ArgumentParser(
+        prog="scrape.sources.first_choice",
+        description="Scrape 1st Choice Seeds (independent, employee-owned — "
+                    "Rushville, IN) — corn / soybeans / wheat via sitemaps "
+                    "+ detail pages.")
+    # (flag, add_argument keyword arguments), registered in this order.
+    options = (
+        ("--limit", {
+            "type": int, "default": None,
+            "help": "Stop after processing N varieties (default: all)."}),
+        ("--force", {
+            "action": "store_true",
+            "help": "Re-fetch even if the markdown file already exists."}),
+        ("--crop", {
+            "default": None, "choices": sorted(CROP_SITEMAPS),
+            "help": "Limit to one crop (corn / soybeans / wheat)."}),
+        ("--product", {
+            "default": None,
+            "help": "Process a single variety by source_key or slug."}),
+        ("--log-level", {
+            "default": os.environ.get("LOG_LEVEL", "INFO")}),
+    )
+    for flag, settings in options:
+        parser.add_argument(flag, **settings)
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse ``argv``, configure logging to stderr and run the scrape.
+
+    Returns the exit code produced by :func:`run`.
+    """
+    options = _build_argparser().parse_args(argv)
+    log_level = options.log_level.upper()
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+        stream=sys.stderr)
+    exit_code = run(limit=options.limit, force=options.force,
+                    only_crop=options.crop, only_product=options.product)
+    return exit_code
+
+
+# Script entry point: run the CLI and hand main()'s return value to the
+# shell as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,594 @@
+"""Latham Hi-Tech Seeds scraper — independent family-owned brand (Alexander, IA).
+
+Source: ``www.lathamseeds.com`` — WordPress site exposing a public,
+no-auth REST API. robots.txt is permissive (only ``/wp-admin/``
+disallowed; the catalog + ``/wp-json/`` are open, no Crawl-delay).
+Independent Upper-Midwest seed company (the self-styled "Latham
+Country" — IA / MN / WI / IL / ND / SD / NE); corn + soybeans only
+(an Alfalfa crop term exists in the taxonomy but has zero published
+varieties — no wheat).
+
+Two-step ingestion (mirrors the ProHarvest scraper):
+
+1. **Enumerate** via the WP REST API. ``/wp/v2/varieties`` is the
+   variety custom-post-type (~265 records, ``X-WP-Total: 265``).
+   ``/wp/v2/variety_crop`` is the crop taxonomy (Corn=2013,
+   Soybean=2029, Alfalfa=2159/empty); ``/wp/v2/variety_trait`` is the
+   trait taxonomy (Enlist E3, VT2 PRO RIB, Smart Stax, XtendFlex, …).
+   The REST payload gives the canonical id / slug / title / permalink
+   and taxonomy term IDs, plus a human-readable ``class_list`` (e.g.
+   ``variety_crop-soybean``, ``variety_trait-enlist-e3``). ``acf`` is
+   ``[]`` and ``content.rendered`` is EMPTY in REST, so the ratings
+   have to come from the detail page.
+
+2. **Parse the detail page.** Each ``/products/<slug>/`` page
+   server-renders the agronomic data as ``<h3>`` spec sections, each a
+   run of ``<li><span>label</span><span>value</span></li>`` rows up to
+   the next section header:
+     - Corn: "Agronomic Characteristics" (Early Vigor / Stalk Strength
+       / Root Strength / Stay Green / Drydown / Test Weight / Drought
+       Tolerance / Foliar Fungicide / Corn-on-Corn), "Plant
+       Characteristics" (Ear Height / Ear Type), "Disease Ratings"
+       (Goss's Wilt / Northern Leaf Blight / Anthracnose Stalk Rot /
+       Gray Leaf Spot / tar spot etc).
+     - Soybean: "Plant Characteristics" (Relative Maturity / Emergence
+       / Plant Height / Plant Type / Flower Color / Pubescence / Pod
+       Color / Hilum Color), "Defensive Characteristics & Disease
+       Ratings" (SCN Resistance source / Iron Chlorosis / Stress
+       Tolerance / Phytophthora Rps gene / Brown Stem Rot / White Mold
+       / Sudden Death). "Herbicide Tolerance" + "Placement" sections
+       are present but carry no ``<li>`` rows.
+   The relative maturity also sits in a "Key Features" ``Maturity``
+   row ("113.00 RM" / "3.60 RM"); we read RM/MG from the per-crop
+   spec section first and fall back to that.
+
+Rating scale: **numeric, LOWER = BETTER** (1 = best / most
+tolerant / most resistant). No explicit on-page legend, so the
+direction was confirmed by cross-referencing the Product Overview
+prose against the published values across ~12 corn varieties:
+hybrids described "very good / superior / excellent stalks and roots"
+carry Stalk/Root Strength 1.0–1.5, weaker traits run 3.0–3.5, and no
+value approaches 9 (observed range ~1.0–3.5). The soybean disease
+panel (Iron Chlorosis / Brown Stem Rot / White Mold / Sudden Death /
+Stress Tolerance) reads the same direction (lower = more resistant).
+A handful of values are categorical rather than numeric and pass
+through verbatim: SCN Resistance source ("PI 88788"), Phytophthora
+"Rps 1k", Anthracnose "ASR", plant descriptors ("Medium Tall",
+"Flex"). ``NA`` / blank = not rated.
+
+Unlike the Ebbert's scraper (which left ``characteristics_groups``
+empty and relied on a verbatim body), we parse the spec sections into
+structured ``characteristics_groups`` so the numeric + categorical
+ratings land in the embedded chunk and are actually retrievable. The
+soybean "Defensive Characteristics & Disease Ratings" section maps to
+the DISEASE RATINGS bucket; corn "Agronomic Characteristics" +
+"Plant Characteristics" map to AGRONOMIC CHARACTERISTICS.
+
+Output:
+  corpus/latham/<source_key>.md
+  corpus/latham/<source_key>.json
+
+source_key: ``latham-<slug>`` lowercased, e.g. ``latham-l-3632-e3``.
+
+CLI:
+  python -m scrape.sources.latham --crop corn --limit 5
+  python -m scrape.sources.latham --force
+  python -m scrape.sources.latham --product latham-l-3632-e3
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import random
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup, Tag
+
+# Version string for this scraper.
+SCRAPER_VERSION = "0.1.0"
+# User-Agent header value sent with every request made through the session.
+USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
+# Site root and the WordPress REST v2 base that all REST URLs are built from.
+BASE = "https://www.lathamseeds.com"
+WP = f"{BASE}/wp-json/wp/v2"
+
+# variety_crop taxonomy slug -> chunker crop value. The chunker keys on
+# "soybeans" (plural) for the MG branch, so map accordingly. "alfalfa"
+# is in the taxonomy but has zero published varieties; everything not
+# listed here is out of scope for the row-crop advisor. (No wheat.)
+CROP_TYPES = {
+    "corn": "corn",
+    "soybean": "soybeans",
+}
+
+# robots.txt declares no Crawl-delay and only blocks /wp-admin/; we
+# stay polite. ~265 detail pages at 1.5 s/req finishes in ~7 min.
+REQ_INTERVAL_SEC = 1.5
+
+# Free-text note describing how to read the rating values.
+RATING_SCALE_DIRECTION = (
+    "numeric ratings ~1-9 where LOWER = BETTER (1 = best / most "
+    "tolerant / most resistant); confirmed by cross-referencing "
+    "Product Overview prose vs values (top-rated stalks/roots cluster "
+    "1.0-1.5, weak traits 3.0-3.5, none approach 9). Categorical "
+    "values pass through verbatim (SCN source 'PI 88788', "
+    "Phytophthora 'Rps 1k', Anthracnose 'ASR', 'Medium Tall', 'Flex'). "
+    "NA/blank = not rated."
+)
+
+# Detail-page spec section headers (<h3>) -> characteristics_groups
+# label. DISEASE RATINGS -> disease framing, AGRONOMIC CHARACTERISTICS
+# -> agronomic framing in the chunker; anything else passes through as
+# its own titled section. Both corn and soy headers are covered. The
+# soybean "Defensive Characteristics & Disease Ratings" panel mixes
+# disease 1-9 ratings with categorical resistance source/genes — we
+# bucket the whole panel as DISEASE so it embeds under disease framing.
+# Keys are lower-case: header text is lower-cased before the lookup.
+SPEC_SECTIONS = {
+    "agronomic characteristics": "AGRONOMIC CHARACTERISTICS",
+    "plant characteristics": "AGRONOMIC CHARACTERISTICS",
+    "disease ratings": "DISEASE RATINGS",
+    "defensive characteristics & disease ratings": "DISEASE RATINGS",
+    "defensive characteristics and disease ratings": "DISEASE RATINGS",
+}
+
+# Output location: <CORPUS_ROOT>/latham, where CORPUS_ROOT comes from the
+# CORPUS_ROOT environment variable when set (and non-empty), otherwise
+# <repo root>/corpus. The repo root is two directories above this file's
+# directory.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_DIR = CORPUS_ROOT / "latham"
+
+# Module logger shared by every function in this scraper.
+log = logging.getLogger("scrape.latham")
+
+
+# --------------------------------------------------------------------- HTTP
+
+
+class RateLimitedSession:
+    """Polite ``requests.Session`` wrapper: fixed spacing plus retry/backoff.
+
+    Latham's catalog is ~265 detail pages so 1.5 s/req finishes the full
+    scrape in ~7 min.
+    """
+
+    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
+        """Create the underlying session and set its User-Agent.
+
+        ``interval`` is the minimum number of seconds between two requests.
+        """
+        self.s = requests.Session()
+        self.s.headers["User-Agent"] = USER_AGENT
+        self.interval = interval
+        self._last = 0.0
+
+    def _wait(self) -> None:
+        """Sleep until ``interval`` seconds have passed since the last request."""
+        delta = time.monotonic() - self._last
+        if delta < self.interval:
+            time.sleep(self.interval - delta)
+        self._last = time.monotonic()
+
+    @staticmethod
+    def _backoff(attempt: int) -> float:
+        """Exponential backoff with jitter for ``attempt`` (0-based), capped at 30 s."""
+        return min(30.0, (2 ** attempt) + random.random())
+
+    def request(self, method: str, url: str, *, max_retries: int = 4,
+                timeout: float = 30.0, **kw: Any) -> requests.Response:
+        """Send a request, retrying on network errors and 429/5xx answers.
+
+        Args:
+            method: HTTP method name.
+            url: target URL.
+            max_retries: total number of attempts; must be at least 1.
+            timeout: per-attempt timeout passed to ``requests``.
+            **kw: forwarded to ``Session.request``.
+
+        Returns:
+            The first response that is neither 429 nor 5xx. If every attempt
+            produced a 429/5xx response, the last such response is returned
+            so the caller can inspect it or call ``raise_for_status``.
+
+        Raises:
+            ValueError: ``max_retries`` is below 1.
+            requests.RequestException: the attempts ran out and at least one
+                of them failed at the network level (the most recent such
+                error is re-raised).
+        """
+        if max_retries < 1:
+            # Without this guard the loop never runs and there is neither a
+            # response nor an error to hand back.
+            raise ValueError(f"max_retries must be >= 1, got {max_retries!r}")
+        last_exc: Exception | None = None
+        resp: requests.Response | None = None
+        for attempt in range(max_retries):
+            self._wait()
+            try:
+                resp = self.s.request(method, url, timeout=timeout, **kw)
+            except requests.RequestException as exc:
+                last_exc = exc
+                backoff = self._backoff(attempt)
+                log.warning("network error on %s %s: %s — retry in %.1fs",
+                            method, url, exc, backoff)
+                time.sleep(backoff)
+                continue
+            if resp.status_code == 429 or 500 <= resp.status_code < 600:
+                ra = resp.headers.get("Retry-After")
+                # Honour a delta-seconds Retry-After. isascii() is required
+                # because str.isdigit() is also true for characters such as
+                # "²" that float() rejects; anything else (for example an
+                # HTTP-date) falls back to the computed backoff.
+                if ra and ra.isascii() and ra.isdigit():
+                    backoff = float(ra)
+                else:
+                    backoff = self._backoff(attempt)
+                log.warning("HTTP %d on %s %s — retry in %.1fs",
+                            resp.status_code, method, url, backoff)
+                time.sleep(backoff)
+                continue
+            return resp
+        if last_exc is not None:
+            raise last_exc
+        # No network error was recorded, so every attempt yielded a response.
+        assert resp is not None
+        return resp
+
+    def get(self, url: str, **kw: Any) -> requests.Response:
+        """GET ``url`` through :meth:`request`."""
+        return self.request("GET", url, **kw)
+
+    def get_json(self, url: str, **kw: Any) -> Any:
+        """GET ``url``, raise on an error status, and return the decoded JSON body."""
+        r = self.get(url, **kw)
+        r.raise_for_status()
+        return r.json()
+
+
+# --------------------------------------------------------------------- model
+
+
+@dataclass
+class LathamVariety:
+    """One scraped Latham variety: identity, maturity, traits and spec groups.
+
+    Only ``source_key``, ``source_url`` and ``crop`` are required; the other
+    fields default to empty values.
+    """
+    # Corpus key for this variety.
+    source_key: str
+    # URL of the detail page for this variety.
+    source_url: str
+    crop: str                         # chunker value: corn / soybeans
+    product_name: str = ""            # "L 3632 E3"
+    relative_maturity: int | None = None     # corn (days)
+    maturity_group: float | None = None      # soy
+    # Release year kept as text (None when unknown).
+    release_year: str | None = None
+    # Trait names attached to the variety.
+    trait_stack: list[str] = field(default_factory=list)
+    # Short positioning statement, when one is available.
+    positioning: str | None = None
+    # [{label, items:[{characteristic, value}]}] — chunker source of truth
+    groups: list[dict] = field(default_factory=list)
+
+
+# --------------------------------------------------------------------- discovery (REST)
+
+
+def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
+    """Return ``{term_id: display name}`` for a WP taxonomy, following pages.
+
+    Pages of 100 terms are requested until one answers HTTP 400 (past the
+    last page), comes back empty, or holds fewer than 100 terms. A term's
+    display name is its ``name``, else its ``slug``, else its id as text.
+    Any other error status raises via ``raise_for_status``.
+    """
+    names: dict[int, str] = {}
+    page = 0
+    while True:
+        page += 1
+        resp = http.get(
+            f"{WP}/{taxonomy}?per_page=100&page={page}&_fields=id,name,slug")
+        if resp.status_code == 400:   # past last page
+            break
+        resp.raise_for_status()
+        terms = resp.json()
+        if not terms:
+            break
+        names.update({
+            term["id"]: term.get("name") or term.get("slug") or str(term["id"])
+            for term in terms
+        })
+        if len(terms) < 100:
+            # A short page is the last one.
+            break
+    return names
+
+
+def _crop_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
+    """Return ``{slug: term_id}`` for the ``variety_crop`` taxonomy.
+
+    A single request for up to 100 terms is made; no paging.
+    """
+    terms = http.get_json(f"{WP}/variety_crop?per_page=100&_fields=id,slug")
+    return {term["slug"]: term["id"] for term in terms}
+
+
+def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
+    """Collect the REST variety records for the in-scope row crops.
+
+    For each ``CROP_TYPES`` entry (narrowed to ``only_crop`` when given) the
+    matching ``variety_crop`` term id is looked up and ``/varieties`` is paged
+    through 100 records at a time. Paging stops on HTTP 400, an empty page or
+    a short page. Each record is tagged with ``_crop`` (the chunker crop
+    value) and returned once, even if it appears under several crops. A crop
+    slug missing from the taxonomy is skipped with a warning.
+    """
+    crop_ids = _crop_slug_to_id(http)
+    collected: list[dict] = []
+    known_ids: set[int] = set()
+    for crop_slug, crop in CROP_TYPES.items():
+        if only_crop and crop != only_crop:
+            continue
+        term_id = crop_ids.get(crop_slug)
+        if term_id is None:
+            log.warning("variety_crop %r not found in taxonomy — skipping", crop_slug)
+            continue
+        page = 0
+        while True:
+            page += 1
+            resp = http.get(
+                f"{WP}/varieties?variety_crop={term_id}&per_page=100&page={page}"
+                "&_fields=id,slug,title,link,variety_trait,variety_year")
+            if resp.status_code == 400:
+                # WP answers 400 once the page number runs past the end.
+                break
+            resp.raise_for_status()
+            batch = resp.json()
+            if not batch:
+                break
+            for record in batch:
+                if record["id"] not in known_ids:
+                    known_ids.add(record["id"])
+                    record["_crop"] = crop
+                    collected.append(record)
+            if len(batch) < 100:
+                break
+        log.info("variety_crop %-8s (%s): cumulative %d",
+                 crop_slug, crop, len(collected))
+    return collected
+
+
+# --------------------------------------------------------------------- detail parse
+
+
+# First unsigned number in a string, with an optional decimal part
+# (matches the "113.00" in "113.00 RM").
+_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
+
+
+def _clean(s: str) -> str:
+    """Squeeze whitespace runs in ``s`` down to one space and strip both ends.
+
+    ``None`` or an empty string gives ``""``.
+    """
+    raw = s if s else ""
+    squeezed = re.sub(r"\s+", " ", raw)
+    return squeezed.strip()
+
+
+def _two_span(li: Tag) -> tuple[str, str] | None:
+    """Return ``(label, value)`` when ``li`` is a two-span spec row, else ``None``.
+
+    The row qualifies only if ``li`` contains exactly two ``<span>``
+    descendants and both have non-empty cleaned text.
+    """
+    texts = [_clean(span.get_text(" ", strip=True))
+             for span in li.find_all("span")]
+    if len(texts) != 2:
+        return None
+    label, value = texts
+    if not (label and value):
+        return None
+    return label, value
+
+
+def _section_rows(header: Tag) -> list[tuple[str, str]]:
+    """Gather the ``(label, value)`` rows that belong to one section header.
+
+    Walks the elements after ``header`` in document order and stops at the
+    next ``<h2>``/``<h3>``; each ``<li>`` accepted by :func:`_two_span` on the
+    way contributes one row.
+    """
+    collected: list[tuple[str, str]] = []
+    for node in header.find_all_next():
+        if node is not header and node.name in ("h2", "h3"):
+            # The next section starts here.
+            break
+        if not (isinstance(node, Tag) and node.name == "li"):
+            continue
+        pair = _two_span(node)
+        if pair:
+            collected.append(pair)
+    return collected
+
+
+def _parse_groups(soup: BeautifulSoup) -> list[dict]:
+    """Turn the known spec sections of a page into characteristic groups.
+
+    Every ``<h2>``/``<h3>`` whose lower-cased text is a ``SPEC_SECTIONS`` key
+    yields ``{"label", "items": [{"characteristic", "value"}]}``. Sections
+    without rows are dropped, and sections mapping to the same label are
+    merged into the group created for the first of them, so each label
+    appears once, in order of first appearance.
+    """
+    groups: list[dict] = []
+    by_label: dict[str, dict] = {}
+    for header in soup.find_all(["h2", "h3"]):
+        title = _clean(header.get_text(" ", strip=True)).lower()
+        label = SPEC_SECTIONS.get(title)
+        if not label:
+            continue
+        rows = _section_rows(header)
+        if not rows:
+            continue
+        items = [{"characteristic": name, "value": rating}
+                 for name, rating in rows]
+        bucket = by_label.get(label)
+        if bucket is None:
+            bucket = {"label": label, "items": items}
+            by_label[label] = bucket
+            groups.append(bucket)
+        else:
+            # Corn maps both Agronomic + Plant Characteristics to the same
+            # label: extend the existing bucket rather than adding a second.
+            bucket["items"].extend(items)
+    return groups
+
+
+def _parse_maturity_from_groups(groups: list[dict], crop: str,
+                                ) -> tuple[int | None, float | None]:
+    """Return ``(rm, mg)`` from the first 'Maturity' / 'Relative Maturity'
+    item whose value contains a number. Corn fills ``rm`` (decimal part
+    truncated); any other crop fills ``mg``. ``(None, None)`` if nothing
+    matches."""
+    wanted = ("relative maturity", "maturity")
+    candidates = (item for group in groups for item in group["items"])
+    for item in candidates:
+        if item["characteristic"].strip().lower() not in wanted:
+            continue
+        found = _MATURITY_RE.search(item["value"])
+        if found is None:
+            continue
+        number = float(found.group(1))
+        return (int(number), None) if crop == "corn" else (None, number)
+    return None, None
+
+
+def _parse_maturity_keyfeatures(soup: BeautifulSoup, crop: str,
+                                ) -> tuple[int | None, float | None]:
+    """Fallback scan over every <li> for a two-span row labelled
+    'Maturity' (e.g. '113.00 RM' / '3.60 RM'). Returns ``(rm, None)``
+    for corn (integer part only) and ``(None, mg)`` for any other crop;
+    ``(None, None)`` when no numeric 'Maturity' row exists."""
+    for item in soup.find_all("li"):
+        row = _two_span(item)
+        if not row:
+            continue
+        label, value = row
+        if label.strip().lower() != "maturity":
+            continue
+        hit = _MATURITY_RE.search(value)
+        if not hit:
+            continue
+        number = float(hit.group(1))
+        if crop == "corn":
+            return int(number), None
+        return None, number
+    return None, None
+
+
+def _parse_positioning(soup: BeautifulSoup) -> str | None:
+    """Return the first paragraph of at least 40 characters that follows
+    a 'Product Overview' / 'Hybrid Advantages' heading (before the next
+    h2/h3), or ``None`` when no such paragraph exists."""
+    targets = ("product overview", "hybrid advantages")
+    for heading in soup.find_all(["h2", "h3"]):
+        title = _clean(heading.get_text(" ", strip=True)).lower()
+        if title not in targets:
+            continue
+        for node in heading.find_all_next():
+            if node is not heading and node.name in ("h2", "h3"):
+                break
+            if not (isinstance(node, Tag) and node.name == "p"):
+                continue
+            text = _clean(node.get_text(" ", strip=True))
+            if len(text) >= 40:
+                return text
+    return None
+
+
+def parse_detail(http: RateLimitedSession, rec: dict,
+                 trait_names: dict[int, str],
+                 year_names: dict[int, str]) -> LathamVariety:
+    """Fetch one variety's detail page and assemble a ``LathamVariety``.
+
+    *rec* is a discovered record (needs ``_crop`` and ``slug``; ``link``,
+    ``title``, ``variety_trait`` and ``variety_year`` are optional).
+    *trait_names* / *year_names* translate taxonomy ids into display
+    names; ids missing from the maps are dropped. An error status on the
+    page fetch propagates from ``raise_for_status()``.
+    """
+    crop = rec["_crop"]
+    slug = rec["slug"]
+    url = rec.get("link") or f"{BASE}/products/{slug}/"
+    title = (rec.get("title") or {}).get("rendered", "")
+    name = _clean(title) or slug.upper()
+
+    resp = http.get(url)
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "html.parser")
+    # Remove script/style/noscript nodes before any text is extracted.
+    for junk in soup(["script", "style", "noscript"]):
+        junk.decompose()
+
+    groups = _parse_groups(soup)
+    rm, mg = _parse_maturity_from_groups(groups, crop)
+    if rm is None and mg is None:
+        # Nothing in the parsed groups: fall back to the raw <li> scan.
+        rm, mg = _parse_maturity_keyfeatures(soup, crop)
+    positioning = _parse_positioning(soup)
+
+    trait_ids = rec.get("variety_trait") or []
+    year_ids = rec.get("variety_year") or []
+    traits = [trait_names[i] for i in trait_ids if i in trait_names]
+    years = [year_names[i] for i in year_ids if i in year_names]
+
+    return LathamVariety(
+        source_key=f"latham-{slug.lower()}",
+        source_url=url,
+        crop=crop,
+        product_name=name,
+        relative_maturity=rm,
+        maturity_group=mg,
+        release_year=years[0] if years else None,
+        trait_stack=traits,
+        positioning=positioning,
+        groups=groups,
+    )
+
+
+# --------------------------------------------------------------------- render
+
+
+def render_markdown(v: LathamVariety) -> str:
+    """Render a variety as a markdown document: title, identity bullets,
+    optional italic positioning blurb, then one ``##`` section per
+    characteristics group (empty values shown as an em dash)."""
+    crop_names = {"corn": "Corn", "soybeans": "Soybeans"}
+    crop_label = crop_names.get(v.crop, v.crop.title())
+    lines: list[str] = [
+        f"# {v.product_name}",
+        "",
+        "- **Vendor:** Latham Hi-Tech Seeds (independent family-owned, Alexander, IA)",
+        "- **Brand:** Latham Hi-Tech Seeds",
+        f"- **Crop:** {crop_label}",
+    ]
+    # Only the maturity figure that applies to the crop is shown.
+    if v.relative_maturity is not None and v.crop == "corn":
+        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
+    if v.maturity_group is not None and v.crop == "soybeans":
+        lines.append(f"- **Maturity group:** {v.maturity_group}")
+    if v.trait_stack:
+        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
+    lines.extend([
+        f"- **Source:** {v.source_url}",
+        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
+        "- **Service area:** Latham dealer network — Upper Midwest "
+        "(IA/MN/WI/IL/ND/SD/NE)",
+        "",
+    ])
+    if v.positioning:
+        lines.extend(["---", "", f"_{v.positioning}_", ""])
+    lines.extend(["---", ""])
+    for group in v.groups:
+        lines.append(f"## {group['label'].title()}")
+        lines.append("")
+        lines.extend(
+            f"- **{item['characteristic']}:** {item['value'] or '—'}"
+            for item in group["items"])
+        lines.append("")
+    return "\n".join(lines)
+
+
+def write_variety(v: LathamVariety, body_md: str) -> None:
+    """Write ``<source_key>.md`` (the rendered body) and
+    ``<source_key>.json`` (the metadata sidecar) into ``CORPUS_DIR``,
+    creating the directory if needed."""
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    md_path = CORPUS_DIR / f"{v.source_key}.md"
+    md_path.write_text(body_md, encoding="utf-8")
+
+    dealer_network = {
+        "product_list_name": "Latham dealer network (Upper Midwest — "
+                             "IA/MN/WI/IL/ND/SD/NE)",
+        "agronomist": None,
+        "agronomist_email": None,
+        "variant_id": None,
+    }
+    vendor = "Latham Hi-Tech Seeds"
+    sidecar = {
+        "source": "latham",
+        "source_key": v.source_key,
+        "vendor": vendor,
+        "brand": vendor,
+        "product_name": v.product_name,
+        "product_id": v.product_name,
+        "crop": v.crop,
+        "release_year": v.release_year,
+        "relative_maturity": v.relative_maturity,
+        "maturity_group": v.maturity_group,
+        "wheat_class": None,
+        "trait_stack": v.trait_stack,
+        "trait_descriptions": [],
+        "positioning_statement": v.positioning,
+        "strengths": [],
+        "characteristics_groups": v.groups,
+        "_scale_direction": RATING_SCALE_DIRECTION,
+        "regional_recommendations": [dealer_network],
+        "image_url": None,
+        "source_urls": [v.source_url],
+        "sitemap_last_modified": None,
+        "fetched_at": datetime.now(timezone.utc).isoformat(),
+        "scraper_version": SCRAPER_VERSION,
+    }
+    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
+    json_path = CORPUS_DIR / f"{v.source_key}.json"
+    json_path.write_text(payload, encoding="utf-8")
+
+
+# --------------------------------------------------------------------- pipeline
+
+
+def run(*, limit: int | None, force: bool,
+        only_crop: str | None, only_product: str | None) -> int:
+    """Discover, fetch, and write Latham varieties.
+
+    Args:
+        limit: stop after this many records have been considered (skipped
+            and failed ones count too); ``None`` means no cap.
+        force: re-fetch and overwrite even when the markdown file exists.
+        only_crop: passed through to ``discover`` to restrict the crop.
+        only_product: keep only the record whose slug or ``latham-<slug>``
+            source key equals this value (case-insensitive).
+
+    Returns:
+        ``0`` on completion, ``2`` when ``only_product`` matched nothing.
+    """
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    http = RateLimitedSession()
+    trait_names = _taxonomy_map(http, "variety_trait")
+    year_names = _taxonomy_map(http, "variety_year")
+    records = discover(http, only_crop=only_crop)
+
+    if only_product:
+        key = only_product.lower()
+        records = [r for r in records
+                   if f"latham-{r['slug'].lower()}" == key
+                   or r["slug"].lower() == key]
+        if not records:
+            log.error("no variety matched --product=%s", only_product)
+            return 2
+
+    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
+    total = len(records)
+    processed = 0
+    for rec in records:
+        if limit is not None and processed >= limit:
+            break
+        processed += 1
+        source_key = f"latham-{rec['slug'].lower()}"
+        md_path = CORPUS_DIR / f"{source_key}.md"
+        if md_path.exists() and not force:
+            counts["skipped"] += 1
+            log.info("[%d/%d] %s skipped", processed, total, source_key)
+            continue
+        try:
+            v = parse_detail(http, rec, trait_names, year_names)
+        except requests.RequestException as exc:
+            # RequestException is the base class of HTTPError as well as of
+            # connection/timeout errors, so one unreachable detail page is
+            # logged and skipped instead of aborting the whole run.
+            counts["failed"] += 1
+            log.error("[%d/%d] %s detail fetch failed: %s",
+                      processed, total, source_key, exc)
+            continue
+        if not v.groups:
+            counts["empty"] += 1
+            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
+                        processed, total, source_key)
+        write_variety(v, render_markdown(v))
+        counts["written"] += 1
+        # Compare against None explicitly: a maturity group of 0.0 is a real
+        # value and must not be reported as "-".
+        maturity = (v.relative_maturity if v.relative_maturity is not None
+                    else v.maturity_group)
+        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
+                 processed, total, source_key, v.crop,
+                 "-" if maturity is None else maturity,
+                 len(v.groups), ",".join(v.trait_stack) or "-")
+
+    log.info("done: processed=%d written=%d skipped=%d failed=%d "
+             "empty_groups=%d (of %d)",
+             processed, counts["written"], counts["skipped"],
+             counts["failed"], counts["empty"], total)
+    return 0
+
+
+# --------------------------------------------------------------------- CLI
+
+
+def _build_argparser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the Latham scraper."""
+    parser = argparse.ArgumentParser(
+        prog="scrape.sources.latham",
+        description="Scrape Latham Hi-Tech Seeds (independent Upper-Midwest "
+                    "brand) — corn / soybeans via the WP REST API + detail pages.")
+    # Crop choices are the distinct values of CROP_TYPES, sorted.
+    crop_choices = sorted(set(CROP_TYPES.values()))
+    # The LOG_LEVEL environment variable supplies the default log level.
+    default_level = os.environ.get("LOG_LEVEL", "INFO")
+
+    parser.add_argument(
+        "--limit", type=int, default=None,
+        help="Stop after processing N varieties (default: all).")
+    parser.add_argument(
+        "--force", action="store_true",
+        help="Re-fetch even if the markdown file already exists.")
+    parser.add_argument(
+        "--crop", default=None, choices=crop_choices,
+        help="Limit to one crop (corn / soybeans).")
+    parser.add_argument(
+        "--product", default=None,
+        help="Process a single variety by source_key or slug.")
+    parser.add_argument("--log-level", default=default_level)
+    return parser
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: parse *argv*, set up stderr logging at the
+    requested level, run the pipeline, and return its exit code."""
+    opts = _build_argparser().parse_args(argv)
+    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
+    logging.basicConfig(level=opts.log_level.upper(), format=log_format,
+                        stream=sys.stderr)
+    exit_code = run(limit=opts.limit, force=opts.force,
+                    only_crop=opts.crop, only_product=opts.product)
+    return exit_code
+
+
+# Run as a script: hand main()'s return value to the shell as exit status.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,767 @@
+"""Stine Seed Company scraper — independent family-owned breeder (Adel, IA).
+
+Source: ``www.stineseed.com`` — a custom PHP site (NOT WordPress;
+``/wp-json/`` 404s). robots.txt returns 404 (none published); the
+``/legal/`` page carries only a standard copyright / no-reproduction
+clause (no anti-automation term — same posture as the other corpus
+vendors). ``sitemap.xml`` (~499 URLs) lists every live product page,
+so it is our canonical enumeration source.
+
+Stine is the largest privately-owned seed company in the US; it
+breeds and sells **corn + soybeans** only (no wheat). The catalog is
+~58 corn hybrids + ~159 soybean varieties.
+
+Two-step ingestion:
+
+1. **Enumerate** the current catalog from ``sitemap.xml``. A product
+   *detail* URL has the shape ``/{crop}/traits/{trait-slug}/{code}/``
+   (four path segments); the bare ``/{crop}/traits/{trait-slug}/``
+   landing pages are skipped. This yields exactly the live catalog
+   (58 corn + 159 soy), unlike the comparison ajax endpoint which
+   also returns thousands of discontinued/historical entries.
+
+   Fallback enumeration (``--enumerate ajax``) hits the comparison
+   ajax fragments:
+     - corn: POST ``/ajax/corn-comparison/filter_products.php``
+     - soy:  POST ``/ajax/soybean-comparison/filter_products.php``
+   with ``sel1=&sel2=&sel3=`` (empty = all). Each ``<li>`` carries a
+   numeric product id + the canonical detail URL.
+
+2. **Parse the detail page.** Each ``/{crop}/traits/{slug}/{code}/``
+   page server-renders all agronomic data (no JS needed) as
+   ``<section class="agronomic-details">`` →
+   ``<ul class="agronomy-chart"> <li> <strong>label</strong>
+   <span class="value">value</span> </li> …``. The variety code +
+   brand mark live in the ``<h1>`` (``Stine ® 9444-22 Brand``).
+
+Rating scales differ by crop and are preserved verbatim (the chunker
+never fabricates a value):
+
+  - **Corn** publishes an on-page legend:
+    ``9: Excellent, 8: Very Good, 7: Good, 6: Average,
+    5: Below Average`` — a **1-9 numeric** scale, **HIGHER = BETTER /
+    more tolerant** (same direction as Bayer/NK, so no flip). Applies
+    to the agronomic performance panel (Drydown/Root/Stalk/Stress/
+    Cold Emergence/Test Weight) and the disease panel (Tar Spot/Gray
+    Leaf Spot/Eye Spot/N.C. Leaf Blight/Goss' Wilt/Common Rust/…).
+    Plant descriptors / soil placement / herbicide rows are
+    qualitative (Tall, Highly Recommended, Yes/No) and pass through.
+  - **Soybeans** are entirely **qualitative** (Excellent / Very Good
+    / Good / … and Resistant / Strong / Good / Susceptible for
+    disease; "higher/'Resistant' = better"). There is no numeric
+    legend on soy pages. SCN (Soybean Cyst Nematode) and RPS Gene
+    rows carry the *source/gene* (e.g. Peking, 3a) rather than a
+    rating.
+
+We parse the chart into structured ``characteristics_groups`` — a
+DISEASE RATINGS group, an AGRONOMIC CHARACTERISTICS group, and a few
+pass-through groups (PLANT DESCRIPTION / SOIL & PLACEMENT / HERBICIDE
+TOLERANCE / SEED TREATMENT NOTES) — so every rating lands in the
+embedded chunk and is actually retrievable.
+
+Output:
+  corpus/stine/<source_key>.md
+  corpus/stine/<source_key>.json
+
+source_key: ``stine-<productcode>`` lowercased, e.g.
+``stine-9444-22`` (corn) or ``stine-22r32`` (soy).
+
+CLI:
+  python -m scrape.sources.stine --crop corn --limit 2 --force
+  python -m scrape.sources.stine --crop soybeans --limit 2 --force
+  python -m scrape.sources.stine --force
+  python -m scrape.sources.stine --product stine-9444-22
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import random
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import warnings
+
+import requests
+from bs4 import BeautifulSoup
+
+try:  # bs4>=4.11 raises this when html.parser sees an XML doc (the sitemap)
+    from bs4 import XMLParsedAsHTMLWarning
+    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
+except Exception:  # pragma: no cover — older bs4 without the warning class
+    pass
+
+# Version string of this scraper module.
+SCRAPER_VERSION = "0.1.0"
+# Sent as the User-Agent header on every request made through
+# RateLimitedSession.
+USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
+# Site root; the sitemap and ajax endpoint URLs below are built from it.
+BASE = "https://www.stineseed.com"
+# Primary enumeration source, fetched by discover_sitemap().
+SITEMAP = f"{BASE}/sitemap.xml"
+# crop -> comparison endpoint POSTed by discover_ajax() (fallback enumeration).
+AJAX = {
+    "corn": f"{BASE}/ajax/corn-comparison/filter_products.php",
+    "soybeans": f"{BASE}/ajax/soybean-comparison/filter_products.php",
+}
+
+# Stine site path segment -> chunker crop value (chunker keys on the
+# PLURAL "soybeans" for the MG branch). Stine has no wheat.
+# Looked up with the lowercased first path segment of a matched detail
+# URL; the discover functions skip URLs whose segment is not a key here.
+CROP_PATHS = {
+    "corn": "corn",
+    "soybeans": "soybeans",
+}
+
+# No robots.txt (404) and no Crawl-delay; stay polite at 1.5 s/req.
+# ~217 detail pages -> a full run finishes in ~6 min.
+# This is the default ``interval`` of RateLimitedSession.
+REQ_INTERVAL_SEC = 1.5
+
+# One-line, human-readable description of how to read the published
+# ratings for each crop (adjacent string literals are concatenated).
+# NOTE(review): presumably emitted with each variety's output so readers
+# can interpret the numbers — confirm against the render/write code.
+RATING_SCALE_DIRECTION = (
+    "corn agronomic+disease 1-9 numeric, 9=Excellent/best/most-tolerant, "
+    "8=Very Good, 7=Good, 6=Average, 5=Below Average (higher=better, same "
+    "direction as Bayer/NK; blank/'-'=not rated); soybeans qualitative "
+    "(Excellent/Very Good/Good for vigor; Resistant/Strong/Good/Susceptible "
+    "for disease, Resistant/Strong=best); SCN row gives source (e.g. Peking) "
+    "and RPS Gene gives the gene, not a rating; plant/soil/herbicide rows "
+    "qualitative (Tall, Highly Recommended/Recommended, Yes/No)"
+)
+
+# ---- Chart-label classification -------------------------------------
+# The agronomy chart is a flat run of label/value <li>s mixing identity,
+# performance ratings, disease ratings, plant descriptors, soil/placement,
+# and herbicide rows. We bucket by label into characteristics_groups the
+# chunker understands (DISEASE RATINGS -> disease framing, AGRONOMIC
+# CHARACTERISTICS -> agronomic framing; the rest pass through titled).
+
+# Identity rows already captured into RM/MG/dedicated facts — not repeated
+# as a generic characteristic.
+# Every member of the label sets below is lowercase because _bucket()
+# compares against ``label.lower().strip()``.
+_IDENTITY_LABELS = {"maturity", "maturity end"}
+
+# Corn 1-9 performance ratings -> AGRONOMIC CHARACTERISTICS.
+_CORN_AGRONOMIC = {
+    "gdd", "mn maturity", "drydown", "root", "stalk", "stress",
+    "cold emergence", "test weight", "harvest population",
+}
+# Corn disease ratings -> DISEASE RATINGS. Set kept generous because the
+# disease list varies per page (some add S.C. Leaf Blight / Anthracnose).
+# "goss' wilt" is listed with both the ASCII and the typographic apostrophe.
+_CORN_DISEASE = {
+    "tar spot", "gray leaf spot", "eye spot", "n.c. leaf blight",
+    "s.c. leaf blight", "anthracnose", "goss' wilt", "goss’ wilt",
+    "common rust", "northern corn leaf blight", "southern corn leaf blight",
+    "diplodia", "fusarium", "head smut",
+}
+# Corn plant descriptors -> PLANT DESCRIPTION.
+_CORN_PLANT = {"plant height", "ear placement", "ear flex", "cob color"}
+# Corn soil/placement -> SOIL & PLACEMENT.
+_CORN_SOIL = {
+    "corn-on-corn", "sand", "loam", "clay", "wide rows", "narrow rows",
+    'population % in 30" or wider rows', "population % in narrow rows",
+    "population", "drought tolerance",
+}
+# Corn herbicide -> HERBICIDE TOLERANCE.
+_CORN_HERBICIDE = {"glyphosate tolerant", "glufosinate tolerant"}
+
+# Soy vigor/standability -> AGRONOMIC CHARACTERISTICS.
+_SOY_AGRONOMIC = {"emergence", "standability", "shattering", "lodging"}
+# Soy disease + nematode + gene rows -> DISEASE RATINGS (SCN/RPS carry a
+# source/gene rather than a rating; that's still the disease panel).
+_SOY_DISEASE = {
+    "phytophthora root rot", "rps gene", "iron deficiency chlorosis",
+    "brown stem rot", "sudden death syndrome", "soybean cyst nematode",
+    "frogeye leafspot", "frogeye leaf spot", "sclerotinia white mold",
+    "white mold", "stem canker", "root knot nematode", "soybean rust",
+}
+# Soy plant descriptors / quality -> PLANT DESCRIPTION.
+_SOY_PLANT = {
+    "height", "flower", "pubescence", "hilum", "chloride", "pod color",
+    "canopy", "protein", "oil",
+}
+# Soy herbicide/trait management -> HERBICIDE TOLERANCE.
+_SOY_HERBICIDE = {"sulfonylurea tolerance", "sts", "glyphosate tolerant"}
+
+# Two directory levels above the directory that contains this file.
+REPO_ROOT = Path(__file__).resolve().parents[2]
+# Corpus root: the CORPUS_ROOT environment variable when set and non-empty,
+# otherwise <REPO_ROOT>/corpus.
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
+# Stine's subdirectory of the corpus.
+CORPUS_DIR = CORPUS_ROOT / "stine"
+
+# Module logger.
+log = logging.getLogger("scrape.stine")
+
+
+# --------------------------------------------------------------------- HTTP
+
+
+class RateLimitedSession:
+    """Polite ``requests`` session: spaces requests at least ``interval``
+    seconds apart and retries transient failures with backoff. Stine's
+    live catalog is ~217 detail pages, so 1.5 s/req still finishes in a
+    few minutes."""
+
+    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
+        self.s = requests.Session()
+        self.s.headers["User-Agent"] = USER_AGENT
+        self.interval = interval
+        self._last = 0.0  # time.monotonic() stamp of the latest request
+
+    def _wait(self) -> None:
+        """Sleep just long enough to keep ``interval`` seconds between
+        consecutive requests, then stamp the current time."""
+        delta = time.monotonic() - self._last
+        if delta < self.interval:
+            time.sleep(self.interval - delta)
+        self._last = time.monotonic()
+
+    @staticmethod
+    def _backoff(attempt: int) -> float:
+        """Exponential backoff with jitter, capped at 30 seconds."""
+        return min(30.0, (2 ** attempt) + random.random())
+
+    def request(self, method: str, url: str, *, max_retries: int = 4,
+                timeout: float = 30.0, **kw: Any) -> requests.Response:
+        """Send a request, retrying network errors, HTTP 429 and HTTP 5xx.
+
+        The outcome always reflects the FINAL attempt: its response is
+        returned (it may still carry a 429/5xx status, so callers use
+        ``raise_for_status()``), or its ``requests.RequestException`` is
+        raised if that attempt failed at the network level.
+
+        Raises:
+            ValueError: ``max_retries`` is less than 1.
+            requests.RequestException: the final attempt failed at the
+                network level.
+        """
+        if max_retries < 1:
+            raise ValueError("max_retries must be >= 1")
+        resp: requests.Response | None = None
+        last_exc: Exception | None = None
+        for attempt in range(max_retries):
+            final = attempt == max_retries - 1
+            self._wait()
+            try:
+                resp = self.s.request(method, url, timeout=timeout, **kw)
+            except requests.RequestException as exc:
+                # Forget any earlier response: this attempt is now the latest.
+                resp, last_exc = None, exc
+                if final:
+                    log.warning("network error on %s %s: %s — giving up",
+                                method, url, exc)
+                    break
+                backoff = self._backoff(attempt)
+                log.warning("network error on %s %s: %s — retry in %.1fs",
+                            method, url, exc, backoff)
+                time.sleep(backoff)
+                continue
+            # A response arrived, so an error from an earlier attempt is stale.
+            last_exc = None
+            if resp.status_code == 429 or 500 <= resp.status_code < 600:
+                if final:
+                    log.warning("HTTP %d on %s %s — giving up",
+                                resp.status_code, method, url)
+                    break
+                # Honour a numeric Retry-After header, else back off.
+                ra = resp.headers.get("Retry-After")
+                backoff = float(ra) if (ra and ra.isdigit()) else (
+                    self._backoff(attempt))
+                log.warning("HTTP %d on %s %s — retry in %.1fs",
+                            resp.status_code, method, url, backoff)
+                time.sleep(backoff)
+                continue
+            return resp
+        if last_exc is not None:
+            raise last_exc
+        return resp  # type: ignore[return-value]
+
+    def get(self, url: str, **kw: Any) -> requests.Response:
+        """Shorthand for ``request("GET", url, **kw)``."""
+        return self.request("GET", url, **kw)
+
+    def post(self, url: str, **kw: Any) -> requests.Response:
+        """Shorthand for ``request("POST", url, **kw)``."""
+        return self.request("POST", url, **kw)
+
+
+# --------------------------------------------------------------------- model
+
+
+@dataclass
+class StineVariety:
+    """One parsed Stine product. Only ``source_key``, ``source_url`` and
+    ``crop`` are required; every other field has a default."""
+
+    source_key: str
+    source_url: str
+    crop: str                         # chunker value: corn / soybeans
+    product_name: str = ""            # "9444-22", "22R32"
+    relative_maturity: int | None = None     # corn (representative RM days)
+    maturity_group: float | None = None      # soy MG
+    trait_stack: list[str] = field(default_factory=list)
+    positioning: str | None = None
+    # [{label, items:[{characteristic, value}]}] — chunker source of truth
+    groups: list[dict] = field(default_factory=list)
+    sitemap_last_modified: str | None = None
+
+
+# --------------------------------------------------------------------- discovery
+
+
+# Full-string match for a product detail URL (http or https, optional
+# "www.", optional trailing slash, case-insensitive). Groups: 1 = crop
+# path segment (corn|soybeans), 2 = trait slug, 3 = product code. The bare
+# /{crop}/traits/{slug}/ landing pages have no third segment and don't match.
+_DETAIL_RE = re.compile(
+    r"^https?://(?:www\.)?stineseed\.com/(corn|soybeans)/traits/"
+    r"([^/]+)/([^/]+)/?$",
+    re.IGNORECASE,
+)
+
+
+@dataclass
+class DiscoveredURL:
+    """A product detail page found during enumeration. The discover
+    functions build instances positionally, so the field order matters."""
+
+    url: str                      # canonical URL (with trailing slash)
+    crop: str                     # a CROP_PATHS value: corn / soybeans
+    trait_slug: str               # third path segment of the detail URL
+    code: str                     # last path segment (product code)
+    lastmod: str | None = None    # sitemap <lastmod> text; None from ajax
+
+
+def _norm_url(url: str) -> str:
+    """Trim surrounding whitespace and append a trailing slash when the
+    URL does not already end with one (the canonical product form)."""
+    trimmed = url.strip()
+    return trimmed if trimmed.endswith("/") else trimmed + "/"
+
+
+def discover_sitemap(http: RateLimitedSession, *,
+                     only_crop: str | None) -> list[DiscoveredURL]:
+    """Enumerate live product detail pages from sitemap.xml.
+
+    Only URLs with FOUR path segments (``/{crop}/traits/{slug}/{code}/``)
+    are kept; the bare ``/{crop}/traits/{slug}/`` landing pages are not.
+    Results are de-duplicated on canonical URL and sorted by (crop, code).
+    """
+    resp = http.get(SITEMAP)
+    resp.raise_for_status()
+    # html.parser is used because the lxml/xml backend isn't a guaranteed
+    # dependency. It lowercases tag names, but <url>/<loc>/<lastmod> are
+    # lowercase already, so the lookups below still work.
+    sitemap = BeautifulSoup(resp.text, "html.parser")
+    found: dict[str, DiscoveredURL] = {}
+    for entry in sitemap.find_all("url"):
+        loc_tag = entry.find("loc")
+        if not loc_tag:
+            continue
+        loc = loc_tag.get_text(strip=True)
+        hit = _DETAIL_RE.match(loc)
+        if hit is None:
+            continue
+        segment, trait_slug, code = hit.groups()
+        crop = CROP_PATHS.get(segment.lower())
+        if not crop or (only_crop and crop != only_crop):
+            continue
+        canon = _norm_url(loc)
+        if canon in found:
+            continue
+        lastmod_tag = entry.find("lastmod")
+        lastmod = lastmod_tag.get_text(strip=True) if lastmod_tag else None
+        found[canon] = DiscoveredURL(canon, crop, trait_slug, code, lastmod)
+    # dict preserves first-seen order and sorted() is stable.
+    out = sorted(found.values(), key=lambda d: (d.crop, d.code))
+    log.info("sitemap: discovered %d product detail pages%s",
+             len(out), f" (crop={only_crop})" if only_crop else "")
+    return out
+
+
+def discover_ajax(http: RateLimitedSession, *,
+                  only_crop: str | None) -> list[DiscoveredURL]:
+    """Fallback enumeration via the comparison ajax fragments.
+
+    NOTE: these endpoints return the FULL historical product set
+    (thousands of discontinued entries, with code dupes pointing at the
+    same slug), so we de-dupe on canonical URL. The sitemap is preferred
+    because it reflects only the current live catalog.
+
+    Each endpoint in ``AJAX`` is POSTed with empty ``sel1``/``sel2``/
+    ``sel3`` filters. Links are resolved against ``BASE``, so absolute,
+    root-relative, protocol-relative and slash-less hrefs are all handled.
+    Results are sorted by (crop, code); an error status propagates from
+    ``raise_for_status()``.
+    """
+    # Function-scope import keeps the module's import block untouched.
+    from urllib.parse import urljoin
+
+    out: list[DiscoveredURL] = []
+    seen: set[str] = set()
+    for crop, endpoint in AJAX.items():
+        if only_crop and crop != only_crop:
+            continue
+        r = http.post(endpoint, data={"sel1": "", "sel2": "", "sel3": ""})
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, "html.parser")
+        for a in soup.select("ul.comparison-list a[href]"):
+            href = (a.get("href") or "").strip()
+            # urljoin returns absolute hrefs as-is and resolves every
+            # relative form; plain ``BASE + href`` mangled anything that
+            # was not exactly "http..." or "/path".
+            loc = urljoin(BASE + "/", href)
+            m = _DETAIL_RE.match(loc)
+            if not m:
+                continue
+            # The crop comes from the URL itself, not the endpoint queried.
+            mcrop = CROP_PATHS.get(m.group(1).lower())
+            if not mcrop or (only_crop and mcrop != only_crop):
+                continue
+            canon = _norm_url(loc)
+            if canon in seen:
+                continue
+            seen.add(canon)
+            out.append(DiscoveredURL(canon, mcrop, m.group(2), m.group(3)))
+    out.sort(key=lambda d: (d.crop, d.code))
+    log.info("ajax: discovered %d product detail pages%s",
+             len(out), f" (crop={only_crop})" if only_crop else "")
+    return out
+
+
+# --------------------------------------------------------------------- parse
+
+
+def _clean(s: str) -> str:
+    """Normalize whitespace: split on whitespace runs and re-join with
+    single spaces, which also trims both ends. Falsy input gives ``""``."""
+    return " ".join((s or "").split())
+
+
+def _slug_to_trait(slug: str) -> str:
+    """Turn a URL trait-slug into a display trait name.
+
+    Words are split on ``-``/``_``; trailing crop/filler words
+    (soybeans, soybean, corn, technology) are dropped; known acronyms and
+    brand names get their fixed casing and every other word is
+    capitalized. If nothing is left, the slug is returned unchanged.
+
+    ``duracade-refuge-renew`` -> ``DuraCade Refuge Renew``;
+    ``enlist-e3-soybeans`` -> ``Enlist E3``; ``stine-gt-`` ->
+    ``Stine GT``; ``vt-double-pro-technology`` -> ``VT Double Pro``;
+    ``conventional-corn`` -> ``Conventional``.
+    """
+    filler = {"soybeans", "soybean", "corn", "technology"}
+    # lowercase word -> display casing (acronyms and brand names together).
+    casing = {
+        "gt": "GT", "vt": "VT", "e3": "E3", "rnai": "RNAi",
+        "sts": "STS", "ll": "LL", "rr2": "RR2", "3010": "3010",
+        "3110": "3110", "3110a": "3110A",
+        "duracade": "DuraCade", "viptera": "Viptera",
+        "smartstax": "SmartStax", "xtendflex": "XtendFlex",
+        "trecepta": "Trecepta", "agrisure": "Agrisure", "gt27": "GT27",
+    }
+    parts = [p for p in re.split(r"[-_]+", slug) if p]
+    # Strip filler only from the tail, so such words are kept elsewhere.
+    keep = len(parts)
+    while keep and parts[keep - 1].lower() in filler:
+        keep -= 1
+    parts = parts[:keep]
+    if not parts:
+        return slug
+    return " ".join(casing.get(p.lower(), p.capitalize()) for p in parts)
+
+
+def _extract_code(h1_text: str, fallback: str) -> str:
+    """Return the product code from an H1 such as
+    ``Stine ® 9444-22 Brand``.
+
+    The ®/™ marks and the words Stine / Brand (any case) and NEW
+    (upper-case only) are blanked out; the first remaining token is the
+    code if it contains a digit. Otherwise *fallback* (the URL code
+    segment) is returned upper-cased."""
+    noise = (
+        (r"®|™", 0),
+        (r"\bStine\b", re.I),
+        (r"\bBrand\b", re.I),
+        (r"\bNEW\b", 0),
+    )
+    text = h1_text
+    for pattern, flags in noise:
+        text = re.sub(pattern, " ", text, flags=flags)
+    text = _clean(text)
+    # _clean() leaves no leading space, so the head of the partition is
+    # the first token (or "" for empty text).
+    first = text.partition(" ")[0]
+    if first and any(ch.isdigit() for ch in first):
+        return first
+    return fallback.upper()
+
+
+def _parse_corn_maturity(value: str) -> int | None:
+    """Corn 'Maturity' is an RM range like '98 - 100' or a single '99'.
+
+    Returns the representative integer: the single value, or the mean of
+    the first two numbers, rounded half-up. Returns ``None`` when *value*
+    is empty/``None`` or contains no number.
+    """
+    # Accept an optional decimal part so '113.00' is read as one number
+    # instead of two integers (113 and 0) that would then be averaged.
+    nums = [float(n) for n in re.findall(r"\d+(?:\.\d+)?", value or "")]
+    if not nums:
+        return None
+    mean = nums[0] if len(nums) == 1 else (nums[0] + nums[1]) / 2
+    # int(x + 0.5) rounds halves up for non-negative x. Built-in round()
+    # rounds halves to even, which sent '98 - 99' to 98 but '97 - 98' to 98.
+    return int(mean + 0.5)
+
+
+def _parse_soy_mg(value: str) -> float | None:
+    """Convert the soy 'Maturity' code into a maturity group.
+
+    Only the leading run of digits is used, so a range ('008 - 009')
+    yields its start value. Codes of three or more digits are divided by
+    100 ('008' -> 0.08); shorter codes by 10 ('21' -> 2.1, '50' -> 5.0).
+    Returns ``None`` when *value* does not start with digits."""
+    lead = re.match(r"\s*(\d+)", value or "")
+    if lead is None:
+        return None
+    digits = lead.group(1)
+    divisor = 100.0 if len(digits) >= 3 else 10.0
+    return round(int(digits) / divisor, 2)
+
+
+def _bucket(crop: str, label: str) -> str:
+    """Return the characteristics_groups label for a chart *label*.
+
+    ``""`` means an identity row that the caller should not emit as a
+    generic item. Labels found in no known set fall into
+    ``"OTHER CHARACTERISTICS"``. Any crop other than ``"corn"`` is
+    classified with the soybean sets."""
+    key = label.lower().strip()
+    if key in _IDENTITY_LABELS:
+        return ""  # handled as a dedicated fact, not a generic item
+    # Ordered (label set, group) pairs; the first set containing the key wins.
+    if crop == "corn":
+        table = (
+            (_CORN_DISEASE, "DISEASE RATINGS"),
+            (_CORN_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
+            (_CORN_PLANT, "PLANT DESCRIPTION"),
+            (_CORN_SOIL, "SOIL & PLACEMENT"),
+            (_CORN_HERBICIDE, "HERBICIDE TOLERANCE"),
+        )
+    else:  # soybeans
+        table = (
+            (_SOY_DISEASE, "DISEASE RATINGS"),
+            (_SOY_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
+            (_SOY_PLANT, "PLANT DESCRIPTION"),
+            (_SOY_HERBICIDE, "HERBICIDE TOLERANCE"),
+        )
+    for members, group in table:
+        if key in members:
+            return group
+    return "OTHER CHARACTERISTICS"
+
+
+def _parse_chart(crop: str, chart) -> tuple[list[dict], list[tuple[str, str]]]:
+    """Split ``ul.agronomy-chart`` into bucketed groups plus the raw rows.
+
+    Returns ``(groups, raw_pairs)``. ``groups`` lists the non-empty
+    buckets in a fixed rendering order, each holding its items in page
+    order. ``raw_pairs`` holds every (label, value) row with a non-empty
+    label, identity rows included (the caller reads RM/MG from it)."""
+    # Fixed rendering order of the buckets.
+    order = ("AGRONOMIC CHARACTERISTICS", "DISEASE RATINGS",
+             "PLANT DESCRIPTION", "SOIL & PLACEMENT",
+             "HERBICIDE TOLERANCE", "OTHER CHARACTERISTICS")
+    buckets: dict[str, list[dict]] = {name: [] for name in order}
+    pairs: list[tuple[str, str]] = []
+    emitted: set[tuple[str, str]] = set()
+    # Only direct <li> children of the chart are rows.
+    for row in chart.find_all("li", recursive=False):
+        label_el = row.find("strong")
+        if not label_el:
+            continue
+        label = _clean(label_el.get_text(" ", strip=True))
+        if not label:
+            continue
+        value_el = row.find("span", class_="value")
+        value = ""
+        if value_el:
+            value = _clean(value_el.get_text(" ", strip=True))
+        pairs.append((label, value))
+        group = _bucket(crop, label)
+        if not group:
+            # Identity row (e.g. the repeated soy "Maturity"): kept in
+            # pairs only.
+            continue
+        # Drop exact repeats of a label/value pair, case-insensitively.
+        fingerprint = (label.lower(), value.lower())
+        if fingerprint in emitted:
+            continue
+        emitted.add(fingerprint)
+        buckets[group].append({"characteristic": label, "value": value})
+    groups = []
+    for name in order:
+        if buckets[name]:
+            groups.append({"label": name, "items": buckets[name]})
+    return groups, pairs
+
+
+def parse_detail(http: RateLimitedSession, d: DiscoveredURL) -> StineVariety:
+    r = http.get(d.url)
+    r.raise_for_status()
+    soup = BeautifulSoup(r.text, "html.parser")
+
+    h1 = soup.find("h1")
+    h1_text = _clean(h1.get_text(" ", strip=True)) if h1 else ""
+    code = _extract_code(h1_text, d.code)
+
+    sec = soup.find("section", class_="agronomic-details")
+    chart = sec.find("ul", class_="agronomy-chart") if sec else None
+    groups: list[dict] = []
+    raw_pairs: list[tuple[str, str]] = []
+    if chart:
+        groups, raw_pairs = _parse_chart(d.crop, chart)
+
+    # Pull maturity from the first "Maturity" pair.
+    rm: int | None = None
+    mg: float | None = None
+    mat_text = ""
+    for label, value in raw_pairs:
+        if label.lower() == "maturity":
+            mat_text = value
+            break
+    if d.crop == "corn":
+        rm = _parse_corn_maturity(mat_text)
+        # Keep the RM range text as a characteristic so the verbatim
+        # range is retrievable alongside the representative integer.
+        if mat_text:
+            for g in groups:
+                if g["label"] == "AGRONOMIC CHARACTERISTICS":
+                    g["items"].insert(0, {"characteristic": "Maturity (RM range)",
+                                          "value": mat_text})
+                    break
+            else:
+                groups.insert(0, {"label": "AGRONOMIC CHARACTERISTICS",
+                                  "items": [{"characteristic": "Maturity (RM range)",
+                                             "value": mat_text}]})
+    else:
+        mg = _parse_soy_mg(mat_text)
+        if mat_text:
+            for g in groups:
+                if g["label"] == "AGRONOMIC CHARACTERISTICS":
+                    g["items"].insert(0, {"characteristic": "Maturity (RM)",
+                                          "value": mat_text})
+                    break
+            else:
+                groups.insert(0, {"label": "AGRONOMIC CHARACTERISTICS",
+                                  "items": [{"characteristic": "Maturity (RM)",
+                                             "value": mat_text}]})
+
+    trait = _slug_to_trait(d.trait_slug)
+    trait_stack = [trait] if trait and trait.lower() != "conventional" else (
+        ["Conventional"] if trait.lower() == "conventional" else [])
+
+    return StineVariety(
+        source_key=f"stine-{code.lower()}",
+        source_url=d.url,
+        crop=d.crop,
+        product_name=code,
+        relative_maturity=rm,
+        maturity_group=mg,
+        trait_stack=trait_stack,
+        positioning=None,
+        groups=groups,
+        sitemap_last_modified=d.lastmod,
+    )
+
+
+# --------------------------------------------------------------------- render
+
+
+def render_markdown(v: StineVariety) -> str:
+    crop_label = {"corn": "Corn", "soybeans": "Soybeans"}.get(
+        v.crop, v.crop.title())
+    head: list[str] = [
+        f"# Stine {v.product_name}",
+        "",
+        "- **Vendor:** Stine Seed Company (independent family-owned breeder, Adel, IA)",
+        "- **Brand:** Stine",
+        f"- **Crop:** {crop_label}",
+    ]
+    if v.crop == "corn" and v.relative_maturity is not None:
+        head.append(f"- **Relative maturity:** {v.relative_maturity} days (representative)")
+    if v.crop == "soybeans" and v.maturity_group is not None:
+        head.append(f"- **Maturity group:** {v.maturity_group}")
+    if v.trait_stack:
+        head.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
+    head.append(f"- **Source:** {v.source_url}")
+    head.append(f"- **Rating scale:** {RATING_SCALE_DIRECTION}")
+    head.append("- **Service area:** Stine dealer network — Corn Belt (IA/IL/IN/MN/NE/MO etc.)")
+    head.append("")
+    head += ["---", ""]
+    for g in v.groups:
+        head.append(f"## {g['label'].title()}")
+        head.append("")
+        for it in g["items"]:
+            ch = it["characteristic"]
+            val = it["value"] or "—"
+            head.append(f"- **{ch}:** {val}")
+        head.append("")
+    return "\n".join(head)
+
+
+def write_variety(v: StineVariety, body_md: str) -> None:
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")
+    sidecar = {
+        "source": "stine",
+        "source_key": v.source_key,
+        "vendor": "Stine Seed Company",
+        "brand": "Stine",
+        "product_name": v.product_name,
+        "product_id": v.product_name,
+        "crop": v.crop,
+        "release_year": None,
+        "relative_maturity": v.relative_maturity,
+        "maturity_group": v.maturity_group,
+        "wheat_class": None,
+        "trait_stack": v.trait_stack,
+        "trait_descriptions": [],
+        "positioning_statement": v.positioning,
+        "strengths": [],
+        "characteristics_groups": v.groups,
+        "_scale_direction": RATING_SCALE_DIRECTION,
+        "regional_recommendations": [
+            {"product_list_name": "Stine dealer network (Corn Belt — IA/IL/IN/MN/NE/MO etc.)",
+             "agronomist": None, "agronomist_email": None, "variant_id": None},
+        ],
+        "image_url": None,
+        "source_urls": [v.source_url],
+        "sitemap_last_modified": v.sitemap_last_modified,
+        "fetched_at": datetime.now(timezone.utc).isoformat(),
+        "scraper_version": SCRAPER_VERSION,
+    }
+    (CORPUS_DIR / f"{v.source_key}.json").write_text(
+        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8")
+
+
+# --------------------------------------------------------------------- pipeline
+
+
+def run(*, limit: int | None, force: bool, only_crop: str | None,
+        only_product: str | None, enumerate_via: str) -> int:
+    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
+    http = RateLimitedSession()
+
+    if enumerate_via == "ajax":
+        discovered = discover_ajax(http, only_crop=only_crop)
+    else:
+        discovered = discover_sitemap(http, only_crop=only_crop)
+        if not discovered:
+            log.warning("sitemap yielded nothing — falling back to ajax")
+            discovered = discover_ajax(http, only_crop=only_crop)
+
+    if only_product:
+        key = only_product.lower()
+        discovered = [d for d in discovered
+                      if f"stine-{d.code.lower()}" == key
+                      or d.code.lower() == key]
+        if not discovered:
+            log.error("no variety matched --product=%s", only_product)
+            return 2
+
+    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
+    processed = 0
+    total = len(discovered)
+    for d in discovered:
+        if limit is not None and processed >= limit:
+            break
+        processed += 1
+        source_key = f"stine-{d.code.lower()}"
+        md_path = CORPUS_DIR / f"{source_key}.md"
+        if md_path.exists() and not force:
+            counts["skipped"] += 1
+            log.info("[%d/%d] %s skipped", processed, total, source_key)
+            continue
+        try:
+            v = parse_detail(http, d)
+        except requests.HTTPError as exc:
+            counts["failed"] += 1
+            log.error("[%d/%d] %s detail fetch failed: %s",
+                      processed, total, source_key, exc)
+            continue
+        except Exception as exc:  # noqa: BLE001 — keep the run going
+            counts["failed"] += 1
+            log.error("[%d/%d] %s parse failed: %s",
+                      processed, total, source_key, exc)
+            continue
+        if not v.groups:
+            counts["empty"] += 1
+            log.warning("[%d/%d] %s — no chart groups parsed (still writing identity)",
+                        processed, total, source_key)
+        write_variety(v, render_markdown(v))
+        counts["written"] += 1
+        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
+                 processed, total, source_key, v.crop,
+                 v.relative_maturity if v.crop == "corn" else v.maturity_group,
+                 len(v.groups), ",".join(v.trait_stack) or "-")
+
+    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
+             processed, counts["written"], counts["skipped"],
+             counts["empty"], counts["failed"], total)
+    return 0
+
+
+# --------------------------------------------------------------------- CLI
+
+
+def _build_argparser() -> argparse.ArgumentParser:
+    p = argparse.ArgumentParser(
+        prog="scrape.sources.stine",
+        description="Scrape Stine Seed Company (independent Corn Belt breeder) — "
+                    "corn + soybeans via sitemap enumeration + detail pages.")
+    p.add_argument("--limit", type=int, default=None,
+                   help="Stop after processing N varieties (default: all).")
+    p.add_argument("--force", action="store_true",
+                   help="Re-fetch even if the markdown file already exists.")
+    p.add_argument("--crop", default=None, choices=sorted(CROP_PATHS),
+                   help="Limit to one crop (corn / soybeans).")
+    p.add_argument("--product", default=None,
+                   help="Process a single variety by source_key or product code.")
+    p.add_argument("--enumerate", dest="enumerate_via", default="sitemap",
+                   choices=["sitemap", "ajax"],
+                   help="Enumeration source (default: sitemap; ajax = full historical set).")
+    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
+    return p
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = _build_argparser().parse_args(argv)
+    logging.basicConfig(
+        level=args.log_level.upper(),
+        format="%(asctime)s %(levelname)s %(name)s %(message)s",
+        stream=sys.stderr)
+    return run(limit=args.limit, force=args.force,
+               only_crop=args.crop, only_product=args.product,
+               enumerate_via=args.enumerate_via)
+
+
+if __name__ == "__main__":
+    sys.exit(main())