# seed-mcp/scrape/sources/stine.py

"""Stine Seed Company scraper — independent family-owned breeder (Adel, IA).

Source: ``www.stineseed.com`` — a custom PHP site (NOT WordPress;
``/wp-json/`` 404s). robots.txt returns 404 (none published); the
``/legal/`` page carries only a standard copyright / no-reproduction
clause (no anti-automation term — same posture as the other corpus
vendors). ``sitemap.xml`` (~499 URLs) lists every live product page,
so it is our canonical enumeration source.

Stine is the largest privately-owned seed company in the US; it
breeds and sells **corn + soybeans** only (no wheat). The catalog is
~58 corn hybrids + ~159 soybean varieties.

Two-step ingestion:

1. **Enumerate** the current catalog from ``sitemap.xml``. A product
   *detail* URL has the shape ``/{crop}/traits/{trait-slug}/{code}/``
   (four path segments); the bare ``/{crop}/traits/{trait-slug}/``
   landing pages are skipped. This yields exactly the live catalog
   (58 corn + 159 soy), unlike the comparison ajax endpoint which
   also returns thousands of discontinued/historical entries.

   Fallback enumeration (``--enumerate ajax``) hits the comparison
   ajax fragments:
     - corn: POST ``/ajax/corn-comparison/filter_products.php``
     - soy:  POST ``/ajax/soybean-comparison/filter_products.php``
   with ``sel1=&sel2=&sel3=`` (empty = all). Each ``<li>`` carries a
   numeric product id + the canonical detail URL.

2. **Parse the detail page.** Each ``/{crop}/traits/{slug}/{code}/``
   page server-renders all agronomic data (no JS needed) as
   ``<section class="agronomic-details">`` →
   ``<ul class="agronomy-chart"> <li> <strong>label</strong>
   <span class="value">value</span> </li> …``. The variety code +
   brand mark live in the ``<h1>`` (``Stine ® 9444-22 Brand``).

Rating scales differ by crop and are preserved verbatim (the chunker
never fabricates a value):

  - **Corn** publishes an on-page legend:
    ``9: Excellent, 8: Very Good, 7: Good, 6: Average,
    5: Below Average`` — a **1-9 numeric** scale, **HIGHER = BETTER /
    more tolerant** (same direction as Bayer/NK, so no flip). Applies
    to the agronomic performance panel (Drydown/Root/Stalk/Stress/
    Cold Emergence/Test Weight) and the disease panel (Tar Spot/Gray
    Leaf Spot/Eye Spot/N.C. Leaf Blight/Goss' Wilt/Common Rust/…).
    Plant descriptors / soil placement / herbicide rows are
    qualitative (Tall, Highly Recommended, Yes/No) and pass through.
  - **Soybeans** are entirely **qualitative** (Excellent / Very Good
    / Good / … and Resistant / Strong / Good / Susceptible for
    disease; "higher/'Resistant' = better"). There is no numeric
    legend on soy pages. SCN (Soybean Cyst Nematode) and RPS Gene
    rows carry the *source/gene* (e.g. Peking, 3a) rather than a
    rating.

We parse the chart into structured ``characteristics_groups`` — a
DISEASE RATINGS group, an AGRONOMIC CHARACTERISTICS group, and a few
pass-through groups (PLANT DESCRIPTION / SOIL & PLACEMENT / HERBICIDE
TOLERANCE / SEED TREATMENT NOTES) — so every rating lands in the
embedded chunk and is actually retrievable.

Output:
  corpus/stine/<source_key>.md
  corpus/stine/<source_key>.json

source_key: ``stine-<productcode>`` lowercased, e.g.
``stine-9444-22`` (corn) or ``stine-22r32`` (soy).

CLI:
  python -m scrape.sources.stine --crop corn --limit 2 --force
  python -m scrape.sources.stine --crop soybeans --limit 2 --force
  python -m scrape.sources.stine --force
  python -m scrape.sources.stine --product stine-9444-22
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import warnings

import requests
from bs4 import BeautifulSoup

# bs4>=4.11 emits XMLParsedAsHTMLWarning when html.parser is fed an XML
# document (we parse the sitemap that way on purpose), so silence it.
# Only the import can legitimately fail, so only ImportError is tolerated.
try:
    from bs4 import XMLParsedAsHTMLWarning
except ImportError:  # pragma: no cover — older bs4 without the warning class
    pass
else:
    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# Recorded in every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root; also prefixed onto relative hrefs found in the ajax fragments.
BASE = "https://www.stineseed.com"
# Primary enumeration source (see discover_sitemap).
SITEMAP = f"{BASE}/sitemap.xml"
# Per-crop comparison endpoints POSTed by discover_ajax (fallback
# enumeration). Keys are the chunker crop values.
AJAX = {
    "corn": f"{BASE}/ajax/corn-comparison/filter_products.php",
    "soybeans": f"{BASE}/ajax/soybean-comparison/filter_products.php",
}

# Stine site path segment -> chunker crop value (chunker keys on the
# PLURAL "soybeans" for the MG branch). Stine has no wheat. The keys also
# serve as the allowed values of the --crop CLI option.
CROP_PATHS = {
    "corn": "corn",
    "soybeans": "soybeans",
}

# No robots.txt (404) and no Crawl-delay; stay polite at 1.5 s/req.
# ~217 detail pages -> a full run finishes in ~6 min.
# Default minimum spacing between requests for RateLimitedSession.
REQ_INTERVAL_SEC = 1.5

# Human-readable description of how to read the published ratings. Emitted
# verbatim in the markdown header ("Rating scale") and in the JSON sidecar
# ("_scale_direction"), so any edit here changes the scraped corpus text.
RATING_SCALE_DIRECTION = (
    "corn agronomic+disease 1-9 numeric, 9=Excellent/best/most-tolerant, "
    "8=Very Good, 7=Good, 6=Average, 5=Below Average (higher=better, same "
    "direction as Bayer/NK; blank/'-'=not rated); soybeans qualitative "
    "(Excellent/Very Good/Good for vigor; Resistant/Strong/Good/Susceptible "
    "for disease, Resistant/Strong=best); SCN row gives source (e.g. Peking) "
    "and RPS Gene gives the gene, not a rating; plant/soil/herbicide rows "
    "qualitative (Tall, Highly Recommended/Recommended, Yes/No)"
)

# ---- Chart-label classification -------------------------------------
# The agronomy chart is a flat run of label/value <li>s mixing identity,
# performance ratings, disease ratings, plant descriptors, soil/placement,
# and herbicide rows. We bucket by label into characteristics_groups the
# chunker understands (DISEASE RATINGS -> disease framing, AGRONOMIC
# CHARACTERISTICS -> agronomic framing; the rest pass through titled).

# All entries below are lowercase: _bucket lowercases and strips the chart
# label before looking it up, so matching is case-insensitive but otherwise
# exact. A label found in none of the sets lands in OTHER CHARACTERISTICS.

# Identity rows already captured into RM/MG/dedicated facts — not repeated
# as a generic characteristic (_bucket returns "" for these).
_IDENTITY_LABELS = {"maturity", "maturity end"}

# Corn performance rows -> AGRONOMIC CHARACTERISTICS.
# NOTE(review): "gdd", "mn maturity" and "harvest population" look like raw
# figures rather than 1-9 ratings — confirm against a live page.
_CORN_AGRONOMIC = {
    "gdd", "mn maturity", "drydown", "root", "stalk", "stress",
    "cold emergence", "test weight", "harvest population",
}
# Corn disease ratings -> DISEASE RATINGS. Set kept generous because the
# disease list varies per page (some add S.C. Leaf Blight / Anthracnose).
# "goss' wilt" is listed with both a straight and a curly apostrophe so
# either spelling of the label matches.
_CORN_DISEASE = {
    "tar spot", "gray leaf spot", "eye spot", "n.c. leaf blight",
    "s.c. leaf blight", "anthracnose", "goss' wilt", "goss’ wilt",
    "common rust", "northern corn leaf blight", "southern corn leaf blight",
    "diplodia", "fusarium", "head smut",
}
# Corn plant descriptors -> PLANT DESCRIPTION.
_CORN_PLANT = {"plant height", "ear placement", "ear flex", "cob color"}
# Corn soil/placement -> SOIL & PLACEMENT.
_CORN_SOIL = {
    "corn-on-corn", "sand", "loam", "clay", "wide rows", "narrow rows",
    'population % in 30" or wider rows', "population % in narrow rows",
    "population", "drought tolerance",
}
# Corn herbicide -> HERBICIDE TOLERANCE.
_CORN_HERBICIDE = {"glyphosate tolerant", "glufosinate tolerant"}

# Soy vigor/standability -> AGRONOMIC CHARACTERISTICS.
_SOY_AGRONOMIC = {"emergence", "standability", "shattering", "lodging"}
# Soy disease + nematode + gene rows -> DISEASE RATINGS (SCN/RPS carry a
# source/gene rather than a rating; that's still the disease panel).
_SOY_DISEASE = {
    "phytophthora root rot", "rps gene", "iron deficiency chlorosis",
    "brown stem rot", "sudden death syndrome", "soybean cyst nematode",
    "frogeye leafspot", "frogeye leaf spot", "sclerotinia white mold",
    "white mold", "stem canker", "root knot nematode", "soybean rust",
}
# Soy plant descriptors / quality -> PLANT DESCRIPTION.
_SOY_PLANT = {
    "height", "flower", "pubescence", "hilum", "chloride", "pod color",
    "canopy", "protein", "oil",
}
# Soy herbicide/trait management -> HERBICIDE TOLERANCE. (There is no soy
# SOIL & PLACEMENT set; _bucket has no such branch for soybeans.)
_SOY_HERBICIDE = {"sulfonylurea tolerance", "sts", "glyphosate tolerant"}

# Third ancestor directory of this file's resolved path.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Output root: the CORPUS_ROOT environment variable when set to a non-empty
# value, otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# All Stine .md / .json files are written directly into this directory.
CORPUS_DIR = CORPUS_ROOT / "stine"

# Module logger; handlers and level are configured in main().
log = logging.getLogger("scrape.stine")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Polite ``requests`` wrapper: spaced-out requests plus retry/backoff.

    At most one request is sent per ``interval`` seconds. Network errors,
    HTTP 429 and HTTP 5xx responses are retried with jittered exponential
    backoff (or the server's numeric ``Retry-After``). Stine's live
    catalog is ~217 detail pages, so 1.5 s/req still finishes in a few
    minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading taken by _wait() just before the
        # previous request was sent.
        self._last = 0.0

    def _wait(self) -> None:
        """Block until ``interval`` seconds have passed since the last request."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Jittered exponential backoff in seconds, capped at 30."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one request, retrying transient failures.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: URL to fetch.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so the call always yields a response or an
                exception.
            timeout: Per-attempt timeout in seconds.
            **kw: Extra keyword arguments forwarded to ``requests``.

        Returns:
            The first response that is not 429/5xx, or — once every
            attempt is used up — the final 429/5xx response, so the
            caller's ``raise_for_status()`` reports it.

        Raises:
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, resp = exc, None
                if final:
                    # No further attempt follows, so backing off would only
                    # delay the failure.
                    break
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so any earlier network error is stale.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured; an
                # HTTP-date value falls back to the computed backoff.
                backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class StineVariety:
    """One parsed Stine product page.

    Built by ``parse_detail`` and consumed by ``render_markdown`` and
    ``write_variety``.
    """
    # "stine-<code>" lowercased; also the output file stem.
    source_key: str
    # Canonical detail-page URL the data was fetched from.
    source_url: str
    crop: str                         # chunker value: corn / soybeans
    product_name: str = ""            # "9444-22", "22R32"
    relative_maturity: int | None = None     # corn (representative RM days)
    maturity_group: float | None = None      # soy MG
    # Display trait name(s) derived from the URL trait slug.
    trait_stack: list[str] = field(default_factory=list)
    # Written to the sidecar as "positioning_statement"; parse_detail
    # currently always sets it to None.
    positioning: str | None = None
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    # <lastmod> text from the sitemap entry, when enumeration supplied one.
    sitemap_last_modified: str | None = None


# --------------------------------------------------------------------- discovery


# Matches a product *detail* URL and captures (crop, trait-slug, code).
# The host may omit "www.", the trailing slash is optional, and matching is
# case-insensitive. The bare /{crop}/traits/{slug}/ landing pages do not
# match because a non-empty, slash-free code segment is required.
_DETAIL_RE = re.compile(
    r"^https?://(?:www\.)?stineseed\.com/(corn|soybeans)/traits/"
    r"([^/]+)/([^/]+)/?$",
    re.IGNORECASE,
)


@dataclass
class DiscoveredURL:
    """One product detail page found during enumeration (sitemap or ajax)."""
    # Canonical URL (always ends with "/", see _norm_url).
    url: str
    # Chunker crop value looked up via CROP_PATHS.
    crop: str
    # Third path segment of the URL; humanized later by _slug_to_trait.
    trait_slug: str
    # Fourth path segment of the URL, in the URL's original casing.
    code: str
    # Sitemap <lastmod> text; stays None for ajax-discovered entries.
    lastmod: str | None = None


def _norm_url(url: str) -> str:
    """Canonical product URL has a trailing slash."""
    url = url.strip()
    if not url.endswith("/"):
        url += "/"
    return url


def discover_sitemap(http: RateLimitedSession, *,
                     only_crop: str | None) -> list[DiscoveredURL]:
    """Enumerate live product detail pages from ``sitemap.xml``.

    Only URLs with FOUR path segments (``/{crop}/traits/{slug}/{code}/``)
    are kept; the bare ``/{crop}/traits/{slug}/`` landing pages are not
    matched by ``_DETAIL_RE``. Results are de-duplicated on canonical URL
    and returned sorted by ``(crop, code)``. ``only_crop`` restricts the
    result to a single crop when given.
    """
    resp = http.get(SITEMAP)
    resp.raise_for_status()
    # html.parser is used because the lxml/xml backend isn't a guaranteed
    # dependency. It lowercases tag names, but <url>/<loc>/<lastmod> are
    # lowercase already, so the lookups below still work.
    doc = BeautifulSoup(resp.text, "html.parser")

    # Keyed by canonical URL; the first occurrence of a URL wins.
    by_url: dict[str, DiscoveredURL] = {}
    for entry in doc.find_all("url"):
        loc_tag = entry.find("loc")
        if not loc_tag:
            continue
        raw_loc = loc_tag.get_text(strip=True)
        hit = _DETAIL_RE.match(raw_loc)
        if not hit:
            continue
        path_crop, slug, code = hit.group(1).lower(), hit.group(2), hit.group(3)
        crop = CROP_PATHS.get(path_crop)
        if not crop or (only_crop and crop != only_crop):
            continue
        canonical = _norm_url(raw_loc)
        if canonical in by_url:
            continue
        lastmod_tag = entry.find("lastmod")
        modified = lastmod_tag.get_text(strip=True) if lastmod_tag else None
        by_url[canonical] = DiscoveredURL(canonical, crop, slug, code, modified)

    found = sorted(by_url.values(), key=lambda item: (item.crop, item.code))
    log.info("sitemap: discovered %d product detail pages%s",
             len(found), f" (crop={only_crop})" if only_crop else "")
    return found


def discover_ajax(http: RateLimitedSession, *,
                  only_crop: str | None) -> list[DiscoveredURL]:
    """Fallback enumeration via the comparison ajax fragments.

    NOTE: these endpoints return the FULL historical product set
    (thousands of discontinued entries, with code dupes pointing at the
    same slug), so we de-dupe on canonical URL. The sitemap is preferred
    because it reflects only the current live catalog.

    Each ``ul.comparison-list a[href]`` link is resolved against the site
    root, filtered through ``_DETAIL_RE`` and, when ``only_crop`` is set,
    restricted to that crop. The result is sorted by ``(crop, code)``;
    ``lastmod`` is never populated on this path.
    """
    # Function-scope import: only this fallback path needs it.
    from urllib.parse import urljoin

    out: list[DiscoveredURL] = []
    seen: set[str] = set()
    for crop, endpoint in AJAX.items():
        if only_crop and crop != only_crop:
            continue
        # Empty selectors mean "no filter", i.e. return every product.
        r = http.post(endpoint, data={"sel1": "", "sel2": "", "sel3": ""})
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.select("ul.comparison-list a[href]"):
            href = (a.get("href") or "").strip()
            if not href:
                continue
            # urljoin copes with absolute, root-relative, path-relative and
            # protocol-relative hrefs; plain BASE + href only handled the
            # first two and silently dropped the rest at the regex check.
            loc = urljoin(BASE + "/", href)
            m = _DETAIL_RE.match(loc)
            if not m:
                continue
            # Trust the crop in the URL, not the endpoint that listed it.
            mcrop = CROP_PATHS.get(m.group(1).lower())
            if not mcrop or (only_crop and mcrop != only_crop):
                continue
            canon = _norm_url(loc)
            if canon in seen:
                continue
            seen.add(canon)
            out.append(DiscoveredURL(canon, mcrop, m.group(2), m.group(3)))
    out.sort(key=lambda d: (d.crop, d.code))
    log.info("ajax: discovered %d product detail pages%s",
             len(out), f" (crop={only_crop})" if only_crop else "")
    return out


# --------------------------------------------------------------------- parse


def _clean(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()


def _slug_to_trait(slug: str) -> str:
    """Humanize a trait-slug into a display trait name.

    ``duracade-refuge-renew`` -> ``DuraCade Refuge Renew``;
    ``enlist-e3-soybeans`` -> ``Enlist E3``; ``stine-gt-`` ->
    ``Stine GT``; ``vt-double-pro-technology`` -> ``VT Double Pro``;
    ``conventional-corn`` -> ``Conventional``.
    """
    words = [w for w in re.split(r"[-_]+", slug) if w]
    drop_tail = {"soybeans", "soybean", "corn", "technology"}
    while words and words[-1].lower() in drop_tail:
        words.pop()
    if not words:
        return slug
    # Known acronyms / brand casings.
    acronyms = {"gt": "GT", "vt": "VT", "e3": "E3", "rnai": "RNAi",
                "sts": "STS", "ll": "LL", "rr2": "RR2", "3010": "3010",
                "3110": "3110", "3110a": "3110A"}
    out: list[str] = []
    for w in words:
        lw = w.lower()
        if lw in acronyms:
            out.append(acronyms[lw])
        elif lw == "duracade":
            out.append("DuraCade")
        elif lw == "viptera":
            out.append("Viptera")
        elif lw == "smartstax":
            out.append("SmartStax")
        elif lw == "xtendflex":
            out.append("XtendFlex")
        elif lw == "trecepta":
            out.append("Trecepta")
        elif lw == "agrisure":
            out.append("Agrisure")
        elif lw == "gt27":
            out.append("GT27")
        else:
            out.append(w.capitalize())
    return " ".join(out)


def _extract_code(h1_text: str, fallback: str) -> str:
    """Extract the product code from an H1 such as ``Stine ® 9444-22 Brand``.

    Trademark symbols and the words "Stine", "Brand" (any case) and "NEW"
    (upper-case only) are blanked out; the first remaining token is the
    code if it contains a digit. Otherwise the upper-cased ``fallback``
    (the URL code segment) is returned.
    """
    # (pattern, flags) pairs, applied in order; each match becomes a space.
    noise = (
        (r"®|™", 0),
        (r"\bStine\b", re.I),
        (r"\bBrand\b", re.I),
        (r"\bNEW\b", 0),
    )
    remaining = h1_text
    for pattern, flags in noise:
        remaining = re.sub(pattern, " ", remaining, flags=flags)
    remaining = _clean(remaining)
    # After _clean there is no leading space, so this is the first token
    # (or "" when nothing is left).
    candidate = remaining.partition(" ")[0]
    if any(ch.isdigit() for ch in candidate):
        return candidate
    return fallback.upper()


def _parse_corn_maturity(value: str) -> int | None:
    """Corn 'Maturity' is an RM range like '98 - 100' or a single '99'.
    Store the representative integer (mean of the range, rounded)."""
    nums = [int(n) for n in re.findall(r"\d+", value or "")]
    if not nums:
        return None
    if len(nums) == 1:
        return nums[0]
    return round(sum(nums[:2]) / 2)


def _parse_soy_mg(value: str) -> float | None:
    """Soy 'Maturity' is the RM expressed as a 2- or 3-digit code where
    MG = value/10 for 2-digit codes ('21' -> 2.1, '50' -> 5.0) and
    value/100 for 3-digit leading-zero codes ('008' -> 0.08). For a
    range ('008 - 009') take the start value."""
    m = re.match(r"\s*(\d+)", value or "")
    if not m:
        return None
    raw = m.group(1)
    n = int(raw)
    if len(raw) >= 3:
        return round(n / 100.0, 2)
    return round(n / 10.0, 2)


def _bucket(crop: str, label: str) -> str:
    """Return the characteristics_groups label that a chart row belongs to.

    Matching is done on the lowercased, stripped ``label``. Identity rows
    return ``""`` (they are stored as dedicated facts instead). Any crop
    other than ``"corn"`` uses the soybean tables. Unrecognised labels go
    to ``"OTHER CHARACTERISTICS"``.
    """
    key = label.lower().strip()
    if key in _IDENTITY_LABELS:
        return ""  # handled as a dedicated fact, not a generic item

    # Ordered (label set, group) routing; the first set containing the key wins.
    if crop == "corn":
        routing = (
            (_CORN_DISEASE, "DISEASE RATINGS"),
            (_CORN_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_CORN_PLANT, "PLANT DESCRIPTION"),
            (_CORN_SOIL, "SOIL & PLACEMENT"),
            (_CORN_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    else:  # soybeans
        routing = (
            (_SOY_DISEASE, "DISEASE RATINGS"),
            (_SOY_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_SOY_PLANT, "PLANT DESCRIPTION"),
            (_SOY_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    for known_labels, group in routing:
        if key in known_labels:
            return group
    return "OTHER CHARACTERISTICS"


def _parse_chart(crop: str, chart) -> tuple[list[dict], list[tuple[str, str]]]:
    """Turn ``ul.agronomy-chart`` into bucketed groups plus the raw rows.

    Only direct ``<li>`` children of ``chart`` are read. Each needs a
    ``<strong>`` label; the value comes from ``<span class="value">`` and
    defaults to ``""`` when that span is missing.

    Returns ``(groups, raw_pairs)``: ``groups`` is the
    characteristics_groups list in a fixed group order with empty groups
    omitted, and ``raw_pairs`` is every ``(label, value)`` row in page
    order, including identity rows (used to pull RM/MG).
    """
    # Fixed rendering order of the groups.
    group_order = ("AGRONOMIC CHARACTERISTICS", "DISEASE RATINGS",
                   "PLANT DESCRIPTION", "SOIL & PLACEMENT",
                   "HERBICIDE TOLERANCE", "OTHER CHARACTERISTICS")
    items_by_group: dict[str, list[dict]] = {name: [] for name in group_order}
    pairs: list[tuple[str, str]] = []
    emitted: set[tuple[str, str]] = set()

    for row in chart.find_all("li", recursive=False):
        label_el = row.find("strong")
        if not label_el:
            continue
        label = _clean(label_el.get_text(" ", strip=True))
        if not label:
            continue
        value_el = row.find("span", class_="value")
        value = _clean(value_el.get_text(" ", strip=True)) if value_el else ""
        pairs.append((label, value))

        group = _bucket(crop, label)
        # The soy page repeats "Maturity" twice; those rows get an empty
        # group via _IDENTITY_LABELS. Any other exact label/value repeat
        # is dropped as well (compared case-insensitively).
        fingerprint = (label.lower(), value.lower())
        if not group or fingerprint in emitted:
            continue
        emitted.add(fingerprint)
        items_by_group[group].append({"characteristic": label, "value": value})

    groups = [{"label": name, "items": items}
              for name, items in items_by_group.items() if items]
    return groups, pairs


def _prepend_maturity_item(groups: list[dict], characteristic: str,
                           mat_text: str) -> None:
    """Put the verbatim maturity text first in AGRONOMIC CHARACTERISTICS,
    creating that group at the front of ``groups`` when it is missing."""
    item = {"characteristic": characteristic, "value": mat_text}
    for g in groups:
        if g["label"] == "AGRONOMIC CHARACTERISTICS":
            g["items"].insert(0, item)
            return
    groups.insert(0, {"label": "AGRONOMIC CHARACTERISTICS", "items": [item]})


def parse_detail(http: RateLimitedSession, d: DiscoveredURL) -> StineVariety:
    """Fetch one product detail page and parse it into a ``StineVariety``.

    Args:
        http: Session used for the GET request.
        d: Discovered page (URL, crop, trait slug, URL code, lastmod).

    Returns:
        The parsed variety. ``groups`` is empty when the page has no
        ``section.agronomic-details ul.agronomy-chart``. Corn gets
        ``relative_maturity``; every other crop gets ``maturity_group``.

    Raises:
        requests.HTTPError: if the page responds with an error status.
    """
    r = http.get(d.url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    h1 = soup.find("h1")
    h1_text = _clean(h1.get_text(" ", strip=True)) if h1 else ""
    # NOTE(review): the code (and so source_key) comes from the H1, while
    # run() keys its skip check on the URL code segment — confirm the two
    # always agree.
    code = _extract_code(h1_text, d.code)

    sec = soup.find("section", class_="agronomic-details")
    chart = sec.find("ul", class_="agronomy-chart") if sec else None
    groups: list[dict] = []
    raw_pairs: list[tuple[str, str]] = []
    if chart:
        groups, raw_pairs = _parse_chart(d.crop, chart)

    # Maturity comes from the first "Maturity" row ("" when there is none).
    mat_text = next(
        (value for label, value in raw_pairs if label.lower() == "maturity"),
        "")

    rm: int | None = None
    mg: float | None = None
    if d.crop == "corn":
        rm = _parse_corn_maturity(mat_text)
        maturity_label = "Maturity (RM range)"
    else:
        mg = _parse_soy_mg(mat_text)
        maturity_label = "Maturity (RM)"
    # Keep the verbatim maturity text as a characteristic so it stays
    # retrievable alongside the derived RM / MG number.
    if mat_text:
        _prepend_maturity_item(groups, maturity_label, mat_text)

    trait = _slug_to_trait(d.trait_slug)
    if not trait:
        trait_stack: list[str] = []
    elif trait.lower() == "conventional":
        trait_stack = ["Conventional"]
    else:
        trait_stack = [trait]

    return StineVariety(
        source_key=f"stine-{code.lower()}",
        source_url=d.url,
        crop=d.crop,
        product_name=code,
        relative_maturity=rm,
        maturity_group=mg,
        trait_stack=trait_stack,
        positioning=None,
        groups=groups,
        sitemap_last_modified=d.lastmod,
    )


# --------------------------------------------------------------------- render


def render_markdown(v: StineVariety) -> str:
    """Render a variety as the markdown body of its corpus file.

    Layout: an H1 title, a bullet list of identity facts (vendor, brand,
    crop, RM or MG when present, traits, source URL, rating scale, service
    area), a horizontal rule, then one ``##`` section per characteristics
    group with one bullet per item. An empty item value is shown as "—".
    """
    known_crops = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = known_crops.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# Stine {v.product_name}",
        "",
        "- **Vendor:** Stine Seed Company (independent family-owned breeder, Adel, IA)",
        "- **Brand:** Stine",
        f"- **Crop:** {crop_label}",
    ]
    # RM is only shown for corn and MG only for soybeans.
    if v.relative_maturity is not None and v.crop == "corn":
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days (representative)")
    if v.maturity_group is not None and v.crop == "soybeans":
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Stine dealer network — Corn Belt (IA/IL/IN/MN/NE/MO etc.)",
        "",
        "---",
        "",
    ])

    for group in v.groups:
        lines.extend((f"## {group['label'].title()}", ""))
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)


def _write_text_atomic(path: Path, text: str) -> None:
    """Write ``text`` (UTF-8) to a sibling temp file, then rename it onto
    ``path``, so an interrupted write does not leave a truncated file at
    ``path``."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    tmp.replace(path)


def write_variety(v: StineVariety, body_md: str) -> None:
    """Write ``<source_key>.json`` and ``<source_key>.md`` into CORPUS_DIR.

    The JSON sidecar is written first and the markdown last: ``run`` treats
    an existing markdown file as "already scraped", so the markdown must
    only appear once the sidecar is complete. Existing files for the same
    key are overwritten.

    Args:
        v: Parsed variety; supplies the sidecar fields and the file stem.
        body_md: Markdown body, normally ``render_markdown(v)``.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "source": "stine",
        "source_key": v.source_key,
        "vendor": "Stine Seed Company",
        "brand": "Stine",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Stine dealer network (Corn Belt — IA/IL/IN/MN/NE/MO etc.)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        CORPUS_DIR / f"{v.source_key}.json",
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n")
    _write_text_atomic(CORPUS_DIR / f"{v.source_key}.md", body_md)


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None, enumerate_via: str) -> int:
    """Enumerate the catalog, then scrape and write each variety.

    Args:
        limit: Stop after this many discovered entries have been handled
            (skipped entries count too); ``None`` means no limit.
        force: Re-fetch even when the markdown file already exists.
        only_crop: Restrict enumeration to one crop value, or ``None``.
        only_product: Keep only the entry whose URL code matches this
            source_key or product code (case-insensitive), or ``None``.
        enumerate_via: ``"ajax"`` to use the comparison endpoints; any
            other value uses the sitemap, falling back to ajax when the
            sitemap yields nothing.

    Returns:
        ``2`` when ``only_product`` matched nothing, otherwise ``0``
        (per-variety failures are logged and counted, not fatal).
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    if enumerate_via == "ajax":
        discovered = discover_ajax(http, only_crop=only_crop)
    else:
        discovered = discover_sitemap(http, only_crop=only_crop)
        if not discovered:
            log.warning("sitemap yielded nothing — falling back to ajax")
            discovered = discover_ajax(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        discovered = [d for d in discovered
                      if f"stine-{d.code.lower()}" == key
                      or d.code.lower() == key]
        if not discovered:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    total = len(discovered)
    for d in discovered:
        if limit is not None and processed >= limit:
            break
        processed += 1
        # Skip check is keyed on the URL code because the page has not
        # been fetched yet.
        source_key = f"stine-{d.code.lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, d)
        except requests.RequestException as exc:
            # Covers HTTP error statuses (HTTPError) and the network-level
            # errors the session re-raises once its retries run out.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        except Exception as exc:  # noqa: BLE001 — keep the run going
            counts["failed"] += 1
            # log.exception keeps the traceback so parser bugs are debuggable.
            log.exception("[%d/%d] %s parse failed: %s",
                          processed, total, source_key, exc)
            continue
        if v.source_key != source_key:
            # Files are written under the H1-derived key, which the skip
            # check above will never find on a later run.
            log.warning("[%d/%d] %s — page H1 yields key %s; output is written "
                        "under that key and will be re-fetched on every run",
                        processed, total, source_key, v.source_key)
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no chart groups parsed (still writing identity)",
                        processed, total, source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 v.relative_maturity if v.crop == "corn" else v.maturity_group,
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"],
             counts["empty"], counts["failed"], total)
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product``,
    ``--enumerate`` (stored as ``enumerate_via``) and ``--log-level``
    (default taken from the ``LOG_LEVEL`` environment variable, else
    ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.stine",
        description="Scrape Stine Seed Company (independent Corn Belt breeder) — "
                    "corn + soybeans via sitemap enumeration + detail pages.")
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        ("--limit", {
            "type": int, "default": None,
            "help": "Stop after processing N varieties (default: all).",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists.",
        }),
        ("--crop", {
            "default": None, "choices": sorted(CROP_PATHS),
            "help": "Limit to one crop (corn / soybeans).",
        }),
        ("--product", {
            "default": None,
            "help": "Process a single variety by source_key or product code.",
        }),
        ("--enumerate", {
            "dest": "enumerate_via", "default": "sitemap",
            "choices": ["sitemap", "ajax"],
            "help": "Enumeration source (default: sitemap; ajax = full historical set).",
        }),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO"),
        }),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, set up logging on stderr, and
    return the exit code produced by ``run``."""
    opts = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper())
    return run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
        enumerate_via=opts.enumerate_via,
    )


if __name__ == "__main__":
    sys.exit(main())