# seed-mcp/scrape/sources/first_choice.py

"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).

Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
seed company in Rushville, Indiana, serving the Eastern Corn Belt
(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
that is out of scope for the row-crop advisor).

Discovery is by **sitemap**, NOT the WP REST API: the catalog custom
post types (corn-hybrids / soybeans / wheat) are NOT exposed to
``/wp-json/`` (every variety route returns ``rest_no_route``). Instead we
fetch ``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
per-crop child sitemaps:

  - ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/``  (~52 URLs)
  - ``/soybeans-sitemap.xml``     -> ``/soybeans/<slug>/``      (~22 URLs)
  - ``/wheat-sitemap.xml``        -> ``/wheat/<slug>/``         (~4 URLs)

robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
between requests.

Detail-page DOM (server-rendered, no JS needed for the text):
  * Product name: the second ``<h1>`` inside ``article.content`` (the
    first is the site logo "1st Choice Seeds").
  * Corn — three ``<h2>`` sections + a side table:
      - "Hybrid Characteristics": a single ``<p>`` of ``label • value``
        lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
        Placement, Root Rating, Stalk Rating, Foliar Health, Drydown,
        Ear Length/Girth/Flex, Test Weight). Some hybrids only publish
        Seedling Vigor (genuinely thin pages — still written).
      - "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
        (the numeric 0-10 bars are drawn client-side by d3 and are NOT
        in the HTML). The legend IS the scale: 0-4 Below Average … 9-10
        Superior, so higher = better.
      - "Management Tips": ``label: value`` lines (Corn-On-Corn,
        Productivity / soil guidance, Silage Rating).
      - A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
        the Low/Medium/High recommended planting populations.
  * Soybeans — three ``<h2>`` sections:
      - "Field Notes": a ``<ul>`` of strengths (often includes SCN
        source / PRR gene call-outs).
      - "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
      - "Variety Description": ``div`` blocks of ``<b>Label:</b> value``
        pairs (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower
        Color, Pubescence, Pod, Hilum).
  * Wheat — thin (title + date only; wheat is private-label). We still
    write an identity record so the variety is discoverable.

Rating scale: the published legend is **0-10, higher = better**
("Below Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8,
Superior 9-10"). 1st Choice publishes the *qualitative* word
(Excellent / Very Good / …) in the HTML — those map directly onto that
legend — while the numeric bar is d3-rendered and absent from the
markup. NA / blank = not rated.

Output:
  corpus/first_choice/<source_key>.md
  corpus/first_choice/<source_key>.json

source_key: ``firstchoice-<slug>`` lowercased, e.g.
``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.

CLI:
  python -m scrape.sources.first_choice --crop corn --limit 5
  python -m scrape.sources.first_choice --force
  python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Version stamp recorded as "scraper_version" in every JSON sidecar.
SCRAPER_VERSION = "0.1.0"
# Descriptive User-Agent (with a contact URL) set on the shared session.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root; sitemap URLs are built by appending to it.
BASE = "https://www.1stchoiceseeds.com"
# The sitemap *index*; the per-crop child sitemaps are fetched separately.
SITEMAP_INDEX = f"{BASE}/sitemap.xml"

# Chunker crop value -> per-crop child sitemap filename (relative to
# BASE). The chunker keys on "soybeans" (plural) for the MG branch, so
# the keys are spelled accordingly. The cover-crops sitemap is
# intentionally omitted (out of scope for the row-crop advisor).
# The keys also serve as the valid ``--crop`` CLI choices.
CROP_SITEMAPS = {
    "corn": "corn-hybrids-sitemap.xml",
    "soybeans": "soybeans-sitemap.xml",
    "wheat": "wheat-sitemap.xml",
}

# Chunker crop value -> URL path prefix that confirms a sitemap entry is
# a variety detail page (vs. a category/archive page that can sneak into
# a child sitemap). discover() looks for it as a substring of each
# <loc> URL.
CROP_PATH = {
    "corn": "/corn-hybrids/",
    "soybeans": "/soybeans/",
    "wheat": "/wheat/",
}

# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple min.
# Used as the default minimum spacing between requests.
REQ_INTERVAL_SEC = 1.2

# Human-readable description of the vendor's rating scale. Emitted
# verbatim in the markdown header ("Rating scale") and in the JSON
# sidecar under "_scale_direction".
RATING_SCALE_DIRECTION = (
    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
    "publishes the qualitative word in HTML (the numeric bar is "
    "d3-rendered, not in markup); blank/NA = not rated"
)

# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
# Entries are lowercase because they are compared against the
# lowercased label.
_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}

# Slug token -> human-readable trait label. Best-effort: _trait_stack()
# looks up every "-"-separated slug token here and silently skips any
# token that has no entry (model identifiers and unmapped suffixes
# alike), so an unknown trait suffix produces no label.
TRAIT_LABELS = {
    # corn
    "vt2p": "VT Double PRO (VT2P)",
    "gt": "Glyphosate Tolerant (GT)",
    "c": "Conventional",
    "pc": "PowerCore (PC)",
    "tre": "Trecepta (TRE)",
    "ss": "SmartStax (SS)",
    "v": "VT (V)",
    "dv": "Double VT (DV)",
    "aa": "Agrisure Artesian (AA)",
    # soybeans
    "en": "Enlist E3 (EN)",
    "xf": "XtendFlex (XF)",
    "sts": "STS",
    # wheat
    "b": "Bin-run / branded (B)",
    "s": "Soft (S)",
}

# Third ancestor directory of this (resolved) file path.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Output root: the CORPUS_ROOT environment variable when set and
# non-empty, otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# All files written by this scraper land in this per-source directory.
CORPUS_DIR = CORPUS_ROOT / "first_choice"

# Module logger; handlers/level are configured by main().
log = logging.getLogger("scrape.first_choice")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Polite ``requests`` session: fixed spacing between calls plus
    bounded retry with backoff.

    The 1st Choice row-crop catalog is small (~78 detail pages + 4
    sitemaps) so 1.2 s/req still finishes in a couple minutes.

    Retries are attempted on network errors, HTTP 429 and HTTP 5xx. The
    outcome of the *last* attempt is what the caller sees: a network
    error is re-raised, an HTTP error response is returned so the caller
    can inspect it or call ``raise_for_status()``.
    """

    # Ceiling (seconds) for the exponential backoff between attempts.
    MAX_BACKOFF_SEC = 30.0
    # Ceiling (seconds) for a server-supplied numeric Retry-After, so a
    # very large value cannot stall the whole run.
    MAX_RETRY_AFTER_SEC = 120.0

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between
        consecutive requests, then record the new request time."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def _backoff(self, attempt: int, retry_after: str | None = None) -> float:
        """Return the number of seconds to sleep before the next attempt.

        A purely numeric ``Retry-After`` header value is honoured (capped
        at ``MAX_RETRY_AFTER_SEC``); any other value, including the
        HTTP-date form, falls back to jittered exponential backoff capped
        at ``MAX_BACKOFF_SEC``.
        """
        if retry_after:
            seconds = retry_after.strip()
            # isdecimal() only admits digits that float() can parse.
            if seconds.isdecimal():
                return min(self.MAX_RETRY_AFTER_SEC, float(seconds))
        return min(self.MAX_BACKOFF_SEC, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying on network errors and 429/5xx.

        Args:
            method: HTTP method name.
            url: Target URL.
            max_retries: Total number of attempts (must be >= 1).
            timeout: Per-attempt timeout passed to ``requests``.
            **kw: Forwarded to ``requests.Session.request``.

        Returns:
            The first response that is not 429/5xx, or the last 429/5xx
            response once all attempts are used up.

        Raises:
            ValueError: If ``max_retries`` is less than 1.
            requests.RequestException: If the final attempt failed with a
                network-level error.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")
        for attempt in range(max_retries):
            final = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final:
                    # No further attempt follows, so do not sleep; surface
                    # the error from this (the last) attempt.
                    log.warning("network error on %s %s: %s — giving up "
                                "after %d attempts", method, url, exc,
                                max_retries)
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    # Hand the error response back instead of an older,
                    # unrelated network exception.
                    log.warning("HTTP %d on %s %s — giving up after %d "
                                "attempts", resp.status_code, method, url,
                                max_retries)
                    return resp
                backoff = self._backoff(attempt,
                                        resp.headers.get("Retry-After"))
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        # Every iteration either returns, raises, or continues to a later
        # iteration, and the final iteration never continues.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class FCVariety:
    """One scraped 1st Choice variety: identity fields plus whatever the
    detail page yielded. Built by ``parse_detail`` and consumed by
    ``render_markdown`` / ``write_variety``."""
    source_key: str                    # "firstchoice-<slug>"; also the output file stem
    source_url: str                    # detail-page URL taken from the sitemap
    crop: str                          # chunker value: corn / soybeans / wheat
    product_name: str = ""             # "FC 8455 VT2P"
    relative_maturity: int | None = None      # corn (days)
    maturity_group: float | None = None       # soy
    # wheat; no parser in this module assigns it, so it stays None here
    wheat_class: str | None = None             # wheat
    # Human-readable trait labels derived from the slug tokens.
    trait_stack: list[str] = field(default_factory=list)
    # Soy only: set to the first Field Notes bullet when one exists.
    positioning: str | None = None
    # Soy only: the Field Notes bullets.
    strengths: list[str] = field(default_factory=list)
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    # Raw <lastmod> text from the sitemap entry, if present.
    sitemap_last_modified: str | None = None


# --------------------------------------------------------------------- discovery (sitemaps)


# Sitemaps are scanned with regexes rather than an XML parser. Each
# value pattern tolerates surrounding whitespace and an optional
# <![CDATA[ ... ]]> wrapper; matching is case-insensitive and may span
# lines.
# Captures the URL inside a <loc> element.
_LOC_RE = re.compile(r"<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</loc>",
                     re.IGNORECASE | re.DOTALL)
# Captures the inner text of one <url> entry (its <loc>/<lastmod> pair).
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
# Captures the timestamp text inside a <lastmod> element.
_LASTMOD_RE = re.compile(r"<lastmod>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</lastmod>",
                         re.IGNORECASE | re.DOTALL)


def _slug_from_url(url: str) -> str:
    return url.rstrip("/").rsplit("/", 1)[-1].lower()


def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return [{crop, url, slug, lastmod}] for in-scope row-crop varieties
    by walking the per-crop child sitemaps under /sitemap.xml.

    We fetch each known child sitemap directly (their names are stable
    All-in-One-SEO conventions) rather than trusting the index ordering,
    but we still confirm against the index so a renamed sitemap is caught.

    Args:
        http: Session used for every fetch.
        only_crop: When set, only this CROP_SITEMAPS key is walked.

    Returns:
        One dict per variety, in sitemap order, de-duplicated by slug
        within each crop. ``lastmod`` is the raw <lastmod> text or None.

    Raises:
        requests.RequestException: If a child sitemap cannot be fetched
            or answers with an error status other than 404 (a 404 child
            sitemap is logged and skipped).
    """
    # Pull the sitemap index once so we can warn if a crop sitemap is
    # missing/renamed (defensive; we still target the known names).
    index_locs: set[str] = set()
    try:
        idx = http.get(SITEMAP_INDEX)
        idx.raise_for_status()
        index_locs = {m.strip() for m in _LOC_RE.findall(idx.text)}
    except requests.RequestException as exc:
        log.warning("could not read sitemap index %s: %s (continuing with "
                    "known child sitemap names)", SITEMAP_INDEX, exc)

    records: list[dict] = []
    for crop, child in CROP_SITEMAPS.items():
        if only_crop and crop != only_crop:
            continue
        child_url = f"{BASE}/{child}"
        if index_locs and child_url not in index_locs:
            log.warning("crop sitemap %s not listed in the index — site may "
                        "have renamed it; trying anyway", child_url)
        r = http.get(child_url)
        if r.status_code == 404:
            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
            continue
        r.raise_for_status()
        prefix = CROP_PATH[crop]
        # The crop archive page itself (e.g. ".../corn-hybrids/") contains
        # the prefix too; its last path segment is the prefix's own
        # segment, which is how we recognise and drop it below.
        archive_slug = prefix.strip("/")
        seen: set[str] = set()
        n = 0
        for block in _URL_BLOCK_RE.findall(r.text):
            loc_m = _LOC_RE.search(block)
            if not loc_m:
                continue
            url = loc_m.group(1).strip()
            if prefix not in url:
                continue  # unrelated page leaked into the sitemap
            slug = _slug_from_url(url)
            if not slug or slug == archive_slug or slug in seen:
                continue  # empty, archive page, or duplicate entry
            seen.add(slug)
            lm_m = _LASTMOD_RE.search(block)
            records.append({
                "crop": crop,
                "url": url,
                "slug": slug,
                "lastmod": lm_m.group(1).strip() if lm_m else None,
            })
            n += 1
        log.info("crop sitemap %-22s (%s): %d varieties", child, crop, n)
    log.info("total varieties discovered: %d", len(records))
    return records


# --------------------------------------------------------------------- detail parse


def _clean(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()


def _direct_text(el: Tag) -> str:
    """Return the cleaned text of *el*'s immediate string children.

    Text nested inside child tags is left out; only strings that are
    direct children are concatenated and passed through ``_clean``.
    """
    own_strings = [child for child in el.children
                   if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))


def _br_lines(el: Tag) -> list[str]:
    """Return the non-empty, stripped text lines of *el*, with ``<br>``
    acting as a line break.

    The element is only read, never modified, so the tree stays intact
    for any parser that runs afterwards.

    ``get_text("\\n")`` puts a newline between every pair of text nodes,
    so the text on either side of a ``<br>`` already lands on separate
    lines without rewriting the ``<br>`` tags. (Text inside inline child
    tags is likewise split onto its own line.)
    """
    stripped = (ln.strip() for ln in el.get_text("\n").split("\n"))
    return [ln for ln in stripped if ln]


def _product_name(article: Tag, slug: str) -> str:
    """Pick the variety name from the headings of *article*.

    The first ``<h1>`` whose cleaned text is non-empty and is not the
    site logo ("1st Choice Seeds", compared case-insensitively) wins.
    When no heading qualifies, the slug is upper-cased with dashes turned
    into spaces.
    """
    headings = (_clean(h1.get_text(" ", strip=True))
                for h1 in article.find_all("h1"))
    name = next((text for text in headings
                 if text and text.lower() != "1st choice seeds"), None)
    if name is not None:
        return name
    return slug.upper().replace("-", " ")


def _trait_stack(slug: str, crop: str) -> list[str]:
    """Map the tokens of *slug* to trait labels (e.g. fc-8455-vt2p ->
    VT2P, fb-3545-c-sts -> Conventional + STS).

    Every ``-``-separated token is looked up in ``TRAIT_LABELS``,
    left to right, including the leading ones; model tokens such as
    fc / 8455 / 20rw36 drop out only because they have no entry there.
    Tokens without an entry are skipped, duplicates are collapsed, and
    first-seen order is kept. *crop* is accepted but not consulted.
    """
    tokens = (piece.lower() for piece in slug.split("-"))
    labels = (TRAIT_LABELS[tok] for tok in tokens if tok in TRAIT_LABELS)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(labels))


def _parse_corn(article: Tag, v: FCVariety) -> None:
    """Fill *v* with corn data read from *article*.

    Reads three places: the "Hybrid Characteristics" paragraph, the
    "Management Tips" paragraph, and every table row (Relative Maturity,
    Degree Days, planting populations). Sets ``v.relative_maturity`` and
    appends up to three groups to ``v.groups`` (agronomic, disease,
    management), each only when it has items.
    """

    def section_lines(title: str) -> list[str]:
        # Lines of the <p> that directly follows the first <h2> whose
        # cleaned text equals `title`; empty when either is missing.
        for heading in article.find_all("h2"):
            if _clean(heading.get_text()) != title:
                continue
            para = heading.find_next_sibling()
            if para is not None and para.name == "p":
                return _br_lines(para)
            return []
        return []

    plant_items: list[dict] = []
    disease_items: list[dict] = []
    mgmt_items: list[dict] = []

    # Hybrid Characteristics: "label • value" lines, with "label: value"
    # as the fallback; a line with neither separator is a bare label.
    for line in section_lines("Hybrid Characteristics"):
        separator = "•" if "•" in line else ":"
        label, _, value = line.partition(separator)
        label, value = _clean(label), _clean(value)
        if not label:
            continue
        entry = {"characteristic": label, "value": value}
        if label.lower() in _CORN_DISEASE_LABELS:
            disease_items.append(entry)
        else:
            plant_items.append(entry)

    # Management Tips: only "label: value" lines with both halves present;
    # footer noise (© / "rights reserved") is rejected.
    for line in section_lines("Management Tips"):
        label, colon, value = line.partition(":")
        if not colon:
            continue
        label, value = _clean(label), _clean(value)
        if not (label and value):
            continue
        if label.startswith("©") or "rights reserved" in line.lower():
            continue
        mgmt_items.append({"characteristic": label, "value": value})

    # Tables: Relative Maturity / Degree Days rows plus the Low/Medium/High
    # planting-population rows.
    populations: list[str] = []
    for table in article.find_all("table"):
        for row in table.find_all("tr"):
            texts = (_clean(cell.get_text(" ", strip=True))
                     for cell in row.find_all(["td", "th"]))
            cells = [text for text in texts if text]
            if not cells:
                continue
            first = cells[0].lower()
            has_value = len(cells) >= 2
            if has_value and first.startswith("relative maturity"):
                digits = re.search(r"(\d+)", cells[1])
                if digits:
                    v.relative_maturity = int(digits.group(1))
                # Relative Maturity always leads the agronomic list.
                plant_items.insert(0, {"characteristic": "Relative Maturity",
                                       "value": cells[1]})
            elif has_value and first.startswith("degree days"):
                plant_items.append({"characteristic": "Degree Days (GDU)",
                                    "value": cells[1]})
            else:
                joined = " ".join(cells).lower()
                if joined.startswith("low") and (
                        "medium" in joined or "high" in joined):
                    populations.append(" / ".join(cells))
    if populations:
        mgmt_items.append({"characteristic": "Recommended Planting Population",
                           "value": "; ".join(populations)})

    for group_label, items in (
            ("AGRONOMIC CHARACTERISTICS", plant_items),
            ("DISEASE RATINGS", disease_items),
            ("MANAGEMENT", mgmt_items)):
        if items:
            v.groups.append({"label": group_label, "items": items})


def _parse_soy(article: Tag, v: FCVariety) -> None:
    """Fill *v* with soybean data read from *article*.

    "Field Notes" bullets become ``v.strengths`` (the first one also
    becomes ``v.positioning`` when none is set). "Variety Description"
    ``<b>Label:</b> value`` divs become one AGRONOMIC CHARACTERISTICS
    group; the "Maturity" entry additionally sets ``v.maturity_group``.
    """

    def heading(title: str):
        # First <h2> whose cleaned text equals `title`, or None.
        return next((h for h in article.find_all("h2")
                     if _clean(h.get_text()) == title), None)

    # Field Notes -> strengths.
    notes_h2 = heading("Field Notes")
    notes_list = notes_h2.find_next_sibling() if notes_h2 is not None else None
    if notes_list is not None and notes_list.name == "ul":
        bullets = (_clean(li.get_text(" ", strip=True))
                   for li in notes_list.find_all("li"))
        v.strengths = [text for text in bullets if text]
        if v.strengths and not v.positioning:
            v.positioning = v.strengths[0]

    # Variety Description -> descriptor items.
    stop_classes = ("btn", "right-bar", "right-navigation",
                    "address", "wrapper")
    descriptors: list[dict] = []
    desc_h2 = heading("Variety Description")
    following = desc_h2.find_all_next() if desc_h2 is not None else ()
    for node in following:
        if node.name == "h2" and node is not desc_h2:
            break  # next section starts
        if not isinstance(node, Tag):
            continue
        if node.name != "div":
            continue
        classes = node.get("class") or []
        # Action buttons / right-nav / footer region ends the section.
        if any(name in classes for name in stop_classes):
            break
        bold = node.find("b", recursive=False)
        if bold is None:
            continue
        label = _clean(bold.get_text(" ", strip=True)).rstrip(":")
        value = _direct_text(node)
        if not label:
            continue
        if label.lower() == "maturity":
            number = re.search(r"[\d.]+", value)
            if number is not None:
                try:
                    v.maturity_group = float(number.group(0))
                except ValueError:
                    # e.g. a lone "." — keep the raw text only.
                    pass
            label = "Maturity Group"
        descriptors.append({"characteristic": label, "value": value})
    if descriptors:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS",
                         "items": descriptors})


def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
    """Fetch one variety detail page and parse it into an ``FCVariety``.

    Args:
        http: Session used for the fetch.
        rec: A discovery record with ``crop``, ``slug``, ``url`` and an
            optional ``lastmod``.

    Returns:
        The populated variety. Wheat pages get identity fields only.

    Raises:
        requests.HTTPError: If the page answers with an error status.
    """
    crop, slug, url = rec["crop"], rec["slug"], rec["url"]
    variety = FCVariety(
        source_key=f"firstchoice-{slug}",
        source_url=url,
        crop=crop,
        trait_stack=_trait_stack(slug, crop),
        sitemap_last_modified=rec.get("lastmod"),
    )

    response = http.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Fall back to the whole document when article.content is absent.
    article = soup.find("article", class_="content") or soup
    variety.product_name = _product_name(article, slug)

    # wheat: thin pages — identity only (no spec sections to parse).
    crop_parser = {"corn": _parse_corn, "soybeans": _parse_soy}.get(crop)
    if crop_parser is not None:
        crop_parser(article, variety)
    return variety


# --------------------------------------------------------------------- render


def render_markdown(v: FCVariety) -> str:
    """Render *v* as the markdown body written to the corpus.

    Layout: title, a bullet list of identity facts, the optional
    positioning line and Field Notes, then one ``##`` section per
    characteristics group. A wheat record without groups gets an
    explanatory note instead. Lines are joined with ``\\n``.
    """
    known_crops = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = known_crops[v.crop] if v.crop in known_crops else v.crop.title()

    out: list[str] = []
    out.append(f"# {v.product_name}")
    out.append("")
    out.append("- **Vendor:** 1st Choice Seeds (independent, employee-owned)")
    out.append("- **Brand:** 1st Choice Seeds")
    out.append(f"- **Crop:** {crop_label}")

    # At most one crop-specific maturity/class line.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            out.append(f"- **Relative maturity:** {v.relative_maturity} day")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            out.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_class:
            out.append(f"- **Wheat class:** {v.wheat_class}")

    if v.trait_stack:
        out.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    out.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** 1st Choice Seeds dealer network — "
        "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN",
        "",
    ])

    if v.positioning:
        out.extend(["---", "", f"_{v.positioning}_", ""])
    if v.strengths:
        out.extend(["---", "", "## Field Notes", ""])
        out.extend(f"- {note}" for note in v.strengths)
        out.append("")
    out.extend(["---", ""])

    for group in v.groups:
        out.append(f"## {group['label'].title()}")
        out.append("")
        for item in group["items"]:
            name = item["characteristic"]
            shown = item["value"] or "—"  # blank value renders as a dash
            out.append(f"- **{name}:** {shown}")
        out.append("")

    if v.crop == "wheat" and not v.groups:
        out.append("_Identity record only — 1st Choice wheat is private-label "
                   "and the catalog page carries no agronomic spec block._")
        out.append("")
    return "\n".join(out)


def write_variety(v: FCVariety, body_md: str) -> None:
    """Write the JSON sidecar and the markdown body for *v*.

    Both files go to ``CORPUS_DIR`` as ``<source_key>.json`` and
    ``<source_key>.md`` (UTF-8). The directory is created if needed.

    The sidecar is written first and the markdown last: ``run()`` treats
    an existing ``.md`` as "already scraped", so the marker file must
    only appear once its sidecar is on disk. If the process dies between
    the two writes, the next run re-scrapes the variety instead of
    leaving it without a sidecar.

    Args:
        v: The parsed variety.
        body_md: Markdown produced for *v*.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "source": "first_choice",
        "source_key": v.source_key,
        "vendor": "1st Choice Seeds",
        "brand": "1st Choice Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": v.wheat_class,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": v.strengths,
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "1st Choice Seeds dealer network "
                                  "(Eastern Corn Belt — IN/OH/KY/TN)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    # Markdown goes last: its existence is the skip marker in run().
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties, scrape each detail page, and write the corpus.

    Args:
        limit: Stop after this many discovered varieties have been
            considered (skipped ones count too); ``None`` means all.
        force: Re-scrape even when the markdown file already exists.
        only_crop: Restrict discovery to one CROP_SITEMAPS key.
        only_product: Restrict to one variety, matched (lowercased)
            against either the source_key or the bare slug.

    Returns:
        0 on completion, 2 when ``only_product`` matched nothing.
        Per-variety fetch failures are logged and counted, not fatal.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"firstchoice-{r['slug']}" == key or r["slug"] == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"firstchoice-{rec['slug']}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec)
        except requests.RequestException as exc:
            # RequestException covers HTTPError (bad status) as well as
            # connection errors and timeouts that survive the retries, so
            # one unreachable page cannot abort the rest of the run.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
                        processed, len(records), source_key,
                        "; thin wheat page" if v.crop == "wheat" else "")
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks so a parsed value of 0 is still shown.
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 maturity if maturity is not None else "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], len(records))
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (whose default comes from the LOG_LEVEL environment
    variable, falling back to INFO).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.first_choice",
        description="Scrape 1st Choice Seeds (independent, employee-owned — "
                    "Rushville, IN) — corn / soybeans / wheat via sitemaps "
                    "+ detail pages.")
    # (flag, add_argument keyword arguments), registered in this order.
    options: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--limit", {
            "type": int, "default": None,
            "help": "Stop after processing N varieties (default: all)."}),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists."}),
        ("--crop", {
            "default": None, "choices": sorted(CROP_SITEMAPS),
            "help": "Limit to one crop (corn / soybeans / wheat)."}),
        ("--product", {
            "default": None,
            "help": "Process a single variety by source_key or slug."}),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO")}),
    )
    for flag, settings in options:
        parser.add_argument(flag, **settings)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, set up logging on stderr, and run.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by ``run``.
    """
    parser = _build_argparser()
    options = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=options.log_level.upper())
    exit_code = run(
        limit=options.limit,
        force=options.force,
        only_crop=options.crop,
        only_product=options.product,
    )
    return exit_code


# Run as a script/module: use main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())