seed-mcp/scrape/sources/robseeco.py

"""RobSeeCo (Rob-See-Co / Innotech) seed-guide scraper.

Source: the 2026 RobSeeCo Seed Guide PDF
(``robseeco.com/s/2026_RobSeeCo-Seed-Guide_FINAL-LR-Single.pdf`` — a
302 to a static1.squarespace.com asset). Rob-See-Co is an independent
seed company serving the Western/Central Corn Belt + Dakotas (Eastern
Corn Belt via Federal Hybrids / Kiser Seed); the guide carries the
Rob-See-Co corn line, the Rob-See-Co + Innotech soybean lines, plus
Masters Choice silage corn and sorghum (both OUT OF SCOPE for the
row-crop advisor — skipped).

This is a marketing-laid-out PDF, the hardest extraction class. The
value lives in two complementary places per crop:

1. A dense **ratings table** (corn p5-8, soy p19-26) — one row per
   hybrid/variety with ~14 single-digit 1-9 agronomic + disease
   ratings. The numeric column headers are ROTATED 90deg (rendered as
   reversed text), so we reconstruct each header label by clustering the
   rotated glyphs by their x-position, ordering bottom-to-top, and
   reversing the string. We then map each numeric data cell to its
   column by **x-center alignment** (the most reliable signal — the
   whitespace-tokenised text drops/duplicates cells around the sparse
   "Fungicide Response" / soil-fit columns and the trailing geo letters).

2. Descriptive **multi-column cards** (corn p9-18 in 2 columns, soy
   p21-26 in 3 columns) — code + trait variants (corn) + 3-6 marketing
   bullets. We split the page into equal-width columns, cluster each
   into lines, and group consecutive title lines (font size 14) + their
   following bullet lines (size 9.5) into one card. Cards enrich the
   table record with the corn trait-variant suffixes (-RR2 / -VT2P /
   -Conv / …) and the bullets (``strengths``).

The table is authoritative for the structured 1-9 ``characteristics_groups``;
cards add the trait stack + bullets. A table row that won't parse cleanly
(too few aligned numeric cells) falls back to a **card-only** record
(identity + bullets) rather than emit mis-assigned ratings.

EVERY content page in the PDF is DUPLICATED consecutively (p5 == p6,
p7 == p8, …), and the two covers carry mirrored text — we dedup by
source_key and only ever process the first occurrence of a code.

Output:
  corpus/robseeco/<source_key>.md
  corpus/robseeco/<source_key>.json

source_key: ``robseeco-<code>`` lowercased, e.g. ``robseeco-rc2500``,
``robseeco-is1162e3``.

CLI:
  python -m scrape.sources.robseeco --crop corn --limit 5
  python -m scrape.sources.robseeco --force
  python -m scrape.sources.robseeco --product robseeco-rc2500
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from collections import defaultdict
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import pdfplumber
import requests

# Scraper identity: version tag and the User-Agent header sent by
# RateLimitedSession.
SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# PDF_URL is the seed-guide download; PRODUCTS_URL is the human-facing page
# that render_markdown cites as the record's "Source".
PDF_URL = "https://www.robseeco.com/s/2026_RobSeeCo-Seed-Guide_FINAL-LR-Single.pdf"
PRODUCTS_URL = "https://www.robseeco.com/products"

# Filesystem layout. REPO_ROOT is two directories above this file's folder;
# the corpus root and the PDF cache path can each be overridden through an
# environment variable (CORPUS_ROOT / ROBSEECO_PDF_CACHE).
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "robseeco"
PDF_CACHE = Path(os.environ.get("ROBSEECO_PDF_CACHE")
                 or (REPO_ROOT / "var" / "robseeco_2026_seed_guide.pdf"))

# Default minimum spacing, in seconds, between requests made through
# RateLimitedSession.
REQ_INTERVAL_SEC = 1.5

# Scale direction — taken verbatim from the legend on p7-8 of the guide.
# render_markdown emits this string as the "Rating scale" header line of
# every record so the 1-9 values are never read without their direction.
SCALE_DIRECTION = (
    "Agronomic and disease ratings 1-9, 9=Best, 1=Worst, -=not available "
    "(higher is better). Plant Height 9=Tall/1=Short; Ear Height 9=High/1=Low. "
    "Planting Rate Guideline L=Low / ML=Medium-Low / M=Medium / MH=Medium-High / "
    "H=High. Disease/pest letter codes (soy): R=resistant, MR=moderately resistant, "
    "S=susceptible. Product Fit Geography A=All, C=Central (IA/MN/WI), "
    "E=East (IN/MI/OH/PA/MD), W=West (ND/SD/NE/KS/OK/TX & West), CW=Central+West."
)

# Single static regional-recommendation entry describing the dealer network.
# NOTE(review): not referenced in the visible part of the file — presumably
# written into the JSON sidecar; confirm against write_variety.
REGIONAL_REC = [{
    "product_list_name": ("RobSeeCo dealer network (Western/Central Corn Belt + "
                          "Dakotas; Eastern Corn Belt via Federal Hybrids/Kiser Seed)"),
    "agronomist": None,
    "agronomist_email": None,
    "variant_id": None,
}]

# Page sections (0-indexed). Content pages are duplicated consecutively,
# so the ranges deliberately span both copies — dedup by source_key
# handles the repeat.
# These values are used directly as indexes into ``pdf.pages`` by extract().
CORN_TABLE_PAGES = range(5, 9)      # p5-8
CORN_CARD_PAGES = range(9, 19)      # p9-18
SOY_TABLE_PAGES = range(19, 27)     # p19-26
SOY_CARD_PAGES = range(21, 27)      # p21-26 (soy cards interleave after the table)

# Module logger shared by the HTTP layer and the extraction passes.
log = logging.getLogger("scrape.robseeco")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Polite ``requests`` session with request spacing and retry/backoff.

    We make exactly one request (the PDF) so this is mostly
    belt-and-suspenders, but it keeps the source uniform with the other
    scrapers and gives retry/backoff on a flaky CDN.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        ``interval`` is the minimum number of seconds enforced between two
        consecutive requests issued through this object.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 90.0, **kw: Any) -> requests.Response:
        """Issue a request, retrying network errors, HTTP 429 and HTTP 5xx.

        Up to ``max_retries`` attempts are made. Between attempts we back off
        exponentially with jitter (capped at 30s), or honour an integer
        ``Retry-After`` header when the server sends one.

        Returns the first non-retryable response. If every attempt is used
        up, the outcome of the LAST attempt decides: a network error is
        re-raised, a 429/5xx response is returned so the caller can inspect
        it or call ``raise_for_status()``.

        Raises:
            ValueError: ``max_retries`` is smaller than 1.
            requests.RequestException: the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError(f"max_retries must be >= 1, got {max_retries}")
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout,
                                      allow_redirects=True, **kw)
            except requests.RequestException as exc:
                # Remember only the most recent outcome so a stale response
                # from an earlier attempt is never returned.
                last_exc, resp = exc, None
                if is_last:
                    # No further attempt follows — don't sleep for nothing.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, max_retries)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # We got a response: an earlier network error is no longer the
            # latest outcome and must not be raised at the end.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, max_retries)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


def fetch_pdf(http: RateLimitedSession, *, force: bool) -> Path:
    """Return the local path of the seed-guide PDF, downloading if needed.

    An existing cache file larger than 1 MB is reused unless ``force`` is
    set. Otherwise the guide is fetched from ``PDF_URL``, sanity-checked
    (PDF content type or ``%PDF`` magic bytes) and written to ``PDF_CACHE``.
    The scraper owns the download so the monthly refresh is reproducible.

    Raises ``RuntimeError`` when the response does not look like a PDF, and
    whatever ``raise_for_status()`` raises on an HTTP error status.
    """
    cache_is_usable = PDF_CACHE.exists() and PDF_CACHE.stat().st_size > 1_000_000
    if cache_is_usable and not force:
        log.info("using cached PDF %s (%d bytes)", PDF_CACHE, PDF_CACHE.stat().st_size)
        return PDF_CACHE

    PDF_CACHE.parent.mkdir(parents=True, exist_ok=True)
    log.info("downloading seed guide %s", PDF_URL)
    response = http.get(PDF_URL)
    response.raise_for_status()

    content_type = response.headers.get("Content-Type", "")
    # Either signal is enough: a PDF content type, or the PDF magic bytes.
    looks_like_pdf = "pdf" in content_type.lower() or response.content[:4] == b"%PDF"
    if not looks_like_pdf:
        raise RuntimeError(f"unexpected content-type {content_type!r} for {PDF_URL}")

    payload = response.content
    PDF_CACHE.write_bytes(payload)
    log.info("cached PDF -> %s (%d bytes)", PDF_CACHE, len(payload))
    return PDF_CACHE


# --------------------------------------------------------------------- model


@dataclass
class RSVariety:
    """One extracted hybrid/variety record.

    Built either from a ratings-table row (``from_table=True``, carries
    ``groups``) or, as a fallback, from a descriptive card alone
    (``from_table=False``, identity + bullets only).
    """

    # "robseeco-<code>" lowercased; used as the dedup key and output file stem.
    source_key: str
    crop: str                          # "corn" | "soybeans"
    brand: str                         # "Rob-See-Co" | "Innotech"
    product_name: str                  # the base code
    relative_maturity: int | None = None     # corn
    maturity_group: float | None = None      # soy
    # Trait labels: corn trait suffixes taken from the cards, or the soy
    # herbicide trait from the table / code suffix.
    trait_stack: list[str] = field(default_factory=list)
    # NOTE(review): never assigned in the visible part of the file.
    positioning: str | None = None
    # Marketing bullets copied from the matching card.
    strengths: list[str] = field(default_factory=list)
    # characteristics_groups: [{label, items:[{characteristic,value}]}]
    groups: list[dict] = field(default_factory=list)
    from_table: bool = True            # False = card-only fallback


# --------------------------------------------------------------------- header reconstruction


def reconstruct_rotated_headers(page: pdfplumber.page.Page,
                                top_max: float) -> list[tuple[float, str]]:
    """Rebuild the rotated column-header labels of a ratings-table page.

    Only non-upright words whose ``top`` lies above ``top_max`` are
    considered. Words are bucketed by their rounded ``x0`` (one bucket per
    header column); within a bucket the words are read from the largest
    ``top`` to the smallest and the concatenated text is reversed, which
    is the same as emitting each word reversed from the smallest ``top``
    to the largest.

    Returns ``[(x_center, label), ...]`` ordered by the bucket's rounded
    ``x0``; ``x_center`` is the mean word centre rounded to 2 decimals.
    """
    by_column: dict[int, list[dict]] = defaultdict(list)
    for word in page.extract_words(use_text_flow=False, keep_blank_chars=False,
                                   extra_attrs=["upright"]):
        if word["upright"] or not word["top"] < top_max:
            continue
        by_column[round(word["x0"])].append(word)

    headers: list[tuple[float, str]] = []
    for column_x in sorted(by_column):
        # Bottom glyph first, exactly as the column is read on the page.
        stack = sorted(by_column[column_x], key=lambda w: -w["top"])
        label = "".join(w["text"][::-1] for w in reversed(stack))
        centers = [(w["x0"] + w["x1"]) / 2 for w in stack]
        headers.append((round(sum(centers) / len(centers), 2), label))
    return headers


# --------------------------------------------------------------------- column maps
#
# The decoded (reversed) rotated-header label -> a canonical column key.
# Built once from the reconstructed headers per page; the per-row cell
# alignment then keys on x-center, so these maps only need to recognise
# the header *strings* the reconstruction produces.

# Canonical corn columns (decoded label fragments -> canonical key).
# We match on the de-spaced reversed string the reconstructor emits.
# Several fragments deliberately map to the same canonical key (e.g. both
# apostrophe spellings of Goss's Wilt, or the separate fragments of a
# multi-line header) so any fragment that is recovered resolves the column.
CORN_HEADER_CANON = {
    "RM": "RM",
    "SilktoRM": "RM to Silk",
    "BlacklayertoRM": "RM to Blacklayer",
    "BlacklayertoGDU": "GDU to Blacklayer",
    "Emergence": "Emergence",
    "VigorSeedling": "Seedling Vigor",
    "StrengthRoot": "Root Strength",
    "StrengthStalk": "Stalk Strength",
    "SnapGreen": "Green Snap",
    "Staygreen": "Staygreen",
    "Drydown": "Drydown",
    "ToleranceDrought": "Drought Tolerance",
    "HeightPlant": "Plant Height",
    "HeightEar": "Ear Height",
    "WeightTest": "Test Weight",
    "SpotLeafGray": "Gray Leaf Spot",
    "WiltGoss’s": "Goss's Wilt",
    "WiltGoss's": "Goss's Wilt",
    "CornNorthern": "Northern Corn Leaf Blight",
    "BlightLeaf": "Northern Corn Leaf Blight",
    "SpotTar": "Tar Spot",
    "ResponseFungicide": "Fungicide Response (Absence of Disease in Continuous Corn)",
    "DiseaseofAbsencein": "Fungicide Response (Absence of Disease in Continuous Corn)",
    "RatePlanting": "Planting Rate Guideline",
    "Guidelines": "Planting Rate Guideline",
    "SoilsProductiveHighly": "Continuous-corn fit: Highly Productive Soils",
    "SoilsVariable": "Continuous-corn fit: Variable Soils",
    "SoilsDrainedPoorly": "Continuous-corn fit: Poorly Drained Soils",
    "AgronomicCorn:onCorn": "Corn-on-Corn Agronomic Characteristics",
    "Characteristics": "Corn-on-Corn Agronomic Characteristics",
    "RegionRecommended": "Product Fit Geography",
}

# Corn classification of canonical column -> chunker bucket label.
# build_corn_colmap tags a column "disease" / "agronomic" by membership in
# these sets; any other canonical key is tagged "pass".
CORN_DISEASE = {
    "Gray Leaf Spot", "Goss's Wilt", "Northern Corn Leaf Blight", "Tar Spot",
    "Fungicide Response (Absence of Disease in Continuous Corn)",
}
CORN_AGRONOMIC = {
    "Emergence", "Seedling Vigor", "Root Strength", "Stalk Strength",
    "Green Snap", "Staygreen", "Drydown", "Drought Tolerance",
    "Plant Height", "Ear Height", "Test Weight",
}
# Everything else (RM-to-Silk/Blacklayer, GDU, planting rate, soil fit,
# product-fit geography, corn-on-corn) is passthrough placement context.
# This string is the group label used for those passthrough items.
CORN_PASSTHROUGH = "Planting & Placement"

# Canonical soy columns (decoded reversed-label fragments -> key). The
# soy reconstruction joins the two label words; match on the de-spaced
# string. Two-word labels like "Maturity Relative" reverse word-order.
# As with corn, several fragments may resolve to one canonical key.
SOY_HEADER_CANON = {
    "MaturityRelative": "Relative Maturity",
    "TraitHerbicide": "Herbicide Trait",
    "Herbicide": "Herbicide Trait",
    "TraitTolerance": "Herbicide Trait",
    "ResistanceSCN": "SCN Resistance Source",
    "Source": "SCN Resistance Source",
    "ChlorosisIron": "Iron Chlorosis (IDC) Tolerance",
    "(IDC)Tolerance": "Iron Chlorosis (IDC) Tolerance",
    "Phytophthora": "Phytophthora",   # disambiguated by x-order below
    "GeneResistance": "Phytophthora Gene Resistance",
    "(PRR)RatingField": "Phytophthora (PRR) Field Rating",
    "RotStemBrown": "Brown Stem Rot (BSR)",
    "(BSR)": "Brown Stem Rot (BSR)",
    "MoldWhite": "White Mold (SWM) Tolerance",
    "(SWM)Tolerance": "White Mold (SWM) Tolerance",
    "DeathSudden": "Sudden Death Syndrome (SDS) Tolerance",
    "(SDS)Syndrome": "Sudden Death Syndrome (SDS) Tolerance",
    "Emergence": "Emergence",
    "Standability": "Standability",
    "HeightPlant": "Plant Height for Maturity",
    "Maturityfor": "Plant Height for Maturity",
    "WidthCanopy": "Canopy Width / Plant Type",
    "SoilsVariable": "Variable Soils fit",
    "ProductiveHigh": "High Productive Soil fit",
    "Soil": "High Productive Soil fit",
    "StressandVariableEnvironments": "Stress & Variable Environments fit",
    "EnvironmentsYieldHigh": "High Yield Environments fit",
    "RegionRecommended": "Product Fit Geography",
    "Recommended": "Product Fit Geography",
    "Region": "Product Fit Geography",
}
# Bucket membership used by build_soy_colmap: "disease" / "agronomic" by
# set membership, everything else "pass".
SOY_DISEASE = {
    "SCN Resistance Source", "Iron Chlorosis (IDC) Tolerance",
    "Phytophthora Gene Resistance", "Phytophthora (PRR) Field Rating",
    "Brown Stem Rot (BSR)", "White Mold (SWM) Tolerance",
    "Sudden Death Syndrome (SDS) Tolerance",
}
SOY_AGRONOMIC = {
    "Emergence", "Standability", "Plant Height for Maturity",
    "Canopy Width / Plant Type",
}
# Group label for soy items that are neither disease nor agronomic.
SOY_PASSTHROUGH = "Placement"


def _despace(s: str) -> str:
    return re.sub(r"\s+", "", s or "")


def build_corn_colmap(headers: list[tuple[float, str]]) -> list[tuple[float, str, str]]:
    """Translate reconstructed corn headers into an x-sorted column map.

    Each ``(x_center, raw_label)`` whose de-spaced label is known to
    ``CORN_HEADER_CANON`` becomes ``(x_center, canonical_key, bucket)`` with
    ``bucket`` one of ``"disease"``, ``"agronomic"`` or ``"pass"``.
    Unrecognised labels are dropped.
    """
    colmap: list[tuple[float, str, str]] = []
    for center, label in headers:
        canonical = CORN_HEADER_CANON.get(_despace(label))
        if not canonical:
            continue
        bucket = (
            "disease" if canonical in CORN_DISEASE
            else "agronomic" if canonical in CORN_AGRONOMIC
            else "pass"
        )
        colmap.append((center, canonical, bucket))
    return sorted(colmap, key=lambda entry: entry[0])


def build_soy_colmap(headers: list[tuple[float, str]]) -> list[tuple[float, str, str]]:
    """Translate reconstructed soy headers into an x-sorted column map.

    Works like the corn variant but walks the headers left to right so the
    ambiguous bare "Phytophthora" label can be resolved by position: its
    first occurrence is taken as the gene-resistance column, any later
    occurrence is ignored (the ``(PRR)RatingField`` label covers the
    field-rating column).
    """
    colmap: list[tuple[float, str, str]] = []
    phyto_seen = False
    for center, label in sorted(headers, key=lambda h: h[0]):
        canonical = SOY_HEADER_CANON.get(_despace(label))
        if canonical == "Phytophthora":
            is_first = not phyto_seen
            phyto_seen = True
            if not is_first:
                continue
            canonical = "Phytophthora Gene Resistance"
        elif not canonical:
            continue
        bucket = (
            "disease" if canonical in SOY_DISEASE
            else "agronomic" if canonical in SOY_AGRONOMIC
            else "pass"
        )
        colmap.append((center, canonical, bucket))
    colmap.sort(key=lambda entry: entry[0])
    return colmap


# --------------------------------------------------------------------- table row parsing


# Row-leading product codes in the ratings tables.
# Corn: "RC" + 4 digits, or letter + 2 digits + "-" + 2 digits (e.g. D94-26).
_CORN_CODE = re.compile(r"^(RC\d{4}|[A-Z]\d{2}-\d{2})$")
# Soy: anything starting with "IS" or "RS" followed by a digit.
_SOY_CODE = re.compile(r"^(IS|RS)\d.*$")


def _nearest_col(cx: float, colmap: list[tuple[float, str, str]],
                 tol: float) -> tuple[str, str] | None:
    best = None
    bestd = tol
    for x, key, bucket in colmap:
        d = abs(x - cx)
        if d < bestd:
            bestd = d
            best = (key, bucket)
    return best


def _row_words(page: pdfplumber.page.Page) -> dict[int, list[dict]]:
    """Group upright words on a page into rows keyed by rounded top."""
    words = [w for w in page.extract_words(use_text_flow=False,
                                            keep_blank_chars=False,
                                            extra_attrs=["upright"])
             if w["upright"]]
    rows: dict[int, list[dict]] = defaultdict(list)
    for w in words:
        # snap near-equal tops together
        key = None
        for k in list(rows):
            if abs(k - w["top"]) < 4:
                key = k
                break
        rows[key if key is not None else round(w["top"])].append(w)
    return rows


def parse_corn_table(page: pdfplumber.page.Page,
                     colmap: list[tuple[float, str, str]]) -> list[RSVariety]:
    """Extract one ``RSVariety`` per parseable corn row on a table page.

    A row qualifies when its left-most word is a corn product code sitting
    at ``x0 <= 80`` and at least one of the remaining cells is numeric.
    Rows that ``_assemble_corn`` rejects are silently skipped.
    """
    numeric = re.compile(r"-?\d+(?:\.\d+)?")
    varieties: list[RSVariety] = []
    for row in _row_words(page).values():
        ordered = sorted(row, key=lambda w: w["x0"])
        if not ordered:
            continue
        lead, cells = ordered[0], ordered[1:]
        code = lead["text"]
        if lead["x0"] > 80 or not _CORN_CODE.match(code):
            continue
        # Header / legend lines carry no numbers; data rows always do.
        if not any(numeric.fullmatch(cell["text"]) for cell in cells):
            continue
        variety = _assemble_corn(code, cells, colmap)
        if variety:
            varieties.append(variety)
    return varieties


def _assemble_corn(code: str, cells: list[dict],
                   colmap: list[tuple[float, str, str]]) -> RSVariety | None:
    """Build a corn record from the cells of one table row.

    Every cell is assigned to the nearest column (within 13 x-units of its
    centre). The ``RM`` column feeds ``relative_maturity``; all other hits
    become ``{characteristic, value}`` items in their column's bucket.
    Returns ``None`` when fewer than 8 cells were placed or no RM could be
    parsed, so the caller can fall back to a card-only record.
    """
    sections: dict[str, list[dict]] = {"agronomic": [], "disease": [], "pass": []}
    rm: int | None = None
    placed = 0
    for cell in cells:
        center = (cell["x0"] + cell["x1"]) / 2
        # The page-number column sits at the far right — ignore it.
        if center > 738:
            continue
        match = _nearest_col(center, colmap, tol=13.0)
        if not match:
            continue
        key, bucket = match
        text = cell["text"].strip()
        if key == "RM":
            try:
                rm = int(float(text))
            except ValueError:
                pass
            continue
        # An empty cell is recorded as "-" so the column still shows up.
        target = sections.get(bucket, sections["pass"])
        target.append({"characteristic": key, "value": text or "-"})
        placed += 1

    # A clean corn row aligns into a healthy number of columns; if almost
    # nothing lined up, the caller falls back to a card-only record.
    if rm is None or placed < 8:
        return None

    labelled = (
        ("AGRONOMIC CHARACTERISTICS", sections["agronomic"]),
        ("DISEASE RATINGS", sections["disease"]),
        (CORN_PASSTHROUGH, sections["pass"]),
    )
    groups = [{"label": label, "items": items} for label, items in labelled if items]
    return RSVariety(
        source_key=f"robseeco-{code.lower()}",
        crop="corn",
        brand="Rob-See-Co",
        product_name=code,
        relative_maturity=rm,
        groups=groups,
    )


def parse_soy_table(page: pdfplumber.page.Page,
                    colmap: list[tuple[float, str, str]]) -> list[RSVariety]:
    """Extract one ``RSVariety`` per parseable soybean row on a table page.

    A row qualifies when its left-most word is a soy product code sitting
    at ``x0 <= 80``; rows that ``_assemble_soy`` rejects are skipped.
    """
    varieties: list[RSVariety] = []
    for row in _row_words(page).values():
        ordered = sorted(row, key=lambda w: w["x0"])
        if not ordered:
            continue
        lead = ordered[0]
        code = lead["text"]
        if lead["x0"] > 80 or not _SOY_CODE.match(code):
            continue
        variety = _assemble_soy(code, ordered[1:], colmap)
        if variety:
            varieties.append(variety)
    return varieties


def _assemble_soy(code: str, cells: list[dict],
                  colmap: list[tuple[float, str, str]]) -> RSVariety | None:
    """Build a soybean record from the cells of one table row.

    Cells are assigned to the nearest column (within 16 x-units of their
    centre) and all tokens landing in the same column are joined with a
    space — Phytophthora gene cells such as "Rps1c, Rps3a" arrive as two
    tokens. "Relative Maturity" feeds ``maturity_group`` and "Herbicide
    Trait" feeds ``trait_stack`` (falling back to the code suffix).
    Returns ``None`` when fewer than 6 cells were placed or the maturity
    value is missing/unparseable.
    """
    tokens_by_col: dict[str, list[str]] = defaultdict(list)
    bucket_of: dict[str, str] = {}
    placed = 0
    for cell in cells:
        center = (cell["x0"] + cell["x1"]) / 2
        if center > 738:
            continue
        match = _nearest_col(center, colmap, tol=16.0)
        if not match:
            continue
        key, bucket = match
        tokens_by_col[key].append(cell["text"].strip())
        bucket_of[key] = bucket
        placed += 1

    mg: float | None = None
    trait: str | None = None
    sections: dict[str, list[dict]] = {"agronomic": [], "disease": [], "pass": []}
    for key, tokens in tokens_by_col.items():
        value = " ".join(tok for tok in tokens if tok).strip()
        if key == "Relative Maturity":
            try:
                mg = float(value)
            except ValueError:
                pass
        elif key == "Herbicide Trait":
            trait = value or None
        else:
            target = sections.get(bucket_of[key], sections["pass"])
            target.append({"characteristic": key, "value": value or "-"})

    if mg is None or placed < 6:
        return None

    labelled = (
        ("AGRONOMIC CHARACTERISTICS", sections["agronomic"]),
        ("DISEASE RATINGS", sections["disease"]),
        (SOY_PASSTHROUGH, sections["pass"]),
    )
    return RSVariety(
        source_key=f"robseeco-{code.lower()}",
        crop="soybeans",
        brand="Innotech" if code.upper().startswith("IS") else "Rob-See-Co",
        product_name=code,
        maturity_group=mg,
        trait_stack=[trait] if trait else _trait_from_soy_code(code),
        groups=[{"label": label, "items": items} for label, items in labelled if items],
    )


def _trait_from_soy_code(code: str) -> list[str]:
    """Fallback trait from the code suffix (E3 / E3S / XF / XFS)."""
    m = re.search(r"(E3S|E3|XFS|XF)$", code.upper())
    if not m:
        return []
    mapping = {"E3": "E3", "E3S": "E3/STS", "XF": "XF", "XFS": "XF/STS"}
    return [mapping[m.group(1)]]


# --------------------------------------------------------------------- card parsing


# Try the hyphenated D94-26 form FIRST so a code like ``N97-55-Organic``
# splits to base ``N97-55`` + trait ``Organic`` rather than base ``N97``.
# Group 1 is the base code, optional group 2 everything after the next "-".
_CARD_CODE = re.compile(r"^([A-Z]\d{2}-\d{2}|[A-Z]{1,3}\d{2,4}[A-Z]?)(?:-(\S+))?$")

# Recognised corn trait suffixes — used to filter card-derived traits so a
# mis-split sibling fragment (e.g. "55-Conv") never leaks into trait_stack.
_KNOWN_CORN_TRAITS = {
    "Conv", "RR2", "VT2P", "DGVT2P", "GT", "GTA", "GT/LL", "GT/LLA", "SS",
    "SSP", "TRE", "AA", "DV", "D", "V", "VZ", "PCE", "3110A", "Artesian",
    "Organic",
}


# A soybean card code is the full variety code (IS0325E3 / RS3437XFS) —
# no trait suffix, so the whole token is the base.
# Matched case-insensitively: IS/RS prefix, 2-5 digits, E3/E3S/XF/XFS ending.
_SOY_CARD_CODE = re.compile(r"^(IS|RS)\d{2,5}(E3S|E3|XFS|XF)$", re.I)


def _card_base(token: str) -> tuple[str, str | None]:
    """Split a card title token into ``(base_code, trait_suffix)``.

    ``RC4185-VT2P`` -> ``("RC4185", "VT2P")``; ``D94-26-VT2P`` ->
    ``("D94-26", "VT2P")``. Soy codes such as ``IS0325E3`` and tokens that
    match neither pattern come back whole with ``None`` as the trait.
    """
    if _SOY_CARD_CODE.match(token):
        return token, None
    parsed = _CARD_CODE.match(token)
    return (parsed.group(1), parsed.group(2)) if parsed else (token, None)


def _is_card_code(token: str) -> bool:
    """Tell whether ``token`` looks like a soy or corn card title code."""
    if _SOY_CARD_CODE.match(token) is not None:
        return True
    return _CARD_CODE.match(token) is not None


def parse_cards(page: pdfplumber.page.Page, *, ncols: int = 2) -> list[dict]:
    """Parse the descriptive cards of one card page.

    The page is cut into ``ncols`` equal-width columns (corn cards use 2,
    soy cards 3). Inside a column, words between ``top`` 140 and 1180 are
    clustered into lines; a card is a run of title lines (size >= 13 and a
    valid card code) followed by its bullet lines. Sub-11pt lines that do
    not start with a bullet glyph are folded into the preceding bullet.

    Returns ``[{bases: set, traits: set, bullets: [str]}, ...]``.
    """
    page_width = page.width
    upright = [
        w for w in page.extract_words(use_text_flow=False,
                                      keep_blank_chars=False,
                                      extra_attrs=["upright", "size"])
        if w["upright"]
    ]
    cards: list[dict] = []
    for index in range(ncols):
        lo = page_width * index / ncols
        hi = page_width * (index + 1) / ncols
        column_words = sorted(
            (w for w in upright if lo <= w["x0"] < hi and 140 < w["top"] < 1180),
            key=lambda w: (round(w["top"]), w["x0"]),
        )

        # Cluster into lines: a word within 5 units of the line's first
        # word (its anchor) belongs to that line.
        lines: list[tuple[float, list[dict]]] = []
        for w in column_words:
            if lines and abs(w["top"] - lines[-1][0]) < 5:
                lines[-1][1].append(w)
            else:
                lines.append((w["top"], [w]))

        card: dict | None = None
        for _anchor, members in lines:
            ordered = sorted(members, key=lambda w: w["x0"])
            text = " ".join(w["text"] for w in ordered).strip()
            size = max(w["size"] for w in ordered)
            token = text.replace(" ", "")

            if size >= 13 and _is_card_code(token) and len(token) <= 20:
                # A title after bullets starts a new card; consecutive
                # titles are sibling codes of the same card.
                if card is None or card["bullets"]:
                    card = {"bases": set(), "traits": set(), "bullets": []}
                    cards.append(card)
                base, trait = _card_base(token)
                card["bases"].add(base)
                if trait:
                    card["traits"].add(trait)
                continue

            if text.startswith("•"):
                if card is not None:
                    card["bullets"].append(_clean_bullet(text.lstrip("• ")))
                continue

            if size < 11 and card is not None and card["bullets"]:
                card["bullets"][-1] = _clean_bullet(card["bullets"][-1] + " " + text)
    return cards


def _clean_bullet(s: str) -> str:
    s = re.sub(r"\s+", " ", (s or "")).strip()
    s = re.sub(r"\s*®", "®", s)             # "Artesian ®" -> "Artesian®"
    s = re.sub(r"\s*\bNEW\b\s*$", "", s).strip()  # trailing NEW badge
    return s


def build_card_groups(pdf: pdfplumber.PDF, pages: range, *, ncols: int = 2) -> list[dict]:
    """Collect the distinct card groups found on the given card pages.

    Each group is ``{bases: set(UPPER), traits: set, bullets: [...]}``.
    A page is identified by the sorted tuple of all base codes on it; a
    page whose signature was already seen (the duplicated copy) or that
    has no codes at all is skipped. A card may list several sibling codes
    for one hybrid (e.g. legacy A94-16 / G94-86 next to the table's
    D94-26); the grouping is kept so the caller can fold siblings into a
    single record instead of minting duplicate identities.
    """
    collected: list[dict] = []
    signatures: set[tuple] = set()
    for page_index in pages:
        cards = parse_cards(pdf.pages[page_index], ncols=ncols)
        signature = tuple(sorted(base for card in cards for base in card["bases"]))
        if not signature or signature in signatures:
            continue
        signatures.add(signature)
        collected.extend(
            {
                "bases": {base.upper() for base in card["bases"]},
                "traits": set(card["traits"]),
                "bullets": [bullet for bullet in card["bullets"] if bullet],
            }
            for card in cards
            if card["bases"]
        )
    return collected


# --------------------------------------------------------------------- extraction orchestration


# Corn trait suffix ordering preference for a stable trait_stack.
# _sorted_corn_traits ranks traits by their position in this list; a trait
# missing from it would sort last (rank 999), then alphabetically.
_CORN_TRAIT_ORDER = ["Conv", "RR2", "VT2P", "DGVT2P", "GT", "GTA", "GT/LL",
                     "GT/LLA", "SS", "SSP", "TRE", "AA", "DV", "D", "V", "VZ",
                     "PCE", "3110A", "Artesian", "Organic"]


def _sorted_corn_traits(traits: set[str]) -> list[str]:
    """Return the recognised corn traits in preferred display order.

    Anything not in ``_KNOWN_CORN_TRAITS`` is discarded; the rest is sorted
    by its position in ``_CORN_TRAIT_ORDER`` (unknown rank 999), with the
    trait name as tie-breaker.
    """
    rank = {name: position for position, name in enumerate(_CORN_TRAIT_ORDER)}
    recognised = {trait for trait in traits if trait in _KNOWN_CORN_TRAITS}

    def sort_key(trait: str) -> tuple[int, str]:
        return (rank.get(trait, 999), trait)

    return sorted(recognised, key=sort_key)


def _group_for_base(groups: list[dict], base: str) -> dict | None:
    for g in groups:
        if base in g["bases"]:
            return g
    return None


def extract(pdf: pdfplumber.PDF, *, only_crop: str | None) -> list[RSVariety]:
    """Extract every corn and/or soybean record from the opened seed guide.

    ``only_crop`` restricts the run to ``"corn"`` or ``"soybeans"``;
    ``None`` processes both. For each crop two passes are made:

    1. Table pass — every parseable ratings-table row becomes a record and
       is enriched with the bullets (and, for corn, the trait suffixes) of
       the card group that contains its code. A table page whose header
       reconstruction yields too few columns is skipped with a warning.
    2. Card-only pass — card groups with bullets and no table-backed code
       yield one fallback record (``from_table=False``).

    Records are keyed by ``source_key`` and only the first occurrence is
    kept, which absorbs the duplicated page copies. Returns the records in
    insertion order.
    """
    varieties: dict[str, RSVariety] = {}

    # ---- Corn ----------------------------------------------------------
    if only_crop in (None, "corn"):
        corn_groups = build_card_groups(pdf, CORN_CARD_PAGES)
        table_bases: set[str] = set()
        # Pass 1: table records (authoritative ratings), enriched from the
        # card group that contains the base.
        for pi in CORN_TABLE_PAGES:
            page = pdf.pages[pi]
            headers = reconstruct_rotated_headers(page, top_max=320)
            colmap = build_corn_colmap(headers)
            if len(colmap) < 12:
                log.warning("p%d corn header reconstruction thin (%d cols) — skipping",
                            pi, len(colmap))
                continue
            for v in parse_corn_table(page, colmap):
                if v.source_key in varieties:
                    continue
                base = v.product_name.upper()
                g = _group_for_base(corn_groups, base)
                if g:
                    v.trait_stack = _sorted_corn_traits(g["traits"])
                    v.strengths = list(g["bullets"])
                varieties[v.source_key] = v
                table_bases.add(base)
        # Pass 2: card-only fallback — one record per card group that has
        # NO table-backed sibling. Sibling legacy codes that share a card
        # with a table hybrid fold into that record (no duplicate identity).
        for g in corn_groups:
            if g["bases"] & table_bases or not g["bullets"]:
                continue
            # Pick a deterministic primary base (prefer an RC#### code).
            cand = sorted(b for b in g["bases"] if _CORN_CODE.match(b))
            if not cand:
                continue
            primary = sorted(cand, key=lambda b: (not b.startswith("RC"), b))[0]
            sk = f"robseeco-{primary.lower()}"
            if sk in varieties:
                continue
            varieties[sk] = RSVariety(
                source_key=sk, crop="corn", brand="Rob-See-Co",
                product_name=primary,
                trait_stack=_sorted_corn_traits(g["traits"]),
                strengths=list(g["bullets"]), from_table=False)

    # ---- Soy -----------------------------------------------------------
    if only_crop in (None, "soybeans"):
        soy_groups = build_card_groups(pdf, SOY_CARD_PAGES, ncols=3)
        soy_table_bases: set[str] = set()
        # Pass 1: table records, enriched with the matching card's bullets.
        for pi in SOY_TABLE_PAGES:
            page = pdf.pages[pi]
            headers = reconstruct_rotated_headers(page, top_max=330)
            colmap = build_soy_colmap(headers)
            if len(colmap) < 8:
                # Mirror the corn path: a skipped page must be visible in
                # the log instead of silently dropping its rows.
                log.warning("p%d soy header reconstruction thin (%d cols) — skipping",
                            pi, len(colmap))
                continue
            for v in parse_soy_table(page, colmap):
                if v.source_key in varieties:
                    continue
                g = _group_for_base(soy_groups, v.product_name.upper())
                if g:
                    v.strengths = list(g["bullets"])
                varieties[v.source_key] = v
                soy_table_bases.add(v.product_name.upper())
        # Pass 2: card-only fallback for card groups without a table row.
        for g in soy_groups:
            if g["bases"] & soy_table_bases or not g["bullets"]:
                continue
            cand = sorted(b for b in g["bases"] if _SOY_CODE.match(b))
            if not cand:
                continue
            primary = cand[0]
            sk = f"robseeco-{primary.lower()}"
            if sk in varieties:
                continue
            brand = "Innotech" if primary.startswith("IS") else "Rob-See-Co"
            varieties[sk] = RSVariety(
                source_key=sk, crop="soybeans", brand=brand, product_name=primary,
                trait_stack=_trait_from_soy_code(primary),
                strengths=list(g["bullets"]), from_table=False)

    return list(varieties.values())


# --------------------------------------------------------------------- render


def render_markdown(v: RSVariety) -> str:
    """Render one variety as the Markdown document stored in the corpus.

    The document is laid out as: an H1 with the product name, a bullet
    list of identity facts (vendor, brand, crop, maturity, traits, source,
    rating scale, service area), a notice when the record did not come
    from the ratings table, a horizontal rule, one H2 section per
    characteristics group, and a closing "Strengths" section when the
    variety carries any bullets.

    Returns the lines joined with ``"\\n"`` (no trailing newline is added
    beyond the final empty line of the last section).
    """
    is_corn = v.crop == "corn"
    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** RobSeeCo",
        f"- **Brand:** {v.brand}",
        f"- **Crop:** {'Corn' if is_corn else 'Soybeans'}",
    ]

    # Maturity is crop-specific: relative maturity (days) for corn,
    # maturity group for soybeans. Missing values produce no line.
    if is_corn:
        if v.relative_maturity is not None:
            lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    elif v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")

    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")

    lines += [
        f"- **Source:** {PRODUCTS_URL}",
        f"- **Rating scale:** {SCALE_DIRECTION}",
        "- **Service area:** RobSeeCo dealer network — Western/Central "
        "Corn Belt + Dakotas (Eastern Corn Belt via Federal Hybrids/Kiser Seed)",
        "",
    ]

    if not v.from_table:
        # Flag records that carry no structured ratings.
        lines.append("_Identity from the descriptive card; no structured ratings table "
                     "row matched._")
        lines.append("")

    lines.append("---")
    lines.append("")

    for group in v.groups:
        lines += [f"## {group['label'].title()}", ""]
        # A falsy rating value is rendered as an em dash placeholder.
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"])
        lines.append("")

    if v.strengths:
        lines += ["## Strengths", "", *(f"- {bullet}" for bullet in v.strengths), ""]

    return "\n".join(lines)


def write_variety(v: RSVariety) -> None:
    """Write one variety to the corpus as ``<source_key>.md`` + ``<source_key>.json``.

    The Markdown file holds the output of ``render_markdown``; the JSON
    sidecar holds the structured record plus provenance fields
    (``fetched_at`` is the current UTC time in ISO-8601 form). Both files
    are written as UTF-8 into ``CORPUS_DIR``, which is created on demand.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    stem = v.source_key
    markdown_path = CORPUS_DIR / f"{stem}.md"
    markdown_path.write_text(render_markdown(v), encoding="utf-8")

    # Key order is preserved in the emitted JSON, so keep it stable.
    sidecar = {
        # -- identity
        "source": "robseeco",
        "source_key": v.source_key,
        "vendor": "RobSeeCo",
        "brand": v.brand,
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        # -- agronomy
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": v.strengths,
        "characteristics_groups": v.groups,
        "_scale_direction": SCALE_DIRECTION,
        "regional_recommendations": REGIONAL_REC,
        # -- provenance
        "image_url": None,
        "source_urls": [PRODUCTS_URL],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }

    json_path = CORPUS_DIR / f"{stem}.json"
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False)
    json_path.write_text(payload + "\n", encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Fetch the seed-guide PDF, extract varieties and write them to the corpus.

    Args:
        limit: Maximum number of varieties to process (written *or*
            skipped); ``None`` processes all of them.
        force: Forwarded to ``fetch_pdf`` and, when true, existing
            Markdown files are overwritten instead of skipped.
        only_crop: Forwarded to ``extract`` to restrict the crop.
        only_product: When given, keep only varieties whose ``source_key``
            equals it (lower-cased) or whose product name matches
            case-insensitively.

    Returns:
        ``0`` on success, ``2`` when ``only_product`` matched no variety.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    session = RateLimitedSession()
    pdf_path = fetch_pdf(session, force=force)

    with pdfplumber.open(pdf_path) as pdf:
        varieties = extract(pdf, only_crop=only_crop)

    if only_product:
        wanted = only_product.lower()

        def _matches(candidate) -> bool:
            return (candidate.source_key == wanted
                    or candidate.product_name.lower() == wanted)

        varieties = [v for v in varieties if _matches(v)]
        if not varieties:
            log.error("no variety matched --product=%s", only_product)
            return 2

    # Deterministic processing order: crop, then brand, then key.
    varieties = sorted(varieties, key=lambda v: (v.crop, v.brand, v.source_key))
    total = len(varieties)

    tally = {"written": 0, "skipped": 0, "card_only": 0,
             "corn": 0, "soybeans": 0}

    # The limit caps how many varieties are visited, not how many are written.
    batch = varieties if limit is None else varieties[:max(limit, 0)]
    processed = 0
    for processed, v in enumerate(batch, start=1):
        already_there = (CORPUS_DIR / f"{v.source_key}.md").exists()
        if already_there and not force:
            tally["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, v.source_key)
            continue

        write_variety(v)
        tally["written"] += 1
        tally[v.crop] += 1
        card_only = not v.from_table
        if card_only:
            tally["card_only"] += 1

        maturity = v.relative_maturity if v.crop == "corn" else v.maturity_group
        log.info("[%d/%d] %s written | crop=%s brand=%s rm/mg=%s traits=%s "
                 "groups=%d strengths=%d%s",
                 processed, total, v.source_key, v.crop, v.brand,
                 maturity,
                 ",".join(v.trait_stack) or "-", len(v.groups), len(v.strengths),
                 " [CARD-ONLY]" if card_only else "")

    log.info("done: processed=%d written=%d skipped=%d card_only=%d "
             "| corn=%d soybeans=%d (of %d)",
             processed, tally["written"], tally["skipped"], tally["card_only"],
             tally["corn"], tally["soybeans"], total)
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(
        prog="scrape.sources.robseeco",
        description="Scrape the RobSeeCo (Rob-See-Co / Innotech) 2026 Seed Guide "
                    "PDF — corn + soybean variety identity + 1-9 ratings.")
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after writing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-download the PDF and re-write all variety files.")
    p.add_argument("--crop", default=None, choices=["corn", "soybeans"],
                   help="Limit to one crop.")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or code.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, configure logging, run the pipeline.

    ``argv`` is handed to ``ArgumentParser.parse_args`` unchanged. Log
    records go to stderr at the requested level (upper-cased before use).
    Returns the exit code produced by ``run``.
    """
    parser = _build_argparser()
    args = parser.parse_args(argv)

    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=args.log_level.upper(),
    )

    exit_code = run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
    return exit_code


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())