# seed-mcp/scrape/sources/ohio_ocpt_trials.py

"""Ohio Corn & Soybean Performance Test (OCPT/OSPT) — independent,
cross-vendor yield trials (data_type=trial).

Source: ``ohiocroptest.cfaes.osu.edu`` — The Ohio State University /
CFAES extension publishes the annual Ohio Corn Performance Test and
Ohio Soybean Performance Trials as full-report PDFs with a real text
layer. These are *third-party* university trials: every brand that
pays the entry fee is evaluated head-to-head at the SAME sites, so a
single report ranks CHANNEL, DEKALB, NK, Golden Harvest, LG Seeds,
Augusta, Ebberts, Seed Consultants, etc. against each other — the
highest-value class of trial data because no vendor controls it.

This is the FOURTH ``data_type: "trial"`` source family after the
vendor plot reports (gh / lg / agrigold / proharvest). Unlike those —
which are ONE plot per report — the OCPT report is ONE PDF carrying a
dozen multi-site tables, each table laying out several SITES as
side-by-side column groups (Hebron | Washington CH | South Charleston
| Covington | Summary). We split each report into ONE sidecar per
SITE (and one per regional Summary), so the corpus's per-site shape
matches the vendor plot reports and the trial chunker's shared
``_render_gh_plot_chunk`` renderer handles it with NO chunk.py edit
(we emit ``results: [{rank, brand, product, traits, metrics}]`` with a
canonical ``"Yield"`` metric key).

PDF layout (corn, e.g. CountryJournal2025.pdf):
  Table 1E/1L/4E/4L/7E/7L : single-year per-site tables. A site-name
    line names the column groups; each hybrid row carries 5 numbers
    (Yield, Harv.Mst, Stk.Ldg, Final Std, Emergence) PER site, then a
    single trailing TW (test weight, summary-level). These are the
    gold per-site data — we emit one sidecar per real site (+ Summary).
  Table 2/3/5/6/8/9 : multi-year / combined regional summaries — same
    column-group structure (year/region groups). We emit these as
    region-level summary sidecars (the Summary group of the matching
    single-year table already covers the site mean, so these add the
    2-yr / 3-yr / statewide aggregates).
  Table 10 : hybrid -> technology-traits lookup (RR,CB,TRE,...). Parsed
    into a traits map and joined onto every result's ``traits`` field.

PDF layout (soybeans, e.g. 2025OCJwithproteinandoil.pdf): a different
column order — ``Variety | Brand | Type | Seed Treatment | RM | <per-
site yields> | Mean``. Region (North/Central/South) x maturity
(Early/Late) tables, with site codes (N1/N2, C1/C2, S1/S2) defined in
a Site-Descriptions table (Table 1). We emit one sidecar per region x
maturity, keyed by the region (the soy report does not break per-site
yields into separate ranked tables the way corn does — the per-site
columns are within the regional table, so a region sidecar carries the
full ranked field with each site's yield in the metrics).

SANITY GATE: every parsed row must have a real (non-numeric, len>1)
brand, a product code, and a plausible Yield (1 < y < 400). Rows that
fail (stat rows High/Average/Low/LSD, wrapped lines, footnotes) are
dropped + counted. If a whole table won't parse into >=2 good rows it
is SKIPPED + logged + counted — never emitted with mis-assigned rows.

Scope: 2025 (latest) + 2024 baseline. ``--include-old`` pulls the
u.osu.edu archive years.

Output:
  corpus/ohio_ocpt_trials/<source_key>.md
  corpus/ohio_ocpt_trials/<source_key>.json

source_key: ``ocpt-<crop>-<year>-<region-or-site>``
  e.g. ``ocpt-corn-2025-sw-hebron``, ``ocpt-soybeans-2025-north-early``.

CLI:
  python -m scrape.sources.ohio_ocpt_trials --year 2025 --crop corn
  python -m scrape.sources.ohio_ocpt_trials --force
  python -m scrape.sources.ohio_ocpt_trials --include-old
"""

from __future__ import annotations

import argparse
import io
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import pdfplumber
import requests

SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request (see RateLimitedSession).
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Live site root; the corn/soy index URLs are built from it.
BASE = "https://ohiocroptest.cfaes.osu.edu"
# Archive index for the older report years (the ``--include-old`` scope).
ARCHIVE = "https://u.osu.edu/perf/archive/"

# The publisher labels (per the trial sidecar contract).
VENDOR = "The Ohio State University"
BRAND_AGGREGATOR = "Ohio Corn/Soybean Performance Test publishes"
PUBLISHER_BRAND = "Ohio Crop Performance Test"
TOS_NOTE = ("© OSU on the report; explicit no-endorsement clause; public "
            "CFAES extension publication; attribute Ohio Corn/Soybean "
            "Performance Test, OSU CFAES.")

# Default scope vs. the extra years pulled in by ``--include-old``.
BASELINE_YEARS = [2024, 2025]
OLD_YEARS = [2018, 2019, 2020, 2021, 2022, 2023]

REQ_INTERVAL_SEC = 2.0  # polite, low rate against a university host

# Two directory levels above this file's directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
# A non-empty CORPUS_ROOT environment variable overrides the in-repo corpus dir.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "ohio_ocpt_trials"

log = logging.getLogger("scrape.ohio_ocpt_trials")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Thin ``requests.Session`` wrapper with request spacing and retries.

    Every request sends the module ``USER_AGENT``, waits until at least
    ``interval`` seconds have passed since the previous request, and is
    retried on network errors, HTTP 429 and HTTP 5xx responses.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # time.monotonic() stamp of the previous request

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between calls."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 60.0, **kw: Any) -> requests.Response:
        """Send one request, retrying transient failures with backoff.

        Args:
            method: HTTP verb passed to ``Session.request``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1 so a request is always made.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when every
            attempt was exhausted and the LAST attempt produced a response —
            that last (429/5xx) response, so the caller can inspect it or
            call ``raise_for_status()``.

        Raises:
            requests.RequestException: if the LAST attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # This failure supersedes any response kept from an earlier
                # attempt, so the outcome reported is always the latest one.
                last_exc, resp = exc, None
                if final:
                    log.warning("network error on %s %s: %s — giving up",
                                method, url, exc)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived: an exception from an earlier attempt is
            # stale and must not be raised after the loop.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    # No point sleeping when there is no attempt left.
                    log.warning("HTTP %d on %s %s — giving up",
                                resp.status_code, method, url)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form is honoured; isascii() guards
                # against digit-like characters that float() cannot parse.
                if ra and ra.isascii() and ra.isdigit():
                    backoff = float(ra)
                else:
                    backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        # attempts >= 1 and no pending exception, so resp is a real response.
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class SiteTrial:
    """One per-site (or per-region-summary) ranked trial, ready to write.

    ``results`` holds the ranked entries; the optional agronomic fields stay
    ``None`` unless a value was extracted for the site.
    """
    source_key: str                 # e.g. "ocpt-corn-2025-sw-hebron"
    crop: str                       # "corn" | "soybeans"
    year: int
    region: str                     # SW / NW / NE region or county
    site: str | None                # site town / county / "Summary"
    pdf_url: str                    # URL of the report PDF the data came from
    table_label: str = ""           # e.g. "Table 1E (early maturity)"
    # per-site agronomic footnote metadata (corn single-year tables)
    soil_type: str | None = None
    previous_crop: str | None = None
    planting_date: str | None = None
    harvest_date: str | None = None
    tillage: str | None = None
    fungicide: str | None = None
    cooperator: str | None = None
    county: str | None = None
    # One dict per ranked entry; default_factory gives each instance its own list.
    results: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


# Matches an href attribute (single- or double-quoted, optional whitespace
# around '=') whose path ends in .pdf, optionally followed by a ?query or
# #fragment. One capture group, so ``findall`` still yields plain strings.
_PDF_HREF_RE = re.compile(
    r"""href\s*=\s*["']([^"']+?\.pdf(?:[?#][^"']*)?)["']""", re.I)


def _discover_pdf_hrefs(http: RateLimitedSession, index_url: str) -> list[str]:
    """Fetch an index page and return absolute hrefs to every .pdf linked.

    Args:
        http: Session used for the GET (only ``.get(url)`` is called).
        index_url: Page to scan; also the base for resolving relative hrefs.

    Returns:
        Absolute PDF URLs in document order, de-duplicated. An empty list
        when the page cannot be fetched (the failure is logged, not raised).
    """
    from html import unescape
    from urllib.parse import urljoin
    try:
        r = http.get(index_url)
        r.raise_for_status()
    except requests.RequestException as exc:
        log.warning("index fetch failed %s: %s", index_url, exc)
        return []
    out: list[str] = []
    seen: set[str] = set()
    for href in _PDF_HREF_RE.findall(r.text):
        # Attribute values are HTML-escaped in the page source ("&amp;").
        full = urljoin(index_url, unescape(href.strip()))
        if full not in seen:
            seen.add(full)
            out.append(full)
    return out


# The full-report PDF (corn) is "CountryJournal" / (soy) the
# "OCJwithproteinandoil" file. Weather / seed-quality / pollinator /
# seed-composition PDFs carry no head-to-head yield table → ignore them.
def _pick_report_pdf(hrefs: list[str], crop: str, year: int) -> str | None:
    """Choose the full-report PDF for ``crop``/``year`` from candidate hrefs.

    Only the lower-cased file name (text after the last ``/``) is inspected.
    A preferred naming pattern is tried over all hrefs first; if nothing
    matches, a looser fallback pattern is tried. Returns the first match in
    input order, or ``None``.
    """
    year_tag = str(year)
    named = [(href, href.rsplit("/", 1)[-1].lower()) for href in hrefs]

    if crop == "corn":
        def preferred(name: str) -> bool:
            return "countryjournal" in name and year_tag in name

        def fallback(name: str) -> bool:
            # any non-weather pdf carrying the year
            return year_tag in name and "weather" not in name
    else:  # soybeans
        def preferred(name: str) -> bool:
            return "ocj" in name and "protein" in name and year_tag in name

        def fallback(name: str) -> bool:
            return (year_tag in name and "seed" not in name
                    and "pollinator" not in name)

    for accept in (preferred, fallback):
        for href, name in named:
            if accept(name):
                return href
    return None


def discover_report_pdf(http: RateLimitedSession, crop: str,
                        year: int) -> str | None:
    """Locate the full-report PDF for ``(crop, year)`` on the live site.

    The candidate index pages are fetched in order and their PDF links are
    filtered by ``_pick_report_pdf``; the first hit wins (no hardcoded
    filename). Returns ``None``, after logging a warning, when no index
    page yields a report.
    """
    if crop == "corn":
        # The corn index is /corntrials/ ; older years via ?year=NNNN.
        candidates = (f"{BASE}/corntrials/default.asp?year={year}",
                      f"{BASE}/corntrials/")
    else:
        # Soy index lives at /soyNNNN/ (year in path).
        candidates = (f"{BASE}/soy{year}/", f"{BASE}/soy{year}")

    for index_url in candidates:
        found = _pick_report_pdf(_discover_pdf_hrefs(http, index_url),
                                 crop, year)
        if found:
            log.info("%s %d report PDF: %s", crop, year, found)
            return found

    log.warning("no report PDF discovered for %s %d", crop, year)
    return None


# --------------------------------------------------------------------- helpers


# A bare integer or decimal token, optionally negative.
_NUM_TOKEN = re.compile(r"^-?\d+(?:\.\d+)?$")
# A yield-significance marker the soy tables append to per-site yields.
_SIG_RE = re.compile(r"[*]+$")


def _to_num(s: str) -> float | int | None:
    """Convert one table token to a number, or ``None`` if it is not one.

    Surrounding whitespace and trailing ``*`` significance markers are
    removed first. Whole values come back as ``int``, others as ``float``.
    """
    cleaned = _SIG_RE.sub("", (s or "").strip())
    if cleaned and _NUM_TOKEN.match(cleaned):
        value = float(cleaned)
        if value.is_integer():
            return int(value)
        return value
    return None


def _slug(s: str) -> str:
    """Lower-case ``s`` and collapse every non ``[a-z0-9]`` run to one ``-``.

    Leading/trailing dashes are removed; a falsy input gives ``""``.
    """
    lowered = (s or "").lower()
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
    return dashed.strip("-")


# Lines that begin a non-data (stat / footnote) row — never a hybrid row.
# Compared against the LOWER-CASED, stripped line via ``str.startswith`` in
# the corn table parser, so every entry must be lower-case. A line starting
# with "table" additionally ends the current corn table body.
_STAT_PREFIXES = (
    "high", "average", "low", "lsd", "mean", "soil type", "soil test",
    "previous crop", "planting", "harvest", "tillage", "fertilizer",
    "fungicide", "cooperator", "county", "entry", "brand", "variety",
    "bu/a", "lbs.", "harv.", "stk.", "final", "table",
)


# --------------------------------------------------------------------- CORN parse
#
# Corn region codes by region phrase in the table caption.
# (phrase, code) pairs are checked in order against the upper-cased caption;
# note that both "NORTH CENTRAL" and "NORTHEASTERN" map to the code "NE".
_CORN_REGION = (
    ("SOUTHWESTERN", "SW"),
    ("NORTHWESTERN", "NW"),
    ("NORTH CENTRAL", "NE"),
    ("NORTHEASTERN", "NE"),
)
# Known corn seed brands (ALL-CAPS publisher labels). Longest match first
# so multi-word brands aren't split. Built from the distinct brands seen
# across the SW/NW/NE regions; matched case-insensitively at line start.
# A row whose leading text starts with none of these is dropped by the
# brand/product splitter, so a missing brand means silently skipped rows.
CORN_BRANDS = [
    "1ST CHOICE SEEDS", "AGRIGOLD HYBRIDS", "AGRIGOLD", "AUGUSTA SEED",
    "AXIS SEED", "BA GENETICS", "CHANNEL", "DEKALB", "DYNA-GRO", "EBBERTS",
    "FS INVISION", "GOLDEN HARVEST", "GREAT HEART SEED", "GRO-MOR",
    "LG SEEDS", "NK", "PC SEED", "SEED CONSULTANTS", "SEED GENETICS DIRECT",
    "SEEDWAY", "SHUR GROW", "VIKING / BLUE RIVER", "VIKING/BLUE RIVER",
    "PIONEER", "BREVANT", "STINE", "BECK'S", "BECKS", "WYFFELS",
    "CROPLAN", "MASTERS CHOICE", "HOEGEMEYER", "LOCAL SEED", "MYCOGEN",
]
# Longest string first, so "AGRIGOLD HYBRIDS" is tried before "AGRIGOLD".
_CORN_BRANDS_SORTED = sorted(CORN_BRANDS, key=len, reverse=True)


def _corn_region(caption: str) -> str:
    """Map a table caption to its region code.

    The caption is upper-cased and scanned for each ``_CORN_REGION`` phrase
    in table order; the first phrase found decides the code. Captions with
    no known phrase fall back to ``"OH"``.
    """
    shouted = caption.upper()
    return next(
        (code for phrase, code in _CORN_REGION if phrase in shouted), "OH")


def _corn_split_brand_product(text: str) -> tuple[str, str] | None:
    """Separate a row's leading text into ``(brand, hybrid)``.

    The longest entry of ``_CORN_BRANDS_SORTED`` that the text starts with
    (case-insensitively, followed by a space or the end of the text) is the
    brand; whatever follows is the hybrid code. Returns ``None`` when no
    known brand leads the text or when nothing follows the brand.
    """
    shouted = text.upper()
    for known in _CORN_BRANDS_SORTED:
        if shouted != known and not shouted.startswith(known + " "):
            continue
        hybrid = text[len(known):].strip()
        if not hybrid:
            return None  # brand with no hybrid → not a data row
        return text[:len(known)].strip(), hybrid
    return None


def _corn_site_groups(site_line: str) -> list[str]:
    """Parse the site-name line into ordered column-group labels.

    The names are multi-word and space-separated on one line, e.g.
    'Hebron Washington Court House South Charleston Covington Summary'.
    We can't split purely on whitespace, so multi-word names are matched
    against a small known vocabulary (most words first); every other token
    becomes its own single-word group. A parenthetical annotation such as
    "(7 Sites)" or "(avg of 4)" is folded, whole, into the preceding group.

    Returns an empty list for an empty / whitespace-only line, so callers
    can reject the table with a plain truthiness check.
    """
    # Known multi-word OCPT site/group names. Single-word sites (Hebron,
    # Greenville, Covington, Bucyrus…) need no vocabulary entry.
    known_multi = [
        "Washington Court House", "South Charleston", "Upper Sandusky",
        "Western Ohio", "Statewide All Regions", "Van Wert",
    ]
    # Sorted once (not per token): phrases with more words are tried first.
    phrases = sorted(((name, name.lower().split()) for name in known_multi),
                     key=lambda item: -len(item[1]))

    # str.split() with no argument drops empty tokens, so "" yields [].
    tokens = (site_line or "").split()
    groups: list[str] = []
    in_paren = False  # inside an unclosed "(...)" annotation
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        # Fold a parenthetical annotation like "(7 Sites)" into the previous
        # group, including any tokens between the "(" and the ")".
        if groups and (in_paren or tok.startswith("(") or tok.endswith(")")):
            groups[-1] = f"{groups[-1]} {tok}"
            in_paren = ((in_paren or tok.startswith("("))
                        and not tok.endswith(")"))
            i += 1
            continue
        matched = None
        for name, words in phrases:
            if [t.lower() for t in tokens[i:i + len(words)]] == words:
                matched = name
                i += len(words)
                break
        if matched is not None:
            # Emit the canonical spelling from the vocabulary.
            groups.append(matched)
        else:
            # Single-token site name (Hebron / Greenville / Summary / …).
            groups.append(tok)
            i += 1
    return groups


# Metric labels per corn site group, in column order.
_CORN_METRICS = ["Yield", "Harv. Moisture", "Stalk Lodging",
                 "Final Stand", "Emergence"]


def _corn_parse_table(lines: list[str], header_idx: int, caption: str,
                      site_line: str) -> tuple[list[str], list[dict]] | None:
    """Parse the body of one corn table.

    Args:
        lines: All text lines of the report.
        header_idx: Index of the table's column-header line; the body is
            read from the line after it.
        caption: Table caption (only used in the mismatch warning).
        site_line: The line naming the column groups.

    Returns:
        ``(group_labels, rows)`` where each row is
        ``{brand, product, by_group: {label: {metric: val}}, tw}``, or
        ``None`` when no groups can be derived or the number of groups
        disagrees with the number of 'Yield' columns in the header.
    """
    groups = _corn_site_groups(site_line)
    if not groups:
        return None

    # One 'Yield' token per column group in the header is the authoritative
    # group count; on disagreement the table is skipped rather than risk
    # assigning numbers to the wrong sites.
    yield_cols = lines[header_idx].lower().count("yield")
    if yield_cols and yield_cols != len(groups):
        log.warning("site-count mismatch (sites=%d, header Yield=%d) — "
                    "skipping table: %s", len(groups), yield_cols,
                    caption[:60])
        return None

    expected = 5 * len(groups)  # five metrics per column group
    rows: list[dict] = []
    for raw in lines[header_idx + 1:]:
        body = raw.strip()
        if not body:
            continue
        lowered = body.lower()
        if lowered.startswith(_STAT_PREFIXES):
            # A new caption ends this table; other stat/footnote rows are
            # simply skipped.
            if lowered.startswith("table"):
                break
            continue

        # Peel the run of numeric tokens off the end of the row.
        tokens = body.split()
        cut = len(tokens)
        while cut > 0 and _NUM_TOKEN.match(tokens[cut - 1]):
            cut -= 1
        split = _corn_split_brand_product(" ".join(tokens[:cut]))
        if split is None:
            continue
        values = [_to_num(tok) for tok in tokens[cut:]]

        # Exactly five numbers per group, plus an optional trailing test
        # weight; anything else is a wrapped / malformed row (sanity gate).
        if len(values) == expected + 1:
            test_weight = values[-1]
            values = values[:expected]
        elif len(values) == expected:
            test_weight = None
        else:
            continue

        by_group: dict[str, dict] = {
            name: {
                label: val
                for label, val in zip(_CORN_METRICS,
                                      values[pos * 5:(pos + 1) * 5])
                if val is not None
            }
            for pos, name in enumerate(groups)
        }
        brand, product = split
        rows.append({"brand": brand, "product": product,
                     "by_group": by_group, "tw": test_weight})
    return groups, rows


# Per-site footnote fields we lift from the single-year table footers.
# Maps the footnote row label as printed -> the SiteTrial attribute name.
# NOTE(review): SiteTrial also declares planting_date / harvest_date /
# fungicide, which have no entry here — confirm whether that is intended.
_FOOTNOTE_FIELDS = {
    "Soil Type": "soil_type",
    "Previous Crop": "previous_crop",
    "Tillage": "tillage",
    "Cooperator": "cooperator",
    "County": "county",
}


def build_corn_footnotes(pages: list) -> dict[str, dict[str, str]]:
    """Word-position footnote extractor.

    pdfplumber's ``extract_text`` collapses the column whitespace, so a
    footnote line like ``Cooperator  Parrish Farms  Sollars Farm  ...``
    can't be split back into per-site values from text alone. Instead we
    read WORDS with x-coordinates: the site-name header words give each
    column's x-anchor, and every footnote word is bucketed under the anchor
    nearest to its x0 (see ``_bucket_by_anchor``).

    A line counts as a site header only when it contains no digits, its
    LAST column group is "Summary", and it names at least two real sites.
    ``_corn_site_groups`` alone cannot make that call — it turns every
    unknown token into a group — so without these extra checks every data
    or footnote row would look like a header and the footnote scan would
    stop on its first line.
    NOTE(review): assumes single-year site-name lines always end with a
    "Summary" group and carry no digits — confirm against the PDFs.

    Args:
        pages: pdfplumber page objects (only ``extract_words`` is used).

    Returns:
        ``{site_name: {field: value}}`` with ``field`` taken from
        ``_FOOTNOTE_FIELDS``; sites/fields with no value are omitted.
    """

    def header_groups(line_words: list[dict]) -> list[str] | None:
        """Return the column groups if the line is a site header, else None."""
        text = " ".join(w["text"] for w in line_words)
        if any(ch.isdigit() for ch in text):
            return None  # captions, data rows and stat rows carry numbers
        groups = _corn_site_groups(text)
        if not groups or not groups[-1].lower().startswith("summary"):
            return None
        real = [g for g in groups if not g.lower().startswith("summary")]
        return groups if len(real) >= 2 else None

    out: dict[str, dict[str, str]] = {}
    for page in pages:
        words = page.extract_words(use_text_flow=False)
        # Group words into lines by their 'top' coordinate (~2px buckets).
        by_top: dict[float, list[dict]] = {}
        for w in words:
            by_top.setdefault(round(w["top"] / 2.0) * 2.0, []).append(w)
        ordered = [sorted(ws, key=lambda w: w["x0"])
                   for _, ws in sorted(by_top.items())]
        # Classify every line once; reused for both header detection and
        # the "next header" stop test below.
        headers = [header_groups(lw) for lw in ordered]

        for li, site_groups in enumerate(headers):
            if site_groups is None:
                continue
            # Column x-anchors: first word x0 of each real site phrase.
            anchors = _site_anchor_xs(ordered[li], site_groups)
            if not anchors:
                continue
            # The footnote block sits below 50-100+ hybrid data rows, so the
            # window must be generous; it ends early at the next site header.
            stop = min(li + 130, len(ordered))
            for fi in range(li + 1, stop):
                if headers[fi] is not None:
                    break  # next table's site header
                line = ordered[fi]
                label, key = _match_footnote_label(line)
                if not label:
                    continue
                # Words after the label, bucketed by nearest anchor. Assumes
                # the label occupies as many PDF words as it has
                # space-separated words.
                value_words = line[len(label.split()):]
                for site_name, val in _bucket_by_anchor(
                        value_words, anchors).items():
                    if val:
                        out.setdefault(site_name, {})[key] = val
    return out


def _site_anchor_xs(header_words: list[dict],
                    groups: list[str]) -> list[tuple[float, str]]:
    """Pair every real (non-Summary) site with the x0 of its first word.

    ``groups`` is walked in order while a cursor advances through
    ``header_words`` by each group's word count, so the cursor always sits
    on the first word of the current group. Groups that start beyond the
    end of ``header_words`` are skipped.

    Returns ``[(x0, site_name), ...]`` in column order.
    """
    anchors: list[tuple[float, str]] = []
    cursor = 0
    for group in groups:
        if cursor < len(header_words):
            left_edge = header_words[cursor]["x0"]
            is_summary = group.lower().startswith("summary")
            if not is_summary:
                anchors.append((left_edge, group))
        cursor += len(group.split())
    return anchors


def _match_footnote_label(line_words: list[dict]) -> tuple[str | None, str | None]:
    """Identify the footnote label a line starts with.

    The words' texts are joined with single spaces and compared,
    case-insensitively, against each ``_FOOTNOTE_FIELDS`` label in mapping
    order. Returns ``(label, key)`` for the first label the line starts
    with, or ``(None, None)`` when none matches.
    """
    lowered = " ".join(word["text"] for word in line_words).lower()
    return next(
        ((label, key) for label, key in _FOOTNOTE_FIELDS.items()
         if lowered.startswith(label.lower())),
        (None, None),
    )


def _bucket_by_anchor(value_words: list[dict],
                      anchors: list[tuple[float, str]]) -> dict[str, str]:
    """Distribute footnote value words over the site columns.

    Each word goes to the site whose anchor x is closest to the word's
    ``x0`` by absolute distance (the earlier anchor wins a tie). Words keep
    their input order within a site and are joined with single spaces;
    leading/trailing spaces and commas are trimmed from each joined value.

    Returns ``{site_name: joined_value}`` with an entry (possibly ``""``)
    for every anchor, or ``{}`` when there are no anchors.
    """
    if not anchors:
        return {}
    collected: dict[str, list[str]] = {name: [] for _, name in anchors}
    for word in value_words:
        left = word["x0"]
        # min() returns the first of equally-near anchors.
        _, owner = min(anchors, key=lambda anchor: abs(anchor[0] - left))
        collected[owner].append(word["text"])
    result: dict[str, str] = {}
    for name, texts in collected.items():
        result[name] = " ".join(texts).strip(" ,")
    return result


# --------------------------------------------------------------------- CORN build


def build_corn_sites(text: str, year: int, pdf_url: str,
                     footnotes_by_site: dict[str, dict[str, str]] | None = None
                     ) -> list[SiteTrial]:
    """Parse the whole corn report text into per-site SiteTrial objects.

    We process the SINGLE-YEAR per-site tables (1E/1L/4E/4L/7E/7L) as the
    gold per-site data, and the multi-year/combined tables (2/3/5/6/8/9)
    as region-level summary sidecars. Within a region+year, the Early and
    Full-season tables for the same SITE are merged into one site sidecar.

    Args:
        text: Full extracted text of the report PDF.
        year: Report year (used in every ``source_key``).
        pdf_url: URL of the PDF, stored on each SiteTrial.
        footnotes_by_site: Optional per-site soil/cooperator/county metadata
            from ``build_corn_footnotes``, keyed by site name.

    Returns:
        One SiteTrial per site / summary group with at least two ranked
        results. All table bookkeeping is local to the call, so repeated
        calls (one per report) cannot see each other's state.
    """
    footnotes_by_site = footnotes_by_site or {}
    lines = text.splitlines()
    # First parse Table 10 -> hybrid->traits.
    traits_map = _corn_traits_map(lines)

    caption_re = re.compile(r"^\s*(TABLE|Table)\s+(\d+)([EL]?)\.?[:\s]")
    unit_prefixes = ("harv.", "stk.", "final", "bu/a")

    # Index every table as (table no, E/L suffix, caption, site line,
    # header index).
    blocks: list[tuple[int, str, str, str, int]] = []
    i = 0
    while i < len(lines):
        m = caption_re.match(lines[i])
        if not m:
            i += 1
            continue
        # Find the 'Brand ... Hybrid ...' header within a few lines.
        header_idx: int | None = None
        site_line = ""
        for k in range(i + 1, min(i + 8, len(lines))):
            lk = lines[k].strip()
            low = lk.lower()
            if low.startswith("brand") and "hybrid" in low:
                header_idx = k
                break
            if caption_re.match(lines[k]):
                # Another caption before any header: this caption has no
                # table of its own (e.g. a table-of-contents line). Stop so
                # we don't steal the next table's header.
                break
            # The site-name line is the first non-empty, non-unit line
            # after the caption that isn't the 'Harv. Stk. Final' line.
            if lk and not site_line and not low.startswith(unit_prefixes):
                site_line = lk
        if header_idx is None:
            # Advance exactly one line so a caption on the next line is
            # still examined.
            i += 1
            continue
        blocks.append((int(m.group(2)), m.group(3), lines[i].strip(),
                       site_line, header_idx))
        i = header_idx + 1

    multi_year_tags = {2: "2yr", 3: "3yr", 5: "2yr", 6: "3yr", 8: "2yr",
                       9: "combined"}

    # Aggregate per source_key (region, table-kind, site).
    site_acc: dict[str, SiteTrial] = {}

    for tbl_no, tbl_sfx, caption, site_line, header_idx in blocks:
        parsed = _corn_parse_table(lines, header_idx, caption, site_line)
        if not parsed:
            continue
        groups, rows = parsed
        if not rows:
            continue
        region = _corn_region(caption)
        single_year = tbl_sfx in ("E", "L")  # 1E/1L/4E/4L/7E/7L
        key_prefix = f"ocpt-corn-{year}-{region.lower()}"
        if not single_year:
            # multi-year / combined → region (or statewide) summary
            key_prefix += "-" + multi_year_tags.get(tbl_no, f"t{tbl_no}")

        for gname in groups:
            is_summary = gname.lower().startswith("summary")
            # Build results list for this group.
            results: list[dict] = []
            for r in rows:
                metrics = dict(r["by_group"].get(gname, {}))
                if "Yield" not in metrics:
                    continue
                # Test weight (TW) is a summary-level number → attach to the
                # Summary group only.
                if is_summary and r.get("tw") is not None:
                    metrics["Test Wt."] = r["tw"]
                results.append({
                    "brand": r["brand"],
                    "product": r["product"],
                    "traits": traits_map.get(_norm_hybrid(r["product"])),
                    "metrics": metrics,
                })
            # Sanity-gate + rank.
            results = _finalize_results(results)
            if len(results) < 2:
                continue

            if is_summary:
                key = f"{key_prefix}-summary"
            else:
                key = f"{key_prefix}-{_slug(gname)}"
            site = "Summary" if (single_year and is_summary) else gname

            st = site_acc.get(key)
            if st is None:
                st = SiteTrial(
                    source_key=key, crop="corn", year=year, region=region,
                    site=site, pdf_url=pdf_url,
                    table_label=caption)
                # attach footnote agronomic metadata for real sites
                # (word-position keyed by site name → never mis-assigned)
                if single_year and not is_summary:
                    fn = footnotes_by_site.get(gname, {})
                    st.soil_type = fn.get("soil_type") or None
                    st.previous_crop = fn.get("previous_crop") or None
                    st.tillage = fn.get("tillage") or None
                    st.cooperator = fn.get("cooperator") or None
                    st.county = fn.get("county") or None
                site_acc[key] = st
            # Merge: append new hybrids (early + full season tables of the
            # same site land in the same sidecar). Dedup by (brand, product).
            seen = {(r["brand"], r["product"]) for r in st.results}
            for r in results:
                pair = (r["brand"], r["product"])
                if pair not in seen:
                    st.results.append(r)
                    seen.add(pair)
            if single_year:
                st.table_label = caption  # last caption wins (informational)

    # Re-rank each merged sidecar by Yield desc.
    out = list(site_acc.values())
    for st in out:
        st.results = _finalize_results(st.results)
    return [st for st in out if len(st.results) >= 2]


_BLOCK_SITELINE: dict[int, str] = {}


def _norm_hybrid(h: str) -> str:
    """Canonical lookup form of a hybrid code.

    Trims the ends, collapses each internal whitespace run to one space and
    upper-cases the result; a falsy input gives ``""``.
    """
    trimmed = (h or "").strip()
    single_spaced = re.sub(r"\s+", " ", trimmed)
    return single_spaced.upper()


def _corn_traits_map(lines: list[str]) -> dict[str, str]:
    """Build a ``hybrid -> traits`` map from Table 10.

    Only lines after a ``TABLE 10.`` caption and before a ``TABLE 11``
    caption are considered. Table 10 (Seed source / Hybrid No. /
    Technology Traits) has messy rows (the brand wraps across lines), so a
    row is used only if it contains a token from the trait vocabulary and
    matches the ``<hybrid> <table-no list> <traits>`` shape. The first
    traits string seen for a hybrid is kept.
    """
    vocab = re.compile(
        r"\b(RR|GT|CB|RW|LL|TRE|WBC|CEW|BCW|VIP|VT2P?|SmartStax|Enlist|"
        r"NON-GMO|Conv|STS|PWE)\b", re.I)
    # The hybrid number precedes the table-no column and the traits.
    # Pattern: <hybrid tokens> <tableNo list> <TRAIT,TRAIT,...> ...
    row_re = re.compile(
        r"((?:[A-Z]{1,3}\s*)?[A-Z0-9][A-Z0-9\- ]*?)\s+"
        r"(?:\d+[EL]?(?:,\s*)?)+\s+([A-Za-z0-9,\- ]*"
        + r"(?:RR|GT|CB|RW|LL|TRE|WBC|CEW|BCW|VIP|VT2|"
        r"Enlist|NON-GMO|Conv|STS|PWE)[A-Za-z0-9,\- ]*)")

    mapping: dict[str, str] = {}
    inside = False
    for raw in lines:
        if re.match(r"^\s*TABLE\s+10\.", raw):
            inside = True
            continue
        if not inside:
            continue
        if re.match(r"^\s*TABLE\s+11", raw):
            break
        row = raw.strip()
        if not row or not vocab.search(row):
            continue
        hit = row_re.search(row)
        if hit is None:
            continue
        hybrid = _norm_hybrid(hit.group(1))
        raw_traits = re.sub(r"\s+", " ", hit.group(2).strip()).rstrip(",")
        traits = _trim_trait_codes(raw_traits)
        # Over-long "hybrids" are treated as mis-parses and dropped.
        if hybrid and traits and len(hybrid) <= 30:
            mapping.setdefault(hybrid, traits)
    return mapping


# Technology-trait code vocabulary (Table 10 'Technology Traits' column).
# Everything after this comma-list is the Fungicide / Seed-Treatment column,
# which we drop so ``traits`` carries ONLY the genetic trait stack.
_TRAIT_CODES = {
    "RR", "GT", "CB", "RW", "LL", "TRE", "WBC", "CEW", "BCW", "VIP", "VT2",
    "VT2P", "VT2PRO", "DG", "DT", "ENLIST", "NON-GMO", "CONV", "STS", "PWE",
    "SMARTSTAX", "RW2", "SS",
}


def _trim_trait_codes(traits: str) -> str:
    """Cut a Table 10 trait cell down to its leading trait codes.

    The cell reads e.g. ``RR,CB,LL, Enlist Lumiscend PRO`` where
    ``Lumiscend PRO`` is the Fungicide/Seed-Treatment column bleeding in.
    Tokens (runs of letters/digits/hyphens, and commas) are consumed from
    the left until the first word that is not in ``_TRAIT_CODES``
    (case-insensitive); that word and everything after it are discarded.

    In the result commas are kept as found, two codes that were not
    separated by a comma are joined with one space, and surrounding
    whitespace/commas are trimmed.
    """
    leading: list[str] = []
    for tok in re.findall(r"[A-Za-z0-9\-]+|,", traits):
        if tok != "," and tok.upper() not in _TRAIT_CODES:
            break  # reached the Fungicide column → stop
        leading.append(tok)

    rendered: list[str] = []
    previous: str | None = None
    for tok in leading:
        # A space is needed only between two adjacent codes.
        if tok != "," and previous is not None and previous != ",":
            rendered.append(" ")
        rendered.append(tok)
        previous = tok
    return "".join(rendered).strip().strip(",").strip()


# --------------------------------------------------------------------- SOY parse
#
# Soy region by caption phrase: (UPPER-CASE phrase searched for in the table
# caption, region label). ``_soy_region`` takes the first pair whose phrase
# occurs in the upper-cased caption and falls back to "OH" when none does.
_SOY_REGION = (("NORTH", "North"), ("CENTRAL", "Central"), ("SOUTH", "South"))
# Soy seed brands — Brand is the SECOND column and is often multi-word.
# Includes year-to-year string variants (the report is inconsistent:
# "Axis" vs "Axis Seed", "Dyna-Gro" vs "Dyna-Gro Seed").
# NOTE: this literal is NOT in length order; matching code should iterate
# ``_SOY_BRANDS_SORTED`` (below) so longer phrases are tried first.
SOY_BRANDS = [
    "Seed Consultants, Inc.", "Ebberts Field Seeds", "Seed Genetics Direct",
    "Great Heart Seed Co.", "Viking|Blue River", "GROWMARK, INC.",
    "Albert Lea Seed", "Dyna-Gro Seed", "Shur Grow", "Dyna-Gro", "Gro Mor",
    "Seedway", "Xitavo", "Asgrow", "DonMario", "Golden Harvest", "Axis Seed",
    "Axis", "NK Seeds", "Confluence Genetics", "FS HiSOY", "Benson Hill",
    "Bayer", "Beck's", "Stine", "Pioneer", "Brevant", "Channel", "LG Seeds",
    "Hoegemeyer",
]
# Same brands, longest string first (used by ``_soy_split_variety_brand``).
_SOY_BRANDS_SORTED = sorted(SOY_BRANDS, key=len, reverse=True)
# Soy "Type" column values that mark the boundary between the
# Variety+Brand columns and the Seed-Treatment column. The Type column is
# strictly one of these herbicide-trait classes (EN=Enlist, CV=conventional,
# XF=XtendFlex, STS=sulfonylurea-tolerant) and may be a comma-compound
# ("EN, STS"). NOT E3/RR2X/LL — those appear INSIDE variety names
# ("E3190 E3") and would split the row in the wrong place. A trailing
# comma signals a compound type whose continuation we consume.
_SOY_TYPE_TOKENS = {"EN", "CV", "XF", "STS"}


def _soy_type_at(toks: list[str], i: int) -> int | None:
    """If a Type column begins at index i, return the index AFTER it (so the
    Seed-Treatment column starts there). Handles bare 'EN' and the
    comma-compound 'EN, STS' (where toks[i]=='EN,' then 'STS')."""
    t = toks[i]
    bare = t.rstrip(",")
    if bare not in _SOY_TYPE_TOKENS:
        return None
    end = i + 1
    # consume continuation tokens of a comma-compound type
    while t.endswith(",") and end < len(toks) and \
            toks[end].rstrip(",") in _SOY_TYPE_TOKENS:
        t = toks[end]
        end += 1
    return end


def _soy_region(caption: str) -> tuple[str, str]:
    """Derive ``(region, maturity)`` from a soy table caption.

    The region is the label of the first ``_SOY_REGION`` phrase contained in
    the upper-cased caption, or ``"OH"`` when none is. The maturity is
    ``"late"`` if the caption contains ``LATE``, else ``"early"`` if it
    contains ``EARLY``, else the empty string.
    """
    upper = caption.upper()
    region = next(
        (label for needle, label in _SOY_REGION if needle in upper), "OH")
    if "LATE" in upper:
        maturity = "late"
    elif "EARLY" in upper:
        maturity = "early"
    else:
        maturity = ""
    return region, maturity


def _soy_site_codes(text: str) -> dict[str, str]:
    """Parse Table 1 (Site Descriptions) -> {code: 'County Co.'}.

    The header line lists site codes (N1 N2 C1 C2 S1 S2) and the next line
    lists the matching counties. The first line carrying at least four codes
    whose following line yields at least as many counties wins; counties are
    paired with codes left to right. Returns ``{}`` when no such pair of
    lines exists.

    The county line is first split on runs of 2+ spaces. If that gives too
    few entries (single-spaced text), it is split on single spaces and each
    name is re-joined with a following county suffix token. Only ``Co``,
    ``Co.`` and ``County`` (any case) count as that suffix, so a county
    whose name merely starts with "Co" (Coshocton, Columbiana) is never
    glued onto the previous county.
    """
    lines = text.splitlines()
    for line, following in zip(lines, lines[1:]):
        codes = re.findall(r"\b([NCS]\d)\b", line)
        if len(codes) < 4:
            continue
        county_line = following.strip()
        counties = [c.strip() for c in re.split(r"\s{2,}", county_line)
                    if c.strip()]
        if len(counties) >= len(codes):
            return dict(zip(codes, counties))
        # Fall back: single-space split, then rebuild "X Co." pairs.
        words = county_line.split()
        merged: list[str] = []
        j = 0
        while j < len(words):
            has_suffix = (
                j + 1 < len(words)
                and words[j + 1].rstrip(".,").lower() in ("co", "county"))
            if has_suffix:
                merged.append(f"{words[j]} {words[j + 1]}")
                j += 2
            else:
                merged.append(words[j])
                j += 1
        if len(merged) >= len(codes):
            return dict(zip(codes, merged))
    return {}


def _soy_parse_table(lines: list[str], header_idx: int,
                     site_codes_in_header: list[str]) -> list[dict]:
    """Parse the body of one soy region table.

    Header columns:
       Variety | Brand | Type | Seed Treatment | RM | <site yields...> | Mean[s]

    Reading starts on the line after ``header_idx`` and stops at the next
    line beginning with "table". Blank lines, statistic/header lines and
    rows that cannot be split cleanly are skipped.

    Returns rows of ``{brand, product, traits, _seed_treatment, metrics}``.
    ``metrics`` holds ``RM``, one ``Yield <code>`` per site code, ``Yield``
    (the first number after the site columns, or the highest site yield if
    there is none) and optionally ``Yield 2yr Mean``.
    """
    skip_prefixes = ("table", "average", "lsd", "mean", "cv",
                     "high", "low", "variety", "entry")
    n_sites = len(site_codes_in_header)
    parsed: list[dict] = []
    for raw in lines[header_idx + 1:]:
        text = raw.strip()
        if not text:
            continue
        lowered = text.lower()
        if lowered.startswith("table"):
            break  # next table caption ends this body
        if lowered.startswith(skip_prefixes):
            continue

        # Peel the trailing numeric run (RM + per-site yields + means) off
        # the right-hand side; significance markers are removed before the
        # numeric test.
        tokens = text.split()
        cut = len(tokens)
        while cut > 0 and _NUM_TOKEN.match(_SIG_RE.sub("", tokens[cut - 1])):
            cut -= 1
        lead, nums = tokens[:cut], tokens[cut:]
        # Need RM + >=1 site + mean, and at least variety + brand + type.
        if len(nums) < 3 or len(lead) < 3:
            continue

        # Locate the Type column; it must come after at least two tokens
        # (variety and brand), hence the search starts at index 2.
        for pos in range(2, len(lead)):
            after = _soy_type_at(lead, pos)
            if after is not None:
                break
        else:
            continue
        type_label = " ".join(lead[pos:after]).rstrip(",")
        treatment = " ".join(lead[after:])

        # Brand is a known phrase at the END of the variety+brand text.
        brand, variety = _soy_split_variety_brand(" ".join(lead[:pos]))
        if not brand or not variety:
            continue

        # nums[0] is RM (a 2.x-4.x decimal); the rest are yields.
        rm_val = _to_num(nums[0])
        yields = [_to_num(tok) for tok in nums[1:]]
        site_yields, means = yields[:n_sites], yields[n_sites:]

        metrics: dict[str, Any] = {}
        if rm_val is not None:
            metrics["RM"] = rm_val
        metrics.update(
            (f"Yield {code}", val)
            for code, val in zip(site_codes_in_header, site_yields)
            if val is not None)
        # Whatever follows the site columns is the regional mean(s); the
        # first one is the harvest-year mean and becomes the primary Yield.
        if means and means[0] is not None:
            metrics["Yield"] = means[0]
        if len(means) >= 2 and means[1] is not None:
            metrics["Yield 2yr Mean"] = means[1]
        if "Yield" not in metrics:
            # No regional mean column → use the best available site yield.
            observed = [val for val in site_yields if val is not None]
            if observed:
                metrics["Yield"] = max(observed)

        parsed.append({
            "brand": brand,
            "product": variety,
            "traits": type_label or None,
            "_seed_treatment": treatment or None,
            "metrics": metrics,
        })
    return parsed


def _soy_split_variety_brand(vb: str) -> tuple[str | None, str | None]:
    """Split combined "variety brand" text into ``(brand, variety)``.

    ``'30B4 Viking|Blue River'`` -> ``('Viking|Blue River', '30B4')``.
    The brand is looked up (case-insensitively, longest phrase first) as a
    known brand sitting at the END of ``vb``; a brand with nothing in front
    of it yields ``(brand, None)``. When no known brand matches, the first
    token is taken as the variety and the rest as the brand; fewer than two
    tokens gives ``(None, None)``.
    """
    lowered = vb.lower()
    for known in _SOY_BRANDS_SORTED:
        needle = known.lower()
        if lowered != needle and not lowered.endswith(" " + needle):
            continue
        head = vb[:len(vb) - len(known)].strip()
        return known, (head or None)
    # Best-effort fallback for brands missing from the known list.
    tokens = vb.split()
    if len(tokens) < 2:
        return None, None
    variety, *brand_words = tokens
    return " ".join(brand_words), variety


def build_soy_regions(text: str, year: int, pdf_url: str) -> list[SiteTrial]:
    """Parse the soy report text into per-region x maturity SiteTrials.

    Scans for ``TABLE n.`` / ``Table n:`` captions and keeps tables 3-8. For
    each, the ``Variety ... Brand`` column header is looked for within the
    five lines after the caption; the rows below it are parsed, validated
    and ranked. Tables are grouped under a key built from the region and the
    maturity named in the caption (``t<table no>`` when no maturity is
    named); a (brand, product) pair already present for a key is not added
    again. Groups with fewer than two valid rows are dropped.

    ``site`` is ``"<maturity> maturity"`` when the caption names a maturity
    and ``None`` otherwise.
    """
    lines = text.splitlines()
    site_codes = _soy_site_codes(text)  # {N1: 'Henry Co.', ...}
    caption_re = re.compile(r"^\s*(TABLE|Table)\s+(\d+)\s*[:\.]")

    out: dict[str, SiteTrial] = {}
    i = 0
    while i < len(lines):
        m = caption_re.match(lines[i])
        if not m:
            i += 1
            continue
        tbl_no = int(m.group(2))
        caption = lines[i].strip()
        # Only the regional yield tables (3-8) carry Variety/Brand rows.
        if tbl_no < 3 or tbl_no > 8:
            i += 1
            continue
        # Find the column header and the per-site codes it lists.
        header_idx = None
        for k in range(i + 1, min(i + 6, len(lines))):
            lk = lines[k].strip().lower()
            if lk.startswith("variety") and "brand" in lk:
                header_idx = k
                break
        if header_idx is None:
            i += 1
            continue
        codes_in_header = re.findall(r"\b([NCS]\d)\b", lines[header_idx])
        rows = _finalize_results(
            _soy_parse_table(lines, header_idx, codes_in_header))
        if len(rows) < 2:
            i = header_idx + 1
            continue
        region, maturity = _soy_region(caption)
        key = f"ocpt-soybeans-{year}-{region.lower()}-{maturity or 't'+str(tbl_no)}"
        st = out.get(key)
        if st is None:
            # Build a per-site mapping for the sidecar's site list.
            site_towns = "; ".join(
                f"{c}={site_codes.get(c, c)}" for c in codes_in_header)
            st = SiteTrial(
                source_key=key, crop="soybeans", year=year, region=region,
                # An empty maturity must give None, not the bare word
                # "maturity" (which is what stripping " maturity" produces).
                site=f"{maturity} maturity" if maturity else None,
                pdf_url=pdf_url, table_label=caption)
            st.cooperator = site_towns or None  # repurpose for site map note
            out[key] = st
        seen = {(r["brand"], r["product"]) for r in st.results}
        for r in rows:
            ident = (r["brand"], r["product"])
            if ident not in seen:
                st.results.append(r)
                seen.add(ident)
        i = header_idx + 1

    res = list(out.values())
    for st in res:
        # Re-rank after merging rows from several tables into one group.
        st.results = _finalize_results(st.results)
    return [st for st in res if len(st.results) >= 2]


# --------------------------------------------------------------------- sanity + rank


def _row_ok(r: dict) -> bool:
    """Return True when a parsed result row looks like a real trial entry.

    A row passes when its brand has more than one character and is not a
    bare number (a purely-numeric brand means a stat row leaked through),
    its product is non-blank, and its ``Yield`` metric is a number strictly
    between 1 and 400.
    """
    brand = (r.get("brand") or "").strip()
    if len(brand) < 2 or _NUM_TOKEN.match(brand):
        return False
    if not (r.get("product") or "").strip():
        return False
    yield_val = (r.get("metrics") or {}).get("Yield")
    return bool(isinstance(yield_val, (int, float)) and 1 < yield_val < 400)


def _finalize_results(results: list[dict]) -> list[dict]:
    """Drop junk rows, order by Yield (highest first), assign 1-based rank.

    Rows failing ``_row_ok`` are discarded; ties keep their input order. The
    surviving input dicts are updated in place (``rank`` set, the internal
    ``_seed_treatment`` key removed), and fresh dicts with the canonical key
    order ``rank, brand, product, traits, metrics`` are returned.
    """
    ranked = sorted(
        (row for row in results if _row_ok(row)),
        key=lambda row: float(row["metrics"]["Yield"]),
        reverse=True)
    for position, row in enumerate(ranked, start=1):
        row["rank"] = position
        row.pop("_seed_treatment", None)  # internal-only key
    final: list[dict] = []
    for row in ranked:
        final.append({
            "rank": row["rank"],
            "brand": row["brand"],
            "product": row["product"],
            "traits": row.get("traits"),
            "metrics": row["metrics"],
        })
    return final


# --------------------------------------------------------------------- render


def render_markdown(st: SiteTrial) -> str:
    """Render one site/region trial as a Markdown document.

    The output is a title line, a bullet list of trial metadata (optional
    fields appear only when set) and a results table with one row per entry
    of ``st.results``, in the order given.

    Metric columns are the union of the metric keys over ALL results, in
    first-seen order, so a column absent from the top-ranked row (e.g. a
    2-yr mean) is still rendered; rows lacking a metric show ``-``. Pipe
    characters inside table cells are backslash-escaped so values such as
    the brand ``Viking|Blue River`` do not split a cell.
    """
    def _cell(value: object) -> str:
        # A literal pipe would terminate the table cell early.
        return str(value).replace("|", "\\|")

    crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(
        st.crop, st.crop.title())
    where = ", ".join(filter(None, [st.site, "OH"]))
    out: list[str] = [
        f"# {crop_label} yield trial — {where} ({st.region}, {st.year})",
        "",
        f"- **Publisher:** {PUBLISHER_BRAND} (independent third-party,"
        f" all brands)",
        f"- **Vendor:** {VENDOR} (CFAES extension)",
        f"- **Crop:** {crop_label}",
        f"- **Year:** {st.year}",
        f"- **Region:** {st.region}",
    ]
    if st.site:
        out.append(f"- **Site:** {st.site}")
    if st.county:
        out.append(f"- **County:** {st.county}")
    if st.table_label:
        out.append(f"- **Source table:** {st.table_label}")
    if st.cooperator:
        out.append(f"- **Cooperator / sites:** {st.cooperator}")
    if st.soil_type:
        out.append(f"- **Soil type:** {st.soil_type}")
    if st.previous_crop:
        out.append(f"- **Previous crop:** {st.previous_crop}")
    if st.tillage:
        out.append(f"- **Tillage:** {st.tillage}")
    out += [f"- **Source PDF:** {st.pdf_url}",
            f"- **Note:** {TOS_NOTE}", "", "---", "",
            "## Results (by yield rank)", ""]

    # Union of metric keys across every row (dict keeps first-seen order).
    metric_keys = list(dict.fromkeys(
        key for r in st.results for key in (r.get("metrics") or {})))
    headers = (["Rank", "Brand", "Hybrid/Variety", "Traits"]
               + [_cell(k) for k in metric_keys])
    out.append("| " + " | ".join(headers) + " |")
    out.append("|" + "|".join(["---"] * len(headers)) + "|")
    for r in st.results:
        row = [_cell(r.get("rank", "-")), _cell(r.get("brand") or "-"),
               _cell(r.get("product") or "-"), _cell(r.get("traits") or "-")]
        metrics = r.get("metrics") or {}
        for key in metric_keys:
            value = metrics.get(key)
            row.append("-" if value is None else _cell(value))
        out.append("| " + " | ".join(row) + " |")
    out.append("")
    return "\n".join(out)


def write_trial(st: SiteTrial, body_md: str) -> None:
    """Write one trial to the corpus as ``<source_key>.md`` + ``.json``.

    The corpus directory is created if missing. The Markdown body is written
    verbatim; the JSON sidecar carries the trial's metadata, its results and
    a UTC ``fetched_at`` timestamp. Both files are UTF-8.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{st.source_key}.md"
    json_path = CORPUS_DIR / f"{st.source_key}.json"

    md_path.write_text(body_md, encoding="utf-8")

    results = st.results
    sidecar = {
        "source": "ohio_ocpt_trials",
        "source_key": st.source_key,
        "data_type": "trial",
        "vendor": VENDOR,
        "brand_aggregator": BRAND_AGGREGATOR,
        "brand": PUBLISHER_BRAND,
        "crop": st.crop,
        "state": "OH",
        "state_abbrev": "oh",
        "year": st.year,
        "region": st.region,
        "site": st.site,
        "cooperator": st.cooperator,
        "county": st.county,
        "soil_type": st.soil_type,
        "previous_crop": st.previous_crop,
        "tillage": st.tillage,
        "table_label": st.table_label,
        "results": results,
        "n_results": len(results),
        "tos_note": TOS_NOTE,
        "source_urls": [st.pdf_url],
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path.write_text(payload, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def _extract_pdf(http: RateLimitedSession, pdf_url: str
                 ) -> tuple[str, dict[str, dict[str, str]]]:
    """Download a report PDF and return ``(full_text, corn_footnotes_by_site)``.

    ``full_text`` is every page's extracted text joined with newlines (a
    page with no text contributes an empty string). The footnote map is only
    meaningful for corn; soy ignores it. A failure while building footnotes
    is logged and yields an empty map, since footnotes are enrichment only.
    HTTP errors from the download propagate to the caller.
    """
    response = http.get(pdf_url)
    response.raise_for_status()
    with pdfplumber.open(io.BytesIO(response.content)) as pdf:
        page_texts = [(page.extract_text() or "") for page in pdf.pages]
        text = "\n".join(page_texts)
        try:
            footnotes = build_corn_footnotes(pdf.pages)
        except Exception as exc:  # noqa: BLE001 — footnotes are enrichment
            log.warning("footnote extraction failed (%s): %s", pdf_url, exc)
            footnotes = {}
    return text, footnotes


def process_report(http: RateLimitedSession, *, crop: str, year: int,
                   force: bool, limit: int | None,
                   counts: dict) -> int:
    """Fetch, parse and write one crop/year report; return sidecars written.

    ``counts`` is updated in place: ``no_pdf`` when no report URL is found,
    ``failed`` when download/extraction raises, ``image_skip`` when the text
    layer is under 1000 characters, ``table_skip`` when nothing parses, and
    ``skipped`` / ``written`` per trial. Existing Markdown files are left
    alone unless ``force`` is set, and writing stops once
    ``counts["written"]`` reaches ``limit``.
    """
    pdf_url = discover_report_pdf(http, crop, year)
    if not pdf_url:
        counts["no_pdf"] += 1
        return 0

    try:
        text, footnotes = _extract_pdf(http, pdf_url)
    except Exception as exc:  # noqa: BLE001
        log.error("%s %d PDF fetch/parse failed: %s", crop, year, exc)
        counts["failed"] += 1
        return 0

    if not text or len(text) < 1000:
        log.warning("%s %d: PDF text layer too thin (%d chars) — skip",
                    crop, year, len(text))
        counts["image_skip"] += 1
        return 0

    # Clear the module-level site-line cache before parsing this report.
    _BLOCK_SITELINE.clear()
    trials = (
        build_corn_sites(text, year, pdf_url, footnotes_by_site=footnotes)
        if crop == "corn"
        else build_soy_regions(text, year, pdf_url))

    if not trials:
        log.warning("%s %d: no per-site tables parsed cleanly — skip",
                    crop, year)
        counts["table_skip"] += 1
        return 0

    n_written = 0
    for trial in trials:
        # The limit is global across reports, hence the shared counter.
        if limit is not None and counts["written"] >= limit:
            break
        target = CORPUS_DIR / f"{trial.source_key}.md"
        if target.exists() and not force:
            counts["skipped"] += 1
            continue
        write_trial(trial, render_markdown(trial))
        counts["written"] += 1
        n_written += 1
        brand_names = sorted({row["brand"] for row in trial.results})
        more = "…" if len(brand_names) > 4 else ""
        log.info("%s | %d results | %d brands (%s%s)",
                 trial.source_key, len(trial.results), len(brand_names),
                 ", ".join(brand_names[:4]), more)

    log.info("%s %d: %d sidecars written (%d candidate tables)",
             crop, year, n_written, len(trials))
    return n_written


def run(*, crops: list[str], years: list[int], limit: int | None,
        force: bool) -> int:
    """Process every crop x year report and log a summary; returns 0.

    Reports are visited crop by crop, year by year. Once ``limit`` sidecars
    have been written no further report is processed.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    session = RateLimitedSession()
    tally_keys = ("written", "skipped", "no_pdf", "failed",
                  "image_skip", "table_skip")
    counts = {name: 0 for name in tally_keys}

    for crop, year in ((c, y) for c in crops for y in years):
        if limit is not None and counts["written"] >= limit:
            break
        process_report(session, crop=crop, year=year, force=force,
                       limit=limit, counts=counts)

    log.info("done: written=%d skipped=%d no_pdf=%d failed=%d "
             "image_skip=%d table_skip=%d",
             *(counts[name] for name in tally_keys))
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for this scraper.

    Options: ``--year``, ``--include-old``, ``--limit``, ``--force``,
    ``--crop`` (corn or soybeans) and ``--log-level`` (defaulting to the
    ``LOG_LEVEL`` environment variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.ohio_ocpt_trials",
        description=("Scrape the Ohio Corn/Soybean Performance Test report "
                     "PDFs into per-site cross-vendor yield trials."))
    # (flag, add_argument keyword arguments), in help-output order.
    option_specs = (
        ("--year", {
            "type": int, "default": None,
            "help": "Scrape a single year (default: 2024+2025 baseline)."}),
        ("--include-old", {
            "action": "store_true",
            "help": "Also scrape the u.osu.edu archive years (2018–2023)."}),
        ("--limit", {
            "type": int, "default": None,
            "help": "Stop after writing N sidecars (default: all)."}),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists."}),
        ("--crop", {
            "default": None, "choices": ("corn", "soybeans"),
            "help": "Limit to one crop (default: both)."}),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO")}),
    )
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the scrape.

    ``argv`` defaults to the process arguments (argparse behaviour for
    ``None``). Logging goes to stderr. The years scraped are the single
    ``--year`` if given, else the archive plus baseline years with
    ``--include-old``, else the baseline years only. Returns ``run``'s
    exit code.
    """
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr)

    crops = ["corn", "soybeans"] if not args.crop else [args.crop]

    if args.year is not None:
        years = [args.year]
    else:
        years = (OLD_YEARS + BASELINE_YEARS if args.include_old
                 else BASELINE_YEARS)

    return run(crops=crops, years=years, limit=args.limit, force=args.force)


# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())