seed-mcp/scrape/sources/iowa_icpt_trials.py

"""Iowa Crop Performance Tests (ICPT) — cross-vendor yield trials.

Iowa State University / the Iowa Crop Improvement Association run the
**Iowa Crop Performance Tests**, an independent, third-party variety
trial program. Because the trial is publisher-neutral, a single
district table ranks EVERY brand head-to-head — Pioneer, DEKALB,
Brevant, NuTech, Renk, Legacy, Epley Brothers, etc. — on identical
plots. That makes it the highest-trust ``data_type: "trial"`` source
in the corpus: unlike the vendor plot reports (Golden Harvest, LG,
AgriGold, ProHarvest), no seed company controls the entry list or the
agronomy, so there's no home-brand bias.

Site shape (ASP.NET, server-rendered GridView tables — requests +
BeautifulSoup, no JS / headless browser needed):

  Corn:    https://www.croptesting.iastate.edu/corn/CornDistrict2.aspx
  Soybean: https://www.croptesting.iastate.edu/Soybean/SoybeanDistrict2.aspx

``...District2.aspx`` is the ONLY live district URL — the district
(North / Central / South) is chosen *on that same page* via a
``radLstDistrict`` radio (1/2/3) ASP.NET **postback**, NOT a separate
URL (CornDistrict1/3.aspx 302-redirect away). Likewise the year
(``cmbYear`` dropdown, 2025→2014) and the maturity season
(``radListSeason``: 1=Early, 2=Full) are postbacks — there are no
stable GET URLs for them. So we GET the page once to harvest the
ASP.NET hidden fields (``__VIEWSTATE`` / ``__VIEWSTATEGENERATOR`` /
``__VIEWSTATEENCRYPTED``), then POST the form with the desired
year/district/season + ``btnFilter=Filter`` to drive the view.
``CornDistrict.aspx`` (no number) is the 2013-and-older legacy page —
out of scope.

A district table is a multi-site aggregate: the GridView carries the
district-wide Yield plus a West/East sub-region split (Wyld/Eyld) and a
per-site yield column for each cooperator location in the district.
That makes **one district × season × year table the natural document
granularity** — one sidecar per ``(crop, year, district, season)``.

GridView column → field map:
  corn:    Company | Entry | RM | Herb Tech | Trait Package |
           Yield | Yldp | Moist | Wyld | Eyld | <site> ...
  soybean: Company | Entry | MG | Herb Tech |
           Yield | WestYield | EastYield | <site> ...
  Company       -> result.brand   (the seed COMPANY — critical)
  Entry         -> result.product (variety / hybrid code)
  Herb Tech +
    Trait Package -> result.traits
  everything else (RM/MG, Yield, Yldp, Moist, Wyld/Eyld, per-site)
                  -> result.metrics  ("Yield" kept verbatim as the
                     primary key the chunker's top-N picker reads)
Rows are pre-sorted by Yield DESC on the page; we re-sort defensively
and assign rank ourselves (the table has no rank column).

We emit the SAME sidecar shape as ``agrigold_plot_reports`` /
``lg_plot_reports`` / ``gh_plot_reports`` / ``proharvest_plots``
(``results: [{rank, brand, product, traits, metrics}]``). The trial
chunker's source dispatch doesn't list ``iowa_icpt_trials`` explicitly,
so it FALLS THROUGH to the shared ``_render_gh_plot_chunk`` renderer —
no ``rag/chunk.py`` edit required.

Output:
  corpus/iowa_icpt_trials/<source_key>.md      LLM-visible body
  corpus/iowa_icpt_trials/<source_key>.json    sidecar metadata

source_key: ``icpt-<crop>-<year>-<district>-<season>`` (the season
  suffix is always present),
  e.g. ``icpt-corn-2025-north-early``, ``icpt-soybeans-2024-south-full``.

Scope: 2024 + 2025 baseline. ``--include-old`` walks 2014–2023.

robots/ToS: no robots.txt (the ASP.NET app 404s it); footer
"Copyright (c) 1995-2016 Iowa State University ... All rights reserved"
carries no automation clause. Public land-grant ICPT data, polite UA,
low request rate. (See ``tos_note`` in the sidecar.)

CLI:
  python -m scrape.sources.iowa_icpt_trials --limit 4
  python -m scrape.sources.iowa_icpt_trials --crop corn --year 2025
  python -m scrape.sources.iowa_icpt_trials --include-old --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Written into every sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root; the per-crop district page URLs in CROPS are built from it.
BASE = "https://www.croptesting.iastate.edu"

# Third ancestor directory of this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# The CORPUS_ROOT environment variable wins when set and non-empty;
# otherwise the corpus lives under REPO_ROOT / "corpus".
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this source's .md / .json pairs.
CORPUS_DIR = CORPUS_ROOT / "iowa_icpt_trials"

# Default minimum spacing (seconds) between requests.
REQ_INTERVAL_SEC = 2.0  # land-grant box; be polite, single-threaded

# Years scraped by default.
BASELINE_YEARS = [2024, 2025]
# Additional years scraped only with --include-old.
OLD_YEARS = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

# Written into every sidecar as "tos_note".
TOS_NOTE = (
    "Footer 'Copyright (c) ...ISU...All rights reserved' (no automation "
    "clause, no robots.txt); public ICPT data; low request rate; attribute "
    "Iowa Crop Performance Tests / ISU."
)

# crop -> (district-results page URL, RM/MG header label)
CROPS: dict[str, tuple[str, str]] = {
    "corn":     (f"{BASE}/corn/CornDistrict2.aspx", "RM"),
    "soybeans": (f"{BASE}/Soybean/SoybeanDistrict2.aspx", "MG"),
}

# radLstDistrict radio value -> (slug, label)
DISTRICTS: dict[str, tuple[str, str]] = {
    "1": ("north", "North"),
    "2": ("central", "Central"),
    "3": ("south", "South"),
}
# radListSeason radio value -> (slug, label)
SEASONS: dict[str, tuple[str, str]] = {
    "1": ("early", "Early Season"),
    "2": ("full", "Full Season"),
}

# ASP.NET control names (form field names posted back by fetch_view).
C_YEAR = "ctl00$MainContent$cmbYear"
C_DISTRICT = "ctl00$MainContent$radLstDistrict"
C_SEASON = "ctl00$MainContent$radListSeason"
C_SHOW = "ctl00$MainContent$radLstShowOptions"
C_FILTER = "ctl00$MainContent$btnFilter"

# GridView header labels that are NOT metric columns. They are compared
# against headers after _norm(), hence lower-case with single spaces.
BRAND_COL = "company"
PRODUCT_COL = "entry"
TRAIT_COLS = {"herb tech", "trait package"}

# Module-level logger used by every function in this file.
log = logging.getLogger("scrape.iowa_icpt_trials")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Single-threaded, rate-limited wrapper around ``requests.Session``.

    Consecutive requests are spaced at least ``interval`` seconds apart,
    and transient failures (network errors, HTTP 429, HTTP 5xx) are
    retried with jittered exponential backoff. The ASP.NET viewstate
    flow is inherently sequential per page, so no global lock is needed.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() stamp of the most recent request slot.
        self._last = 0.0

    def _wait(self) -> None:
        """Block until ``interval`` seconds have passed since the last slot."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Jittered exponential delay for 0-based *attempt*, capped at 30 s."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 45.0, **kw: Any) -> requests.Response:
        """Send one request, retrying transient failures.

        ``max_retries`` is the total number of attempts; values below 1
        are treated as 1 so the call always performs a request. Extra
        keyword arguments are forwarded to ``requests.Session.request``.

        Returns the first response whose status is neither 429 nor 5xx.
        If every attempt is used up, the outcome of the FINAL attempt
        decides the result: its response is returned as-is (callers
        apply ``raise_for_status()``), or its ``requests.RequestException``
        is re-raised.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final:
                    # Nothing left to retry: fail now instead of sleeping
                    # through a backoff that no attempt would follow.
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if final or not retryable:
                return resp
            # Honour a numeric Retry-After; an HTTP-date (or anything else
            # float() cannot parse) falls back to exponential backoff.
            ra = (resp.headers.get("Retry-After") or "").strip()
            numeric = ra.isascii() and ra.isdigit()
            backoff = float(ra) if numeric else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # The loop always returns or raises on its final attempt.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Rate-limited, retried GET; see :meth:`request`."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Rate-limited, retried POST; see :meth:`request`."""
        return self.request("POST", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class TrialResult:
    """One entry (table row) of a district results table."""

    # 1-based position after the Yield-descending sort; None until the
    # parser assigns it.
    rank: int | None = None
    # Text of the "Company" column.
    brand: str = ""
    # Text of the "Entry" column.
    product: str = ""
    # "Herb Tech" / "Trait Package" cell texts joined by a single space.
    traits: str = ""
    # Every remaining column, keyed by its header text as shown on the page.
    # Numeric cells are stored as numbers, other non-empty cells as text.
    metrics: dict[str, float | str | None] = field(default_factory=dict)


@dataclass
class DistrictTrial:
    """One parsed district table: a single (crop, year, district, season)."""

    # Basename (without extension) of the .md / .json pair in the corpus.
    source_key: str
    # URL of the district page the table was fetched from.
    source_url: str
    crop: str            # "corn" / "soybeans"
    year: int
    district_slug: str   # north / central / south
    district_label: str  # North / Central / South
    season_slug: str     # early / full
    season_label: str    # Early Season / Full Season
    sites: list[str] = field(default_factory=list)   # cooperator locations
    # Value of the "Experiment Mean" summary row; None when not found.
    experiment_mean: float | None = None
    # Entries in rank order (as returned by the parser).
    results: list[TrialResult] = field(default_factory=list)


# --------------------------------------------------------------------- parse


def _hidden_fields(soup: BeautifulSoup) -> dict[str, str]:
    """Map ``name -> value`` for every named hidden ``<input>`` in *soup*.

    Inputs without a name are ignored; a missing or empty ``value``
    becomes ``""``. When a name repeats, the last occurrence wins.
    """
    return {
        field_name: tag.get("value") or ""
        for tag in soup.find_all("input", {"type": "hidden"})
        if (field_name := tag.get("name"))
    }


_NUM_RE = re.compile(r"^-?\d+(?:\.\d+)?$")


def _to_num(s: str | None) -> float | int | None:
    s = (s or "").strip()
    if not s or s == "-" or not _NUM_RE.match(s):
        return None
    f = float(s)
    return int(f) if f.is_integer() else f


def _norm(s: str) -> str:
    """Return *s* trimmed, lower-cased, with whitespace runs collapsed.

    A falsy *s* (``""`` or ``None``) normalises to ``""``.
    """
    trimmed = (s or "").strip()
    collapsed = re.sub(r"\s+", " ", trimmed)
    return collapsed.lower()


def _grid_rows(soup: BeautifulSoup, table_id: str) -> list[list[str]]:
    """Return the cell texts of each row of the table with id *table_id*.

    Both ``<th>`` and ``<td>`` cells are collected, so the header row
    comes back as an ordinary row. Rows without any cell are dropped.
    A missing table yields ``[]``.
    """
    grid = soup.find("table", {"id": table_id})
    if grid is None:
        return []
    per_row = (
        [cell.get_text(" ", strip=True) for cell in tr.find_all(["th", "td"])]
        for tr in grid.find_all("tr")
    )
    return [texts for texts in per_row if texts]


def _experiment_mean(soup: BeautifulSoup) -> float | None:
    """Return the 'Experiment Mean' value from the summary GridView.

    Scans ``MainContent_gvDataSummary`` for the first row whose first
    cell starts with "experiment mean" (case-insensitive) and parses
    that row's second cell. Returns ``None`` when no such row exists or
    the cell is not numeric.
    """
    summary = _grid_rows(soup, "MainContent_gvDataSummary")
    hit = next(
        (
            row for row in summary
            if len(row) > 1 and _norm(row[0]).startswith("experiment mean")
        ),
        None,
    )
    if hit is None:
        return None
    return _to_num(hit[1])  # type: ignore[return-value]


def parse_district_table(
    soup: BeautifulSoup,
    *,
    rm_mg_label: str,
) -> tuple[list[TrialResult], list[str], float | None]:
    """Turn the ``MainContent_gvData`` GridView into ranked results.

    Returns ``(results, site_columns, experiment_mean)``. A table with
    fewer than two rows (no header + data) yields ``([], [], None)``.
    The page has no rank column and is merely *expected* to be sorted,
    so surviving rows are sorted by Yield descending here and numbered
    from 1.
    """
    grid = _grid_rows(soup, "MainContent_gvData")
    if len(grid) < 2:
        return [], [], None

    header, body = grid[0], grid[1:]
    width = len(header)
    keys = [_norm(label) for label in header]

    def first_index(wanted: str) -> int | None:
        # Position of the first header equal to *wanted*, if any.
        return next((pos for pos, key in enumerate(keys) if key == wanted), None)

    # Structural (non-metric) columns.
    brand_at = first_index(BRAND_COL)
    product_at = first_index(PRODUCT_COL)
    trait_at = [pos for pos, key in enumerate(keys) if key in TRAIT_COLS]
    structural = {brand_at, product_at, *trait_at}

    # A per-site (cooperator-location) column is any non-structural
    # column whose header is not one of the known metric labels.
    known_metrics = {
        _norm(rm_mg_label), "yield", "yldp", "yield pct", "yield %",
        "moist", "wyld", "eyld", "westyield", "eastyield",
    }
    sites: list[str] = [
        header[pos]
        for pos, key in enumerate(keys)
        if pos not in structural and key and key not in known_metrics
    ]

    # Every non-structural column with a header is a metric (sites included).
    metric_cols = [
        (header[pos], pos)
        for pos in range(width)
        if pos not in structural and header[pos]
    ]

    def text_at(cells: list[str], pos: int | None) -> str:
        # Stripped cell text, or "" for an absent / out-of-range column.
        if pos is None or not 0 <= pos < len(cells):
            return ""
        return cells[pos].strip()

    results: list[TrialResult] = []
    for raw in body:
        # Pad short rows to header width; cells beyond it are never read.
        cells = raw + [""] * (width - len(raw))

        trait_bits = [text_at(cells, pos) for pos in trait_at]
        traits = " ".join(
            bit for bit in trait_bits if bit and _norm(bit) != "none"
        ).strip()

        metrics: dict[str, float | str | None] = {}
        for label, pos in metric_cols:
            text = text_at(cells, pos)
            number = _to_num(text)
            if number is not None:
                metrics[label] = number
            elif text and text != "-":
                metrics[label] = text
            # Blank / "-" cells are left out of the dict entirely.

        entry = TrialResult(
            brand=text_at(cells, brand_at),
            product=text_at(cells, product_at),
            traits=traits,
            metrics=metrics,
        )
        if _row_ok(entry):
            results.append(entry)

    def by_yield_desc(entry: TrialResult) -> tuple[int, float]:
        # Numeric yields first (largest first); anything else sinks.
        y = entry.metrics.get("Yield")
        return (0, -float(y)) if isinstance(y, (int, float)) else (1, 0.0)

    results.sort(key=by_yield_desc)
    for position, entry in enumerate(results, start=1):
        entry.rank = position

    return results, sites, _experiment_mean(soup)


def _row_ok(r: TrialResult) -> bool:
    """Per-row sanity gate for a parsed entry.

    Accepts a row only when it has a non-blank, non-numeric brand that
    is not a summary label, a non-blank product, and a numeric "Yield"
    strictly between 10 and 400. This drops summary / blank rows and
    any leaked aggregate line.
    """
    company = (r.brand or "").strip()
    entry_code = (r.product or "").strip()
    if company == "" or company.isdigit():
        return False
    aggregate_labels = ("summary", "experiment mean", "minimum mean",
                        "maximum mean", "lsd", "coefficient of variability")
    if _norm(company) in aggregate_labels:
        return False
    if entry_code == "":
        return False
    # Corn runs ~120-280 bu/a, soy ~30-90; the window is deliberately
    # generous but still rejects garbage or a moisture / RM value that
    # leaked into the Yield slot.
    yield_val = r.metrics.get("Yield")
    if not isinstance(yield_val, (int, float)):
        return False
    return 10 < float(yield_val) < 400


# --------------------------------------------------------------------- fetch


def source_key_for(crop: str, year: int, district_slug: str, season_slug: str) -> str:
    """Build the corpus key ``icpt-<crop>-<year>-<district>-<season>``."""
    parts = (crop, year, district_slug, season_slug)
    return "icpt-" + "-".join(format(part) for part in parts)


def fetch_view(
    http: RateLimitedSession,
    *,
    crop: str,
    year: int,
    district: str,   # radio value "1"/"2"/"3"
    season: str,     # radio value "1"/"2"
) -> DistrictTrial | None:
    """Fetch and parse one (crop, year, district, season) district table.

    The page is first GET-ed to harvest its hidden form fields, which
    are then POST-ed back together with the requested year / district /
    season and the Filter button. Returns the parsed ``DistrictTrial``,
    or ``None`` when the resulting table has no usable entries. HTTP
    error statuses raise via ``raise_for_status()``.
    """
    url, rm_mg_label = CROPS[crop]
    district_slug, district_label = DISTRICTS[district]
    season_slug, season_label = SEASONS[season]

    landing = http.get(url)
    landing.raise_for_status()

    # Hidden fields first, then our selections (which win on a name clash).
    form = {
        **_hidden_fields(BeautifulSoup(landing.text, "html.parser")),
        C_YEAR: str(year),
        C_DISTRICT: district,
        C_SEASON: season,
        # yield view carries Yield/Yldp/Moist + per-SITE yields
        C_SHOW: "yield",
        C_FILTER: "Filter",
    }

    filtered = http.post(url, data=form)
    filtered.raise_for_status()
    page = BeautifulSoup(filtered.text, "html.parser")

    entries, site_names, mean = parse_district_table(page, rm_mg_label=rm_mg_label)
    if not entries:
        return None

    return DistrictTrial(
        source_key=source_key_for(crop, year, district_slug, season_slug),
        source_url=url,
        crop=crop,
        year=year,
        district_slug=district_slug,
        district_label=district_label,
        season_slug=season_slug,
        season_label=season_label,
        sites=site_names,
        experiment_mean=mean,
        results=entries,
    )


# --------------------------------------------------------------------- render


def render_markdown(t: DistrictTrial) -> str:
    """Render a district trial as the LLM-visible Markdown body.

    The document is a bullet-list header (source, crop, district,
    season, year, optional experiment mean and cooperator sites, URL),
    a horizontal rule, one results table with a row per entry, and a
    compact "Top 5 by Yield" line. The returned text ends with a newline.

    The table's metric columns are the union of the metric keys of ALL
    results, in first-seen order; a row lacking a metric shows ``-``.
    """
    crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(t.crop, t.crop.title())
    head: list[str] = [
        f"# {crop_label} yield trial — Iowa {t.district_label} District "
        f"({t.season_label}), {t.year}",
        "",
        "- **Source:** Iowa Crop Performance Tests (independent third-party trial)",
        "- **Publisher:** Iowa State University / Iowa Crop Improvement Association",
        f"- **Crop:** {crop_label}",
        "- **State:** IA",
        f"- **District:** {t.district_label}",
        f"- **Maturity season:** {t.season_label}",
        f"- **Year:** {t.year}",
    ]
    if t.experiment_mean is not None:
        head.append(f"- **Experiment mean yield:** {t.experiment_mean} bu/a")
    if t.sites:
        head.append(f"- **Cooperator sites:** {', '.join(t.sites)}")
    head += [f"- **URL:** {t.source_url}", "", "---", ""]

    # Collect metric columns from every row, not just the first one: a
    # row's metrics dict simply lacks the key for a blank cell, so using
    # only the first row's keys would hide any column that row is missing.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    metric_keys = list(dict.fromkeys(k for r in t.results for k in r.metrics))

    sections: list[str] = ["## Results (by yield, all brands)", ""]
    headers = ["Rank", "Company", "Entry", "Traits"] + metric_keys
    sections.append("| " + " | ".join(headers) + " |")
    sections.append("|" + "|".join(["---"] * len(headers)) + "|")
    for r in t.results:
        row = [
            str(r.rank) if r.rank is not None else "-",
            r.brand or "-",
            r.product or "-",
            r.traits or "-",
        ]
        for k in metric_keys:
            v = r.metrics.get(k)
            row.append("-" if v is None else str(v))
        sections.append("| " + " | ".join(row) + " |")
    sections.append("")

    # Compact top-5 line for embedder signal.
    top = [r for r in t.results if isinstance(r.metrics.get("Yield"), (int, float))][:5]
    if top:
        bits = [f"{r.product} ({r.brand}) {r.metrics['Yield']}" for r in top]
        sections.append("Top 5 by Yield: " + ", ".join(bits) + ".")
        sections.append("")

    # Join head and sections as ONE list so the blank line that closes the
    # header survives between the rule and the Results heading.
    return "\n".join(head + sections)


# --------------------------------------------------------------------- write


def _write_text_atomic(path: Path, text: str) -> None:
    """Write *text* (UTF-8) to *path* via a sibling temp file and a rename,
    so *path* only ever appears with its complete content."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    tmp.replace(path)


def write_trial(t: DistrictTrial, body_md: str) -> None:
    """Persist one trial as ``<source_key>.json`` + ``<source_key>.md``.

    The pipeline treats an existing ``.md`` file as "already scraped",
    so the sidecar is written FIRST and the Markdown LAST, each through
    a temp-file rename. An interrupted run therefore cannot leave a
    Markdown file that is truncated or has no sidecar next to it.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{t.source_key}.md"
    json_path = CORPUS_DIR / f"{t.source_key}.json"

    sidecar = {
        "source": "iowa_icpt_trials",
        "source_key": t.source_key,
        "data_type": "trial",
        "vendor": "Iowa State University",
        "brand_aggregator": "Iowa Crop Performance Tests publishes",
        "brand": "Iowa Crop Performance Tests",
        "crop": t.crop,
        "state": "IA",
        "state_abbrev": "ia",
        "year": t.year,
        "region": f"District {t.district_label}",
        "district": t.district_label,
        "season": t.season_label,
        "cooperator_sites": t.sites,
        "experiment_mean_yield": t.experiment_mean,
        "results": [
            {
                "rank": r.rank,
                "brand": r.brand,
                "product": r.product,
                "traits": r.traits,
                "metrics": r.metrics,
            }
            for r in t.results
        ],
        "n_results": len(t.results),
        "source_urls": [t.source_url],
        "tos_note": TOS_NOTE,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    # Markdown goes last: its presence is the "done" marker for this key.
    _write_text_atomic(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def run(
    *,
    crops: set[str],
    years: list[int],
    limit: int | None,
    force: bool,
) -> int:
    """Scrape every (crop, year, district, season) combination.

    Targets whose Markdown file already exists are skipped unless
    *force* is set. *limit* caps the number of targets actually
    attempted (skips do not count). A failed fetch is logged and the
    run moves on. Returns 0 when no fetch failed, otherwise 1.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    written = skipped = empty = failed = processed = 0

    targets: list[tuple[str, int, str, str]] = [
        (crop, year, district, season)
        for crop in sorted(crops)
        for year in years
        for district in DISTRICTS        # 1/2/3
        for season in SEASONS            # 1/2
    ]

    log.info("planned %d (crop x year x district x season) targets", len(targets))

    for crop, year, district, season in targets:
        if limit is not None and processed >= limit:
            break
        sk = source_key_for(crop, year, DISTRICTS[district][0], SEASONS[season][0])
        if not force and (CORPUS_DIR / f"{sk}.md").exists():
            skipped += 1
            continue
        processed += 1
        try:
            trial = fetch_view(http, crop=crop, year=year,
                               district=district, season=season)
        except Exception as exc:  # noqa: BLE001
            failed += 1
            log.error("[%s] fetch failed: %s", sk, exc)
            continue
        if trial is None:
            empty += 1
            log.info("[%s] empty table (no entries) — skipping", sk)
            continue
        write_trial(trial, render_markdown(trial))
        written += 1
        brands = {r.brand for r in trial.results}
        log.info("[%s] written | %d entries | %d sites | brands=%d",
                 sk, len(trial.results), len(trial.sites), len(brands))

    log.info("done: written=%d skipped=%d empty=%d failed=%d (processed=%d)",
             written, skipped, empty, failed, processed)
    return 1 if failed else 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper.

    Options: ``--year``, ``--include-old``, ``--crop``, ``--limit``,
    ``--force`` and ``--log-level`` (default taken from the LOG_LEVEL
    environment variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.iowa_icpt_trials",
        description="Scrape Iowa Crop Performance Tests (ICPT) cross-vendor "
                    "yield trials (corn + soybean district tables).",
    )
    all_years = tuple(BASELINE_YEARS + OLD_YEARS)
    crop_names = tuple(CROPS.keys())

    # (flag, add_argument keyword arguments), in help-output order.
    option_specs: list[tuple[str, dict[str, Any]]] = [
        ("--year", {
            "type": int, "default": None, "choices": all_years,
            "help": "Limit to a single year (default: 2024+2025 baseline).",
        }),
        ("--include-old", {
            "action": "store_true",
            "help": "Also scrape 2014-2023 (deferred by default).",
        }),
        ("--crop", {
            "default": None, "choices": crop_names,
            "help": "Limit to one crop (default: both).",
        }),
        ("--limit", {
            "type": int, "default": None,
            "help": "Stop after writing/processing N tables (default: all).",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists.",
        }),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO"),
        }),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, configure logging, run the scrape.

    Year selection: an explicit ``--year`` wins; otherwise
    ``--include-old`` selects old + baseline years (sorted); otherwise
    only the baseline years are used. Returns the exit code of ``run``.
    """
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )

    selected_crops = set(CROPS.keys()) if not args.crop else {args.crop}

    if args.year is not None:
        selected_years = [args.year]
    else:
        selected_years = list(BASELINE_YEARS)
        if args.include_old:
            selected_years = sorted(set(OLD_YEARS + BASELINE_YEARS))

    return run(
        crops=selected_crops,
        years=selected_years,
        limit=args.limit,
        force=args.force,
    )


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())