# seed-mcp/scrape/sources/lg_plot_reports.py

"""LG Seeds plot-report scraper — cross-vendor yield trials.

LG Seeds publishes its plot data via a JSON XHR API behind
``lgseeds.com/performance/{crop}``. Each plot is a head-to-head
yield trial at a specific state/year/site, with the **top 5
performing hybrids** shown — sometimes all LG products, sometimes a
mix of LG + competitors (e.g. "212-02VT2PRIB" from another brand
shows up as a competitor entry with no productId).

This is the SECOND ``data_type: "trial"`` source in the corpus
after ``gh_plot_reports`` — same shape (per-site cross-vendor
yield), different vendor (AgReliant Genetics / LG Seeds).

Sources:
  POST /performance/{crop}/GetPlots/   body={seedType,traits:[],products:[]}
    → list of {id, lat, lng, year, avg, ...} (sparse; state=null until detail)
  GET  /performance/{crop}/GetPlotData/?PlotId=<id>&IsSilage=<bool>
    → {id, lat, lng, avg, year, city, state, salesperson, cooperator,
       plantingDate, harvestDate, top: [{name, avg, productId, crop, variety}]}

Crops supported by LG's portal: corn, soybeans, sorghum, silage.
Plot counts (across all years 2023-2025):

  Crop      2023   2024   2025  Total
  Corn       553    487    476  1,516
  Soybeans   198    134    153    485
  Sorghum      9      6      4     19
  Silage      29     26     24     79

Initial scrape: 2024+2025 (matching gh_plot_reports baseline).
2023 is older but still informative; it is skipped by default and
can be opted into with ``--include-2023``.

Output:
  corpus/lg_plot_reports/<source_key>.md      LLM-visible body
  corpus/lg_plot_reports/<source_key>.json    sidecar metadata

source_key convention: ``lgpr-<crop>-<year>-<plot_id>``
e.g. ``lgpr-corn-2025-146257``. State isn't in the source_key
because the discovery endpoint returns ``state: null`` — it's only
filled in by the detail call. The full state name lives in the
sidecar so filters work.

CLI:
  python -m scrape.sources.lg_plot_reports --limit 5
  python -m scrape.sources.lg_plot_reports --crop corn --year 2025
  python -m scrape.sources.lg_plot_reports --include-2023 --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests

# Version string recorded in every sidecar's ``scraper_version`` field.
SCRAPER_VERSION = "0.1.0"
# User-Agent header installed on every RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Note: lgseeds.com 301-redirects www→apex for the POST endpoints,
# which curl follows but turns the POST into a GET. Hit the apex
# host directly.
BASE = "https://lgseeds.com"

# Grandparent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus location: the CORPUS_ROOT environment variable when set and
# non-empty, otherwise ``<REPO_ROOT>/corpus``.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this scraper's ``.md`` / ``.json`` pairs.
CORPUS_DIR = CORPUS_ROOT / "lg_plot_reports"

# Default minimum spacing, in seconds, between consecutive requests.
REQ_INTERVAL_SEC = 0.25
# Default size of the worker thread pool used by ``run``.
DEFAULT_WORKERS = 4

log = logging.getLogger("scrape.lg_plot_reports")

# Crop → (URL segment, seedType payload, IsSilage flag for detail call)
# The keys are also the accepted values of the ``--crop`` CLI option and
# the crop component of each source_key.
CROPS: dict[str, tuple[str, str, bool]] = {
    "corn":     ("corn",     "Corn",     False),
    "soybeans": ("soybeans", "Soybeans", False),
    "sorghum":  ("sorghum",  "Sorghum",  False),
    "silage":   ("silage",   "Silage",   True),
}

# State abbrev → full name. Used for sidecar normalization when
# detail returns a 2-letter state code.
# Keys are lower-case because the lookup key is lower-cased before use.
# Covers the 50 US states only (no DC or territories).
STATE_NAMES = {
    "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas",
    "ca": "California", "co": "Colorado", "ct": "Connecticut",
    "de": "Delaware", "fl": "Florida", "ga": "Georgia", "hi": "Hawaii",
    "id": "Idaho", "il": "Illinois", "in": "Indiana", "ia": "Iowa",
    "ks": "Kansas", "ky": "Kentucky", "la": "Louisiana", "me": "Maine",
    "md": "Maryland", "ma": "Massachusetts", "mi": "Michigan",
    "mn": "Minnesota", "ms": "Mississippi", "mo": "Missouri",
    "mt": "Montana", "ne": "Nebraska", "nv": "Nevada", "nh": "New Hampshire",
    "nj": "New Jersey", "nm": "New Mexico", "ny": "New York",
    "nc": "North Carolina", "nd": "North Dakota", "oh": "Ohio",
    "ok": "Oklahoma", "or": "Oregon", "pa": "Pennsylvania",
    "ri": "Rhode Island", "sc": "South Carolina", "sd": "South Dakota",
    "tn": "Tennessee", "tx": "Texas", "ut": "Utah", "vt": "Vermont",
    "va": "Virginia", "wa": "Washington", "wv": "West Virginia",
    "wi": "Wisconsin", "wy": "Wyoming",
}


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Rate-limited wrapper around a ``requests.Session``.

    Same primitive as gh_plot_reports. The pacing state (``_lock``,
    ``_last_global``, ``_global_interval``) lives on the class, so every
    instance — and therefore every worker thread — shares one request
    cadence. Each instance owns its own ``requests.Session``.

    Network errors and HTTP 429/5xx responses are retried with backoff.
    """

    _lock = threading.Lock()
    _last_global: float = 0.0
    _global_interval: float = REQ_INTERVAL_SEC

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and register ``interval``.

        The shared interval is only ever raised, never lowered: the
        slowest cadence any instance asked for wins.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        # XHR + JSON headers — required because the LG endpoints
        # return the whole HTML page if not flagged as XHR.
        self.s.headers["Accept"] = "application/json, text/plain, */*"
        self.s.headers["X-Requested-With"] = "XMLHttpRequest"
        with RateLimitedSession._lock:
            if interval > RateLimitedSession._global_interval:
                RateLimitedSession._global_interval = interval

    def _wait(self) -> None:
        """Block until at least one interval has passed since the last request.

        The sleep happens while the lock is held, so concurrent callers
        queue behind it and are released one interval apart.
        """
        with RateLimitedSession._lock:
            delta = time.monotonic() - RateLimitedSession._last_global
            if delta < RateLimitedSession._global_interval:
                time.sleep(RateLimitedSession._global_interval - delta)
            RateLimitedSession._last_global = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff with jitter, capped at 30 seconds."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send one request, retrying on network errors and 429/5xx.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1 so that a response or an exception always
                results.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``requests.Session.request``.

        Returns:
            The first response whose status is neither 429 nor 5xx, or —
            when every attempt was exhausted and the final attempt got a
            response — that last retryable response (callers decide via
            ``raise_for_status``).

        Raises:
            requests.RequestException: when the final attempt failed at
                the network level.
        """
        attempts = max(1, max_retries)
        last_resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Remember only the outcome of the most recent attempt.
                last_exc, last_resp = exc, None
                if final:
                    # No point sleeping when nothing will be retried.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    break
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            if resp.status_code != 429 and not 500 <= resp.status_code < 600:
                return resp
            last_resp, last_exc = resp, None
            if final:
                log.warning("HTTP %d on %s %s — giving up after %d attempts",
                            resp.status_code, method, url, attempts)
                break
            # Honour a numeric Retry-After; an HTTP-date value falls back
            # to the computed backoff.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        if last_exc is not None:
            raise last_exc
        # attempts >= 1, so a missing exception implies a stored response.
        return last_resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class TrialResult:
    """One row of a plot's ``top`` list, split into display fields."""

    # 1-based position of the entry in the payload's ``top`` list.
    rank: int | None = None
    # Brand label produced by ``_split_lg_top_entry`` ("LG Seeds",
    # a recognised competitor, or "Unknown").
    brand: str = ""
    # Product / variety code.
    product: str = ""
    # Trait-stack suffix; only filled for LG entries.
    traits: str = ""
    # Metric label → value; the fetcher stores a single entry keyed
    # "Yield" or "Ton/Acre".
    metrics: dict[str, float | str | None] = field(default_factory=dict)


@dataclass
class PlotReport:
    """Everything scraped for one plot: location, dates and ranked results."""

    # Corpus file stem, ``lgpr-<crop>-<year>-<plot_id>``.
    source_key: str
    # Portal URL with a ``#plot-<id>`` fragment.
    source_url: str
    crop: str           # "corn" / "soybeans" / "sorghum" / "silage"
    year: int
    plot_id: str
    lat: float | None = None
    lng: float | None = None
    plot_average: float | None = None  # whole-plot mean (BU/Ac or T/Ac)

    # State code as stored by the detail fetcher (lower-case).
    state_abbrev: str | None = None
    # Full state name used for the sidecar's ``state`` field.
    state_name: str | None = None
    city: str | None = None
    cooperator: str | None = None
    salesperson: str | None = None
    planted_date: str | None = None    # ISO date
    harvested_date: str | None = None  # ISO date

    # Ranked entries, in the order the detail payload listed them.
    results: list[TrialResult] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


def discover_plots(
    http: RateLimitedSession,
    *,
    crops: set[str],
    years: set[int],
) -> list[tuple[str, int, str]]:
    """List the plots to scrape as ``[(crop, year, plot_id), ...]``.

    Issues one POST GetPlots call per requested crop (in sorted order),
    drops entries that lack a usable ``year``/``id`` and, when ``years``
    is non-empty, entries from any other year. Crops missing from
    ``CROPS`` are logged and skipped. HTTP error statuses propagate via
    ``raise_for_status``.
    """
    found: list[tuple[str, int, str]] = []
    for crop in sorted(crops):
        spec = CROPS.get(crop)
        if spec is None:
            log.warning("unknown crop %r, skipping", crop)
            continue

        segment, seed_type, _ = spec
        endpoint = f"{BASE}/performance/{segment}/GetPlots/"
        log.info("POST %s seedType=%s", endpoint, seed_type)
        payload = json.dumps({"seedType": seed_type, "traits": [], "products": []})
        resp = http.post(
            endpoint,
            data=payload,
            headers={
                "Content-Type": "application/json",
                "Referer": f"{BASE}/performance/{segment}",
            },
        )
        resp.raise_for_status()

        # Collect this crop's matches separately so the per-crop count
        # is simply the length of the list.
        kept: list[tuple[str, int, str]] = []
        for plot in resp.json():
            try:
                plot_year, plot_id = int(plot["year"]), str(plot["id"])
            except (KeyError, TypeError, ValueError):
                continue
            if not years or plot_year in years:
                kept.append((crop, plot_year, plot_id))

        found.extend(kept)
        log.info("  %s: %d plots after year filter", crop, len(kept))
    return found


# --------------------------------------------------------------------- helpers


def source_key_for(crop: str, year: int, plot_id: str) -> str:
    """Build the corpus key ``lgpr-<crop>-<year>-<plot_id>``."""
    return "lgpr-{}-{}-{}".format(crop, year, plot_id)


def _parse_iso_date(s: str | None) -> str | None:
    """``2025-05-19T00:00:00`` → ``2025-05-19``."""
    if not s:
        return None
    s = s.strip()
    m = re.match(r"^(\d{4}-\d{2}-\d{2})", s)
    return m.group(1) if m else None


# Known seed brands that can appear as competitor entries. Used to
# recognise a brand written out at the start of a competitor's "name"
# string and strip it off, leaving the product code.
# Matching is a case-insensitive prefix test, tried in tuple order.
# NOTE: the tuple is not sorted by length; that is harmless today
# because no hint is a prefix of another. If such a pair is ever added,
# list the longer hint first.
_BRAND_HINTS = (
    "Golden Harvest", "WestBred", "AgriPro", "DEKALB", "Pioneer",
    "Channel", "Asgrow", "Becks", "Beck's", "Brevant",
    "Stine", "Renk", "Wyffels", "Croplan", "FS",
    "Local Choice", "Mycogen", "AgriGold", "Hoegemeyer", "NK",
)


def _split_lg_top_entry(entry: dict) -> tuple[str, str, str]:
    """LG Seeds 'top' entries are flat — just a name like
    ``LG60C24VT4PRO`` (LG hybrid + trait stack) or ``P1170AM``
    (Pioneer competitor) or ``212-02VT2PRIB`` (untagged competitor
    code). We use the productId/variety/crop fields to detect LG
    entries; otherwise best-effort brand inference.

    Returns ``(brand, product, traits)`` so the chunk renderer can
    display the LG vs. competitor breakdown clearly.
    """
    name = (entry.get("name") or "").strip()
    if not name:
        return ("", "", "")

    # LG entry: productId + variety are populated
    if entry.get("productId") and entry.get("variety"):
        product = str(entry["variety"]).strip()
        # The 'name' field includes the trait stack as a suffix
        # appended to the variety code (e.g. variety "LG60C24",
        # name "LG60C24VT4PRO" → traits "VT4PRO").
        traits = ""
        if name.upper().startswith(product.upper()):
            traits = name[len(product):].strip(" -")
        return ("LG Seeds", product, traits)

    # Competitor entry — try to identify the brand from the name prefix.
    upper = name.upper()
    if upper.startswith("P") and re.match(r"^P\d", upper):
        # Pioneer code pattern "P1170AM"
        return ("Pioneer", name, "")
    for brand in _BRAND_HINTS:
        if upper.startswith(brand.upper()):
            rest = name[len(brand):].strip()
            return (brand, rest or name, "")
    # No brand match — return whole name as product, brand unknown
    return ("Unknown", name, "")


# --------------------------------------------------------------------- detail


def _normalize_state(raw: Any) -> tuple[str | None, str | None]:
    """Map the payload's ``state`` value to ``(abbrev, full_name)``.

    A known 2-letter code yields its lower-case code and full name. A
    full state name (any casing) is mapped back to its code so the
    sidecar does not end up with e.g. state "IOWA" / abbrev "iowa".
    Anything unrecognised keeps the previous fallback: the lower-cased
    value as the code and its upper-cased form as the name.
    """
    token = (raw or "").strip().lower()
    if not token:
        return (None, None)
    if token in STATE_NAMES:
        return (token, STATE_NAMES[token])
    for abbrev, full_name in STATE_NAMES.items():
        if full_name.lower() == token:
            return (abbrev, full_name)
    return (token, token.upper())


def fetch_plot_detail(
    http: RateLimitedSession,
    *,
    crop: str,
    plot_id: str,
) -> PlotReport | None:
    """Fetch GetPlotData for one plot and convert it to a ``PlotReport``.

    Args:
        http: Session used for the GET request.
        crop: A key of ``CROPS``; selects the URL segment and the
            ``IsSilage`` flag.
        plot_id: Plot identifier from the discovery call.

    Returns:
        The populated report, or ``None`` when the server answers 404,
        the body is not JSON, or the payload is empty. ``year`` (and so
        ``source_key``) comes from the payload and is 0 when absent.

    Raises:
        requests.HTTPError: for any other error status.
        ValueError / TypeError: when the payload's ``year`` cannot be
            converted to ``int``.
    """
    url_seg, _seed_type, is_silage = CROPS[crop]
    detail_url = f"{BASE}/performance/{url_seg}/GetPlotData/"
    r = http.get(
        detail_url,
        params={"PlotId": plot_id, "IsSilage": "true" if is_silage else "false"},
        headers={"Referer": f"{BASE}/performance/{url_seg}"},
    )
    if r.status_code == 404:
        return None
    r.raise_for_status()
    try:
        d = r.json()
    except ValueError:
        # ValueError is the common base of json.JSONDecodeError and the
        # simplejson-backed error requests raises when simplejson is
        # installed, so this catches the failure in either setup.
        log.error("non-JSON response for plot %s (%s)", plot_id, crop)
        return None
    if not d:
        return None

    year = int(d.get("year")) if d.get("year") else 0
    state_abbrev, state_name = _normalize_state(d.get("state"))

    prod = PlotReport(
        source_key=source_key_for(crop, year, plot_id),
        source_url=f"{BASE}/performance/{url_seg}#plot-{plot_id}",
        crop=crop,
        year=year,
        plot_id=str(plot_id),
        lat=d.get("lat"),
        lng=d.get("lng"),
        plot_average=d.get("avg"),
        state_abbrev=state_abbrev,
        state_name=state_name,
        city=d.get("city"),
        cooperator=d.get("cooperator"),
        salesperson=d.get("salesperson"),
        planted_date=_parse_iso_date(d.get("plantingDate")),
        harvested_date=_parse_iso_date(d.get("harvestDate")),
    )

    # Choose the primary metric label per crop.
    metric_label = "Ton/Acre" if crop == "silage" else "Yield"

    for rank, entry in enumerate(d.get("top") or [], start=1):
        brand, product, traits = _split_lg_top_entry(entry)
        avg = entry.get("avg")
        # Every entry becomes its own row, even when two entries resolve
        # to the same product. Only a numeric average is recorded; any
        # other value leaves the metrics empty for that row.
        metrics: dict[str, float | str | None] = {}
        if isinstance(avg, (int, float)):
            metrics[metric_label] = float(avg)
        prod.results.append(TrialResult(
            rank=rank,
            brand=brand,
            product=product,
            traits=traits,
            metrics=metrics,
        ))

    return prod


# --------------------------------------------------------------------- render


def render_markdown(p: PlotReport) -> str:
    """Render a plot report as the LLM-visible markdown body.

    The output is a bullet-list header (source, location, dates, plot
    average, coordinates, URL), a horizontal rule and — when the report
    has results — a ranked table plus a one-line summary. Optional
    header bullets are omitted when their value is empty; coordinates
    are omitted when either value is falsy (``None`` or 0).

    Header and results are joined as one line list, so the blank line
    that follows the ``---`` rule survives into the output.
    """
    crop_label = {
        "corn": "Corn", "soybeans": "Soybean",
        "sorghum": "Sorghum", "silage": "Silage",
    }.get(p.crop, p.crop.title())
    where = ", ".join(x for x in (p.city, p.state_name) if x) or "?"

    head: list[str] = [
        f"# {crop_label} yield trial — {where}, {p.year}",
        "",
        "- **Source:** LG Seeds plot report (top-5 cross-vendor)",
        "- **Vendor:** AgReliant Genetics / LG Seeds",
        f"- **Crop:** {crop_label}",
    ]
    if p.state_name:
        head.append(f"- **State:** {p.state_name}" + (f" ({p.state_abbrev.upper()})" if p.state_abbrev else ""))
    if p.city:
        head.append(f"- **City:** {p.city}")
    head.append(f"- **Year:** {p.year}")
    head.append(f"- **Plot ID:** {p.plot_id}")
    if p.cooperator:
        head.append(f"- **Cooperator:** {p.cooperator}")
    # Skip the salesperson line when it would just repeat the cooperator.
    if p.salesperson and p.salesperson != p.cooperator:
        head.append(f"- **Salesperson:** {p.salesperson}")
    if p.planted_date:
        head.append(f"- **Planted:** {p.planted_date}")
    if p.harvested_date:
        head.append(f"- **Harvested:** {p.harvested_date}")
    if p.plot_average is not None:
        unit = "T/Ac" if p.crop == "silage" else "BU/Ac"
        head.append(f"- **Plot average:** {p.plot_average} {unit}")
    if p.lat and p.lng:
        head.append(f"- **Coordinates:** {p.lat:.5f}, {p.lng:.5f}")
    head.append(f"- **URL:** {p.source_url}")
    head.extend(["", "---", ""])

    sections: list[str] = []
    if p.results:
        # Metric columns in first-seen order across all rows.
        metric_keys: list[str] = []
        seen: set[str] = set()
        for r in p.results:
            for k in r.metrics:
                if k not in seen:
                    seen.add(k)
                    metric_keys.append(k)

        sections.append("## Top 5 results (by yield rank)")
        sections.append("")
        headers = ["Rank", "Brand", "Product", "Traits"] + metric_keys
        sections.append("| " + " | ".join(headers) + " |")
        sections.append("|" + "|".join(["---"] * len(headers)) + "|")
        for r in p.results:
            row = [
                str(r.rank) if r.rank is not None else "-",
                r.brand or "-",
                r.product or "-",
                r.traits or "-",
            ]
            for k in metric_keys:
                v = r.metrics.get(k)
                row.append("-" if v is None else str(v))
            sections.append("| " + " | ".join(row) + " |")
        sections.append("")

        # Compact summary line — useful BM25/embedder signal.
        primary_label = metric_keys[0] if metric_keys else "Yield"
        summary_bits = [
            f"{r.product} ({r.brand}) {r.metrics[primary_label]}"
            for r in p.results
            if isinstance(r.metrics.get(primary_label), (int, float))
        ]
        if summary_bits:
            sections.append(f"Top 5 by {primary_label}: " + ", ".join(summary_bits) + ".")
            sections.append("")

    return "\n".join(head + sections)


# --------------------------------------------------------------------- write


def write_plot(p: PlotReport, body_md: str) -> None:
    """Write a plot's sidecar JSON and markdown body into ``CORPUS_DIR``.

    The sidecar is written first and the markdown last. ``process_plot``
    treats an existing ``.md`` file as "already scraped", so writing it
    last means an interrupted run cannot leave a plot that is skipped
    forever while its sidecar is missing.

    Args:
        p: Report supplying the file stem (``source_key``) and metadata.
        body_md: Rendered markdown body.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{p.source_key}.md"
    json_path = CORPUS_DIR / f"{p.source_key}.json"

    sidecar = {
        "source": "lg_plot_reports",
        "source_key": p.source_key,
        "data_type": "trial",
        "vendor": "AgReliant Genetics",
        "brand": "LG Seeds",
        "crop": p.crop,
        "state": p.state_name,
        "state_abbrev": p.state_abbrev,
        "city": p.city,
        "year": p.year,
        "plot_id": p.plot_id,
        "lat": p.lat,
        "lng": p.lng,
        "cooperator": p.cooperator,
        "salesperson": p.salesperson,
        "planted_date": p.planted_date,
        "harvested_date": p.harvested_date,
        "plot_average": p.plot_average,
        "results": [
            {
                "rank": r.rank,
                "brand": r.brand,
                "product": r.product,
                "traits": r.traits,
                "metrics": r.metrics,
            }
            for r in p.results
        ],
        "n_results": len(p.results),
        "source_urls": [p.source_url],
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    # Markdown goes last: its presence marks the plot as complete.
    md_path.write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def process_plot(
    http: RateLimitedSession,
    *,
    crop: str,
    year: int,
    plot_id: str,
    force: bool,
) -> tuple[str, PlotReport | None]:
    """Fetch, render and write one plot unless it is already on disk.

    Args:
        http: Session used for the detail request.
        crop: Crop key from discovery.
        year: Year reported by discovery; it defines the source_key.
        plot_id: Plot identifier from discovery.
        force: Re-fetch even when the markdown file already exists.

    Returns:
        ``(status, report)`` where status is ``"skipped"`` (file exists),
        ``"failed"`` (detail fetch raised), ``"missing"`` (no detail
        data) or ``"written"``. The report is only set for ``"written"``.
    """
    sk = source_key_for(crop, year, plot_id)
    md_path = CORPUS_DIR / f"{sk}.md"
    if md_path.exists() and not force:
        return "skipped", None
    try:
        p = fetch_plot_detail(http, crop=crop, plot_id=plot_id)
    except Exception as exc:  # noqa: BLE001
        log.error("detail fetch failed for %s/%s: %s", crop, plot_id, exc)
        return "failed", None
    if p is None:
        return "missing", None

    # The skip check above looks for the key built from the discovery
    # year, so the files must be written under that same key. Otherwise
    # a detail payload with a missing or different year would be stored
    # under another name and refetched on every run.
    if not p.year:
        p.year = year
    elif p.year != year:
        log.warning(
            "year mismatch for %s/%s: discovery=%s detail=%s (keeping key %s)",
            crop, plot_id, year, p.year, sk,
        )
    p.source_key = sk

    body = render_markdown(p)
    write_plot(p, body)
    return "written", p


def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_year: int | None,
    include_2023: bool,
    workers: int = DEFAULT_WORKERS,
) -> int:
    """Discover plots and scrape them with a thread pool.

    Args:
        limit: Maximum number of discovered plots to process; ``None``
            means all. Negative values are treated as 0.
        force: Re-fetch plots whose markdown file already exists.
        only_crop: Restrict to one crop, or ``None`` for every crop.
        only_year: Restrict to one year; takes precedence over
            ``include_2023``.
        include_2023: Add 2023 to the default 2024–2025 window.
        workers: Thread count; values below 1 are raised to 1 because
            ``ThreadPoolExecutor`` rejects them.

    Returns:
        0 when no plot failed, otherwise 1.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    crops = {only_crop} if only_crop else set(CROPS.keys())
    if only_year:
        years = {only_year}
    elif include_2023:
        years = {2023, 2024, 2025}
    else:
        years = {2024, 2025}

    discovery_http = RateLimitedSession()
    targets = discover_plots(discovery_http, crops=crops, years=years)
    log.info("discovered %d total plot targets", len(targets))
    if limit is not None:
        # A negative slice bound would drop plots from the tail instead.
        targets = targets[:max(0, limit)]

    workers = max(1, workers)
    counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0}
    processed = 0
    total = len(targets)

    thread_local = threading.local()

    def _session() -> RateLimitedSession:
        # One session per worker thread, created on first use.
        s = getattr(thread_local, "session", None)
        if s is None:
            s = RateLimitedSession()
            thread_local.session = s
        return s

    def _worker(target: tuple[str, int, str]) -> tuple[str, Any]:
        crop, year, plot_id = target
        return process_plot(
            _session(), crop=crop, year=year, plot_id=plot_id, force=force,
        )

    log.info(
        "dispatching %d plots across %d workers (shared rate limiter %.2f sec/req)",
        total, workers, REQ_INTERVAL_SEC,
    )
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = {pool.submit(_worker, t): t for t in targets}
        # Results are tallied here, in the calling thread only, so the
        # counters need no lock.
        for fut in as_completed(futures):
            crop, year, plot_id = futures[fut]
            try:
                status, p = fut.result()
            except Exception as exc:  # noqa: BLE001
                log.error("worker failed for %s/%s/%s: %s", crop, year, plot_id, exc)
                status, p = "failed", None

            counts[status] = counts.get(status, 0) + 1
            processed += 1
            n = processed

            # Progress: the first few written plots, every 100th plot,
            # and every failure.
            if (p is not None and n <= 5) or n % 100 == 0 or status == "failed":
                log.info(
                    "[%d/%d] %s %s | results=%d state=%s",
                    n, total,
                    source_key_for(crop, year, plot_id), status,
                    len(p.results) if p else 0,
                    (p.state_abbrev.upper() if p and p.state_abbrev else "-"),
                )

    log.info(
        "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)",
        processed, counts["written"], counts["skipped"],
        counts["missing"], counts["failed"], total,
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Assemble the command-line parser for this scraper."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.lg_plot_reports",
        description="Scrape LG Seeds cross-vendor plot reports (yield trials).",
    )
    add = parser.add_argument

    # Volume / refresh controls.
    add(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N plots (default: all).",
    )
    add(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )

    # Scope filters.
    add(
        "--crop",
        default=None,
        choices=tuple(CROPS),
        help="Limit to one crop.",
    )
    add(
        "--year",
        type=int,
        default=None,
        choices=(2023, 2024, 2025),
        help="Limit to one year.",
    )
    add(
        "--include-2023",
        action="store_true",
        help="Include 2023 plot reports (default: 2024-2025 only).",
    )

    # Runtime knobs.
    workers_help = (
        f"Concurrent worker threads (default {DEFAULT_WORKERS}, "
        f"all share a global {REQ_INTERVAL_SEC}-sec rate limiter)."
    )
    add("--workers", type=int, default=DEFAULT_WORKERS, help=workers_help)
    add("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the scrape.

    ``argv`` defaults to the process arguments when ``None``. The return
    value is the exit code produced by ``run``.
    """
    opts = _build_argparser().parse_args(argv)

    # Log to stderr at the requested level.
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )

    exit_code = run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_year=opts.year,
        include_2023=opts.include_2023,
        workers=opts.workers,
    )
    return exit_code


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())