"""LG Seeds plot-report scraper — cross-vendor yield trials. LG Seeds publishes its plot data via a JSON XHR API behind ``lgseeds.com/performance/{crop}``. Each plot is a head-to-head yield trial at a specific state/year/site, with the **top 5 performing hybrids** shown — sometimes all LG products, sometimes a mix of LG + competitors (e.g. "212-02VT2PRIB" from another brand shows up as a competitor entry with no productId). This is the SECOND ``data_type: "trial"`` source in the corpus after ``gh_plot_reports`` — same shape (per-site cross-vendor yield), different vendor (AgReliant Genetics / LG Seeds). Sources: POST /performance/{crop}/GetPlots/ body={seedType,traits:[],products:[]} → list of {id, lat, lng, year, avg, ...} (sparse; state=null until detail) GET /performance/{crop}/GetPlotData/?PlotId=&IsSilage= → {id, lat, lng, avg, year, city, state, salesperson, cooperator, plantingDate, harvestDate, top: [{name, avg, productId, crop, variety}]} Crops supported by LG's portal: corn, soybeans, sorghum, silage. Plot counts (across all years 2023-2025): Crop 2023 2024 2025 Total Corn 553 487 476 1,516 Soybeans 198 134 153 485 Sorghum 9 6 4 19 Silage 29 26 24 79 Initial scrape: 2024+2025 (matching gh_plot_reports baseline). 2023 is older but still informative; defer to ``--include-2023``. Output: corpus/lg_plot_reports/.md LLM-visible body corpus/lg_plot_reports/.json sidecar metadata source_key convention: ``lgpr---`` e.g. ``lgpr-corn-2025-146257``. State isn't in the source_key because the discovery endpoint returns ``state: null`` — it's only filled in by the detail call. The full state name lives in the sidecar so filters work. 
CLI:
    python -m scrape.sources.lg_plot_reports --limit 5
    python -m scrape.sources.lg_plot_reports --crop corn --year 2025
    python -m scrape.sources.lg_plot_reports --include-2023 --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests

# Recorded in every sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Note: lgseeds.com 301-redirects www→apex for the POST endpoints,
# which curl follows but turns the POST into a GET. Hit the apex
# host directly.
BASE = "https://lgseeds.com"

# The corpus root defaults to <repo>/corpus; the CORPUS_ROOT environment
# variable overrides it when set to a non-empty value.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "lg_plot_reports"

# Default minimum spacing, in seconds, between two HTTP requests.
REQ_INTERVAL_SEC = 0.25
# Default number of worker threads (overridable with --workers).
DEFAULT_WORKERS = 4

log = logging.getLogger("scrape.lg_plot_reports")

# Crop → (URL segment, seedType payload, IsSilage flag for detail call)
CROPS: dict[str, tuple[str, str, bool]] = {
    "corn": ("corn", "Corn", False),
    "soybeans": ("soybeans", "Soybeans", False),
    "sorghum": ("sorghum", "Sorghum", False),
    "silage": ("silage", "Silage", True),
}

# State abbrev → full name. Used for sidecar normalization when
# detail returns a 2-letter state code.
STATE_NAMES = {
    "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas",
    "ca": "California", "co": "Colorado", "ct": "Connecticut",
    "de": "Delaware", "fl": "Florida", "ga": "Georgia", "hi": "Hawaii",
    "id": "Idaho", "il": "Illinois", "in": "Indiana", "ia": "Iowa",
    "ks": "Kansas", "ky": "Kentucky", "la": "Louisiana", "me": "Maine",
    "md": "Maryland", "ma": "Massachusetts", "mi": "Michigan",
    "mn": "Minnesota", "ms": "Mississippi", "mo": "Missouri",
    "mt": "Montana", "ne": "Nebraska", "nv": "Nevada",
    "nh": "New Hampshire", "nj": "New Jersey", "nm": "New Mexico",
    "ny": "New York", "nc": "North Carolina", "nd": "North Dakota",
    "oh": "Ohio", "ok": "Oklahoma", "or": "Oregon", "pa": "Pennsylvania",
    "ri": "Rhode Island", "sc": "South Carolina", "sd": "South Dakota",
    "tn": "Tennessee", "tx": "Texas", "ut": "Utah", "vt": "Vermont",
    "va": "Virginia", "wa": "Washington", "wv": "West Virginia",
    "wi": "Wisconsin", "wy": "Wyoming",
}


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Thread-safe rate-limited requests.Session wrapper.

    Same primitive as gh_plot_reports — single global cadence across
    every worker thread + retries on 429/5xx with backoff.

    The pacing state (``_lock``, ``_last_global``, ``_global_interval``)
    is stored on the class, so all instances in the process share one
    request cadence while each instance owns its own ``requests.Session``.
    """

    _lock = threading.Lock()
    _last_global: float = 0.0
    _global_interval: float = REQ_INTERVAL_SEC

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create a session with the scraper's default headers.

        Args:
            interval: Requested minimum spacing between requests, in
                seconds. The shared interval is only ever raised, never
                lowered, by constructing a new instance.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        # XHR + JSON headers — required because the LG endpoints
        # return the whole HTML page if not flagged as XHR.
        self.s.headers["Accept"] = "application/json, text/plain, */*"
        self.s.headers["X-Requested-With"] = "XMLHttpRequest"
        with RateLimitedSession._lock:
            if interval > RateLimitedSession._global_interval:
                RateLimitedSession._global_interval = interval

    def _wait(self) -> None:
        """Block until the shared interval since the last request elapsed.

        The sleep happens while the class-level lock is held, so
        concurrent callers queue up behind it and are released one
        interval apart.
        """
        with RateLimitedSession._lock:
            delta = time.monotonic() - RateLimitedSession._last_global
            if delta < RateLimitedSession._global_interval:
                time.sleep(RateLimitedSession._global_interval - delta)
            RateLimitedSession._last_global = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send one rate-limited request, retrying transient failures.

        Network errors (``requests.RequestException``) and HTTP 429/5xx
        responses are retried with exponential backoff plus jitter
        (capped at 30 s); a numeric ``Retry-After`` header takes
        precedence over the computed backoff.

        Args:
            method: HTTP verb.
            url: Absolute URL.
            max_retries: Total number of attempts. Values below 1 are
                treated as a single attempt.
            timeout: Per-attempt timeout in seconds.
            **kw: Passed through to ``requests.Session.request``.

        Returns:
            The first non-retryable response, or — when every attempt
            was retryable and the final one produced a response — that
            final 429/5xx response (callers use ``raise_for_status``).

        Raises:
            requests.RequestException: If the final attempt failed at
                the network level.
        """
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            final_attempt = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if final_attempt:
                    # No point sleeping when there is nothing left to retry.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so any earlier network error is stale:
            # the outcome of the most recent attempt is what gets reported.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final_attempt:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = (
                    float(ra)
                    if (ra and ra.isdigit())
                    else min(30.0, (2 ** attempt) + random.random())
                )
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class TrialResult:
    """One ranked row of a plot's "top" list."""

    rank: int | None = None
    brand: str = ""
    product: str = ""
    traits: str = ""
    metrics: dict[str, float | str | None] = field(default_factory=dict)


@dataclass
class PlotReport:
    """Everything scraped for a single plot (one detail call)."""

    source_key: str
    source_url: str
    crop: str  # "corn" / "soybeans" / "sorghum" / "silage"
    year: int
    plot_id: str
    lat: float | None = None
    lng: float | None = None
    plot_average: float | None = None  # whole-plot mean (BU/Ac or T/Ac)
    state_abbrev: str | None = None
    state_name: str | None = None
    city: str | None = None
    cooperator: str | None = None
    salesperson: str | None = None
    planted_date: str | None = None  # ISO date
    harvested_date: str | None = None  # ISO date
    results: list[TrialResult] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


def discover_plots(
    http: RateLimitedSession,
    *,
    crops: set[str],
    years: set[int],
) -> list[tuple[str, int, str]]:
    """Hit POST GetPlots per crop, filter by target years, return
    ``[(crop, year, plot_id), ...]``.

    Args:
        http: Session used for the POST calls.
        crops: Crop keys to query; keys missing from ``CROPS`` are
            logged and skipped.
        years: Years to keep. An empty set disables the year filter.

    Returns:
        Targets in crop-sorted order, then in the order the endpoint
        listed them. Entries without a usable ``year``/``id`` are dropped.

    Raises:
        requests.HTTPError: If a GetPlots call ends in an error status.
    """
    out: list[tuple[str, int, str]] = []
    for crop in sorted(crops):
        if crop not in CROPS:
            log.warning("unknown crop %r, skipping", crop)
            continue
        url_seg, seed_type, _is_silage = CROPS[crop]
        url = f"{BASE}/performance/{url_seg}/GetPlots/"
        log.info("POST %s seedType=%s", url, seed_type)
        r = http.post(
            url,
            data=json.dumps({"seedType": seed_type, "traits": [], "products": []}),
            headers={
                "Content-Type": "application/json",
                "Referer": f"{BASE}/performance/{url_seg}",
            },
        )
        r.raise_for_status()
        plots = r.json()
        if not isinstance(plots, list):
            # e.g. a JSON ``null`` or an error object instead of the list.
            log.warning("unexpected GetPlots payload for %s (%s), skipping",
                        crop, type(plots).__name__)
            continue
        kept = 0
        for p in plots:
            try:
                year = int(p["year"])
                pid = str(p["id"])
            except (KeyError, TypeError, ValueError):
                continue
            if years and year not in years:
                continue
            out.append((crop, year, pid))
            kept += 1
        log.info(" %s: %d plots after year filter", crop, kept)
    return out


# --------------------------------------------------------------------- helpers


def source_key_for(crop: str, year: int, plot_id: str) -> str:
    """Build the corpus key, e.g. ``lgpr-corn-2025-146257``."""
    return f"lgpr-{crop}-{year}-{plot_id}"


def _parse_iso_date(s: str | None) -> str | None:
    """``2025-05-19T00:00:00`` → ``2025-05-19``.

    Returns ``None`` for empty, non-string, or non-matching input.
    """
    if not s or not isinstance(s, str):
        return None
    m = re.match(r"^(\d{4}-\d{2}-\d{2})", s.strip())
    return m.group(1) if m else None


# Known seed brands that can appear as competitor entries.
# Used to split a single "name" string like "LG60C24VT4PRO" or
# "P1170AM" or "212-02VT2PRIB" into brand/product/traits.
# Matching is by case-insensitive prefix. The tuple itself is in no
# particular order; ``_BRAND_HINTS_LONGEST_FIRST`` is what gets scanned,
# so a longer brand always wins over a shorter one sharing its prefix.
_BRAND_HINTS = (
    "Golden Harvest", "WestBred", "AgriPro", "DEKALB", "Pioneer",
    "Channel", "Asgrow", "Becks", "Beck's", "Brevant", "Stine", "Renk",
    "Wyffels", "Croplan", "FS", "Local Choice", "Mycogen", "AgriGold",
    "Hoegemeyer", "NK",
)
_BRAND_HINTS_LONGEST_FIRST = tuple(sorted(_BRAND_HINTS, key=len, reverse=True))


def _split_lg_top_entry(entry: dict) -> tuple[str, str, str]:
    """Split one 'top' entry into ``(brand, product, traits)``.

    LG Seeds 'top' entries are flat — just a name like ``LG60C24VT4PRO``
    (LG hybrid + trait stack) or ``P1170AM`` (Pioneer competitor) or
    ``212-02VT2PRIB`` (untagged competitor code). We use the
    productId/variety fields to detect LG entries; otherwise
    best-effort brand inference.

    Returns ``(brand, product, traits)`` so the chunk renderer can
    display the LG vs. competitor breakdown clearly. An entry without a
    name yields three empty strings; an unrecognised competitor gets the
    brand ``"Unknown"`` with the whole name as the product.
    """
    name = str(entry.get("name") or "").strip()
    if not name:
        return ("", "", "")

    # LG entry: productId + variety are populated
    if entry.get("productId") and entry.get("variety"):
        product = str(entry["variety"]).strip()
        # The 'name' field includes the trait stack as a suffix
        # appended to the variety code (e.g. variety "LG60C24",
        # name "LG60C24VT4PRO" → traits "VT4PRO").
        traits = ""
        if name.upper().startswith(product.upper()):
            traits = name[len(product):].strip(" -")
        return ("LG Seeds", product, traits)

    # Competitor entry — try to identify the brand from the name prefix.
    upper = name.upper()
    if re.match(r"^P\d", upper):
        # Pioneer code pattern "P1170AM"
        return ("Pioneer", name, "")
    for brand in _BRAND_HINTS_LONGEST_FIRST:
        if upper.startswith(brand.upper()):
            rest = name[len(brand):].strip()
            return (brand, rest or name, "")
    # No brand match — return whole name as product, brand unknown
    return ("Unknown", name, "")


# --------------------------------------------------------------------- detail


def fetch_plot_detail(
    http: RateLimitedSession,
    *,
    crop: str,
    plot_id: str,
    year: int | None = None,
) -> PlotReport | None:
    """Fetch and parse the GetPlotData payload for one plot.

    Args:
        http: Session used for the GET call.
        crop: Key into ``CROPS``.
        plot_id: Plot identifier from discovery.
        year: Optional fallback year (normally the one discovery
            reported). Used only when the detail payload has no usable
            ``year``, so the resulting ``source_key`` matches the key
            the caller's skip check looks for.

    Returns:
        A populated ``PlotReport``, or ``None`` when the plot is gone
        (HTTP 404), the body is not JSON, or the payload is empty.

    Raises:
        requests.HTTPError: For any other error status.
    """
    url_seg, _seed_type, is_silage = CROPS[crop]
    detail_url = f"{BASE}/performance/{url_seg}/GetPlotData/"
    r = http.get(
        detail_url,
        params={"PlotId": plot_id, "IsSilage": "true" if is_silage else "false"},
        headers={"Referer": f"{BASE}/performance/{url_seg}"},
    )
    if r.status_code == 404:
        return None
    r.raise_for_status()
    try:
        d = r.json()
    except ValueError:
        # ValueError is the common base of every JSON decode error that
        # ``Response.json()`` can raise (stdlib json, simplejson, and
        # requests' own JSONDecodeError).
        log.error("non-JSON response for plot %s (%s)", plot_id, crop)
        return None
    if not d:
        return None

    try:
        resolved_year = int(d.get("year") or 0)
    except (TypeError, ValueError):
        resolved_year = 0
    if not resolved_year and year:
        resolved_year = year

    state_abbrev = (d.get("state") or "").strip().lower() or None
    state_name = (
        STATE_NAMES.get(state_abbrev, state_abbrev.upper()) if state_abbrev else None
    )

    prod = PlotReport(
        source_key=source_key_for(crop, resolved_year, plot_id),
        source_url=f"{BASE}/performance/{url_seg}#plot-{plot_id}",
        crop=crop,
        year=resolved_year,
        plot_id=str(plot_id),
        lat=d.get("lat"),
        lng=d.get("lng"),
        plot_average=d.get("avg"),
        state_abbrev=state_abbrev,
        state_name=state_name,
        city=d.get("city"),
        cooperator=d.get("cooperator"),
        salesperson=d.get("salesperson"),
        planted_date=_parse_iso_date(d.get("plantingDate")),
        harvested_date=_parse_iso_date(d.get("harvestDate")),
    )

    # Choose the primary metric label per crop.
    metric_label = "Ton/Acre" if crop == "silage" else "Yield"

    # Every entry becomes its own row, ranked by its position in the
    # payload; entries whose "avg" is not numeric get an empty metrics dict.
    for i, entry in enumerate(d.get("top") or [], start=1):
        brand, product, traits = _split_lg_top_entry(entry)
        avg = entry.get("avg")
        metrics: dict[str, float | str | None] = {}
        if isinstance(avg, (int, float)):
            metrics[metric_label] = float(avg)
        prod.results.append(TrialResult(
            rank=i,
            brand=brand,
            product=product,
            traits=traits,
            metrics=metrics,
        ))

    return prod


# --------------------------------------------------------------------- render


def render_markdown(p: PlotReport) -> str:
    """Render the LLM-visible markdown body for one plot.

    The output is a bullet-list header, a ``---`` rule, and — when the
    plot has results — a results table followed by a one-line summary.
    """
    crop_label = {
        "corn": "Corn",
        "soybeans": "Soybean",
        "sorghum": "Sorghum",
        "silage": "Silage",
    }.get(p.crop, p.crop.title())
    where = ", ".join(x for x in (p.city, p.state_name) if x) or "?"

    head: list[str] = [
        f"# {crop_label} yield trial — {where}, {p.year}",
        "",
        "- **Source:** LG Seeds plot report (top-5 cross-vendor)",
        "- **Vendor:** AgReliant Genetics / LG Seeds",
        f"- **Crop:** {crop_label}",
    ]
    if p.state_name:
        head.append(
            f"- **State:** {p.state_name}"
            + (f" ({p.state_abbrev.upper()})" if p.state_abbrev else "")
        )
    if p.city:
        head.append(f"- **City:** {p.city}")
    head.append(f"- **Year:** {p.year}")
    head.append(f"- **Plot ID:** {p.plot_id}")
    if p.cooperator:
        head.append(f"- **Cooperator:** {p.cooperator}")
    if p.salesperson and p.salesperson != p.cooperator:
        head.append(f"- **Salesperson:** {p.salesperson}")
    if p.planted_date:
        head.append(f"- **Planted:** {p.planted_date}")
    if p.harvested_date:
        head.append(f"- **Harvested:** {p.harvested_date}")
    if p.plot_average is not None:
        unit = "T/Ac" if p.crop == "silage" else "BU/Ac"
        head.append(f"- **Plot average:** {p.plot_average} {unit}")
    # Truthiness test: a missing (None) or zero coordinate hides the line.
    if p.lat and p.lng:
        head.append(f"- **Coordinates:** {p.lat:.5f}, {p.lng:.5f}")
    head.append(f"- **URL:** {p.source_url}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    if p.results:
        # Metric columns in first-seen order across all rows.
        metric_keys: list[str] = []
        seen: set[str] = set()
        for r in p.results:
            for k in r.metrics:
                if k not in seen:
                    seen.add(k)
                    metric_keys.append(k)

        sections.append("## Top 5 results (by yield rank)")
        sections.append("")
        headers = ["Rank", "Brand", "Product", "Traits"] + metric_keys
        sections.append("| " + " | ".join(headers) + " |")
        sections.append("|" + "|".join(["---"] * len(headers)) + "|")
        for r in p.results:
            row = [
                str(r.rank) if r.rank is not None else "-",
                r.brand or "-",
                r.product or "-",
                r.traits or "-",
            ]
            for k in metric_keys:
                v = r.metrics.get(k)
                row.append("-" if v is None else str(v))
            sections.append("| " + " | ".join(row) + " |")
        sections.append("")

        # Compact summary line — useful BM25/embedder signal.
        primary_label = metric_keys[0] if metric_keys else "Yield"
        summary_bits = []
        for r in p.results:
            v = r.metrics.get(primary_label)
            if isinstance(v, (int, float)):
                summary_bits.append(f"{r.product} ({r.brand}) {v}")
        if summary_bits:
            sections.append(f"Top 5 by {primary_label}: " + ", ".join(summary_bits) + ".")
            sections.append("")

    return "\n".join(head) + "\n".join(sections)


# --------------------------------------------------------------------- write


def write_plot(p: PlotReport, body_md: str) -> None:
    """Write the JSON sidecar and markdown body for one plot.

    Files land in ``CORPUS_DIR`` as ``{source_key}.json`` and
    ``{source_key}.md``. The sidecar is written first: the pipeline
    treats an existing ``.md`` as "already scraped", so the markdown
    must be the last file to appear or an interrupted run could leave a
    plot permanently without its sidecar.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{p.source_key}.md"
    json_path = CORPUS_DIR / f"{p.source_key}.json"

    sidecar = {
        "source": "lg_plot_reports",
        "source_key": p.source_key,
        "data_type": "trial",
        "vendor": "AgReliant Genetics",
        "brand": "LG Seeds",
        "crop": p.crop,
        "state": p.state_name,
        "state_abbrev": p.state_abbrev,
        "city": p.city,
        "year": p.year,
        "plot_id": p.plot_id,
        "lat": p.lat,
        "lng": p.lng,
        "cooperator": p.cooperator,
        "salesperson": p.salesperson,
        "planted_date": p.planted_date,
        "harvested_date": p.harvested_date,
        "plot_average": p.plot_average,
        "results": [
            {
                "rank": r.rank,
                "brand": r.brand,
                "product": r.product,
                "traits": r.traits,
                "metrics": r.metrics,
            }
            for r in p.results
        ],
        "n_results": len(p.results),
        "source_urls": [p.source_url],
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    md_path.write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def process_plot(
    http: RateLimitedSession,
    *,
    crop: str,
    year: int,
    plot_id: str,
    force: bool,
) -> tuple[str, PlotReport | None]:
    """Fetch, render and write one plot.

    Returns:
        ``(status, report)`` where status is one of ``"skipped"`` (the
        markdown already exists and ``force`` is off), ``"failed"`` (the
        detail fetch raised), ``"missing"`` (no usable detail payload)
        or ``"written"``. The report is only set for ``"written"``.
    """
    sk = source_key_for(crop, year, plot_id)
    md_path = CORPUS_DIR / f"{sk}.md"
    if md_path.exists() and not force:
        return "skipped", None
    try:
        # The discovery year is passed as a fallback so a detail payload
        # without a year still produces the key checked above.
        p = fetch_plot_detail(http, crop=crop, plot_id=plot_id, year=year)
    except Exception as exc:  # noqa: BLE001
        log.error("detail fetch failed for %s/%s: %s", crop, plot_id, exc)
        return "failed", None
    if p is None:
        return "missing", None
    if p.source_key != sk:
        log.warning(
            "plot %s/%s: detail year %s differs from discovery year %s; writing %s",
            crop, plot_id, p.year, year, p.source_key,
        )
    body = render_markdown(p)
    write_plot(p, body)
    return "written", p


def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_year: int | None,
    include_2023: bool,
    workers: int = DEFAULT_WORKERS,
) -> int:
    """Discover plots and process them on a thread pool.

    Args:
        limit: Process at most this many discovered plots (``None`` = all).
        force: Re-fetch plots whose markdown already exists.
        only_crop: Restrict to one crop key, or ``None`` for every crop.
        only_year: Restrict to one year; takes precedence over
            ``include_2023``.
        include_2023: Add 2023 to the default 2024 + 2025 year set.
        workers: Worker thread count; values below 1 are treated as 1.

    Returns:
        Process exit code: 0 when no plot failed, 1 otherwise.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    crops = {only_crop} if only_crop else set(CROPS.keys())
    if only_year:
        years = {only_year}
    elif include_2023:
        years = {2023, 2024, 2025}
    else:
        years = {2024, 2025}

    discovery_http = RateLimitedSession()
    targets = discover_plots(discovery_http, crops=crops, years=years)
    log.info("discovered %d total plot targets", len(targets))
    if limit is not None:
        targets = targets[:limit]

    counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0}
    counts_lock = threading.Lock()
    processed_counter = {"n": 0}
    total = len(targets)

    # One session per worker thread; pacing is still shared through the
    # class-level state of RateLimitedSession.
    thread_local = threading.local()

    def _session() -> RateLimitedSession:
        s = getattr(thread_local, "session", None)
        if s is None:
            s = RateLimitedSession()
            thread_local.session = s
        return s

    def _worker(target: tuple[str, int, str]) -> tuple[str, Any]:
        crop, year, plot_id = target
        return process_plot(
            _session(), crop=crop, year=year, plot_id=plot_id, force=force,
        )

    # ThreadPoolExecutor rejects max_workers <= 0 with ValueError.
    pool_size = max(1, workers)
    log.info(
        "dispatching %d plots across %d workers (shared rate limiter %.2f sec/req)",
        total, pool_size, REQ_INTERVAL_SEC,
    )

    with ThreadPoolExecutor(max_workers=pool_size) as pool:
        futures = {pool.submit(_worker, t): t for t in targets}
        for fut in as_completed(futures):
            crop, year, plot_id = futures[fut]
            try:
                status, p = fut.result()
            except Exception as exc:  # noqa: BLE001
                log.error("worker failed for %s/%s/%s: %s", crop, year, plot_id, exc)
                status, p = "failed", None
            with counts_lock:
                counts[status] = counts.get(status, 0) + 1
                processed_counter["n"] += 1
                n = processed_counter["n"]
            # Log the first few written plots, every 100th, and every failure.
            if (p is not None and n <= 5) or n % 100 == 0 or status == "failed":
                log.info(
                    "[%d/%d] %s %s | results=%d state=%s",
                    n, total, source_key_for(crop, year, plot_id), status,
                    len(p.results) if p else 0,
                    (p.state_abbrev.upper() if p and p.state_abbrev else "-"),
                )

    log.info(
        "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)",
        processed_counter["n"], counts["written"], counts["skipped"],
        counts["missing"], counts["failed"], total,
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.lg_plot_reports",
        description="Scrape LG Seeds cross-vendor plot reports (yield trials).",
    )
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N plots (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-fetch even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=tuple(CROPS.keys()),
                   help="Limit to one crop.")
    p.add_argument("--year", type=int, default=None, choices=(2023, 2024, 2025),
                   help="Limit to one year.")
    p.add_argument("--include-2023", action="store_true",
                   help="Include 2023 plot reports (default: 2024-2025 only).")
    p.add_argument("--workers", type=int, default=DEFAULT_WORKERS,
                   help=f"Concurrent worker threads (default {DEFAULT_WORKERS}, "
                        f"all share a global {REQ_INTERVAL_SEC}-sec rate limiter).")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, configure logging, call ``run``.

    Returns the exit code produced by ``run``.
    """
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_year=args.year,
        include_2023=args.include_2023,
        workers=args.workers,
    )


if __name__ == "__main__":
    sys.exit(main())