"""AgriGold plot-report scraper — cross-vendor yield trials. AgriGold publishes detailed cross-vendor yield trials at ``agrigold.com/{crop}/performance/{crop}-yield-results``. Each ``plot-id`` is a single head-to-head trial site comparing AgriGold hybrids against competitors (DEKALB, Pioneer, Dairyland, etc.) on yield, moisture, test weight, and an "Adj Yield" check-adjusted yield. This is the THIRD ``data_type: "trial"`` source in the corpus (after ``gh_plot_reports`` and ``lg_plot_reports``) — same shape (per-site cross-vendor), different vendor (AgReliant Genetics / AgriGold), and the **most metadata-rich** of the three. AgriGold's detail page includes tillage, previous crop, fungicide, herbicide, insecticide, irrigation, soil type — fields the others don't publish. Listing URLs (one per crop, year-filterable): /corn/performance/corn-yield-results?harvestYear={year} /soybeans/performance/soybean-yield-results?harvestYear={year} Detail URL: //performance/-yield-results/{plot_id} (For soybeans the URL is ``/soybeans/performance/soybean-yield-results/{id}`` - note the singular "soybean" in the path.) Plot counts by harvest year (corn): 2025: 408, 2024: 441, 2023: 583, plus 2022 + 2026 (likely sparse) Initial scrape: 2024+2025 (matching gh_plot_reports baseline). Output: corpus/agrigold_plot_reports/.md LLM-visible body corpus/agrigold_plot_reports/.json sidecar metadata source_key convention: ``agpr---`` e.g. ``agpr-corn-2025-145926``. CLI: python -m scrape.sources.agrigold_plot_reports --limit 5 python -m scrape.sources.agrigold_plot_reports --crop corn --year 2025 python -m scrape.sources.agrigold_plot_reports --include-2023 --force """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import threading import time from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.agrigold.com" REPO_ROOT = Path(__file__).resolve().parents[2] CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "agrigold_plot_reports" REQ_INTERVAL_SEC = 0.5 # AgriGold pages are HEAVY (~50KB detail, 1MB+ listing) DEFAULT_WORKERS = 4 log = logging.getLogger("scrape.agrigold_plot_reports") # Crop → (URL segment, listing URL slug, detail URL slug) # Corn: /corn/performance/corn-yield-results[/{id}] # Soybeans: /soybeans/performance/soybean-yield-results[/{id}] (singular "soybean") CROPS: dict[str, tuple[str, str]] = { "corn": ("corn", "corn-yield-results"), "soybeans": ("soybeans", "soybean-yield-results"), } # --------------------------------------------------------------------- HTTP class RateLimitedSession: """Thread-safe rate-limited requests.Session wrapper. Mirrors the primitive in gh_plot_reports.py. """ _lock = threading.Lock() _last_global: float = 0.0 _global_interval: float = REQ_INTERVAL_SEC def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: self.s = requests.Session() self.s.headers["User-Agent"] = USER_AGENT with RateLimitedSession._lock: if interval > RateLimitedSession._global_interval: RateLimitedSession._global_interval = interval def _wait(self) -> None: with RateLimitedSession._lock: delta = time.monotonic() - RateLimitedSession._last_global if delta < RateLimitedSession._global_interval: time.sleep(RateLimitedSession._global_interval - delta) RateLimitedSession._last_global = time.monotonic() def request( self, method: str, url: str, *, max_retries: int = 4, timeout: float = 30.0, **kw: Any, ) -> requests.Response: last_exc: Exception | None = None for attempt in range(max_retries): self._wait() try: resp = self.s.request(method, url, timeout=timeout, **kw) except requests.RequestException as exc: last_exc = exc backoff = min(30.0, (2 ** attempt) + random.random()) log.warning("network error on %s %s: %s — retry in %.1fs", method, url, exc, backoff) time.sleep(backoff) continue if resp.status_code == 429 or 500 <= resp.status_code < 600: ra = resp.headers.get("Retry-After") backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) log.warning("HTTP %d on %s %s — retry in %.1fs", resp.status_code, method, url, backoff) time.sleep(backoff) continue return resp if last_exc: raise last_exc return resp # type: ignore[return-value] def get(self, url: str, **kw: Any) -> requests.Response: return self.request("GET", url, **kw) # --------------------------------------------------------------------- model @dataclass class TrialResult: rank: int | None = None brand: str = "" product: str = "" traits: str = "" # Columns: Ck, H20 (moisture %), Test Wt., Yield, Adj Yield metrics: dict[str, float | str | None] = field(default_factory=dict) @dataclass class PlotReport: source_key: str source_url: str crop: str # "corn" / "soybeans" year: int plot_id: str city: str | None = None state_abbrev: str | None = None county: str | None = None cooperator: str | None = None plot_average: float | None = None # whole-plot mean yield # Plot management details (AgriGold publishes more of these # than GH or LG — useful for agronomic comparison queries). planted_date: str | None = None # ISO date harvested_date: str | None = None # ISO date population: int | None = None fungicide: str | None = None soil_type: str | None = None tillage: str | None = None herbicide: str | None = None insecticide: str | None = None row_width_in: str | None = None # kept as string ("30.0\"") num_rows: int | None = None previous_crop: str | None = None irrigation: str | None = None results: list[TrialResult] = field(default_factory=list) # --------------------------------------------------------------------- discovery def discover_plots( http: RateLimitedSession, *, crops: set[str], years: set[int], ) -> list[tuple[str, int, str]]: """Walk the listing pages per (crop, year). Returns ``[(crop, year, plot_id), ...]``.""" out: list[tuple[str, int, str]] = [] for crop in sorted(crops): if crop not in CROPS: log.warning("unknown crop %r, skipping", crop) continue url_seg, listing_slug = CROPS[crop] for year in sorted(years): url = f"{BASE}/{url_seg}/performance/{listing_slug}?harvestYear={year}" log.info("GET %s", url) r = http.get(url) r.raise_for_status() # data-plotid="123456" appears 5x per plot. Dedupe. ids = set(re.findall(r'data-plotid="(\d+)"', r.text)) log.info(" %s %d: %d unique plot ids", crop, year, len(ids)) for pid in sorted(ids): out.append((crop, year, pid)) return out # --------------------------------------------------------------------- helpers def source_key_for(crop: str, year: int, plot_id: str) -> str: return f"agpr-{crop}-{year}-{plot_id}" # State abbrev (city, ST format) regex — e.g. "Erie, IL", "Cottage Hill , KS" _CITY_STATE_RE = re.compile(r"^(.*?)\s*,\s*([A-Z]{2})\s*$") def _parse_int(s: str | None) -> int | None: if not s: return None s = re.sub(r"[,$]", "", str(s).strip()) try: return int(s) except ValueError: return None def _parse_float(s: str | None) -> float | None: if not s: return None s = re.sub(r"[,$]", "", str(s).strip()) try: return float(s) except ValueError: return None def _parse_date_slash(s: str | None) -> str | None: """``05/10/25`` → ``2025-05-10``. 2-digit year → 20xx.""" if not s: return None s = s.strip() m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", s) if not m: return None mo, dy, yr = m.group(1), m.group(2), m.group(3) if len(yr) == 2: yr = "20" + yr try: return f"{int(yr):04d}-{int(mo):02d}-{int(dy):02d}" except ValueError: return None def _detail_pairs(soup: BeautifulSoup, container_class: str) -> dict[str, str]: """Pull ``.detail-item`` label/value pairs from a container.""" out: dict[str, str] = {} container = soup.find("div", class_=container_class) if container is None: return out for item in container.find_all("div", class_="detail-item"): label_el = item.find("div", class_="label") value_el = item.find("div", class_="value") if label_el is None or value_el is None: continue label = label_el.get_text(" ", strip=True) value = value_el.get_text(" ", strip=True) if label and value: out[label] = value return out # --------------------------------------------------------------------- detail def fetch_plot_detail( http: RateLimitedSession, *, crop: str, year: int, plot_id: str, ) -> PlotReport | None: url_seg, listing_slug = CROPS[crop] detail_url = f"{BASE}/{url_seg}/performance/{listing_slug}/{plot_id}" r = http.get(detail_url) if r.status_code == 404: return None r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") prod = PlotReport( source_key=source_key_for(crop, year, plot_id), source_url=detail_url, crop=crop, year=year, plot_id=str(plot_id), ) # Header block: plot-average + city/state + county/cooperator header = soup.find("div", class_="details-header") if header is not None: avg_el = header.find("div", class_="plot-average") if avg_el is not None: # Strip the inner "Plot Average" label and parse the leading number label_inside = avg_el.find("div", class_="label") if label_inside is not None: label_inside.extract() avg_text = avg_el.get_text(" ", strip=True) prod.plot_average = _parse_float(avg_text) cs_el = header.find("div", class_="city-state") if cs_el is not None: cs = cs_el.get_text(" ", strip=True) m = _CITY_STATE_RE.match(cs) if m: prod.city = m.group(1).strip() prod.state_abbrev = m.group(2).strip().lower() else: prod.city = cs.strip() county_el = header.find("div", class_="county") if county_el is not None: prod.county = county_el.get_text(" ", strip=True) coop_el = header.find("div", class_="coorperator") # site's typo, not ours if coop_el is not None: prod.cooperator = coop_el.get_text(" ", strip=True) # Plot Details: two columns of (label, value) pairs. details = _detail_pairs(soup, "plot-details") prod.planted_date = _parse_date_slash(details.get("Planting Date")) prod.harvested_date = _parse_date_slash(details.get("Harvest Date")) prod.population = _parse_int(details.get("Planting Population")) prod.fungicide = details.get("Fungicide") or None prod.soil_type = details.get("Soil Type") or None prod.tillage = details.get("Tillage") or None prod.herbicide = details.get("Herbicide") or None prod.insecticide = details.get("Insecticide") or None prod.row_width_in = details.get("Row Width") or None prod.num_rows = _parse_int(details.get("Number Of Rows")) prod.previous_crop = details.get("Previous Crop") or None prod.irrigation = details.get("Irrigation") or None # Comparison table table = soup.find("table", class_="plot-rows") if table is None: return prod # Header cells — already known to be Rank, Brand, Product, Trait, # Ck, H20, Test Wt., Yield, Adj Yield. Read defensively from DOM # in case it shifts. header_cells = [] thead = table.find("thead") if thead is not None: for th in thead.find_all("th"): wrap = th.find("div", class_="th-wrapper") txt = (wrap.get_text(" ", strip=True) if wrap else th.get_text(" ", strip=True)).strip() header_cells.append(txt) # Map header position → key def find_col(*names: str) -> int | None: for n in names: for i, h in enumerate(header_cells): if h.lower() == n.lower(): return i return None i_rank = find_col("Rank") i_brand = find_col("Brand") i_product = find_col("Product") i_trait = find_col("Trait", "Traits") skip = {i_rank, i_brand, i_product, i_trait} # Anything else is a metric column metric_cols: list[tuple[str, int]] = [] for i, h in enumerate(header_cells): if i in skip: continue if h: metric_cols.append((h, i)) tbody = table.find("tbody") if tbody is None: return prod for row in tbody.find_all("tr"): cls = row.get("class") or [] # Skip CK AVERAGE and PLOT AVERAGE summary rows if "check-averages" in cls or "plot-averages" in cls: continue cells = [c.get_text(" ", strip=True) for c in row.find_all("td")] if len(cells) < 4: continue def cell(i: int | None) -> str: return cells[i] if i is not None and 0 <= i < len(cells) else "" metrics: dict[str, float | str | None] = {} for name, idx in metric_cols: raw = cell(idx).strip() if not raw or raw == "-": metrics[name] = None else: f = _parse_float(raw) metrics[name] = f if f is not None else raw result = TrialResult( rank=_parse_int(cell(i_rank)), brand=cell(i_brand).strip(), product=cell(i_product).strip(), traits=cell(i_trait).strip(), metrics=metrics, ) if result.brand or result.product or any(v is not None for v in metrics.values()): prod.results.append(result) return prod # --------------------------------------------------------------------- render def render_markdown(p: PlotReport) -> str: crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(p.crop, p.crop.title()) where = ", ".join(x for x in (p.city, p.state_abbrev.upper() if p.state_abbrev else None) if x) or "?" head: list[str] = [ f"# {crop_label} yield trial — {where}, {p.year}", "", f"- **Source:** AgriGold plot report (cross-vendor head-to-head)", f"- **Vendor:** AgReliant Genetics / AgriGold", f"- **Crop:** {crop_label}", ] if p.state_abbrev: head.append(f"- **State:** {p.state_abbrev.upper()}") if p.county: head.append(f"- **County:** {p.county}") if p.city: head.append(f"- **City:** {p.city}") head.append(f"- **Year:** {p.year}") head.append(f"- **Plot ID:** {p.plot_id}") if p.cooperator: head.append(f"- **Cooperator:** {p.cooperator}") if p.plot_average is not None: unit = "BU/Ac" # AgriGold publishes BU/Ac for both corn and soy head.append(f"- **Plot average:** {p.plot_average} {unit}") if p.planted_date: head.append(f"- **Planted:** {p.planted_date}") if p.harvested_date: head.append(f"- **Harvested:** {p.harvested_date}") if p.population: head.append(f"- **Population:** {p.population:,} seeds/acre") if p.row_width_in: head.append(f"- **Row width:** {p.row_width_in}") if p.num_rows: head.append(f"- **# Rows:** {p.num_rows}") if p.soil_type: head.append(f"- **Soil type:** {p.soil_type}") if p.tillage: head.append(f"- **Tillage:** {p.tillage}") if p.previous_crop: head.append(f"- **Previous crop:** {p.previous_crop}") if p.irrigation: head.append(f"- **Irrigation:** {p.irrigation}") if p.fungicide and p.fungicide.upper() != "N/A": head.append(f"- **Fungicide:** {p.fungicide}") if p.herbicide and p.herbicide.upper() != "N/A": head.append(f"- **Herbicide:** {p.herbicide}") if p.insecticide and p.insecticide.upper() != "N/A": head.append(f"- **Insecticide:** {p.insecticide}") head.append(f"- **URL:** {p.source_url}") head.append("") head.append("---") head.append("") sections: list[str] = [] if p.results: metric_keys: list[str] = [] seen: set[str] = set() for r in p.results: for k in r.metrics.keys(): if k not in seen: seen.add(k) metric_keys.append(k) sections.append("## Results (by rank)") sections.append("") headers = ["Rank", "Brand", "Product", "Trait"] + metric_keys sections.append("| " + " | ".join(headers) + " |") sections.append("|" + "|".join(["---"] * len(headers)) + "|") for r in p.results: row = [ str(r.rank) if r.rank is not None else "-", r.brand or "-", r.product or "-", r.traits or "-", ] for k in metric_keys: v = r.metrics.get(k) if v is None: row.append("-") elif isinstance(v, (int, float)): row.append(str(v)) else: row.append(str(v)) sections.append("| " + " | ".join(row) + " |") sections.append("") # Compact summary line for embedder signal — top 5 by Yield. primary = "Yield" if "Yield" in metric_keys else (metric_keys[0] if metric_keys else None) if primary: top = sorted( (r for r in p.results if isinstance(r.metrics.get(primary), (int, float))), key=lambda r: -r.metrics[primary], # type: ignore[operator] )[:5] if top: bits = [f"{r.product} ({r.brand}) {r.metrics[primary]}" for r in top] sections.append(f"Top 5 by {primary}: " + ", ".join(bits) + ".") sections.append("") return "\n".join(head) + "\n".join(sections) # --------------------------------------------------------------------- write def write_plot(p: PlotReport, body_md: str) -> None: CORPUS_DIR.mkdir(parents=True, exist_ok=True) md_path = CORPUS_DIR / f"{p.source_key}.md" json_path = CORPUS_DIR / f"{p.source_key}.json" md_path.write_text(body_md, encoding="utf-8") sidecar = { "source": "agrigold_plot_reports", "source_key": p.source_key, "data_type": "trial", "vendor": "AgReliant Genetics", "brand": "AgriGold", "crop": p.crop, "state": p.state_abbrev.upper() if p.state_abbrev else None, "state_abbrev": p.state_abbrev, "city": p.city, "county": p.county, "year": p.year, "plot_id": p.plot_id, "cooperator": p.cooperator, "plot_average": p.plot_average, "planted_date": p.planted_date, "harvested_date": p.harvested_date, "population_seeds_per_acre": p.population, "row_width": p.row_width_in, "num_rows": p.num_rows, "soil_type": p.soil_type, "tillage": p.tillage, "previous_crop": p.previous_crop, "irrigation": p.irrigation, "fungicide": p.fungicide, "herbicide": p.herbicide, "insecticide": p.insecticide, "results": [ { "rank": r.rank, "brand": r.brand, "product": r.product, "traits": r.traits, "metrics": r.metrics, } for r in p.results ], "n_results": len(p.results), "source_urls": [p.source_url], "fetched_at": datetime.now(timezone.utc).isoformat(), "scraper_version": SCRAPER_VERSION, } json_path.write_text( json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8", ) # --------------------------------------------------------------------- pipeline def process_plot( http: RateLimitedSession, *, crop: str, year: int, plot_id: str, force: bool, ) -> tuple[str, PlotReport | None]: sk = source_key_for(crop, year, plot_id) md_path = CORPUS_DIR / f"{sk}.md" if md_path.exists() and not force: return "skipped", None try: p = fetch_plot_detail(http, crop=crop, year=year, plot_id=plot_id) except Exception as exc: # noqa: BLE001 log.error("detail fetch failed for %s/%s: %s", crop, plot_id, exc) return "failed", None if p is None: return "missing", None body = render_markdown(p) write_plot(p, body) return "written", p def run( *, limit: int | None, force: bool, only_crop: str | None, only_year: int | None, include_2023: bool, workers: int = DEFAULT_WORKERS, ) -> int: CORPUS_DIR.mkdir(parents=True, exist_ok=True) crops = {only_crop} if only_crop else set(CROPS.keys()) if only_year: years = {only_year} elif include_2023: years = {2023, 2024, 2025} else: years = {2024, 2025} discovery_http = RateLimitedSession() targets = discover_plots(discovery_http, crops=crops, years=years) log.info("discovered %d total plot targets", len(targets)) if limit is not None: targets = targets[:limit] counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0} counts_lock = threading.Lock() processed_counter = {"n": 0} total = len(targets) thread_local = threading.local() def _session() -> RateLimitedSession: s = getattr(thread_local, "session", None) if s is None: s = RateLimitedSession() thread_local.session = s return s def _worker(target: tuple[str, int, str]) -> tuple[str, Any]: crop, year, plot_id = target return process_plot( _session(), crop=crop, year=year, plot_id=plot_id, force=force, ) log.info( "dispatching %d plots across %d workers (shared rate limiter %.2f sec/req)", total, workers, REQ_INTERVAL_SEC, ) with ThreadPoolExecutor(max_workers=workers) as pool: futures = {pool.submit(_worker, t): t for t in targets} for fut in as_completed(futures): target = futures[fut] crop, year, plot_id = target try: status, p = fut.result() except Exception as exc: # noqa: BLE001 log.error("worker failed for %s/%s/%s: %s", crop, year, plot_id, exc) status, p = "failed", None with counts_lock: counts[status] = counts.get(status, 0) + 1 processed_counter["n"] += 1 n = processed_counter["n"] if (p is not None and n <= 5) or n % 100 == 0 or status == "failed": log.info( "[%d/%d] %s %s | results=%d state=%s", n, total, source_key_for(crop, year, plot_id), status, len(p.results) if p else 0, (p.state_abbrev.upper() if p and p.state_abbrev else "-"), ) log.info( "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)", processed_counter["n"], counts["written"], counts["skipped"], counts["missing"], counts["failed"], total, ) return 0 if counts["failed"] == 0 else 1 # --------------------------------------------------------------------- CLI def _build_argparser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( prog="scrape.sources.agrigold_plot_reports", description="Scrape AgriGold cross-vendor plot reports (yield trials).", ) p.add_argument("--limit", type=int, default=None, help="Stop after processing N plots (default: all).") p.add_argument("--force", action="store_true", help="Re-fetch even if the markdown file already exists.") p.add_argument("--crop", default=None, choices=tuple(CROPS.keys()), help="Limit to one crop.") p.add_argument("--year", type=int, default=None, choices=(2022, 2023, 2024, 2025, 2026), help="Limit to one year.") p.add_argument("--include-2023", action="store_true", help="Include 2023 plot reports (default: 2024-2025 only).") p.add_argument("--workers", type=int, default=DEFAULT_WORKERS, help=f"Concurrent worker threads (default {DEFAULT_WORKERS}, " f"all share a global {REQ_INTERVAL_SEC}-sec rate limiter).") p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) return p def main(argv: list[str] | None = None) -> int: args = _build_argparser().parse_args(argv) logging.basicConfig( level=args.log_level.upper(), format="%(asctime)s %(levelname)s %(name)s %(message)s", stream=sys.stderr, ) return run( limit=args.limit, force=args.force, only_crop=args.crop, only_year=args.year, include_2023=args.include_2023, workers=args.workers, ) if __name__ == "__main__": sys.exit(main())