a54fac240f
Image rebuild (skip scrape) / build (push) Successful in 5m54s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
652 lines
23 KiB
Python
652 lines
23 KiB
Python
"""Iowa Crop Performance Tests (ICPT) — cross-vendor yield trials.
|
||
|
||
Iowa State University / the Iowa Crop Improvement Association run the
|
||
**Iowa Crop Performance Tests**, an independent, third-party variety
|
||
trial program. Because the trial is publisher-neutral, a single
|
||
district table ranks EVERY brand head-to-head — Pioneer, DEKALB,
|
||
Brevant, NuTech, Renk, Legacy, Epley Brothers, etc. — on identical
|
||
plots. That makes it the highest-trust ``data_type: "trial"`` source
|
||
in the corpus: unlike the vendor plot reports (Golden Harvest, LG,
|
||
AgriGold, ProHarvest), no seed company controls the entry list or the
|
||
agronomy, so there's no home-brand bias.
|
||
|
||
Site shape (ASP.NET, server-rendered GridView tables — requests +
|
||
BeautifulSoup, no JS / headless browser needed):
|
||
|
||
Corn: https://www.croptesting.iastate.edu/corn/CornDistrict2.aspx
|
||
Soybean: https://www.croptesting.iastate.edu/Soybean/SoybeanDistrict2.aspx
|
||
|
||
``...District2.aspx`` is the ONLY live district URL — the district
|
||
(North / Central / South) is chosen *on that same page* via a
|
||
``radLstDistrict`` radio (1/2/3) ASP.NET **postback**, NOT a separate
|
||
URL (CornDistrict1/3.aspx 302-redirect away). Likewise the year
|
||
(``cmbYear`` dropdown, 2025→2014) and the maturity season
|
||
(``radListSeason``: 1=Early, 2=Full) are postbacks — there are no
|
||
stable GET URLs for them. So we GET the page once to harvest the
|
||
ASP.NET hidden fields (``__VIEWSTATE`` / ``__VIEWSTATEGENERATOR`` /
|
||
``__VIEWSTATEENCRYPTED``), then POST the form with the desired
|
||
year/district/season + ``btnFilter=Filter`` to drive the view.
|
||
``CornDistrict.aspx`` (no number) is the 2013-and-older legacy page —
|
||
out of scope.
|
||
|
||
A district table is a multi-site aggregate: the GridView carries the
|
||
district-wide Yield plus a West/East sub-region split (Wyld/Eyld) and a
|
||
per-site yield column for each cooperator location in the district.
|
||
That makes **one district × season × year table the natural document
|
||
granularity** — one sidecar per ``(crop, year, district, season)``.
|
||
|
||
GridView column → field map:
|
||
corn: Company | Entry | RM | Herb Tech | Trait Package |
|
||
Yield | Yldp | Moist | Wyld | Eyld | <site> ...
|
||
soybean: Company | Entry | MG | Herb Tech |
|
||
Yield | WestYield | EastYield | <site> ...
|
||
Company -> result.brand (the seed COMPANY — critical)
|
||
Entry -> result.product (variety / hybrid code)
|
||
Herb Tech +
|
||
Trait Package -> result.traits
|
||
everything else (RM/MG, Yield, Yldp, Moist, Wyld/Eyld, per-site)
|
||
-> result.metrics ("Yield" kept verbatim as the
|
||
primary key the chunker's top-N picker reads)
|
||
Rows are pre-sorted by Yield DESC on the page; we re-sort defensively
|
||
and assign rank ourselves (the table has no rank column).
|
||
|
||
We emit the SAME sidecar shape as ``agrigold_plot_reports`` /
|
||
``lg_plot_reports`` / ``gh_plot_reports`` / ``proharvest_plots``
|
||
(``results: [{rank, brand, product, traits, metrics}]``). The trial
|
||
chunker's source dispatch doesn't list ``iowa_icpt_trials`` explicitly,
|
||
so it FALLS THROUGH to the shared ``_render_gh_plot_chunk`` renderer —
|
||
no ``rag/chunk.py`` edit required.
|
||
|
||
Output:
|
||
corpus/iowa_icpt_trials/<source_key>.md LLM-visible body
|
||
corpus/iowa_icpt_trials/<source_key>.json sidecar metadata
|
||
|
||
source_key: ``icpt-<crop>-<year>-<district>-<season>`` (season is always present)
|
||
e.g. ``icpt-corn-2025-north-early``, ``icpt-soybeans-2024-south-full``.
|
||
|
||
Scope: 2024 + 2025 baseline. ``--include-old`` walks 2014–2023.
|
||
|
||
robots/ToS: no robots.txt (the ASP.NET app 404s it); footer
|
||
"Copyright (c) 1995-2016 Iowa State University ... All rights reserved"
|
||
carries no automation clause. Public land-grant ICPT data, polite UA,
|
||
low request rate. (See ``tos_note`` in the sidecar.)
|
||
|
||
CLI:
|
||
python -m scrape.sources.iowa_icpt_trials --limit 4
|
||
python -m scrape.sources.iowa_icpt_trials --crop corn --year 2025
|
||
python -m scrape.sources.iowa_icpt_trials --include-old --force
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import os
|
||
import random
|
||
import re
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass, field
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Version stamp recorded in every sidecar under "scraper_version".
SCRAPER_VERSION = "0.1.0"
# User-Agent header set on the shared requests.Session.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://www.croptesting.iastate.edu"

# REPO_ROOT is parents[2] of this file's resolved path. The corpus lives under
# $CORPUS_ROOT when that variable is set and non-empty, else <REPO_ROOT>/corpus.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "iowa_icpt_trials"

REQ_INTERVAL_SEC = 2.0  # land-grant box; be polite, single-threaded

# Years scraped by default, and the extra years added by --include-old.
BASELINE_YEARS = [2024, 2025]
OLD_YEARS = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

# Written verbatim into each sidecar under "tos_note".
TOS_NOTE = (
    "Footer 'Copyright (c) ...ISU...All rights reserved' (no automation "
    "clause, no robots.txt); public ICPT data; low request rate; attribute "
    "Iowa Crop Performance Tests / ISU."
)

# crop -> (district-results page URL, RM/MG header label)
CROPS: dict[str, tuple[str, str]] = {
    "corn": (f"{BASE}/corn/CornDistrict2.aspx", "RM"),
    "soybeans": (f"{BASE}/Soybean/SoybeanDistrict2.aspx", "MG"),
}

# radLstDistrict radio value -> (slug, label)
DISTRICTS: dict[str, tuple[str, str]] = {
    "1": ("north", "North"),
    "2": ("central", "Central"),
    "3": ("south", "South"),
}
# radListSeason radio value -> (slug, label)
SEASONS: dict[str, tuple[str, str]] = {
    "1": ("early", "Early Season"),
    "2": ("full", "Full Season"),
}

# ASP.NET control names (used as form-field keys in the filter POST)
C_YEAR = "ctl00$MainContent$cmbYear"
C_DISTRICT = "ctl00$MainContent$radLstDistrict"
C_SEASON = "ctl00$MainContent$radListSeason"
C_SHOW = "ctl00$MainContent$radLstShowOptions"
C_FILTER = "ctl00$MainContent$btnFilter"

# GridView header labels that are NOT metric columns.
# They are compared against whitespace-collapsed, lower-cased header text.
BRAND_COL = "company"
PRODUCT_COL = "entry"
TRAIT_COLS = {"herb tech", "trait package"}

log = logging.getLogger("scrape.iowa_icpt_trials")
|
||
|
||
|
||
# --------------------------------------------------------------------- HTTP
|
||
|
||
|
||
class RateLimitedSession:
    """Single-threaded, rate-limited, retrying wrapper around ``requests.Session``.

    The ASP.NET viewstate flow is inherently sequential per page, so no
    global lock is needed; this class is not written for concurrent use.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: Minimum number of seconds between two request attempts.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() stamp taken when the previous attempt was released.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last attempt."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 45.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors and 429/5xx responses.

        Args:
            method: HTTP method name.
            url: Target URL.
            max_retries: Total number of attempts (not extra retries); >= 1.
            timeout: Per-attempt timeout forwarded to ``requests``.
            **kw: Forwarded unchanged to ``Session.request``.

        Returns:
            The first response whose status is not 429/5xx, or — once the
            attempts are used up — the last response received, whatever its
            status (callers are expected to call ``raise_for_status()``).

        Raises:
            ValueError: If ``max_retries`` is smaller than 1.
            requests.RequestException: If the final attempt fails at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        for attempt in range(max_retries):
            final = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final:
                    # No attempt left: surface the error right away instead
                    # of sleeping through a backoff nobody benefits from.
                    raise
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable or final:
                # On the final attempt a 429/5xx response is handed back as-is
                # so the caller sees the real HTTP status rather than an older
                # network error from a previous attempt.
                return resp

            # Honour a numeric Retry-After header; otherwise back off
            # exponentially with jitter, capped at 30 seconds.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)

        # The loop body always returns or raises on its final iteration.
        raise AssertionError("unreachable: retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)
|
||
|
||
|
||
# --------------------------------------------------------------------- model
|
||
|
||
|
||
@dataclass
class TrialResult:
    """One entry (table row) of a district results grid."""

    # 1-based position after sorting by Yield; None until the parser assigns it.
    rank: int | None = None
    # Text of the "Company" column.
    brand: str = ""
    # Text of the "Entry" column.
    product: str = ""
    # Trait-column texts joined with single spaces ("none" values dropped).
    traits: str = ""
    # Remaining columns keyed by their header text; numeric cells are stored
    # as int/float, other non-empty cells as the raw string.
    metrics: dict[str, float | str | None] = field(default_factory=dict)
|
||
|
||
|
||
@dataclass
class DistrictTrial:
    """One parsed (crop, year, district, season) results table."""

    # Document id, also used as the output file stem.
    source_key: str
    # URL of the district page the table was fetched from.
    source_url: str
    crop: str  # "corn" / "soybeans"
    year: int
    district_slug: str  # north / central / south
    district_label: str  # North / Central / South
    season_slug: str  # early / full
    season_label: str  # Early Season / Full Season
    sites: list[str] = field(default_factory=list)  # cooperator locations
    # Value of the summary grid's "Experiment Mean" row, when one was found.
    experiment_mean: float | None = None
    # Entries ordered by rank.
    results: list[TrialResult] = field(default_factory=list)
|
||
|
||
|
||
# --------------------------------------------------------------------- parse
|
||
|
||
|
||
def _hidden_fields(soup: BeautifulSoup) -> dict[str, str]:
|
||
out: dict[str, str] = {}
|
||
for inp in soup.find_all("input", {"type": "hidden"}):
|
||
name = inp.get("name")
|
||
if name:
|
||
out[name] = inp.get("value") or ""
|
||
return out
|
||
|
||
|
||
_NUM_RE = re.compile(r"^-?\d+(?:\.\d+)?$")
|
||
|
||
|
||
def _to_num(s: str | None) -> float | int | None:
|
||
s = (s or "").strip()
|
||
if not s or s == "-" or not _NUM_RE.match(s):
|
||
return None
|
||
f = float(s)
|
||
return int(f) if f.is_integer() else f
|
||
|
||
|
||
def _norm(s: str) -> str:
|
||
return re.sub(r"\s+", " ", (s or "").strip()).lower()
|
||
|
||
|
||
def _grid_rows(soup: BeautifulSoup, table_id: str) -> list[list[str]]:
|
||
table = soup.find("table", {"id": table_id})
|
||
if table is None:
|
||
return []
|
||
rows: list[list[str]] = []
|
||
for tr in table.find_all("tr"):
|
||
cells = [c.get_text(" ", strip=True) for c in tr.find_all(["th", "td"])]
|
||
if cells:
|
||
rows.append(cells)
|
||
return rows
|
||
|
||
|
||
def _experiment_mean(soup: BeautifulSoup) -> float | None:
    """Read the 'Experiment Mean' value from the summary GridView.

    Scans ``MainContent_gvDataSummary`` for the first row that has at least
    two cells and whose first cell starts with "experiment mean"
    (case-insensitively); the second cell is parsed as a number. Returns
    ``None`` when no such row exists or the cell is not numeric.
    """
    for row in _grid_rows(soup, "MainContent_gvDataSummary"):
        if len(row) < 2:
            continue
        if _norm(row[0]).startswith("experiment mean"):
            return _to_num(row[1])  # type: ignore[return-value]
    return None
|
||
|
||
|
||
def parse_district_table(
    soup: BeautifulSoup,
    *,
    rm_mg_label: str,
) -> tuple[list[TrialResult], list[str], float | None]:
    """Turn the ``MainContent_gvData`` GridView into ranked trial results.

    The first grid row is taken as the header. Company, Entry and the trait
    columns are located by their normalised header text; every other column
    with a non-empty header becomes a metric keyed by that header.

    Args:
        soup: Parsed response page.
        rm_mg_label: Header of the maturity column ("RM" / "MG"); it is
            excluded when deciding which columns are per-site yields.

    Returns:
        ``(results, site_columns, experiment_mean)``. Results are ordered by
        Yield, highest first, and ranked 1..n. A grid with fewer than two
        rows gives ``([], [], None)``.
    """
    grid = _grid_rows(soup, "MainContent_gvData")
    if len(grid) < 2:
        return [], [], None

    header, body = grid[0], grid[1:]
    hkeys = [_norm(label) for label in header]

    def first_index(key: str) -> int | None:
        # Position of the first header equal to *key*, or None if absent.
        return hkeys.index(key) if key in hkeys else None

    i_brand = first_index(BRAND_COL)
    i_product = first_index(PRODUCT_COL)
    i_traits = [pos for pos, key in enumerate(hkeys) if key in TRAIT_COLS]
    structural = {i_brand, i_product, *i_traits}

    # A per-site column is any non-structural column whose header is not one
    # of the known metric labels — i.e. the header is a location name.
    reserved_metric = {
        _norm(rm_mg_label), "yield", "yldp", "yield pct", "yield %",
        "moist", "wyld", "eyld", "westyield", "eastyield",
    }
    sites: list[str] = [
        header[pos]
        for pos, key in enumerate(hkeys)
        if pos not in structural and key and key not in reserved_metric
    ]
    metric_cols = [
        (label, pos)
        for pos, label in enumerate(header)
        if pos not in structural and label
    ]

    def text_at(cells: list[str], pos: int | None) -> str:
        # Stripped cell text, or "" for a missing column / out-of-range index.
        if pos is None or not 0 <= pos < len(cells):
            return ""
        return cells[pos].strip()

    results: list[TrialResult] = []
    for raw in body:
        # Short rows are right-padded with blanks. Over-long rows are left
        # alone; their surplus cells are simply never indexed.
        cells = raw + [""] * (len(header) - len(raw))

        trait_texts = [text_at(cells, pos) for pos in i_traits]
        traits = " ".join(
            text for text in trait_texts if text and _norm(text) != "none"
        ).strip()

        metrics: dict[str, float | str | None] = {}
        for name, pos in metric_cols:
            text = text_at(cells, pos)
            value = _to_num(text)
            if value is None:
                if not text or text == "-":
                    continue  # empty cell: leave the column out
                value = text
            metrics[name] = value

        candidate = TrialResult(
            brand=text_at(cells, i_brand),
            product=text_at(cells, i_product),
            traits=traits,
            metrics=metrics,
        )
        if _row_ok(candidate):
            results.append(candidate)

    def by_yield_desc(entry: TrialResult) -> tuple[int, float]:
        # Numeric yields first (highest on top); anything else sinks.
        y = entry.metrics.get("Yield")
        if not isinstance(y, (int, float)):
            return (1, 0.0)
        return (0, -float(y))

    # The page is already sorted, but don't trust it. The sort is stable, so
    # equal yields keep their page order.
    ranked = sorted(results, key=by_yield_desc)
    for position, entry in enumerate(ranked, start=1):
        entry.rank = position

    return ranked, sites, _experiment_mean(soup)
|
||
|
||
|
||
def _row_ok(r: TrialResult) -> bool:
    """Decide whether a parsed row is a genuine trial entry.

    A row is kept when its brand is non-empty, is not made up solely of
    digits and is not one of the summary labels, when it has a product
    code, and when its "Yield" metric is a number strictly between 10 and
    400. This drops summary/blank rows and any leaked aggregate line.
    """
    aggregate_labels = ("summary", "experiment mean", "minimum mean",
                        "maximum mean", "lsd", "coefficient of variability")

    brand = (r.brand or "").strip()
    if not brand or brand.isdigit():
        return False
    if _norm(brand) in aggregate_labels:
        return False
    if not (r.product or "").strip():
        return False

    # Corn runs ~120-280 bu/a, soy ~30-90; gate generously but reject
    # garbage / a moisture/RM value that leaked into the Yield slot.
    y = r.metrics.get("Yield")
    return isinstance(y, (int, float)) and 10 < float(y) < 400
|
||
|
||
|
||
# --------------------------------------------------------------------- fetch
|
||
|
||
|
||
def source_key_for(crop: str, year: int, district_slug: str, season_slug: str) -> str:
    """Build the document key ``icpt-<crop>-<year>-<district>-<season>``."""
    parts = ("icpt", crop, year, district_slug, season_slug)
    return "-".join(map(str, parts))
|
||
|
||
|
||
def fetch_view(
    http: RateLimitedSession,
    *,
    crop: str,
    year: int,
    district: str,  # radio value "1"/"2"/"3"
    season: str,  # radio value "1"/"2"
) -> DistrictTrial | None:
    """Fetch and parse one (crop, year, district, season) results table.

    The district page is first requested with GET to harvest its hidden form
    fields; the filter form is then POSTed back with the requested year,
    district and season. Returns the parsed ``DistrictTrial``, or ``None``
    when the resulting table has no usable entries.

    Raises:
        KeyError: For an unknown crop, district or season value (raised
            before any request is made).
        requests.HTTPError: If either response carries an error status.
    """
    page_url, rm_mg_label = CROPS[crop]
    district_slug, district_label = DISTRICTS[district]
    season_slug, season_label = SEASONS[season]

    landing = http.get(page_url)
    landing.raise_for_status()

    # Start from the page's own hidden fields, then overlay the selections.
    form = _hidden_fields(BeautifulSoup(landing.text, "html.parser"))
    form.update({
        C_YEAR: str(year),
        C_DISTRICT: district,
        C_SEASON: season,
        C_SHOW: "yield",  # yield view carries Yield/Yldp/Moist + per-SITE yields
        C_FILTER: "Filter",
    })

    filtered = http.post(page_url, data=form)
    filtered.raise_for_status()

    results, sites, mean = parse_district_table(
        BeautifulSoup(filtered.text, "html.parser"),
        rm_mg_label=rm_mg_label,
    )
    if not results:
        return None

    return DistrictTrial(
        source_key=source_key_for(crop, year, district_slug, season_slug),
        source_url=page_url,
        crop=crop,
        year=year,
        district_slug=district_slug,
        district_label=district_label,
        season_slug=season_slug,
        season_label=season_label,
        sites=sites,
        experiment_mean=mean,
        results=results,
    )
|
||
|
||
|
||
# --------------------------------------------------------------------- render
|
||
|
||
|
||
def render_markdown(t: DistrictTrial) -> str:
    """Render a district trial as the LLM-visible markdown body.

    The output has a title, a bullet list of trial metadata, a results table
    with one row per entry, and a compact "Top 5 by Yield" line.

    The table's metric columns are the union of the metric keys of ALL
    results, in first-seen order. Rows omit metrics whose cell was blank, so
    taking the columns from the first row alone would silently drop a column
    for every entry whenever the top-ranked row happens to lack that value.
    A metric that a given row does not have is rendered as "-".

    Args:
        t: The parsed trial to render.

    Returns:
        The markdown document as a single string.
    """
    crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(t.crop, t.crop.title())
    head: list[str] = [
        f"# {crop_label} yield trial — Iowa {t.district_label} District "
        f"({t.season_label}), {t.year}",
        "",
        "- **Source:** Iowa Crop Performance Tests (independent third-party trial)",
        "- **Publisher:** Iowa State University / Iowa Crop Improvement Association",
        f"- **Crop:** {crop_label}",
        "- **State:** IA",
        f"- **District:** {t.district_label}",
        f"- **Maturity season:** {t.season_label}",
        f"- **Year:** {t.year}",
    ]
    if t.experiment_mean is not None:
        head.append(f"- **Experiment mean yield:** {t.experiment_mean} bu/a")
    if t.sites:
        head.append(f"- **Cooperator sites:** {', '.join(t.sites)}")
    head += [f"- **URL:** {t.source_url}", "", "---", ""]

    # dict.fromkeys de-duplicates while preserving first-seen order.
    metric_keys: list[str] = list(
        dict.fromkeys(key for r in t.results for key in r.metrics)
    )

    sections: list[str] = ["## Results (by yield, all brands)", ""]
    headers = ["Rank", "Company", "Entry", "Traits"] + metric_keys
    sections.append("| " + " | ".join(headers) + " |")
    sections.append("|" + "|".join(["---"] * len(headers)) + "|")
    for r in t.results:
        row = [
            str(r.rank) if r.rank is not None else "-",
            r.brand or "-",
            r.product or "-",
            r.traits or "-",
        ]
        for k in metric_keys:
            v = r.metrics.get(k)
            row.append("-" if v is None else str(v))
        sections.append("| " + " | ".join(row) + " |")
    sections.append("")

    # Compact top-5 line for embedder signal.
    top = [r for r in t.results if isinstance(r.metrics.get("Yield"), (int, float))][:5]
    if top:
        bits = [f"{r.product} ({r.brand}) {r.metrics['Yield']}" for r in top]
        sections.append("Top 5 by Yield: " + ", ".join(bits) + ".")
        sections.append("")

    # head ends with an empty element, so its join already ends in a newline.
    return "\n".join(head) + "\n".join(sections)
|
||
|
||
|
||
# --------------------------------------------------------------------- write
|
||
|
||
|
||
def write_trial(t: DistrictTrial, body_md: str) -> None:
    """Write a trial's JSON sidecar and markdown body into ``CORPUS_DIR``.

    The sidecar is serialised before anything touches the disk and is
    written BEFORE the markdown file. The pipeline treats an existing
    ``<source_key>.md`` as "already scraped", so the markdown must be the
    last file to appear: if serialisation or the sidecar write fails, no
    markdown is left behind and the next run retries the document instead
    of skipping it with its sidecar missing.

    Args:
        t: The parsed trial.
        body_md: Rendered markdown body for the trial.
    """
    sidecar = {
        "source": "iowa_icpt_trials",
        "source_key": t.source_key,
        "data_type": "trial",
        "vendor": "Iowa State University",
        "brand_aggregator": "Iowa Crop Performance Tests publishes",
        "brand": "Iowa Crop Performance Tests",
        "crop": t.crop,
        "state": "IA",
        "state_abbrev": "ia",
        "year": t.year,
        "region": f"District {t.district_label}",
        "district": t.district_label,
        "season": t.season_label,
        "cooperator_sites": t.sites,
        "experiment_mean_yield": t.experiment_mean,
        "results": [
            {
                "rank": r.rank,
                "brand": r.brand,
                "product": r.product,
                "traits": r.traits,
                "metrics": r.metrics,
            }
            for r in t.results
        ],
        "n_results": len(t.results),
        "source_urls": [t.source_url],
        "tos_note": TOS_NOTE,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    # Any serialisation error surfaces here, before a file is created.
    sidecar_text = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"

    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    (CORPUS_DIR / f"{t.source_key}.json").write_text(sidecar_text, encoding="utf-8")
    # Markdown last: its presence marks the document as complete.
    (CORPUS_DIR / f"{t.source_key}.md").write_text(body_md, encoding="utf-8")
|
||
|
||
|
||
# --------------------------------------------------------------------- pipeline
|
||
|
||
|
||
def run(
    *,
    crops: set[str],
    years: list[int],
    limit: int | None,
    force: bool,
) -> int:
    """Scrape every (crop, year, district, season) table and write the outputs.

    Args:
        crops: Crop keys to scrape.
        years: Years to scrape, in the order given.
        limit: Maximum number of tables to fetch (skipped tables do not
            count); ``None`` means no limit.
        force: Re-fetch even when the markdown file already exists.

    Returns:
        Process exit status: 0 when no fetch failed, 1 otherwise.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    written = skipped = empty = failed = processed = 0

    targets: list[tuple[str, int, str, str]] = [
        (crop, year, district, season)
        for crop in sorted(crops)
        for year in years
        for district in DISTRICTS  # 1/2/3
        for season in SEASONS  # 1/2
    ]
    log.info("planned %d (crop x year x district x season) targets", len(targets))

    for crop, year, district, season in targets:
        if limit is not None and processed >= limit:
            break

        sk = source_key_for(crop, year, DISTRICTS[district][0], SEASONS[season][0])
        already_written = (CORPUS_DIR / f"{sk}.md").exists()
        if already_written and not force:
            skipped += 1
            continue

        processed += 1
        try:
            trial = fetch_view(http, crop=crop, year=year,
                               district=district, season=season)
        except Exception as exc:  # noqa: BLE001
            # One failing table must not abort the remaining targets.
            failed += 1
            log.error("[%s] fetch failed: %s", sk, exc)
            continue

        if trial is None:
            empty += 1
            log.info("[%s] empty table (no entries) — skipping", sk)
            continue

        write_trial(trial, render_markdown(trial))
        written += 1
        brands = {entry.brand for entry in trial.results}
        log.info("[%s] written | %d entries | %d sites | brands=%d",
                 sk, len(trial.results), len(trial.sites), len(brands))

    log.info("done: written=%d skipped=%d empty=%d failed=%d (processed=%d)",
             written, skipped, empty, failed, processed)
    return 1 if failed else 0
|
||
|
||
|
||
# --------------------------------------------------------------------- CLI
|
||
|
||
|
||
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ICPT scraper."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.iowa_icpt_trials",
        description="Scrape Iowa Crop Performance Tests (ICPT) cross-vendor "
                    "yield trials (corn + soybean district tables).",
    )
    known_years = tuple(BASELINE_YEARS + OLD_YEARS)
    crop_names = tuple(CROPS.keys())

    parser.add_argument(
        "--year", type=int, default=None, choices=known_years,
        help="Limit to a single year (default: 2024+2025 baseline).",
    )
    parser.add_argument(
        "--include-old", action="store_true",
        help="Also scrape 2014-2023 (deferred by default).",
    )
    parser.add_argument(
        "--crop", default=None, choices=crop_names,
        help="Limit to one crop (default: both).",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after writing/processing N tables (default: all).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """Parse the command line, configure logging and run the scraper.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit status produced by ``run``.
    """
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )

    # An explicit --year wins over --include-old.
    if args.year is not None:
        years = [args.year]
    else:
        pool = OLD_YEARS + BASELINE_YEARS if args.include_old else BASELINE_YEARS
        years = sorted(set(pool)) if args.include_old else list(pool)

    selected_crops = {args.crop} if args.crop else set(CROPS.keys())
    return run(crops=selected_crops, years=years, limit=args.limit, force=args.force)
|
||
|
||
|
||
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
|