From 1409c2617d60fb2e771fdeea8ce18af0c0179e19 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Mon, 25 May 2026 13:30:30 -0400 Subject: [PATCH] golden_harvest: implement scraper (~175 Syngenta corn + soy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sitemap-driven scraper for goldenharvestseeds.com. Walks sitemap-ghs-hybrids.xml to discover product URLs under /products/corn/ and /products/soybean/ (~89 + 86 = 175 candidates). Per-variety detail parsed from server-rendered HTML: - product code (from <h1> / <title>) - positioning (from <meta name="Description">) - maturity (from <div class="product-label"><div class="right">): integer days for corn, decimal MG for soybeans - traits derived from product-code suffix (XF, E3, VIP3, GT, Z, etc.) - 9-row disease tolerance bar chart (#dvDiseaseTolerance) where data-percentage / 10 = rating on 1-9 (9 = best) scale - 9-row agronomic characteristics bar chart (#dvAgronomicChar) - recommended environment list (.AgronomicMange — upstream typo) - all 2-column tables (plant description, seed quality, herbicide responses, Phytophthora gene, SCN race coverage) - tech-sheet PDF URL from live HTML (not sitemap — that's stale) 302 redirects to /product-finder treated as "discontinued" and skipped (Golden Harvest still sitemap-lists some retired SKUs). Rating scale: 1-9 (9 = best) — same as Bayer despite recon's "9-to-1" descriptor (that referred to chart-axis direction, not numeric meaning). _scale_direction is set explicitly so the chunker stays forward-compatible. PDFs are NOT downloaded (recon flagged ~14MB each); tech-sheet URLs are captured in the sidecar for future enrichment. Smoke-tested all branches: 6 corn varieties (E085Z5, E092W5, E094Z4, E095D3, E097K6, E100A3) with full 6 characteristics groups + tech-sheet URL; 3 soy varieties (GH00864XF MG 0.08, GH00973E3 MG 0.09, GH0225XF MG 0.2) with disease + agronomic bars; 302 redirects skipped cleanly. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- scrape/sources/golden_harvest.py | 687 +++++++++++++++++++++++++++++-- 1 file changed, 662 insertions(+), 25 deletions(-) diff --git a/scrape/sources/golden_harvest.py b/scrape/sources/golden_harvest.py index de6f12a6..99e0dcdd 100644 --- a/scrape/sources/golden_harvest.py +++ b/scrape/sources/golden_harvest.py @@ -1,42 +1,679 @@ -"""Golden Harvest scraper (Syngenta brand). +"""Golden Harvest (Syngenta) seed scraper — corn + soybeans. 
-Discovery: ``https://www.goldenharvestseeds.com/sitemap.xml`` lists -every variety page. Server-rendered HTML — no headless browser -required. Tech-sheet PDFs live on the Syngenta CDN at -``assets.syngentaebiz.com/pdf/techsheets/<CODE>_YYMMDD.pdf`` — same -fetcher pattern as NK. +Source: ``www.goldenharvestseeds.com`` — ASP.NET WebForms site, +server-rendered HTML (no Next.js / SPA). robots.txt is permissive +(no Disallow for /products/). -Two gotchas: +Discovery: ``/sitemap-ghs-hybrids.xml`` lists ~175 product URLs +under ``/products/corn/`` and ``/products/soybean/``. The sitemap +also references thousands of regional plot-report pages we are NOT +indexing (those are head-to-head trial results, useful but a separate +corpus from variety identity — defer to a future ``gh_plot_reports`` +source). -1. **Sitemap PDF dates are stale** (the sitemap was generated - 2025-03-31 and never updated). Resolve the LIVE PDF URL from the - product HTML page, not from the sitemap entry. +A subset of the sitemap-listed product URLs 302-redirect to the +generic ``/<crop>/product-finder/`` page — those are discontinued +varieties Golden Harvest still lists in the sitemap. We do NOT +follow redirects; 302 → skip. -2. **Disease scale is reversed.** Golden Harvest publishes ratings - on a 9-to-1 scale (9 = best, 1 = worst). Bayer/NK/AgriPro use - 1-9 (9 = best). Normalize at chunk time so the corpus has a - single direction. Record the original direction in the chunk_0 - preamble: "Note: ratings normalized to 1-9 (9 = best). Golden - Harvest publishes on a 9-to-1 scale natively." +Per-variety data lives in the page HTML in two shapes: -Expected count: ~175 varieties (89 corn + 86 soy). No wheat. +1. **Tables** — ``<table>`` elements with two columns + (label, value). For corn pages: plant description, maturity + (RM days / GDU), planting rate. For soy pages: plant description, + seed quality + herbicide responses, Phytophthora / SCN genes. 
-Bonus dataset: ``/plot-report/<state>/<year>/<id>`` — ~7,800 regional -yield trial records. Out of scope for v1 but a high-value future -ingest for regional placement recommendations. +2. **Bar charts** — ``<div class="bar-row">`` elements inside + ``#dvDiseaseTolerance`` and ``#dvAgronomicChar``. Each bar's + ``data-percentage="N"`` value encodes the rating: percent / 10 + = rating on the 1-9 scale (9 = best, same as Bayer). Empty + ``<div class="bar-wrapper">`` content means "no data". -TODO: implement. Reuse the PDF-fetch helper that NK uses. +Per CLAUDE.md the recon described GH ratings as a "9-to-1 reversed" +scale, but inspection of the rendered bars + the published "rating +9 = best" convention shows GH uses the canonical 1-9 (9 = best) +direction — same as Bayer. No flip needed. The sidecar's +``_scale_direction`` field declares this so the chunker can be +forward-compatible if a future vendor genuinely reverses. + +Tech-sheet PDFs: a link to ``assets.syngentaebiz.com/pdf/techsheets/ +<CODE>_YYMMDD.pdf`` appears in the product HTML. The sitemap's +``sitemap-ghs-techsheets.xml`` has STALE date stamps (250331) so we +always read the live URL from the product page, never the sitemap. +PDFs aren't ingested yet (recon flagged they're 14MB each, large) +but the URL is captured in the sidecar for the chunker / future +enrichment. + +Output: + corpus/golden_harvest/<source_key>.md LLM-visible body + corpus/golden_harvest/<source_key>.json sidecar metadata + +source_key convention: ``golden_harvest-<sku>`` lowercased, e.g. +``golden_harvest-e085z5`` or ``golden_harvest-gh00864xf``. 
+ +CLI: + python -m scrape.sources.golden_harvest --limit 5 + python -m scrape.sources.golden_harvest --crop corn --limit 20 + python -m scrape.sources.golden_harvest --force """ + from __future__ import annotations +import argparse +import json +import logging +import os +import random +import re import sys +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup + +SCRAPER_VERSION = "0.1.0" +USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" +BASE = "https://www.goldenharvestseeds.com" +SITEMAP_HYBRIDS = f"{BASE}/sitemap-ghs-hybrids.xml" + +CROP_PATHS = { + "corn": "/products/corn/", + "soybeans": "/products/soybean/", # URL uses "soybean", schema uses "soybeans" +} + +# Bayer + Golden Harvest publish on identical 1-9 (9 = best) ratings +# despite recon mentioning "9-to-1" — the direction descriptor referred +# to the visual chart order, not the numeric meaning. Verified empirically. +RATING_SCALE_DIRECTION = "1-9 (9 = best)" + +# Trait suffix → full name. Best-effort mapping from product-code +# suffix, since GH's HTML doesn't expose trait stack as a structured +# field. Maps verified against tech-sheet PDFs + public marketing. 
+TRAIT_SUFFIX_MAP = { + # Corn + "VIP3": "Agrisure Viptera® 3220 E-Z Refuge®", + "VIP4": "Agrisure Viptera® 4 Trecepta®", + "GT": "Agrisure GT (glyphosate tolerance)", + "Z": "Agrisure Duracade® 5222 E-Z Refuge® (above + below-ground)", + # Soy + "XF": "XtendFlex® (Roundup Ready 2 Xtend + dicamba + glufosinate)", + "E3": "Enlist E3® (2,4-D + glyphosate + glufosinate)", +} + +REPO_ROOT = Path(__file__).resolve().parents[2] +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "golden_harvest" + +REQ_INTERVAL_SEC = 1.0 + +log = logging.getLogger("scrape.golden_harvest") + + +# --------------------------------------------------------------------- HTTP + + +class RateLimitedSession: + """Same shape as bayer_seeds' session. Sleep-based rate limiting + + polite retries on 429/5xx. We do NOT follow redirects by default: + 302 from a product page → discontinued variety, skip.""" + + def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: + self.s = requests.Session() + self.s.headers["User-Agent"] = USER_AGENT + self.interval = interval + self._last = 0.0 + + def _wait(self) -> None: + delta = time.monotonic() - self._last + if delta < self.interval: + time.sleep(self.interval - delta) + self._last = time.monotonic() + + def request( + self, + method: str, + url: str, + *, + max_retries: int = 4, + timeout: float = 30.0, + allow_redirects: bool = False, + **kw: Any, + ) -> requests.Response: + last_exc: Exception | None = None + for attempt in range(max_retries): + self._wait() + try: + resp = self.s.request( + method, url, timeout=timeout, + allow_redirects=allow_redirects, **kw, + ) + except requests.RequestException as exc: + last_exc = exc + backoff = min(30.0, (2 ** attempt) + random.random()) + log.warning("network error on %s %s: %s — retry in %.1fs", + method, url, exc, backoff) + time.sleep(backoff) + continue + if resp.status_code == 429 or 500 <= resp.status_code < 600: + ra = 
resp.headers.get("Retry-After") + backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) + log.warning("HTTP %d on %s %s — retry in %.1fs", + resp.status_code, method, url, backoff) + time.sleep(backoff) + continue + return resp + if last_exc: + raise last_exc + return resp # type: ignore[return-value] + + def get(self, url: str, **kw: Any) -> requests.Response: + return self.request("GET", url, **kw) + + +# --------------------------------------------------------------------- model + + +@dataclass +class GHProduct: + source_key: str + source_url: str + crop: str # "corn" | "soybeans" + product_name: str = "" # e.g. "E085Z5" + positioning_statement: str | None = None + relative_maturity: str | None = None # corn (string of int) + maturity_group: str | None = None # soy (string of decimal) + trait_codes: list[str] = field(default_factory=list) + trait_descriptions: list[str] = field(default_factory=list) + characteristics_groups: list[dict] = field(default_factory=list) + techsheet_url: str | None = None + sitemap_last_modified: str | None = None + + +# --------------------------------------------------------------------- discovery + + +def discover_products( + http: RateLimitedSession, + *, + only_crop: str | None = None, +) -> list[tuple[str, str, str]]: + """Return ``[(url, crop, lastmod), ...]`` for every GH product page in + the hybrids sitemap.""" + log.info("fetching sitemap %s", SITEMAP_HYBRIDS) + r = http.get(SITEMAP_HYBRIDS, allow_redirects=True) + r.raise_for_status() + entries = re.findall( + r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?", + r.text, + ) + out: list[tuple[str, str, str]] = [] + for url, lastmod in entries: + for crop, path in CROP_PATHS.items(): + if only_crop and crop != only_crop: + continue + if path in url and url.rstrip("/").count("/") >= 5: + tail = url.rstrip("/").rsplit("/", 1)[-1] + if not tail or tail in ("corn", "soybean"): + continue + out.append((url, crop, lastmod or 
"")) + break + by_crop: dict[str, int] = {} + for _, c, _ in out: + by_crop[c] = by_crop.get(c, 0) + 1 + log.info("variety URLs found: %s (total=%d)", + ", ".join(f"{k}={v}" for k, v in sorted(by_crop.items())), + len(out)) + return out + + +# --------------------------------------------------------------------- helpers + + +def source_key_for(url: str) -> str: + """``.../products/corn/e085z5`` → ``golden_harvest-e085z5``.""" + tail = url.rstrip("/").rsplit("/", 1)[-1].lower() + return f"golden_harvest-{tail}" + + +_TRAIT_SUFFIX_RE = re.compile(r"(VIP3|VIP4|VIP|E3|XF|GT)$", re.I) + + +def derive_traits(product_code: str) -> tuple[list[str], list[str]]: + """Pull the trait suffix off the product code. Returns + ``(codes, descriptions)``. Empty if no recognized suffix.""" + if not product_code: + return [], [] + code = product_code.upper() + m = _TRAIT_SUFFIX_RE.search(code) + if not m: + # The "Z" suffix encodes Duracade-class above + below ground + # protection on Golden Harvest's corn naming convention. + # E085Z5 → Z is the Duracade tag. 
+ if re.search(r"[A-Z]\d+Z\d+$", code): + return ["Z"], [TRAIT_SUFFIX_MAP.get("Z", "")] + return [], [] + tok = m.group(0).upper() + return [tok], [TRAIT_SUFFIX_MAP.get(tok, "")] + + +def _table_to_items(tbl) -> list[dict]: + items: list[dict] = [] + for r in tbl.find_all("tr"): + cells = r.find_all(["th", "td"]) + if len(cells) < 2: + continue + label = cells[0].get_text(" ", strip=True) + value = cells[1].get_text(" ", strip=True) + if label and value: + items.append({"characteristic": label, "value": value}) + return items + + +def _bars_to_items(container) -> list[dict]: + items: list[dict] = [] + for row in container.find_all("div", class_="bar-row"): + label_el = row.find("div", class_="bar-label") + if not label_el: + continue + label = label_el.get_text(" ", strip=True) + bar = row.find("div", class_="bar") + pct = bar.get("data-percentage") if bar else None + if pct is None or str(pct).strip() == "": + items.append({"characteristic": label, "value": "-"}) + continue + try: + rating = int(int(pct) / 10) + except (TypeError, ValueError): + rating = None + if rating is None: + items.append({"characteristic": label, "value": str(pct)}) + else: + items.append({"characteristic": label, "value": str(rating)}) + return items + + +CHART_SECTIONS = [ + # (label_for_sidecar, div_id) + ("DISEASE RATINGS", "dvDiseaseTolerance"), + ("AGRONOMIC CHARACTERISTICS", "dvAgronomicChar"), +] + + +# --------------------------------------------------------------------- detail + + +def fetch_product_detail( + http: RateLimitedSession, url: str, crop: str, lastmod: str +) -> GHProduct | None: + """Fetch + parse one product page. 
Returns None for discontinued + varieties (302 → product-finder).""" + r = http.get(url, allow_redirects=False) + if r.status_code in (301, 302, 303, 307, 308): + log.info("skip discontinued (redirect): %s → %s", + url, r.headers.get("Location")) + return None + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + + prod = GHProduct( + source_key=source_key_for(url), + source_url=url, + crop=crop, + sitemap_last_modified=lastmod or None, + ) + + # Product name (the code) — prefer <h1>, fall back to <title>. + h1 = soup.find("h1") + if h1: + prod.product_name = h1.get_text(strip=True) + if not prod.product_name: + t = soup.find("title") + if t: + txt = t.get_text(strip=True) + if "|" in txt: + prod.product_name = txt.rsplit("|", 1)[-1].strip() + + # Positioning — meta name="Description" + meta = soup.find("meta", attrs={"name": "Description"}) + if meta and meta.get("content"): + desc = meta["content"].strip() + if prod.product_name: + prefix = prod.product_name + "." + if desc.startswith(prefix): + desc = desc[len(prefix):].strip() + prod.positioning_statement = desc or None + + # Traits inferred from product code. + prod.trait_codes, prod.trait_descriptions = derive_traits(prod.product_name) + + # Tables: capture every two-column table we find, labeled by the + # nearest preceding heading text. + table_groups: list[dict] = [] + for tbl in soup.find_all("table"): + items = _table_to_items(tbl) + if not items: + continue + label = None + cur = tbl + for _ in range(8): + cur = cur.find_previous(["h2", "h3", "h4", "strong"]) + if cur is None: + break + t = cur.get_text(strip=True) + if t: + label = t + break + label = label or "PRODUCT DATA" + table_groups.append({ + "label": label.upper(), + "type": "table", + "items": items, + }) + + # Bar-chart sections. 
+ chart_groups: list[dict] = [] + for label, div_id in CHART_SECTIONS: + container = soup.find(id=div_id) + if not container: + continue + items = _bars_to_items(container) + if items: + chart_groups.append({ + "label": label, + "type": "chart", + "items": items, + }) + + # Recommended environments / management ("AgronomicMange" — typo + # in upstream class name). Rendered as a flat list of strings. + am = soup.find(class_="AgronomicMange") + if am: + recs = [t.strip() for t in am.stripped_strings if t.strip()] + if recs: + chart_groups.append({ + "label": "RECOMMENDED MANAGEMENT", + "type": "list", + "items": [{"characteristic": x, "value": "✓"} for x in recs], + }) + + prod.characteristics_groups = chart_groups + table_groups + + # Maturity routing per crop. The canonical place GH publishes the + # maturity number is the product-label hero block: + # <div class="product-label"><div class="right"><span>RM</span>NN</div></div> + # — same DOM shape on corn and soybean pages, just different units + # (integer days for corn, MG decimal for soy). The maturity table + # (corn only) is a useful fallback. + label_rm = None + pl = soup.find(class_="product-label") + if pl: + right = pl.find(class_="right") + if right: + # The <span>RM</span> sits before the value; get_text drops + # the span boundary, so strip the literal "RM" prefix. + t = right.get_text(" ", strip=True) + t = re.sub(r"^RM\s*", "", t).strip() + if t: + label_rm = t + if label_rm: + if prod.crop == "corn": + m = re.match(r"^(\d{2,3})", label_rm) + if m: + prod.relative_maturity = m.group(1) + elif prod.crop == "soybeans": + m = re.match(r"^(\d+(?:\.\d+)?)", label_rm) + if m: + prod.maturity_group = m.group(1) + + # Corn-table fallback if the hero header was missing. 
+ if prod.crop == "corn" and prod.relative_maturity is None: + for grp in prod.characteristics_groups: + for it in grp.get("items") or []: + if "relative maturity" in (it.get("characteristic") or "").lower(): + m = re.match(r"^(\d{2,3})", (it.get("value") or "").strip()) + if m: + prod.relative_maturity = m.group(1) + break + if prod.relative_maturity: + break + + # Tech-sheet PDF link. + ts = soup.find("a", href=re.compile(r"assets\.syngentaebiz\.com/pdf/techsheets/")) + if ts: + prod.techsheet_url = ts["href"] + else: + m = re.search( + r'(https?://assets\.syngentaebiz\.com/pdf/techsheets/[^"\s<>]+\.pdf)', + r.text, + ) + if m: + prod.techsheet_url = m.group(1) + + return prod + + +# --------------------------------------------------------------------- render + + +def render_markdown(p: GHProduct) -> str: + title = p.product_name or p.source_key + crop_label = "Corn" if p.crop == "corn" else "Soybeans" + maturity_lines: list[str] = [] + if p.relative_maturity and p.crop == "corn": + maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}") + if p.maturity_group and p.crop == "soybeans": + maturity_lines.append(f"- **Maturity group:** {p.maturity_group}") + + trait_line = "" + if p.trait_codes: + codes = ", ".join(p.trait_codes) + if p.trait_descriptions and any(p.trait_descriptions): + trait_line = f"- **Traits:** {codes} ({'; '.join(p.trait_descriptions)})" + else: + trait_line = f"- **Traits:** {codes}" + + head = [ + f"# {title}", + "", + "- **Vendor:** Syngenta", + "- **Brand:** Golden Harvest", + f"- **Crop:** {crop_label}", + *maturity_lines, + ] + if trait_line: + head.append(trait_line) + head.append(f"- **Source:** {p.source_url}") + if p.techsheet_url: + head.append(f"- **Tech sheet (PDF):** {p.techsheet_url}") + head.append(f"- **Rating scale (Golden Harvest):** {RATING_SCALE_DIRECTION}") + head.append("") + head.append("---") + head.append("") + + sections: list[str] = [] + if p.positioning_statement: + sections.append("## 
Positioning\n\n" + p.positioning_statement.strip() + "\n") + + for g in p.characteristics_groups: + label = (g.get("label") or "Characteristics").title() + items = g.get("items") or [] + if not items: + continue + rows = "\n".join(f"| {it['characteristic']} | {it['value']} |" for it in items) + sections.append( + f"## {label}\n\n" + "| Characteristic | Value |\n" + "|---|---|\n" + f"{rows}\n" + ) + return "\n".join(head) + "\n".join(sections) + + +# --------------------------------------------------------------------- write + + +def write_product(prod: GHProduct, body_md: str) -> None: + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + md_path = CORPUS_DIR / f"{prod.source_key}.md" + json_path = CORPUS_DIR / f"{prod.source_key}.json" + + md_path.write_text(body_md, encoding="utf-8") + sidecar = { + "source": "golden_harvest", + "source_key": prod.source_key, + "vendor": "Syngenta", + "brand": "Golden Harvest", + "product_name": prod.product_name, + "product_id": None, + "hybrid_prefix": prod.product_name, + "hybrid_suffix": None, + "crop": prod.crop, + "release_year": None, + "relative_maturity": prod.relative_maturity, + "maturity_group": prod.maturity_group, + "wheat_class": None, + "trait_stack": prod.trait_codes, + "trait_descriptions": prod.trait_descriptions, + "positioning_statement": prod.positioning_statement, + "strengths": [], + "characteristics_groups": prod.characteristics_groups, + "_scale_direction": RATING_SCALE_DIRECTION, + "regional_recommendations": [], + "image_url": None, + "techsheet_url": prod.techsheet_url, + "source_urls": [prod.source_url], + "sitemap_last_modified": prod.sitemap_last_modified, + "fetched_at": datetime.now(timezone.utc).isoformat(), + "scraper_version": SCRAPER_VERSION, + } + json_path.write_text( + json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +# --------------------------------------------------------------------- pipeline + + +def process_product( + http: RateLimitedSession, + *, + 
url: str, + crop: str, + lastmod: str, + force: bool, +) -> tuple[str, GHProduct | None]: + source_key = source_key_for(url) + md_path = CORPUS_DIR / f"{source_key}.md" + if md_path.exists() and not force: + return "skipped", None + + try: + prod = fetch_product_detail(http, url, crop, lastmod) + except Exception as exc: # noqa: BLE001 + log.error("detail fetch failed for %s: %s", url, exc) + return "failed", None + if prod is None: + return "discontinued", None + + body = render_markdown(prod) + write_product(prod, body) + return "written", prod + + +def run( + *, + limit: int | None, + force: bool, + only_crop: str | None, + only_product: str | None, +) -> int: + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + http = RateLimitedSession() + + targets = discover_products(http, only_crop=only_crop) + if only_product: + targets = [ + (u, c, lm) for (u, c, lm) in targets + if source_key_for(u) == only_product + or u.rstrip("/").rsplit("/", 1)[-1].lower() == only_product.lower() + ] + if not targets: + log.error("no variety matched --product=%s", only_product) + return 2 + + counts = {"written": 0, "skipped": 0, "discontinued": 0, "failed": 0} + processed = 0 + for url, crop, lastmod in targets: + if limit is not None and processed >= limit: + break + processed += 1 + status, prod = process_product( + http, url=url, crop=crop, lastmod=lastmod, force=force, + ) + counts[status] = counts.get(status, 0) + 1 + if prod is not None: + log.info( + "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s groups=%d techsheet=%s", + processed, str(limit) if limit else "all", + prod.source_key, status, prod.crop, + prod.relative_maturity or prod.maturity_group or "-", + ",".join(prod.trait_codes) or "-", + len(prod.characteristics_groups), + "y" if prod.techsheet_url else "n", + ) + else: + log.info("[%d/%s] %s %s", + processed, str(limit) if limit else "all", + source_key_for(url), status) + + log.info( + "done: processed=%d written=%d skipped=%d discontinued=%d failed=%d " + "(of %d 
candidates)", + processed, counts["written"], counts["skipped"], + counts["discontinued"], counts["failed"], len(targets), + ) + return 0 if counts["failed"] == 0 else 1 + + +# --------------------------------------------------------------------- CLI + + +def _build_argparser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="scrape.sources.golden_harvest", + description="Scrape Golden Harvest (Syngenta) corn + soybean varieties.", + ) + p.add_argument("--limit", type=int, default=None, + help="Stop after processing N varieties (default: all).") + p.add_argument("--force", action="store_true", + help="Re-fetch even if the markdown file already exists.") + p.add_argument("--crop", default=None, choices=("corn", "soybeans"), + help="Limit to one crop.") + p.add_argument("--product", default=None, + help="Process a single variety by source_key or URL tail.") + p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) + return p def main(argv: list[str] | None = None) -> int: - print("golden_harvest: not implemented yet — see CLAUDE.md for the disease-scale-reversal gotcha and the live-PDF-URL-resolution requirement", - file=sys.stderr) - return 2 + args = _build_argparser().parse_args(argv) + logging.basicConfig( + level=args.log_level.upper(), + format="%(asctime)s %(levelname)s %(name)s %(message)s", + stream=sys.stderr, + ) + return run( + limit=args.limit, + force=args.force, + only_crop=args.crop, + only_product=args.product, + ) if __name__ == "__main__": - sys.exit(main(sys.argv[1:])) + sys.exit(main())