"""Golden Harvest (Syngenta) seed scraper — corn + soybeans. Source: ``www.goldenharvestseeds.com`` — ASP.NET WebForms site, server-rendered HTML (no Next.js / SPA). robots.txt is permissive (no Disallow for /products/). Discovery: ``/sitemap-ghs-hybrids.xml`` lists ~175 product URLs under ``/products/corn/`` and ``/products/soybean/``. The sitemap also references thousands of regional plot-report pages we are NOT indexing (those are head-to-head trial results, useful but a separate corpus from variety identity — defer to a future ``gh_plot_reports`` source). A subset of the sitemap-listed product URLs 302-redirect to the generic ``//product-finder/`` page — those are discontinued varieties Golden Harvest still lists in the sitemap. We do NOT follow redirects; 302 → skip. Per-variety data lives in the page HTML in two shapes: 1. **Tables** — ```` elements with two columns (label, value). For corn pages: plant description, maturity (RM days / GDU), planting rate. For soy pages: plant description, seed quality + herbicide responses, Phytophthora / SCN genes. 2. **Bar charts** — ``
`` elements inside ``#dvDiseaseTolerance`` and ``#dvAgronomicChar``. Each bar's ``data-percentage="N"`` value encodes the rating: percent / 10 = rating on the 1-9 scale (9 = best, same as Bayer). Empty ``
`` content means "no data". Per CLAUDE.md the recon described GH ratings as a "9-to-1 reversed" scale, but inspection of the rendered bars + the published "rating 9 = best" convention shows GH uses the canonical 1-9 (9 = best) direction — same as Bayer. No flip needed. The sidecar's ``_scale_direction`` field declares this so the chunker can be forward-compatible if a future vendor genuinely reverses. Tech-sheet PDFs: a link to ``assets.syngentaebiz.com/pdf/techsheets/ _YYMMDD.pdf`` appears in the product HTML. The sitemap's ``sitemap-ghs-techsheets.xml`` has STALE date stamps (250331) so we always read the live URL from the product page, never the sitemap. PDFs aren't ingested yet (recon flagged they're 14MB each, large) but the URL is captured in the sidecar for the chunker / future enrichment. Output: corpus/golden_harvest/.md LLM-visible body corpus/golden_harvest/.json sidecar metadata source_key convention: ``golden_harvest-`` lowercased, e.g. ``golden_harvest-e085z5`` or ``golden_harvest-gh00864xf``. CLI: python -m scrape.sources.golden_harvest --limit 5 python -m scrape.sources.golden_harvest --crop corn --limit 20 python -m scrape.sources.golden_harvest --force """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.goldenharvestseeds.com" SITEMAP_HYBRIDS = f"{BASE}/sitemap-ghs-hybrids.xml" CROP_PATHS = { "corn": "/products/corn/", "soybeans": "/products/soybean/", # URL uses "soybean", schema uses "soybeans" } # Bayer + Golden Harvest publish on identical 1-9 (9 = best) ratings # despite recon mentioning "9-to-1" — the direction descriptor referred # to the visual chart order, not the numeric meaning. Verified empirically. RATING_SCALE_DIRECTION = "1-9 (9 = best)" # Trait suffix → full name. Best-effort mapping from product-code # suffix, since GH's HTML doesn't expose trait stack as a structured # field. Maps verified against tech-sheet PDFs + public marketing. TRAIT_SUFFIX_MAP = { # Corn "VIP3": "Agrisure Viptera® 3220 E-Z Refuge®", "VIP4": "Agrisure Viptera® 4 Trecepta®", "GT": "Agrisure GT (glyphosate tolerance)", "Z": "Agrisure Duracade® 5222 E-Z Refuge® (above + below-ground)", # Soy "XF": "XtendFlex® (Roundup Ready 2 Xtend + dicamba + glufosinate)", "E3": "Enlist E3® (2,4-D + glyphosate + glufosinate)", } REPO_ROOT = Path(__file__).resolve().parents[2] CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "golden_harvest" REQ_INTERVAL_SEC = 1.0 log = logging.getLogger("scrape.golden_harvest") # --------------------------------------------------------------------- HTTP class RateLimitedSession: """Same shape as bayer_seeds' session. Sleep-based rate limiting + polite retries on 429/5xx. We do NOT follow redirects by default: 302 from a product page → discontinued variety, skip.""" def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: self.s = requests.Session() self.s.headers["User-Agent"] = USER_AGENT self.interval = interval self._last = 0.0 def _wait(self) -> None: delta = time.monotonic() - self._last if delta < self.interval: time.sleep(self.interval - delta) self._last = time.monotonic() def request( self, method: str, url: str, *, max_retries: int = 4, timeout: float = 30.0, allow_redirects: bool = False, **kw: Any, ) -> requests.Response: last_exc: Exception | None = None for attempt in range(max_retries): self._wait() try: resp = self.s.request( method, url, timeout=timeout, allow_redirects=allow_redirects, **kw, ) except requests.RequestException as exc: last_exc = exc backoff = min(30.0, (2 ** attempt) + random.random()) log.warning("network error on %s %s: %s — retry in %.1fs", method, url, exc, backoff) time.sleep(backoff) continue if resp.status_code == 429 or 500 <= resp.status_code < 600: ra = resp.headers.get("Retry-After") backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) log.warning("HTTP %d on %s %s — retry in %.1fs", resp.status_code, method, url, backoff) time.sleep(backoff) continue return resp if last_exc: raise last_exc return resp # type: ignore[return-value] def get(self, url: str, **kw: Any) -> requests.Response: return self.request("GET", url, **kw) # --------------------------------------------------------------------- model @dataclass class GHProduct: source_key: str source_url: str crop: str # "corn" | "soybeans" product_name: str = "" # e.g. "E085Z5" positioning_statement: str | None = None relative_maturity: str | None = None # corn (string of int) maturity_group: str | None = None # soy (string of decimal) trait_codes: list[str] = field(default_factory=list) trait_descriptions: list[str] = field(default_factory=list) characteristics_groups: list[dict] = field(default_factory=list) techsheet_url: str | None = None sitemap_last_modified: str | None = None # --------------------------------------------------------------------- discovery def discover_products( http: RateLimitedSession, *, only_crop: str | None = None, ) -> list[tuple[str, str, str]]: """Return ``[(url, crop, lastmod), ...]`` for every GH product page in the hybrids sitemap.""" log.info("fetching sitemap %s", SITEMAP_HYBRIDS) r = http.get(SITEMAP_HYBRIDS, allow_redirects=True) r.raise_for_status() entries = re.findall( r"\s*([^<]+)\s*(?:([^<]+))?", r.text, ) out: list[tuple[str, str, str]] = [] for url, lastmod in entries: for crop, path in CROP_PATHS.items(): if only_crop and crop != only_crop: continue if path in url and url.rstrip("/").count("/") >= 5: tail = url.rstrip("/").rsplit("/", 1)[-1] if not tail or tail in ("corn", "soybean"): continue out.append((url, crop, lastmod or "")) break by_crop: dict[str, int] = {} for _, c, _ in out: by_crop[c] = by_crop.get(c, 0) + 1 log.info("variety URLs found: %s (total=%d)", ", ".join(f"{k}={v}" for k, v in sorted(by_crop.items())), len(out)) return out # --------------------------------------------------------------------- helpers def source_key_for(url: str) -> str: """``.../products/corn/e085z5`` → ``golden_harvest-e085z5``.""" tail = url.rstrip("/").rsplit("/", 1)[-1].lower() return f"golden_harvest-{tail}" _TRAIT_SUFFIX_RE = re.compile(r"(VIP3|VIP4|VIP|E3|XF|GT)$", re.I) def derive_traits(product_code: str) -> tuple[list[str], list[str]]: """Pull the trait suffix off the product code. Returns ``(codes, descriptions)``. Empty if no recognized suffix.""" if not product_code: return [], [] code = product_code.upper() m = _TRAIT_SUFFIX_RE.search(code) if not m: # The "Z" suffix encodes Duracade-class above + below ground # protection on Golden Harvest's corn naming convention. # E085Z5 → Z is the Duracade tag. if re.search(r"[A-Z]\d+Z\d+$", code): return ["Z"], [TRAIT_SUFFIX_MAP.get("Z", "")] return [], [] tok = m.group(0).upper() return [tok], [TRAIT_SUFFIX_MAP.get(tok, "")] def _table_to_items(tbl) -> list[dict]: items: list[dict] = [] for r in tbl.find_all("tr"): cells = r.find_all(["th", "td"]) if len(cells) < 2: continue label = cells[0].get_text(" ", strip=True) value = cells[1].get_text(" ", strip=True) if label and value: items.append({"characteristic": label, "value": value}) return items def _bars_to_items(container) -> list[dict]: items: list[dict] = [] for row in container.find_all("div", class_="bar-row"): label_el = row.find("div", class_="bar-label") if not label_el: continue label = label_el.get_text(" ", strip=True) bar = row.find("div", class_="bar") pct = bar.get("data-percentage") if bar else None if pct is None or str(pct).strip() == "": items.append({"characteristic": label, "value": "-"}) continue try: rating = int(int(pct) / 10) except (TypeError, ValueError): rating = None if rating is None: items.append({"characteristic": label, "value": str(pct)}) else: items.append({"characteristic": label, "value": str(rating)}) return items CHART_SECTIONS = [ # (label_for_sidecar, div_id) ("DISEASE RATINGS", "dvDiseaseTolerance"), ("AGRONOMIC CHARACTERISTICS", "dvAgronomicChar"), ] # --------------------------------------------------------------------- detail def fetch_product_detail( http: RateLimitedSession, url: str, crop: str, lastmod: str ) -> GHProduct | None: """Fetch + parse one product page. Returns None for discontinued varieties (302 → product-finder).""" r = http.get(url, allow_redirects=False) if r.status_code in (301, 302, 303, 307, 308): log.info("skip discontinued (redirect): %s → %s", url, r.headers.get("Location")) return None r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") prod = GHProduct( source_key=source_key_for(url), source_url=url, crop=crop, sitemap_last_modified=lastmod or None, ) # Product name (the code) — prefer

, fall back to . h1 = soup.find("h1") if h1: prod.product_name = h1.get_text(strip=True) if not prod.product_name: t = soup.find("title") if t: txt = t.get_text(strip=True) if "|" in txt: prod.product_name = txt.rsplit("|", 1)[-1].strip() # Positioning — meta name="Description" meta = soup.find("meta", attrs={"name": "Description"}) if meta and meta.get("content"): desc = meta["content"].strip() if prod.product_name: prefix = prod.product_name + "." if desc.startswith(prefix): desc = desc[len(prefix):].strip() prod.positioning_statement = desc or None # Traits inferred from product code. prod.trait_codes, prod.trait_descriptions = derive_traits(prod.product_name) # Tables: capture every two-column table we find, labeled by the # nearest preceding heading text. table_groups: list[dict] = [] for tbl in soup.find_all("table"): items = _table_to_items(tbl) if not items: continue label = None cur = tbl for _ in range(8): cur = cur.find_previous(["h2", "h3", "h4", "strong"]) if cur is None: break t = cur.get_text(strip=True) if t: label = t break label = label or "PRODUCT DATA" table_groups.append({ "label": label.upper(), "type": "table", "items": items, }) # Bar-chart sections. chart_groups: list[dict] = [] for label, div_id in CHART_SECTIONS: container = soup.find(id=div_id) if not container: continue items = _bars_to_items(container) if items: chart_groups.append({ "label": label, "type": "chart", "items": items, }) # Recommended environments / management ("AgronomicMange" — typo # in upstream class name). Rendered as a flat list of strings. am = soup.find(class_="AgronomicMange") if am: recs = [t.strip() for t in am.stripped_strings if t.strip()] if recs: chart_groups.append({ "label": "RECOMMENDED MANAGEMENT", "type": "list", "items": [{"characteristic": x, "value": "✓"} for x in recs], }) prod.characteristics_groups = chart_groups + table_groups # Maturity routing per crop. The canonical place GH publishes the # maturity number is the product-label hero block: # <div class="product-label"><div class="right"><span>RM</span>NN</div></div> # — same DOM shape on corn and soybean pages, just different units # (integer days for corn, MG decimal for soy). The maturity table # (corn only) is a useful fallback. label_rm = None pl = soup.find(class_="product-label") if pl: right = pl.find(class_="right") if right: # The <span>RM</span> sits before the value; get_text drops # the span boundary, so strip the literal "RM" prefix. t = right.get_text(" ", strip=True) t = re.sub(r"^RM\s*", "", t).strip() if t: label_rm = t if label_rm: if prod.crop == "corn": m = re.match(r"^(\d{2,3})", label_rm) if m: prod.relative_maturity = m.group(1) elif prod.crop == "soybeans": m = re.match(r"^(\d+(?:\.\d+)?)", label_rm) if m: prod.maturity_group = m.group(1) # Corn-table fallback if the hero header was missing. if prod.crop == "corn" and prod.relative_maturity is None: for grp in prod.characteristics_groups: for it in grp.get("items") or []: if "relative maturity" in (it.get("characteristic") or "").lower(): m = re.match(r"^(\d{2,3})", (it.get("value") or "").strip()) if m: prod.relative_maturity = m.group(1) break if prod.relative_maturity: break # Tech-sheet PDF link. ts = soup.find("a", href=re.compile(r"assets\.syngentaebiz\.com/pdf/techsheets/")) if ts: prod.techsheet_url = ts["href"] else: m = re.search( r'(https?://assets\.syngentaebiz\.com/pdf/techsheets/[^"\s<>]+\.pdf)', r.text, ) if m: prod.techsheet_url = m.group(1) return prod # --------------------------------------------------------------------- render def render_markdown(p: GHProduct) -> str: title = p.product_name or p.source_key crop_label = "Corn" if p.crop == "corn" else "Soybeans" maturity_lines: list[str] = [] if p.relative_maturity and p.crop == "corn": maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}") if p.maturity_group and p.crop == "soybeans": maturity_lines.append(f"- **Maturity group:** {p.maturity_group}") trait_line = "" if p.trait_codes: codes = ", ".join(p.trait_codes) if p.trait_descriptions and any(p.trait_descriptions): trait_line = f"- **Traits:** {codes} ({'; '.join(p.trait_descriptions)})" else: trait_line = f"- **Traits:** {codes}" head = [ f"# {title}", "", "- **Vendor:** Syngenta", "- **Brand:** Golden Harvest", f"- **Crop:** {crop_label}", *maturity_lines, ] if trait_line: head.append(trait_line) head.append(f"- **Source:** {p.source_url}") if p.techsheet_url: head.append(f"- **Tech sheet (PDF):** {p.techsheet_url}") head.append(f"- **Rating scale (Golden Harvest):** {RATING_SCALE_DIRECTION}") head.append("") head.append("---") head.append("") sections: list[str] = [] if p.positioning_statement: sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n") for g in p.characteristics_groups: label = (g.get("label") or "Characteristics").title() items = g.get("items") or [] if not items: continue rows = "\n".join(f"| {it['characteristic']} | {it['value']} |" for it in items) sections.append( f"## {label}\n\n" "| Characteristic | Value |\n" "|---|---|\n" f"{rows}\n" ) return "\n".join(head) + "\n".join(sections) # --------------------------------------------------------------------- write def write_product(prod: GHProduct, body_md: str) -> None: CORPUS_DIR.mkdir(parents=True, exist_ok=True) md_path = CORPUS_DIR / f"{prod.source_key}.md" json_path = CORPUS_DIR / f"{prod.source_key}.json" md_path.write_text(body_md, encoding="utf-8") sidecar = { "source": "golden_harvest", "source_key": prod.source_key, "vendor": "Syngenta", "brand": "Golden Harvest", "product_name": prod.product_name, "product_id": None, "hybrid_prefix": prod.product_name, "hybrid_suffix": None, "crop": prod.crop, "release_year": None, "relative_maturity": prod.relative_maturity, "maturity_group": prod.maturity_group, "wheat_class": None, "trait_stack": prod.trait_codes, "trait_descriptions": prod.trait_descriptions, "positioning_statement": prod.positioning_statement, "strengths": [], "characteristics_groups": prod.characteristics_groups, "_scale_direction": RATING_SCALE_DIRECTION, "regional_recommendations": [], "image_url": None, "techsheet_url": prod.techsheet_url, "source_urls": [prod.source_url], "sitemap_last_modified": prod.sitemap_last_modified, "fetched_at": datetime.now(timezone.utc).isoformat(), "scraper_version": SCRAPER_VERSION, } json_path.write_text( json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8", ) # --------------------------------------------------------------------- pipeline def process_product( http: RateLimitedSession, *, url: str, crop: str, lastmod: str, force: bool, ) -> tuple[str, GHProduct | None]: source_key = source_key_for(url) md_path = CORPUS_DIR / f"{source_key}.md" if md_path.exists() and not force: return "skipped", None try: prod = fetch_product_detail(http, url, crop, lastmod) except Exception as exc: # noqa: BLE001 log.error("detail fetch failed for %s: %s", url, exc) return "failed", None if prod is None: return "discontinued", None body = render_markdown(prod) write_product(prod, body) return "written", prod def run( *, limit: int | None, force: bool, only_crop: str | None, only_product: str | None, ) -> int: CORPUS_DIR.mkdir(parents=True, exist_ok=True) http = RateLimitedSession() targets = discover_products(http, only_crop=only_crop) if only_product: targets = [ (u, c, lm) for (u, c, lm) in targets if source_key_for(u) == only_product or u.rstrip("/").rsplit("/", 1)[-1].lower() == only_product.lower() ] if not targets: log.error("no variety matched --product=%s", only_product) return 2 counts = {"written": 0, "skipped": 0, "discontinued": 0, "failed": 0} processed = 0 for url, crop, lastmod in targets: if limit is not None and processed >= limit: break processed += 1 status, prod = process_product( http, url=url, crop=crop, lastmod=lastmod, force=force, ) counts[status] = counts.get(status, 0) + 1 if prod is not None: log.info( "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s groups=%d techsheet=%s", processed, str(limit) if limit else "all", prod.source_key, status, prod.crop, prod.relative_maturity or prod.maturity_group or "-", ",".join(prod.trait_codes) or "-", len(prod.characteristics_groups), "y" if prod.techsheet_url else "n", ) else: log.info("[%d/%s] %s %s", processed, str(limit) if limit else "all", source_key_for(url), status) log.info( "done: processed=%d written=%d skipped=%d discontinued=%d failed=%d " "(of %d candidates)", processed, counts["written"], counts["skipped"], counts["discontinued"], counts["failed"], len(targets), ) return 0 if counts["failed"] == 0 else 1 # --------------------------------------------------------------------- CLI def _build_argparser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( prog="scrape.sources.golden_harvest", description="Scrape Golden Harvest (Syngenta) corn + soybean varieties.", ) p.add_argument("--limit", type=int, default=None, help="Stop after processing N varieties (default: all).") p.add_argument("--force", action="store_true", help="Re-fetch even if the markdown file already exists.") p.add_argument("--crop", default=None, choices=("corn", "soybeans"), help="Limit to one crop.") p.add_argument("--product", default=None, help="Process a single variety by source_key or URL tail.") p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) return p def main(argv: list[str] | None = None) -> int: args = _build_argparser().parse_args(argv) logging.basicConfig( level=args.log_level.upper(), format="%(asctime)s %(levelname)s %(name)s %(message)s", stream=sys.stderr, ) return run( limit=args.limit, force=args.force, only_crop=args.crop, only_product=args.product, ) if __name__ == "__main__": sys.exit(main())