"""AgriGold scraper — AgReliant Genetics brand. Source: ``www.agrigold.com`` — WordPress site, empty robots.txt (no Disallow). Catalog covers corn + soybeans. Sibling of LG Seeds under the same parent (AgReliant) but distinct branding / positioning, so kept in its own scraper. Discovery: the listing page ``/corn/explore-corn-hybrids`` (and the soybean equivalent) is server-rendered HTML that contains ```` for every variety. Codes look like ``A616-30``, ``A623-88``, etc. Parse the listing HTML, collect distinct variety URLs. Per-variety detail (``/corn/explore-corn-hybrids/``) renders several ``
`` blocks. Each section has a ``
`` heading + multiple ``.detail-item`` rows shaped as ``
N
V
``. The ``
`` content is one of: - **5-circle rating scale** (Agronomic Rating, Disease Tolerance, Silage Characteristics): ``
`` containing 5 children, where N have class ``circle selected`` and 5-N have class ``circle``. Count = rating on a **1-5 scale** (5 = best). Distinct from Bayer / LG Seeds' 1-9 convention — documented in the sidecar's ``_scale_direction``. - **Numeric value** (GDUs, year, plant population): bare number. - **Categorical / qualitative** (Ear Flex Type "KERNEL", Leaf Orientation "SEMI UPRIGHT", Cob Color "Red"): the literal text. - **NA**: rated but not yet measured. Rating scale: ``1-5 (5 = best)`` — distinct from the other brands; the chunker reads ``_scale_direction`` to render the correct preamble. Output: corpus/agrigold/.md corpus/agrigold/.json source_key: ``agrigold-`` lowercased, e.g. ``agrigold-a616-30``. CLI: python -m scrape.sources.agrigold --crop corn --limit 5 python -m scrape.sources.agrigold --force """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.agrigold.com" LISTING_PATHS = { "corn": "/corn/explore-corn-hybrids", "soybeans": "/soybeans/explore-soybean-varieties", } # AgriGold publishes ratings on a 1-5 scale (5 = best), counted from # the selected circles in the per-rating scale block. The chunker # preserves this verbatim — every chunk preamble declares the scale # so the LLM doesn't conflate with Bayer's 1-9. 
RATING_SCALE_DIRECTION = "1-5 (5 = best)"

# The corpus lives under <repo>/corpus unless CORPUS_ROOT overrides it.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "agrigold"

# Minimum spacing between two outgoing requests, in seconds.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.agrigold")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that spaces out calls and retries transient failures."""

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough that calls are at least ``interval`` seconds apart."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send a request, retrying network errors and HTTP 429 / 5xx.

        Up to ``max_retries`` attempts are made. The outcome of the *final*
        attempt is what the caller sees: its response is returned (even if
        it is still 429 / 5xx, so ``raise_for_status()`` can report it) or
        its ``requests.RequestException`` is re-raised. No backoff sleep is
        performed after the final attempt.

        Raises:
            ValueError: if ``max_retries`` is smaller than 1.
            requests.RequestException: if the final attempt fails at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        for attempt in range(max_retries):
            final_attempt = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final_attempt:
                    raise
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning(
                    "network error on %s %s: %s — retry in %.1fs",
                    method, url, exc, backoff,
                )
                time.sleep(backoff)
                continue

            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if final_attempt:
                log.warning(
                    "HTTP %d on %s %s — giving up after %d attempts",
                    resp.status_code, method, url, max_retries,
                )
                return resp

            # Honour an integer Retry-After header; otherwise use capped
            # exponential backoff with jitter.
            ra = resp.headers.get("Retry-After")
            if ra and ra.isdigit():
                backoff = float(ra)
            else:
                backoff = min(30.0, (2 ** attempt) + random.random())
            log.warning(
                "HTTP %d on %s %s — retry in %.1fs",
                resp.status_code, method, url, backoff,
            )
            time.sleep(backoff)

        # The loop always returns or raises on its final iteration.
        raise RuntimeError("unreachable: retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class AGProduct:
    """One scraped variety, as parsed from its detail page."""

    source_key: str
    source_url: str
    crop: str
    product_name: str = ""
    relative_maturity: str | None = None  # corn RM days from .maturity
    maturity_group: str | None = None  # soy MG
    trait_descriptions: list[str] = field(default_factory=list)
    # One dict per page section: {"label", "type", "items": [{"characteristic", "value"}]}
    characteristics_groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


def discover_varieties(
    http: RateLimitedSession,
    *,
    only_crop: str | None = None,
) -> list[tuple[str, str, str]]:
    """Return ``[(url, crop, variety_code), ...]`` for every variety in the listing pages.

    ``only_crop`` restricts discovery to a single key of ``LISTING_PATHS``.
    Raises whatever ``raise_for_status()`` raises if a listing page cannot
    be fetched.
    """
    out: list[tuple[str, str, str]] = []
    for crop, path in LISTING_PATHS.items():
        if only_crop and crop != only_crop:
            continue
        log.info("fetching listing %s%s", BASE, path)
        r = http.get(f"{BASE}{path}")
        r.raise_for_status()

        # Collect distinct hrefs that look like ``<listing path>/<code>``.
        # Codes are alphanumeric with dashes. Links may be root-relative or
        # absolute on the same host, with or without a trailing slash.
        href_re = re.compile(
            rf"^(?:{re.escape(BASE)})?{re.escape(path)}/([\w\-]+)/?$"
        )
        seen: set[str] = set()
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.find_all("a", href=True):
            m = href_re.match(a["href"].strip())
            if not m:
                continue
            code = m.group(1)
            # Shape check on the URL tail: 3-31 chars, starting with a
            # letter or digit.
            # NOTE(review): with re.I this also accepts plain words such as
            # "filter" or "browse"; it does not by itself exclude
            # catalog-tool links. Confirm whether codes always contain a
            # digit before tightening.
            if not re.match(r"^[A-Z0-9][\w\-]{2,30}$", code, re.I):
                continue
            if code in seen:
                continue
            seen.add(code)
            out.append((f"{BASE}{path}/{code}", crop, code))
        log.info(" %s: %d varieties", crop, len(seen))
    log.info("total varieties discovered: %d", len(out))
    return out


# --------------------------------------------------------------------- helpers


def source_key_for(code: str) -> str:
    """Build the corpus key for a variety code, e.g. ``A616-30`` -> ``agrigold-a616-30``."""
    slug = re.sub(r"[^a-zA-Z0-9-]+", "-", code).strip("-").lower()
    return f"agrigold-{slug}"


# Section class hint -> normalized label for the sidecar.
SECTION_LABEL_MAP = {
    "agronomic-rating": "AGRONOMIC RATING",
    "disease-tolerance": "DISEASE TOLERANCE",
    "plant-characteristics": "PLANT CHARACTERISTICS",
    "plant-features": "PRODUCT FEATURES",
    "silage-characteristics": "SILAGE CHARACTERISTICS",
    "planting-applications": "PLANTING APPLICATIONS",
    "planting-population": "PLANTING POPULATION",
}


def _parse_scale(value_el) -> int | None:
    """Count selected circles in a ``div.scale`` block inside ``value_el``.

    Returns ``None`` when ``value_el`` is missing or holds no scale block;
    otherwise the number of descendant ``div`` elements with a class
    containing ``selected`` (0 when none are selected).
    """
    if value_el is None:
        return None
    scale = value_el.find("div", class_="scale")
    if scale is None:
        return None
    selected = scale.find_all("div", class_=lambda c: c and "selected" in c)
    return len(selected)


def _parse_value(value_el) -> str:
    """Extract a non-scale value: the element's text, space-joined and trimmed."""
    if value_el is None:
        return ""
    # Callers try _parse_scale first, so by this point the element is
    # treated as plain text.
    return value_el.get_text(" ", strip=True)


# --------------------------------------------------------------------- detail


def _section_label(section) -> str:
    """Pick a section's label: mapped CSS class, else its ``.title`` text, else ``OTHER``."""
    for cls in section.get("class", []):
        if cls in SECTION_LABEL_MAP:
            return SECTION_LABEL_MAP[cls]
    title_el = section.find(class_="title")
    return title_el.get_text(strip=True).upper() if title_el else "OTHER"


def _section_items(section) -> list[dict]:
    """Collect ``{"characteristic", "value"}`` dicts from a section's ``div.detail-item`` rows."""
    items: list[dict] = []
    for detail in section.find_all("div", class_="detail-item"):
        label_el = detail.find("div", class_="label")
        value_el = detail.find("div", class_="value")
        ch = (label_el.get_text(" ", strip=True) if label_el else "").strip()
        if not ch:
            continue
        scale = _parse_scale(value_el)
        if scale is not None:
            items.append({"characteristic": ch, "value": str(scale)})
            continue
        v = _parse_value(value_el)
        # Special-case the "Row Type" header row from planting-population
        # which holds nested headers, not a real rating.
        if ch.lower() == "row type" and v.lower() in (
            "low medium high",
            "low / medium / high",
        ):
            continue
        if v:
            items.append({"characteristic": ch, "value": v})
    return items


def fetch_product_detail(
    http: RateLimitedSession,
    url: str,
    crop: str,
    code: str,
) -> AGProduct:
    """Fetch one variety page and parse it into an ``AGProduct``.

    Raises whatever ``raise_for_status()`` raises on an HTTP error status.
    """
    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    prod = AGProduct(
        source_key=source_key_for(code),
        source_url=url,
        crop=crop,
        product_name=code,
    )

    # Maturity — the first number in the text of the element with class
    # ``maturity`` (e.g. "86 days").
    mat_el = soup.find(class_="maturity")
    if mat_el is not None:
        m = re.search(r"(\d+(?:\.\d+)?)", mat_el.get_text(strip=True))
        if m:
            if crop == "corn":
                prod.relative_maturity = m.group(1)
            elif crop == "soybeans":
                prod.maturity_group = m.group(1)

    # Trait package — from .product-details / "Trait Package"
    details_el = soup.find(class_="product-details")
    if details_el is not None:
        # The details block renders pairs of label / value text:
        # "Genetic Family | Icon-J | Trait Package | VT2RIB | ..."
        # Parse the labels we recognize.
        text = details_el.get_text(" | ", strip=True)
        m = re.search(r"Trait Package\s*\|\s*([^|]+?)(?:\s*\||$)", text)
        if m:
            tp = m.group(1).strip()
            if tp and tp.lower() not in ("none", "-"):
                prod.trait_descriptions = [tp]

    # Iterate all product-section blocks; bucket items per section.
    for section in soup.find_all("div", class_=re.compile(r"product-section")):
        label = _section_label(section)
        items = _section_items(section)
        if items:
            prod.characteristics_groups.append({
                "label": label,
                "type": "scale-or-value",
                "items": items,
            })

    return prod


# --------------------------------------------------------------------- render


def _md_cell(text: Any) -> str:
    """Make ``text`` safe for a Markdown table cell: one line, pipes escaped."""
    return " ".join(str(text).split()).replace("|", "\\|")


def render_markdown(p: AGProduct) -> str:
    """Render a product as Markdown: a bullet-list header, then one table per group.

    Cell text is whitespace-normalised and ``|`` is escaped so a stray
    pipe or newline in scraped text cannot break the table.
    """
    title = p.product_name or p.source_key
    crop_label = "Corn" if p.crop == "corn" else "Soybeans"
    head: list[str] = [
        f"# {title}",
        "",
        "- **Vendor:** AgReliant Genetics",
        "- **Brand:** AgriGold",
        f"- **Crop:** {crop_label}",
    ]
    if p.relative_maturity and p.crop == "corn":
        head.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group and p.crop == "soybeans":
        head.append(f"- **Maturity group:** {p.maturity_group}")
    if p.trait_descriptions:
        head.append(f"- **Traits:** {', '.join(p.trait_descriptions)}")
    head.append(f"- **Source:** {p.source_url}")
    head.append(f"- **Rating scale (AgriGold):** {RATING_SCALE_DIRECTION}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    for g in p.characteristics_groups:
        label = (g.get("label") or "Characteristics").title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )

    return "\n".join(head) + "\n".join(sections)


# --------------------------------------------------------------------- write


def write_product(prod: AGProduct, body_md: str) -> None:
    """Write the JSON sidecar and the Markdown body for ``prod`` into ``CORPUS_DIR``.

    The sidecar is written first: the ``.md`` file is what
    ``process_product`` checks to decide a variety is already done, so it
    must only appear once the sidecar exists.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "agrigold",
        "source_key": prod.source_key,
        "vendor": "AgReliant Genetics",
        "brand": "AgriGold",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": prod.trait_descriptions,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": None,
        "strengths": [],
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    md_path.write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    crop: str,
    code: str,
    force: bool,
) -> tuple[str, AGProduct | None]:
    """Fetch, render and write one variety.

    Returns ``(status, product)`` where status is ``"skipped"`` (Markdown
    file already present and ``force`` is false), ``"failed"`` (fetching or
    parsing raised; the error is logged) or ``"written"``. The product is
    only returned for ``"written"``.
    """
    source_key = source_key_for(code)
    md_path = CORPUS_DIR / f"{source_key}.md"
    if md_path.exists() and not force:
        return "skipped", None

    # Boundary catch: one bad variety must not abort the whole run.
    try:
        prod = fetch_product_detail(http, url, crop, code)
    except Exception as exc:  # noqa: BLE001
        log.error("variety %s failed: %s", code, exc)
        return "failed", None

    body = render_markdown(prod)
    write_product(prod, body)
    return "written", prod


def run(*, limit: int | None, force: bool, only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties and process them; return a process exit code.

    Returns 0 when nothing failed, 1 when at least one variety failed and
    2 when ``only_product`` matched no discovered variety. ``only_product``
    is compared case-insensitively against both the source key and the
    raw variety code.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_varieties(http, only_crop=only_crop)
    if only_product:
        wanted = only_product.lower()
        targets = [
            (u, c, k) for (u, c, k) in targets
            if source_key_for(k) == wanted or k.lower() == wanted
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "failed": 0}
    processed = 0
    for url, crop, code in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(
            http, url=url, crop=crop, code=code, force=force,
        )
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s groups=%d",
                processed,
                str(limit) if limit else "all",
                prod.source_key,
                status,
                prod.crop,
                prod.relative_maturity or prod.maturity_group or "-",
                ",".join(prod.trait_descriptions) or "-",
                len(prod.characteristics_groups),
            )
        else:
            log.info(
                "[%d/%s] %s %s",
                processed,
                str(limit) if limit else "all",
                source_key_for(code),
                status,
            )

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (of %d candidates)",
        processed, counts["written"], counts["skipped"], counts["failed"], len(targets),
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.agrigold",
        description="Scrape AgriGold (AgReliant Genetics) corn + soybean varieties.",
    )
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-fetch even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=list(LISTING_PATHS),
                   help="Limit to one crop.")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or variety code.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, configure logging to stderr, and run."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )


if __name__ == "__main__":
    sys.exit(main())