"""AgriPro (Syngenta) wheat scraper.

Source: ``agriprowheat.com`` — Drupal site, server-rendered HTML.
robots.txt is empty (no Disallow).

Expected count: 24 varieties spanning Hard Red Winter (HRW), Hard Red
Spring (HRS), Hard White Spring (HWS), Soft White Winter (SWW), Soft
White Spring (SWS), and durum. NO SRW — Syngenta's Soft Red Winter sits
at GrowProGenetics.com under a separate brand, out of scope for AgriPro.

Discovery: the variety listing at ``/search-agripro-brand-varieties``
server-renders only the filter form; the actual variety rows are
populated by a Drupal Views AJAX call. We sidestep the AJAX by passing
the filter values as GET params on the same path:

    /search-agripro-brand-varieties?title=&variety_type_value=All

That returns the fully-rendered list (24 rows in
``.block-views-blockvarieties-search-varieties-search-block``) with
links to ``/variety/<slug>`` pages.

Per-variety detail comes from the variety page HTML. Useful fields:

- ``<h1>`` — product name (e.g. "AP Exceed")
- ``.field--node--variety-type--variety`` — wheat class
  ("Soft White Winter", "Hard Red Spring", etc.)
- ``.field--node--tag-line--variety`` — short positioning slogan
- ``.field--node--body`` — full positioning narrative
- Three sections delimited by ``<h3>``: Agronomics / Grain / Disease,
  each containing ``.row`` divs with ``<div class="label">`` /
  ``<div>`` pairs.

**Rating-scale direction**: AgriPro publishes disease tolerance on a
1-9 scale where **1 = best (most resistant)** — REVERSED from Bayer's
and Golden Harvest's "9 = best" convention. The chunker preserves
values verbatim and the sidecar's ``_scale_direction`` field declares
the direction, so the LLM's chunk-preamble framing will correctly say
"(1 = best)" — anti-hallucination guarantee holds even across vendors
with opposite scales.

(Agronomic ratings on AgriPro are qualitative — "Excellent / Very Good
/ Good / Fair / Poor" — and don't have a numeric direction issue.
They're preserved verbatim.)

Output:
    corpus/agripro/<source_key>.md
    corpus/agripro/<source_key>.json

source_key convention: ``agripro-<slug>`` lowercased, e.g.
``agripro-ap-exceed`` or ``agripro-sy-assure``.

CLI:
    python -m scrape.sources.agripro --limit 5
    python -m scrape.sources.agripro --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Recorded in every sidecar as ``scraper_version``.
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

BASE = "https://agriprowheat.com"
# Listing URL with the filter values passed as GET params (see the
# "Discovery" note in the module docstring).
LIST_URL = f"{BASE}/search-agripro-brand-varieties?title=&variety_type_value=All"

# AgriPro disease ratings: 1-9, LOWER number = MORE resistant. This
# is the inverse of Bayer/Golden-Harvest's 1-9 (9 = best) convention.
# Document this in the sidecar so the chunker / LLM never mis-renders.
RATING_SCALE_DIRECTION = "1-9 (1 = best, lower = more resistant)"

# Class abbreviations for the wheat_class field. AgriPro renders the
# full English name; we map it to the canonical short form the rest
# of the corpus uses (matches schema notes in seed-mcp/CLAUDE.md).
WHEAT_CLASS_MAP = {
    "hard red winter": "HRW",
    "hard red spring": "HRS",
    "hard white spring": "HWS",
    "hard white winter": "HWW",
    "soft white winter": "SWW",
    "soft white spring": "SWS",
    "soft red winter": "SRW",
    "durum": "Durum",
}

REPO_ROOT = Path(__file__).resolve().parents[2]
# CORPUS_ROOT env var overrides the default ``<repo>/corpus`` location.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "agripro"

# Minimum spacing between consecutive HTTP requests, in seconds.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.agripro")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that spaces out requests and retries.

    Consecutive requests are kept at least ``interval`` seconds apart.
    Network errors, HTTP 429 and HTTP 5xx responses are retried with
    exponential backoff (capped at 30 s, plus jitter).
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request slot.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to honour the minimum request spacing."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Issue a request, retrying transient failures.

        Args:
            method: HTTP verb passed through to ``Session.request``.
            url: Target URL.
            max_retries: Total number of attempts (must be >= 1).
            timeout: Per-attempt timeout in seconds.
            **kw: Extra keyword arguments for ``Session.request``.

        Returns:
            The first response that is not 429/5xx, or — when every
            attempt was used up — the response from the final attempt.

        Raises:
            ValueError: If ``max_retries`` is less than 1.
            requests.RequestException: If the final attempt failed at
                the network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, resp = exc, None
                if is_last:
                    # Nothing left to retry: don't waste a backoff sleep.
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            # We got a response, so any earlier network error is stale
            # and must not be raised after the loop.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, max_retries)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured;
                # an HTTP-date value falls back to exponential backoff.
                backoff = (
                    float(ra)
                    if (ra and ra.isdigit())
                    else min(30.0, (2 ** attempt) + random.random())
                )
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp

        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class APProduct:
    """Fields scraped from a single AgriPro variety page."""

    source_key: str
    source_url: str
    product_name: str = ""
    # Short class code from WHEAT_CLASS_MAP, or the raw page text when
    # the class name is not in the map.
    wheat_class: str | None = None
    positioning_statement: str | None = None
    tagline: str | None = None
    # One dict per rated section: {"label", "type", "items"}.
    characteristics_groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


def discover_varieties(http: RateLimitedSession) -> list[str]:
    """Fetch the variety-search page and return the list of
    ``/variety/<slug>`` URLs found in it.

    Dedupes per-row twice-listed links (the row's hero image link and
    its "view full details" link both point to the same place).
    Raises ``requests.HTTPError`` if the listing request fails.
    """
    log.info("fetching variety list %s", LIST_URL)
    r = http.get(LIST_URL)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    urls: list[str] = []
    seen: set[str] = set()
    for a in soup.find_all("a", href=re.compile(r"^/variety/")):
        h = a["href"]
        if h in seen:
            continue
        seen.add(h)
        urls.append(BASE + h)
    log.info("variety URLs found: %d", len(urls))
    return urls


# --------------------------------------------------------------------- helpers


def source_key_for(url: str) -> str:
    """``/variety/ap-exceed`` → ``agripro-ap-exceed``."""
    tail = url.rstrip("/").rsplit("/", 1)[-1].lower()
    return f"agripro-{tail}"


def normalize_wheat_class(raw: str | None) -> str | None:
    """Map a full wheat-class name to its short code.

    Matching is case-insensitive and ignores leading, trailing and
    repeated internal whitespace. Unknown names are returned cleaned up
    but otherwise unchanged; empty or ``None`` input yields ``None``.
    """
    if not raw:
        return None
    # Collapse whitespace runs so "Hard  Red\nWinter" still matches.
    cleaned = " ".join(raw.split())
    if not cleaned:
        return None
    return WHEAT_CLASS_MAP.get(cleaned.lower(), cleaned)


def _rows_in_section(soup: BeautifulSoup, h3_text: str) -> list[dict]:
    """Walk the variety page for the section heading matching
    ``h3_text``, then collect every ``.row`` inside the same container.

    Returns ``[{characteristic, value}, ...]``. Only the first matching
    heading is used; rows lacking a label or a value are dropped.
    """
    items: list[dict] = []
    for h3 in soup.find_all("h3"):
        if h3.get_text(strip=True).lower() != h3_text.lower():
            continue
        # Walk up to the enclosing section (the parent that scopes
        # the .row siblings of the h3). The simplest reliable scope:
        # the row siblings within the immediate parent.
        parent = h3.parent
        if parent is None:
            continue
        for row in parent.find_all(class_="row"):
            label_el = row.find(class_="label")
            if not label_el:
                continue
            label = label_el.get_text(" ", strip=True)
            # The value is whatever <div> sibling follows the label
            # (NOT the .label div itself).
            value: str | None = None
            for child in row.find_all("div"):
                if "label" in (child.get("class") or []):
                    continue
                # First non-label <div> with non-empty text wins.
                t = child.get_text(" ", strip=True)
                if t:
                    value = t
                    break
            if label and value:
                items.append({"characteristic": label, "value": value})
        # Stop after the first heading that matched.
        break
    return items


# --------------------------------------------------------------------- detail


def fetch_product_detail(
    http: RateLimitedSession, url: str
) -> APProduct | None:
    """Fetch one variety page and parse it into an ``APProduct``.

    Returns ``None`` when the page responds 404; any other HTTP error
    status raises via ``raise_for_status``.
    """
    r = http.get(url)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    prod = APProduct(
        source_key=source_key_for(url),
        source_url=url,
    )

    h1 = soup.find("h1")
    if h1:
        prod.product_name = h1.get_text(strip=True)

    vt = soup.find(class_="field--node--variety-type--variety")
    if vt:
        prod.wheat_class = normalize_wheat_class(vt.get_text(strip=True))

    tl = soup.find(class_="field--node--tag-line--variety")
    if tl:
        prod.tagline = tl.get_text(strip=True) or None

    # Body text — the long-form positioning narrative.
    body = soup.find(class_=re.compile(r"field--node--body"))
    if body:
        prod.positioning_statement = body.get_text(" ", strip=True) or None
    # Tagline alone if no body — better than nothing.
    if not prod.positioning_statement and prod.tagline:
        prod.positioning_statement = prod.tagline

    # The three rated sections on every variety page.
    groups: list[dict] = []
    for label, h3 in (
        ("AGRONOMICS", "Agronomics"),
        ("GRAIN", "Grain"),
        ("DISEASE RATINGS", "Disease"),
    ):
        items = _rows_in_section(soup, h3)
        if items:
            groups.append({"label": label, "type": "fields", "items": items})
    prod.characteristics_groups = groups
    return prod


# --------------------------------------------------------------------- render


def _md_cell(text: Any) -> str:
    """Make ``text`` safe for a single Markdown table cell."""
    # A raw newline ends the table row and a bare pipe starts a new
    # cell, so collapse whitespace and backslash-escape pipes.
    return " ".join(str(text).split()).replace("|", "\\|")


def render_markdown(p: APProduct) -> str:
    """Render a product as Markdown.

    The output is a bullet-list header, a ``---`` rule, an optional
    "Positioning" section (omitted when it merely repeats the tagline)
    and one two-column table per non-empty characteristics group.
    """
    title = p.product_name or p.source_key
    head: list[str] = [
        f"# {title}",
        "",
        "- **Vendor:** Syngenta",
        "- **Brand:** AgriPro",
        "- **Crop:** Wheat",
    ]
    if p.wheat_class:
        head.append(f"- **Wheat class:** {p.wheat_class}")
    if p.tagline:
        head.append(f"- **Tagline:** {p.tagline}")
    head.append(f"- **Source:** {p.source_url}")
    head.append(f"- **Rating scale (AgriPro):** {RATING_SCALE_DIRECTION}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    if p.positioning_statement and p.positioning_statement != p.tagline:
        sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n")

    for g in p.characteristics_groups:
        label = (g.get("label") or "Characteristics").title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )

    return "\n".join(head) + "\n".join(sections)


# --------------------------------------------------------------------- write


def write_product(prod: APProduct, body_md: str) -> None:
    """Write ``<source_key>.md`` and its ``.json`` sidecar to CORPUS_DIR.

    The Markdown file is written first, then the sidecar. The directory
    is created if it does not exist.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    md_path.write_text(body_md, encoding="utf-8")

    sidecar = {
        "source": "agripro",
        "source_key": prod.source_key,
        "vendor": "Syngenta",
        "brand": "AgriPro",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": "wheat",
        "release_year": None,
        "relative_maturity": None,
        "maturity_group": None,
        "wheat_class": prod.wheat_class,
        "trait_stack": [],
        "trait_descriptions": [],
        "positioning_statement": prod.positioning_statement,
        "tagline": prod.tagline,
        "strengths": [],
        "characteristics_groups": prod.characteristics_groups,
        # AgriPro's reversed direction is the load-bearing field here:
        # any cross-vendor disease-resistance comparison MUST consult
        # this before interpreting values. The chunker reads it; the
        # api_lessons file's rating-scales section documents the
        # convention.
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    force: bool,
) -> tuple[str, APProduct | None]:
    """Fetch, render and write one variety.

    Returns ``(status, product)`` where status is one of ``"skipped"``
    (output already on disk and ``force`` is false), ``"failed"`` (the
    fetch or parse raised), ``"missing"`` (page returned 404) or
    ``"written"``. The product is only returned for ``"written"``.
    """
    source_key = source_key_for(url)
    md_path = CORPUS_DIR / f"{source_key}.md"
    json_path = CORPUS_DIR / f"{source_key}.json"
    # Require BOTH files: a run interrupted between the two writes
    # leaves a markdown file without its sidecar, and that product must
    # be re-fetched rather than skipped forever.
    if not force and md_path.exists() and json_path.exists():
        return "skipped", None

    try:
        prod = fetch_product_detail(http, url)
    except Exception as exc:  # noqa: BLE001
        # Per-product boundary: one bad page must not abort the run.
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None
    if prod is None:
        return "missing", None

    body = render_markdown(prod)
    write_product(prod, body)
    return "written", prod


def run(
    *,
    limit: int | None,
    force: bool,
    only_product: str | None,
) -> int:
    """Discover varieties and process them; return a process exit code.

    Args:
        limit: Stop after processing this many varieties (``None`` = all).
            Skipped varieties count towards the limit.
        force: Re-fetch even when output already exists.
        only_product: Restrict the run to one variety, given as either a
            source_key or a URL tail (matched case-insensitively).

    Returns:
        0 on success, 1 if any variety failed, 2 if ``only_product``
        matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_varieties(http)
    if only_product:
        wanted = only_product.lower()
        targets = [
            u for u in targets
            if wanted in (
                source_key_for(u),
                u.rstrip("/").rsplit("/", 1)[-1].lower(),
            )
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0}
    processed = 0
    limit_label = str(limit) if limit is not None else "all"

    for url in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(http, url=url, force=force)
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | class=%s groups=%d",
                processed, limit_label,
                prod.source_key, status,
                prod.wheat_class or "-",
                len(prod.characteristics_groups),
            )
        else:
            log.info("[%d/%s] %s %s",
                     processed, limit_label,
                     source_key_for(url), status)

    log.info(
        "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)",
        processed, counts["written"], counts["skipped"],
        counts["missing"], counts["failed"], len(targets),
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.agripro",
        description="Scrape AgriPro (Syngenta) wheat varieties.",
    )
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-fetch even if the markdown file already exists.")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or URL tail.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, call ``run``."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_product=args.product,
    )


if __name__ == "__main__":
    sys.exit(main())