"""Burrus Seed scraper — independent family-owned company (Arenzville, IL). Source: Burrus Hybrids ("Burrus Seed"), an independent family company founded **1935** in Arenzville, Illinois — NOT owned by any of the multinationals (Bayer / Corteva / Syngenta / BASF). It markets corn under the **Burrus** and **Power Plus** brands and soybeans under the **Burrus** and **DONMARIO** brands, sold through a dealer network across IL / IN / IA / MO / WI. Unlike the ProHarvest scraper (which parses HTML detail pages), Burrus publishes its full agronomic dataset through the **Seedware** catalog widget's JSON-over-JSONP API (the backend for the product finder on ``burrusseed.com/products/{corn,soybeans}``). So this scraper does TWO list calls and maps JSON fields straight into ``characteristics_groups``; there is no per-variety page fetch. Seedware API ------------ ``GET https://burrus25.seedware.net/app/_queries/crop_varieties.php ?crop_pkey=101&callback=cb`` -> CORN (JSONP) ``crop_pkey=102`` -> SOYBEANS Both require: * a ``callback`` query param (WITHOUT it the endpoint returns ``[]``), * a ``Referer: https://burrusseed.com/`` header. The response is ``cb([...]);`` — strip the JSONP wrapper to get a JSON array of ~38 corn + ~26 soy records. Each record has ~44 fields: ``id`` (variety code, e.g. ``8J697AM``), ``description`` (brand + code, e.g. ``Power Plus 8J697AM``), ``pkey`` (Seedware row id), ``maturity`` (RM for corn / MG for soy, as a string like ``"97.00"`` / ``"2.00"``), ``released`` (year int), ``trait`` / ``trait_platform``, a per-record brand in ``stat_corn_brand`` / ``stat_soybean_brand``, and many ``stat_*`` agronomic / disease / herbicide-tolerance ratings. Rating scales (confirmed from the live data, Jun 2026) ------------------------------------------------------ * **Numeric agronomic + disease ratings: 1-10, 10 = best / most tolerant** (observed values 4-10; standard Seedware/seed-industry high-is-better scale). 
Soy agronomic stats arrive as ``"8.000"`` — the trailing zeros are stripped to ``"8"``. ``NR`` / ``None`` / blank / ``-`` = not rated and are SKIPPED (never coerced to a value). * **Herbicide tolerance + insect-protection packages: Yes / No** (verbatim). ``glyphosate`` / ``glufosinate`` / ``2,4-D choline`` / ``FOPs`` / ``dicamba`` tolerances and the Bt insect packages (corn borer / rootworm / etc.) are categorical Yes/No, not numeric. * **Categorical agronomic notes** (corn-on-corn suitability, refuge structure) pass through verbatim. Output: corpus/burrus/.md corpus/burrus/.json source_key: ``burrus-`` lowercased + slugified, e.g. ``burrus-8j697am``. The variety ``id`` (the catalog code) is stable. CLI: python -m scrape.sources.burrus --crop corn --limit 2 --force python -m scrape.sources.burrus --crop soybeans python -m scrape.sources.burrus --force python -m scrape.sources.burrus --product burrus-8j697am ROBOTS / UA: burrusseed.com robots.txt blocks ~33 NAMED AI/scraper bots (Scrapy, CCBot, Bytespider, Diffbot, ...) and declares ``Crawl-delay: 10`` + ``Content-signal: ai-train=no``; ``User-agent: *`` is allowed. The operator has chosen to include this source. We use a non-blacklisted UA and honour the 10-second crawl delay (the API call count is tiny — two list calls — so this is cheap). """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests SCRAPER_VERSION = "0.1.0" # NOT any blacklisted bot name — robots.txt allows User-agent: *. USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" SEEDWARE = "https://burrus25.seedware.net" API = f"{SEEDWARE}/app/_queries/crop_varieties.php" SITE = "https://burrusseed.com" REFERER = "https://burrusseed.com/" # crop_pkey -> (chunker crop value, public product page slug). 
CROP_PKEYS = {
    "corn": (101, "corn"),
    "soybeans": (102, "soybeans"),
}

# robots.txt declares Crawl-delay: 10 for burrusseed.com / seedware.net.
# Honour it — the catalog is only two list calls so this is cheap.
REQ_INTERVAL_SEC = 10.0

# Human-readable description of the rating scales; emitted verbatim into the
# markdown header and into the sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = (
    "numeric agronomic + disease ratings 1-10, 10=best/most-tolerant "
    "(observed 4-10; higher is better); NR/blank/0/'-' = not rated (omitted). "
    "Herbicide tolerances and Bt insect-protection packages are Yes/No "
    "(verbatim, not numeric). Corn-on-corn suitability and refuge structure "
    "are categorical."
)

# ----- stat_* field -> human characteristic name, one dict per group -------
#
# The dicts are attached to their group labels in GROUP_ORDER. Group labels
# match the chunker's buckets in rag/chunk.py:
#   "DISEASE RATINGS"            -> disease framing
#   "AGRONOMIC CHARACTERISTICS"  -> agronomic framing
#   "HERBICIDE TOLERANCE"        -> falls into the chunker's MANAGEMENT
#                                   bucket ("HERBICIDE" is a recognised
#                                   label), so it renders as
#                                   "Management notes".
# Fields intentionally NOT mapped: stat_corn_brand / stat_soybean_brand
# (used for the per-record brand), stat_herbicide_tolerance (always blank
# in the live data — the per-chemistry stats carry the real signal).
DISEASE_FIELDS = {
    # corn
    "stat_gray_leaf_spot_tolerance": "Gray leaf spot tolerance",
    "stat_tar_spot_tolerance": "Tar spot tolerance",
    # soy
    "stat_brown_stem_rot": "Brown stem rot (BSR) tolerance",
    "stat_sds": "Sudden death syndrome (SDS) tolerance",
    "stat_phytophthora_root_rot": "Phytophthora root rot tolerance",
    "stat_prr_phytophthora_root_rot": "Phytophthora root rot (PRR) tolerance",
}

# Agronomic ratings — numeric 1-10 (corn) and "8.000"-style (soy).
AGRONOMIC_NUMERIC_FIELDS = {
    # corn
    "stat_drought_tolerance": "Drought tolerance",
    "stat_greensnap_tolerance": "Greensnap tolerance",
    "stat_root_strength": "Root strength",
    "stat_stalk_strength": "Stalk strength",
    "stat_standability": "Standability",
    "stat_black_cutworm": "Black cutworm tolerance",
    # soy
    "stat_emergence": "Emergence",
    "stat_canopy_width": "Canopy width",
    "stat_plant_height": "Plant height",
}

# Agronomic categorical / Yes-No notes (insect protection + placement).
AGRONOMIC_CATEGORICAL_FIELDS = {
    "stat_corn_corn": "Corn-on-corn suitability",
    "stat_refuge": "Refuge structure",
    "stat_corn_borer": "Corn borer protection (Bt)",
    "stat_corn_rootworm": "Corn rootworm protection (Bt)",
    "stat_corn_earworm": "Corn earworm protection (Bt)",
    "stat_nematode": "Nematode protection",
    "stat_wireworm": "Wireworm protection",
}

# Herbicide tolerance — Yes/No per chemistry.
HERBICIDE_FIELDS = {
    "stat_glyphosate_tolerance": "Glyphosate tolerance",
    "stat_glufosinate_tolerance": "Glufosinate tolerance",
    "stat_24d_choline_tolerance": "2,4-D choline tolerance",
    "stat_dicamba_tolerance": "Dicamba tolerance",
    "stat_fops_tolerance": "FOPs (fop herbicide) tolerance",
}

# (group label, {stat field -> human name}) in the order groups are emitted.
GROUP_ORDER = [
    ("DISEASE RATINGS", DISEASE_FIELDS),
    ("AGRONOMIC CHARACTERISTICS",
     {**AGRONOMIC_NUMERIC_FIELDS, **AGRONOMIC_CATEGORICAL_FIELDS}),
    ("HERBICIDE TOLERANCE", HERBICIDE_FIELDS),
]

# Values that mean "not rated" — never coerced into a chunk. Compared against
# the stripped, lower-cased string form of a stat value.
_NOT_RATED = {"", "-", "--", "n/a", "na", "nr", "none", "0", "0.000", "0.00"}

REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "burrus"

log = logging.getLogger("scrape.burrus")

# Plain decimal number as it appears in stat values ("8", "8.000", "97.00").
_NUMERIC_RE = re.compile(r"-?\d+(?:\.\d+)?")
# ``name( ... );`` JSONP envelope; group 1 is the payload between the parens.
_JSONP_RE = re.compile(r"^[^(]*\((.*)\)\s*;?\s*$", re.S)


# --------------------------------------------------------------------- HTTP

class RateLimitedSession:
    """Polite session with backoff.

    Honours burrusseed.com's Crawl-delay: 10 (>=10 s between requests to
    seedware.net / burrusseed.com). The Burrus catalog is two list calls
    total.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying ``requests`` session.

        Args:
            interval: minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.s.headers["Referer"] = REFERER
        self.s.headers["Accept"] = "*/*"
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 = none sent yet.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last call."""
        delta = time.monotonic() - self._last
        if self._last and delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one request, retrying network errors and 429 / 5xx replies.

        Args:
            method: HTTP method name.
            url: absolute URL.
            max_retries: total number of attempts (must be >= 1).
            timeout: per-attempt timeout in seconds.
            **kw: forwarded to ``requests.Session.request``.

        Returns:
            The first response that is not 429 / 5xx, or — when every attempt
            was answered with 429 / 5xx — the last such response, so the
            caller can inspect it or call ``raise_for_status()``.

        Raises:
            ValueError: if ``max_retries`` is smaller than 1.
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")
        for attempt in range(max_retries):
            is_last = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if is_last:
                    # Surface the error of the attempt that actually ended
                    # the loop, without a pointless trailing sleep.
                    raise
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = (resp.status_code == 429
                         or 500 <= resp.status_code < 600)
            if not retryable:
                return resp
            if is_last:
                log.warning("HTTP %d on %s %s — giving up after %d attempts",
                            resp.status_code, method, url, max_retries)
                return resp
            # Prefer the server's Retry-After when it is a plain number of
            # seconds; otherwise fall back to capped exponential backoff
            # with jitter.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # The final iteration always returns or raises.
        raise RuntimeError("retry loop ended without a response")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


def _strip_jsonp(text: str) -> Any:
    """Strip a ``cb( ... );`` JSONP wrapper and parse the JSON inside.

    Text without a wrapper is parsed as-is. Raises ``json.JSONDecodeError``
    (a ``ValueError``) when the payload is not valid JSON.
    """
    s = text.strip()
    m = _JSONP_RE.match(s)
    body = m.group(1) if m else s
    return json.loads(body)


# --------------------------------------------------------------------- model

@dataclass
class BurrusVariety:
    """One catalog record, normalised for rendering and for the sidecar."""

    source_key: str
    crop: str                 # chunker value: corn / soybeans
    product_name: str         # "Power Plus 8J697AM"
    product_id: str           # "8J697AM"
    brand: str                # "Burrus" | "Power Plus" | "DONMARIO"
    relative_maturity: int | None = None
    maturity_group: float | None = None
    release_year: int | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    groups: list[dict] = field(default_factory=list)
    source_url: str = ""


# --------------------------------------------------------------------- fetch

def fetch_crop(http: RateLimitedSession, crop_pkey: int) -> list[dict]:
    """Fetch + decode the JSONP variety array for one crop_pkey.

    Raises:
        requests.RequestException: network failure or HTTP error status.
        ValueError: the payload is not JSON, or is JSON but not a list.
    """
    url = f"{API}?crop_pkey={crop_pkey}&callback=cb"
    r = http.get(url)
    r.raise_for_status()
    data = _strip_jsonp(r.text)
    if not isinstance(data, list):
        raise ValueError(f"unexpected payload for crop_pkey={crop_pkey}: "
                         f"{type(data).__name__}")
    return data


# --------------------------------------------------------------------- mapping

def _text(v: Any) -> str:
    """Return ``v`` as a stripped string; ``None`` / falsy values give ''."""
    return str(v or "").strip()


def _slug(s: str) -> str:
    """Lower-case ``s`` and collapse every non-alphanumeric run to one '-'."""
    s = (s or "").strip().lower()
    return re.sub(r"[^a-z0-9]+", "-", s).strip("-")


def _is_rated(v: Any) -> bool:
    """Return False for ``None``, the not-rated markers and any zero.

    Zero is recognised numerically, so ``0.0`` / ``"0.0"`` are treated like
    the ``"0"`` / ``"0.00"`` / ``"0.000"`` spellings listed in ``_NOT_RATED``.
    """
    if v is None:
        return False
    s = str(v).strip().lower()
    if s in _NOT_RATED:
        return False
    if _NUMERIC_RE.fullmatch(s) and float(s) == 0:
        return False
    return True


def _clean_value(v: Any) -> str:
    """Normalise a stat value for display.

    Numeric soy stats arrive as '8.000' — strip the trailing zeros to '8'.
    Everything else passes through verbatim (Yes / No / Suitable /
    Integrated refuge / ...).
    """
    s = str(v).strip()
    # numeric like "8.000" / "8.00" / "97.00" -> "8" / "97"
    if _NUMERIC_RE.fullmatch(s):
        f = float(s)
        return str(int(f)) if f == int(f) else f"{f:g}"
    return s


def _maturity(rec: dict, crop: str) -> tuple[int | None, float | None]:
    """Return ``(relative_maturity, maturity_group)`` from ``rec['maturity']``.

    Corn fills the first slot (rounded to an int), every other crop the
    second (rounded to one decimal). Missing, blank, non-numeric and
    non-finite values give ``(None, None)``.
    """
    raw = rec.get("maturity")
    if raw is None or str(raw).strip() == "":
        return None, None
    try:
        f = float(str(raw).strip())
    except ValueError:
        return None, None
    # float() also accepts "nan" / "inf"; those would crash int(round(...))
    # and are not a usable maturity.
    if f != f or abs(f) == float("inf"):
        return None, None
    if crop == "corn":
        return int(round(f)), None
    return None, round(f, 1)


def _brand(rec: dict) -> str:
    """Per-record brand.

    corn -> stat_corn_brand (Burrus / Power Plus); soy -> stat_soybean_brand
    (Burrus / DONMARIO). Falls back to the leading token of the description,
    else 'Burrus'.
    """
    # Check each field on its own so a whitespace-only corn brand does not
    # hide a real soybean brand.
    for key in ("stat_corn_brand", "stat_soybean_brand"):
        b = _text(rec.get(key))
        if b:
            return b
    desc = _text(rec.get("description"))
    code = _text(rec.get("id"))
    if desc and code and desc.lower().endswith(code.lower()):
        lead = desc[: len(desc) - len(code)].strip()
        if lead:
            return lead
    return "Burrus"


def _traits(rec: dict) -> list[str]:
    """Collect the distinct ``trait`` / ``trait_platform`` values, in order."""
    out: list[str] = []
    for key in ("trait", "trait_platform"):
        v = rec.get(key)
        if v and str(v).strip():
            # strip stray trailing punctuation seen in the data
            # ("Conventional." / "AM`")
            t = str(v).strip().rstrip(".`")
            if t and t not in out:
                out.append(t)
    return out


def _build_groups(rec: dict) -> list[dict]:
    """Build the ``characteristics_groups`` list for one record.

    Walks ``GROUP_ORDER``; un-rated stats are dropped and a group with no
    rated stat is omitted.
    """
    groups: list[dict] = []
    for label, fields in GROUP_ORDER:
        items = [
            {"characteristic": human, "value": _clean_value(rec.get(stat_key))}
            for stat_key, human in fields.items()
            if _is_rated(rec.get(stat_key))
        ]
        if items:
            groups.append({"label": label, "items": items})
    return groups


def _release_year(raw: Any) -> int | None:
    """Return the release year as an int, or ``None`` when unusable.

    Accepts an int or a digit-only string; booleans are rejected even though
    ``bool`` is an ``int`` subclass.
    """
    if isinstance(raw, bool):
        return None
    if isinstance(raw, int):
        return raw
    if isinstance(raw, str) and raw.strip().isdigit():
        return int(raw.strip())
    return None


def map_record(rec: dict, crop: str) -> BurrusVariety:
    """Map one raw Seedware record onto a ``BurrusVariety``.

    The source key is built from the first of ``id``, ``pkey-{pkey}`` and
    ``description`` that still has content after slugifying.
    """
    code = _text(rec.get("id"))
    desc = _text(rec.get("description"))
    pkey = rec.get("pkey")
    candidates = (code, f"pkey-{pkey}" if pkey else "", desc)
    key_seed = next((c for c in candidates if _slug(c)), "")
    source_key = f"burrus-{_slug(key_seed)}"
    name = desc or code or key_seed
    rm, mg = _maturity(rec, crop)
    page_slug = CROP_PKEYS[crop][1]
    return BurrusVariety(
        source_key=source_key,
        crop=crop,
        product_name=name,
        product_id=code or name,
        brand=_brand(rec),
        relative_maturity=rm,
        maturity_group=mg,
        release_year=_release_year(rec.get("released")),
        trait_stack=_traits(rec),
        # The Seedware records carry no marketing blurb; leave positioning
        # null rather than fabricate one.
        positioning=None,
        groups=_build_groups(rec),
        source_url=f"{SITE}/products/{page_slug}",
    )


# --------------------------------------------------------------------- render

def render_markdown(v: BurrusVariety) -> str:
    """Render one variety as markdown: identity header, then rating groups."""
    crop_label = {"corn": "Corn", "soybeans": "Soybeans"}.get(
        v.crop, v.crop.title())
    head: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Burrus Seed (Burrus Hybrids — independent family "
        "company, Arenzville, IL, since 1935)",
        f"- **Brand:** {v.brand}",
        f"- **Crop:** {crop_label}",
    ]
    if v.crop == "corn" and v.relative_maturity is not None:
        head.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.crop == "soybeans" and v.maturity_group is not None:
        head.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        head.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    if v.release_year:
        head.append(f"- **Released:** {v.release_year}")
    head.append(f"- **Source:** {v.source_url}")
    head.append(f"- **Rating scale:** {RATING_SCALE_DIRECTION}")
    head.append("- **Service area:** Burrus dealer network "
                "(IL / IN / IA / MO / WI)")
    head.append("")
    head += ["---", ""]
    for g in v.groups:
        head.append(f"## {g['label'].title()}")
        head.append("")
        for it in g["items"]:
            head.append(f"- **{it['characteristic']}:** {it['value'] or '—'}")
        head.append("")
    return "\n".join(head)


def write_variety(v: BurrusVariety, body_md: str) -> None:
    """Write the JSON sidecar and the markdown file for one variety.

    The sidecar is written first: ``run`` treats an existing ``.md`` as
    "already scraped", so the markdown must only appear once the sidecar is
    on disk.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "source": "burrus",
        "source_key": v.source_key,
        "vendor": "Burrus Seed",
        "brand": v.brand,
        "product_name": v.product_name,
        "product_id": v.product_id,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Burrus dealer network (IL/IN/IA/MO/WI)",
             "agronomist": None, "agronomist_email": None,
             "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8")
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline

def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None) -> int:
    """Fetch the catalog, map every record and write the corpus files.

    Args:
        limit: stop after processing this many varieties (skipped ones
            count); ``None`` means all.
        force: re-write varieties whose markdown file already exists.
        only_crop: restrict to one key of ``CROP_PKEYS``.
        only_product: restrict to one variety, by source_key or id.

    Returns:
        0 on success, 1 when a fetch failed and no record was retrieved at
        all, 2 when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    crops = [only_crop] if only_crop else list(CROP_PKEYS)

    records: list[tuple[str, dict]] = []
    failed_crops = 0
    for crop in crops:
        crop_pkey = CROP_PKEYS[crop][0]
        try:
            raw = fetch_crop(http, crop_pkey)
        except (requests.RequestException, ValueError) as exc:
            # RequestException also covers the connection errors / timeouts
            # that the session re-raises once its retries are exhausted.
            failed_crops += 1
            log.error("fetch failed for crop=%s (pkey=%d): %s",
                      crop, crop_pkey, exc)
            continue
        log.info("crop=%-9s pkey=%d: %d records", crop, crop_pkey, len(raw))
        for rec in raw:
            if not isinstance(rec, dict):
                log.warning("crop=%s: skipping non-object record %r",
                            crop, rec)
                continue
            records.append((crop, rec))

    if failed_crops and not records:
        # Nothing was retrieved; do not report a clean run.
        log.error("no records fetched (%d of %d crop calls failed)",
                  failed_crops, len(crops))
        return 1

    varieties = [map_record(rec, crop) for crop, rec in records]

    if only_product:
        key = only_product.lower()
        varieties = [v for v in varieties
                     if v.source_key == key
                     or v.product_id.lower() == key
                     or _slug(v.product_id) == _slug(key)]
        if not varieties:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0}
    processed = 0
    total = len(varieties)
    for v in varieties:
        if limit is not None and processed >= limit:
            break
        processed += 1
        md_path = CORPUS_DIR / f"{v.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, v.source_key)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no rating groups (still writing identity)",
                        processed, total, v.source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks: maturity group 0.0 is a real value.
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | brand=%s crop=%s rm/mg=%s groups=%d "
                 "traits=%s", processed, total, v.source_key, v.brand, v.crop,
                 maturity if maturity is not None else "-", len(v.groups),
                 ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"],
             counts["empty"], total)
    return 0


# --------------------------------------------------------------------- CLI

def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for ``python -m scrape.sources.burrus``."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.burrus",
        description="Scrape Burrus Seed (independent family company, "
                    "Arenzville IL) — corn / soybeans via the Seedware "
                    "JSON-over-JSONP catalog API.")
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-write even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=sorted(CROP_PKEYS.keys()),
                   help="Limit to one crop (corn / soybeans).")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or id.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, set up logging, return ``run``'s code."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr)
    return run(limit=args.limit, force=args.force, only_crop=args.crop,
               only_product=args.product)


if __name__ == "__main__":
    sys.exit(main())