"""Ebbert's Seeds scraper — small regional Ohio/Indiana breeder. Source: ``www.ebbertsseeds.com`` — WordPress site. robots.txt is permissive (``Crawl-delay: 5`` only, no Disallow). Covington, OH + Decatur, IN — Eastern Corn Belt focus. Catalog is structured as one scrollable page PER CROP, with each variety rendered as a CSS-grid block of `

NAME TRAIT RM RM

` + several sub-sections (MANAGEMENT & POSITIONING / CHARACTERISTICS / DISEASE RATINGS) where the labels and numeric values live in separate adjacent grid cells. Reconstructing a perfectly-aligned {characteristic: value} dict from the multi-column layout is fiddly; the small variety count (~17 corn + similar soy/wheat) doesn't justify the engineering. We instead **preserve the full text body of each variety's container** in the chunk markdown so the LLM can read the tabular text as-is. Pages scraped: `/corn/`, `/soybeans-2/`, `/wheat/`. Grass-seed / forage / cover-crop pages are out of scope for the row-crop advisor. Rating scale: ``1-5 (1 = best, lower = more resistant)`` — same direction as AgriPro / NK. Confirmed by cross-referencing positioning text against published values (a variety described as "Robust tall plants" has STANDABILITY 1.0 → 1 = best). Output: corpus/ebberts_seeds/.md corpus/ebberts_seeds/.json source_key: ``ebberts-`` lowercased, e.g. ``ebberts-7000tr-rib`` or ``ebberts-1335-conventional``. CLI: python -m scrape.sources.ebberts_seeds --crop corn --limit 5 python -m scrape.sources.ebberts_seeds --force """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.ebbertsseeds.com" # Ebbert's per-crop catalog pages. URL paths confirmed via homepage # nav links 2026-05-26. CROP_PAGES = { "corn": "/corn/", "soybeans": "/soybeans-2/", "wheat": "/wheat/", } # Per robots.txt: Crawl-delay: 5 (seconds). We respect that. 
REQ_INTERVAL_SEC = 5.0

RATING_SCALE_DIRECTION = "1-5 (1 = best, lower = more resistant)"

_THIS_FILE = Path(__file__).resolve()
# In the package layout (scrape/sources/<this file>) parents[2] is the repo
# root. Fall back to the file's own directory when the module sits closer to
# the filesystem root, so importing it never raises IndexError (CORPUS_ROOT
# can still override the location entirely).
REPO_ROOT = _THIS_FILE.parents[2] if len(_THIS_FILE.parents) > 2 else _THIS_FILE.parent
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "ebberts_seeds"

log = logging.getLogger("scrape.ebberts_seeds")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that spaces requests and retries failures.

    robots.txt asks for 5-sec Crawl-delay; we honor it. Ebbert's catalog is
    only ~30-50 pages total so even at 5 sec/req the full scrape finishes in
    <5 min.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 means "none yet",
        # so the first request is never delayed.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until at least ``self.interval`` seconds passed since the last call."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send a rate-limited request, retrying on network errors, 429 and 5xx.

        Args:
            method: HTTP method name.
            url: Absolute URL to request.
            max_retries: Total number of attempts; values below 1 are treated
                as 1 so that there is always an outcome to report.
            timeout: Per-attempt timeout passed to ``requests``.
            **kw: Extra keyword arguments forwarded to ``Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when every
            attempt was used up — the last retryable response received.

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if final:
                    # No further attempt follows, so sleeping would only
                    # delay the failure.
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived: an earlier network error is no longer the
            # most recent outcome and must not be raised at the end.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honored; an
                # HTTP-date value falls back to exponential backoff.
                backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class EbProduct:
    """One variety scraped from a per-crop catalog page."""

    source_key: str
    source_url: str  # the per-crop page URL (Ebbert's doesn't have per-variety pages)
    crop: str
    product_name: str = ""  # "7000TR RIB", "1335 CONVENTIONAL"
    trait_label: str | None = None  # "RIB", "CONVENTIONAL", "PC", "SSX RIB", etc.
    relative_maturity: str | None = None  # corn
    maturity_group: str | None = None  # soy
    body_text: str = ""  # verbatim text of the variety's container


# --------------------------------------------------------------------- discovery + parse

# Heading text of a variety: "<name> <number> RM". The lazy ``name`` group
# leaves the trailing number for ``rm``.
_VARIETY_HEADING_RE = re.compile(
    r"^(?P<name>\S+(?:\s+\S+)*?)\s+(?P<rm>\d+(?:\.\d+)?)\s*RM$",
    re.IGNORECASE,
)

# Elements whose text content is never rendered to the reader.
_NON_VISIBLE_PARENTS = frozenset({"script", "style", "noscript", "template"})


def _variety_text(h1, next_h1) -> str:
    """Collect the visible text from this variety's ``<h1>`` up to (but not
    including) the next variety's ``<h1>``, walking the DOM in document order.

    Ebbert's grid layout spreads each variety's content across many sibling
    ``.x-cell`` blocks in the outer container; the h1's immediate parent only
    holds the title cell. The correct boundary is the next variety h1 in
    document order.

    Args:
        h1: The heading tag that starts this variety.
        next_h1: The heading tag of the following variety, or ``None`` for
            the last variety on the page.

    Returns:
        The title followed by every non-empty text fragment, joined with
        ``" | "`` and whitespace-normalized.

    NOTE: for the last variety there is no closing boundary, so text up to
    the end of the document (e.g. a page footer) is included.
    """
    # Local import: the file imports only BeautifulSoup from bs4 at top level.
    from bs4 import Comment

    chunks: list[str] = [h1.get_text(strip=True)]
    for node in h1.find_all_next(string=True):
        ancestors = list(node.parents)
        # Stop once we cross into the next variety's h1. Compare by identity:
        # bs4 tags compare equal when their markup is identical, so ``in`` /
        # ``==`` would stop at any look-alike heading.
        if next_h1 is not None and any(anc is next_h1 for anc in ancestors):
            break
        # find_all_next starts with the h1's own children; the title is
        # already the first chunk, so don't collect it a second time.
        if any(anc is h1 for anc in ancestors):
            continue
        # HTML comments and script/style payloads are not visible text.
        if isinstance(node, Comment):
            continue
        parent = node.parent
        if parent is not None and parent.name in _NON_VISIBLE_PARENTS:
            continue
        text = str(node).strip()
        if text:
            chunks.append(text)
    body = " | ".join(chunks)
    body = re.sub(r"\s\|\s\|\s", " | ", body)
    body = re.sub(r"\s+", " ", body).strip()
    return body


def _slug(text: str) -> str:
    """Lowercase ``text`` and replace each run of non-alphanumerics with ``-``."""
    s = re.sub(r"[^a-zA-Z0-9]+", "-", text).strip("-").lower()
    return s


def discover_and_parse(
    http: RateLimitedSession,
    *,
    only_crop: str | None = None,
) -> list[EbProduct]:
    """Fetch one page per crop and extract every variety container.

    Args:
        http: Session used to fetch the crop pages.
        only_crop: When set, only the page of this crop is fetched.

    Returns:
        One ``EbProduct`` per heading that matches the variety pattern, in
        page order, crop by crop.

    Raises:
        requests.HTTPError: If a crop page responds with an error status.
    """
    out: list[EbProduct] = []
    for crop, path in CROP_PAGES.items():
        if only_crop and crop != only_crop:
            continue
        url = f"{BASE}{path}"
        log.info("fetching %s", url)
        r = http.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        # Every variety is anchored by an <h1>NAME ... <RM> RM</h1>.
        # Keep the match next to its tag so the title is parsed only once.
        headings = []
        for h in soup.find_all("h1"):
            m = _VARIETY_HEADING_RE.match(h.get_text(strip=True))
            if m:
                headings.append((h, m))
        log.info(" %s: %d varieties", crop, len(headings))

        for i, (h1, m) in enumerate(headings):
            name = m.group("name").strip()
            maturity = m.group("rm")
            next_h1 = headings[i + 1][0] if i + 1 < len(headings) else None
            body = _variety_text(h1, next_h1)

            prod = EbProduct(
                source_key=f"ebberts-{_slug(name)}",
                source_url=url,
                crop=crop,
                product_name=name,
                relative_maturity=maturity if crop == "corn" else None,
                maturity_group=maturity if crop == "soybeans" else None,
                body_text=body,
            )
            # Derive trait_label from the second token of the name if
            # it looks like a trait (CONVENTIONAL, RIB, PC, SSX RIB,
            # TR RIB, etc.). Best-effort, doesn't have to be perfect.
            parts = name.split(maxsplit=1)
            if len(parts) == 2:
                prod.trait_label = parts[1]
            out.append(prod)

    log.info("total varieties discovered: %d", len(out))
    return out


# --------------------------------------------------------------------- render


def render_markdown(p: EbProduct) -> str:
    """Render the markdown chunk for one variety.

    The chunk is a bullet-list header (vendor, crop, maturity, trait, source,
    rating scale, service area) followed by the variety's verbatim body text.
    """
    title = p.product_name or p.source_key
    crop_label = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}.get(p.crop, p.crop.title())
    head: list[str] = [
        f"# {title}",
        "",
        "- **Vendor:** Ebbert's Seeds (independent regional breeder)",
        "- **Brand:** Ebbert's Seeds",
        f"- **Crop:** {crop_label}",
    ]
    if p.relative_maturity and p.crop == "corn":
        head.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group and p.crop == "soybeans":
        head.append(f"- **Maturity group:** {p.maturity_group}")
    if p.trait_label:
        head.append(f"- **Trait stack (label):** {p.trait_label}")
    head.append(f"- **Source:** {p.source_url}")
    head.append(f"- **Rating scale (Ebbert's):** {RATING_SCALE_DIRECTION}")
    head.append("- **Service area:** Covington, OH + Decatur, IN — Eastern Corn Belt regional")
    head.append("")
    head.append("---")
    head.append("")
    head.append("## Variety detail (verbatim from page)")
    head.append("")
    head.append(p.body_text)
    head.append("")
    return "\n".join(head)


# --------------------------------------------------------------------- write


def write_product(prod: EbProduct, body_md: str) -> None:
    """Write ``<source_key>.md`` and its ``<source_key>.json`` sidecar.

    Args:
        prod: The variety being written.
        body_md: Markdown produced by ``render_markdown`` for ``prod``.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"
    md_path.write_text(body_md, encoding="utf-8")

    sidecar = {
        "source": "ebberts_seeds",
        "source_key": prod.source_key,
        "vendor": "Ebbert's Seeds",
        "brand": "Ebbert's Seeds",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": prod.trait_label,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": [prod.trait_label] if prod.trait_label else [],
        "trait_descriptions": [],
        "positioning_statement": None,
        "strengths": [],
        # No structured groups — the body markdown carries the table
        # text verbatim. characteristics_groups stays empty so the
        # chunker doesn't try to bucket non-existent items.
        "characteristics_groups": [],
        "page_text_chars": len(prod.body_text),
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Ebbert's service area (Eastern Corn Belt — OH/IN/IL)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None) -> int:
    """Discover varieties and write the ones that are missing (or all with ``force``).

    Args:
        limit: Maximum number of varieties to process (skipped ones count);
            ``None`` processes all.
        force: Rewrite a variety even if its markdown file already exists.
        only_crop: Restrict discovery to one crop.
        only_product: Keep only the variety whose ``source_key`` equals this
            value or whose product name matches it case-insensitively.

    Returns:
        ``0`` on success, ``2`` when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    products = discover_and_parse(http, only_crop=only_crop)

    if only_product:
        wanted = only_product.lower()
        products = [
            p for p in products
            if p.source_key == only_product or p.product_name.lower() == wanted
        ]
        if not products:
            log.error("no variety matched --product=%s", only_product)
            return 2

    # Denominator for the progress logs: never larger than what will
    # actually be processed, even when --limit exceeds the catalog size.
    total = len(products) if limit is None else max(0, min(limit, len(products)))

    counts = {"written": 0, "skipped": 0}
    processed = 0
    for prod in products:
        if limit is not None and processed >= limit:
            break
        processed += 1
        md_path = CORPUS_DIR / f"{prod.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%s] %s skipped", processed, total, prod.source_key)
            continue
        body = render_markdown(prod)
        write_product(prod, body)
        counts["written"] += 1
        log.info(
            "[%d/%s] %s written | crop=%s rm/mg=%s trait=%s chars=%d",
            processed, total,
            prod.source_key, prod.crop,
            prod.relative_maturity or prod.maturity_group or "-",
            prod.trait_label or "-",
            len(prod.body_text),
        )

    log.info(
        "done: processed=%d written=%d skipped=%d (of %d varieties)",
        processed, counts["written"], counts["skipped"], len(products),
    )
    return 0


# ---------------------------------------------------------------------
# CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.ebberts_seeds",
        description="Scrape Ebbert's Seeds (regional Eastern Corn Belt breeder) — "
                    "corn / soybeans / wheat.",
    )
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-fetch even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=list(CROP_PAGES),
                   help="Limit to one crop (corn / soybeans / wheat).")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or product name.")
    # The default comes from the LOG_LEVEL environment variable when set.
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """Parse ``argv``, configure logging to stderr and run the scraper.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by ``run``.

    Raises:
        SystemExit: With status 2 when the arguments (including the log
            level) are invalid.
    """
    parser = _build_argparser()
    args = parser.parse_args(argv)

    # getLevelName maps a known level *name* to its number and anything else
    # to a "Level X" string. Reject the latter through argparse instead of
    # letting basicConfig raise a bare ValueError traceback.
    level = logging.getLevelName(args.log_level.upper())
    if not isinstance(level, int):
        parser.error(f"invalid log level: {args.log_level!r}")

    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )


if __name__ == "__main__":
    sys.exit(main())