"""Ebbert's Seeds scraper — small regional Ohio/Indiana breeder.
Source: ``www.ebbertsseeds.com`` — WordPress site. robots.txt is
permissive (``Crawl-delay: 5`` only, no Disallow). Covington, OH +
Decatur, IN — Eastern Corn Belt focus.
Catalog is structured as one scrollable page PER CROP, with each
variety rendered as a CSS-grid block of `<h1>NAME TRAIT <rm> RM</h1>`
+ several sub-sections (MANAGEMENT & POSITIONING / CHARACTERISTICS
/ DISEASE RATINGS) where the labels and numeric values live in
separate adjacent grid cells. Reconstructing a perfectly-aligned
{characteristic: value} dict from the multi-column layout is
fiddly; the small variety count (~17 corn + similar soy/wheat)
doesn't justify the engineering. We instead **preserve the full
text body of each variety's container** in the chunk markdown so
the LLM can read the tabular text as-is.
Pages scraped: `/corn/`, `/soybeans-2/`, `/wheat/`. Grass-seed /
forage / cover-crop pages are out of scope for the row-crop
advisor.
Rating scale: ``1-5 (1 = best, lower = more resistant)`` — same
direction as AgriPro / NK. Confirmed by cross-referencing
positioning text against published values (a variety described as
"Robust tall plants" has STANDABILITY 1.0 → 1 = best).
Output:
corpus/ebberts_seeds/<source_key>.md
corpus/ebberts_seeds/<source_key>.json
source_key: ``ebberts-<slug of variety name>`` lowercased, e.g.
``ebberts-7000tr-rib`` or ``ebberts-1335-conventional``.
CLI:
python -m scrape.sources.ebberts_seeds --crop corn --limit 5
python -m scrape.sources.ebberts_seeds --force
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
# Version stamp; written into every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# User-Agent header set on the session used by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root; the per-crop paths in CROP_PAGES are appended to it.
BASE = "https://www.ebbertsseeds.com"
# Ebbert's per-crop catalog pages. URL paths confirmed via homepage
# nav links 2026-05-26.
# Keys are the crop names used for EbProduct.crop and the --crop choices;
# values are site-relative paths.
CROP_PAGES = {
"corn": "/corn/",
"soybeans": "/soybeans-2/",
"wheat": "/wheat/",
}
# Per robots.txt: Crawl-delay: 5 (seconds). We respect that.
REQ_INTERVAL_SEC = 5.0
# Rating-scale note emitted verbatim into the markdown header and the
# sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = "1-5 (1 = best, lower = more resistant)"
# Third ancestor of this file, i.e. two levels above its directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
# The CORPUS_ROOT environment variable wins when set and non-empty;
# otherwise output goes under <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# All files written by this scraper live in this directory.
CORPUS_DIR = CORPUS_ROOT / "ebberts_seeds"
log = logging.getLogger("scrape.ebberts_seeds")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """``requests.Session`` wrapper that spaces out requests and retries.

    robots.txt asks for a 5-second Crawl-delay; we honor it by making every
    attempt (first tries and retries alike) wait until ``interval`` seconds
    have elapsed since the previous attempt was released.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the scraper's User-Agent.

        Args:
            interval: Minimum number of seconds between two attempts.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading taken when the last attempt was released.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between attempts."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Return ``2 ** attempt`` seconds plus up to 1s of jitter, capped at 30s."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors, HTTP 429 and HTTP 5xx.

        The caller always sees the outcome of the *final* attempt: its
        response (even if that is still a 429/5xx, so ``raise_for_status``
        can report it) or its network exception.

        Args:
            method: HTTP method name passed to ``Session.request``.
            url: Absolute URL to fetch.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so that a result always exists.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``Session.request``.

        Returns:
            The response of the first non-retryable attempt, or of the last
            attempt when every attempt was retryable.

        Raises:
            requests.RequestException: If the last attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if is_last:
                    # No retry follows, so sleeping would only delay the error.
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable or is_last:
                return resp
            # Prefer the server's own hint when it is a plain number of seconds.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # The last iteration always returns or raises; this guards refactors.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
# --------------------------------------------------------------------- model
@dataclass
class EbProduct:
    """One Ebbert's variety extracted from a per-crop catalog page.

    Attributes:
        source_key: Corpus identifier, built as ``ebberts-<slug of name>``.
        source_url: The per-crop page URL (Ebbert's doesn't have
            per-variety pages).
        crop: Crop key the variety was found under.
        product_name: Heading name, e.g. "7000TR RIB", "1335 CONVENTIONAL".
        trait_label: Trait text such as "RIB", "CONVENTIONAL", "PC",
            "SSX RIB"; ``None`` when the name is a single token.
        relative_maturity: Heading maturity number; filled for corn only.
        maturity_group: Heading maturity number; filled for soybeans only.
        body_text: Verbatim text of the variety's container.
    """

    source_key: str
    source_url: str
    crop: str
    product_name: str = ""
    trait_label: str | None = None
    relative_maturity: str | None = None
    maturity_group: str | None = None
    body_text: str = ""
# --------------------------------------------------------------------- discovery + parse
# Matches a variety heading of the form "<name> <number> RM", e.g.
# "7000TR RIB 110 RM".  ``name`` is matched lazily so it stops before the
# trailing number; ``rm`` is that integer or decimal number.  The named
# groups are required: callers read m.group("name") and m.group("rm").
_VARIETY_HEADING_RE = re.compile(
    r"^(?P<name>\S+(?:\s+\S+)*?)\s+(?P<rm>\d+(?:\.\d+)?)\s*RM$",
    re.IGNORECASE,
)
# Parent tag names whose text content is never rendered to the reader.
_NON_VISIBLE_PARENTS = frozenset({"script", "style"})


def _is_within(node, tag) -> bool:
    """Return True when *node* is *tag* itself or one of its descendants.

    The check is by identity, not ``==``: BeautifulSoup compares tags
    structurally, so two distinct headings with the same markup are equal.
    """
    anc = node
    while anc is not None:
        if anc is tag:
            return True
        anc = getattr(anc, "parent", None)
    return False


def _variety_text(h1, next_h1) -> str:
    """Collect the visible text from this variety's <h1> up to (but
    not including) the next variety's <h1>, walking the DOM in
    document order.

    Ebbert's grid layout spreads each variety's content across many
    sibling ``.x-cell`` blocks in the outer container; the h1's
    immediate parent only holds the title cell. The correct boundary
    is the next variety h1 in document order.

    Args:
        h1: Heading tag that anchors this variety.
        next_h1: Heading tag of the following variety, or ``None`` for the
            last variety on the page (text is then collected to the end of
            the document).

    Returns:
        The heading text followed by every non-empty text fragment, joined
        with `` | `` and with runs of whitespace collapsed to one space.
    """
    chunks: list[str] = [h1.get_text(strip=True)]
    for node in h1.find_all_next(string=True):
        # Stop once we cross into the next variety's h1.
        if next_h1 is not None and _is_within(node, next_h1):
            break
        # find_all_next() starts with the h1's own text nodes; the title is
        # already the first chunk, so skip them instead of emitting it twice.
        if _is_within(node, h1):
            continue
        # Inline JS/CSS is not visible text and would only pollute the body.
        parent = getattr(node, "parent", None)
        if getattr(parent, "name", None) in _NON_VISIBLE_PARENTS:
            continue
        text = str(node).strip()
        if text:
            chunks.append(text)
    body = " | ".join(chunks)
    # A fragment that is itself "|" would produce a doubled separator.
    body = re.sub(r"\s*\|\s*\|\s*", " | ", body)
    body = re.sub(r"\s+", " ", body).strip()
    return body
def _slug(text: str) -> str:
    """Turn *text* into a lowercase, hyphen-separated ASCII slug.

    Every run of characters other than ASCII letters and digits becomes a
    single ``-``; hyphens at either end are dropped before lowercasing.
    """
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", text)
    trimmed = hyphenated.strip("-")
    return trimmed.lower()
def discover_and_parse(
    http: RateLimitedSession, *, only_crop: str | None = None,
) -> list[EbProduct]:
    """Fetch one page per crop and extract every variety container.

    Args:
        http: Session used for the page fetches.
        only_crop: When given, only the ``CROP_PAGES`` entry with this key
            is fetched.

    Returns:
        One ``EbProduct`` per matching ``<h1>`` heading, in page order,
        crops in ``CROP_PAGES`` order.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response; one
        failing crop page aborts the whole discovery.
    """
    out: list[EbProduct] = []
    for crop, path in CROP_PAGES.items():
        if only_crop and crop != only_crop:
            continue
        url = f"{BASE}{path}"
        log.info("fetching %s", url)
        r = http.get(url)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Every variety is anchored by an <h1> whose text reads
        # "NAME ... <rm> RM".  Keep each heading with its regex match so the
        # pattern is evaluated once per heading.
        headings = []
        for h in soup.find_all("h1"):
            m = _VARIETY_HEADING_RE.match(h.get_text(strip=True))
            if m:
                headings.append((h, m))
        log.info(" %s: %d varieties", crop, len(headings))
        for i, (h1, m) in enumerate(headings):
            name = m.group("name").strip()
            maturity = m.group("rm")
            # The next variety heading bounds this variety's text.
            next_h1 = headings[i + 1][0] if i + 1 < len(headings) else None
            body = _variety_text(h1, next_h1)
            # Derive trait_label from everything after the first token of
            # the name (CONVENTIONAL, RIB, PC, SSX RIB, TR RIB, etc.).
            # Best-effort, doesn't have to be perfect.
            parts = name.split(maxsplit=1)
            trait = parts[1] if len(parts) == 2 else None
            # NOTE(review): for wheat the heading number is stored in neither
            # maturity field and survives only inside body_text — confirm
            # this is intended.
            out.append(EbProduct(
                source_key=f"ebberts-{_slug(name)}",
                source_url=url,
                crop=crop,
                product_name=name,
                trait_label=trait,
                relative_maturity=maturity if crop == "corn" else None,
                maturity_group=maturity if crop == "soybeans" else None,
                body_text=body,
            ))
    log.info("total varieties discovered: %d", len(out))
    return out
# --------------------------------------------------------------------- render
def render_markdown(p: EbProduct) -> str:
    """Build the markdown chunk for one variety.

    The chunk is a title, a bullet list of metadata (maturity and trait
    bullets appear only when populated), a horizontal rule, and the
    variety's body text under a fixed section heading.
    """
    known_labels = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = known_labels.get(p.crop, p.crop.title())

    lines: list[str] = [
        f"# {p.product_name or p.source_key}",
        "",
        "- **Vendor:** Ebbert's Seeds (independent regional breeder)",
        "- **Brand:** Ebbert's Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # Optional bullets: maturity is crop-specific, trait may be absent.
    if p.crop == "corn" and p.relative_maturity:
        lines.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.crop == "soybeans" and p.maturity_group:
        lines.append(f"- **Maturity group:** {p.maturity_group}")
    if p.trait_label:
        lines.append(f"- **Trait stack (label):** {p.trait_label}")

    lines += [
        f"- **Source:** {p.source_url}",
        f"- **Rating scale (Ebbert's):** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Covington, OH + Decatur, IN — Eastern Corn Belt regional",
        "",
        "---",
        "",
        "## Variety detail (verbatim from page)",
        "",
        p.body_text,
        "",
    ]
    return "\n".join(lines)
# --------------------------------------------------------------------- write
def write_product(prod: EbProduct, body_md: str) -> None:
    """Write the JSON sidecar and the markdown chunk for one variety.

    Both files go to ``CORPUS_DIR`` and are named after ``prod.source_key``.
    The sidecar is written first and the markdown last: ``run`` treats an
    existing ``.md`` file as "already done", so the markdown must not appear
    on disk before its sidecar does.  Otherwise a failure between the two
    writes would leave the variety without a sidecar on every later run.

    Args:
        prod: The parsed variety.
        body_md: Rendered markdown for the variety.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"
    sidecar = {
        "source": "ebberts_seeds",
        "source_key": prod.source_key,
        "vendor": "Ebbert's Seeds",
        "brand": "Ebbert's Seeds",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": prod.trait_label,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": [prod.trait_label] if prod.trait_label else [],
        "trait_descriptions": [],
        "positioning_statement": None,
        "strengths": [],
        # No structured groups — the body markdown carries the table
        # text verbatim. characteristics_groups stays empty so the
        # chunker doesn't try to bucket non-existent items.
        "characteristics_groups": [],
        "page_text_chars": len(prod.body_text),
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Ebbert's service area (Eastern Corn Belt — OH/IN/IL)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    # Markdown last: its presence is the "this variety is complete" marker.
    md_path.write_text(body_md, encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties and write their markdown + sidecar files.

    Args:
        limit: Visit at most this many varieties (skipped ones count);
            ``None`` means all.
        force: Rewrite a variety even if its markdown file already exists.
        only_crop: Restrict discovery to this crop key.
        only_product: Keep only the variety whose ``source_key`` equals this
            value or whose product name matches it case-insensitively.

    Returns:
        0 on success, 2 when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    products = discover_and_parse(http, only_crop=only_crop)
    if only_product:
        wanted = only_product.lower()
        products = [
            p for p in products
            if p.source_key == only_product
            or p.product_name.lower() == wanted
        ]
        if not products:
            log.error("no variety matched --product=%s", only_product)
            return 2
    # Number of varieties this run will really visit.  Used as the progress
    # denominator so a limit larger than the catalog no longer prints
    # misleading totals such as "[3/100]".
    total = len(products) if limit is None else max(0, min(limit, len(products)))
    counts = {"written": 0, "skipped": 0}
    processed = 0
    for processed, prod in enumerate(products[:total], start=1):
        md_path = CORPUS_DIR / f"{prod.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%s] %s skipped", processed, total, prod.source_key)
            continue
        body = render_markdown(prod)
        write_product(prod, body)
        counts["written"] += 1
        log.info(
            "[%d/%s] %s written | crop=%s rm/mg=%s trait=%s chars=%d",
            processed, total,
            prod.source_key, prod.crop,
            prod.relative_maturity or prod.maturity_group or "-",
            prod.trait_label or "-", len(prod.body_text),
        )
    log.info(
        "done: processed=%d written=%d skipped=%d (of %d varieties)",
        processed, counts["written"], counts["skipped"], len(products),
    )
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop`` (restricted to the keys of
    ``CROP_PAGES``), ``--product`` and ``--log-level`` (defaults to the
    ``LOG_LEVEL`` environment variable, else ``INFO``).
    """
    p = argparse.ArgumentParser(
        prog="scrape.sources.ebberts_seeds",
        description="Scrape Ebbert's Seeds (regional Eastern Corn Belt breeder) — "
                    "corn / soybeans / wheat.",
    )
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    # The crop pages are fetched on every run; --force only controls whether
    # existing output is overwritten, so the help text says exactly that.
    p.add_argument("--force", action="store_true",
                   help="Rewrite output files even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=list(CROP_PAGES),
                   help="Limit to one crop (corn / soybeans / wheat).")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or product name.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"),
                   help="Logging level name (default: $LOG_LEVEL or INFO).")
    return p
def main(argv: list[str] | None = None) -> int:
    """Parse CLI arguments, configure logging to stderr, and run the scraper.

    Returns the exit status produced by ``run``.
    """
    opts = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )
    exit_code = run(
        only_product=opts.product,
        only_crop=opts.crop,
        force=opts.force,
        limit=opts.limit,
    )
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())