# seed-mcp/scrape/sources/ebberts_seeds.py

"""Ebbert's Seeds scraper — small regional Ohio/Indiana breeder.

Source: ``www.ebbertsseeds.com`` — WordPress site. robots.txt is
permissive (``Crawl-delay: 5`` only, no Disallow). Covington, OH +
Decatur, IN — Eastern Corn Belt focus.

Catalog is structured as one scrollable page PER CROP, with each
variety rendered as a CSS-grid block of `<h1>NAME TRAIT <number> RM</h1>`
(the numeric maturity value followed by the literal text "RM")
+ several sub-sections (MANAGEMENT & POSITIONING / CHARACTERISTICS
/ DISEASE RATINGS) where the labels and numeric values live in
separate adjacent grid cells. Reconstructing a perfectly aligned
{characteristic: value} dict from the multi-column layout is
fiddly, and the small variety count (~17 corn + similar soy/wheat)
doesn't justify the engineering. We instead **preserve the full
text body of each variety's container** in the chunk markdown so
the LLM can read the tabular text as-is.

Pages scraped: `/corn/`, `/soybeans-2/`, `/wheat/`. Grass-seed /
forage / cover-crop pages are out of scope for the row-crop
advisor.

Rating scale: ``1-5 (1 = best, lower = more resistant)`` — same
direction as AgriPro / NK. Confirmed by cross-referencing
positioning text against published values (a variety described as
"Robust tall plants" has STANDABILITY 1.0 → 1 = best).

Output:
  corpus/ebberts_seeds/<source_key>.md
  corpus/ebberts_seeds/<source_key>.json

source_key: ``ebberts-<slug>`` lowercased, e.g.
``ebberts-7000tr-rib`` or ``ebberts-1335-conventional``.

CLI:
  python -m scrape.sources.ebberts_seeds --crop corn --limit 5
  python -m scrape.sources.ebberts_seeds --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Version stamp; written into each JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site origin; the CROP_PAGES paths below are appended to it.
BASE = "https://www.ebbertsseeds.com"

# Ebbert's per-crop catalog pages, keyed by the crop name used throughout
# this module (and accepted by --crop). URL paths confirmed via homepage
# nav links 2026-05-26.
CROP_PAGES = {
    "corn":     "/corn/",
    "soybeans": "/soybeans-2/",
    "wheat":    "/wheat/",
}

# Per robots.txt: Crawl-delay: 5 (seconds). We respect that.
# Used as the default minimum spacing between consecutive requests.
REQ_INTERVAL_SEC = 5.0

# Human-readable description of the rating scale; emitted verbatim in the
# markdown header and in the sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = "1-5 (1 = best, lower = more resistant)"

# Third ancestor of this file's resolved path (parents[2]).
REPO_ROOT = Path(__file__).resolve().parents[2]
# The CORPUS_ROOT environment variable overrides the default location; an
# unset or empty value falls back to <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this scraper's .md / .json outputs.
CORPUS_DIR = CORPUS_ROOT / "ebberts_seeds"

# Module logger (explicit name rather than __name__).
log = logging.getLogger("scrape.ebberts_seeds")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that paces requests and retries failures.

    robots.txt asks for 5-sec Crawl-delay; we honor it. Ebbert's
    catalog is only ~30-50 pages total so even at 5 sec/req the
    full scrape finishes in <5 min.

    Network errors, HTTP 429 and HTTP 5xx responses are retried with
    exponential backoff plus jitter (capped at 30 s). A purely numeric
    ``Retry-After`` header takes precedence over the computed backoff.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() has an undefined reference point, so only
        # differences between two readings are meaningful. Starting at
        # -inf guarantees the very first request never waits.
        self._last = float("-inf")

    def _wait(self) -> None:
        """Sleep until at least ``interval`` seconds have passed since the
        previous request, then record the new request time."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff with jitter for the given 0-based attempt."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one rate-limited request, retrying transient failures.

        Args:
            method: HTTP method name, e.g. ``"GET"``.
            url: Absolute URL to request.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so that a response or exception always results.
            timeout: Per-attempt timeout forwarded to ``requests``.
            **kw: Extra keyword arguments forwarded to ``Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when every
            attempt was retryable — the response of the final attempt, so
            the caller can inspect it or call ``raise_for_status()``.

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if final:
                    # Nothing left to retry: surface this failure as-is
                    # instead of sleeping through a pointless backoff.
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if final:
                # The outcome of the last attempt decides the result; an
                # earlier network error must not mask this response.
                log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                            resp.status_code, method, url, attempts)
                return resp
            ra = resp.headers.get("Retry-After")
            # Only the delta-seconds form of Retry-After is honored; the
            # HTTP-date form falls back to the computed backoff.
            backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # The loop always returns or raises on its final iteration.
        raise RuntimeError("unreachable: retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class EbProduct:
    """One catalog variety as extracted from a per-crop page."""

    # Output file stem; built as "ebberts-<slug of product_name>".
    source_key: str
    # The per-crop page URL (Ebbert's doesn't have per-variety pages).
    source_url: str
    # Key of CROP_PAGES the variety was found under.
    crop: str
    # Heading text without the trailing maturity, e.g. "7000TR RIB",
    # "1335 CONVENTIONAL".
    product_name: str = ""
    # Everything after the first token of product_name, e.g. "RIB",
    # "CONVENTIONAL", "PC", "SSX RIB"; None for single-token names.
    trait_label: str | None = None
    # Maturity number from the heading; populated for corn only.
    relative_maturity: str | None = None
    # Maturity number from the heading; populated for soybeans only.
    maturity_group: str | None = None
    # Text collected from the variety's heading up to the next variety.
    body_text: str = ""


# --------------------------------------------------------------------- discovery + parse


# Matches the full text of a variety heading:
#   name  one or more whitespace-separated tokens (matched lazily, so the
#          name stops as early as the rest of the pattern allows)
#   rm    an integer or decimal number
# followed by the literal "RM" at the very end (case-insensitive).
# Illustrative input: "7000TR RIB 110 RM" -> name="7000TR RIB", rm="110".
_VARIETY_HEADING_RE = re.compile(
    r"^(?P<name>\S+(?:\s+\S+)*?)\s+(?P<rm>\d+(?:\.\d+)?)\s*RM$",
    re.IGNORECASE,
)


def _variety_text(h1, next_h1) -> str:
    """Collect the text from this variety's <h1> up to (but not
    including) the next variety's <h1>, walking the DOM in document
    order.

    Ebbert's grid layout spreads each variety's content across many
    sibling ``.x-cell`` blocks in the outer container; the h1's
    immediate parent only holds the title cell. The correct boundary
    is the next variety h1 in document order.

    Args:
        h1: The heading tag that anchors this variety.
        next_h1: The heading tag of the following variety, or ``None``
            when this is the last variety (collection then runs to the
            end of the document).

    Returns:
        The title followed by every non-empty text fragment, joined with
        `` | `` and with runs of whitespace collapsed to single spaces.
    """
    chunks: list[str] = [h1.get_text(strip=True)]
    for node in h1.find_all_next(string=True):
        # Stop at the first string that belongs to the next variety's h1.
        # Identity comparison is deliberate: bs4 tags compare equal by
        # structure, which is not what a document boundary means.
        if next_h1 is not None and _has_ancestor(node, next_h1):
            break
        # find_all_next() starts with the h1's own descendants; the title
        # is already the first chunk, so don't emit it a second time.
        if _has_ancestor(node, h1):
            continue
        # <script>/<style> contents are code, not page text.
        # NOTE(review): HTML comment nodes are also returned by
        # string=True and are still kept here, as before.
        if getattr(getattr(node, "parent", None), "name", None) in ("script", "style"):
            continue
        text = str(node).strip()
        if text:
            chunks.append(text)
    body = " | ".join(chunks)
    body = re.sub(r"\s*\|\s*\|\s*", " | ", body)
    body = re.sub(r"\s+", " ", body).strip()
    return body


def _has_ancestor(node, tag) -> bool:
    """Return True when *tag* is an ancestor of *node* (compared by identity)."""
    ancestor = getattr(node, "parent", None)
    while ancestor is not None:
        if ancestor is tag:
            return True
        ancestor = getattr(ancestor, "parent", None)
    return False


def _slug(text: str) -> str:
    """Turn *text* into a lowercase, hyphen-separated identifier.

    Every run of characters outside ``[a-zA-Z0-9]`` becomes a single
    hyphen; leading and trailing hyphens are removed.
    """
    hyphenated = re.sub(r"[^a-zA-Z0-9]+", "-", text)
    trimmed = hyphenated.strip("-")
    return trimmed.lower()


def discover_and_parse(
    http: RateLimitedSession, *, only_crop: str | None = None,
) -> list[EbProduct]:
    """Download each crop page and turn every variety heading into a product.

    Args:
        http: Session used to fetch the pages.
        only_crop: When given, only the page for that ``CROP_PAGES`` key
            is fetched.

    Returns:
        One ``EbProduct`` per matching ``<h1>``, in page order, crop by crop.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` when a page fetch
            ends with an error status.
    """
    products: list[EbProduct] = []
    wanted_pages = [
        (crop_name, crop_path)
        for crop_name, crop_path in CROP_PAGES.items()
        if not only_crop or crop_name == only_crop
    ]
    for crop, path in wanted_pages:
        page_url = f"{BASE}{path}"
        log.info("fetching %s", page_url)
        response = http.get(page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # A variety starts at each <h1> whose text looks like
        # "NAME ... <number> RM"; keep the regex match next to its tag.
        anchors = []
        for heading in soup.find_all("h1"):
            hit = _VARIETY_HEADING_RE.match(heading.get_text(strip=True))
            if hit:
                anchors.append((heading, hit))
        log.info("  %s: %d varieties", crop, len(anchors))

        # Pair every heading with the one after it (None for the last).
        followers = [tag for tag, _ in anchors[1:]] + [None]
        for (heading, hit), next_heading in zip(anchors, followers):
            name = hit.group("name").strip()
            maturity = hit.group("rm")

            # Best-effort trait label: whatever follows the first token
            # of the name (CONVENTIONAL, RIB, PC, SSX RIB, TR RIB, ...).
            tokens = name.split(maxsplit=1)
            trait = tokens[1] if len(tokens) == 2 else None

            products.append(EbProduct(
                source_key=f"ebberts-{_slug(name)}",
                source_url=page_url,
                crop=crop,
                product_name=name,
                trait_label=trait,
                relative_maturity=maturity if crop == "corn" else None,
                maturity_group=maturity if crop == "soybeans" else None,
                body_text=_variety_text(heading, next_heading),
            ))
    log.info("total varieties discovered: %d", len(products))
    return products


# --------------------------------------------------------------------- render


def render_markdown(p: EbProduct) -> str:
    """Build the markdown document for one variety.

    The output is a metadata bullet list (vendor, brand, crop, optional
    maturity and trait lines, source URL, rating scale, service area),
    a horizontal rule, and the variety's body text under a fixed heading.
    """
    known_labels = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = known_labels[p.crop] if p.crop in known_labels else p.crop.title()

    lines: list[str] = [
        f"# {p.product_name or p.source_key}",
        "",
        "- **Vendor:** Ebbert's Seeds (independent regional breeder)",
        "- **Brand:** Ebbert's Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # Maturity is only printed for the crop it applies to.
    if p.crop == "corn" and p.relative_maturity:
        lines.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.crop == "soybeans" and p.maturity_group:
        lines.append(f"- **Maturity group:** {p.maturity_group}")
    if p.trait_label:
        lines.append(f"- **Trait stack (label):** {p.trait_label}")

    lines += [
        f"- **Source:** {p.source_url}",
        f"- **Rating scale (Ebbert's):** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Covington, OH + Decatur, IN — Eastern Corn Belt regional",
        "",
        "---",
        "",
        "## Variety detail (verbatim from page)",
        "",
        p.body_text,
        "",
    ]
    return "\n".join(lines)


# --------------------------------------------------------------------- write


def write_product(prod: EbProduct, body_md: str) -> None:
    """Write the JSON sidecar and the markdown file for one variety.

    Both files land in ``CORPUS_DIR`` and are named after
    ``prod.source_key``.

    Args:
        prod: The parsed variety.
        body_md: Rendered markdown to store in the ``.md`` file.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "ebberts_seeds",
        "source_key": prod.source_key,
        "vendor": "Ebbert's Seeds",
        "brand": "Ebbert's Seeds",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": prod.trait_label,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": [prod.trait_label] if prod.trait_label else [],
        "trait_descriptions": [],
        "positioning_statement": None,
        "strengths": [],
        # No structured groups — the body markdown carries the table
        # text verbatim. characteristics_groups stays empty so the
        # chunker doesn't try to bucket non-existent items.
        "characteristics_groups": [],
        "page_text_chars": len(prod.body_text),
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Ebbert's service area (Eastern Corn Belt — OH/IN/IL)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    # Write the sidecar first and the markdown last: run() treats an
    # existing .md file as "already done", so the markdown must only
    # appear once its sidecar is on disk. Otherwise a failure between the
    # two writes would leave a variety without a sidecar that later runs
    # skip (unless --force is used).
    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    md_path.write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Scrape the catalog and write one markdown + JSON pair per variety.

    Args:
        limit: Maximum number of varieties to process (skipped ones count
            too); ``None`` means all. Zero or negative processes none.
        force: Rewrite outputs even when the markdown file already exists.
        only_crop: Restrict the scrape to one ``CROP_PAGES`` key.
        only_product: Restrict processing to the variety whose source_key
            equals this value or whose product name matches it
            case-insensitively.

    Returns:
        Process exit code: 0 on success, 2 when ``only_product`` matched
        nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    products = discover_and_parse(http, only_crop=only_crop)

    if only_product:
        wanted_name = only_product.lower()
        products = [
            p for p in products
            if p.source_key == only_product
            or p.product_name.lower() == wanted_name
        ]
        if not products:
            log.error("no variety matched --product=%s", only_product)
            return 2

    # Apply the limit up front so the progress denominator is the number
    # of varieties that will really be processed, not the raw --limit
    # (which may exceed what the site offers).
    batch = products if limit is None else products[:max(limit, 0)]
    total = len(batch)

    counts = {"written": 0, "skipped": 0}
    for position, prod in enumerate(batch, start=1):
        md_path = CORPUS_DIR / f"{prod.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%s] %s skipped", position, total, prod.source_key)
            continue
        body = render_markdown(prod)
        write_product(prod, body)
        counts["written"] += 1
        log.info(
            "[%d/%s] %s written | crop=%s rm/mg=%s trait=%s chars=%d",
            position, total,
            prod.source_key, prod.crop,
            prod.relative_maturity or prod.maturity_group or "-",
            prod.trait_label or "-", len(prod.body_text),
        )

    log.info(
        "done: processed=%d written=%d skipped=%d (of %d varieties)",
        total, counts["written"], counts["skipped"], len(products),
    )
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (whose default comes from the LOG_LEVEL environment
    variable, falling back to "INFO").
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.ebberts_seeds",
        description="Scrape Ebbert's Seeds (regional Eastern Corn Belt breeder) — "
                    "corn / soybeans / wheat.",
    )
    # Declared as data, registered in this order.
    option_specs = (
        ("--limit", {
            "type": int,
            "default": None,
            "help": "Stop after processing N varieties (default: all).",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists.",
        }),
        ("--crop", {
            "default": None,
            "choices": list(CROP_PAGES),
            "help": "Limit to one crop (corn / soybeans / wheat).",
        }),
        ("--product", {
            "default": None,
            "help": "Process a single variety by source_key or product name.",
        }),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO"),
        }),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the scrape.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        The exit code produced by ``run``.
    """
    options = _build_argparser().parse_args(argv)

    # Logs go to stderr at the level requested on the command line.
    log_level = options.log_level.upper()
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=log_level,
    )

    exit_code = run(
        only_crop=options.crop,
        only_product=options.product,
        limit=options.limit,
        force=options.force,
    )
    return exit_code


# Script / ``python -m`` entry point: main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())