# seed-mcp/scrape/sources/agrigold.py

"""AgriGold scraper — AgReliant Genetics brand.

Source: ``www.agrigold.com`` — WordPress site, empty robots.txt
(no Disallow). Catalog covers corn + soybeans. Sibling of LG Seeds
under the same parent (AgReliant) but distinct branding /
positioning, so kept in its own scraper.

Discovery: the listing page ``/corn/explore-corn-hybrids`` (and
the soybean equivalent) is server-rendered HTML that contains
``<a href="/corn/explore-corn-hybrids/<CODE>">`` for every variety.
Codes look like ``A616-30``, ``A623-88``, etc. Parse the listing
HTML, collect distinct variety URLs.

Per-variety detail (``/corn/explore-corn-hybrids/<CODE>``) renders
several ``<div class="product-section ...">`` blocks. Each section
has a ``<div class="title">`` heading + multiple ``.detail-item``
rows shaped as ``<div class="label">N</div><div class="value">V</div>``.

The ``<div class="value">`` content is one of:

  - **5-circle rating scale** (Agronomic Rating, Disease Tolerance,
    Silage Characteristics): ``<div class="scale">`` containing 5
    children, where N have class ``circle selected`` and 5-N have
    class ``circle``. Count = rating on a **1-5 scale** (5 = best).
    Distinct from Bayer / LG Seeds' 1-9 convention — documented in
    the sidecar's ``_scale_direction``.

  - **Numeric value** (GDUs, year, plant population): bare number.

  - **Categorical / qualitative** (Ear Flex Type "KERNEL",
    Leaf Orientation "SEMI UPRIGHT", Cob Color "Red"): the literal
    text.

  - **NA**: rated but not yet measured.

Rating scale: ``1-5 (5 = best)`` — distinct from the other brands;
the chunker reads ``_scale_direction`` to render the correct
preamble.

Output:
  corpus/agrigold/<source_key>.md
  corpus/agrigold/<source_key>.json

source_key: ``agrigold-<code>`` lowercased, e.g.
``agrigold-a616-30``.

CLI:
  python -m scrape.sources.agrigold --crop corn --limit 5
  python -m scrape.sources.agrigold --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Recorded in every sidecar as ``scraper_version``.
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made through
# RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Scheme + host prepended to the listing paths below.
BASE = "https://www.agrigold.com"

# Crop name -> site-relative path of that crop's listing page. The keys
# double as the accepted values of the ``--crop`` CLI option.
LISTING_PATHS = {
    "corn":     "/corn/explore-corn-hybrids",
    "soybeans": "/soybeans/explore-soybean-varieties",
}

# AgriGold publishes ratings on a 1-5 scale (5 = best), counted from
# the selected circles in the per-rating scale block. The chunker
# preserves this verbatim — every chunk preamble declares the scale
# so the LLM doesn't conflate with Bayer's 1-9.
RATING_SCALE_DIRECTION = "1-5 (5 = best)"

# Third ancestor of this file's resolved path (index 2 of ``parents``).
REPO_ROOT = Path(__file__).resolve().parents[2]
# Output root: the CORPUS_ROOT environment variable when set and
# non-empty, otherwise ``<REPO_ROOT>/corpus``.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this scraper's ``.md`` / ``.json`` pairs.
CORPUS_DIR = CORPUS_ROOT / "agrigold"

# Default minimum spacing, in seconds, between consecutive HTTP attempts.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.agrigold")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that paces calls and retries transient failures.

    Every attempt, retries included, is delayed until at least
    ``interval`` seconds have elapsed since the previous attempt began.
    Network errors, HTTP 429 and HTTP 5xx responses are retried with
    exponential backoff plus jitter (capped at 30 s), or with the
    server's ``Retry-After`` value when it is a plain number of seconds.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading taken at the start of the latest attempt.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between attempts."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Return the jittered exponential delay for a zero-based attempt index."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one request, retrying network errors and 429/5xx responses.

        Args:
            method: HTTP method name.
            url: Absolute URL to request.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so the call always makes a request.
            timeout: Per-attempt timeout passed to ``requests``.
            **kw: Forwarded unchanged to ``Session.request``.

        Returns:
            The first response that is not 429/5xx, or the final 429/5xx
            response once all attempts are used up (callers decide via
            ``raise_for_status``).

        Raises:
            requests.RequestException: when the *final* attempt failed
                at the network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if is_last:
                    # No retry follows, so sleeping here would only delay the error.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    break
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived: an exception from an earlier attempt no
            # longer describes the final outcome and must not be raised.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # isdecimal() only admits strings float() can parse; the
                # HTTP-date form of Retry-After falls back to backoff.
                backoff = float(ra) if (ra and ra.isdecimal()) else self._backoff(attempt)
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class AGProduct:
    """Parsed contents of one AgriGold variety detail page.

    Built by ``fetch_product_detail`` and consumed by ``render_markdown``
    and ``write_product``.
    """

    # ``agrigold-<code>`` slug; also the stem of the output file names.
    source_key: str
    # URL of the detail page the data was read from.
    source_url: str
    # Crop key from LISTING_PATHS ("corn" or "soybeans").
    crop: str
    # Currently set to the variety code taken from the listing link.
    product_name: str = ""
    relative_maturity: str | None = None   # corn RM days from .maturity
    maturity_group: str | None = None      # soy MG
    # Holds at most one entry: the "Trait Package" text from the
    # ``.product-details`` block, when present.
    trait_descriptions: list[str] = field(default_factory=list)
    # One dict per non-empty product section:
    # ``{"label": ..., "type": "scale-or-value", "items": [{"characteristic": ..., "value": ...}]}``.
    characteristics_groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


def discover_varieties(
    http: RateLimitedSession, *, only_crop: str | None = None,
) -> list[tuple[str, str, str]]:
    """Return ``[(url, crop, variety_code), ...]`` for every variety in
    the listing pages.

    Args:
        http: Session used to fetch each listing page.
        only_crop: When set, only that key of ``LISTING_PATHS`` is
            fetched.

    Returns:
        One tuple per distinct variety code per crop, in page order.
        ``url`` is always rebuilt as ``BASE + listing path + "/" + code``
        regardless of how the link was written in the page.

    Raises:
        Whatever ``raise_for_status`` raises when a listing page comes
        back with an error status.
    """
    # Shape check on the captured tail: 3-31 characters, starting with a
    # letter or digit. NOTE(review): this does NOT reject plain words
    # such as "filter" or "browse" — if the site ever links such tails
    # under the listing path they will be treated as variety codes.
    code_re = re.compile(r"^[A-Z0-9][\w\-]{2,30}$", re.I)

    out: list[tuple[str, str, str]] = []
    for crop, path in LISTING_PATHS.items():
        if only_crop and crop != only_crop:
            continue
        log.info("fetching listing %s%s", BASE, path)
        r = http.get(f"{BASE}{path}")
        r.raise_for_status()
        # Match links to <listing path>/<CODE>. Besides the bare relative
        # form, accept the same link written with the BASE prefix, with
        # a trailing slash, or with a query string / fragment, so a
        # cosmetic change in how the site emits hrefs does not silently
        # produce an empty catalog.
        href_re = re.compile(
            rf"^(?:{re.escape(BASE)})?{re.escape(path)}/([\w\-]+)/?(?:[?#].*)?$"
        )
        seen: set[str] = set()
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.find_all("a", href=True):
            m = href_re.match(a["href"].strip())
            if not m:
                continue
            code = m.group(1)
            if not code_re.match(code):
                continue
            if code in seen:
                continue
            seen.add(code)
            out.append((f"{BASE}{path}/{code}", crop, code))
        log.info("  %s: %d varieties", crop, len(seen))
        if not seen:
            # An empty listing almost certainly means the page markup
            # changed; make that visible instead of exiting quietly.
            log.warning("  %s: no variety links found in listing %s%s",
                        crop, BASE, path)
    log.info("total varieties discovered: %d", len(out))
    return out


# --------------------------------------------------------------------- helpers


def source_key_for(code: str) -> str:
    """Build the corpus key ``agrigold-<slug>`` for a variety code.

    Each run of characters other than ASCII letters, digits and ``-``
    becomes a single ``-``; leading/trailing dashes are dropped and the
    result is lowercased.
    """
    dashed = re.sub(r"[^a-zA-Z0-9-]+", "-", code)
    slug = dashed.strip("-").lower()
    return "agrigold-" + slug


# Section class hint -> normalized label for the sidecar.
# Keys are compared against the individual CSS classes of each
# ``product-section`` div; the first class found here decides the label,
# and sections with no matching class fall back to their title text.
# NOTE(review): "plant-features" maps to "PRODUCT FEATURES" — confirm
# against the live markup whether the class is really "plant-features"
# (and not "product-features").
SECTION_LABEL_MAP = {
    "agronomic-rating":        "AGRONOMIC RATING",
    "disease-tolerance":       "DISEASE TOLERANCE",
    "plant-characteristics":   "PLANT CHARACTERISTICS",
    "plant-features":          "PRODUCT FEATURES",
    "silage-characteristics":  "SILAGE CHARACTERISTICS",
    "planting-applications":   "PLANTING APPLICATIONS",
    "planting-population":     "PLANTING POPULATION",
}


def _parse_scale(value_el) -> int | None:
    """Count selected circles in a ``<div class="scale">`` block.
    Returns 1-5 or None if no scale present."""
    if value_el is None:
        return None
    scale = value_el.find("div", class_="scale")
    if scale is None:
        return None
    selected = scale.find_all("div", class_=lambda c: c and "selected" in c)
    return len(selected) if selected else 0


def _parse_value(value_el) -> str:
    """Extract a non-scale value: raw text contents, trimmed."""
    if value_el is None:
        return ""
    # If it has a .scale child we should have caught it above. Otherwise
    # return the leaf text.
    text = value_el.get_text(" ", strip=True)
    return text


# --------------------------------------------------------------------- detail


def fetch_product_detail(
    http: RateLimitedSession, url: str, crop: str, code: str,
) -> AGProduct:
    """Fetch one variety page and parse it into an ``AGProduct``.

    Reads the first ``.maturity`` element (stored as relative maturity
    for corn, maturity group for soybeans), the "Trait Package" entry of
    the ``.product-details`` block, and every ``.detail-item`` row of
    each ``product-section`` div. Raises whatever ``raise_for_status``
    raises on an error response.
    """
    response = http.get(url)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    product = AGProduct(
        source_key=source_key_for(code),
        source_url=url,
        crop=crop,
        product_name=code,
    )

    # Maturity — often rendered as ``<div class="maturity">86 days</div>``.
    maturity_el = page.find(class_="maturity")
    if maturity_el:
        number = re.search(r"(\d+(?:\.\d+)?)", maturity_el.get_text(strip=True))
        if number and crop == "corn":
            product.relative_maturity = number.group(1)
        elif number and crop == "soybeans":
            product.maturity_group = number.group(1)

    # Trait package. The details block flattens to label / value pairs:
    # "Genetic Family | Icon-J | Trait Package | VT2RIB | ..."
    details_el = page.find(class_="product-details")
    if details_el:
        flattened = details_el.get_text(" | ", strip=True)
        found = re.search(r"Trait Package\s*\|\s*([^|]+?)(?:\s*\||$)", flattened)
        package = found.group(1).strip() if found else ""
        if package and package.lower() not in ("none", "-"):
            product.trait_descriptions = [package]

    # Walk every product-section block and bucket its rows.
    for block in page.find_all("div", class_=re.compile(r"product-section")):
        # Prefer the label implied by a known CSS class; otherwise use
        # the section's own title text.
        heading = next(
            (SECTION_LABEL_MAP[css] for css in block.get("class", [])
             if css in SECTION_LABEL_MAP),
            "",
        )
        if not heading:
            title_el = block.find(class_="title")
            if title_el:
                heading = title_el.get_text(strip=True).upper()
            else:
                heading = "OTHER"

        rows: list[dict] = []
        for row in block.find_all("div", class_="detail-item"):
            name_el = row.find("div", class_="label")
            reading_el = row.find("div", class_="value")
            name = name_el.get_text(" ", strip=True).strip() if name_el else ""
            if not name:
                continue

            rating = _parse_scale(reading_el)
            if rating is not None:
                rows.append({"characteristic": name, "value": str(rating)})
                continue

            text = _parse_value(reading_el)
            # The "Row Type" row of planting-population carries nested
            # column headers rather than a value, so it is dropped.
            is_header_row = name.lower() == "row type" and text.lower() in (
                "low medium high", "low / medium / high",
            )
            if text and not is_header_row:
                rows.append({"characteristic": name, "value": text})

        if rows:
            product.characteristics_groups.append({
                "label": heading, "type": "scale-or-value", "items": rows,
            })

    return product


# --------------------------------------------------------------------- render


def _md_cell(text: object) -> str:
    """Make *text* safe to place inside one Markdown pipe-table cell."""
    # Scraped text can carry line breaks from the page source and may
    # contain "|"; either one would split the row.
    return " ".join(str(text).split()).replace("|", "\\|")


def render_markdown(p: AGProduct) -> str:
    """Render a product as the Markdown document stored in the corpus.

    The document is a title, a bullet list of metadata (vendor, brand,
    crop, maturity, traits, source URL, rating scale), a horizontal
    rule, and one ``## <Label>`` section with a two-column table per
    non-empty characteristics group.

    Characteristic names and values are whitespace-collapsed and have
    ``|`` escaped so a stray line break or pipe in the scraped text
    cannot corrupt the table. Output for already-clean text is
    unchanged.
    """
    title = p.product_name or p.source_key
    crop_label = "Corn" if p.crop == "corn" else "Soybeans"
    head: list[str] = [
        f"# {title}",
        "",
        "- **Vendor:** AgReliant Genetics",
        "- **Brand:** AgriGold",
        f"- **Crop:** {crop_label}",
    ]
    if p.relative_maturity and p.crop == "corn":
        head.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group and p.crop == "soybeans":
        head.append(f"- **Maturity group:** {p.maturity_group}")
    if p.trait_descriptions:
        head.append(f"- **Traits:** {', '.join(p.trait_descriptions)}")
    head.append(f"- **Source:** {p.source_url}")
    head.append(f"- **Rating scale (AgriGold):** {RATING_SCALE_DIRECTION}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    for g in p.characteristics_groups:
        # Headings are single-line too, so collapse whitespace here as well.
        label = " ".join((g.get("label") or "Characteristics").split()).title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )
    return "\n".join(head) + "\n".join(sections)


# --------------------------------------------------------------------- write


def _write_text_atomic(path: Path, text: str) -> None:
    """Write *text* to *path* via a sibling temp file and a rename."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    # The rename means readers never observe a half-written file.
    tmp.replace(path)


def write_product(prod: AGProduct, body_md: str) -> None:
    """Write the Markdown body and JSON sidecar for one product.

    Files land in ``CORPUS_DIR`` as ``<source_key>.md`` and
    ``<source_key>.json``. The sidecar is written first and the Markdown
    last: ``process_product`` treats an existing ``.md`` as "already
    done", so the marker must only appear once the sidecar is complete.
    Both files are replaced atomically so an interrupted run cannot
    leave a truncated file behind.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    # Fields this scraper does not extract are still emitted (as None
    # or []) so every sidecar carries the same set of keys.
    sidecar = {
        "source": "agrigold",
        "source_key": prod.source_key,
        "vendor": "AgReliant Genetics",
        "brand": "AgriGold",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": prod.trait_descriptions,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": None,
        "strengths": [],
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    _write_text_atomic(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession, *, url: str, crop: str, code: str, force: bool,
) -> tuple[str, AGProduct | None]:
    """Fetch, render and write a single variety.

    Returns ``(status, product)`` where status is ``"skipped"`` (the
    Markdown file already exists and ``force`` is false), ``"failed"``
    (fetching or parsing raised; the error is logged) or ``"written"``.
    The product is only returned for ``"written"``.
    """
    already_written = (CORPUS_DIR / f"{source_key_for(code)}.md").exists()
    if already_written and not force:
        return "skipped", None

    try:
        product = fetch_product_detail(http, url, crop, code)
    except Exception as exc:  # noqa: BLE001
        # One bad page must not abort the whole crawl.
        log.error("variety %s failed: %s", code, exc)
        return "failed", None

    write_product(product, render_markdown(product))
    return "written", product


def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties and process them, returning a process exit code.

    Returns 0 when nothing failed, 1 when at least one variety failed,
    and 2 when ``only_product`` matched no discovered variety. ``limit``
    caps how many candidates are visited, skipped ones included.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    targets = discover_varieties(http, only_crop=only_crop)

    if only_product:
        # Accept either the exact source_key or the variety code in any case.
        wanted = only_product.lower()
        targets = [
            target for target in targets
            if source_key_for(target[2]) == only_product
            or target[2].lower() == wanted
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    tally = {"written": 0, "skipped": 0, "failed": 0}
    limit_label = str(limit) if limit else "all"
    batch = targets if limit is None else targets[:max(limit, 0)]

    processed = 0
    for processed, (url, crop, code) in enumerate(batch, start=1):
        status, prod = process_product(
            http, url=url, crop=crop, code=code, force=force,
        )
        tally[status] = tally.get(status, 0) + 1
        if prod is None:
            log.info("[%d/%s] %s %s",
                     processed, limit_label,
                     source_key_for(code), status)
            continue
        log.info(
            "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s groups=%d",
            processed, limit_label,
            prod.source_key, status, prod.crop,
            prod.relative_maturity or prod.maturity_group or "-",
            ",".join(prod.trait_descriptions) or "-",
            len(prod.characteristics_groups),
        )

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (of %d candidates)",
        processed, tally["written"], tally["skipped"],
        tally["failed"], len(targets),
    )
    return 1 if tally["failed"] else 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (defaulting to the ``LOG_LEVEL`` environment
    variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.agrigold",
        description="Scrape AgriGold (AgReliant Genetics) corn + soybean varieties.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop",
        default=None,
        choices=list(LISTING_PATHS),
        help="Limit to one crop.",
    )
    parser.add_argument(
        "--product",
        default=None,
        help="Process a single variety by source_key or variety code.",
    )
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse arguments, configure logging to stderr, and run the scraper.

    ``argv`` is handed to ``parse_args`` as-is; the return value is the
    exit code produced by ``run``.
    """
    parser = _build_argparser()
    opts = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )
    return run(
        only_crop=opts.crop,
        only_product=opts.product,
        limit=opts.limit,
        force=opts.force,
    )


# Script entry point: the integer returned by main() becomes the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())