seed-mcp/scrape/sources/lg_seeds.py

"""LG Seeds scraper — AgReliant Genetics brand.

Source: ``www.lgseeds.com`` — WordPress site. Empty robots.txt
(no Disallow). Catalog covers 4 crops: corn, soybeans, alfalfa,
sorghum.

Two-layer fetch:

1. **Listing page** (one per crop): inline JavaScript variable
   ``products = [{...}, ...]`` carries the full variety summary —
   Variety code, Maturity, Traits[], Bullets[], CropType. No
   per-variety HTTP needed for identity.

2. **Detail page** (``/products/<crop>/<Variety>``): rich plant
   characteristics + disease tolerance + management ratings,
   rendered as ``<div class="characteristics-bar">`` blocks with
   ``<span class="bar-N">`` where N ∈ 1-9 is the rating. Same
   convention as Bayer/Golden Harvest (9 = best).

LG Seeds is a regional brand (Eastern Corn Belt focus) under
AgReliant Genetics, the same parent as AgriGold. Brand voice is
distinct so we keep them in separate scrapers.

Rating scale: ``1-9 (9 = best)`` — verified empirically on the
bar-N markup; matches Bayer / Golden Harvest convention.

Output:
  corpus/lg_seeds/<source_key>.md
  corpus/lg_seeds/<source_key>.json

source_key: ``lg-<variety>`` lowercased, e.g. ``lg-lg5701``,
``lg-c3400`` (soybean — codes don't use LG prefix), ``lg-7c300``
(alfalfa), ``lg-silo-max-100`` (sorghum).

CLI:
  python -m scrape.sources.lg_seeds --crop corn --limit 5
  python -m scrape.sources.lg_seeds --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Version string recorded in every sidecar JSON as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Scheme + host that listing and detail paths are appended to.
BASE = "https://www.lgseeds.com"

# Crops listed in nav. Each has a listing page at /products/<crop>
# with an inline `var products = [...]` JSON blob.
# The keys double as the valid values of the --crop CLI option.
LISTING_PATHS = {
    "corn":     "/products/corn",
    "soybeans": "/products/soybeans",
    "alfalfa":  "/products/alfalfa",
    "sorghum":  "/products/sorghum",
}

# Human-readable rating-scale note; written to the Markdown header and to the
# sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = "1-9 (9 = best)"

# Third ancestor of this file's resolved path (parents[2]) — presumably the
# repository root; confirm if the module is ever moved.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus root: the CORPUS_ROOT environment variable when it is set and
# non-empty, otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this source's <source_key>.md / .json pairs.
CORPUS_DIR = CORPUS_ROOT / "lg_seeds"

# Default minimum spacing, in seconds, between consecutive HTTP requests.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.lg_seeds")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that spaces out calls and retries transient failures.

    Every request waits until at least ``interval`` seconds have elapsed
    since the previous one. Network errors, HTTP 429 and HTTP 5xx responses
    are retried with exponential backoff plus jitter (capped at 30 s); an
    integer ``Retry-After`` header takes precedence over the computed backoff.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request slot handed out.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last request."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying transient failures.

        Args:
            method: HTTP method name.
            url: Absolute URL to request.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so that a response (or exception) always exists.
            timeout: Per-attempt timeout passed to ``requests``.
            **kw: Forwarded unchanged to ``Session.request``.

        Returns:
            The first non-retryable response, or the final retryable
            response (429 / 5xx) once attempts are exhausted; callers are
            expected to call ``raise_for_status()`` themselves.

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        # Only the outcome of the most recent attempt is kept, so an early
        # network error can never mask a later HTTP response (or vice versa).
        last_exc: Exception | None = None
        last_resp: requests.Response | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, last_resp = exc, None
                if final:
                    # No retry follows, so sleeping here would be pure delay.
                    log.warning("network error on %s %s: %s — giving up after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                last_exc, last_resp = None, resp
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is understood;
                # an HTTP-date value falls back to the computed backoff.
                backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return last_resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class LGProduct:
    """One LG Seeds variety: listing-page identity plus detail-page ratings.

    Identity fields are filled from the listing summary by
    ``fetch_product_detail``; ``characteristics_groups`` stays empty when the
    detail page cannot be fetched.
    """

    source_key: str                             # corpus file stem, e.g. "lg-lg5701"
    source_url: str                             # detail-page URL the product was read from
    crop: str                                   # one of the LISTING_PATHS keys
    product_name: str = ""                      # variety code as published in the listing JSON
    product_id: int | None = None               # listing JSON "Id", when present
    maturity_raw: str | None = None             # corn RM days / soy MG / alfalfa FD / sorghum days
    fall_dormancy: str | None = None            # alfalfa only
    trait_descriptions: list[str] = field(default_factory=list)   # listing JSON "Traits"
    bullets: list[str] = field(default_factory=list)              # listing JSON "Bullets"
    # Each group is {"label": str, "type": "bars", "items": [{"characteristic", "value"}, ...]}.
    characteristics_groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


# Locates the inline ``var <name> = [{"Variety": ...}];`` assignment on a
# listing page. Group 1 starts at the array's opening bracket; its non-greedy
# end is only a hint (it stops at the first ``}];``, even inside a string), so
# the array itself is decoded with ``raw_decode`` from ``start(1)``.
_VAR_RE = re.compile(
    r'var\s+\w+\s*=\s*(\[\{"Variety":.+?\}\]);', re.S,
)


def discover_varieties(
    http: RateLimitedSession, *, only_crop: str | None = None,
) -> list[tuple[str, dict]]:
    """Collect variety summaries from the listing pages' inline JSON.

    Args:
        http: Session used to fetch each listing page.
        only_crop: When given, only that ``LISTING_PATHS`` key is fetched.

    Returns:
        ``[(crop, summary_dict), ...]`` in listing order. Summary dict has
        Variety / Id / Maturity / Traits / Bullets / CropType / FallDormancy.
        A crop whose page has no products array, or whose array cannot be
        parsed, is logged and contributes nothing.

    Raises:
        requests.HTTPError: If a listing page returns an error status
            (propagated from ``raise_for_status``).
    """
    out: list[tuple[str, dict]] = []
    decoder = json.JSONDecoder()
    for crop, path in LISTING_PATHS.items():
        if only_crop and crop != only_crop:
            continue
        log.info("fetching listing %s%s", BASE, path)
        r = http.get(f"{BASE}{path}")
        r.raise_for_status()
        page = r.text
        m = _VAR_RE.search(page)
        if not m:
            log.warning("no products array in %s", path)
            continue
        try:
            # Decode exactly one JSON value starting at the opening bracket.
            # Unlike parsing the regex capture, this cannot be cut short by a
            # "}];" sequence that happens to occur inside a string value.
            items, _end = decoder.raw_decode(page, m.start(1))
        except json.JSONDecodeError as exc:
            log.error("JSON parse failed for %s: %s", path, exc)
            continue
        # Downstream code calls .get() on every summary; drop anything that
        # is not a JSON object instead of crashing there.
        summaries = [it for it in items if isinstance(it, dict)]
        if len(summaries) != len(items):
            log.warning("  %s: ignored %d non-object entries",
                        crop, len(items) - len(summaries))
        log.info("  %s: %d varieties", crop, len(summaries))
        out.extend((crop, it) for it in summaries)
    log.info("total varieties discovered: %d", len(out))
    return out


# --------------------------------------------------------------------- helpers


def source_key_for(variety: str) -> str:
    """Build the ``lg-``-prefixed corpus key for a variety code.

    Each run of characters other than ASCII letters, digits and hyphens
    collapses to a single hyphen; leading/trailing hyphens are trimmed and
    the result is lowercased.
    """
    hyphenated = re.sub(r"[^a-zA-Z0-9-]+", "-", variety)
    trimmed = hyphenated.strip("-")
    return "lg-" + trimmed.lower()


_BAR_CLASS_RE = re.compile(r"^bar-(\d)$")


def _parse_bar_value(span_classes: list[str]) -> int | None:
    """Extract the integer rating from a ``bar-N`` CSS class."""
    for c in span_classes or []:
        m = _BAR_CLASS_RE.match(c)
        if m:
            return int(m.group(1))
    return None


# --------------------------------------------------------------------- detail


# Matches any class token containing "product-section"; the section class
# often carries a suffix hint like "disease-toler" or "management-pr".
_SECTION_CLASS_RE = re.compile(r"product-section")


def _section_label(section: Any) -> str:
    """Return the label for one section: first non-empty heading, else a class-based guess."""
    for heading in section.find_all(["h2", "h3", "h4"]):
        text = heading.get_text(strip=True)
        if text:
            return text
    # No usable heading: fall back to the hint embedded in the class names.
    hint = " ".join(section.get("class", [])).lower()
    if "disease" in hint:
        return "Disease Tolerance"
    if "management" in hint:
        return "Management"
    # Covers the "plantCharacteristics" hint as well as unlabeled sections.
    return "Characteristics"


def _bar_item(bar: Any) -> dict | None:
    """Turn one ``characteristics-bar`` row into an item dict, or ``None`` if it has no name."""
    name_el = bar.find(class_="product-name")
    name = (name_el.get_text(" ", strip=True) if name_el else "").strip()
    if not name:
        return None
    value_span = bar.find("span", class_=_BAR_CLASS_RE)
    rating = _parse_bar_value(value_span.get("class") if value_span else [])
    if rating is not None:
        return {"characteristic": name, "value": str(rating)}
    # Some "bars" are actually qualitative (e.g. "Tar Spot Susceptible",
    # "Fungicide Response High"). For those we keep the remaining text of
    # the row as the value rather than a missing rating.
    remainder = bar.get_text(" ", strip=True)
    if remainder.startswith(name):
        remainder = remainder[len(name):].strip()
    return {"characteristic": name, "value": remainder or "-"}


def fetch_product_detail(
    http: RateLimitedSession, summary: dict, crop: str,
) -> LGProduct:
    """Fetch the detail page and merge characteristics into an LGProduct.

    Args:
        http: Session used for the detail-page request.
        summary: Listing-page summary dict for the variety.
        crop: ``LISTING_PATHS`` key the summary was found under.

    Returns:
        An ``LGProduct`` seeded from ``summary``. If the detail page cannot
        be fetched, the identity-only product is returned with empty
        ``characteristics_groups`` (the failure is logged, not raised).
    """
    variety = summary.get("Variety") or ""
    # LG's detail URL is /products/<crop>/<Variety>. The Variety in the
    # listing JSON appears in correct case; LG seems to accept any case
    # but we use what's published.
    url = f"{BASE}/products/{crop}/{variety}"
    maturity = summary.get("Maturity")
    dormancy = summary.get("FallDormancy")
    prod = LGProduct(
        source_key=source_key_for(variety),
        source_url=url,
        crop=crop,
        product_name=variety,
        product_id=summary.get("Id"),
        maturity_raw=str(maturity) if maturity is not None else None,
        fall_dormancy=str(dormancy) if dormancy else None,
        trait_descriptions=list(summary.get("Traits") or []),
        bullets=list(summary.get("Bullets") or []),
    )

    try:
        r = http.get(url)
        r.raise_for_status()
    except Exception as exc:  # noqa: BLE001 — deliberate best-effort
        log.warning("detail fetch failed for %s: %s", variety, exc)
        return prod  # identity-only fallback

    soup = BeautifulSoup(r.text, "html.parser")

    # The detail page has multiple .product-section blocks; each has a
    # heading + a collection of .characteristics-bar rows. Sections without
    # any named row are dropped.
    groups: list[dict] = []
    for section in soup.find_all("div", class_=_SECTION_CLASS_RE):
        bars = section.find_all("div", class_="characteristics-bar")
        items = [item for item in map(_bar_item, bars) if item is not None]
        if not items:
            continue
        groups.append(
            {"label": _section_label(section).upper(), "type": "bars", "items": items}
        )

    prod.characteristics_groups = groups
    return prod


# --------------------------------------------------------------------- render


# A line break (with any surrounding whitespace) inside a table cell.
_CELL_BREAK_RE = re.compile(r"\s*[\r\n]+\s*")


def _md_cell(value: object) -> str:
    """Make scraped text safe for a Markdown table cell (no line breaks, pipes escaped)."""
    text = _CELL_BREAK_RE.sub(" ", str(value))
    return text.replace("|", "\\|")


def render_markdown(p: LGProduct) -> str:
    """Render a product as the Markdown document stored in the corpus.

    The document has a bulleted metadata header, a ``---`` rule, an optional
    "Strengths" bullet list, and one two-column table per characteristics
    group. Table cells are sanitized so that a ``|`` or a line break in
    scraped text cannot break the table; ordinary values render unchanged.
    """
    title = p.product_name or p.source_key
    crop_label = {
        "corn": "Corn", "soybeans": "Soybeans",
        "alfalfa": "Alfalfa", "sorghum": "Sorghum",
    }.get(p.crop, p.crop.title())

    head: list[str] = [
        f"# {title}",
        "",
        "- **Vendor:** AgReliant Genetics",
        "- **Brand:** LG Seeds",
        f"- **Crop:** {crop_label}",
    ]
    if p.maturity_raw:
        # The same raw value is labeled differently depending on the crop.
        if p.crop == "corn":
            head.append(f"- **Relative maturity:** {p.maturity_raw}")
        elif p.crop == "soybeans":
            head.append(f"- **Maturity group:** {p.maturity_raw}")
        elif p.crop == "alfalfa":
            head.append(f"- **Fall dormancy / maturity:** {p.maturity_raw}")
        elif p.crop == "sorghum":
            head.append(f"- **Days to maturity:** {p.maturity_raw}")
    if p.trait_descriptions:
        head.append(f"- **Traits:** {', '.join(p.trait_descriptions)}")
    head.append(f"- **Source:** {p.source_url}")
    head.append(f"- **Rating scale (LG Seeds):** {RATING_SCALE_DIRECTION}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    if p.bullets:
        bullets = "\n".join(f"- {b}" for b in p.bullets)
        sections.append("## Strengths\n\n" + bullets + "\n")

    for g in p.characteristics_groups:
        # Labels are stored upper-cased; title-case them for display.
        label = (g.get("label") or "Characteristics").title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )
    return "\n".join(head) + "\n".join(sections)


# --------------------------------------------------------------------- write


def _atomic_write_text(path: Path, text: str) -> None:
    """Write ``text`` to a sibling temp file, then rename it over ``path``."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    tmp.replace(path)


def write_product(prod: LGProduct, body_md: str) -> None:
    """Write the product's JSON sidecar and Markdown body into ``CORPUS_DIR``.

    Both files are written via a temp file + rename so a crash cannot leave a
    truncated file behind. The sidecar is written first and the Markdown
    last: ``process_product`` treats an existing ``.md`` as "already
    scraped", so the ``.md`` must only appear once its sidecar is in place.

    Args:
        prod: Product whose fields populate the sidecar.
        body_md: Rendered Markdown document for the product.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "lg_seeds",
        "source_key": prod.source_key,
        "vendor": "AgReliant Genetics",
        "brand": "LG Seeds",
        "product_name": prod.product_name,
        "product_id": prod.product_id,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": prod.crop,
        "release_year": None,
        # Maturity routing: corn = RM days, soy = MG, alfalfa = FD,
        # sorghum = days-to-maturity. Stored in the canonical fields
        # so the chunker's crop-aware preamble works.
        "relative_maturity": prod.maturity_raw if prod.crop in ("corn", "sorghum") else None,
        "maturity_group": prod.maturity_raw if prod.crop == "soybeans" else None,
        "fall_dormancy": prod.maturity_raw if prod.crop == "alfalfa" else prod.fall_dormancy,
        "wheat_class": None,
        "trait_stack": prod.trait_descriptions,  # LG publishes full names, not codes
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": None,
        "strengths": prod.bullets,
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _atomic_write_text(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    # Written last: its presence marks the product as complete.
    _atomic_write_text(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession, summary: dict, crop: str, *, force: bool,
) -> tuple[str, LGProduct | None]:
    """Scrape, render and write one variety.

    Args:
        http: Session used for the detail-page request.
        summary: Listing-page summary dict for the variety.
        crop: ``LISTING_PATHS`` key the summary was found under.
        force: Re-scrape even when the Markdown file already exists.

    Returns:
        ``("skipped", None)`` when the Markdown file exists and ``force`` is
        false, ``("failed", None)`` when the summary has no usable variety
        code or the detail step raised, otherwise ``("written", product)``.
        Errors from rendering or writing are not caught here.
    """
    variety = summary.get("Variety")
    # A missing, empty, non-string or punctuation-only code slugifies to the
    # bare prefix "lg-". Every such entry would share one output file and its
    # detail URL would just be the crop listing, so reject it up front.
    source_key = source_key_for(variety) if isinstance(variety, str) else "lg-"
    if source_key == "lg-":
        log.error("%s entry has no usable Variety code (Id=%s): %r",
                  crop, summary.get("Id"), variety)
        return "failed", None
    md_path = CORPUS_DIR / f"{source_key}.md"
    if md_path.exists() and not force:
        return "skipped", None
    try:
        prod = fetch_product_detail(http, summary, crop)
    except Exception as exc:  # noqa: BLE001
        log.error("variety %s failed: %s", variety, exc)
        return "failed", None
    body = render_markdown(prod)
    write_product(prod, body)
    return "written", prod


def _variety_of(summary: dict) -> str:
    """Return the summary's variety code, or ``""`` when it is missing, null or not a string."""
    variety = summary.get("Variety")
    return variety if isinstance(variety, str) else ""


def run(
    *, limit: int | None, force: bool,
    only_crop: str | None, only_product: str | None,
) -> int:
    """Discover varieties and process them, returning a process exit code.

    Args:
        limit: Stop after this many candidates (skipped ones count too);
            ``None`` means no limit.
        force: Re-scrape varieties whose Markdown file already exists.
        only_crop: Restrict discovery to one crop.
        only_product: Restrict processing to one variety, matched
            case-insensitively against its source_key or its Variety code.

    Returns:
        ``0`` on success, ``1`` if any variety failed, ``2`` if
        ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    targets = discover_varieties(http, only_crop=only_crop)
    if only_product:
        # Source keys are always lowercase, so compare in lowercase; the
        # helper keeps a null "Variety" from crashing .lower()/slugifying.
        wanted = only_product.lower()
        targets = [
            (c, s) for (c, s) in targets
            if wanted in (source_key_for(_variety_of(s)), _variety_of(s).lower())
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "failed": 0}
    processed = 0
    for crop, summary in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(http, summary, crop, force=force)
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | crop=%s maturity=%s traits=%d groups=%d",
                processed, str(limit) if limit else "all",
                prod.source_key, status, prod.crop,
                prod.maturity_raw or "-",
                len(prod.trait_descriptions),
                len(prod.characteristics_groups),
            )
        else:
            log.info("[%d/%s] %s %s",
                     processed, str(limit) if limit else "all",
                     source_key_for(_variety_of(summary)), status)

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (of %d candidates)",
        processed, counts["written"], counts["skipped"],
        counts["failed"], len(targets),
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop`` (restricted to the
    ``LISTING_PATHS`` keys), ``--product`` and ``--log-level`` (defaulting
    to the ``LOG_LEVEL`` environment variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.lg_seeds",
        description="Scrape LG Seeds (AgReliant Genetics) — corn / soybeans / alfalfa / sorghum.",
    )
    add = parser.add_argument
    add(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N varieties (default: all).",
    )
    add(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    add(
        "--crop",
        default=None,
        choices=[*LISTING_PATHS],
        help="Limit to one crop.",
    )
    add(
        "--product",
        default=None,
        help="Process a single variety by source_key or Variety code.",
    )
    add("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up stderr logging, run the scrape.

    ``argv`` defaults to the process arguments when ``None``. The return
    value is the exit code produced by ``run``.
    """
    options = _build_argparser().parse_args(argv)
    level_name = options.log_level.upper()
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=level_name,
    )
    exit_code = run(
        only_crop=options.crop,
        only_product=options.product,
        limit=options.limit,
        force=options.force,
    )
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())