# seed-mcp/scrape/sources/latham.py

"""Latham Hi-Tech Seeds scraper — independent family-owned brand (Alexander, IA).

Source: ``www.lathamseeds.com`` — WordPress site exposing a public,
no-auth REST API. robots.txt is permissive (only ``/wp-admin/``
disallowed; the catalog + ``/wp-json/`` are open, no Crawl-delay).
Independent Upper-Midwest seed company (the self-styled "Latham
Country" — IA / MN / WI / IL / ND / SD / NE); corn + soybeans only
(an Alfalfa crop term exists in the taxonomy but has zero published
varieties — no wheat).

Two-step ingestion (mirrors the ProHarvest scraper):

1. **Enumerate** via the WP REST API. ``/wp/v2/varieties`` is the
   variety custom-post-type (~265 records, ``X-WP-Total: 265``).
   ``/wp/v2/variety_crop`` is the crop taxonomy (Corn=2013,
   Soybean=2029, Alfalfa=2159/empty); ``/wp/v2/variety_trait`` is the
   trait taxonomy (Enlist E3, VT2 PRO RIB, Smart Stax, XtendFlex, …).
   The REST payload gives the canonical id / slug / title / permalink
   and taxonomy term IDs, plus a human-readable ``class_list`` (e.g.
   ``variety_crop-soybean``, ``variety_trait-enlist-e3``). ``acf`` is
   ``[]`` and ``content.rendered`` is EMPTY in REST, so the ratings
   have to come from the detail page.

2. **Parse the detail page.** Each ``/products/<slug>/`` page
   server-renders the agronomic data as ``<h3>`` spec sections, each a
   run of ``<li><span>label</span><span>value</span></li>`` rows up to
   the next section header:
     - Corn: "Agronomic Characteristics" (Early Vigor / Stalk Strength
       / Root Strength / Stay Green / Drydown / Test Weight / Drought
       Tolerance / Foliar Fungicide / Corn-on-Corn), "Plant
       Characteristics" (Ear Height / Ear Type), "Disease Ratings"
       (Goss's Wilt / Northern Leaf Blight / Anthracnose Stalk Rot /
       Gray Leaf Spot / Tar Spot, etc.).
     - Soybean: "Plant Characteristics" (Relative Maturity / Emergence
       / Plant Height / Plant Type / Flower Color / Pubescence / Pod
       Color / Hilum Color), "Defensive Characteristics & Disease
       Ratings" (SCN Resistance source / Iron Chlorosis / Stress
       Tolerance / Phytophthora Rps gene / Brown Stem Rot / White Mold
       / Sudden Death). "Herbicide Tolerance" + "Placement" sections
       are present but carry no ``<li>`` rows.
   The relative maturity also sits in a "Key Features" ``Maturity``
   row ("113.00 RM" / "3.60 RM"); we read RM/MG from the per-crop
   spec sections first and fall back to that Key Features row.

Rating scale: **numeric, LOWER = BETTER** (1 = best / most
tolerant / most resistant). No explicit on-page legend, so the
direction was confirmed by cross-referencing the Product Overview
prose against the published values across ~12 corn varieties:
hybrids described "very good / superior / excellent stalks and roots"
carry Stalk/Root Strength 1.0–1.5, weaker traits run 3.0–3.5, and no
value approaches 9 (observed range ~1.0–3.5). The soybean disease
panel (Iron Chlorosis / Brown Stem Rot / White Mold / Sudden Death /
Stress Tolerance) reads the same direction (lower = more resistant).
A handful of values are categorical rather than numeric and pass
through verbatim: SCN Resistance source ("PI 88788"), Phytophthora
"Rps 1k", Anthracnose "ASR", plant descriptors ("Medium Tall",
"Flex"). ``NA`` / blank = not rated.

Unlike the Ebbert's scraper (which left ``characteristics_groups``
empty and relied on a verbatim body), we parse the spec sections into
structured ``characteristics_groups`` so the numeric + categorical
ratings land in the embedded chunk and are actually retrievable. The
soybean "Defensive Characteristics & Disease Ratings" section maps to
the DISEASE RATINGS bucket; corn "Agronomic Characteristics" +
"Plant Characteristics" map to AGRONOMIC CHARACTERISTICS.

Output:
  corpus/latham/<source_key>.md
  corpus/latham/<source_key>.json

source_key: ``latham-<slug>`` lowercased, e.g. ``latham-l-3632-e3``.

CLI:
  python -m scrape.sources.latham --crop corn --limit 5
  python -m scrape.sources.latham --force
  python -m scrape.sources.latham --product latham-l-3632-e3
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, Tag

# Version stamp recorded in every JSON sidecar ("scraper_version").
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://www.lathamseeds.com"
# Base URL of the site's WordPress REST API (wp/v2 namespace).
WP = f"{BASE}/wp-json/wp/v2"

# variety_crop taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. "alfalfa"
# is in the taxonomy but has zero published varieties; everything not
# listed here is out of scope for the row-crop advisor. (No wheat.)
CROP_TYPES = {
    "corn": "corn",
    "soybean": "soybeans",
}

# robots.txt declares no Crawl-delay and only blocks /wp-admin/; we
# stay polite. ~265 detail pages at 1.5 s/req finishes in ~7 min.
REQ_INTERVAL_SEC = 1.5

# Human-readable note on the rating direction. Written verbatim into
# each markdown header ("Rating scale") and each JSON sidecar
# ("_scale_direction"), so treat the text as output, not as a comment.
RATING_SCALE_DIRECTION = (
    "numeric ratings ~1-9 where LOWER = BETTER (1 = best / most "
    "tolerant / most resistant); confirmed by cross-referencing "
    "Product Overview prose vs values (top-rated stalks/roots cluster "
    "1.0-1.5, weak traits 3.0-3.5, none approach 9). Categorical "
    "values pass through verbatim (SCN source 'PI 88788', "
    "Phytophthora 'Rps 1k', Anthracnose 'ASR', 'Medium Tall', 'Flex'). "
    "NA/blank = not rated."
)

# Detail-page spec section headers (<h3>) -> characteristics_groups
# label. DISEASE RATINGS -> disease framing, AGRONOMIC CHARACTERISTICS
# -> agronomic framing in the chunker; anything else passes through as
# its own titled section. Both corn and soy headers are covered. The
# soybean "Defensive Characteristics & Disease Ratings" panel mixes
# disease 1-9 ratings with categorical resistance source/genes — we
# bucket the whole panel as DISEASE so it embeds under disease framing.
# Keys are compared against the lowercased, whitespace-collapsed header
# text, so they must stay lowercase.
SPEC_SECTIONS = {
    "agronomic characteristics": "AGRONOMIC CHARACTERISTICS",
    "plant characteristics": "AGRONOMIC CHARACTERISTICS",
    "disease ratings": "DISEASE RATINGS",
    "defensive characteristics & disease ratings": "DISEASE RATINGS",
    "defensive characteristics and disease ratings": "DISEASE RATINGS",
}

# REPO_ROOT is parents[2] of this file's resolved path. The corpus root
# defaults to <REPO_ROOT>/corpus; a non-empty CORPUS_ROOT environment
# variable overrides it. This scraper writes under <corpus root>/latham.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "latham"

# Module logger; handlers/level are configured in main().
log = logging.getLogger("scrape.latham")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Polite ``requests`` session: enforces a minimum spacing between
    requests and retries with backoff on network errors, HTTP 429 and
    HTTP 5xx.

    Latham's catalog is ~265 detail pages so 1.5 s/req finishes the
    full scrape in ~7 min.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 means "none yet".
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last
        request, then stamp the current time as the new "last"."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying transient failures.

        Network errors, HTTP 429 and HTTP 5xx are retried with
        exponential backoff (capped at 30 s, plus jitter); a numeric
        ``Retry-After`` header takes precedence when present.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: absolute URL.
            max_retries: total number of attempts; values below 1 are
                treated as 1 so there is always an outcome to report.
            timeout: per-attempt timeout in seconds.
            **kw: forwarded to ``requests.Session.request``.

        Returns:
            The first non-retryable response, or the last 429/5xx
            response when every attempt was exhausted (callers decide
            via ``raise_for_status``).

        Raises:
            requests.RequestException: when the FINAL attempt failed at
                the network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: requests.RequestException | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # The reported outcome must be that of the most recent
                # attempt, so forget any response seen earlier.
                resp, last_exc = None, exc
                if is_last:
                    log.warning("network error on %s %s: %s — giving up "
                                "after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # We got a response: an exception from an earlier attempt is
            # stale and must not be raised after the loop.
            last_exc = None
            if resp.status_code != 429 and not (500 <= resp.status_code < 600):
                return resp
            if is_last:
                # No point sleeping when there is no attempt left.
                log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                            resp.status_code, method, url, attempts)
                break
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET ``url`` and return the decoded JSON body.

        Raises:
            requests.HTTPError: for a 4xx/5xx final response.
        """
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()


# --------------------------------------------------------------------- model


@dataclass
class LathamVariety:
    """One scraped Latham variety: identity, maturity, traits and the
    parsed spec groups that feed the markdown body and JSON sidecar."""

    source_key: str                   # "latham-<slug>", lowercased; output file stem
    source_url: str                   # REST "link", else BASE/products/<slug>/
    crop: str                         # chunker value: corn / soybeans
    product_name: str = ""            # "L 3632 E3"
    relative_maturity: int | None = None     # corn (days)
    maturity_group: float | None = None      # soy
    release_year: str | None = None   # first variety_year term name, if any
    trait_stack: list[str] = field(default_factory=list)  # variety_trait term names
    positioning: str | None = None    # marketing blurb from the detail page, if found
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery (REST)


def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Fetch every term of a WP taxonomy and return ``{term_id: label}``.

    Pages through the REST collection 100 terms at a time. The label is
    the term's name, falling back to its slug and finally to the id as
    a string. Paging stops on an HTTP 400 (requested page is past the
    end), an empty page, or a short page.
    """
    names: dict[int, str] = {}
    page_no = 1
    while True:
        resp = http.get(
            f"{WP}/{taxonomy}?per_page=100&page={page_no}&_fields=id,name,slug")
        if resp.status_code == 400:   # past last page
            break
        resp.raise_for_status()
        terms = resp.json()
        if not terms:
            break
        names.update(
            (term["id"],
             term.get("name") or term.get("slug") or str(term["id"]))
            for term in terms
        )
        if len(terms) < 100:
            # A short page is necessarily the last one.
            break
        page_no += 1
    return names


def _crop_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``{slug: term_id}`` for the ``variety_crop`` taxonomy
    (single request, up to 100 terms)."""
    terms = http.get_json(f"{WP}/variety_crop?per_page=100&_fields=id,slug")
    return {term["slug"]: term["id"] for term in terms}


def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Enumerate the REST variety records for the in-scope row crops.

    For each crop in ``CROP_TYPES`` (optionally restricted to
    ``only_crop``, a chunker crop value) the ``varieties`` collection is
    paged 100 records at a time. Each record is tagged with a ``_crop``
    key holding the chunker crop value, and a record id is only kept
    the first time it is seen.
    """
    slug_to_term = _crop_slug_to_id(http)
    field_filter = "&_fields=id,slug,title,link,variety_trait,variety_year"
    found: list[dict] = []
    known_ids: set[int] = set()
    for crop_slug, crop in CROP_TYPES.items():
        if only_crop and crop != only_crop:
            continue
        term_id = slug_to_term.get(crop_slug)
        if term_id is None:
            log.warning("variety_crop %r not found in taxonomy — skipping", crop_slug)
            continue
        page_no = 1
        while True:
            resp = http.get(
                f"{WP}/varieties?variety_crop={term_id}&per_page=100&page={page_no}"
                + field_filter)
            if resp.status_code == 400:
                # Requested page is past the end of the collection.
                break
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            for item in batch:
                if item["id"] not in known_ids:
                    known_ids.add(item["id"])
                    item["_crop"] = crop
                    found.append(item)
            if len(batch) < 100:
                break
            page_no += 1
        log.info("variety_crop %-8s (%s): cumulative %d", crop_slug, crop, len(found))
    return found


# --------------------------------------------------------------------- detail parse


# First run of digits with an optional decimal part, e.g. the "113.00"
# in "113.00 RM" or the "3.60" in "3.60 RM".
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")


def _clean(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()


def _two_span(li: Tag) -> tuple[str, str] | None:
    """Return ``(label, value)`` when *li* holds exactly two ``<span>``
    descendants and both have non-empty cleaned text; otherwise ``None``."""
    texts = []
    for span in li.find_all("span"):
        texts.append(_clean(span.get_text(" ", strip=True)))
    if len(texts) != 2:
        return None
    label, value = texts
    if not label or not value:
        return None
    return label, value


def _section_rows(header: Tag) -> list[tuple[str, str]]:
    """Gather the two-span ``<li>`` rows that follow *header* in
    document order, stopping at the next ``h2``/``h3`` (exclusive)."""
    collected: list[tuple[str, str]] = []
    for node in header.find_all_next():
        if node is not header and node.name in ("h2", "h3"):
            # Next section starts here.
            break
        if not (isinstance(node, Tag) and node.name == "li"):
            continue
        row = _two_span(node)
        if row:
            collected.append(row)
    return collected


def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Turn the known spec sections into ``{label, items}`` groups.

    Every ``h2``/``h3`` whose lowercased text is a ``SPEC_SECTIONS`` key
    contributes its rows as ``{"characteristic", "value"}`` items.
    Sections without rows are ignored. Sections mapping to the same
    label are merged into one group (corn maps both Agronomic and Plant
    Characteristics to AGRONOMIC), so the chunker sees one coherent
    bucket; groups appear in order of first occurrence.
    """
    groups: list[dict] = []
    by_label: dict[str, dict] = {}
    for header in soup.find_all(["h2", "h3"]):
        title = _clean(header.get_text(" ", strip=True)).lower()
        label = SPEC_SECTIONS.get(title)
        if not label:
            continue
        rows = _section_rows(header)
        if not rows:
            continue
        bucket = by_label.get(label)
        if bucket is None:
            bucket = {"label": label, "items": []}
            by_label[label] = bucket
            groups.append(bucket)
        bucket["items"].extend(
            {"characteristic": name, "value": value} for name, value in rows
        )
    return groups


def _parse_maturity_from_groups(groups: list[dict], crop: str,
                                ) -> tuple[int | None, float | None]:
    """Read the maturity figure out of already-parsed spec groups.

    The first item labelled "Relative Maturity" or "Maturity"
    (case-insensitive) whose value contains a number wins. For
    ``crop == "corn"`` the number is truncated to an int and returned as
    ``(rm, None)``; for any other crop it is returned as ``(None, mg)``.
    ``(None, None)`` means no usable row was found.
    """
    wanted = ("relative maturity", "maturity")
    for group in groups:
        for item in group["items"]:
            if item["characteristic"].strip().lower() not in wanted:
                continue
            match = _MATURITY_RE.search(item["value"])
            if match is None:
                # Labelled row without a number (e.g. "NA"): keep looking.
                continue
            number = float(match.group(1))
            if crop == "corn":
                return int(number), None
            return None, number
    return None, None


def _parse_maturity_keyfeatures(soup: BeautifulSoup, crop: str,
                                ) -> tuple[int | None, float | None]:
    """Fallback maturity lookup over every ``<li>`` on the page.

    Finds the first two-span row labelled "Maturity" (the 'Key Features'
    block carries one, e.g. '113.00 RM' / '3.60 RM') that contains a
    number. Returns ``(rm, None)`` for corn (number truncated to int),
    ``(None, mg)`` otherwise, or ``(None, None)`` when nothing matches.
    """
    for li in soup.find_all("li"):
        row = _two_span(li)
        if not row:
            continue
        label, raw = row
        if label.strip().lower() != "maturity":
            continue
        match = _MATURITY_RE.search(raw)
        if match is None:
            continue
        number = float(match.group(1))
        if crop == "corn":
            return int(number), None
        return None, number
    return None, None


def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Best-effort marketing blurb.

    Returns the first ``<p>`` of at least 40 characters that follows a
    'Product Overview' or 'Hybrid Advantages' heading and precedes the
    next ``h2``/``h3``; ``None`` when no such paragraph exists.
    """
    headings = ("product overview", "hybrid advantages")
    for header in soup.find_all(["h2", "h3"]):
        if _clean(header.get_text(" ", strip=True)).lower() in headings:
            for node in header.find_all_next():
                if node.name in ("h2", "h3") and node is not header:
                    # Left the section without finding a paragraph.
                    break
                if isinstance(node, Tag) and node.name == "p":
                    text = _clean(node.get_text(" ", strip=True))
                    if len(text) >= 40:
                        return text
    return None


def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str],
                 year_names: dict[int, str]) -> LathamVariety:
    """Fetch one variety's detail page and build its ``LathamVariety``.

    Args:
        http: rate-limited session used for the page fetch.
        rec: REST variety record as returned by ``discover`` (must carry
            ``_crop`` and ``slug``; ``link``, ``title``, ``variety_trait``
            and ``variety_year`` are optional).
        trait_names: ``variety_trait`` term id -> name.
        year_names: ``variety_year`` term id -> name.

    Returns:
        The parsed variety. ``groups`` may be empty when the page has no
        recognised spec sections.

    Raises:
        requests.HTTPError: when the detail page answers 4xx/5xx.
    """
    # Local import: ``html`` is not imported at module level.
    from html import unescape

    crop = rec["_crop"]
    slug = rec["slug"]
    url = rec.get("link") or f"{BASE}/products/{slug}/"
    # WP REST serves ``title.rendered`` as HTML, so characters such as
    # "&" or typographic quotes arrive as entities (&amp;, &#8217;).
    # Decode them so product_name / product_id hold the real text.
    raw_title = (rec.get("title") or {}).get("rendered", "")
    name = _clean(unescape(raw_title or "")) or slug.upper()
    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # Drop script/style/noscript so their contents can never leak into
    # the parsed rows or the positioning text.
    for t in soup(["script", "style", "noscript"]):
        t.decompose()

    groups = _parse_groups(soup)
    rm, mg = _parse_maturity_from_groups(groups, crop)
    if rm is None and mg is None:
        rm, mg = _parse_maturity_keyfeatures(soup, crop)
    positioning = _parse_positioning(soup)
    # Term names come from the REST API too and may carry entities.
    # Term ids missing from the lookup tables are skipped.
    traits = [unescape(trait_names[t]) for t in (rec.get("variety_trait") or [])
              if t in trait_names]
    years = [year_names[t] for t in (rec.get("variety_year") or [])
             if t in year_names]
    release_year = years[0] if years else None

    return LathamVariety(
        source_key=f"latham-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        release_year=release_year,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )


# --------------------------------------------------------------------- render


def render_markdown(v: LathamVariety) -> str:
    """Render the markdown body for one variety.

    Layout: an H1 with the product name, a bullet list of identity
    facts (vendor, brand, crop, maturity when known, traits when any,
    source URL, rating scale, service area), the optional positioning
    blurb in italics between rules, then one H2 section per
    characteristics group with a bullet per item. Falsy item values
    render as an em dash.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Latham Hi-Tech Seeds (independent family-owned, Alexander, IA)",
        "- **Brand:** Latham Hi-Tech Seeds",
        f"- **Crop:** {crop_label}",
    ]
    # Only the maturity figure that matches the crop is shown.
    if v.relative_maturity is not None and v.crop == "corn":
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.maturity_group is not None and v.crop == "soybeans":
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Latham dealer network — Upper Midwest "
        "(IA/MN/WI/IL/ND/SD/NE)",
        "",
    ])
    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    lines.extend(["---", ""])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)


def write_variety(v: LathamVariety, body_md: str) -> None:
    """Write ``<source_key>.md`` and its ``<source_key>.json`` sidecar
    into ``CORPUS_DIR`` (created if missing), both UTF-8.

    The markdown is written first. The sidecar carries the structured
    fields of *v* plus fixed vendor metadata, the rating-scale note and
    a UTC ``fetched_at`` timestamp taken when the sidecar is built.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    stem = v.source_key
    (CORPUS_DIR / f"{stem}.md").write_text(body_md, encoding="utf-8")

    company = "Latham Hi-Tech Seeds"
    dealer_network = {
        "product_list_name": "Latham dealer network (Upper Midwest — "
                             "IA/MN/WI/IL/ND/SD/NE)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    # Key order is preserved by json.dumps, so keep it stable.
    sidecar = {
        "source": "latham",
        "source_key": stem,
        "vendor": company,
        "brand": company,
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    (CORPUS_DIR / f"{stem}.json").write_text(payload, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Run the full scrape: enumerate varieties, fetch and parse each
    detail page, and write the markdown + JSON pair.

    Args:
        limit: stop after this many records have been visited (skipped
            and failed records count towards it); ``None`` = no limit.
        force: re-fetch even when the markdown file already exists.
        only_crop: restrict discovery to one chunker crop value.
        only_product: restrict to one variety, given as a source_key
            ("latham-<slug>") or a bare slug (case-insensitive).

    Returns:
        0 on completion, 2 when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "variety_trait")
    year_names = _taxonomy_map(http, "variety_year")
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"latham-{r['slug'].lower()}" == key
                   or r["slug"].lower() == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    total = len(records)
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"latham-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names, year_names)
        except requests.RequestException as exc:
            # RequestException also covers HTTPError. A page that
            # exhausts its retries on a network error is skipped like an
            # HTTP failure instead of aborting the remaining records.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, total, source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks: a legitimate 0 / 0.0 must not print "-".
        if v.relative_maturity is not None:
            maturity: Any = v.relative_maturity
        elif v.maturity_group is not None:
            maturity = v.maturity_group
        else:
            maturity = "-"
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop, maturity,
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d failed=%d "
             "empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"],
             counts["failed"], counts["empty"], total)
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser (--limit, --force, --crop,
    --product, --log-level). ``--log-level`` defaults to the LOG_LEVEL
    environment variable, else INFO."""
    crop_choices = sorted(set(CROP_TYPES.values()))
    default_level = os.environ.get("LOG_LEVEL", "INFO")

    parser = argparse.ArgumentParser(
        prog="scrape.sources.latham",
        description="Scrape Latham Hi-Tech Seeds (independent Upper-Midwest "
                    "brand) — corn / soybeans via the WP REST API + detail pages.",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=crop_choices,
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.")
    parser.add_argument("--log-level", default=default_level)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv* (``sys.argv[1:]`` when ``None``),
    configure logging to stderr, and return ``run``'s exit code."""
    ns = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=ns.log_level.upper(),
    )
    return run(
        limit=ns.limit,
        force=ns.force,
        only_crop=ns.crop,
        only_product=ns.product,
    )


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())