seed-mcp/scrape/sources/proharvest.py

"""ProHarvest Seeds scraper — independent regional brand (Hindsboro, IL).

Source: ``proharvestseeds.com`` — WordPress site exposing a public,
no-auth REST API. robots.txt is permissive (only ``/?s=``, ``/search/``,
``/dealer-files/*``, ``/dealer-section/*`` disallowed; the catalog +
``/wp-json/`` are open). Independent family-owned seed company; corn /
soybeans / wheat (plus forage / cover-crop lines that are out of scope
for the row-crop advisor).

Two-step ingestion:

1. **Enumerate** via the WP REST API. ``/wp/v2/seed`` is the variety
   custom-post-type; ``/wp/v2/seed-type`` is the crop taxonomy. We pull
   every variety whose seed-type is one of the row-crop terms
   (corn-hybrid / soybean / wheat) — ignoring alfalfa / forage / grass /
   cover-crop / sweet-corn terms. The REST payload gives the canonical
   id / slug / title / permalink, but ``acf`` and ``content`` are NOT
   registered to REST (both come back empty), so the ratings have to
   come from the detail page.

2. **Parse the detail page.** Each ``/seed/<slug>/`` page server-renders
   the agronomic data as ``<h2>`` spec sections, each a flat run of
   ``<strong>label</strong><div>value</div>`` pairs (General
   Characteristics / Agronomic Features / Disease Tolerance / Soil
   Adaptability / Nitrogen Application/Timing / Recommended Seeding
   Rates). The relative maturity sits in an ``<h1>Maturity: 111
   Days</h1>`` heading.

Rating scales are **mixed** and preserved verbatim (the chunker never
fabricates a value):
  - Disease Tolerance: **1-9 numeric** (9 = best / most tolerant, per
    industry norm; ``NA`` = not rated). Direction is the same as
    Bayer/NK so no flip is needed.
  - General Characteristics / Agronomic Features: **qualitative**
    (Excellent / Very Good / Good / Average / …) with a few raw numerics
    (GDD, Kernel Rows).
  - Soil Adaptability: ``HR`` (highly recommended) / ``R`` (recommended)
    / etc.

Unlike the Ebbert's scraper (which left ``characteristics_groups`` empty
and relied on a verbatim body), we parse the spec sections into
structured ``characteristics_groups`` so the qualitative + numeric
ratings land in the embedded chunk and are actually retrievable.

Output:
  corpus/proharvest/<source_key>.md
  corpus/proharvest/<source_key>.json

source_key: ``proharvest-<slug>`` lowercased, e.g. ``proharvest-81p11``.

CLI:
  python -m scrape.sources.proharvest --crop corn --limit 5
  python -m scrape.sources.proharvest --force
  python -m scrape.sources.proharvest --product proharvest-81p11
"""

from __future__ import annotations

import argparse
import html
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Version stamp recorded in every sidecar under "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root, and the WordPress REST v2 base the discovery endpoints hang off.
BASE = "https://proharvestseeds.com"
WP = f"{BASE}/wp-json/wp/v2"

# seed-type taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. Everything
# not listed here (alfalfa / forage / grass / cover-crop / sweet-corn /
# blends) is out of scope for the row-crop advisor.
CROP_TYPES = {
    "corn-hybrid": "corn",
    "soybean": "soybeans",
    "wheat": "wheat",
}

# Minimum spacing, in seconds, between consecutive requests.
# robots.txt declares no Crawl-delay for "*", but we stay polite — the
# row-crop catalog is only ~120 detail pages.
REQ_INTERVAL_SEC = 1.5

# Human-readable summary of the mixed rating scales. Emitted verbatim in
# the markdown header ("Rating scale") and in the sidecar
# ("_scale_direction").
RATING_SCALE_DIRECTION = (
    "disease 1-9, 9=best/most-tolerant, NA=not rated; "
    "agronomic/general qualitative (Excellent/Very Good/Good/Average); "
    "soil HR=highly recommended/R=recommended"
)

# Detail-page <h2> spec sections we extract, in display order. The
# value maps the page header to a characteristics_groups label the
# chunker buckets: DISEASE RATINGS -> disease framing, AGRONOMIC
# CHARACTERISTICS -> agronomic framing; the rest pass through verbatim
# as their own titled section (still embedded + retrievable).
SPEC_SECTIONS = {
    "General Characteristics": "GENERAL CHARACTERISTICS",
    "Agronomic Features": "AGRONOMIC CHARACTERISTICS",
    "Disease Tolerance": "DISEASE RATINGS",
    "Soil Adaptability": "SOIL ADAPTABILITY",
    "Nitrogen Application/Timing": "NITROGEN APPLICATION/TIMING",
    "Recommended Seeding Rates": "RECOMMENDED SEEDING RATES",
}

# Output location. REPO_ROOT is parents[2] of this file's resolved path;
# a non-empty CORPUS_ROOT environment variable overrides the default
# <REPO_ROOT>/corpus. All ProHarvest output lands in the "proharvest"
# subdirectory.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "proharvest"

# Module logger; main() configures the handler and level.
log = logging.getLogger("scrape.proharvest")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Polite ``requests.Session`` wrapper with request spacing and backoff.

    Every request waits until at least ``interval`` seconds have passed
    since the previous one, and network errors / HTTP 429 / HTTP 5xx are
    retried with jittered exponential backoff. ProHarvest's row-crop
    catalog is small (~120 detail pages) so 1.5 s/req still finishes in a
    few minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the scraper User-Agent.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request slot handed out.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to honour ``interval``, then stamp the slot."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying transient failures.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Absolute URL to fetch.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so the call always makes one real attempt.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``requests.Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when
            every attempt was retryable and the final one produced a
            response — that final response, so the caller can inspect it
            or call ``raise_for_status()``.

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Track only the latest outcome so the result reflects the
                # final attempt rather than a stale earlier one.
                resp, last_exc = None, exc
                if is_last:
                    log.warning("network error on %s %s: %s — giving up after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            last_exc = None
            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if is_last:
                # Nothing left to retry: skip the pointless backoff sleep.
                log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                            resp.status_code, method, url, attempts)
                break
            # Honour a numeric Retry-After; otherwise jittered exponential
            # backoff capped at 30 s.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        if last_exc is not None:
            raise last_exc
        # At least one attempt ran and the last one did not raise, so resp
        # holds the final (retryable-status) response here.
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET ``url``, raise on an HTTP error status, and decode the JSON body."""
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()


# --------------------------------------------------------------------- model


@dataclass
class PHVariety:
    """One ProHarvest variety, combining its REST record and detail page.

    Built by ``parse_detail`` and consumed by ``render_markdown`` and
    ``write_variety``. ``_parse_maturity`` fills at most one of the three
    maturity fields, chosen by ``crop``.
    """

    # "proharvest-<slug>" lowercased; also the output file stem.
    source_key: str
    # Detail-page URL the ratings were parsed from.
    source_url: str
    crop: str                         # chunker value: corn / soybeans / wheat
    product_name: str = ""            # "81P11"
    relative_maturity: int | None = None     # corn (days)
    maturity_group: float | None = None      # soy
    wheat_maturity: str | None = None         # wheat qualitative
    # Trait names resolved from the record's seed-trait term ids.
    trait_stack: list[str] = field(default_factory=list)
    # Best-effort marketing blurb from the detail page, if one was found.
    positioning: str | None = None
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery (REST)


def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Build a ``term_id -> display name`` lookup for one WP taxonomy.

    Walks the REST collection 100 terms at a time. Each term is named by
    its ``name``, falling back to its ``slug`` and finally to the id
    rendered as a string.
    """
    names: dict[int, str] = {}
    page_no = 1
    while True:
        resp = http.get(
            f"{WP}/{taxonomy}?per_page=100&page={page_no}&_fields=id,name,slug")
        if resp.status_code == 400:
            # Requested a page beyond the end of the collection.
            return names
        resp.raise_for_status()
        terms = resp.json()
        if not terms:
            return names
        for term in terms:
            term_id = term["id"]
            names[term_id] = term.get("name") or term.get("slug") or str(term_id)
        if len(terms) < 100:
            # A short page is the last page.
            return names
        page_no += 1


def _type_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``slug -> term_id`` for the ``seed-type`` crop taxonomy.

    Pages through the collection 100 terms at a time so a term beyond the
    first page is not silently missed. An HTTP error on the first page
    still raises ``requests.HTTPError``; a 400 on a later page is treated
    as "past the last page".
    """
    out: dict[str, int] = {}
    page = 1
    while True:
        r = http.get(f"{WP}/seed-type?per_page=100&page={page}&_fields=id,slug")
        if page > 1 and r.status_code == 400:   # past last page
            break
        r.raise_for_status()
        terms = r.json()
        if not terms:
            break
        for t in terms:
            out[t["slug"]] = t["id"]
        if len(terms) < 100:   # short page == last page
            break
        page += 1
    return out


def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Enumerate the REST ``seed`` records for the in-scope row crops.

    Each returned record is the REST payload (id / slug / title / link /
    seed-trait) with an extra ``_crop`` key holding the chunker crop
    value. When ``only_crop`` is given, only that crop is fetched.
    """
    term_ids = _type_slug_to_id(http)
    found: list[dict] = []
    for type_slug, crop in CROP_TYPES.items():
        wanted = (not only_crop) or crop == only_crop
        if not wanted:
            continue
        term_id = term_ids.get(type_slug)
        if term_id is None:
            log.warning("seed-type %r not found in taxonomy — skipping", type_slug)
            continue
        page_no = 0
        while True:
            page_no += 1
            endpoint = (f"{WP}/seed?seed-type={term_id}&per_page=100&page={page_no}"
                        "&_fields=id,slug,title,link,seed-trait")
            resp = http.get(endpoint)
            if resp.status_code == 400:
                # Asked for a page past the end of this crop's listing.
                break
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            for record in batch:
                record["_crop"] = crop
                found.append(record)
            if len(batch) < 100:
                # Short page: nothing further to fetch for this crop.
                break
        log.info("seed-type %-12s (%s): cumulative %d", type_slug, crop, len(found))
    return found


# --------------------------------------------------------------------- detail parse


# Unsigned integer or decimal number. _parse_maturity uses .search() on the
# "Maturity: …" heading text to capture the first such number in it.
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")


def _clean(s: str) -> str:
    return re.sub(r"\s+", " ", s or "").strip()


def _direct_text(el: Tag) -> str:
    """Return the cleaned text held directly by ``el``.

    Only the element's own string children are joined; text inside nested
    tags is ignored.
    """
    own_strings = [child for child in el.children
                   if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))


def _parse_maturity(soup: BeautifulSoup, crop: str) -> tuple[int | None, float | None, str | None]:
    """Read the maturity from the first 'Maturity …' heading on the page.

    Returns a ``(rm, mg, wheat_maturity)`` triple in which at most one
    slot is set: integer relative maturity for corn, float maturity group
    for soybeans, and the qualitative phrase after the colon for any
    other crop (wheat). All three are ``None`` when no ``<h1>``/``<h2>``/
    ``<h3>`` heading starts with "Maturity".
    """
    heading_texts = (h.get_text(" ", strip=True)
                     for h in soup.find_all(["h1", "h2", "h3"]))
    heading = next(
        (txt for txt in heading_texts if re.match(r"^Maturity\b", txt, re.I)),
        None,
    )
    if not heading:
        return None, None, None

    number = _MATURITY_RE.search(heading)
    if crop == "corn":
        # Parse through float so a decimal heading still truncates to days.
        rm = int(float(number.group(1))) if number else None
        return rm, None, None
    if crop == "soybeans":
        mg = float(number.group(1)) if number else None
        return None, mg, None

    # wheat — keep the qualitative phrase that follows "Maturity:".
    _, colon, tail = heading.partition(":")
    phrase = tail.strip() if colon else heading
    return None, None, (phrase or None)


def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Turn each recognised spec ``<h2>`` section into a ratings group.

    A section is the flat run of ``<strong>label</strong><div>value</div>``
    pairs between its ``<h2>`` and the next ``<h2>``. The result is a list
    of ``{label, items: [{characteristic, value}]}`` dicts, where ``label``
    comes from ``SPEC_SECTIONS``. Headers not listed there, and sections
    that yield no items, are omitted.
    """
    parsed: list[dict] = []
    for heading in soup.find_all("h2"):
        title = _clean(heading.get_text(" ", strip=True))
        group_label = SPEC_SECTIONS.get(title)
        if not group_label:
            continue

        # Flatten the section into ("k", text) / ("v", text) tokens:
        # <strong> contributes a key, a <div> with direct text a value.
        tokens: list[tuple[str, str]] = []
        for node in heading.find_all_next():
            if node.name == "h2":
                break
            if not isinstance(node, Tag):
                continue
            if node.name == "strong":
                kind, text = "k", _clean(node.get_text(" ", strip=True))
            elif node.name == "div":
                kind, text = "v", _direct_text(node)
            else:
                continue
            if text:
                tokens.append((kind, text))

        # Every key becomes an item. Its value is the token immediately
        # after it when that token is a value; otherwise the value is the
        # empty string. Values with no key directly before them are not
        # emitted.
        rows: list[dict] = []
        for idx, (kind, text) in enumerate(tokens):
            if kind != "k":
                continue
            following = tokens[idx + 1] if idx + 1 < len(tokens) else None
            has_value = following is not None and following[0] == "v"
            rows.append({
                "characteristic": text,
                "value": following[1] if has_value else "",
            })

        if rows:
            parsed.append({"label": group_label, "items": rows})
    return parsed


def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Best-effort marketing blurb for the variety.

    Scans forward from the page's first ``<h1>`` and returns the first
    ``<p>`` whose cleaned text is at least 40 characters long. The scan
    stops at the first ``<h2>``. Returns ``None`` when there is no
    ``<h1>`` or no qualifying paragraph.
    """
    title = soup.find("h1")
    if not title:
        return None
    for node in title.find_all_next():
        if node.name == "h2":
            # Reached the first section heading without finding a blurb.
            return None
        if not (isinstance(node, Tag) and node.name == "p"):
            continue
        blurb = _clean(node.get_text(" ", strip=True))
        if len(blurb) >= 40:
            return blurb
    return None


def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str]) -> PHVariety:
    """Fetch one variety's detail page and assemble its ``PHVariety``.

    Args:
        http: Rate-limited session used to fetch the detail page.
        rec: REST seed record from ``discover`` (needs ``_crop`` and
            ``slug``; ``link``, ``title`` and ``seed-trait`` are optional).
        trait_names: ``term_id -> name`` lookup for the seed-trait
            taxonomy; ids missing from it are dropped.

    Returns:
        The parsed variety.

    Raises:
        requests.HTTPError: If the detail page responds with an error
            status.
    """
    crop = rec["_crop"]
    slug = rec["slug"]
    url = rec.get("link") or f"{BASE}/seed/{slug}/"
    # title.rendered is display HTML, so decode character references
    # (e.g. "&#8211;", "&amp;") before using it as the product name.
    raw_title = (rec.get("title") or {}).get("rendered", "")
    name = _clean(html.unescape(raw_title or "")) or slug.upper()
    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    rm, mg, wheat_mat = _parse_maturity(soup, crop)
    groups = _parse_groups(soup)
    positioning = _parse_positioning(soup)
    traits = [trait_names[t] for t in (rec.get("seed-trait") or []) if t in trait_names]

    return PHVariety(
        source_key=f"proharvest-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        wheat_maturity=wheat_mat,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )


# --------------------------------------------------------------------- render


def render_markdown(v: PHVariety) -> str:
    """Render a variety as the markdown body stored in the corpus.

    Layout: a title, a bullet list of identity facts (vendor, brand,
    crop, the maturity line for that crop, traits, source, rating scale,
    service area), the optional italic positioning blurb, then one ``##``
    section per characteristics group. An empty rating value is rendered
    as an em-dash.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** ProHarvest Seeds (independent regional brand)",
        "- **Brand:** ProHarvest Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # At most one maturity bullet, chosen by crop.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            lines.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_maturity:
            lines.append(f"- **Maturity:** {v.wheat_maturity}")

    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Independent dealer network — Eastern/Central Corn Belt (IL/IN/OH/MO/IA/KS/NE)",
        "",
    ])

    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    lines.extend(["---", ""])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)


def write_variety(v: PHVariety, body_md: str) -> None:
    """Write the JSON sidecar and the markdown body for one variety.

    Files land in ``CORPUS_DIR`` as ``<source_key>.json`` and
    ``<source_key>.md``. The sidecar is written first and the markdown
    last: ``run`` treats an existing ``.md`` as "already scraped", so the
    markdown must only appear once the sidecar is safely on disk.
    Otherwise a crash between the two writes would leave the variety
    permanently skipped without a sidecar.

    Args:
        v: Parsed variety.
        body_md: Markdown body, as produced by ``render_markdown``.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    # Wheat maturity is qualitative. Surface it through relative_maturity
    # (as a string) so the chunker's wheat "Maturity {rm}" branch renders
    # it; corn keeps its integer RM and soy leaves this field None.
    relative_maturity: int | str | None = v.relative_maturity
    if v.crop == "wheat" and v.wheat_maturity:
        relative_maturity = v.wheat_maturity

    sidecar = {
        "source": "proharvest",
        "source_key": v.source_key,
        "vendor": "ProHarvest Seeds",
        "brand": "ProHarvest Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "ProHarvest dealer network (Eastern/Central Corn Belt — IL/IN/OH/MO/IA/KS/NE)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    # Markdown last: its presence is the "done" marker that run() checks.
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties, fetch their detail pages, and write the corpus.

    Args:
        limit: Stop after processing this many records (skipped and
            failed records count); ``None`` means all.
        force: Re-fetch even when the markdown file already exists.
        only_crop: Restrict discovery to one chunker crop value.
        only_product: Restrict to one variety, by source_key or slug.

    Returns:
        Process exit code: 2 when ``only_product`` matched nothing,
        otherwise 0.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "seed-trait")
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"proharvest-{r['slug'].lower()}" == key
                   or r["slug"].lower() == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"proharvest-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names)
        except requests.RequestException as exc:
            # Covers HTTP error statuses and network failures that
            # survived the retries, so one bad page cannot abort the
            # whole scrape.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, len(records), source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Pick the first maturity that is set; compare against None so a
        # legitimate zero (e.g. maturity group 0.0) is still shown.
        maturity = next(
            (m for m in (v.relative_maturity, v.maturity_group, v.wheat_maturity)
             if m is not None),
            "-",
        )
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 maturity,
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d failed=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["failed"],
             counts["empty"], len(records))
    return 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper.

    Options, in help order: ``--limit``, ``--force``, ``--crop``,
    ``--product`` and ``--log-level`` (which defaults to the ``LOG_LEVEL``
    environment variable, or ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.proharvest",
        description=("Scrape ProHarvest Seeds (independent Corn Belt brand) — "
                     "corn / soybeans / wheat via the WP REST API + detail pages."),
    )
    # Declared as a table and registered in order, so --help lists them
    # in this sequence.
    options: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--limit", {
            "type": int,
            "default": None,
            "help": "Stop after processing N varieties (default: all).",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists.",
        }),
        ("--crop", {
            "default": None,
            "choices": sorted(set(CROP_TYPES.values())),
            "help": "Limit to one crop (corn / soybeans / wheat).",
        }),
        ("--product", {
            "default": None,
            "help": "Process a single variety by source_key or slug.",
        }),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO"),
        }),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the scrape.

    ``argv`` defaults to the process arguments when ``None``. Logging goes
    to stderr. Returns the exit code produced by ``run``.
    """
    parser = _build_argparser()
    opts = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )
    return run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
    )


# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    sys.exit(main())