"""Golden Harvest (Syngenta) seed scraper — corn + soybeans.

Source: ``www.goldenharvestseeds.com`` — ASP.NET WebForms site,
server-rendered HTML (no Next.js / SPA). robots.txt is permissive
(no Disallow for /products/).

Discovery: ``/sitemap-ghs-hybrids.xml`` lists ~175 product URLs
under ``/products/corn/`` and ``/products/soybean/``. The sitemap
also references thousands of regional plot-report pages we are NOT
indexing (those are head-to-head trial results, useful but a separate
corpus from variety identity — defer to a future ``gh_plot_reports``
source).

A subset of the sitemap-listed product URLs 302-redirect to the
generic ``/<crop>/product-finder/`` page — those are discontinued
varieties Golden Harvest still lists in the sitemap. We do NOT
follow redirects on product pages; any redirect response
(301/302/303/307/308) → skip.

Per-variety data lives in the page HTML in two shapes:

1. **Tables** — ``<table>`` elements with two columns
   (label, value). For corn pages: plant description, maturity
   (RM days / GDU), planting rate. For soy pages: plant description,
   seed quality + herbicide responses, Phytophthora / SCN genes.

2. **Bar charts** — ``<div class="bar-row">`` elements inside
   ``#dvDiseaseTolerance`` and ``#dvAgronomicChar``. Each bar's
   ``data-percentage="N"`` value encodes the rating: percent / 10
   = rating on the 1-9 scale (9 = best, same as Bayer). Empty
   ``<div class="bar-wrapper">`` content means "no data".

Per CLAUDE.md the recon described GH ratings as a "9-to-1 reversed"
scale, but inspection of the rendered bars + the published "rating
9 = best" convention shows GH uses the canonical 1-9 (9 = best)
direction — same as Bayer. No flip needed. The sidecar's
``_scale_direction`` field declares this so the chunker can be
forward-compatible if a future vendor genuinely reverses.

Tech-sheet PDFs: a link to ``assets.syngentaebiz.com/pdf/techsheets/
<CODE>_YYMMDD.pdf`` appears in the product HTML. The sitemap's
``sitemap-ghs-techsheets.xml`` has STALE date stamps (250331) so we
always read the live URL from the product page, never the sitemap.
PDFs aren't ingested yet (recon flagged they're 14MB each, large)
but the URL is captured in the sidecar for the chunker / future
enrichment.

Output:
  corpus/golden_harvest/<source_key>.md     LLM-visible body
  corpus/golden_harvest/<source_key>.json   sidecar metadata

source_key convention: ``golden_harvest-<sku>`` lowercased, e.g.
``golden_harvest-e085z5`` or ``golden_harvest-gh00864xf``.

CLI:
  python -m scrape.sources.golden_harvest --limit 5
  python -m scrape.sources.golden_harvest --crop corn --limit 20
  python -m scrape.sources.golden_harvest --product e085z5
  python -m scrape.sources.golden_harvest --force
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Recorded in every sidecar as ``scraper_version``.
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site origin; all discovery URLs are built from it.
BASE = "https://www.goldenharvestseeds.com"
# Sitemap that lists the per-variety product pages.
SITEMAP_HYBRIDS = f"{BASE}/sitemap-ghs-hybrids.xml"

# Schema crop name → URL path fragment used to recognise product URLs in
# the sitemap.
CROP_PATHS = {
    "corn": "/products/corn/",
    "soybeans": "/products/soybean/",   # URL uses "soybean", schema uses "soybeans"
}

# Bayer + Golden Harvest publish on identical 1-9 (9 = best) ratings
# despite recon mentioning "9-to-1" — the direction descriptor referred
# to the visual chart order, not the numeric meaning. Verified empirically.
# Emitted both in the markdown header and as the sidecar's
# ``_scale_direction`` field.
RATING_SCALE_DIRECTION = "1-9 (9 = best)"

# Trait suffix → full name. Best-effort mapping from product-code
# suffix, since GH's HTML doesn't expose trait stack as a structured
# field. Maps verified against tech-sheet PDFs + public marketing.
# NOTE(review): ``_TRAIT_SUFFIX_RE`` also matches a bare ``VIP`` suffix,
# which has no entry here, so ``derive_traits`` gives it an empty
# description — confirm whether an entry is missing.
TRAIT_SUFFIX_MAP = {
    # Corn
    "VIP3": "Agrisure Viptera® 3220 E-Z Refuge®",
    "VIP4": "Agrisure Viptera® 4 Trecepta®",
    "GT": "Agrisure GT (glyphosate tolerance)",
    "Z": "Agrisure Duracade® 5222 E-Z Refuge® (above + below-ground)",
    # Soy
    "XF": "XtendFlex® (Roundup Ready 2 Xtend + dicamba + glufosinate)",
    "E3": "Enlist E3® (2,4-D + glyphosate + glufosinate)",
}

# Third ancestor directory of this file (``parents[2]``); with the module
# living at ``scrape/sources/golden_harvest.py`` that is the repo root.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Output root: the ``CORPUS_ROOT`` environment variable when set and
# non-empty, otherwise ``<repo>/corpus``.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this source's ``.md`` / ``.json`` pairs.
CORPUS_DIR = CORPUS_ROOT / "golden_harvest"

# Default minimum spacing, in seconds, between consecutive HTTP requests.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.golden_harvest")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper with request pacing and polite retries.

    Same shape as bayer_seeds' session. Requests are spaced at least
    ``interval`` seconds apart (sleep-based), and network errors as well
    as HTTP 429 / 5xx responses are retried with capped exponential
    backoff. We do NOT follow redirects by default: a 302 from a product
    page means a discontinued variety, which callers skip.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the scraper User-Agent.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request (0.0 = none yet).
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last request."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff_seconds(attempt: int, retry_after: str | None = None) -> float:
        """Return how long to sleep before the next attempt.

        A purely numeric ``Retry-After`` header value is honoured as-is;
        anything else (missing, HTTP-date, unparsable) falls back to
        exponential backoff with jitter, capped at 30 seconds.
        """
        if retry_after and retry_after.isdigit():
            try:
                return float(retry_after)
            except ValueError:
                # ``isdigit`` accepts characters such as "²" that ``float``
                # rejects; treat those like a missing header.
                pass
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        allow_redirects: bool = False,
        **kw: Any,
    ) -> requests.Response:
        """Send a paced request, retrying transient failures.

        Args:
            method: HTTP method name.
            url: Absolute URL to request.
            max_retries: Total number of attempts; values below 1 are
                treated as a single attempt.
            timeout: Per-attempt timeout in seconds.
            allow_redirects: Whether ``requests`` should follow redirects.
            **kw: Passed through to ``requests.Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when
            every attempt was used up and the final one produced a
            response — that final 429/5xx response, so the caller can
            inspect it or call ``raise_for_status``.

        Raises:
            requests.RequestException: If the final attempt failed with a
                network-level error.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(
                    method, url, timeout=timeout,
                    allow_redirects=allow_redirects, **kw,
                )
            except requests.RequestException as exc:
                resp, last_exc = None, exc
                if final:
                    # No further attempt follows, so do not sleep.
                    log.warning("network error on %s %s: %s — giving up after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = self._backoff_seconds(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so an earlier network error is stale.
            last_exc = None
            if resp.status_code != 429 and not 500 <= resp.status_code < 600:
                return resp
            if final:
                log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                            resp.status_code, method, url, attempts)
                break
            backoff = self._backoff_seconds(attempt, resp.headers.get("Retry-After"))
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        if last_exc is not None:
            raise last_exc
        # Reaching here means the final attempt produced a response.
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class GHProduct:
    """One Golden Harvest variety as parsed from its product page.

    Instances are filled in by ``fetch_product_detail`` and consumed by
    ``render_markdown`` and ``write_product``.
    """

    # ``golden_harvest-<url tail>``; also used as the output file stem.
    source_key: str
    # Product page URL the record was scraped from.
    source_url: str
    crop: str                                          # "corn" | "soybeans"
    # Taken from the page's <h1>, falling back to the <title>.
    product_name: str = ""                             # e.g. "E085Z5"
    # Content of the page's ``Description`` meta tag, if any.
    positioning_statement: str | None = None
    relative_maturity: str | None = None               # corn (string of int)
    maturity_group: str | None = None                  # soy (string of decimal)
    # Trait codes and their descriptions as returned by ``derive_traits``.
    trait_codes: list[str] = field(default_factory=list)
    trait_descriptions: list[str] = field(default_factory=list)
    # Each group is a dict with "label", "type" ("chart" | "list" | "table")
    # and "items" (dicts with "characteristic" and "value").
    characteristics_groups: list[dict] = field(default_factory=list)
    # Tech-sheet PDF link found on the product page, if any.
    techsheet_url: str | None = None
    # ``<lastmod>`` value from the sitemap entry, or None when absent.
    sitemap_last_modified: str | None = None


# --------------------------------------------------------------------- discovery


def discover_products(
    http: RateLimitedSession,
    *,
    only_crop: str | None = None,
) -> list[tuple[str, str, str]]:
    """Return ``[(url, crop, lastmod), ...]`` for every GH product page in
    the hybrids sitemap.

    Args:
        http: Session used to download the sitemap (redirects followed).
        only_crop: When given, keep only URLs of that crop
            (a key of ``CROP_PATHS``).

    Returns:
        One tuple per distinct product URL, in sitemap order. ``lastmod``
        is ``""`` when the sitemap entry has no ``<lastmod>``.

    Raises:
        Whatever ``raise_for_status`` raises when the sitemap request
        ends in an HTTP error status.
    """
    log.info("fetching sitemap %s", SITEMAP_HYBRIDS)
    r = http.get(SITEMAP_HYBRIDS, allow_redirects=True)
    r.raise_for_status()
    entries = re.findall(
        r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?",
        r.text,
    )
    # Apply the crop filter once instead of on every sitemap entry.
    crop_paths = {
        crop: path for crop, path in CROP_PATHS.items()
        if not only_crop or crop == only_crop
    }
    out: list[tuple[str, str, str]] = []
    seen: set[str] = set()
    for raw_url, raw_lastmod in entries:
        # Pretty-printed sitemaps may pad the element text with whitespace.
        url = raw_url.strip()
        if not url or url in seen:
            continue
        trimmed = url.rstrip("/")
        # A product URL has at least scheme://host/products/<crop>/<sku>.
        if trimmed.count("/") < 5:
            continue
        tail = trimmed.rsplit("/", 1)[-1]
        # Skip the crop landing pages themselves.
        if not tail or tail in ("corn", "soybean"):
            continue
        for crop, path in crop_paths.items():
            if path in url:
                seen.add(url)
                out.append((url, crop, raw_lastmod.strip()))
                break
    by_crop: dict[str, int] = {}
    for _, c, _ in out:
        by_crop[c] = by_crop.get(c, 0) + 1
    log.info("variety URLs found: %s (total=%d)",
             ", ".join(f"{k}={v}" for k, v in sorted(by_crop.items())),
             len(out))
    return out


# --------------------------------------------------------------------- helpers


def source_key_for(url: str) -> str:
    """Build the corpus key for a product URL.

    The last path segment (ignoring trailing slashes) is lowercased and
    prefixed: ``.../products/corn/e085z5`` → ``golden_harvest-e085z5``.
    """
    *_, last_segment = url.rstrip("/").rpartition("/")
    return "golden_harvest-" + last_segment.lower()


# Recognised trait suffixes at the very end of a product code
# (matched case-insensitively).
_TRAIT_SUFFIX_RE = re.compile(r"(VIP3|VIP4|VIP|E3|XF|GT)$", re.I)


def derive_traits(product_code: str) -> tuple[list[str], list[str]]:
    """Infer the trait stack from a product code's suffix.

    Returns ``(codes, descriptions)`` — two single-element lists when a
    suffix is recognised, two empty lists otherwise. The description is
    looked up in ``TRAIT_SUFFIX_MAP`` and is ``""`` for a code that has
    no entry there.
    """
    if not product_code:
        return [], []

    normalized = product_code.upper()
    suffix = _TRAIT_SUFFIX_RE.search(normalized)
    if suffix is not None:
        token = suffix.group(0).upper()
        return [token], [TRAIT_SUFFIX_MAP.get(token, "")]

    # No explicit suffix. On Golden Harvest's corn naming convention an
    # embedded "Z" (as in E085Z5) is the Duracade-class above + below
    # ground protection tag.
    if re.search(r"[A-Z]\d+Z\d+$", normalized):
        return ["Z"], [TRAIT_SUFFIX_MAP.get("Z", "")]
    return [], []


def _table_to_items(tbl) -> list[dict]:
    """Turn a two-column ``<table>`` into characteristic/value items.

    For each ``<tr>`` the first two ``<th>``/``<td>`` cells are read as
    label and value. Rows with fewer than two cells, or with an empty
    label or value, are dropped.
    """
    collected: list[dict] = []
    for tr in tbl.find_all("tr"):
        columns = tr.find_all(["th", "td"])
        if len(columns) >= 2:
            name, val = (col.get_text(" ", strip=True) for col in columns[:2])
            if name and val:
                collected.append({"characteristic": name, "value": val})
    return collected


def _bars_to_items(container) -> list[dict]:
    """Convert the ``bar-row`` divs inside *container* into rating items.

    Each row contributes one ``{"characteristic", "value"}`` dict:

    * the label is the text of the row's ``bar-label`` div (rows without
      one are skipped);
    * the value is the bar's ``data-percentage`` divided by 10 and
      truncated to an integer, as a string;
    * ``"-"`` is used when the bar or its percentage is missing/blank;
    * a percentage that cannot be parsed as a number is passed through
      unchanged.

    Besides plain integers ("80"), decimal ("80.0") and percent-suffixed
    ("80%") values are accepted, so they are not emitted as raw
    percentages on what is otherwise a 1-9 scale.
    """
    items: list[dict] = []
    for row in container.find_all("div", class_="bar-row"):
        label_el = row.find("div", class_="bar-label")
        if not label_el:
            continue
        label = label_el.get_text(" ", strip=True)
        bar = row.find("div", class_="bar")
        raw = bar.get("data-percentage") if bar else None
        text = "" if raw is None else str(raw).strip()
        if not text:
            items.append({"characteristic": label, "value": "-"})
            continue
        try:
            # int() truncates toward zero, matching the integer-only
            # behaviour this replaces (e.g. "85" → 8).
            value = str(int(float(text.rstrip("%").strip()) / 10))
        except (TypeError, ValueError, OverflowError):
            # Not numeric (or nan/inf): keep the attribute text as-is.
            value = str(raw)
        items.append({"characteristic": label, "value": value})
    return items


# Bar-chart containers to harvest from a product page, as
# (sidecar group label, element id) pairs, in output order.
CHART_SECTIONS = [
    ("DISEASE RATINGS", "dvDiseaseTolerance"),
    ("AGRONOMIC CHARACTERISTICS", "dvAgronomicChar"),
]


# --------------------------------------------------------------------- detail


def _parse_product_name(soup) -> str:
    """Return the product code: the <h1> text, else the last ``|`` part of <title>."""
    h1 = soup.find("h1")
    name = h1.get_text(strip=True) if h1 else ""
    if not name:
        title = soup.find("title")
        if title:
            txt = title.get_text(strip=True)
            if "|" in txt:
                name = txt.rsplit("|", 1)[-1].strip()
    return name


def _parse_positioning(soup, product_name: str) -> str | None:
    """Return the ``Description`` meta content without a leading "<name>." prefix."""
    meta = soup.find("meta", attrs={"name": "Description"})
    if not (meta and meta.get("content")):
        return None
    desc = meta["content"].strip()
    if product_name:
        prefix = product_name + "."
        if desc.startswith(prefix):
            desc = desc[len(prefix):].strip()
    return desc or None


def _parse_table_groups(soup) -> list[dict]:
    """Return one "table" group per non-empty two-column table on the page."""
    groups: list[dict] = []
    for tbl in soup.find_all("table"):
        items = _table_to_items(tbl)
        if not items:
            continue
        # Label the table with the nearest preceding non-empty heading,
        # looking back over at most 8 heading-like elements.
        label = None
        cur = tbl
        for _ in range(8):
            cur = cur.find_previous(["h2", "h3", "h4", "strong"])
            if cur is None:
                break
            heading = cur.get_text(strip=True)
            if heading:
                label = heading
                break
        label = label or "PRODUCT DATA"
        groups.append({
            "label": label.upper(),
            "type": "table",
            "items": items,
        })
    return groups


def _parse_chart_groups(soup) -> list[dict]:
    """Return the bar-chart groups plus the recommended-management list."""
    groups: list[dict] = []
    for label, div_id in CHART_SECTIONS:
        container = soup.find(id=div_id)
        if not container:
            continue
        items = _bars_to_items(container)
        if items:
            groups.append({
                "label": label,
                "type": "chart",
                "items": items,
            })

    # Recommended environments / management ("AgronomicMange" — typo
    # in upstream class name). Rendered as a flat list of strings.
    am = soup.find(class_="AgronomicMange")
    if am:
        recs = [t.strip() for t in am.stripped_strings if t.strip()]
        if recs:
            groups.append({
                "label": "RECOMMENDED MANAGEMENT",
                "type": "list",
                "items": [{"characteristic": x, "value": "✓"} for x in recs],
            })
    return groups


def _parse_hero_maturity(soup) -> str | None:
    """Return the maturity text from the product-label hero block, if present.

    Expected DOM shape:
      <div class="product-label"><div class="right"><span>RM</span>NN</div></div>
    """
    pl = soup.find(class_="product-label")
    if not pl:
        return None
    right = pl.find(class_="right")
    if not right:
        return None
    # The <span>RM</span> sits before the value; get_text drops the span
    # boundary, so strip the literal "RM" prefix.
    text = right.get_text(" ", strip=True)
    text = re.sub(r"^RM\s*", "", text).strip()
    return text or None


def _corn_rm_from_groups(groups: list[dict]) -> str | None:
    """Return the first 2-3 digit "relative maturity" value found in *groups*."""
    for grp in groups:
        for it in grp.get("items") or []:
            if "relative maturity" in (it.get("characteristic") or "").lower():
                m = re.match(r"^(\d{2,3})", (it.get("value") or "").strip())
                if m:
                    return m.group(1)
    return None


def _parse_techsheet_url(soup, html: str) -> str | None:
    """Return the tech-sheet PDF link: an <a href>, else a URL in the raw HTML."""
    ts = soup.find("a", href=re.compile(r"assets\.syngentaebiz\.com/pdf/techsheets/"))
    if ts:
        return ts["href"]
    m = re.search(
        r'(https?://assets\.syngentaebiz\.com/pdf/techsheets/[^"\s<>]+\.pdf)',
        html,
    )
    return m.group(1) if m else None


def fetch_product_detail(
    http: RateLimitedSession, url: str, crop: str, lastmod: str
) -> GHProduct | None:
    """Fetch + parse one product page.

    Args:
        http: Session used for the request; redirects are not followed.
        url: Product page URL.
        crop: ``"corn"`` or ``"soybeans"``; decides how maturity is parsed.
        lastmod: Sitemap ``<lastmod>`` value (``""`` when unknown).

    Returns:
        The parsed product, or ``None`` for discontinued varieties
        (any 301/302/303/307/308 response, e.g. the 302 → product-finder
        redirect).

    Raises:
        Whatever ``raise_for_status`` raises for a non-redirect HTTP
        error status.
    """
    r = http.get(url, allow_redirects=False)
    if r.status_code in (301, 302, 303, 307, 308):
        log.info("skip discontinued (redirect): %s → %s",
                 url, r.headers.get("Location"))
        return None
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    prod = GHProduct(
        source_key=source_key_for(url),
        source_url=url,
        crop=crop,
        sitemap_last_modified=lastmod or None,
    )
    prod.product_name = _parse_product_name(soup)
    prod.positioning_statement = _parse_positioning(soup, prod.product_name)

    # Traits inferred from product code.
    prod.trait_codes, prod.trait_descriptions = derive_traits(prod.product_name)

    # Charts (and the management list) come first, tables after.
    prod.characteristics_groups = (
        _parse_chart_groups(soup) + _parse_table_groups(soup)
    )

    # Maturity routing per crop. The canonical place GH publishes the
    # maturity number is the product-label hero block — same DOM shape on
    # corn and soybean pages, just different units (integer days for
    # corn, MG decimal for soy).
    label_rm = _parse_hero_maturity(soup)
    if label_rm:
        if prod.crop == "corn":
            m = re.match(r"^(\d{2,3})", label_rm)
            if m:
                prod.relative_maturity = m.group(1)
        elif prod.crop == "soybeans":
            m = re.match(r"^(\d+(?:\.\d+)?)", label_rm)
            if m:
                prod.maturity_group = m.group(1)

    # Corn-table fallback if the hero header was missing or unparsable.
    if prod.crop == "corn" and prod.relative_maturity is None:
        prod.relative_maturity = _corn_rm_from_groups(prod.characteristics_groups)

    prod.techsheet_url = _parse_techsheet_url(soup, r.text)
    return prod


# --------------------------------------------------------------------- render


def _md_table_cell(text: object) -> str:
    """Make *text* safe to place inside a single Markdown table cell."""
    # A line break would end the table row; a bare pipe would start a new
    # column. Flatten the former and escape the latter.
    flat = re.sub(r"\s*[\r\n]+\s*", " ", str(text))
    return flat.replace("|", "\\|")


def render_markdown(p: GHProduct) -> str:
    """Render the LLM-visible markdown body for one product.

    The output is a title, a bullet list of header facts (vendor, brand,
    crop, maturity, traits, source URL, tech sheet, rating scale), a
    ``---`` rule, an optional "Positioning" section, and one
    two-column table per non-empty characteristics group.

    Table cell text is flattened to one line and ``|`` is escaped so
    scraped values cannot break the table layout.
    """
    title = p.product_name or p.source_key
    crop_label = "Corn" if p.crop == "corn" else "Soybeans"
    # Only the maturity field that belongs to the product's crop is shown.
    maturity_lines: list[str] = []
    if p.relative_maturity and p.crop == "corn":
        maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group and p.crop == "soybeans":
        maturity_lines.append(f"- **Maturity group:** {p.maturity_group}")

    trait_line = ""
    if p.trait_codes:
        codes = ", ".join(p.trait_codes)
        # Descriptions may be empty strings for codes without a mapping.
        if p.trait_descriptions and any(p.trait_descriptions):
            trait_line = f"- **Traits:** {codes} ({'; '.join(p.trait_descriptions)})"
        else:
            trait_line = f"- **Traits:** {codes}"

    head = [
        f"# {title}",
        "",
        "- **Vendor:** Syngenta",
        "- **Brand:** Golden Harvest",
        f"- **Crop:** {crop_label}",
        *maturity_lines,
    ]
    if trait_line:
        head.append(trait_line)
    head.append(f"- **Source:** {p.source_url}")
    if p.techsheet_url:
        head.append(f"- **Tech sheet (PDF):** {p.techsheet_url}")
    head.append(f"- **Rating scale (Golden Harvest):** {RATING_SCALE_DIRECTION}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    if p.positioning_statement:
        sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n")

    for g in p.characteristics_groups:
        label = (g.get("label") or "Characteristics").title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_table_cell(it['characteristic'])} | {_md_table_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )
    return "\n".join(head) + "\n".join(sections)


# --------------------------------------------------------------------- write


def _write_text_atomic(path: Path, text: str) -> None:
    """Write *text* (UTF-8) to *path* via a sibling temp file and a rename."""
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_text(text, encoding="utf-8")
        tmp.replace(path)
    finally:
        # After a successful rename the temp file is already gone; this
        # only cleans up after a failed write.
        tmp.unlink(missing_ok=True)


def write_product(prod: GHProduct, body_md: str) -> None:
    """Write the markdown body and JSON sidecar for one product.

    Files go to ``CORPUS_DIR`` as ``<source_key>.json`` and
    ``<source_key>.md``. The sidecar is written first and the markdown
    last, each through a temp file + rename: the pipeline treats an
    existing ``.md`` as "already scraped", so it must only appear once
    the sidecar is safely on disk.

    Args:
        prod: Parsed product whose fields populate the sidecar.
        body_md: Rendered markdown body.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "golden_harvest",
        "source_key": prod.source_key,
        "vendor": "Syngenta",
        "brand": "Golden Harvest",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": prod.trait_codes,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": prod.positioning_statement,
        "strengths": [],
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "techsheet_url": prod.techsheet_url,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": prod.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    # Markdown last: its presence is the "done" marker for this product.
    _write_text_atomic(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    crop: str,
    lastmod: str,
    force: bool,
) -> tuple[str, GHProduct | None]:
    """Fetch, render and write one product; report what happened.

    Returns ``(status, product)`` where status is one of:

    * ``"skipped"`` — the markdown file already exists and ``force`` is off;
    * ``"failed"`` — fetching/parsing raised (the error is logged);
    * ``"discontinued"`` — the page redirected;
    * ``"written"`` — both output files were written.

    ``product`` is only non-None for ``"written"``.
    """
    existing_md = CORPUS_DIR / f"{source_key_for(url)}.md"
    if not force and existing_md.exists():
        return "skipped", None

    try:
        product = fetch_product_detail(http, url, crop, lastmod)
    except Exception as exc:  # noqa: BLE001
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None

    if product is None:
        return "discontinued", None

    write_product(product, render_markdown(product))
    return "written", product


def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_product: str | None,
) -> int:
    """Discover product URLs and process them one by one.

    Args:
        limit: Maximum number of candidates to process (skipped ones
            count too); ``None`` means all.
        force: Re-fetch products whose markdown file already exists.
        only_crop: Restrict discovery to one crop.
        only_product: Restrict to a single variety, given as a
            source_key or as the URL's last path segment.

    Returns:
        Process exit code: 0 on success, 1 if any product failed,
        2 if ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_products(http, only_crop=only_crop)
    if only_product:
        wanted_tail = only_product.lower()
        matching = []
        for target in targets:
            target_url = target[0]
            url_tail = target_url.rstrip("/").rsplit("/", 1)[-1].lower()
            if source_key_for(target_url) == only_product or url_tail == wanted_tail:
                matching.append(target)
        targets = matching
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "discontinued": 0, "failed": 0}
    limit_label = str(limit) if limit else "all"
    # A non-positive limit processes nothing.
    batch = targets if limit is None else targets[:max(limit, 0)]
    processed = 0
    for processed, (url, crop, lastmod) in enumerate(batch, start=1):
        status, prod = process_product(
            http, url=url, crop=crop, lastmod=lastmod, force=force,
        )
        counts[status] = counts.get(status, 0) + 1
        if prod is None:
            log.info("[%d/%s] %s %s",
                     processed, limit_label,
                     source_key_for(url), status)
            continue
        log.info(
            "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s groups=%d techsheet=%s",
            processed, limit_label,
            prod.source_key, status, prod.crop,
            prod.relative_maturity or prod.maturity_group or "-",
            ",".join(prod.trait_codes) or "-",
            len(prod.characteristics_groups),
            "y" if prod.techsheet_url else "n",
        )

    log.info(
        "done: processed=%d written=%d skipped=%d discontinued=%d failed=%d "
        "(of %d candidates)",
        processed, counts["written"], counts["skipped"],
        counts["discontinued"], counts["failed"], len(targets),
    )
    return 1 if counts["failed"] else 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (default taken from the ``LOG_LEVEL`` environment
    variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.golden_harvest",
        description="Scrape Golden Harvest (Syngenta) corn + soybean varieties.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop",
        default=None,
        choices=("corn", "soybeans"),
        help="Limit to one crop.",
    )
    parser.add_argument(
        "--product",
        default=None,
        help="Process a single variety by source_key or URL tail.",
    )
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the scraper.

    ``argv`` defaults to the process arguments when ``None``. Log output
    goes to stderr. Returns the exit code produced by ``run``.
    """
    options = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=options.log_level.upper(),
    )
    exit_code = run(
        only_product=options.product,
        only_crop=options.crop,
        force=options.force,
        limit=options.limit,
    )
    return exit_code


if __name__ == "__main__":
    sys.exit(main())