"""ProHarvest Seeds scraper — independent regional brand (Hindsboro, IL).

Source: ``proharvestseeds.com`` — WordPress site exposing a public, no-auth
REST API. robots.txt is permissive (only ``/?s=``, ``/search/``,
``/dealer-files/*``, ``/dealer-section/*`` disallowed; the catalog +
``/wp-json/`` are open).

Independent family-owned seed company; corn / soybeans / wheat (plus forage /
cover-crop lines that are out of scope for the row-crop advisor).

Two-step ingestion:

  1. **Enumerate** via the WP REST API. ``/wp/v2/seed`` is the variety
     custom-post-type; ``/wp/v2/seed-type`` is the crop taxonomy. We pull
     every variety whose seed-type is one of the row-crop terms
     (corn-hybrid / soybean / wheat) — ignoring alfalfa / forage / grass /
     cover-crop / sweet-corn terms. The REST payload gives the canonical
     id / slug / title / permalink, but ``acf`` and ``content`` are NOT
     registered to REST (both come back empty), so the ratings have to come
     from the detail page.

  2. **Parse the detail page.** Each ``/seed/<slug>/`` page server-renders
     the agronomic data as ``<h2>`` spec sections, each a flat run of
     ``<strong>label</strong><div>value</div>`` pairs (General
     Characteristics / Agronomic Features / Disease Tolerance / Soil
     Adaptability / Nitrogen Application/Timing / Recommended Seeding
     Rates). The relative maturity sits in a ``Maturity: 111 Days``
     heading (the parser scans ``<h1>``–``<h3>``).

Rating scales are **mixed** and preserved verbatim (the chunker never
fabricates a value):

  - Disease Tolerance: **1-9 numeric** (9 = best / most tolerant, per
    industry norm; ``NA`` = not rated). Direction is the same as Bayer/NK
    so no flip is needed.
  - General Characteristics / Agronomic Features: **qualitative**
    (Excellent / Very Good / Good / Average / …) with a few raw numerics
    (GDD, Kernel Rows).
  - Soil Adaptability: ``HR`` (highly recommended) / ``R`` (recommended) /
    etc.

Unlike the Ebbert's scraper (which left ``characteristics_groups`` empty and
relied on a verbatim body), we parse the spec sections into structured
``characteristics_groups`` so the qualitative + numeric ratings land in the
embedded chunk and are actually retrievable.

Output:
    corpus/proharvest/<source_key>.md
    corpus/proharvest/<source_key>.json

source_key: ``proharvest-<slug>`` lowercased, e.g. ``proharvest-81p11``.

CLI:
    python -m scrape.sources.proharvest --crop corn --limit 5
    python -m scrape.sources.proharvest --force
    python -m scrape.sources.proharvest --product proharvest-81p11
"""
from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://proharvestseeds.com"
WP = f"{BASE}/wp-json/wp/v2"

# seed-type taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. Everything
# not listed here (alfalfa / forage / grass / cover-crop / sweet-corn /
# blends) is out of scope for the row-crop advisor.
CROP_TYPES = { "corn-hybrid": "corn", "soybean": "soybeans", "wheat": "wheat", } # robots.txt declares no Crawl-delay for "*", but we stay polite — the # row-crop catalog is only ~120 detail pages. REQ_INTERVAL_SEC = 1.5 RATING_SCALE_DIRECTION = ( "disease 1-9, 9=best/most-tolerant, NA=not rated; " "agronomic/general qualitative (Excellent/Very Good/Good/Average); " "soil HR=highly recommended/R=recommended" ) # Detail-page

spec sections we extract, in display order. The # value maps the page header to a characteristics_groups label the # chunker buckets: DISEASE RATINGS -> disease framing, AGRONOMIC # CHARACTERISTICS -> agronomic framing; the rest pass through verbatim # as their own titled section (still embedded + retrievable). SPEC_SECTIONS = { "General Characteristics": "GENERAL CHARACTERISTICS", "Agronomic Features": "AGRONOMIC CHARACTERISTICS", "Disease Tolerance": "DISEASE RATINGS", "Soil Adaptability": "SOIL ADAPTABILITY", "Nitrogen Application/Timing": "NITROGEN APPLICATION/TIMING", "Recommended Seeding Rates": "RECOMMENDED SEEDING RATES", } REPO_ROOT = Path(__file__).resolve().parents[2] CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "proharvest" log = logging.getLogger("scrape.proharvest") # --------------------------------------------------------------------- HTTP class RateLimitedSession: """Polite session with backoff. ProHarvest's row-crop catalog is small (~120 detail pages) so 1.5 s/req still finishes in a few min.""" def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: self.s = requests.Session() self.s.headers["User-Agent"] = USER_AGENT self.interval = interval self._last = 0.0 def _wait(self) -> None: delta = time.monotonic() - self._last if delta < self.interval: time.sleep(self.interval - delta) self._last = time.monotonic() def request(self, method: str, url: str, *, max_retries: int = 4, timeout: float = 30.0, **kw: Any) -> requests.Response: last_exc: Exception | None = None for attempt in range(max_retries): self._wait() try: resp = self.s.request(method, url, timeout=timeout, **kw) except requests.RequestException as exc: last_exc = exc backoff = min(30.0, (2 ** attempt) + random.random()) log.warning("network error on %s %s: %s — retry in %.1fs", method, url, exc, backoff) time.sleep(backoff) continue if resp.status_code == 429 or 500 <= resp.status_code < 600: ra = 
resp.headers.get("Retry-After") backoff = float(ra) if (ra and ra.isdigit()) else min( 30.0, (2 ** attempt) + random.random()) log.warning("HTTP %d on %s %s — retry in %.1fs", resp.status_code, method, url, backoff) time.sleep(backoff) continue return resp if last_exc: raise last_exc return resp # type: ignore[return-value] def get(self, url: str, **kw: Any) -> requests.Response: return self.request("GET", url, **kw) def get_json(self, url: str, **kw: Any) -> Any: r = self.get(url, **kw) r.raise_for_status() return r.json() # --------------------------------------------------------------------- model @dataclass class PHVariety: source_key: str source_url: str crop: str # chunker value: corn / soybeans / wheat product_name: str = "" # "81P11" relative_maturity: int | None = None # corn (days) maturity_group: float | None = None # soy wheat_maturity: str | None = None # wheat qualitative trait_stack: list[str] = field(default_factory=list) positioning: str | None = None # [{label, items:[{characteristic, value}]}] — chunker source of truth groups: list[dict] = field(default_factory=list) # --------------------------------------------------------------------- discovery (REST) def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]: """term_id -> name for a WP taxonomy (paged).""" out: dict[int, str] = {} page = 1 while True: url = f"{WP}/{taxonomy}?per_page=100&page={page}&_fields=id,name,slug" r = http.get(url) if r.status_code == 400: # past last page break r.raise_for_status() terms = r.json() if not terms: break for t in terms: out[t["id"]] = t.get("name") or t.get("slug") or str(t["id"]) if len(terms) < 100: break page += 1 return out def _type_slug_to_id(http: RateLimitedSession) -> dict[str, int]: out: dict[str, int] = {} for t in http.get_json(f"{WP}/seed-type?per_page=100&_fields=id,slug"): out[t["slug"]] = t["id"] return out def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]: """Return REST seed records for 
the in-scope row crops.""" type_ids = _type_slug_to_id(http) records: list[dict] = [] for type_slug, crop in CROP_TYPES.items(): if only_crop and crop != only_crop: continue tid = type_ids.get(type_slug) if tid is None: log.warning("seed-type %r not found in taxonomy — skipping", type_slug) continue page = 1 while True: url = (f"{WP}/seed?seed-type={tid}&per_page=100&page={page}" "&_fields=id,slug,title,link,seed-trait") r = http.get(url) if r.status_code == 400: break r.raise_for_status() batch = r.json() if not batch: break for s in batch: s["_crop"] = crop records.append(s) if len(batch) < 100: break page += 1 log.info("seed-type %-12s (%s): cumulative %d", type_slug, crop, len(records)) return records # --------------------------------------------------------------------- detail parse _MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)") def _clean(s: str) -> str: return re.sub(r"\s+", " ", s or "").strip() def _direct_text(el: Tag) -> str: return _clean("".join(c for c in el.children if isinstance(c, NavigableString))) def _parse_maturity(soup: BeautifulSoup, crop: str) -> tuple[int | None, float | None, str | None]: """Pull RM (corn) / MG (soy) / qualitative (wheat) from the 'Maturity: …' heading. Returns (rm, mg, wheat_maturity).""" head = None for h in soup.find_all(["h1", "h2", "h3"]): txt = h.get_text(" ", strip=True) if re.match(r"^Maturity\b", txt, re.I): head = txt break if not head: return None, None, None m = _MATURITY_RE.search(head) if crop == "corn": return (int(float(m.group(1))) if m else None), None, None if crop == "soybeans": return None, (float(m.group(1)) if m else None), None # wheat — keep the qualitative phrase after "Maturity:" val = head.split(":", 1)[1].strip() if ":" in head else head return None, None, (val or None) def _parse_groups(soup: BeautifulSoup) -> list[dict]: """Parse each known spec

into a {label, items:[{characteristic, value}]} group. Each section is a flat run of label
value
pairs up to the next

.""" groups: list[dict] = [] h2s = soup.find_all("h2") for h2 in h2s: header = _clean(h2.get_text(" ", strip=True)) label = SPEC_SECTIONS.get(header) if not label: continue # Collect (tag, text) for strong/div leaves until the next

. seq: list[tuple[str, str]] = [] for el in h2.find_all_next(): if el.name == "h2": break if not isinstance(el, Tag): continue if el.name == "strong": t = _clean(el.get_text(" ", strip=True)) if t: seq.append(("k", t)) elif el.name == "div": t = _direct_text(el) if t: seq.append(("v", t)) # Pair adjacent key->value. A key with no following value (or two # keys in a row) keeps an em-dash placeholder so nothing silently # drops. items: list[dict] = [] i = 0 while i < len(seq): kind, text = seq[i] if kind == "k": value = "" if i + 1 < len(seq) and seq[i + 1][0] == "v": value = seq[i + 1][1] i += 1 items.append({"characteristic": text, "value": value}) i += 1 if items: groups.append({"label": label, "items": items}) return groups def _parse_positioning(soup: BeautifulSoup) -> str | None: """First substantive paragraph after the variety

, before the first spec

. Best-effort marketing/positioning blurb.""" h1 = soup.find("h1") if not h1: return None for el in h1.find_all_next(): if el.name == "h2": break if isinstance(el, Tag) and el.name == "p": t = _clean(el.get_text(" ", strip=True)) if len(t) >= 40: return t return None def parse_detail(http: RateLimitedSession, rec: dict, trait_names: dict[int, str]) -> PHVariety: crop = rec["_crop"] slug = rec["slug"] url = rec.get("link") or f"{BASE}/seed/{slug}/" name = _clean((rec.get("title") or {}).get("rendered", "")) or slug.upper() r = http.get(url) r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") rm, mg, wheat_mat = _parse_maturity(soup, crop) groups = _parse_groups(soup) positioning = _parse_positioning(soup) traits = [trait_names[t] for t in (rec.get("seed-trait") or []) if t in trait_names] return PHVariety( source_key=f"proharvest-{slug.lower()}", source_url=url, crop=crop, product_name=name, relative_maturity=rm, maturity_group=mg, wheat_maturity=wheat_mat, trait_stack=traits, positioning=positioning, groups=groups, ) # --------------------------------------------------------------------- render def render_markdown(v: PHVariety) -> str: crop_label = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}.get(v.crop, v.crop.title()) head: list[str] = [ f"# {v.product_name}", "", "- **Vendor:** ProHarvest Seeds (independent regional brand)", "- **Brand:** ProHarvest Seeds", f"- **Crop:** {crop_label}", ] if v.crop == "corn" and v.relative_maturity is not None: head.append(f"- **Relative maturity:** {v.relative_maturity} days") if v.crop == "soybeans" and v.maturity_group is not None: head.append(f"- **Maturity group:** {v.maturity_group}") if v.crop == "wheat" and v.wheat_maturity: head.append(f"- **Maturity:** {v.wheat_maturity}") if v.trait_stack: head.append(f"- **Trait(s):** {', '.join(v.trait_stack)}") head.append(f"- **Source:** {v.source_url}") head.append(f"- **Rating scale:** {RATING_SCALE_DIRECTION}") head.append("- **Service area:** 
Independent dealer network — Eastern/Central Corn Belt (IL/IN/OH/MO/IA/KS/NE)") head.append("") if v.positioning: head += ["---", "", f"_{v.positioning}_", ""] head += ["---", ""] for g in v.groups: head.append(f"## {g['label'].title()}") head.append("") for it in g["items"]: ch = it["characteristic"] val = it["value"] or "—" head.append(f"- **{ch}:** {val}") head.append("") return "\n".join(head) def write_variety(v: PHVariety, body_md: str) -> None: CORPUS_DIR.mkdir(parents=True, exist_ok=True) (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8") sidecar = { "source": "proharvest", "source_key": v.source_key, "vendor": "ProHarvest Seeds", "brand": "ProHarvest Seeds", "product_name": v.product_name, "product_id": v.product_name, "crop": v.crop, "release_year": None, "relative_maturity": v.relative_maturity, "maturity_group": v.maturity_group, # Wheat maturity is qualitative; stash it where the chunker reads # the wheat "Maturity" fact from (relative_maturity), as a string. "wheat_class": None, "trait_stack": v.trait_stack, "trait_descriptions": [], "positioning_statement": v.positioning, "strengths": [], "characteristics_groups": v.groups, "_scale_direction": RATING_SCALE_DIRECTION, "regional_recommendations": [ {"product_list_name": "ProHarvest dealer network (Eastern/Central Corn Belt — IL/IN/OH/MO/IA/KS/NE)", "agronomist": None, "agronomist_email": None, "variant_id": None}, ], "image_url": None, "source_urls": [v.source_url], "sitemap_last_modified": None, "fetched_at": datetime.now(timezone.utc).isoformat(), "scraper_version": SCRAPER_VERSION, } # For wheat, surface the qualitative maturity through relative_maturity # so the chunker's wheat "Maturity {rm}" branch renders it. 
if v.crop == "wheat" and v.wheat_maturity: sidecar["relative_maturity"] = v.wheat_maturity (CORPUS_DIR / f"{v.source_key}.json").write_text( json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") # --------------------------------------------------------------------- pipeline def run(*, limit: int | None, force: bool, only_crop: str | None, only_product: str | None) -> int: CORPUS_DIR.mkdir(parents=True, exist_ok=True) http = RateLimitedSession() trait_names = _taxonomy_map(http, "seed-trait") records = discover(http, only_crop=only_crop) if only_product: key = only_product.lower() records = [r for r in records if f"proharvest-{r['slug'].lower()}" == key or r["slug"].lower() == key] if not records: log.error("no variety matched --product=%s", only_product) return 2 counts = {"written": 0, "skipped": 0, "empty": 0} processed = 0 for rec in records: if limit is not None and processed >= limit: break processed += 1 source_key = f"proharvest-{rec['slug'].lower()}" md_path = CORPUS_DIR / f"{source_key}.md" if md_path.exists() and not force: counts["skipped"] += 1 log.info("[%d/%d] %s skipped", processed, len(records), source_key) continue try: v = parse_detail(http, rec, trait_names) except requests.HTTPError as exc: log.error("[%d/%d] %s detail fetch failed: %s", processed, len(records), source_key, exc) continue if not v.groups: counts["empty"] += 1 log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)", processed, len(records), source_key) write_variety(v, render_markdown(v)) counts["written"] += 1 log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s", processed, len(records), source_key, v.crop, v.relative_maturity or v.maturity_group or v.wheat_maturity or "-", len(v.groups), ",".join(v.trait_stack) or "-") log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)", processed, counts["written"], counts["skipped"], counts["empty"], len(records)) return 0 # 
# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Assemble the command-line parser for the ProHarvest scraper."""
    crop_choices = sorted(set(CROP_TYPES.values()))
    default_level = os.environ.get("LOG_LEVEL", "INFO")

    parser = argparse.ArgumentParser(
        prog="scrape.sources.proharvest",
        description="Scrape ProHarvest Seeds (independent Corn Belt brand) — "
                    "corn / soybeans / wheat via the WP REST API + detail pages.",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop",
        default=None,
        choices=crop_choices,
        help="Limit to one crop (corn / soybeans / wheat).",
    )
    parser.add_argument(
        "--product",
        default=None,
        help="Process a single variety by source_key or slug.",
    )
    parser.add_argument("--log-level", default=default_level)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse ``argv``, configure stderr logging, and return ``run``'s exit code."""
    opts = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=opts.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
    )


if __name__ == "__main__":
    sys.exit(main())