Files
claude 22e8092faf
Image rebuild (skip scrape) / build (push) Successful in 5m46s
Add ProHarvest Seeds: 119 varieties + 161 cross-vendor plot reports (#16)
Co-authored-by: claude <claude@jpaul.io>
Co-committed-by: claude <claude@jpaul.io>
2026-06-04 21:05:30 -04:00

547 lines
20 KiB
Python

"""ProHarvest Seeds scraper — independent regional brand (Hindsboro, IL).
Source: ``proharvestseeds.com`` — WordPress site exposing a public,
no-auth REST API. robots.txt is permissive (only ``/?s=``, ``/search/``,
``/dealer-files/*``, ``/dealer-section/*`` disallowed; the catalog +
``/wp-json/`` are open). Independent family-owned seed company; corn /
soybeans / wheat (plus forage / cover-crop lines that are out of scope
for the row-crop advisor).
Two-step ingestion:
1. **Enumerate** via the WP REST API. ``/wp/v2/seed`` is the variety
custom-post-type; ``/wp/v2/seed-type`` is the crop taxonomy. We pull
every variety whose seed-type is one of the row-crop terms
(corn-hybrid / soybean / wheat) — ignoring alfalfa / forage / grass /
cover-crop / sweet-corn terms. The REST payload gives the canonical
id / slug / title / permalink, but ``acf`` and ``content`` are NOT
registered to REST (both come back empty), so the ratings have to
come from the detail page.
2. **Parse the detail page.** Each ``/seed/<slug>/`` page server-renders
the agronomic data as ``<h2>`` spec sections, each a flat run of
``<strong>label</strong><div>value</div>`` pairs (General
Characteristics / Agronomic Features / Disease Tolerance / Soil
Adaptability / Nitrogen Application/Timing / Recommended Seeding
Rates). The relative maturity sits in an ``<h1>Maturity: 111
Days</h1>`` heading.
Rating scales are **mixed** and preserved verbatim (the chunker never
fabricates a value):
- Disease Tolerance: **1-9 numeric** (9 = best / most tolerant, per
industry norm; ``NA`` = not rated). Direction is the same as
Bayer/NK so no flip is needed.
- General Characteristics / Agronomic Features: **qualitative**
(Excellent / Very Good / Good / Average / …) with a few raw numerics
(GDD, Kernel Rows).
- Soil Adaptability: ``HR`` (highly recommended) / ``R`` (recommended)
/ etc.
Unlike the Ebbert's scraper (which left ``characteristics_groups`` empty
and relied on a verbatim body), we parse the spec sections into
structured ``characteristics_groups`` so the qualitative + numeric
ratings land in the embedded chunk and are actually retrievable.
Output:
corpus/proharvest/<source_key>.md
corpus/proharvest/<source_key>.json
source_key: ``proharvest-<slug>`` lowercased, e.g. ``proharvest-81p11``.
CLI:
python -m scrape.sources.proharvest --crop corn --limit 5
python -m scrape.sources.proharvest --force
python -m scrape.sources.proharvest --product proharvest-81p11
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
# Scraper identity: the version is stamped into every sidecar and the
# User-Agent is sent with every request.
SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

# Site root and the WordPress REST namespace that hangs off it.
BASE = "https://proharvestseeds.com"
WP = BASE + "/wp-json/wp/v2"

# Maps a seed-type taxonomy slug to the crop value handed to the chunker.
# The chunker keys on the plural "soybeans" for its MG branch, hence the
# plural here. Any seed-type that is absent from this table (alfalfa /
# forage / grass / cover-crop / sweet-corn / blends) is out of scope for
# the row-crop advisor and is never enumerated.
CROP_TYPES = {
    "corn-hybrid": "corn",
    "soybean": "soybeans",
    "wheat": "wheat",
}

# Minimum spacing between requests, in seconds. robots.txt sets no
# Crawl-delay for "*"; we throttle anyway because the row-crop catalog is
# only ~120 detail pages and politeness costs little.
REQ_INTERVAL_SEC = 1.5

# Human-readable description of the (mixed) rating scales, written into
# both the markdown header and the JSON sidecar.
RATING_SCALE_DIRECTION = (
    "disease 1-9, 9=best/most-tolerant, NA=not rated; "
    "agronomic/general qualitative (Excellent/Very Good/Good/Average); "
    "soil HR=highly recommended/R=recommended"
)

# Detail-page <h2> headings we extract (insertion order == display order),
# each mapped to the characteristics_groups label it is stored under.
# DISEASE RATINGS gets the chunker's disease framing and AGRONOMIC
# CHARACTERISTICS its agronomic framing; every other label passes through
# verbatim as its own titled section (still embedded + retrievable).
SPEC_SECTIONS = {
    "General Characteristics": "GENERAL CHARACTERISTICS",
    "Agronomic Features": "AGRONOMIC CHARACTERISTICS",
    "Disease Tolerance": "DISEASE RATINGS",
    "Soil Adaptability": "SOIL ADAPTABILITY",
    "Nitrogen Application/Timing": "NITROGEN APPLICATION/TIMING",
    "Recommended Seeding Rates": "RECOMMENDED SEEDING RATES",
}
# Repository root: two levels above the directory containing this file
# (``parents[2]``).
# NOTE(review): presumably this module lives at <repo>/scrape/sources/ — confirm.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus root: the CORPUS_ROOT environment variable when it is set and
# non-empty, otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this scraper's per-variety .md / .json files.
CORPUS_DIR = CORPUS_ROOT / "proharvest"
# Module-level logger.
log = logging.getLogger("scrape.proharvest")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session: request spacing plus retry with backoff.

    Every attempt (retries included) starts at least ``interval`` seconds
    after the previous one. Network errors, HTTP 429 and HTTP 5xx responses
    are retried with exponential backoff (capped at 30 s, with jitter); an
    all-digit ``Retry-After`` header overrides the computed backoff.
    ProHarvest's row-crop catalog is small (~120 detail pages) so 1.5 s/req
    still finishes in a few min.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the scraper User-Agent.

        Args:
            interval: Minimum number of seconds between two attempts.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # time.monotonic() stamp of the latest attempt

    def _wait(self) -> None:
        """Sleep just long enough to honour ``interval``, then stamp the time."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one request, retrying transient failures.

        Args:
            method: HTTP verb.
            url: Absolute URL.
            max_retries: Total number of attempts. Values below 1 are treated
                as 1 so there is always a result to return or raise.
            timeout: Per-attempt timeout handed to ``requests``.
            **kw: Passed through to ``requests.Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when every
            attempt was used up — the last 429/5xx response received (callers
            decide via ``raise_for_status``).

        Raises:
            requests.RequestException: The final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Track only the most recent outcome so the caller sees what
                # the last attempt actually produced.
                resp, last_exc = None, exc
                if final:
                    # No retry follows, so sleeping here would only delay
                    # the failure.
                    log.warning("network error on %s %s: %s — giving up after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived: any earlier network error is now stale.
            last_exc = None
            if resp.status_code != 429 and not 500 <= resp.status_code < 600:
                return resp
            if final:
                log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                            resp.status_code, method, url, attempts)
                break
            ra = resp.headers.get("Retry-After")
            # Only the delta-seconds form of Retry-After is honoured; an
            # HTTP-date value falls back to the computed backoff.
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET ``url``, raise on a 4xx/5xx status, and return the decoded JSON body."""
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()
# --------------------------------------------------------------------- model
@dataclass
class PHVariety:
    """One ProHarvest variety: identity, maturity, traits and spec groups.

    The maturity fields are crop-specific: ``relative_maturity`` is for corn
    (days), ``maturity_group`` for soybeans, and ``wheat_maturity`` holds the
    qualitative wheat value.
    """

    # --- identity -------------------------------------------------------
    source_key: str
    source_url: str
    # Chunker crop value: corn / soybeans / wheat.
    crop: str
    # Display name, e.g. "81P11".
    product_name: str = ""

    # --- maturity (crop-specific) ---------------------------------------
    # Corn, in days.
    relative_maturity: int | None = None
    # Soybeans.
    maturity_group: float | None = None
    # Wheat, qualitative.
    wheat_maturity: str | None = None

    # --- descriptive content --------------------------------------------
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    # Chunker source of truth, shaped as
    # [{label, items: [{characteristic, value}]}].
    groups: list[dict] = field(default_factory=list)
# --------------------------------------------------------------------- discovery (REST)
def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Build a ``term_id -> display name`` table for one WP taxonomy.

    Pages through the taxonomy endpoint 100 terms at a time. A term's name
    falls back to its slug, then to its id rendered as a string. Paging ends
    on an HTTP 400 (past the last page), an empty page, or a short page.
    """
    names: dict[int, str] = {}
    page_no = 0
    while True:
        page_no += 1
        resp = http.get(
            f"{WP}/{taxonomy}?per_page=100&page={page_no}&_fields=id,name,slug"
        )
        if resp.status_code == 400:
            # Requested a page beyond the last one.
            return names
        resp.raise_for_status()
        terms = resp.json()
        if not terms:
            return names
        names.update(
            (term["id"], term.get("name") or term.get("slug") or str(term["id"]))
            for term in terms
        )
        if len(terms) < 100:
            # A short page is necessarily the last one.
            return names
def _type_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``slug -> term id`` for the ``seed-type`` taxonomy.

    Issues a single request with ``per_page=100``; no further pages are
    fetched.
    """
    terms = http.get_json(f"{WP}/seed-type?per_page=100&_fields=id,slug")
    return {term["slug"]: term["id"] for term in terms}
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Enumerate REST seed records for the in-scope row crops.

    Walks every entry of ``CROP_TYPES`` (or just the one whose crop value
    equals ``only_crop``), pages through ``/wp/v2/seed`` filtered by that
    seed-type term, and tags each returned record in place with a ``_crop``
    key holding the chunker crop value. A seed-type slug missing from the
    taxonomy is logged and skipped.
    """
    slug_to_id = _type_slug_to_id(http)
    found: list[dict] = []
    for type_slug, crop in CROP_TYPES.items():
        if only_crop and crop != only_crop:
            continue
        term_id = slug_to_id.get(type_slug)
        if term_id is None:
            log.warning("seed-type %r not found in taxonomy — skipping", type_slug)
            continue

        page_no = 1
        while True:
            resp = http.get(
                f"{WP}/seed?seed-type={term_id}&per_page=100&page={page_no}"
                "&_fields=id,slug,title,link,seed-trait"
            )
            if resp.status_code == 400:
                # Requested a page beyond the last one.
                break
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            for seed in batch:
                seed["_crop"] = crop
            found.extend(batch)
            if len(batch) < 100:
                break
            page_no += 1

        log.info("seed-type %-12s (%s): cumulative %d", type_slug, crop, len(found))
    return found
# --------------------------------------------------------------------- detail parse
# First integer or decimal number in a string, e.g. the "111" in
# "Maturity: 111 Days".
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")


def _clean(s: str) -> str:
    """Collapse each whitespace run to one space and trim both ends.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
def _direct_text(el: Tag) -> str:
    """Return the whitespace-normalised text owned directly by ``el``.

    Only the element's own string children are joined; text that sits inside
    nested tags is left out.
    """
    own_strings = [child for child in el.children if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))
def _parse_maturity(soup: BeautifulSoup, crop: str) -> tuple[int | None, float | None, str | None]:
    """Read the 'Maturity: …' heading and return ``(rm, mg, wheat_maturity)``.

    The first ``<h1>``/``<h2>``/``<h3>`` whose text starts with the word
    "Maturity" (case-insensitive) is used. For corn the first number in it
    becomes an integer RM; for soybeans it becomes a float MG; for any other
    crop the text after the first colon (or the whole heading when there is
    no colon) is returned as the qualitative wheat maturity. Slots that do
    not apply, or could not be parsed, are ``None``.
    """
    heading_texts = (
        h.get_text(" ", strip=True) for h in soup.find_all(["h1", "h2", "h3"])
    )
    heading = next(
        (text for text in heading_texts if re.match(r"^Maturity\b", text, re.I)),
        None,
    )
    if not heading:
        return None, None, None

    number = _MATURITY_RE.search(heading)
    if crop == "corn":
        rm = int(float(number.group(1))) if number else None
        return rm, None, None
    if crop == "soybeans":
        mg = float(number.group(1)) if number else None
        return None, mg, None

    # wheat — keep the qualitative phrase that follows "Maturity:"
    _, colon, tail = heading.partition(":")
    phrase = tail.strip() if colon else heading
    return None, None, (phrase or None)
def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Turn each known spec ``<h2>`` section into a characteristics group.

    A section is the run of elements that follows a heading listed in
    ``SPEC_SECTIONS``, up to the next ``<h2>``. Within it, every non-empty
    ``<strong>`` starts an item and the first non-empty ``<div>`` (direct
    text only) seen before the next ``<strong>`` supplies its value. An item
    whose value never arrives keeps ``""`` so the label is not lost; a value
    with no pending label is ignored. Sections that yield no items are
    omitted.

    Returns:
        ``[{"label": ..., "items": [{"characteristic": ..., "value": ...}]}]``
        in page order.
    """
    groups: list[dict] = []
    for heading in soup.find_all("h2"):
        label = SPEC_SECTIONS.get(_clean(heading.get_text(" ", strip=True)))
        if not label:
            continue

        items: list[dict] = []
        awaiting_value = False  # True right after a label, until a value lands
        for node in heading.find_all_next():
            if node.name == "h2":
                break
            if not isinstance(node, Tag):
                continue
            if node.name == "strong":
                key = _clean(node.get_text(" ", strip=True))
                if key:
                    items.append({"characteristic": key, "value": ""})
                    awaiting_value = True
            elif node.name == "div":
                own_text = _direct_text(node)
                if own_text and awaiting_value:
                    items[-1]["value"] = own_text
                    awaiting_value = False

        if items:
            groups.append({"label": label, "items": items})
    return groups
def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Best-effort marketing/positioning blurb for the variety.

    Scans forward from the first ``<h1>`` and returns the first ``<p>`` whose
    normalised text is at least 40 characters long. The scan stops at the
    first ``<h2>``; ``None`` is returned when there is no ``<h1>`` or no
    paragraph qualifies.
    """
    title = soup.find("h1")
    if title is None:
        return None
    for node in title.find_all_next():
        if node.name == "h2":
            return None
        if not (isinstance(node, Tag) and node.name == "p"):
            continue
        text = _clean(node.get_text(" ", strip=True))
        if len(text) >= 40:
            return text
    return None
def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str]) -> PHVariety:
    """Fetch one variety's detail page and assemble a ``PHVariety``.

    Args:
        http: Session used to fetch the detail page.
        rec: REST seed record as returned by ``discover`` (needs ``slug`` and
            ``_crop``; ``link``, ``title`` and ``seed-trait`` are optional).
        trait_names: ``term_id -> name`` table for the ``seed-trait``
            taxonomy; trait ids missing from it are dropped.

    Returns:
        The parsed variety. ``groups`` is empty when no known spec section
        was found on the page.

    Raises:
        requests.HTTPError: The detail page answered with a 4xx/5xx status.
    """
    # Local import: keeps this function self-contained (stdlib only).
    from html import unescape

    crop = rec["_crop"]
    slug = rec["slug"]
    url = rec.get("link") or f"{BASE}/seed/{slug}/"
    # ``title.rendered`` is HTML, so entities such as "&amp;" or "&#8211;"
    # must be decoded before the title is used as a plain-text product name.
    rendered = (rec.get("title") or {}).get("rendered") or ""
    name = _clean(unescape(rendered)) or slug.upper()

    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    rm, mg, wheat_mat = _parse_maturity(soup, crop)
    groups = _parse_groups(soup)
    positioning = _parse_positioning(soup)
    traits = [trait_names[t] for t in (rec.get("seed-trait") or []) if t in trait_names]
    return PHVariety(
        source_key=f"proharvest-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        wheat_maturity=wheat_mat,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )
# --------------------------------------------------------------------- render
def render_markdown(v: PHVariety) -> str:
    """Render a variety as the markdown body stored in the corpus.

    Layout: an H1 with the product name, a bullet list of identity facts
    (vendor, brand, crop, crop-specific maturity, traits, source, rating
    scale, service area), the optional positioning blurb in italics between
    rules, then one ``##`` section per characteristics group. An empty item
    value is shown as an em dash.
    """
    known_labels = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = known_labels[v.crop] if v.crop in known_labels else v.crop.title()

    lines: list[str] = []
    lines.append(f"# {v.product_name}")
    lines.append("")
    lines.append("- **Vendor:** ProHarvest Seeds (independent regional brand)")
    lines.append("- **Brand:** ProHarvest Seeds")
    lines.append(f"- **Crop:** {crop_label}")

    # Only the maturity fact that matches the crop is ever printed.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            lines.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_maturity:
            lines.append(f"- **Maturity:** {v.wheat_maturity}")

    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.append(f"- **Source:** {v.source_url}")
    lines.append(f"- **Rating scale:** {RATING_SCALE_DIRECTION}")
    lines.append("- **Service area:** Independent dealer network — Eastern/Central Corn Belt (IL/IN/OH/MO/IA/KS/NE)")
    lines.append("")

    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    lines.extend(["---", ""])

    for group in v.groups:
        lines.append(f"## {group['label'].title()}")
        lines.append("")
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)
def write_variety(v: PHVariety, body_md: str) -> None:
    """Write a variety's JSON sidecar and markdown body into ``CORPUS_DIR``.

    The sidecar is written first and the markdown last: ``run`` treats an
    existing ``.md`` as "already scraped", so the markdown must only appear
    once its sidecar is on disk. Otherwise an interrupted run could leave a
    variety that is skipped forever without a sidecar.

    Args:
        v: Parsed variety.
        body_md: Rendered markdown for ``v``.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    # Wheat maturity is qualitative; surface it through relative_maturity
    # (as a string) so the chunker's wheat "Maturity {rm}" branch renders it.
    relative_maturity = v.relative_maturity
    if v.crop == "wheat" and v.wheat_maturity:
        relative_maturity = v.wheat_maturity

    sidecar = {
        "source": "proharvest",
        "source_key": v.source_key,
        "vendor": "ProHarvest Seeds",
        "brand": "ProHarvest Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "ProHarvest dealer network (Eastern/Central Corn Belt — IL/IN/OH/MO/IA/KS/NE)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    # Markdown goes last: its presence marks the variety as complete.
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Enumerate, fetch, parse and write ProHarvest varieties.

    Args:
        limit: Stop after this many records have been considered (skipped
            and failed ones count too); ``None`` means no limit.
        force: Re-fetch a variety even if its markdown file already exists.
        only_crop: Restrict discovery to one chunker crop value.
        only_product: Restrict to one variety, given as source_key or slug.

    Returns:
        ``2`` when ``only_product`` matches nothing, otherwise ``0``.
        Per-variety fetch failures are logged and counted but do not change
        the return value.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "seed-trait")
    records = discover(http, only_crop=only_crop)
    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"proharvest-{r['slug'].lower()}" == key
                   or r["slug"].lower() == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"proharvest-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names)
        except requests.RequestException as exc:
            # RequestException covers HTTPError as well as connection errors
            # and timeouts, so one unreachable page cannot abort the whole
            # run once the session's retries are exhausted.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, len(records), source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Pick the first maturity value that is actually set; an ``or`` chain
        # would hide a legitimate 0 / 0.0.
        maturity = next(
            (m for m in (v.relative_maturity, v.maturity_group, v.wheat_maturity)
             if m is not None),
            "-",
        )
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 maturity, len(v.groups), ",".join(v.trait_stack) or "-")
    log.info("done: processed=%d written=%d skipped=%d failed=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["failed"],
             counts["empty"], len(records))
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper.

    Options: ``--limit``, ``--force``, ``--crop`` (restricted to the crop
    values of ``CROP_TYPES``), ``--product`` and ``--log-level`` (defaults
    to the ``LOG_LEVEL`` environment variable, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.proharvest",
        description="Scrape ProHarvest Seeds (independent Corn Belt brand) — "
                    "corn / soybeans / wheat via the WP REST API + detail pages.",
    )
    crop_choices = sorted(set(CROP_TYPES.values()))
    default_level = os.environ.get("LOG_LEVEL", "INFO")

    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop", default=None, choices=crop_choices,
        help="Limit to one crop (corn / soybeans / wheat).",
    )
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.",
    )
    parser.add_argument("--log-level", default=default_level)
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging on stderr, run the scrape.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by ``run``.
    """
    parser = _build_argparser()
    options = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=options.log_level.upper(),
    )
    return run(
        limit=options.limit,
        force=options.force,
        only_crop=options.crop,
        only_product=options.product,
    )


if __name__ == "__main__":
    raise SystemExit(main())