22e8092faf
Image rebuild (skip scrape) / build (push) Successful in 5m46s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
547 lines
20 KiB
Python
547 lines
20 KiB
Python
"""ProHarvest Seeds scraper — independent regional brand (Hindsboro, IL).
|
|
|
|
Source: ``proharvestseeds.com`` — WordPress site exposing a public,
|
|
no-auth REST API. robots.txt is permissive (only ``/?s=``, ``/search/``,
|
|
``/dealer-files/*``, ``/dealer-section/*`` disallowed; the catalog +
|
|
``/wp-json/`` are open). Independent family-owned seed company; corn /
|
|
soybeans / wheat (plus forage / cover-crop lines that are out of scope
|
|
for the row-crop advisor).
|
|
|
|
Two-step ingestion:
|
|
|
|
1. **Enumerate** via the WP REST API. ``/wp/v2/seed`` is the variety
|
|
custom-post-type; ``/wp/v2/seed-type`` is the crop taxonomy. We pull
|
|
every variety whose seed-type is one of the row-crop terms
|
|
(corn-hybrid / soybean / wheat) — ignoring alfalfa / forage / grass /
|
|
cover-crop / sweet-corn terms. The REST payload gives the canonical
|
|
id / slug / title / permalink, but ``acf`` and ``content`` are NOT
|
|
registered to REST (both come back empty), so the ratings have to
|
|
come from the detail page.
|
|
|
|
2. **Parse the detail page.** Each ``/seed/<slug>/`` page server-renders
|
|
the agronomic data as ``<h2>`` spec sections, each a flat run of
|
|
``<strong>label</strong><div>value</div>`` pairs (General
|
|
Characteristics / Agronomic Features / Disease Tolerance / Soil
|
|
Adaptability / Nitrogen Application/Timing / Recommended Seeding
|
|
Rates). The relative maturity sits in an ``<h1>Maturity: 111
|
|
Days</h1>`` heading.
|
|
|
|
Rating scales are **mixed** and preserved verbatim (the chunker never
|
|
fabricates a value):
|
|
- Disease Tolerance: **1-9 numeric** (9 = best / most tolerant, per
|
|
industry norm; ``NA`` = not rated). Direction is the same as
|
|
Bayer/NK so no flip is needed.
|
|
- General Characteristics / Agronomic Features: **qualitative**
|
|
(Excellent / Very Good / Good / Average / …) with a few raw numerics
|
|
(GDD, Kernel Rows).
|
|
- Soil Adaptability: ``HR`` (highly recommended) / ``R`` (recommended)
|
|
/ etc.
|
|
|
|
Unlike the Ebbert's scraper (which left ``characteristics_groups`` empty
|
|
and relied on a verbatim body), we parse the spec sections into
|
|
structured ``characteristics_groups`` so the qualitative + numeric
|
|
ratings land in the embedded chunk and are actually retrievable.
|
|
|
|
Output:
|
|
corpus/proharvest/<source_key>.md
|
|
corpus/proharvest/<source_key>.json
|
|
|
|
source_key: ``proharvest-<slug>`` lowercased, e.g. ``proharvest-81p11``.
|
|
|
|
CLI:
|
|
python -m scrape.sources.proharvest --crop corn --limit 5
|
|
python -m scrape.sources.proharvest --force
|
|
python -m scrape.sources.proharvest --product proharvest-81p11
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
|
|
# Version stamp written into every sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# User-Agent header set on the shared requests session.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site origin, and the WordPress REST v2 root derived from it.
BASE = "https://proharvestseeds.com"
WP = f"{BASE}/wp-json/wp/v2"

# seed-type taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. Everything
# not listed here (alfalfa / forage / grass / cover-crop / sweet-corn /
# blends) is out of scope for the row-crop advisor.
CROP_TYPES = {
    "corn-hybrid": "corn",
    "soybean": "soybeans",
    "wheat": "wheat",
}

# robots.txt declares no Crawl-delay for "*", but we stay polite — the
# row-crop catalog is only ~120 detail pages.
# Minimum spacing, in seconds, between consecutive requests.
REQ_INTERVAL_SEC = 1.5

# Human-readable description of the mixed rating scales. It is emitted
# verbatim in the markdown header ("Rating scale") and in the sidecar
# ("_scale_direction").
RATING_SCALE_DIRECTION = (
    "disease 1-9, 9=best/most-tolerant, NA=not rated; "
    "agronomic/general qualitative (Excellent/Very Good/Good/Average); "
    "soil HR=highly recommended/R=recommended"
)

# Detail-page <h2> spec sections we extract, in display order. The
# value maps the page header to a characteristics_groups label the
# chunker buckets: DISEASE RATINGS -> disease framing, AGRONOMIC
# CHARACTERISTICS -> agronomic framing; the rest pass through verbatim
# as their own titled section (still embedded + retrievable).
# Lookup is an exact match on the whitespace-normalised <h2> text.
SPEC_SECTIONS = {
    "General Characteristics": "GENERAL CHARACTERISTICS",
    "Agronomic Features": "AGRONOMIC CHARACTERISTICS",
    "Disease Tolerance": "DISEASE RATINGS",
    "Soil Adaptability": "SOIL ADAPTABILITY",
    "Nitrogen Application/Timing": "NITROGEN APPLICATION/TIMING",
    "Recommended Seeding Rates": "RECOMMENDED SEEDING RATES",
}

# Two directory levels above this file's directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
# The CORPUS_ROOT environment variable overrides the default
# <REPO_ROOT>/corpus; an unset or empty variable falls back to the default.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory holding this source's <source_key>.md / <source_key>.json pairs.
CORPUS_DIR = CORPUS_ROOT / "proharvest"

# Module logger; handlers and level are configured in main().
log = logging.getLogger("scrape.proharvest")
|
|
|
|
|
|
# --------------------------------------------------------------------- HTTP
|
|
|
|
|
|
class RateLimitedSession:
    """Polite ``requests`` session: fixed request spacing plus retry/backoff.

    ProHarvest's row-crop catalog is small (~120 detail pages) so
    1.5 s/req still finishes in a few min.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        ``interval`` is the minimum number of seconds between the start
        of two consecutive requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying on network errors and on HTTP 429 / 5xx.

        At least one attempt is always made, even when ``max_retries`` is
        zero or negative. Between attempts the delay is the server's
        ``Retry-After`` value (integer seconds only) when present, otherwise
        exponential backoff with jitter capped at 30 s. No delay is taken
        after the final attempt.

        Returns the first response that is not 429 / 5xx. When every
        attempt is used up, the outcome of the *final* attempt decides:
        its response is returned (callers check the status themselves),
        or its ``requests.RequestException`` is re-raised.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Forget any earlier response: this attempt is the latest outcome.
                last_exc, resp = exc, None
                if final:
                    log.warning("network error on %s %s: %s — giving up after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so an exception from an earlier attempt is stale.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured; an
                # HTTP-date value falls back to exponential backoff.
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET ``url``, raise on a 4xx/5xx status, and return the decoded JSON body."""
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()
|
|
|
|
|
|
# --------------------------------------------------------------------- model
|
|
|
|
|
|
@dataclass
class PHVariety:
    """One scraped variety: REST identity plus the parsed detail-page data."""

    source_key: str  # "proharvest-<slug>", lowercased
    source_url: str  # detail-page URL the data was parsed from
    crop: str  # chunker value: corn / soybeans / wheat
    product_name: str = ""  # "81P11"
    relative_maturity: int | None = None  # corn (days)
    maturity_group: float | None = None  # soy
    wheat_maturity: str | None = None  # wheat qualitative
    trait_stack: list[str] = field(default_factory=list)  # seed-trait term names
    positioning: str | None = None  # best-effort marketing blurb, if one was found
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
|
|
|
|
|
|
# --------------------------------------------------------------------- discovery (REST)
|
|
|
|
|
|
def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Return ``term_id -> display name`` for a WP taxonomy (paged).

    Pages of 100 terms are requested until a short or empty page is
    returned, or the API answers HTTP 400, which is treated as "past the
    last page". Any other error status raises ``requests.HTTPError``.

    The REST ``name`` field is HTML, so character references
    (``&amp;``, ``&reg;``, ``&#8211;`` …) are decoded to plain text before
    the name is stored. A term without a name falls back to its slug,
    then to its id.
    """
    import html  # stdlib; imported locally so the module import block is untouched

    out: dict[int, str] = {}
    page = 1
    while True:
        url = f"{WP}/{taxonomy}?per_page=100&page={page}&_fields=id,name,slug"
        r = http.get(url)
        if r.status_code == 400:  # past last page
            break
        r.raise_for_status()
        terms = r.json()
        if not terms:
            break
        for t in terms:
            raw_name = t.get("name") or t.get("slug") or str(t["id"])
            out[t["id"]] = html.unescape(raw_name)
        if len(terms) < 100:
            # A short page is the last one; skip the extra round-trip.
            break
        page += 1
    return out
|
|
|
|
|
|
def _type_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``slug -> term_id`` for the ``seed-type`` taxonomy.

    A single request of up to 100 terms is made; no paging is done here.
    """
    terms = http.get_json(f"{WP}/seed-type?per_page=100&_fields=id,slug")
    return {term["slug"]: term["id"] for term in terms}
|
|
|
|
|
|
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return REST seed records for the in-scope row crops.

    Each record is the REST dict (id / slug / title / link / seed-trait)
    with an added ``"_crop"`` key holding the chunker crop value. When
    ``only_crop`` is given, only seed-types mapping to that crop are
    enumerated. A seed-type slug missing from the taxonomy is logged and
    skipped.
    """
    type_ids = _type_slug_to_id(http)
    selected = [
        (slug, crop_value)
        for slug, crop_value in CROP_TYPES.items()
        if not only_crop or crop_value == only_crop
    ]
    records: list[dict] = []
    for type_slug, crop in selected:
        tid = type_ids.get(type_slug)
        if tid is None:
            log.warning("seed-type %r not found in taxonomy — skipping", type_slug)
            continue
        page = 0
        while True:
            page += 1
            url = (f"{WP}/seed?seed-type={tid}&per_page=100&page={page}"
                   "&_fields=id,slug,title,link,seed-trait")
            resp = http.get(url)
            if resp.status_code == 400:
                # Treated as "requested a page past the end".
                break
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            for seed in batch:
                seed["_crop"] = crop
            records.extend(batch)
            if len(batch) < 100:
                # Short page: nothing further to fetch for this seed-type.
                break
        log.info("seed-type %-12s (%s): cumulative %d", type_slug, crop, len(records))
    return records
|
|
|
|
|
|
# --------------------------------------------------------------------- detail parse
|
|
|
|
|
|
# First run of digits, with an optional decimal part, e.g. the "111" in
# "Maturity: 111 Days". Used by _parse_maturity on the heading text.
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
|
|
|
|
|
|
def _clean(s: str) -> str:
|
|
return re.sub(r"\s+", " ", s or "").strip()
|
|
|
|
|
|
def _direct_text(el: Tag) -> str:
    """Return the whitespace-normalised text that is a direct child of ``el``.

    Text inside nested tags is ignored, so a wrapper element whose text
    lives only in its children yields ``""``. HTML comments are skipped
    too: bs4's ``Comment`` is a ``NavigableString`` subclass, so a plain
    ``isinstance(..., NavigableString)`` filter would leak comment text
    into the value. Skipping them matches ``Tag.get_text()``, which also
    omits comments.
    """
    from bs4 import Comment  # local import: the module import block is left untouched

    own_strings = [
        child for child in el.children
        if isinstance(child, NavigableString) and not isinstance(child, Comment)
    ]
    return _clean("".join(own_strings))
|
|
|
|
|
|
def _parse_maturity(soup: BeautifulSoup, crop: str) -> tuple[int | None, float | None, str | None]:
    """Read maturity from the first h1/h2/h3 whose text starts with "Maturity".

    Returns ``(rm, mg, wheat_maturity)`` with at most one slot filled:
    corn gets the first number truncated to an int, soybeans get it as a
    float, and any other crop gets the text after the first ":" (or the
    whole heading when there is no colon). All three are ``None`` when no
    such heading exists.
    """
    heading_texts = (
        h.get_text(" ", strip=True) for h in soup.find_all(["h1", "h2", "h3"])
    )
    head = next(
        (text for text in heading_texts if re.match(r"^Maturity\b", text, re.I)),
        None,
    )
    if not head:
        return None, None, None

    if crop in ("corn", "soybeans"):
        found = _MATURITY_RE.search(head)
        number = float(found.group(1)) if found else None
        if crop == "corn":
            return (None if number is None else int(number)), None, None
        return None, number, None

    # wheat — keep the qualitative phrase after "Maturity:"
    _, colon, tail = head.partition(":")
    phrase = tail.strip() if colon else head
    return None, None, (phrase or None)
|
|
|
|
|
|
def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Turn each recognised spec ``<h2>`` into ``{label, items}``.

    A section is every element after its ``<h2>`` up to the next ``<h2>``.
    Within it, non-empty ``<strong>`` text is a characteristic name and
    the non-empty direct text of a ``<div>`` is a value. A name is paired
    with the value that immediately follows it; a name with no such value
    gets ``""``. Values not preceded by a name are ignored. Sections that
    produce no items are omitted.
    """
    groups: list[dict] = []
    for heading in soup.find_all("h2"):
        label = SPEC_SECTIONS.get(_clean(heading.get_text(" ", strip=True)))
        if not label:
            continue

        # Flatten the section into ("k", name) / ("v", value) tokens.
        tokens: list[tuple[str, str]] = []
        for node in heading.find_all_next():
            if node.name == "h2":
                break
            if not isinstance(node, Tag):
                continue
            if node.name == "strong":
                kind, text = "k", _clean(node.get_text(" ", strip=True))
            elif node.name == "div":
                kind, text = "v", _direct_text(node)
            else:
                continue
            if text:
                tokens.append((kind, text))

        # Each name looks one token ahead for its value.
        items: list[dict] = []
        for pos, (kind, text) in enumerate(tokens):
            if kind != "k":
                continue
            following = tokens[pos + 1] if pos + 1 < len(tokens) else None
            has_value = following is not None and following[0] == "v"
            items.append({
                "characteristic": text,
                "value": following[1] if has_value else "",
            })

        if items:
            groups.append({"label": label, "items": items})
    return groups
|
|
|
|
|
|
def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Best-effort marketing/positioning blurb.

    Scans forward from the first ``<h1>`` and returns the first ``<p>``
    whose cleaned text is at least 40 characters long. The scan stops at
    the first ``<h2>``. Returns ``None`` when there is no ``<h1>`` or no
    qualifying paragraph.
    """
    title = soup.find("h1")
    if not title:
        return None
    for node in title.find_all_next():
        if node.name == "h2":
            return None
        if not (isinstance(node, Tag) and node.name == "p"):
            continue
        blurb = _clean(node.get_text(" ", strip=True))
        if len(blurb) >= 40:
            return blurb
    return None
|
|
|
|
|
|
def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str]) -> PHVariety:
    """Fetch one variety's detail page and build its ``PHVariety``.

    ``rec`` is a record from ``discover`` (it must carry ``"_crop"`` and
    ``"slug"``). ``trait_names`` maps seed-trait term ids to names; ids
    missing from the map are dropped.

    The product name comes from ``title.rendered``, which the REST API
    serves as HTML. Character references (``&amp;``, ``&#8211;`` …) are
    decoded so the name stored in the corpus is plain text. An empty
    title falls back to the upper-cased slug.

    Raises ``requests.HTTPError`` when the detail page answers with an
    error status.
    """
    import html  # stdlib; imported locally so the module import block is untouched

    crop = rec["_crop"]
    slug = rec["slug"]
    url = rec.get("link") or f"{BASE}/seed/{slug}/"
    rendered_title = (rec.get("title") or {}).get("rendered", "") or ""
    name = _clean(html.unescape(rendered_title)) or slug.upper()
    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    rm, mg, wheat_mat = _parse_maturity(soup, crop)
    groups = _parse_groups(soup)
    positioning = _parse_positioning(soup)
    traits = [trait_names[t] for t in (rec.get("seed-trait") or []) if t in trait_names]

    return PHVariety(
        source_key=f"proharvest-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        wheat_maturity=wheat_mat,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )
|
|
|
|
|
|
# --------------------------------------------------------------------- render
|
|
|
|
|
|
def render_markdown(v: PHVariety) -> str:
    """Render the markdown body for one variety.

    Layout: title, an identity bullet list (vendor, brand, crop, the
    crop-specific maturity line when known, traits, source, rating scale,
    service area), the optional italic positioning blurb between rules,
    then one ``##`` section per characteristics group. An empty value is
    shown as an em dash.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** ProHarvest Seeds (independent regional brand)",
        "- **Brand:** ProHarvest Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # At most one maturity line, chosen by crop.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            lines.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_maturity:
            lines.append(f"- **Maturity:** {v.wheat_maturity}")

    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Independent dealer network — Eastern/Central Corn Belt (IL/IN/OH/MO/IA/KS/NE)",
        "",
    ])

    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    lines.extend(["---", ""])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)
|
|
|
|
|
|
def write_variety(v: PHVariety, body_md: str) -> None:
    """Write ``<source_key>.json`` and ``<source_key>.md`` into CORPUS_DIR.

    The JSON sidecar is written first and the markdown last. ``run()``
    treats an existing ``.md`` as "already scraped", so writing it last
    means an interruption between the two writes cannot leave a variety
    that is skipped on the next run but has no sidecar.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    # For wheat, surface the qualitative maturity through relative_maturity
    # (as a string) so the chunker's wheat "Maturity {rm}" branch renders it.
    relative_maturity: int | str | None = v.relative_maturity
    if v.crop == "wheat" and v.wheat_maturity:
        relative_maturity = v.wheat_maturity

    sidecar = {
        "source": "proharvest",
        "source_key": v.source_key,
        "vendor": "ProHarvest Seeds",
        "brand": "ProHarvest Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "ProHarvest dealer network (Eastern/Central Corn Belt — IL/IN/OH/MO/IA/KS/NE)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    # Markdown goes last: its presence is the skip sentinel checked by run().
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")
|
|
|
|
|
|
# --------------------------------------------------------------------- pipeline
|
|
|
|
|
|
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties, scrape each detail page, and write the corpus files.

    ``limit`` caps how many records are looked at (skipped ones count).
    ``force`` re-fetches varieties whose markdown already exists.
    ``only_crop`` restricts discovery to one crop, and ``only_product``
    keeps a single variety matched by source_key or slug.

    Returns 0 on completion, or 2 when ``only_product`` matches nothing.

    A failure while fetching one detail page is logged and that variety
    is skipped. This covers HTTP error statuses and also connection
    errors or timeouts that the session re-raises after its retries, so
    one unreachable page does not abort the whole crawl. Failures during
    discovery still propagate.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "seed-trait")
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"proharvest-{r['slug'].lower()}" == key
                   or r["slug"].lower() == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"proharvest-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names)
        except requests.RequestException as exc:
            # RequestException is the base of HTTPError, ConnectionError and
            # Timeout, so both bad statuses and transport failures land here.
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, len(records), source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 v.relative_maturity or v.maturity_group or v.wheat_maturity or "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"], len(records))
    return 0
|
|
|
|
|
|
# --------------------------------------------------------------------- CLI
|
|
|
|
|
|
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ProHarvest scraper."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.proharvest",
        description="Scrape ProHarvest Seeds (independent Corn Belt brand) — "
                    "corn / soybeans / wheat via the WP REST API + detail pages.",
    )
    crop_choices = sorted(set(CROP_TYPES.values()))
    # (flag, add_argument keyword arguments), in the order they are registered.
    options: list[tuple[str, dict[str, Any]]] = [
        ("--limit", {"type": int, "default": None,
                     "help": "Stop after processing N varieties (default: all)."}),
        ("--force", {"action": "store_true",
                     "help": "Re-fetch even if the markdown file already exists."}),
        ("--crop", {"default": None, "choices": crop_choices,
                    "help": "Limit to one crop (corn / soybeans / wheat)."}),
        ("--product", {"default": None,
                       "help": "Process a single variety by source_key or slug."}),
        ("--log-level", {"default": os.environ.get("LOG_LEVEL", "INFO")}),
    ]
    for flag, kwargs in options:
        parser.add_argument(flag, **kwargs)
    return parser
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, configure logging, run the scrape.

    ``argv`` defaults to the process arguments when ``None``. Returns the
    exit code produced by ``run``.
    """
    parser = _build_argparser()
    args = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=args.log_level.upper(),
    )
    exit_code = run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
    return exit_code
|
|
|
|
|
|
# Script entry: exit the process with main()'s return value as the status.
if __name__ == "__main__":
    raise SystemExit(main())
|