Add 4 independent seed brands: Latham + Stine + 1st Choice + Burrus (+623 varieties) (#17)
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
This commit was merged in pull request #17.
This commit is contained in:
@@ -0,0 +1,561 @@
|
||||
"""Burrus Seed scraper — independent family-owned company (Arenzville, IL).
|
||||
|
||||
Source: Burrus Hybrids ("Burrus Seed"), an independent family company
|
||||
founded **1935** in Arenzville, Illinois — NOT owned by any of the
|
||||
multinationals (Bayer / Corteva / Syngenta / BASF). It markets corn under
|
||||
the **Burrus** and **Power Plus** brands and soybeans under the **Burrus**
|
||||
and **DONMARIO** brands, sold through a dealer network across IL / IN / IA
|
||||
/ MO / WI.
|
||||
|
||||
Unlike the ProHarvest scraper (which parses HTML detail pages), Burrus
|
||||
publishes its full agronomic dataset through the **Seedware** catalog
|
||||
widget's JSON-over-JSONP API (the backend for the product finder on
|
||||
``burrusseed.com/products/{corn,soybeans}``). So this scraper does TWO
|
||||
list calls and maps JSON fields straight into ``characteristics_groups``;
|
||||
there is no per-variety page fetch.
|
||||
|
||||
Seedware API
|
||||
------------
|
||||
``GET https://burrus25.seedware.net/app/_queries/crop_varieties.php
|
||||
?crop_pkey=101&callback=cb`` -> CORN (JSONP)
|
||||
``crop_pkey=102`` -> SOYBEANS
|
||||
|
||||
Both require:
|
||||
* a ``callback`` query param (WITHOUT it the endpoint returns ``[]``),
|
||||
* a ``Referer: https://burrusseed.com/`` header.
|
||||
The response is ``cb([...]);`` — strip the JSONP wrapper to get a JSON
|
||||
array of ~38 corn + ~26 soy records. Each record has ~44 fields:
|
||||
``id`` (variety code, e.g. ``8J697AM``), ``description`` (brand + code,
|
||||
e.g. ``Power Plus 8J697AM``), ``pkey`` (Seedware row id), ``maturity``
|
||||
(RM for corn / MG for soy, as a string like ``"97.00"`` / ``"2.00"``),
|
||||
``released`` (year int), ``trait`` / ``trait_platform``, a per-record
|
||||
brand in ``stat_corn_brand`` / ``stat_soybean_brand``, and many
|
||||
``stat_*`` agronomic / disease / herbicide-tolerance ratings.
|
||||
|
||||
Rating scales (confirmed from the live data, Jun 2026)
|
||||
------------------------------------------------------
|
||||
* **Numeric agronomic + disease ratings: 1-10, 10 = best / most
|
||||
tolerant** (observed values 4-10; standard Seedware/seed-industry
|
||||
high-is-better scale). Soy agronomic stats arrive as ``"8.000"`` —
|
||||
  the trailing zeros are stripped to ``"8"``. ``NR`` / ``None`` /
  blank / ``-`` / ``0`` = not rated and are SKIPPED (never coerced to a
  value).
|
||||
* **Herbicide tolerance + insect-protection packages: Yes / No**
|
||||
(verbatim). ``glyphosate`` / ``glufosinate`` / ``2,4-D choline`` /
|
||||
``FOPs`` / ``dicamba`` tolerances and the Bt insect packages
|
||||
(corn borer / rootworm / etc.) are categorical Yes/No, not numeric.
|
||||
* **Categorical agronomic notes** (corn-on-corn suitability, refuge
|
||||
structure) pass through verbatim.
|
||||
|
||||
Output:
|
||||
corpus/burrus/<source_key>.md
|
||||
corpus/burrus/<source_key>.json
|
||||
|
||||
source_key: ``burrus-<id>`` lowercased + slugified, e.g.
|
||||
``burrus-8j697am``. The variety ``id`` (the catalog code) is stable.
|
||||
|
||||
CLI:
|
||||
python -m scrape.sources.burrus --crop corn --limit 2 --force
|
||||
python -m scrape.sources.burrus --crop soybeans
|
||||
python -m scrape.sources.burrus --force
|
||||
python -m scrape.sources.burrus --product burrus-8j697am
|
||||
|
||||
ROBOTS / UA: burrusseed.com robots.txt blocks ~33 NAMED AI/scraper bots
|
||||
(Scrapy, CCBot, Bytespider, Diffbot, ...) and declares ``Crawl-delay: 10``
|
||||
+ ``Content-signal: ai-train=no``; ``User-agent: *`` is allowed. The
|
||||
operator has chosen to include this source. We use a non-blacklisted UA
|
||||
and honour the 10-second crawl delay (the API call count is tiny — two
|
||||
list calls — so this is cheap).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
|
||||
# Version stamp recorded in every sidecar's "scraper_version" field.
SCRAPER_VERSION = "0.1.0"
# NOT any blacklisted bot name — robots.txt allows User-agent: *.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Seedware host that backs the product finder, and the variety-list endpoint
# on it (see the "Seedware API" section of the module docstring).
SEEDWARE = "https://burrus25.seedware.net"
API = f"{SEEDWARE}/app/_queries/crop_varieties.php"
# Public site: SITE is the base for source URLs; REFERER is sent on every
# request because the API requires it.
SITE = "https://burrusseed.com"
REFERER = "https://burrusseed.com/"

# crop name (the chunker crop value) -> (Seedware crop_pkey, public product
# page slug under SITE/products/).
CROP_PKEYS = {
    "corn": (101, "corn"),
    "soybeans": (102, "soybeans"),
}

# robots.txt declares Crawl-delay: 10 for burrusseed.com / seedware.net.
# Honour it — the catalog is only two list calls so this is cheap.
REQ_INTERVAL_SEC = 10.0

# Human-readable description of the rating scales; written verbatim into the
# markdown header and into the sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = (
    "numeric agronomic + disease ratings 1-10, 10=best/most-tolerant "
    "(observed 4-10; higher is better); NR/blank/0/'-' = not rated (omitted). "
    "Herbicide tolerances and Bt insect-protection packages are Yes/No "
    "(verbatim, not numeric). Corn-on-corn suitability and refuge structure "
    "are categorical."
)
|
||||
|
||||
# ----- stat_* field -> (group label, human characteristic name) -----------
#
# Each dict below maps a Seedware ``stat_*`` record key to the display name
# used for that characteristic; GROUP_ORDER attaches the group label.
#
# Group labels match the chunker's buckets in rag/chunk.py:
#   "DISEASE RATINGS"            -> disease framing
#   "AGRONOMIC CHARACTERISTICS"  -> agronomic framing
#   "HERBICIDE TOLERANCE"        -> falls into the chunker's MANAGEMENT
#                                   bucket ("HERBICIDE" is a recognised label),
#                                   so it renders as "Management notes".
# Fields intentionally NOT mapped: stat_corn_brand / stat_soybean_brand
# (used for the per-record brand), stat_herbicide_tolerance (always blank
# in the live data — the per-chemistry stats carry the real signal).

DISEASE_FIELDS = {
    # corn
    "stat_gray_leaf_spot_tolerance": "Gray leaf spot tolerance",
    "stat_tar_spot_tolerance": "Tar spot tolerance",
    # soy
    "stat_brown_stem_rot": "Brown stem rot (BSR) tolerance",
    "stat_sds": "Sudden death syndrome (SDS) tolerance",
    "stat_phytophthora_root_rot": "Phytophthora root rot tolerance",
    "stat_prr_phytophthora_root_rot": "Phytophthora root rot (PRR) tolerance",
}

# Agronomic ratings — numeric 1-10 (corn) and "8.000"-style (soy).
AGRONOMIC_NUMERIC_FIELDS = {
    # corn
    "stat_drought_tolerance": "Drought tolerance",
    "stat_greensnap_tolerance": "Greensnap tolerance",
    "stat_root_strength": "Root strength",
    "stat_stalk_strength": "Stalk strength",
    "stat_standability": "Standability",
    "stat_black_cutworm": "Black cutworm tolerance",
    # soy
    "stat_emergence": "Emergence",
    "stat_canopy_width": "Canopy width",
    "stat_plant_height": "Plant height",
}

# Agronomic categorical / Yes-No notes (insect protection + placement).
AGRONOMIC_CATEGORICAL_FIELDS = {
    "stat_corn_corn": "Corn-on-corn suitability",
    "stat_refuge": "Refuge structure",
    "stat_corn_borer": "Corn borer protection (Bt)",
    "stat_corn_rootworm": "Corn rootworm protection (Bt)",
    "stat_corn_earworm": "Corn earworm protection (Bt)",
    "stat_nematode": "Nematode protection",
    "stat_wireworm": "Wireworm protection",
}

# Herbicide tolerance — Yes/No per chemistry.
HERBICIDE_FIELDS = {
    "stat_glyphosate_tolerance": "Glyphosate tolerance",
    "stat_glufosinate_tolerance": "Glufosinate tolerance",
    "stat_24d_choline_tolerance": "2,4-D choline tolerance",
    "stat_dicamba_tolerance": "Dicamba tolerance",
    "stat_fops_tolerance": "FOPs (fop herbicide) tolerance",
}

# (group label, field map) pairs in output order. The agronomic group merges
# the numeric and categorical maps, numeric entries first.
GROUP_ORDER = [
    ("DISEASE RATINGS", DISEASE_FIELDS),
    ("AGRONOMIC CHARACTERISTICS", {**AGRONOMIC_NUMERIC_FIELDS,
                                   **AGRONOMIC_CATEGORICAL_FIELDS}),
    ("HERBICIDE TOLERANCE", HERBICIDE_FIELDS),
]

# Values that mean "not rated" — never coerced into a chunk. Compared against
# the stripped, lower-cased string form of a stat value.
_NOT_RATED = {"", "-", "--", "n/a", "na", "nr", "none", "0", "0.000", "0.00"}
|
||||
|
||||
# Two directory levels above this file's own directory. NOTE(review): given
# the ``python -m scrape.sources.burrus`` invocation this should be the
# repository root — confirm against the checkout layout.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus location: the CORPUS_ROOT environment variable wins when set and
# non-empty; otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# All Burrus .md / .json outputs are written directly into this directory.
CORPUS_DIR = CORPUS_ROOT / "burrus"

log = logging.getLogger("scrape.burrus")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- HTTP
|
||||
|
||||
|
||||
class RateLimitedSession:
    """Polite ``requests`` session with request spacing and retry/backoff.

    Honours burrusseed.com's Crawl-delay: 10 (>=10 s between requests to
    seedware.net / burrusseed.com). The Burrus catalog is two list calls
    total.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the session and install the User-Agent / Referer / Accept
        headers sent on every request.

        ``interval`` is the minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.s.headers["Referer"] = REFERER
        self.s.headers["Accept"] = "*/*"
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 = none sent yet.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last
        request, then stamp the current time. The first call never sleeps."""
        delta = time.monotonic() - self._last
        if self._last and delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors and HTTP 429 / 5xx.

        Makes up to ``max_retries`` attempts. Between attempts it sleeps for
        the ``Retry-After`` value (when the header is a plain integer) or an
        exponential backoff capped at 30 s with jitter. No backoff sleep is
        taken after the final attempt, because nothing follows it.

        Returns the first non-retryable response, or the last retryable
        response once attempts are exhausted (callers are expected to call
        ``raise_for_status``).

        Raises:
            ValueError: if ``max_retries`` is less than 1.
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(max_retries):
            final = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if final:
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so any earlier network error is stale and
            # must not be re-raised in place of this more recent outcome.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, max_retries)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
|
||||
|
||||
|
||||
def _strip_jsonp(text: str) -> Any:
|
||||
"""Strip a ``cb( ... );`` JSONP wrapper and parse the JSON inside."""
|
||||
s = text.strip()
|
||||
m = re.match(r"^[^(]*\((.*)\)\s*;?\s*$", s, re.S)
|
||||
body = m.group(1) if m else s
|
||||
return json.loads(body)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- model
|
||||
|
||||
|
||||
@dataclass
class BurrusVariety:
    """One catalog variety, as produced by ``map_record`` from a Seedware
    record and consumed by ``render_markdown`` / ``write_variety``."""

    source_key: str           # "burrus-<slug>"; also the output file stem
    crop: str                 # chunker value: corn / soybeans
    product_name: str         # "Power Plus 8J697AM"
    product_id: str           # "8J697AM"
    brand: str                # "Burrus" | "Power Plus" | "DONMARIO"
    relative_maturity: int | None = None    # set for corn only
    maturity_group: float | None = None     # set for non-corn crops (soybeans)
    release_year: int | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None          # map_record always leaves this None
    # [{"label": ..., "items": [{"characteristic": ..., "value": ...}]}]
    groups: list[dict] = field(default_factory=list)
    source_url: str = ""                    # public product listing page for the crop
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- fetch
|
||||
|
||||
|
||||
def fetch_crop(http: RateLimitedSession, crop_pkey: int) -> list[dict]:
    """Download and decode the variety list for one Seedware ``crop_pkey``.

    Raises whatever ``raise_for_status`` raises on an HTTP error status, and
    ``ValueError`` when the decoded payload is not a JSON array.
    """
    response = http.get(f"{API}?crop_pkey={crop_pkey}&callback=cb")
    response.raise_for_status()
    payload = _strip_jsonp(response.text)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"unexpected payload for crop_pkey={crop_pkey}: "
                     f"{type(payload).__name__}")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- mapping
|
||||
|
||||
|
||||
def _slug(s: str) -> str:
|
||||
s = (s or "").strip().lower()
|
||||
s = re.sub(r"[^a-z0-9]+", "-", s)
|
||||
return re.sub(r"-+", "-", s).strip("-")
|
||||
|
||||
|
||||
def _is_rated(v: Any) -> bool:
    """Return True when a stat value carries an actual rating.

    ``None``, any placeholder listed in ``_NOT_RATED`` (compared after
    stripping and lower-casing), and any numeric zero count as "not rated".
    The numeric check covers spellings of zero that the literal set misses,
    such as ``"0.0"`` or a JSON float ``0.0``. Non-numeric text such as
    ``Yes`` / ``No`` is always treated as rated.
    """
    if v is None:
        return False
    text = str(v).strip().lower()
    if text in _NOT_RATED:
        return False
    try:
        return float(text) != 0.0
    except ValueError:
        # Categorical value (Yes / No / Suitable / ...): rated.
        return True
|
||||
|
||||
|
||||
def _clean_value(v: Any) -> str:
|
||||
"""Normalise a stat value for display. Numeric soy stats arrive as
|
||||
'8.000' — strip the trailing zeros to '8'. Everything else passes
|
||||
through verbatim (Yes / No / Suitable / Integrated refuge / ...)."""
|
||||
s = str(v).strip()
|
||||
# numeric like "8.000" / "8.00" / "97.00" -> "8" / "97"
|
||||
if re.fullmatch(r"-?\d+(?:\.\d+)?", s):
|
||||
f = float(s)
|
||||
return str(int(f)) if f == int(f) else (f"{f:g}")
|
||||
return s
|
||||
|
||||
|
||||
def _maturity(rec: dict, crop: str) -> tuple[int | None, float | None]:
|
||||
raw = rec.get("maturity")
|
||||
if raw is None or str(raw).strip() == "":
|
||||
return None, None
|
||||
try:
|
||||
f = float(str(raw).strip())
|
||||
except ValueError:
|
||||
return None, None
|
||||
if crop == "corn":
|
||||
return int(round(f)), None
|
||||
return None, round(f, 1)
|
||||
|
||||
|
||||
def _brand(rec: dict) -> str:
|
||||
"""Per-record brand. corn -> stat_corn_brand (Burrus / Power Plus);
|
||||
soy -> stat_soybean_brand (Burrus / DONMARIO). Falls back to the
|
||||
leading token of the description, else 'Burrus'."""
|
||||
b = rec.get("stat_corn_brand") or rec.get("stat_soybean_brand")
|
||||
if b and str(b).strip():
|
||||
return str(b).strip()
|
||||
desc = (rec.get("description") or "").strip()
|
||||
code = (rec.get("id") or "").strip()
|
||||
if desc and code and desc.lower().endswith(code.lower()):
|
||||
lead = desc[: len(desc) - len(code)].strip()
|
||||
if lead:
|
||||
return lead
|
||||
return "Burrus"
|
||||
|
||||
|
||||
def _traits(rec: dict) -> list[str]:
|
||||
out: list[str] = []
|
||||
for key in ("trait", "trait_platform"):
|
||||
v = rec.get(key)
|
||||
if v and str(v).strip():
|
||||
# strip stray trailing punctuation seen in the data
|
||||
# ("Conventional." / "AM`")
|
||||
t = str(v).strip().rstrip(".`")
|
||||
if t and t not in out:
|
||||
out.append(t)
|
||||
return out
|
||||
|
||||
|
||||
def _build_groups(rec: dict) -> list[dict]:
    """Build the ``characteristics_groups`` list for one record.

    Walks ``GROUP_ORDER``; each group keeps only the stats that are rated,
    with cleaned values, and a group with no rated stats is omitted.
    """
    built: list[dict] = []
    for label, fields in GROUP_ORDER:
        rated = [
            {"characteristic": human, "value": _clean_value(rec.get(stat_key))}
            for stat_key, human in fields.items()
            if _is_rated(rec.get(stat_key))
        ]
        if rated:
            built.append({"label": label, "items": rated})
    return built
|
||||
|
||||
|
||||
def map_record(rec: dict, crop: str) -> BurrusVariety:
    """Map one Seedware record onto a ``BurrusVariety``.

    The source key is derived from the variety ``id``; when that is missing
    it falls back to ``pkey-<pkey>`` and then to the description. ``id`` and
    ``description`` are coerced to ``str``, so a numeric value cannot crash
    the run, and a whitespace-only description no longer yields an empty
    product name.

    ``released`` is accepted as an ``int`` or an all-digit string. Booleans
    and anything else give ``release_year=None``.

    ``crop`` must be a key of ``CROP_PKEYS``.
    """
    code = str(rec.get("id") or "").strip()
    pkey = rec.get("pkey")
    description = str(rec.get("description") or "").strip()
    key_seed = code or (f"pkey-{pkey}" if pkey else description)
    source_key = f"burrus-{_slug(key_seed)}"
    name = (description or code or key_seed).strip()
    rm, mg = _maturity(rec, crop)
    page_slug = CROP_PKEYS[crop][1]

    released = rec.get("released")
    release_year: int | None = None
    if isinstance(released, bool):
        # bool is an int subclass; a JSON true/false is not a year.
        release_year = None
    elif isinstance(released, int):
        release_year = released
    elif isinstance(released, str) and released.strip().isdigit():
        release_year = int(released.strip())

    return BurrusVariety(
        source_key=source_key,
        crop=crop,
        product_name=name,
        product_id=code or name,
        brand=_brand(rec),
        relative_maturity=rm,
        maturity_group=mg,
        release_year=release_year,
        trait_stack=_traits(rec),
        # The Seedware records carry no marketing blurb; leave positioning
        # null rather than fabricate one.
        positioning=None,
        groups=_build_groups(rec),
        source_url=f"{SITE}/products/{page_slug}",
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- render
|
||||
|
||||
|
||||
def render_markdown(v: BurrusVariety) -> str:
    """Render one variety as a markdown document.

    Layout: an H1 title, a bullet list of identity facts (vendor, brand,
    crop, maturity, traits, release year, source, rating scale, service
    area), a horizontal rule, then one H2 section per characteristics group.
    """
    known_labels = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = known_labels[v.crop] if v.crop in known_labels else v.crop.title()

    lines: list[str] = []
    lines.append(f"# {v.product_name}")
    lines.append("")
    lines.append("- **Vendor:** Burrus Seed (Burrus Hybrids — independent family "
                 "company, Arenzville, IL, since 1935)")
    lines.append(f"- **Brand:** {v.brand}")
    lines.append(f"- **Crop:** {crop_label}")

    # Maturity is crop-specific: RM in days for corn, MG for soybeans.
    if v.relative_maturity is not None and v.crop == "corn":
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.maturity_group is not None and v.crop == "soybeans":
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    if v.release_year:
        lines.append(f"- **Released:** {v.release_year}")

    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Burrus dealer network "
        "(IL / IN / IA / MO / WI)",
        "",
        "---",
        "",
    ])

    for group in v.groups:
        lines += [f"## {group['label'].title()}", ""]
        lines += [
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        ]
        lines.append("")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def write_variety(v: BurrusVariety, body_md: str) -> None:
    """Write the JSON sidecar and the markdown body for one variety.

    Files go to ``CORPUS_DIR/<source_key>.json`` and ``.md``. The sidecar is
    written FIRST and the markdown LAST: ``run`` skips a variety whose ``.md``
    already exists, so the markdown must only appear once its sidecar is on
    disk. Otherwise an interrupted run would leave a variety permanently
    without JSON unless ``--force`` is used.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "source": "burrus",
        "source_key": v.source_key,
        "vendor": "Burrus Seed",
        "brand": v.brand,
        "product_name": v.product_name,
        "product_id": v.product_id,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Burrus dealer network (IL/IN/IA/MO/WI)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8")
    # Markdown last — its existence is the "already scraped" marker.
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- pipeline
|
||||
|
||||
|
||||
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Fetch, map and write the Burrus catalog.

    Args:
        limit: stop after processing this many varieties (``None`` = all).
        force: rewrite varieties whose markdown file already exists.
        only_crop: restrict to one key of ``CROP_PKEYS`` (``None`` = all).
        only_product: keep only the variety matching this source_key or id.

    Returns:
        0 on completion, or 2 when ``only_product`` matched nothing.

    A failed crop fetch is logged and skipped, not fatal. That includes
    network-level failures (connection errors, timeouts) that the session
    re-raises once its retries are exhausted.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    crops = [only_crop] if only_crop else list(CROP_PKEYS)
    records: list[tuple[str, dict]] = []
    for crop in crops:
        crop_pkey = CROP_PKEYS[crop][0]
        try:
            raw = fetch_crop(http, crop_pkey)
        except (requests.RequestException, ValueError) as exc:
            # RequestException is the base of HTTPError, ConnectionError and
            # Timeout; ValueError covers a malformed or non-list payload.
            log.error("fetch failed for crop=%s (pkey=%d): %s",
                      crop, crop_pkey, exc)
            continue
        log.info("crop=%-9s pkey=%d: %d records", crop, crop_pkey, len(raw))
        for rec in raw:
            if not isinstance(rec, dict):
                # map_record needs a JSON object; skip anything else rather
                # than crash the whole run on one bad element.
                log.warning("crop=%s: skipping non-object record %r", crop, rec)
                continue
            records.append((crop, rec))

    varieties = [map_record(rec, crop) for crop, rec in records]

    if only_product:
        key = only_product.lower()
        varieties = [v for v in varieties
                     if v.source_key == key or v.product_id.lower() == key
                     or _slug(v.product_id) == _slug(key)]
        if not varieties:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0}
    processed = 0
    total = len(varieties)
    for v in varieties:
        if limit is not None and processed >= limit:
            break
        processed += 1
        md_path = CORPUS_DIR / f"{v.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, v.source_key)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no rating groups (still writing identity)",
                        processed, total, v.source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks so a legitimate 0 (e.g. MG 0) is not logged
        # as "-".
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | brand=%s crop=%s rm/mg=%s groups=%d "
                 "traits=%s", processed, total, v.source_key, v.brand, v.crop,
                 "-" if maturity is None else maturity,
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             total)
    return 0
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- CLI
|
||||
|
||||
|
||||
def _build_argparser() -> argparse.ArgumentParser:
    """Construct the command-line parser for this scraper
    (--limit, --force, --crop, --product, --log-level)."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.burrus",
        description="Scrape Burrus Seed (independent family company, "
                    "Arenzville IL) — corn / soybeans via the Seedware "
                    "JSON-over-JSONP catalog API.",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-write even if the markdown file already exists.",
    )
    crop_choices = sorted(CROP_PKEYS)
    parser.add_argument(
        "--crop", default=None, choices=crop_choices,
        help="Limit to one crop (corn / soybeans).",
    )
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or id.",
    )
    # Default log level comes from the LOG_LEVEL environment variable.
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, configure logging to stderr, run the
    pipeline and return its exit code."""
    parser = _build_argparser()
    args = parser.parse_args(argv)
    level_name = args.log_level.upper()
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=level_name,
    )
    exit_code = run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
    return exit_code
|
||||
|
||||
|
||||
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,671 @@
|
||||
"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).
|
||||
|
||||
Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
|
||||
(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
|
||||
seed company in Rushville, Indiana, serving the Eastern Corn Belt
|
||||
(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
|
||||
that is out of scope for the row-crop advisor).
|
||||
|
||||
Discovery is by **sitemap**, NOT the WP REST API: the catalog custom
|
||||
post types (corn-hybrids / soybeans / wheat) are NOT exposed to
|
||||
``/wp-json/`` (every variety route returns ``rest_no_route``). Instead we
|
||||
fetch ``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
|
||||
per-crop child sitemaps:
|
||||
|
||||
- ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/`` (~52 URLs)
|
||||
- ``/soybeans-sitemap.xml`` -> ``/soybeans/<slug>/`` (~22 URLs)
|
||||
- ``/wheat-sitemap.xml`` -> ``/wheat/<slug>/`` (~4 URLs)
|
||||
|
||||
robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
|
||||
``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
|
||||
no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
|
||||
between requests.
|
||||
|
||||
Detail-page DOM (server-rendered, no JS needed for the text):
|
||||
* Product name: the second ``<h1>`` inside ``article.content`` (the
|
||||
first is the site logo "1st Choice Seeds").
|
||||
* Corn — three ``<h2>`` sections + a side table:
|
||||
- "Hybrid Characteristics": a single ``<p>`` of ``label • value``
|
||||
lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
|
||||
Placement, Root Rating, Stalk Rating, Foliar Health, Drydown,
|
||||
Ear Length/Girth/Flex, Test Weight). Some hybrids only publish
|
||||
Seedling Vigor (genuinely thin pages — still written).
|
||||
- "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
|
||||
(the numeric 0-10 bars are drawn client-side by d3 and are NOT
|
||||
in the HTML). The legend IS the scale: 0-4 Below Average … 9-10
|
||||
Superior, so higher = better.
|
||||
- "Management Tips": ``label: value`` lines (Corn-On-Corn,
|
||||
Productivity / soil guidance, Silage Rating).
|
||||
- A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
|
||||
the Low/Medium/High recommended planting populations.
|
||||
* Soybeans — three ``<h2>`` sections:
|
||||
- "Field Notes": a ``<ul>`` of strengths (often includes SCN
|
||||
source / PRR gene call-outs).
|
||||
- "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
|
||||
- "Variety Description": ``div`` blocks of ``<b>Label:</b> value``
|
||||
pairs (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower
|
||||
Color, Pubescence, Pod, Hilum).
|
||||
* Wheat — thin (title + date only; wheat is private-label). We still
|
||||
write an identity record so the variety is discoverable.
|
||||
|
||||
Rating scale: the published legend is **0-10, higher = better**
|
||||
("Below Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8,
|
||||
Superior 9-10"). 1st Choice publishes the *qualitative* word
|
||||
(Excellent / Very Good / …) in the HTML — those map directly onto that
|
||||
legend — while the numeric bar is d3-rendered and absent from the
|
||||
markup. NA / blank = not rated.
|
||||
|
||||
Output:
|
||||
corpus/first_choice/<source_key>.md
|
||||
corpus/first_choice/<source_key>.json
|
||||
|
||||
source_key: ``firstchoice-<slug>`` lowercased, e.g.
|
||||
``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.
|
||||
|
||||
CLI:
|
||||
python -m scrape.sources.first_choice --crop corn --limit 5
|
||||
python -m scrape.sources.first_choice --force
|
||||
python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag
|
||||
|
||||
# Scraper version stamp and the descriptive User-Agent sent on every request.
SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root and the sitemap index published under it.
BASE = "https://www.1stchoiceseeds.com"
SITEMAP_INDEX = f"{BASE}/sitemap.xml"

# Chunker crop value -> file name of that crop's child sitemap. The chunker
# keys on "soybeans" (plural) for the MG branch, so map accordingly. The
# cover-crops sitemap is intentionally omitted (out of scope for the
# row-crop advisor).
CROP_SITEMAPS = {
    "corn": "corn-hybrids-sitemap.xml",
    "soybeans": "soybeans-sitemap.xml",
    "wheat": "wheat-sitemap.xml",
}

# URL path prefix that confirms a sitemap entry is a variety detail page
# (vs. a category/archive page that can sneak into a child sitemap).
CROP_PATH = {
    "corn": "/corn-hybrids/",
    "soybeans": "/soybeans/",
    "wheat": "/wheat/",
}

# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple min.
REQ_INTERVAL_SEC = 1.2

# Human-readable description of the published rating legend.
RATING_SCALE_DIRECTION = (
    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
    "publishes the qualitative word in HTML (the numeric bar is "
    "d3-rendered, not in markup); blank/NA = not rated"
)
|
||||
|
||||
# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
# Entries are lower-case labels.
_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}

# Trait-suffix -> human label, derived from the slug tail. Best-effort;
# an unmapped suffix is title-cased verbatim so nothing is dropped.
# Keys are lower-case suffixes.
TRAIT_LABELS = {
    # corn
    "vt2p": "VT Double PRO (VT2P)",
    "gt": "Glyphosate Tolerant (GT)",
    "c": "Conventional",
    "pc": "PowerCore (PC)",
    "tre": "Trecepta (TRE)",
    "ss": "SmartStax (SS)",
    "v": "VT (V)",
    "dv": "Double VT (DV)",
    "aa": "Agrisure Artesian (AA)",
    # soybeans
    "en": "Enlist E3 (EN)",
    "xf": "XtendFlex (XF)",
    "sts": "STS",
    # wheat
    "b": "Bin-run / branded (B)",
    "s": "Soft (S)",
}
|
||||
|
||||
# Two directory levels above this file's own directory. NOTE(review): given
# the ``python -m scrape.sources.first_choice`` invocation this should be the
# repository root — confirm against the checkout layout.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus location: the CORPUS_ROOT environment variable wins when set and
# non-empty; otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Output directory for this source (see "Output" in the module docstring).
CORPUS_DIR = CORPUS_ROOT / "first_choice"

log = logging.getLogger("scrape.first_choice")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- HTTP
|
||||
|
||||
|
||||
class RateLimitedSession:
    """Polite ``requests`` session with request spacing and retry/backoff.

    The 1st Choice row-crop catalog is small (~78 detail pages + 4 sitemaps)
    so 1.2 s/req still finishes in a couple minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the session and set its User-Agent.

        ``interval`` is the minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the previous request.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the previous
        request, then stamp the current time."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors and HTTP 429 / 5xx.

        Makes up to ``max_retries`` attempts. Between attempts it sleeps for
        the ``Retry-After`` value (when the header is a plain integer) or an
        exponential backoff capped at 30 s with jitter. No backoff sleep is
        taken after the final attempt, because nothing follows it.

        Returns the first non-retryable response, or the last retryable
        response once attempts are exhausted.

        Raises:
            ValueError: if ``max_retries`` is less than 1.
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(max_retries):
            final = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if final:
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so any earlier network error is stale and
            # must not be re-raised in place of this more recent outcome.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, max_retries)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- model
|
||||
|
||||
|
||||
@dataclass
class FCVariety:
    """One 1st Choice Seeds variety: identity, maturity, traits and the
    spec groups parsed from its detail page."""

    source_key: str                      # "firstchoice-<slug>"; also the output file stem
    source_url: str                      # detail-page URL taken from the crop sitemap
    crop: str                            # chunker value: corn / soybeans / wheat
    product_name: str = ""               # "FC 8455 VT2P"
    relative_maturity: int | None = None  # corn (days)
    maturity_group: float | None = None   # soy
    wheat_class: str | None = None        # wheat
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    strengths: list[str] = field(default_factory=list)
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    sitemap_last_modified: str | None = None  # <lastmod> text from the sitemap, if any
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- discovery (sitemaps)
|
||||
|
||||
|
||||
# Body of a <loc> element; tolerates surrounding whitespace and an optional
# CDATA wrapper.
_LOC_RE = re.compile(r"<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</loc>",
                     re.IGNORECASE | re.DOTALL)
# One <url>…</url> sitemap entry (captures the inner XML).
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
# Body of a <lastmod> element, with the same whitespace/CDATA tolerance.
_LASTMOD_RE = re.compile(r"<lastmod>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</lastmod>",
                         re.IGNORECASE | re.DOTALL)
|
||||
|
||||
|
||||
def _slug_from_url(url: str) -> str:
|
||||
return url.rstrip("/").rsplit("/", 1)[-1].lower()
|
||||
|
||||
|
||||
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Enumerate in-scope varieties from the per-crop child sitemaps.

    Returns a list of ``{crop, url, slug, lastmod}`` dicts.  Each child
    sitemap named in ``CROP_SITEMAPS`` is fetched directly; the sitemap
    index is read only to warn when a child is no longer listed there.
    A child sitemap answering 404 is skipped with a warning; any other
    HTTP error status raises via ``raise_for_status``.  When *only_crop*
    is given, only that crop's sitemap is walked.
    """
    # Index lookup is purely advisory: failure to read it is logged and ignored.
    listed: set[str] = set()
    try:
        index_resp = http.get(SITEMAP_INDEX)
        index_resp.raise_for_status()
        listed = {loc.strip() for loc in _LOC_RE.findall(index_resp.text)}
    except requests.RequestException as exc:
        log.warning("could not read sitemap index %s: %s (continuing with "
                    "known child sitemap names)", SITEMAP_INDEX, exc)

    records: list[dict] = []
    for crop, child in CROP_SITEMAPS.items():
        if only_crop and crop != only_crop:
            continue
        child_url = f"{BASE}/{child}"
        if listed and child_url not in listed:
            log.warning("crop sitemap %s not listed in the index — site may "
                        "have renamed it; trying anyway", child_url)
        resp = http.get(child_url)
        if resp.status_code == 404:
            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
            continue
        resp.raise_for_status()

        prefix = CROP_PATH[crop]
        slugs_seen: set[str] = set()
        before = len(records)
        for entry in _URL_BLOCK_RE.findall(resp.text):
            loc_match = _LOC_RE.search(entry)
            if loc_match is None:
                continue
            url = loc_match.group(1).strip()
            if prefix not in url:
                # category/archive page leaked into the sitemap
                continue
            slug = _slug_from_url(url)
            if not slug or slug in slugs_seen:
                continue
            slugs_seen.add(slug)
            lastmod_match = _LASTMOD_RE.search(entry)
            if lastmod_match:
                lastmod = lastmod_match.group(1).strip()
            else:
                lastmod = None
            records.append({
                "crop": crop,
                "url": url,
                "slug": slug,
                "lastmod": lastmod,
            })
        log.info("crop sitemap %-22s (%s): %d varieties",
                 child, crop, len(records) - before)

    log.info("total varieties discovered: %d", len(records))
    return records
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- detail parse
|
||||
|
||||
|
||||
def _clean(s: str) -> str:
|
||||
return re.sub(r"\s+", " ", s or "").strip()
|
||||
|
||||
|
||||
def _direct_text(el: Tag) -> str:
    """Return the cleaned text of *el*'s own string children, leaving out
    any text nested inside child tags."""
    own_strings = [child for child in el.children
                   if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))
|
||||
|
||||
|
||||
def _br_lines(el: Tag) -> list[str]:
    """Return the stripped, non-blank text lines of *el*.

    ``get_text`` is called with a newline separator, which places a line
    break between every pair of descendant text nodes — including the two
    sides of a ``<br>``.  The element is therefore only read, never
    modified, so other parsers walking the same tree still see the
    original ``<br>`` tags.
    """
    # No <br> replacement is needed: the newline separator already splits
    # the text at every tag boundary, and blank pieces are filtered out.
    return [ln.strip() for ln in el.get_text("\n").split("\n") if ln.strip()]
|
||||
|
||||
|
||||
def _product_name(article: Tag, slug: str) -> str:
    """Return the first non-empty ``<h1>`` text that is not the site logo
    ("1st Choice Seeds", compared case-insensitively).

    When no such heading exists the name falls back to the slug,
    upper-cased with dashes turned into spaces.
    """
    for heading in article.find_all("h1"):
        text = _clean(heading.get_text(" ", strip=True))
        if not text or text.lower() == "1st choice seeds":
            continue
        return text
    return slug.upper().replace("-", " ")
|
||||
|
||||
|
||||
def _trait_stack(slug: str, crop: str) -> list[str]:
    """Derive trait labels from the dash-separated tokens of *slug*.

    Example from the catalog: ``fc-8455-vt2p`` -> VT2P and
    ``fb-3545-c-sts`` -> Conventional + STS.  Every token is lowercased
    and looked up in ``TRAIT_LABELS``; tokens that are not keys there
    (model identifiers such as ``fc`` / ``8455`` / ``20rw36``) are simply
    ignored.  Labels are returned in left-to-right slug order without
    duplicates.  *crop* is accepted but not consulted.
    """
    tokens = (part.lower() for part in slug.split("-"))
    matched = [TRAIT_LABELS[tok] for tok in tokens if tok in TRAIT_LABELS]
    ordered: list[str] = []
    for label in matched:
        # Keep only the first occurrence of each label.
        if label not in ordered:
            ordered.append(label)
    return ordered
|
||||
|
||||
|
||||
def _parse_corn(article: Tag, v: FCVariety) -> None:
    """Fill *v* with corn ratings parsed from *article*.

    Three page regions are read: the "Hybrid Characteristics" paragraph,
    the "Management Tips" paragraph, and every table row (Relative
    Maturity / Degree Days / planting-population rows).  The results are
    appended to ``v.groups`` as up to three groups, and
    ``v.relative_maturity`` is set when an integer is found.
    """
    agronomic: list[dict] = []
    disease: list[dict] = []
    management: list[dict] = []

    def section_paragraph(title: str) -> Tag | None:
        """Return the element right after the first <h2> titled *title*,
        provided that element is a <p>; otherwise None."""
        for h2 in article.find_all("h2"):
            if _clean(h2.get_text()) == title:
                following = h2.find_next_sibling()
                if following is not None and following.name == "p":
                    return following
                return None
        return None

    # Hybrid Characteristics: a <p> of "label • value" lines (a colon is
    # accepted as a fallback separator; a bare line becomes a label only).
    para = section_paragraph("Hybrid Characteristics")
    if para is not None:
        for line in _br_lines(para):
            if "•" in line:
                label, _, value = line.partition("•")
            elif ":" in line:
                label, _, value = line.partition(":")
            else:
                label, value = line, ""
            label = _clean(label)
            value = _clean(value)
            if not label:
                continue
            entry = {"characteristic": label, "value": value}
            bucket = disease if label.lower() in _CORN_DISEASE_LABELS else agronomic
            bucket.append(entry)

    # Management Tips: "label: value" lines (Corn-On-Corn / Productivity /
    # Silage Rating).  Footer noise (address / © line) is filtered out.
    para = section_paragraph("Management Tips")
    if para is not None:
        for line in _br_lines(para):
            if ":" not in line:
                continue
            label, _, value = line.partition(":")
            label = _clean(label)
            value = _clean(value)
            if not (label and value):
                continue
            if label.startswith("©") or "rights reserved" in line.lower():
                continue
            management.append({"characteristic": label, "value": value})

    # Side table: Relative Maturity / Degree Days + planting populations.
    population_rows: list[str] = []
    for table in article.find_all("table"):
        for row in table.find_all("tr"):
            texts = (_clean(cell.get_text(" ", strip=True))
                     for cell in row.find_all(["td", "th"]))
            cells = [text for text in texts if text]
            if not cells:
                continue
            first = cells[0].lower()
            has_value = len(cells) >= 2
            if first.startswith("relative maturity") and has_value:
                digits = re.search(r"(\d+)", cells[1])
                if digits:
                    v.relative_maturity = int(digits.group(1))
                # NOTE(review): the row is recorded even when no integer
                # could be parsed from it — confirm this matches the
                # intended nesting.
                agronomic.insert(0, {"characteristic": "Relative Maturity",
                                     "value": cells[1]})
            elif first.startswith("degree days") and has_value:
                agronomic.append({"characteristic": "Degree Days (GDU)",
                                  "value": cells[1]})
            else:
                joined = " ".join(cells).lower()
                if joined.startswith("low") and ("medium" in joined or "high" in joined):
                    population_rows.append(" / ".join(cells))
    if population_rows:
        management.append({"characteristic": "Recommended Planting Population",
                           "value": "; ".join(population_rows)})

    # Emit only the non-empty groups, in a fixed order.
    for group_label, items in (("AGRONOMIC CHARACTERISTICS", agronomic),
                               ("DISEASE RATINGS", disease),
                               ("MANAGEMENT", management)):
        if items:
            v.groups.append({"label": group_label, "items": items})
|
||||
|
||||
|
||||
def _parse_soy(article: Tag, v: FCVariety) -> None:
    """Fill *v* with the soybean maturity group, agronomic descriptors and
    field-note strengths parsed from *article*."""

    def heading(title: str) -> Tag | None:
        """First <h2> whose cleaned text equals *title*, else None."""
        for h2 in article.find_all("h2"):
            if _clean(h2.get_text()) == title:
                return h2
        return None

    # Field Notes -> strengths (and positioning from the first one).
    notes_heading = heading("Field Notes")
    if notes_heading is not None:
        bullet_list = notes_heading.find_next_sibling()
        if bullet_list is not None and bullet_list.name == "ul":
            cleaned = (_clean(li.get_text(" ", strip=True))
                       for li in bullet_list.find_all("li"))
            v.strengths = [note for note in cleaned if note]
            if v.strengths and not v.positioning:
                v.positioning = v.strengths[0]

    # Variety Description -> [{characteristic, value}] from <b>Label:</b> value.
    agronomic: list[dict] = []
    desc_heading = heading("Variety Description")
    if desc_heading is not None:
        stop_classes = ("btn", "right-bar", "right-navigation",
                        "address", "wrapper")
        for node in desc_heading.find_all_next():
            if node.name == "h2" and node is not desc_heading:
                break
            if not isinstance(node, Tag) or node.name != "div":
                # Only <div> rows can carry a spec line or end the section.
                continue
            # Stop at the action buttons / right-nav / footer region.
            classes = node.get("class") or []
            if any(name in classes for name in stop_classes):
                break
            bold = node.find("b", recursive=False)
            if bold is None:
                continue
            key = _clean(bold.get_text(" ", strip=True)).rstrip(":")
            value = _direct_text(node)
            if not key:
                continue
            if key.lower() != "maturity":
                agronomic.append({"characteristic": key, "value": value})
                continue
            number = re.search(r"[\d.]+", value)
            if number is not None:
                try:
                    v.maturity_group = float(number.group(0))
                except ValueError:
                    # e.g. a lone "." — leave maturity_group unset.
                    pass
            agronomic.append({"characteristic": "Maturity Group", "value": value})

    if agronomic:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": agronomic})
|
||||
|
||||
|
||||
def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
    """Fetch the detail page of one discovered record and parse it.

    *rec* is a ``{crop, url, slug, lastmod}`` dict.  The page is fetched
    (an HTTP error status raises via ``raise_for_status``), the product
    name is read from ``<article class="content">`` (or the whole document
    when that element is missing), and the crop-specific parser fills in
    the ratings.  Wheat pages are thin, so wheat gets identity fields only.
    """
    slug = rec["slug"]
    crop = rec["crop"]
    page_url = rec["url"]

    variety = FCVariety(
        source_key=f"firstchoice-{slug}",
        source_url=page_url,
        crop=crop,
        trait_stack=_trait_stack(slug, crop),
        sitemap_last_modified=rec.get("lastmod"),
    )

    response = http.get(page_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    article = soup.find("article", class_="content") or soup
    variety.product_name = _product_name(article, slug)

    if crop == "corn":
        _parse_corn(article, variety)
    elif crop == "soybeans":
        _parse_soy(article, variety)
    # wheat: thin pages — identity only (no spec sections to parse).
    return variety
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- render
|
||||
|
||||
|
||||
def render_markdown(v: FCVariety) -> str:
    """Render *v* as the markdown body written to the corpus.

    Layout: title, a bullet list of identity facts, the optional
    positioning line and Field Notes list, then one ``##`` section per
    characteristics group.  A wheat record without groups gets an
    explanatory note instead.  Lines are joined with ``"\\n"``.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    out: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** 1st Choice Seeds (independent, employee-owned)",
        "- **Brand:** 1st Choice Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # At most one crop-specific maturity/class bullet applies.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            out.append(f"- **Relative maturity:** {v.relative_maturity} day")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            out.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_class:
            out.append(f"- **Wheat class:** {v.wheat_class}")

    if v.trait_stack:
        out.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    out.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** 1st Choice Seeds dealer network — "
        "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN",
        "",
    ])

    if v.positioning:
        out.extend(["---", "", f"_{v.positioning}_", ""])
    if v.strengths:
        out.extend(["---", "", "## Field Notes", ""])
        out.extend(f"- {strength}" for strength in v.strengths)
        out.append("")
    out.extend(["---", ""])

    for group in v.groups:
        out.extend([f"## {group['label'].title()}", ""])
        for item in group["items"]:
            name = item["characteristic"]
            shown = item["value"] or "—"  # em dash stands in for an empty value
            out.append(f"- **{name}:** {shown}")
        out.append("")

    if not v.groups and v.crop == "wheat":
        out.extend(["_Identity record only — 1st Choice wheat is private-label "
                    "and the catalog page carries no agronomic spec block._", ""])
    return "\n".join(out)
|
||||
|
||||
|
||||
def write_variety(v: FCVariety, body_md: str) -> None:
    """Write the markdown body and the JSON sidecar for *v* into ``CORPUS_DIR``.

    Files are named ``<source_key>.md`` and ``<source_key>.json`` (UTF-8).
    The sidecar is pretty-printed with non-ASCII characters kept as-is and
    ends with a newline; ``fetched_at`` is the current UTC time.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "1st Choice Seeds dealer network "
                             "(Eastern Corn Belt — IN/OH/KY/TN)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    fetched_at = datetime.now(timezone.utc).isoformat()
    # Key order is preserved in the emitted JSON.
    sidecar = {
        "source": "first_choice",
        "source_key": v.source_key,
        "vendor": "1st Choice Seeds",
        "brand": "1st Choice Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": v.wheat_class,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": v.strengths,
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": fetched_at,
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path = CORPUS_DIR / f"{v.source_key}.json"
    json_path.write_text(payload, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- pipeline
|
||||
|
||||
|
||||
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover, fetch, parse and write 1st Choice varieties.

    Args:
        limit: stop after this many discovered records have been visited
            (skipped and failed records count); ``None`` means all.
        force: re-fetch even when the markdown file already exists.
        only_crop: restrict discovery to a single crop.
        only_product: keep only the record whose slug or
            ``firstchoice-<slug>`` source key equals this value
            (case-insensitive on the argument).

    Returns:
        0 on completion, 2 when ``only_product`` matched no record.

    A detail page that cannot be fetched — whether it answers with an
    HTTP error status or the session gives up after network errors — is
    counted as failed and the run continues with the next record.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"firstchoice-{r['slug']}" == key or r["slug"] == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    total = len(records)
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"firstchoice-{rec['slug']}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, rec)
        except requests.RequestException as exc:
            # RequestException covers HTTPError from raise_for_status() as
            # well as the connection/timeout errors the session re-raises
            # once its retries are exhausted; one bad page must not abort
            # the whole scrape.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
                        processed, total, source_key,
                        "; thin wheat page" if v.crop == "wheat" else "")
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks so a legitimate 0 / 0.0 maturity is still shown.
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 maturity if maturity is not None else "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], total)
    return 0
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- CLI
|
||||
|
||||
|
||||
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser: --limit, --force, --crop, --product
    and --log-level (default taken from the LOG_LEVEL environment
    variable, else "INFO")."""
    description = ("Scrape 1st Choice Seeds (independent, employee-owned — "
                   "Rushville, IN) — corn / soybeans / wheat via sitemaps "
                   "+ detail pages.")
    parser = argparse.ArgumentParser(
        prog="scrape.sources.first_choice",
        description=description,
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_SITEMAPS),
        help="Limit to one crop (corn / soybeans / wheat).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.")
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, configure logging to stderr and
    return the exit code produced by ``run``."""
    parser = _build_argparser()
    args = parser.parse_args(argv)
    level = args.log_level.upper()
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=level,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,594 @@
|
||||
"""Latham Hi-Tech Seeds scraper — independent family-owned brand (Alexander, IA).
|
||||
|
||||
Source: ``www.lathamseeds.com`` — WordPress site exposing a public,
|
||||
no-auth REST API. robots.txt is permissive (only ``/wp-admin/``
|
||||
disallowed; the catalog + ``/wp-json/`` are open, no Crawl-delay).
|
||||
Independent Upper-Midwest seed company (the self-styled "Latham
|
||||
Country" — IA / MN / WI / IL / ND / SD / NE); corn + soybeans only
|
||||
(an Alfalfa crop term exists in the taxonomy but has zero published
|
||||
varieties — no wheat).
|
||||
|
||||
Two-step ingestion (mirrors the ProHarvest scraper):
|
||||
|
||||
1. **Enumerate** via the WP REST API. ``/wp/v2/varieties`` is the
|
||||
variety custom-post-type (~265 records, ``X-WP-Total: 265``).
|
||||
``/wp/v2/variety_crop`` is the crop taxonomy (Corn=2013,
|
||||
Soybean=2029, Alfalfa=2159/empty); ``/wp/v2/variety_trait`` is the
|
||||
trait taxonomy (Enlist E3, VT2 PRO RIB, Smart Stax, XtendFlex, …).
|
||||
The REST payload gives the canonical id / slug / title / permalink
|
||||
and taxonomy term IDs, plus a human-readable ``class_list`` (e.g.
|
||||
``variety_crop-soybean``, ``variety_trait-enlist-e3``). ``acf`` is
|
||||
``[]`` and ``content.rendered`` is EMPTY in REST, so the ratings
|
||||
have to come from the detail page.
|
||||
|
||||
2. **Parse the detail page.** Each ``/products/<slug>/`` page
|
||||
server-renders the agronomic data as ``<h3>`` spec sections, each a
|
||||
run of ``<li><span>label</span><span>value</span></li>`` rows up to
|
||||
the next section header:
|
||||
- Corn: "Agronomic Characteristics" (Early Vigor / Stalk Strength
|
||||
/ Root Strength / Stay Green / Drydown / Test Weight / Drought
|
||||
Tolerance / Foliar Fungicide / Corn-on-Corn), "Plant
|
||||
Characteristics" (Ear Height / Ear Type), "Disease Ratings"
|
||||
(Goss's Wilt / Northern Leaf Blight / Anthracnose Stalk Rot /
|
||||
Gray Leaf Spot / Tar Spot, etc.).
|
||||
- Soybean: "Plant Characteristics" (Relative Maturity / Emergence
|
||||
/ Plant Height / Plant Type / Flower Color / Pubescence / Pod
|
||||
Color / Hilum Color), "Defensive Characteristics & Disease
|
||||
Ratings" (SCN Resistance source / Iron Chlorosis / Stress
|
||||
Tolerance / Phytophthora Rps gene / Brown Stem Rot / White Mold
|
||||
/ Sudden Death). "Herbicide Tolerance" + "Placement" sections
|
||||
are present but carry no ``<li>`` rows.
|
||||
The relative maturity also sits in a "Key Features" ``Maturity``
|
||||
row ("113.00 RM" / "3.60 RM"); we read RM/MG from the per-crop
|
||||
spec section first and fall back to that.
|
||||
|
||||
Rating scale: **numeric, LOWER = BETTER** (1 = best / most
|
||||
tolerant / most resistant). No explicit on-page legend, so the
|
||||
direction was confirmed by cross-referencing the Product Overview
|
||||
prose against the published values across ~12 corn varieties:
|
||||
hybrids described "very good / superior / excellent stalks and roots"
|
||||
carry Stalk/Root Strength 1.0–1.5, weaker traits run 3.0–3.5, and no
|
||||
value approaches 9 (observed range ~1.0–3.5). The soybean disease
|
||||
panel (Iron Chlorosis / Brown Stem Rot / White Mold / Sudden Death /
|
||||
Stress Tolerance) reads the same direction (lower = more resistant).
|
||||
A handful of values are categorical rather than numeric and pass
|
||||
through verbatim: SCN Resistance source ("PI 88788"), Phytophthora
|
||||
"Rps 1k", Anthracnose "ASR", plant descriptors ("Medium Tall",
|
||||
"Flex"). ``NA`` / blank = not rated.
|
||||
|
||||
Unlike the Ebbert's scraper (which left ``characteristics_groups``
|
||||
empty and relied on a verbatim body), we parse the spec sections into
|
||||
structured ``characteristics_groups`` so the numeric + categorical
|
||||
ratings land in the embedded chunk and are actually retrievable. The
|
||||
soybean "Defensive Characteristics & Disease Ratings" section maps to
|
||||
the DISEASE RATINGS bucket; corn "Agronomic Characteristics" +
|
||||
"Plant Characteristics" map to AGRONOMIC CHARACTERISTICS.
|
||||
|
||||
Output:
|
||||
corpus/latham/<source_key>.md
|
||||
corpus/latham/<source_key>.json
|
||||
|
||||
source_key: ``latham-<slug>`` lowercased, e.g. ``latham-l-3632-e3``.
|
||||
|
||||
CLI:
|
||||
python -m scrape.sources.latham --crop corn --limit 5
|
||||
python -m scrape.sources.latham --force
|
||||
python -m scrape.sources.latham --product latham-l-3632-e3
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
# Written to every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site origin and the root of its WordPress REST API (v2).
BASE = "https://www.lathamseeds.com"
WP = f"{BASE}/wp-json/wp/v2"

# variety_crop taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. "alfalfa"
# is in the taxonomy but has zero published varieties; everything not
# listed here is out of scope for the row-crop advisor. (No wheat.)
CROP_TYPES = {
    "corn": "corn",
    "soybean": "soybeans",
}

# robots.txt declares no Crawl-delay and only blocks /wp-admin/; we
# stay polite. ~265 detail pages at 1.5 s/req finishes in ~7 min.
REQ_INTERVAL_SEC = 1.5

# Human-readable description of the rating scale and its direction.
RATING_SCALE_DIRECTION = (
    "numeric ratings ~1-9 where LOWER = BETTER (1 = best / most "
    "tolerant / most resistant); confirmed by cross-referencing "
    "Product Overview prose vs values (top-rated stalks/roots cluster "
    "1.0-1.5, weak traits 3.0-3.5, none approach 9). Categorical "
    "values pass through verbatim (SCN source 'PI 88788', "
    "Phytophthora 'Rps 1k', Anthracnose 'ASR', 'Medium Tall', 'Flex'). "
    "NA/blank = not rated."
)

# Detail-page spec section headers (<h3>) -> characteristics_groups
# label. DISEASE RATINGS -> disease framing, AGRONOMIC CHARACTERISTICS
# -> agronomic framing in the chunker; anything else passes through as
# its own titled section. Both corn and soy headers are covered. The
# soybean "Defensive Characteristics & Disease Ratings" panel mixes
# disease 1-9 ratings with categorical resistance source/genes — we
# bucket the whole panel as DISEASE so it embeds under disease framing.
# Keys are the lowercased header text (both "&" and "and" spellings).
SPEC_SECTIONS = {
    "agronomic characteristics": "AGRONOMIC CHARACTERISTICS",
    "plant characteristics": "AGRONOMIC CHARACTERISTICS",
    "disease ratings": "DISEASE RATINGS",
    "defensive characteristics & disease ratings": "DISEASE RATINGS",
    "defensive characteristics and disease ratings": "DISEASE RATINGS",
}

# parents[2] is the directory two levels above this file's own directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus root: the CORPUS_ROOT environment variable when set to a
# non-empty value, otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Output directory for this scraper (corpus/latham/<source_key>.md / .json).
CORPUS_DIR = CORPUS_ROOT / "latham"

# Module-level logger for this scraper.
log = logging.getLogger("scrape.latham")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- HTTP
|
||||
|
||||
|
||||
class RateLimitedSession:
    """Polite ``requests`` session: minimum request spacing plus retry/backoff.

    Latham's catalog is ~265 detail pages so 1.5 s/req finishes the full
    scrape in ~7 min.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session; *interval* is the minimum number
        of seconds between the starts of two consecutive requests."""
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # time.monotonic() stamp of the previous request

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors, HTTP 429 and HTTP 5xx.

        Between attempts the call sleeps for the integer ``Retry-After``
        header when the server sent one, otherwise for a jittered
        exponential backoff capped at 30 s.  No sleep happens after the
        final attempt.

        Returns the first response that is not 429/5xx, or the last
        429/5xx response once the attempts are used up (callers are
        expected to check the status themselves).

        Raises the ``requests.RequestException`` of the final attempt when
        that attempt failed at the network level, and ``ValueError`` when
        ``max_retries`` is smaller than 1 (previously an unbound-variable
        error).
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(max_retries):
            final = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if final:
                    # Nothing left to retry: do not waste a backoff sleep.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, max_retries)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so any earlier network error is stale and
            # must not mask this (more recent) outcome.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, max_retries)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        # Invariant (not input validation): at least one attempt ran and the
        # last one produced a response.
        assert resp is not None
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET *url*, raise on an HTTP error status, and return the decoded JSON body."""
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- model
|
||||
|
||||
|
||||
@dataclass
class LathamVariety:
    """One Latham variety: identity, maturity, traits and the spec groups
    parsed from its detail page."""

    source_key: str
    source_url: str
    crop: str                            # chunker value: corn / soybeans
    product_name: str = ""               # "L 3632 E3"
    relative_maturity: int | None = None  # corn (days)
    maturity_group: float | None = None   # soy
    release_year: str | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- discovery (REST)
|
||||
|
||||
|
||||
def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Return ``{term_id: display_name}`` for a WP taxonomy, following pages.

    Pages of 100 terms are requested until a page is short, empty, or the
    API answers 400 (requested page is past the last one).  The display
    name is the term's ``name``, else its ``slug``, else the id as a string.
    """
    names: dict[int, str] = {}
    page = 1
    while True:
        resp = http.get(
            f"{WP}/{taxonomy}?per_page=100&page={page}&_fields=id,name,slug")
        if resp.status_code == 400:  # past last page
            break
        resp.raise_for_status()
        terms = resp.json()
        if not terms:
            break
        for term in terms:
            term_id = term["id"]
            names[term_id] = term.get("name") or term.get("slug") or str(term_id)
        if len(terms) < 100:
            # A short page is the last page.
            break
        page += 1
    return names
|
||||
|
||||
|
||||
def _crop_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``{slug: term_id}`` for the ``variety_crop`` taxonomy
    (a single request for up to 100 terms)."""
    terms = http.get_json(f"{WP}/variety_crop?per_page=100&_fields=id,slug")
    return {term["slug"]: term["id"] for term in terms}
|
||||
|
||||
|
||||
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return the REST variety records of the in-scope row crops.

    For each crop in ``CROP_TYPES`` (or just *only_crop*), the varieties
    endpoint is paged 100 records at a time, filtered by the crop's
    taxonomy term id.  Every record is tagged with ``"_crop"`` (the
    chunker crop value) and a variety id is kept only the first time it
    is seen.  Crops missing from the taxonomy are skipped with a warning.
    """
    crop_ids = _crop_slug_to_id(http)
    found: list[dict] = []
    seen_ids: set[int] = set()
    wanted = [(slug, crop) for slug, crop in CROP_TYPES.items()
              if not only_crop or crop == only_crop]
    for crop_slug, crop in wanted:
        term_id = crop_ids.get(crop_slug)
        if term_id is None:
            log.warning("variety_crop %r not found in taxonomy — skipping", crop_slug)
            continue
        page = 1
        while True:
            resp = http.get(
                f"{WP}/varieties?variety_crop={term_id}&per_page=100&page={page}"
                "&_fields=id,slug,title,link,variety_trait,variety_year")
            if resp.status_code == 400:
                # Requested page is past the last one.
                break
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            for variety in batch:
                variety_id = variety["id"]
                if variety_id not in seen_ids:
                    seen_ids.add(variety_id)
                    variety["_crop"] = crop
                    found.append(variety)
            if len(batch) < 100:
                break
            page += 1
        log.info("variety_crop %-8s (%s): cumulative %d", crop_slug, crop, len(found))
    return found
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- detail parse
|
||||
|
||||
|
||||
# First run of digits with an optional decimal fraction, e.g. the "113.00"
# in "113.00 RM" or the "3.60" in "3.60 RM".
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
|
||||
|
||||
|
||||
def _clean(s: str) -> str:
|
||||
return re.sub(r"\s+", " ", s or "").strip()
|
||||
|
||||
|
||||
def _two_span(li: Tag) -> tuple[str, str] | None:
    """Return ``(label, value)`` when *li* contains exactly two ``<span>``
    descendants and both have non-empty cleaned text; otherwise None."""
    texts = [_clean(span.get_text(" ", strip=True)) for span in li.find_all("span")]
    if len(texts) != 2:
        return None
    label, value = texts
    if not (label and value):
        return None
    return label, value
|
||||
|
||||
|
||||
def _section_rows(header: Tag) -> list[tuple[str, str]]:
    """Gather the (label, value) rows that belong to one section header.

    Scans forward from ``header`` in document order and stops at the
    next h2/h3; every <li> on the way that ``_two_span`` accepts
    contributes one row.
    """
    collected: list[tuple[str, str]] = []
    for node in header.find_all_next():
        tag_name = node.name
        if node is not header and tag_name in ("h2", "h3"):
            break  # the next section starts here
        if not isinstance(node, Tag) or tag_name != "li":
            continue
        row = _two_span(node)
        if row:
            collected.append(row)
    return collected
|
||||
|
||||
|
||||
def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Build ``[{label, items: [{characteristic, value}]}]`` from the spec headings.

    Only h2/h3 headings whose lower-cased text is a key of
    ``SPEC_SECTIONS`` are considered, and headings that yield no rows
    are dropped. When several headings map to the same label their
    items are merged into the first group created for that label, so
    downstream consumers see one bucket per label.
    """
    groups: list[dict] = []
    by_label: dict[str, dict] = {}
    for heading in soup.find_all(["h2", "h3"]):
        heading_text = _clean(heading.get_text(" ", strip=True)).lower()
        label = SPEC_SECTIONS.get(heading_text)
        if not label:
            continue
        items = [{"characteristic": name, "value": val}
                 for name, val in _section_rows(heading)]
        if not items:
            continue
        group = by_label.get(label)
        if group is None:
            group = {"label": label, "items": items}
            by_label[label] = group
            groups.append(group)
        else:
            group["items"].extend(items)
    return groups
|
||||
|
||||
|
||||
def _parse_maturity_from_groups(groups: list[dict], crop: str,
                                ) -> tuple[int | None, float | None]:
    """Extract ``(relative_maturity, maturity_group)`` from parsed groups.

    The first item labelled 'Relative Maturity' or 'Maturity'
    (case-insensitive) whose value contains a number wins. Corn gets
    ``(int, None)`` with the number truncated to an integer; any other
    crop gets ``(None, float)``. ``(None, None)`` means nothing matched.
    """
    wanted = ("relative maturity", "maturity")
    for group in groups:
        for item in group["items"]:
            if item["characteristic"].strip().lower() not in wanted:
                continue
            hit = _MATURITY_RE.search(item["value"])
            if not hit:
                continue  # label matched but the value holds no number
            number = float(hit.group(1))
            if crop == "corn":
                return int(number), None
            return None, number
    return None, None
|
||||
|
||||
|
||||
def _parse_maturity_keyfeatures(soup: BeautifulSoup, crop: str,
                                ) -> tuple[int | None, float | None]:
    """Fallback maturity lookup over every <li> of the page.

    Looks for a two-span row labelled 'Maturity' (e.g. '113.00 RM' /
    '3.60 RM') and returns ``(int, None)`` for corn or ``(None, float)``
    otherwise; ``(None, None)`` when no such row carries a number.
    """
    for row in soup.find_all("li"):
        pair = _two_span(row)
        if not pair:
            continue
        label, value = pair
        if label.strip().lower() != "maturity":
            continue
        hit = _MATURITY_RE.search(value)
        if not hit:
            continue
        number = float(hit.group(1))
        if crop == "corn":
            return int(number), None
        return None, number
    return None, None
|
||||
|
||||
|
||||
def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Return the marketing blurb under 'Product Overview' / 'Hybrid Advantages'.

    For each such h2/h3 heading, the first following <p> with at least
    40 characters of cleaned text — found before the next h2/h3 — is
    returned. ``None`` when no heading yields one.
    """
    blurb_headings = ("product overview", "hybrid advantages")
    for heading in soup.find_all(["h2", "h3"]):
        title = _clean(heading.get_text(" ", strip=True)).lower()
        if title not in blurb_headings:
            continue
        for node in heading.find_all_next():
            if node is not heading and node.name in ("h2", "h3"):
                break
            if not (isinstance(node, Tag) and node.name == "p"):
                continue
            text = _clean(node.get_text(" ", strip=True))
            if len(text) >= 40:
                return text
    return None
|
||||
|
||||
|
||||
def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str],
                 year_names: dict[int, str]) -> LathamVariety:
    """Fetch one variety's detail page and assemble a ``LathamVariety``.

    ``rec`` is a record produced by ``discover`` (it carries ``_crop``
    and ``slug``). ``trait_names`` / ``year_names`` translate the term
    ids in ``variety_trait`` / ``variety_year`` into display names; ids
    missing from those maps are ignored. ``raise_for_status`` raises on
    an error response for the page.
    """
    crop = rec["_crop"]
    slug = rec["slug"]
    url = rec.get("link") or f"{BASE}/products/{slug}/"
    rendered_title = (rec.get("title") or {}).get("rendered", "")
    name = _clean(rendered_title) or slug.upper()

    resp = http.get(url)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Remove script/style/noscript elements so their text is never parsed.
    for junk in soup.find_all(["script", "style", "noscript"]):
        junk.decompose()

    groups = _parse_groups(soup)
    rm, mg = _parse_maturity_from_groups(groups, crop)
    if rm is None and mg is None:
        rm, mg = _parse_maturity_keyfeatures(soup, crop)
    positioning = _parse_positioning(soup)

    def _term_names(field_name: str, lookup: dict[int, str]) -> list[str]:
        return [lookup[tid] for tid in (rec.get(field_name) or []) if tid in lookup]

    traits = _term_names("variety_trait", trait_names)
    years = _term_names("variety_year", year_names)
    release_year = years[0] if years else None

    return LathamVariety(
        source_key=f"latham-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        release_year=release_year,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- render
|
||||
|
||||
|
||||
def render_markdown(v: LathamVariety) -> str:
    """Render a variety as the markdown body stored in the corpus.

    Layout: title, an identity bullet list (vendor, brand, crop,
    maturity, traits, source, rating scale, service area), the optional
    positioning blurb in italics, then one ``##`` section per
    characteristics group. An empty characteristic value is shown as an
    em dash.
    """
    fallback_label = v.crop.title()
    crop_label = {"corn": "Corn", "soybeans": "Soybeans"}.get(v.crop, fallback_label)

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Latham Hi-Tech Seeds (independent family-owned, Alexander, IA)",
        "- **Brand:** Latham Hi-Tech Seeds",
        f"- **Crop:** {crop_label}",
    ]
    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Latham dealer network — Upper Midwest "
        "(IA/MN/WI/IL/ND/SD/NE)",
        "",
    ])
    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    lines.extend(["---", ""])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def write_variety(v: LathamVariety, body_md: str) -> None:
    """Write ``<source_key>.md`` and its ``<source_key>.json`` sidecar.

    Both files go to ``CORPUS_DIR`` (created if missing) as UTF-8. The
    sidecar holds the variety's fields plus fixed Latham vendor/brand
    and region values, a UTC ``fetched_at`` timestamp and the scraper
    version.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    md_path.write_text(body_md, encoding="utf-8")

    vendor_name = "Latham Hi-Tech Seeds"
    region = {
        "product_list_name": "Latham dealer network (Upper Midwest — "
                             "IA/MN/WI/IL/ND/SD/NE)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    # Key order is kept stable because it is the order written to disk.
    sidecar = {
        "source": "latham",
        "source_key": v.source_key,
        "vendor": vendor_name,
        "brand": vendor_name,
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [region],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path = CORPUS_DIR / f"{v.source_key}.json"
    json_path.write_text(payload, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- pipeline
|
||||
|
||||
|
||||
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover, fetch, and write Latham varieties; return a process exit code.

    Args:
        limit: Process at most this many discovered records (skipped
            ones count toward the limit); ``None`` means all.
        force: Re-fetch even when the markdown file already exists.
        only_crop: Restrict discovery to one crop value.
        only_product: Restrict to the record whose slug or
            ``latham-<slug>`` source key equals this (case-insensitive).

    Returns:
        ``2`` when ``only_product`` matched nothing, otherwise ``0``.

    A failed detail fetch is logged and counted, and the run moves on to
    the next record. Any ``requests.RequestException`` is handled this
    way — connection errors and timeouts as well as HTTP error statuses
    — so one unreachable page cannot abort the whole scrape.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "variety_trait")
    year_names = _taxonomy_map(http, "variety_year")
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if key in (r["slug"].lower(), f"latham-{r['slug'].lower()}")]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    total = len(records)
    # A non-positive limit processes nothing, as the per-record check did.
    batch = records if limit is None else records[:max(limit, 0)]
    processed = 0
    for processed, rec in enumerate(batch, start=1):
        source_key = f"latham-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names, year_names)
        except requests.RequestException as exc:
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, total, source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks so a legitimate 0 / 0.0 maturity is not shown as "-".
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 maturity if maturity is not None else "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d failed=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["failed"],
             counts["empty"], total)
    return 0
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- CLI
|
||||
|
||||
|
||||
def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Latham scraper.

    Options: ``--limit``, ``--force``, ``--crop`` (restricted to the
    crop values in ``CROP_TYPES``), ``--product`` and ``--log-level``
    (defaulting to the ``LOG_LEVEL`` environment variable, else INFO).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.latham",
        description="Scrape Latham Hi-Tech Seeds (independent Upper-Midwest "
                    "brand) — corn / soybeans via the WP REST API + detail pages.",
    )
    crop_choices = sorted(set(CROP_TYPES.values()))
    default_level = os.environ.get("LOG_LEVEL", "INFO")

    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=crop_choices,
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.")
    parser.add_argument("--log-level", default=default_level)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, set up stderr logging, and run the scrape.

    Returns the exit code produced by ``run``.
    """
    opts = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )
    return run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
    )
|
||||
|
||||
|
||||
# Script entry: exit the interpreter with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -0,0 +1,767 @@
|
||||
"""Stine Seed Company scraper — independent family-owned breeder (Adel, IA).
|
||||
|
||||
Source: ``www.stineseed.com`` — a custom PHP site (NOT WordPress;
|
||||
``/wp-json/`` 404s). robots.txt returns 404 (none published); the
|
||||
``/legal/`` page carries only a standard copyright / no-reproduction
|
||||
clause (no anti-automation term — same posture as the other corpus
|
||||
vendors). ``sitemap.xml`` (~499 URLs) lists every live product page,
|
||||
so it is our canonical enumeration source.
|
||||
|
||||
Stine is the largest privately-owned seed company in the US; it
|
||||
breeds and sells **corn + soybeans** only (no wheat). The catalog is
|
||||
~58 corn hybrids + ~159 soybean varieties.
|
||||
|
||||
Two-step ingestion:
|
||||
|
||||
1. **Enumerate** the current catalog from ``sitemap.xml``. A product
|
||||
*detail* URL has the shape ``/{crop}/traits/{trait-slug}/{code}/``
|
||||
(four path segments); the bare ``/{crop}/traits/{trait-slug}/``
|
||||
landing pages are skipped. This yields exactly the live catalog
|
||||
(58 corn + 159 soy), unlike the comparison ajax endpoint which
|
||||
also returns thousands of discontinued/historical entries.
|
||||
|
||||
Fallback enumeration (``--enumerate ajax``) hits the comparison
|
||||
ajax fragments:
|
||||
- corn: POST ``/ajax/corn-comparison/filter_products.php``
|
||||
- soy: POST ``/ajax/soybean-comparison/filter_products.php``
|
||||
with ``sel1=&sel2=&sel3=`` (empty = all). Each ``<li>`` carries a
|
||||
numeric product id + the canonical detail URL.
|
||||
|
||||
2. **Parse the detail page.** Each ``/{crop}/traits/{slug}/{code}/``
|
||||
page server-renders all agronomic data (no JS needed) as
|
||||
``<section class="agronomic-details">`` →
|
||||
``<ul class="agronomy-chart"> <li> <strong>label</strong>
|
||||
<span class="value">value</span> </li> …``. The variety code +
|
||||
brand mark live in the ``<h1>`` (``Stine ® 9444-22 Brand``).
|
||||
|
||||
Rating scales differ by crop and are preserved verbatim (the chunker
|
||||
never fabricates a value):
|
||||
|
||||
- **Corn** publishes an on-page legend:
|
||||
``9: Excellent, 8: Very Good, 7: Good, 6: Average,
|
||||
5: Below Average`` — a **1-9 numeric** scale, **HIGHER = BETTER /
|
||||
more tolerant** (same direction as Bayer/NK, so no flip). Applies
|
||||
to the agronomic performance panel (Drydown/Root/Stalk/Stress/
|
||||
Cold Emergence/Test Weight) and the disease panel (Tar Spot/Gray
|
||||
Leaf Spot/Eye Spot/N.C. Leaf Blight/Goss' Wilt/Common Rust/…).
|
||||
Plant descriptors / soil placement / herbicide rows are
|
||||
qualitative (Tall, Highly Recommended, Yes/No) and pass through.
|
||||
- **Soybeans** are entirely **qualitative** (Excellent / Very Good
|
||||
/ Good / … and Resistant / Strong / Good / Susceptible for
|
||||
disease; "higher/'Resistant' = better"). There is no numeric
|
||||
legend on soy pages. SCN (Soybean Cyst Nematode) and RPS Gene
|
||||
rows carry the *source/gene* (e.g. Peking, 3a) rather than a
|
||||
rating.
|
||||
|
||||
We parse the chart into structured ``characteristics_groups`` — a
|
||||
DISEASE RATINGS group, an AGRONOMIC CHARACTERISTICS group, and a few
|
||||
pass-through groups (PLANT DESCRIPTION / SOIL & PLACEMENT / HERBICIDE
|
||||
TOLERANCE / SEED TREATMENT NOTES) — so every rating lands in the
|
||||
embedded chunk and is actually retrievable.
|
||||
|
||||
Output:
|
||||
corpus/stine/<source_key>.md
|
||||
corpus/stine/<source_key>.json
|
||||
|
||||
source_key: ``stine-<productcode>`` lowercased, e.g.
|
||||
``stine-9444-22`` (corn) or ``stine-22r32`` (soy).
|
||||
|
||||
CLI:
|
||||
python -m scrape.sources.stine --crop corn --limit 2 --force
|
||||
python -m scrape.sources.stine --crop soybeans --limit 2 --force
|
||||
python -m scrape.sources.stine --force
|
||||
python -m scrape.sources.stine --product stine-9444-22
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import warnings
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
try: # bs4>=4.11 raises this when html.parser sees an XML doc (the sitemap)
|
||||
from bs4 import XMLParsedAsHTMLWarning
|
||||
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
||||
except Exception: # pragma: no cover — older bs4 without the warning class
|
||||
pass
|
||||
|
||||
# Version tag and User-Agent string for this scraper.
SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

# Site root, sitemap, and the per-crop comparison ajax endpoints.
BASE = "https://www.stineseed.com"
SITEMAP = BASE + "/sitemap.xml"
AJAX = {
    crop: f"{BASE}/ajax/{segment}-comparison/filter_products.php"
    for crop, segment in (("corn", "corn"), ("soybeans", "soybean"))
}

# URL path segment -> crop value used downstream. The chunker keys on
# the PLURAL "soybeans" for its MG branch; Stine sells no wheat.
CROP_PATHS = dict(corn="corn", soybeans="soybeans")

# The site publishes no robots.txt (404) and therefore no Crawl-delay;
# 1.5 s between requests keeps a full ~217-page run to roughly 6 minutes.
REQ_INTERVAL_SEC = 1.5

# Human-readable description of how the published ratings should be read.
RATING_SCALE_DIRECTION = (
    "corn agronomic+disease 1-9 numeric, 9=Excellent/best/most-tolerant, 8=Very Good, "
    "7=Good, 6=Average, 5=Below Average (higher=better, same direction as Bayer/NK; "
    "blank/'-'=not rated); soybeans qualitative (Excellent/Very Good/Good for vigor; "
    "Resistant/Strong/Good/Susceptible for disease, Resistant/Strong=best); "
    "SCN row gives source (e.g. Peking) and RPS Gene gives the gene, not a rating; "
    "plant/soil/herbicide rows qualitative (Tall, Highly Recommended/Recommended, Yes/No)"
)
|
||||
|
||||
# ---- Chart-label classification -------------------------------------
|
||||
# The agronomy chart is a flat run of label/value <li>s mixing identity,
|
||||
# performance ratings, disease ratings, plant descriptors, soil/placement,
|
||||
# and herbicide rows. We bucket by label into characteristics_groups the
|
||||
# chunker understands (DISEASE RATINGS -> disease framing, AGRONOMIC
|
||||
# CHARACTERISTICS -> agronomic framing; the rest pass through titled).
|
||||
|
||||
# Identity rows already captured into RM/MG/dedicated facts — not repeated
|
||||
# as a generic characteristic.
|
||||
_IDENTITY_LABELS = {"maturity", "maturity end"}
|
||||
|
||||
# Corn 1-9 performance ratings -> AGRONOMIC CHARACTERISTICS.
|
||||
_CORN_AGRONOMIC = {
|
||||
"gdd", "mn maturity", "drydown", "root", "stalk", "stress",
|
||||
"cold emergence", "test weight", "harvest population",
|
||||
}
|
||||
# Corn disease ratings -> DISEASE RATINGS. Set kept generous because the
|
||||
# disease list varies per page (some add S.C. Leaf Blight / Anthracnose).
|
||||
_CORN_DISEASE = {
|
||||
"tar spot", "gray leaf spot", "eye spot", "n.c. leaf blight",
|
||||
"s.c. leaf blight", "anthracnose", "goss' wilt", "goss’ wilt",
|
||||
"common rust", "northern corn leaf blight", "southern corn leaf blight",
|
||||
"diplodia", "fusarium", "head smut",
|
||||
}
|
||||
# Corn plant descriptors -> PLANT DESCRIPTION.
|
||||
_CORN_PLANT = {"plant height", "ear placement", "ear flex", "cob color"}
|
||||
# Corn soil/placement -> SOIL & PLACEMENT.
|
||||
_CORN_SOIL = {
|
||||
"corn-on-corn", "sand", "loam", "clay", "wide rows", "narrow rows",
|
||||
'population % in 30" or wider rows', "population % in narrow rows",
|
||||
"population", "drought tolerance",
|
||||
}
|
||||
# Corn herbicide -> HERBICIDE TOLERANCE.
|
||||
_CORN_HERBICIDE = {"glyphosate tolerant", "glufosinate tolerant"}
|
||||
|
||||
# Soy vigor/standability -> AGRONOMIC CHARACTERISTICS.
|
||||
_SOY_AGRONOMIC = {"emergence", "standability", "shattering", "lodging"}
|
||||
# Soy disease + nematode + gene rows -> DISEASE RATINGS (SCN/RPS carry a
|
||||
# source/gene rather than a rating; that's still the disease panel).
|
||||
_SOY_DISEASE = {
|
||||
"phytophthora root rot", "rps gene", "iron deficiency chlorosis",
|
||||
"brown stem rot", "sudden death syndrome", "soybean cyst nematode",
|
||||
"frogeye leafspot", "frogeye leaf spot", "sclerotinia white mold",
|
||||
"white mold", "stem canker", "root knot nematode", "soybean rust",
|
||||
}
|
||||
# Soy plant descriptors / quality -> PLANT DESCRIPTION.
|
||||
_SOY_PLANT = {
|
||||
"height", "flower", "pubescence", "hilum", "chloride", "pod color",
|
||||
"canopy", "protein", "oil",
|
||||
}
|
||||
# Soy herbicide/trait management -> HERBICIDE TOLERANCE.
|
||||
_SOY_HERBICIDE = {"sulfonylurea tolerance", "sts", "glyphosate tolerant"}
|
||||
|
||||
# Two directory levels above the directory containing this module.
REPO_ROOT = Path(__file__).resolve().parents[2]
# A non-empty CORPUS_ROOT environment variable overrides the default
# <repo>/corpus location.
CORPUS_ROOT = (
    Path(os.environ["CORPUS_ROOT"])
    if os.environ.get("CORPUS_ROOT")
    else REPO_ROOT / "corpus"
)
CORPUS_DIR = CORPUS_ROOT.joinpath("stine")

log = logging.getLogger("scrape.stine")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- HTTP
|
||||
|
||||
|
||||
class RateLimitedSession:
    """Polite ``requests`` session: paced requests plus bounded retries.

    Every attempt starts at least ``interval`` seconds after the
    previous one. Network errors, HTTP 429 and HTTP 5xx are retried
    with exponential backoff. Stine's live catalog is ~217 detail
    pages, so 1.5 s/req still finishes in a few minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # time.monotonic() stamp taken just before the latest attempt

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last attempt."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff with jitter, capped at 30 seconds."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying transient failures.

        Args:
            method: HTTP verb.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1.
            timeout: Per-attempt timeout passed to ``requests``.
            **kw: Extra keyword arguments forwarded to
                ``Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx. If every
            attempt ends in 429/5xx, the last such response is returned
            so the caller's ``raise_for_status`` reports the real status.

        Raises:
            requests.RequestException: When the final attempt fails at
                the network level.
        """
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            is_final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, resp = exc, None
                if is_final:
                    break  # nothing left to retry, so skip the pointless sleep
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so an earlier network error is no longer
            # the outcome to report.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_final:
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured;
                # anything else falls back to exponential backoff.
                backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- model
|
||||
|
||||
|
||||
@dataclass
class StineVariety:
    """One parsed Stine product: identity, maturity, traits and rating groups."""

    source_key: str
    source_url: str
    # Chunker crop value: "corn" or "soybeans".
    crop: str
    # Bare product code, e.g. "9444-22" or "22R32".
    product_name: str = ""
    # Corn only: representative relative-maturity days.
    relative_maturity: int | None = None
    # Soybeans only: maturity group.
    maturity_group: float | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    # Chunker source of truth, shaped as
    # [{label, items: [{characteristic, value}]}].
    groups: list[dict] = field(default_factory=list)
    sitemap_last_modified: str | None = None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- discovery
|
||||
|
||||
|
||||
_DETAIL_RE = re.compile(
|
||||
r"^https?://(?:www\.)?stineseed\.com/(corn|soybeans)/traits/"
|
||||
r"([^/]+)/([^/]+)/?$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class DiscoveredURL:
    """A product detail page found during enumeration."""

    # Canonical (trailing-slash) detail URL.
    url: str
    # Chunker crop value looked up from the URL's crop segment.
    crop: str
    # The {trait-slug} and {code} path segments of the detail URL.
    trait_slug: str
    code: str
    # <lastmod> text from the sitemap entry, when one was present.
    lastmod: str | None = None
|
||||
|
||||
|
||||
def _norm_url(url: str) -> str:
|
||||
"""Canonical product URL has a trailing slash."""
|
||||
url = url.strip()
|
||||
if not url.endswith("/"):
|
||||
url += "/"
|
||||
return url
|
||||
|
||||
|
||||
def discover_sitemap(http: RateLimitedSession, *,
                     only_crop: str | None) -> list[DiscoveredURL]:
    """Enumerate live product detail pages from ``sitemap.xml``.

    Only URLs of the four-segment form ``/{crop}/traits/{slug}/{code}/``
    are kept; the bare ``/{crop}/traits/{slug}/`` landing pages do not
    match ``_DETAIL_RE`` and are ignored. Results are de-duplicated on
    the canonical URL, optionally filtered to ``only_crop``, and
    returned sorted by (crop, code).
    """
    resp = http.get(SITEMAP)
    resp.raise_for_status()
    # html.parser is used because an lxml/xml backend isn't a guaranteed
    # dependency. It lowercases tag names, but <url>/<loc>/<lastmod> are
    # lowercase already, so the lookups below still work.
    doc = BeautifulSoup(resp.text, "html.parser")

    found: list[DiscoveredURL] = []
    seen_urls: set[str] = set()
    for entry in doc.find_all("url"):
        loc_tag = entry.find("loc")
        if not loc_tag:
            continue
        loc = loc_tag.get_text(strip=True)
        hit = _DETAIL_RE.match(loc)
        if not hit:
            continue
        path_crop, trait_slug, code = hit.groups()
        crop = CROP_PATHS.get(path_crop.lower())
        if not crop:
            continue
        if only_crop and crop != only_crop:
            continue
        canon = _norm_url(loc)
        if canon in seen_urls:
            continue
        seen_urls.add(canon)
        lastmod_tag = entry.find("lastmod")
        lastmod = lastmod_tag.get_text(strip=True) if lastmod_tag else None
        found.append(DiscoveredURL(canon, crop, trait_slug, code, lastmod))

    found.sort(key=lambda item: (item.crop, item.code))
    log.info("sitemap: discovered %d product detail pages%s",
             len(found), f" (crop={only_crop})" if only_crop else "")
    return found
|
||||
|
||||
|
||||
def discover_ajax(http: RateLimitedSession, *,
                  only_crop: str | None) -> list[DiscoveredURL]:
    """Enumerate product pages from the comparison ajax fragments (fallback).

    NOTE: these endpoints return the FULL historical product set
    (thousands of discontinued entries, with duplicate codes pointing at
    the same slug), so results are de-duplicated on the canonical URL.
    The sitemap is preferred because it reflects only the live catalog.
    Entries carry no ``lastmod``; the list is sorted by (crop, code).
    """
    found: list[DiscoveredURL] = []
    seen_urls: set[str] = set()
    for crop, endpoint in AJAX.items():
        if only_crop and crop != only_crop:
            continue
        # Empty selectors mean "no filter", i.e. every product.
        resp = http.post(endpoint, data={"sel1": "", "sel2": "", "sel3": ""})
        resp.raise_for_status()
        fragment = BeautifulSoup(resp.text, "html.parser")
        for anchor in fragment.select("ul.comparison-list a[href]"):
            href = anchor.get("href") or ""
            absolute = href if href.startswith("http") else BASE + href
            hit = _DETAIL_RE.match(absolute)
            if not hit:
                continue
            path_crop, trait_slug, code = hit.groups()
            link_crop = CROP_PATHS.get(path_crop.lower())
            if not link_crop:
                continue
            if only_crop and link_crop != only_crop:
                continue
            canon = _norm_url(absolute)
            if canon in seen_urls:
                continue
            seen_urls.add(canon)
            found.append(DiscoveredURL(canon, link_crop, trait_slug, code))

    found.sort(key=lambda item: (item.crop, item.code))
    log.info("ajax: discovered %d product detail pages%s",
             len(found), f" (crop={only_crop})" if only_crop else "")
    return found
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- parse
|
||||
|
||||
|
||||
def _clean(s: str) -> str:
|
||||
return re.sub(r"\s+", " ", s or "").strip()
|
||||
|
||||
|
||||
def _slug_to_trait(slug: str) -> str:
|
||||
"""Humanize a trait-slug into a display trait name.
|
||||
|
||||
``duracade-refuge-renew`` -> ``DuraCade Refuge Renew``;
|
||||
``enlist-e3-soybeans`` -> ``Enlist E3``; ``stine-gt-`` ->
|
||||
``Stine GT``; ``vt-double-pro-technology`` -> ``VT Double Pro``;
|
||||
``conventional-corn`` -> ``Conventional``.
|
||||
"""
|
||||
words = [w for w in re.split(r"[-_]+", slug) if w]
|
||||
drop_tail = {"soybeans", "soybean", "corn", "technology"}
|
||||
while words and words[-1].lower() in drop_tail:
|
||||
words.pop()
|
||||
if not words:
|
||||
return slug
|
||||
# Known acronyms / brand casings.
|
||||
acronyms = {"gt": "GT", "vt": "VT", "e3": "E3", "rnai": "RNAi",
|
||||
"sts": "STS", "ll": "LL", "rr2": "RR2", "3010": "3010",
|
||||
"3110": "3110", "3110a": "3110A"}
|
||||
out: list[str] = []
|
||||
for w in words:
|
||||
lw = w.lower()
|
||||
if lw in acronyms:
|
||||
out.append(acronyms[lw])
|
||||
elif lw == "duracade":
|
||||
out.append("DuraCade")
|
||||
elif lw == "viptera":
|
||||
out.append("Viptera")
|
||||
elif lw == "smartstax":
|
||||
out.append("SmartStax")
|
||||
elif lw == "xtendflex":
|
||||
out.append("XtendFlex")
|
||||
elif lw == "trecepta":
|
||||
out.append("Trecepta")
|
||||
elif lw == "agrisure":
|
||||
out.append("Agrisure")
|
||||
elif lw == "gt27":
|
||||
out.append("GT27")
|
||||
else:
|
||||
out.append(w.capitalize())
|
||||
return " ".join(out)
|
||||
|
||||
|
||||
def _extract_code(h1_text: str, fallback: str) -> str:
    """Pull the product code out of a ``Stine ® 9444-22 Brand`` style H1.

    The ®/™ marks and the words Stine / Brand (any case) and NEW
    (upper-case only) are blanked out; the first remaining token is the
    code provided it contains a digit. Otherwise the upper-cased
    ``fallback`` (the URL code segment) is returned.
    """
    substitutions = (
        (r"®|™", 0),
        (r"\bStine\b", re.I),
        (r"\bBrand\b", re.I),
        (r"\bNEW\b", 0),
    )
    text = h1_text
    for pattern, flags in substitutions:
        text = re.sub(pattern, " ", text, flags=flags)
    text = _clean(text)

    first_token = text.split(" ")[0] if text else ""
    has_digit = any(ch.isdigit() for ch in first_token)
    if first_token and has_digit:
        return first_token
    return fallback.upper()
|
||||
|
||||
|
||||
def _parse_corn_maturity(value: str) -> int | None:
|
||||
"""Corn 'Maturity' is an RM range like '98 - 100' or a single '99'.
|
||||
Store the representative integer (mean of the range, rounded)."""
|
||||
nums = [int(n) for n in re.findall(r"\d+", value or "")]
|
||||
if not nums:
|
||||
return None
|
||||
if len(nums) == 1:
|
||||
return nums[0]
|
||||
return round(sum(nums[:2]) / 2)
|
||||
|
||||
|
||||
def _parse_soy_mg(value: str) -> float | None:
|
||||
"""Soy 'Maturity' is the RM expressed as a 2- or 3-digit code where
|
||||
MG = value/10 for 2-digit codes ('21' -> 2.1, '50' -> 5.0) and
|
||||
value/100 for 3-digit leading-zero codes ('008' -> 0.08). For a
|
||||
range ('008 - 009') take the start value."""
|
||||
m = re.match(r"\s*(\d+)", value or "")
|
||||
if not m:
|
||||
return None
|
||||
raw = m.group(1)
|
||||
n = int(raw)
|
||||
if len(raw) >= 3:
|
||||
return round(n / 100.0, 2)
|
||||
return round(n / 10.0, 2)
|
||||
|
||||
|
||||
def _bucket(crop: str, label: str) -> str:
    """Map a chart label to its ``characteristics_groups`` label.

    Returns ``""`` for identity labels (maturity rows, which are handled
    as dedicated facts), the matching group name for a known label, and
    ``"OTHER CHARACTERISTICS"`` for everything else. Any crop other than
    ``"corn"`` is classified with the soybean tables.
    """
    key = label.lower().strip()
    if key in _IDENTITY_LABELS:
        return ""
    if crop == "corn":
        table = (
            (_CORN_DISEASE, "DISEASE RATINGS"),
            (_CORN_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_CORN_PLANT, "PLANT DESCRIPTION"),
            (_CORN_SOIL, "SOIL & PLACEMENT"),
            (_CORN_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    else:  # soybeans
        table = (
            (_SOY_DISEASE, "DISEASE RATINGS"),
            (_SOY_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_SOY_PLANT, "PLANT DESCRIPTION"),
            (_SOY_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    for members, group_name in table:
        if key in members:
            return group_name
    return "OTHER CHARACTERISTICS"
|
||||
|
||||
|
||||
def _parse_chart(crop: str, chart) -> tuple[list[dict], list[tuple[str, str]]]:
    """Turn the ``ul.agronomy-chart`` element into grouped characteristics.

    Returns ``(groups, raw_pairs)``:

    * ``groups`` is the characteristics_groups list, in a fixed group
      order with empty groups omitted; items keep their chart order.
    * ``raw_pairs`` is every (label, value) pair seen, including identity
      labels and repeats (the caller pulls RM/MG from it).
    """
    # Stable group order for rendering.
    group_order = ("AGRONOMIC CHARACTERISTICS", "DISEASE RATINGS",
                   "PLANT DESCRIPTION", "SOIL & PLACEMENT",
                   "HERBICIDE TOLERANCE", "OTHER CHARACTERISTICS")
    by_group: dict[str, list[dict]] = {name: [] for name in group_order}
    pairs: list[tuple[str, str]] = []
    emitted: set[tuple[str, str]] = set()

    for row in chart.find_all("li", recursive=False):
        label_el = row.find("strong")
        if not label_el:
            continue
        label = _clean(label_el.get_text(" ", strip=True))
        if not label:
            continue
        value_el = row.find("span", class_="value")
        value = _clean(value_el.get_text(" ", strip=True)) if value_el else ""
        pairs.append((label, value))

        group_name = _bucket(crop, label)
        if not group_name:
            continue
        # The soy page repeats "Maturity" twice and those are dropped via
        # the identity labels; de-dupe any other accidental repeats too.
        fingerprint = (label.lower(), value.lower())
        if fingerprint in emitted:
            continue
        emitted.add(fingerprint)
        by_group[group_name].append({"characteristic": label, "value": value})

    groups = [{"label": name, "items": items}
              for name, items in by_group.items() if items]
    return groups, pairs
|
||||
|
||||
|
||||
def _prepend_maturity_item(groups: list[dict], characteristic: str,
                           value: str) -> None:
    """Insert a maturity item first in AGRONOMIC CHARACTERISTICS, creating
    that group at the front of ``groups`` when it does not exist yet."""
    item = {"characteristic": characteristic, "value": value}
    for g in groups:
        if g["label"] == "AGRONOMIC CHARACTERISTICS":
            g["items"].insert(0, item)
            return
    groups.insert(0, {"label": "AGRONOMIC CHARACTERISTICS", "items": [item]})


def parse_detail(http: RateLimitedSession, d: DiscoveredURL) -> StineVariety:
    """Fetch one variety detail page and parse it into a ``StineVariety``.

    The product code comes from the page ``<h1>`` (with ``d.code`` as the
    fallback), the characteristics from the ``ul.agronomy-chart`` inside
    ``section.agronomic-details``, and the trait from ``d.trait_slug``.
    A page without a chart still yields a variety, with empty ``groups``.

    Errors raised by ``r.raise_for_status()`` propagate to the caller.
    """
    r = http.get(d.url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    h1 = soup.find("h1")
    h1_text = _clean(h1.get_text(" ", strip=True)) if h1 else ""
    code = _extract_code(h1_text, d.code)

    sec = soup.find("section", class_="agronomic-details")
    chart = sec.find("ul", class_="agronomy-chart") if sec else None
    groups: list[dict] = []
    raw_pairs: list[tuple[str, str]] = []
    if chart:
        groups, raw_pairs = _parse_chart(d.crop, chart)

    # Pull maturity from the first "Maturity" pair.
    mat_text = next(
        (value for label, value in raw_pairs if label.lower() == "maturity"),
        "")

    rm: int | None = None
    mg: float | None = None
    if d.crop == "corn":
        rm = _parse_corn_maturity(mat_text)
        maturity_label = "Maturity (RM range)"
    else:
        mg = _parse_soy_mg(mat_text)
        maturity_label = "Maturity (RM)"
    # Keep the verbatim maturity text as a characteristic so it stays
    # retrievable alongside the parsed representative number.
    if mat_text:
        _prepend_maturity_item(groups, maturity_label, mat_text)

    trait = _slug_to_trait(d.trait_slug)
    # A falsy trait (None or "") means "no trait"; test for that first so
    # .lower() is never called on None.
    if not trait:
        trait_stack: list[str] = []
    elif trait.lower() == "conventional":
        trait_stack = ["Conventional"]
    else:
        trait_stack = [trait]

    return StineVariety(
        source_key=f"stine-{code.lower()}",
        source_url=d.url,
        crop=d.crop,
        product_name=code,
        relative_maturity=rm,
        maturity_group=mg,
        trait_stack=trait_stack,
        positioning=None,
        groups=groups,
        sitemap_last_modified=d.lastmod,
    )
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- render
|
||||
|
||||
|
||||
def render_markdown(v: StineVariety) -> str:
    """Render a variety as a markdown document.

    The output is a title, a bullet list of identity facts, a horizontal
    rule, then one ``##`` section per characteristics group.  An empty
    characteristic value is shown as an em dash.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# Stine {v.product_name}",
        "",
        "- **Vendor:** Stine Seed Company (independent family-owned breeder, Adel, IA)",
        "- **Brand:** Stine",
        f"- **Crop:** {crop_label}",
    ]

    # Corn reports relative maturity, soybeans a maturity group.
    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days (representative)")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")

    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Stine dealer network — Corn Belt (IA/IL/IN/MN/NE/MO etc.)",
        "",
        "---",
        "",
    ])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def write_variety(v: StineVariety, body_md: str) -> None:
    """Write ``<source_key>.md`` and its ``<source_key>.json`` sidecar.

    Both files go into ``CORPUS_DIR`` (created if missing), UTF-8
    encoded.  The sidecar carries the variety's structured fields plus
    the fetch timestamp (UTC) and the scraper version.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"

    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "Stine dealer network (Corn Belt — IA/IL/IN/MN/NE/MO etc.)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    sidecar = {
        "source": "stine",
        "source_key": v.source_key,
        "vendor": "Stine Seed Company",
        "brand": "Stine",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path.write_text(payload, encoding="utf-8")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- pipeline
|
||||
|
||||
|
||||
def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None, enumerate_via: str) -> int:
    """Enumerate varieties, then fetch, parse and write each one.

    * ``limit``: stop after this many varieties have been looked at
      (skipped ones count too); ``None`` means no cap.
    * ``force``: re-fetch even when the markdown file already exists.
    * ``only_crop``: passed through to the enumeration helper.
    * ``only_product``: keep only the variety whose source_key or product
      code matches (case-insensitive).
    * ``enumerate_via``: "ajax" uses the ajax enumeration; anything else
      uses the sitemap, falling back to ajax when it yields nothing.

    Returns 2 when ``only_product`` matches nothing, otherwise 0.
    Per-variety failures are logged and counted, not raised.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    via_ajax = enumerate_via == "ajax"
    enumerate_fn = discover_ajax if via_ajax else discover_sitemap
    discovered = enumerate_fn(http, only_crop=only_crop)
    if not via_ajax and not discovered:
        log.warning("sitemap yielded nothing — falling back to ajax")
        discovered = discover_ajax(http, only_crop=only_crop)

    if only_product:
        wanted = only_product.lower()
        discovered = [
            cand for cand in discovered
            if wanted in (f"stine-{cand.code.lower()}", cand.code.lower())
        ]
        if not discovered:
            log.error("no variety matched --product=%s", only_product)
            return 2

    total = len(discovered)
    tally = dict.fromkeys(("written", "skipped", "empty", "failed"), 0)
    processed = 0
    for d in discovered:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"stine-{d.code.lower()}"

        # An existing markdown file marks the variety as already scraped.
        if not force and (CORPUS_DIR / f"{source_key}.md").exists():
            tally["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue

        try:
            v = parse_detail(http, d)
        except requests.HTTPError as exc:
            tally["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        except Exception as exc:  # noqa: BLE001 — keep the run going
            tally["failed"] += 1
            log.error("[%d/%d] %s parse failed: %s",
                      processed, total, source_key, exc)
            continue

        if not v.groups:
            tally["empty"] += 1
            log.warning("[%d/%d] %s — no chart groups parsed (still writing identity)",
                        processed, total, source_key)

        write_variety(v, render_markdown(v))
        tally["written"] += 1
        maturity = v.relative_maturity if v.crop == "corn" else v.maturity_group
        traits = ",".join(v.trait_stack) or "-"
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 maturity, len(v.groups), traits)

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, tally["written"], tally["skipped"],
             tally["empty"], tally["failed"], total)
    return 0
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- CLI
|
||||
|
||||
|
||||
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Stine scraper.

    ``--log-level`` defaults to the ``LOG_LEVEL`` environment variable,
    or "INFO" when that is unset.
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.stine",
        description="Scrape Stine Seed Company (independent Corn Belt breeder) — "
                    "corn + soybeans via sitemap enumeration + detail pages.",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_PATHS),
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or product code.")
    parser.add_argument(
        "--enumerate", dest="enumerate_via", default="sitemap",
        choices=["sitemap", "ajax"],
        help="Enumeration source (default: sitemap; ajax = full historical set).")
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse CLI arguments, configure logging to stderr, and run the scraper.

    ``argv`` is handed to ``parse_args`` as-is; the return value is the
    exit code produced by ``run``.
    """
    parser = _build_argparser()
    args = parser.parse_args(argv)

    level_name = args.log_level.upper()
    logging.basicConfig(
        level=level_name,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )

    exit_code = run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
        enumerate_via=args.enumerate_via,
    )
    return exit_code
|
||||
|
||||
|
||||
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user