Add 4 independent seed brands: Latham + Stine + 1st Choice + Burrus (+623 varieties) (#17)
Image rebuild (skip scrape) / build (push) Successful in 4m44s

Co-authored-by: claude <claude@jpaul.io>
Co-committed-by: claude <claude@jpaul.io>
This commit was merged in pull request #17.
This commit is contained in:
2026-06-04 21:58:07 -04:00
committed by Claude (agent)
parent 22e8092faf
commit 84ad2b1de6
1254 changed files with 103589 additions and 4 deletions
+561
View File
@@ -0,0 +1,561 @@
"""Burrus Seed scraper — independent family-owned company (Arenzville, IL).
Source: Burrus Hybrids ("Burrus Seed"), an independent family company
founded **1935** in Arenzville, Illinois — NOT owned by any of the
multinationals (Bayer / Corteva / Syngenta / BASF). It markets corn under
the **Burrus** and **Power Plus** brands and soybeans under the **Burrus**
and **DONMARIO** brands, sold through a dealer network across IL / IN / IA
/ MO / WI.
Unlike the ProHarvest scraper (which parses HTML detail pages), Burrus
publishes its full agronomic dataset through the **Seedware** catalog
widget's JSON-over-JSONP API (the backend for the product finder on
``burrusseed.com/products/{corn,soybeans}``). So this scraper does TWO
list calls and maps JSON fields straight into ``characteristics_groups``;
there is no per-variety page fetch.
Seedware API
------------
``GET https://burrus25.seedware.net/app/_queries/crop_varieties.php
?crop_pkey=101&callback=cb`` -> CORN (JSONP)
``crop_pkey=102`` -> SOYBEANS
Both require:
* a ``callback`` query param (WITHOUT it the endpoint returns ``[]``),
* a ``Referer: https://burrusseed.com/`` header.
The response is ``cb([...]);`` — strip the JSONP wrapper to get a JSON
array of ~38 corn + ~26 soy records. Each record has ~44 fields:
``id`` (variety code, e.g. ``8J697AM``), ``description`` (brand + code,
e.g. ``Power Plus 8J697AM``), ``pkey`` (Seedware row id), ``maturity``
(RM for corn / MG for soy, as a string like ``"97.00"`` / ``"2.00"``),
``released`` (year int), ``trait`` / ``trait_platform``, a per-record
brand in ``stat_corn_brand`` / ``stat_soybean_brand``, and many
``stat_*`` agronomic / disease / herbicide-tolerance ratings.
Rating scales (confirmed from the live data, Jun 2026)
------------------------------------------------------
* **Numeric agronomic + disease ratings: 1-10, 10 = best / most
tolerant** (observed values 4-10; standard Seedware/seed-industry
high-is-better scale). Soy agronomic stats arrive as ``"8.000"`` —
the trailing zeros are stripped to ``"8"``. ``NR`` / ``None`` /
blank / ``-`` / ``0`` = not rated and are SKIPPED (never coerced to a
value).
* **Herbicide tolerance + insect-protection packages: Yes / No**
(verbatim). ``glyphosate`` / ``glufosinate`` / ``2,4-D choline`` /
``FOPs`` / ``dicamba`` tolerances and the Bt insect packages
(corn borer / rootworm / etc.) are categorical Yes/No, not numeric.
* **Categorical agronomic notes** (corn-on-corn suitability, refuge
structure) pass through verbatim.
Output:
corpus/burrus/<source_key>.md
corpus/burrus/<source_key>.json
source_key: ``burrus-<id>`` lowercased + slugified, e.g.
``burrus-8j697am``. The variety ``id`` (the catalog code) is stable.
CLI:
python -m scrape.sources.burrus --crop corn --limit 2 --force
python -m scrape.sources.burrus --crop soybeans
python -m scrape.sources.burrus --force
python -m scrape.sources.burrus --product burrus-8j697am
ROBOTS / UA: burrusseed.com robots.txt blocks ~33 NAMED AI/scraper bots
(Scrapy, CCBot, Bytespider, Diffbot, ...) and declares ``Crawl-delay: 10``
+ ``Content-signal: ai-train=no``; ``User-agent: *`` is allowed. The
operator has chosen to include this source. We use a non-blacklisted UA
and honour the 10-second crawl delay (the API call count is tiny — two
list calls — so this is cheap).
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
# Version stamp written into every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# NOT any blacklisted bot name — robots.txt allows User-agent: *.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Seedware host backing the catalog widget, and its variety-list endpoint.
SEEDWARE = "https://burrus25.seedware.net"
API = f"{SEEDWARE}/app/_queries/crop_varieties.php"
# Public site: SITE builds each record's source_url; REFERER is sent as the
# Referer header on every request.
SITE = "https://burrusseed.com"
REFERER = "https://burrusseed.com/"
# crop name (the --crop CLI value, also the chunker crop value) ->
# (Seedware crop_pkey, public product page slug).
CROP_PKEYS = {
    "corn": (101, "corn"),
    "soybeans": (102, "soybeans"),
}
# robots.txt declares Crawl-delay: 10 for burrusseed.com / seedware.net.
# Honour it — the catalog is only two list calls so this is cheap.
REQ_INTERVAL_SEC = 10.0
# Human-readable description of the rating scale; rendered into every
# markdown file and stored in the sidecar as "_scale_direction".
RATING_SCALE_DIRECTION = (
    "numeric agronomic + disease ratings 1-10, 10=best/most-tolerant "
    "(observed 4-10; higher is better); NR/blank/0/'-' = not rated (omitted). "
    "Herbicide tolerances and Bt insect-protection packages are Yes/No "
    "(verbatim, not numeric). Corn-on-corn suitability and refuge structure "
    "are categorical."
)
# ----- stat_* field -> (group label, human characteristic name) -----------
#
# Group labels match the chunker's buckets in rag/chunk.py:
# "DISEASE RATINGS" -> disease framing
# "AGRONOMIC CHARACTERISTICS" -> agronomic framing
# "HERBICIDE TOLERANCE" -> falls into the chunker's MANAGEMENT
# bucket ("HERBICIDE" is a recognised label),
# so it renders as "Management notes".
# Fields intentionally NOT mapped: stat_corn_brand / stat_soybean_brand
# (used for the per-record brand), stat_herbicide_tolerance (always blank
# in the live data — the per-chemistry stats carry the real signal).
#
# Each dict below maps a Seedware record key to the label shown for it.
DISEASE_FIELDS = {
    # corn
    "stat_gray_leaf_spot_tolerance": "Gray leaf spot tolerance",
    "stat_tar_spot_tolerance": "Tar spot tolerance",
    # soy
    "stat_brown_stem_rot": "Brown stem rot (BSR) tolerance",
    "stat_sds": "Sudden death syndrome (SDS) tolerance",
    "stat_phytophthora_root_rot": "Phytophthora root rot tolerance",
    "stat_prr_phytophthora_root_rot": "Phytophthora root rot (PRR) tolerance",
}
# Agronomic ratings — numeric 1-10 (corn) and "8.000"-style (soy).
AGRONOMIC_NUMERIC_FIELDS = {
    # corn
    "stat_drought_tolerance": "Drought tolerance",
    "stat_greensnap_tolerance": "Greensnap tolerance",
    "stat_root_strength": "Root strength",
    "stat_stalk_strength": "Stalk strength",
    "stat_standability": "Standability",
    "stat_black_cutworm": "Black cutworm tolerance",
    # soy
    "stat_emergence": "Emergence",
    "stat_canopy_width": "Canopy width",
    "stat_plant_height": "Plant height",
}
# Agronomic categorical / Yes-No notes (insect protection + placement).
AGRONOMIC_CATEGORICAL_FIELDS = {
    "stat_corn_corn": "Corn-on-corn suitability",
    "stat_refuge": "Refuge structure",
    "stat_corn_borer": "Corn borer protection (Bt)",
    "stat_corn_rootworm": "Corn rootworm protection (Bt)",
    "stat_corn_earworm": "Corn earworm protection (Bt)",
    "stat_nematode": "Nematode protection",
    "stat_wireworm": "Wireworm protection",
}
# Herbicide tolerance — Yes/No per chemistry.
HERBICIDE_FIELDS = {
    "stat_glyphosate_tolerance": "Glyphosate tolerance",
    "stat_glufosinate_tolerance": "Glufosinate tolerance",
    "stat_24d_choline_tolerance": "2,4-D choline tolerance",
    "stat_dicamba_tolerance": "Dicamba tolerance",
    "stat_fops_tolerance": "FOPs (fop herbicide) tolerance",
}
# (group label, field map) pairs in output order; _build_groups walks this
# list. The numeric and categorical agronomic maps are merged into a single
# "AGRONOMIC CHARACTERISTICS" group (numeric entries first).
GROUP_ORDER = [
    ("DISEASE RATINGS", DISEASE_FIELDS),
    ("AGRONOMIC CHARACTERISTICS", {**AGRONOMIC_NUMERIC_FIELDS,
                                   **AGRONOMIC_CATEGORICAL_FIELDS}),
    ("HERBICIDE TOLERANCE", HERBICIDE_FIELDS),
]
# Values that mean "not rated" — never coerced into a chunk. Compared
# against the stripped, lower-cased string form of a value (see _is_rated).
_NOT_RATED = {"", "-", "--", "n/a", "na", "nr", "none", "0", "0.000", "0.00"}
# Output location: the CORPUS_ROOT environment variable overrides the
# default <repo root>/corpus; this scraper writes under its "burrus" subdir.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "burrus"
log = logging.getLogger("scrape.burrus")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session with rate limiting and retry backoff.

    Honours burrusseed.com's Crawl-delay: 10 (>=10 s between requests to
    seedware.net / burrusseed.com). The Burrus catalog is two list calls
    total.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the fixed request headers.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.s.headers["Referer"] = REFERER
        self.s.headers["Accept"] = "*/*"
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 = none sent yet.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last request.

        The very first request (``_last`` still 0.0) is never delayed.
        """
        delta = time.monotonic() - self._last
        if self._last and delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors and HTTP 429 / 5xx.

        Args:
            method: HTTP method name.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as a single attempt.
            timeout: Per-attempt timeout in seconds.
            **kw: Passed through to ``requests.Session.request``.

        Returns:
            The first non-retryable response, or the final attempt's
            response when every attempt returned 429 / 5xx (the caller is
            expected to inspect the status).

        Raises:
            requests.RequestException: When the final attempt itself fails
                at the network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if is_last:
                    # Nothing left to retry: surface the error right away
                    # instead of sleeping through one more backoff.
                    raise
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = (resp.status_code == 429
                         or 500 <= resp.status_code < 600)
            if not retryable or is_last:
                # The outcome always reflects the most recent attempt, so an
                # earlier network error never masks a later HTTP response.
                return resp
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # The last loop iteration always returns or raises.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
def _strip_jsonp(text: str) -> Any:
"""Strip a ``cb( ... );`` JSONP wrapper and parse the JSON inside."""
s = text.strip()
m = re.match(r"^[^(]*\((.*)\)\s*;?\s*$", s, re.S)
body = m.group(1) if m else s
return json.loads(body)
# --------------------------------------------------------------------- model
@dataclass
class BurrusVariety:
    """One catalog variety as mapped from a Seedware record.

    Built by ``map_record``; consumed by ``render_markdown`` and
    ``write_variety``.
    """
    source_key: str  # "burrus-<slug>", also the output file stem
    crop: str  # chunker value: corn / soybeans
    product_name: str  # "Power Plus 8J697AM"
    product_id: str  # "8J697AM"
    brand: str  # "Burrus" | "Power Plus" | "DONMARIO"
    relative_maturity: int | None = None  # set for corn only
    maturity_group: float | None = None  # set for non-corn (soybeans) only
    release_year: int | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None  # map_record always leaves this None
    # [{"label": ..., "items": [{"characteristic": ..., "value": ...}]}]
    groups: list[dict] = field(default_factory=list)
    source_url: str = ""
# --------------------------------------------------------------------- fetch
def fetch_crop(http: RateLimitedSession, crop_pkey: int) -> list[dict]:
    """Download and decode the variety list for one Seedware crop.

    Args:
        http: Session used for the single GET request.
        crop_pkey: Seedware crop identifier placed in the query string.

    Returns:
        The decoded JSON array.

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        ValueError: If the body is not valid JSON or decodes to something
            other than a list.
    """
    response = http.get(f"{API}?crop_pkey={crop_pkey}&callback=cb")
    response.raise_for_status()
    payload = _strip_jsonp(response.text)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"unexpected payload for crop_pkey={crop_pkey}: "
                     f"{type(payload).__name__}")
# --------------------------------------------------------------------- mapping
def _slug(s: str) -> str:
s = (s or "").strip().lower()
s = re.sub(r"[^a-z0-9]+", "-", s)
return re.sub(r"-+", "-", s).strip("-")
def _is_rated(v: Any) -> bool:
if v is None:
return False
return str(v).strip().lower() not in _NOT_RATED
def _clean_value(v: Any) -> str:
"""Normalise a stat value for display. Numeric soy stats arrive as
'8.000' — strip the trailing zeros to '8'. Everything else passes
through verbatim (Yes / No / Suitable / Integrated refuge / ...)."""
s = str(v).strip()
# numeric like "8.000" / "8.00" / "97.00" -> "8" / "97"
if re.fullmatch(r"-?\d+(?:\.\d+)?", s):
f = float(s)
return str(int(f)) if f == int(f) else (f"{f:g}")
return s
def _maturity(rec: dict, crop: str) -> tuple[int | None, float | None]:
raw = rec.get("maturity")
if raw is None or str(raw).strip() == "":
return None, None
try:
f = float(str(raw).strip())
except ValueError:
return None, None
if crop == "corn":
return int(round(f)), None
return None, round(f, 1)
def _brand(rec: dict) -> str:
"""Per-record brand. corn -> stat_corn_brand (Burrus / Power Plus);
soy -> stat_soybean_brand (Burrus / DONMARIO). Falls back to the
leading token of the description, else 'Burrus'."""
b = rec.get("stat_corn_brand") or rec.get("stat_soybean_brand")
if b and str(b).strip():
return str(b).strip()
desc = (rec.get("description") or "").strip()
code = (rec.get("id") or "").strip()
if desc and code and desc.lower().endswith(code.lower()):
lead = desc[: len(desc) - len(code)].strip()
if lead:
return lead
return "Burrus"
def _traits(rec: dict) -> list[str]:
out: list[str] = []
for key in ("trait", "trait_platform"):
v = rec.get(key)
if v and str(v).strip():
# strip stray trailing punctuation seen in the data
# ("Conventional." / "AM`")
t = str(v).strip().rstrip(".`")
if t and t not in out:
out.append(t)
return out
def _build_groups(rec: dict) -> list[dict]:
    """Build the characteristic groups for one record.

    Walks ``GROUP_ORDER``; within each group only rated fields are kept
    (per ``_is_rated``), with values normalised by ``_clean_value``.
    Groups that end up with no items are left out.
    """
    built: list[dict] = []
    for label, field_map in GROUP_ORDER:
        rated = [
            {"characteristic": human, "value": _clean_value(rec.get(stat_key))}
            for stat_key, human in field_map.items()
            if _is_rated(rec.get(stat_key))
        ]
        if rated:
            built.append({"label": label, "items": rated})
    return built
def map_record(rec: dict, crop: str) -> BurrusVariety:
    """Convert one Seedware record into a ``BurrusVariety``.

    The source key is derived from the variety ``id``; when that is empty
    it falls back to ``pkey-<pkey>`` and finally to the description.

    Args:
        rec: Decoded Seedware record.
        crop: Key of ``CROP_PKEYS`` the record was fetched for.
    """
    code = (rec.get("id") or "").strip()
    pkey = rec.get("pkey")
    if code:
        key_seed = code
    elif pkey:
        key_seed = f"pkey-{pkey}"
    else:
        key_seed = rec.get("description") or ""
    display_name = (rec.get("description") or code or key_seed).strip()
    rel_maturity, mat_group = _maturity(rec, crop)
    page_slug = CROP_PKEYS[crop][1]
    released = rec.get("released")
    return BurrusVariety(
        source_key=f"burrus-{_slug(key_seed)}",
        crop=crop,
        product_name=display_name,
        product_id=code or display_name,
        brand=_brand(rec),
        relative_maturity=rel_maturity,
        maturity_group=mat_group,
        # Only an integer year is accepted; anything else is dropped.
        release_year=released if isinstance(released, int) else None,
        trait_stack=_traits(rec),
        # The Seedware records carry no marketing blurb; leave positioning
        # null rather than fabricate one.
        positioning=None,
        groups=_build_groups(rec),
        source_url=f"{SITE}/products/{page_slug}",
    )
# --------------------------------------------------------------------- render
def render_markdown(v: BurrusVariety) -> str:
    """Render one variety as a markdown document.

    Layout: title, a bullet list of identity facts, a ``---`` rule, then
    one ``##`` section per characteristic group.
    """
    crop_label = {"corn": "Corn", "soybeans": "Soybeans"}.get(
        v.crop, v.crop.title())
    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Burrus Seed (Burrus Hybrids — independent family "
        "company, Arenzville, IL, since 1935)",
        f"- **Brand:** {v.brand}",
        f"- **Crop:** {crop_label}",
    ]
    # Maturity is crop-specific: days for corn, group number for soybeans.
    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    if v.release_year:
        lines.append(f"- **Released:** {v.release_year}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Burrus dealer network "
        "(IL / IN / IA / MO / WI)",
        "",
        "---",
        "",
    ])
    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or ''}"
            for item in group["items"]
        )
        lines.append("")
    return "\n".join(lines)
def write_variety(v: BurrusVariety, body_md: str) -> None:
    """Write ``<source_key>.md`` and its ``<source_key>.json`` sidecar.

    Both files go into ``CORPUS_DIR`` (created if needed) and overwrite any
    existing file of the same name.

    Args:
        v: Variety to persist.
        body_md: Markdown body for the ``.md`` file.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"
    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "Burrus dealer network (IL/IN/IA/MO/WI)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    sidecar = {
        "source": "burrus",
        "source_key": v.source_key,
        "vendor": "Burrus Seed",
        "brand": v.brand,
        "product_name": v.product_name,
        "product_id": v.product_id,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path.write_text(payload, encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Fetch the catalog, map every record and write the corpus files.

    Args:
        limit: Stop after processing this many varieties (``None`` = all).
        force: Re-write varieties whose markdown file already exists.
        only_crop: Restrict the run to one key of ``CROP_PKEYS``.
        only_product: Restrict the run to a single variety, matched against
            the source key or the variety id.

    Returns:
        Process exit code: 0 on success, 1 when every requested crop failed
        to fetch (nothing to process), 2 when ``only_product`` matched no
        variety.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    crops = [only_crop] if only_crop else list(CROP_PKEYS)
    records: list[tuple[str, dict]] = []
    failed_crops: list[str] = []
    for crop in crops:
        crop_pkey = CROP_PKEYS[crop][0]
        try:
            raw = fetch_crop(http, crop_pkey)
        except (requests.RequestException, ValueError) as exc:
            # RequestException (not just HTTPError): the session re-raises
            # connection errors / timeouts once its retries are exhausted,
            # and those must not abort the remaining crops either.
            log.error("fetch failed for crop=%s (pkey=%d): %s",
                      crop, crop_pkey, exc)
            failed_crops.append(crop)
            continue
        log.info("crop=%-9s pkey=%d: %d records", crop, crop_pkey, len(raw))
        records.extend((crop, rec) for rec in raw)
    if failed_crops and not records:
        # A total fetch failure must not be reported as a successful run
        # (nor as a misleading "no variety matched" with --product).
        log.error("no records fetched; all requested crops failed: %s",
                  ", ".join(failed_crops))
        return 1
    varieties = [map_record(rec, crop) for crop, rec in records]
    if only_product:
        key = only_product.lower()
        varieties = [v for v in varieties
                     if v.source_key == key or v.product_id.lower() == key
                     or _slug(v.product_id) == _slug(key)]
        if not varieties:
            log.error("no variety matched --product=%s", only_product)
            return 2
    counts = {"written": 0, "skipped": 0, "empty": 0}
    processed = 0
    total = len(varieties)
    for v in varieties:
        if limit is not None and processed >= limit:
            break
        processed += 1
        md_path = CORPUS_DIR / f"{v.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, v.source_key)
            continue
        if not v.groups:
            # Still written below: the identity record keeps the variety
            # discoverable even without ratings.
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no rating groups (still writing identity)",
                        processed, total, v.source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | brand=%s crop=%s rm/mg=%s groups=%d "
                 "traits=%s", processed, total, v.source_key, v.brand, v.crop,
                 v.relative_maturity or v.maturity_group or "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")
    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             total)
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for the Burrus scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (defaulting to the ``LOG_LEVEL`` environment variable,
    else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.burrus",
        description="Scrape Burrus Seed (independent family company, "
                    "Arenzville IL) — corn / soybeans via the Seedware "
                    "JSON-over-JSONP catalog API.",
    )
    add = parser.add_argument
    add("--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    add("--force", action="store_true",
        help="Re-write even if the markdown file already exists.")
    add("--crop", default=None, choices=sorted(CROP_PKEYS.keys()),
        help="Limit to one crop (corn / soybeans).")
    add("--product", default=None,
        help="Process a single variety by source_key or id.")
    add("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, configure logging, run the scraper.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by ``run``.
    """
    options = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=options.log_level.upper(),
    )
    return run(
        limit=options.limit,
        force=options.force,
        only_crop=options.crop,
        only_product=options.product,
    )
if __name__ == "__main__":
sys.exit(main())
+671
View File
@@ -0,0 +1,671 @@
"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).
Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
seed company in Rushville, Indiana, serving the Eastern Corn Belt
(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
that is out of scope for the row-crop advisor).
Discovery is by **sitemap**, NOT the WP REST API: the catalog custom
post types (corn-hybrids / soybeans / wheat) are NOT exposed to
``/wp-json/`` (every variety route returns ``rest_no_route``). Instead we
fetch ``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
per-crop child sitemaps:
- ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/`` (~52 URLs)
- ``/soybeans-sitemap.xml`` -> ``/soybeans/<slug>/`` (~22 URLs)
- ``/wheat-sitemap.xml`` -> ``/wheat/<slug>/`` (~4 URLs)
robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
between requests.
Detail-page DOM (server-rendered, no JS needed for the text):
* Product name: the second ``<h1>`` inside ``article.content`` (the
first is the site logo "1st Choice Seeds").
* Corn — three ``<h2>`` sections + a side table:
- "Hybrid Characteristics": a single ``<p>`` of ``label • value``
lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
Placement, Root Rating, Stalk Rating, Foliar Health, Drydown,
Ear Length/Girth/Flex, Test Weight). Some hybrids only publish
Seedling Vigor (genuinely thin pages — still written).
- "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
(the numeric 0-10 bars are drawn client-side by d3 and are NOT
in the HTML). The legend IS the scale: 0-4 Below Average … 9-10
Superior, so higher = better.
- "Management Tips": ``label: value`` lines (Corn-On-Corn,
Productivity / soil guidance, Silage Rating).
- A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
the Low/Medium/High recommended planting populations.
* Soybeans — three ``<h2>`` sections:
- "Field Notes": a ``<ul>`` of strengths (often includes SCN
source / PRR gene call-outs).
- "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
- "Variety Description": ``div`` blocks of ``<b>Label:</b> value``
pairs (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower
Color, Pubescence, Pod, Hilum).
* Wheat — thin (title + date only; wheat is private-label). We still
write an identity record so the variety is discoverable.
Rating scale: the published legend is **0-10, higher = better**
("Below Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8,
Superior 9-10"). 1st Choice publishes the *qualitative* word
(Excellent / Very Good / …) in the HTML — those map directly onto that
legend — while the numeric bar is d3-rendered and absent from the
markup. NA / blank = not rated.
Output:
corpus/first_choice/<source_key>.md
corpus/first_choice/<source_key>.json
source_key: ``firstchoice-<slug>`` lowercased, e.g.
``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.
CLI:
python -m scrape.sources.first_choice --crop corn --limit 5
python -m scrape.sources.first_choice --force
python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
# Version stamp for this scraper's output.
SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root and the sitemap index that discovery reads first.
BASE = "https://www.1stchoiceseeds.com"
SITEMAP_INDEX = f"{BASE}/sitemap.xml"
# Chunker crop value -> per-crop child sitemap file name. The chunker keys
# on "soybeans" (plural) for the MG branch, so map accordingly. The
# cover-crops sitemap is intentionally omitted (out of scope for the
# row-crop advisor).
CROP_SITEMAPS = {
    "corn": "corn-hybrids-sitemap.xml",
    "soybeans": "soybeans-sitemap.xml",
    "wheat": "wheat-sitemap.xml",
}
# URL path prefix that confirms a sitemap entry is a variety detail page
# (vs. a category/archive page that can sneak into a child sitemap).
CROP_PATH = {
    "corn": "/corn-hybrids/",
    "soybeans": "/soybeans/",
    "wheat": "/wheat/",
}
# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple min.
REQ_INTERVAL_SEC = 1.2
# Human-readable description of the published rating legend.
RATING_SCALE_DIRECTION = (
    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
    "publishes the qualitative word in HTML (the numeric bar is "
    "d3-rendered, not in markup); blank/NA = not rated"
)
# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
# Compared against the lower-cased label.
_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}
# Slug token -> human trait label, looked up per dash-separated token of
# the URL slug. Best-effort: tokens that are not listed here produce no
# trait entry.
TRAIT_LABELS = {
    # corn
    "vt2p": "VT Double PRO (VT2P)",
    "gt": "Glyphosate Tolerant (GT)",
    "c": "Conventional",
    "pc": "PowerCore (PC)",
    "tre": "Trecepta (TRE)",
    "ss": "SmartStax (SS)",
    "v": "VT (V)",
    "dv": "Double VT (DV)",
    "aa": "Agrisure Artesian (AA)",
    # soybeans
    "en": "Enlist E3 (EN)",
    "xf": "XtendFlex (XF)",
    "sts": "STS",
    # wheat
    "b": "Bin-run / branded (B)",
    "s": "Soft (S)",
}
# Output location: the CORPUS_ROOT environment variable overrides the
# default <repo root>/corpus; this scraper writes under "first_choice".
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "first_choice"
log = logging.getLogger("scrape.first_choice")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session with rate limiting and retry backoff.

    The 1st Choice row-crop catalog is small (~78 detail pages + 4
    sitemaps) so 1.2 s/req still finishes in a couple minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the User-Agent header.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the previous request.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last request."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors and HTTP 429 / 5xx.

        Args:
            method: HTTP method name.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as a single attempt.
            timeout: Per-attempt timeout in seconds.
            **kw: Passed through to ``requests.Session.request``.

        Returns:
            The first non-retryable response, or the final attempt's
            response when every attempt returned 429 / 5xx (the caller is
            expected to inspect the status).

        Raises:
            requests.RequestException: When the final attempt itself fails
                at the network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if is_last:
                    # Nothing left to retry: surface the error right away
                    # instead of sleeping through one more backoff.
                    raise
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            retryable = (resp.status_code == 429
                         or 500 <= resp.status_code < 600)
            if not retryable or is_last:
                # The outcome always reflects the most recent attempt, so an
                # earlier network error never masks a later HTTP response.
                return resp
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        # The last loop iteration always returns or raises.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
# --------------------------------------------------------------------- model
@dataclass
class FCVariety:
    """One 1st Choice variety: identity, maturity, traits and rating groups.

    Only ``source_key``, ``source_url`` and ``crop`` are required; the
    remaining fields default to empty and are filled in by the parsers.
    """
    source_key: str
    source_url: str
    crop: str  # chunker value: corn / soybeans / wheat
    product_name: str = ""  # "FC 8455 VT2P"
    relative_maturity: int | None = None  # corn (days)
    maturity_group: float | None = None  # soy
    wheat_class: str | None = None  # wheat
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    strengths: list[str] = field(default_factory=list)
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    sitemap_last_modified: str | None = None
# --------------------------------------------------------------------- discovery (sitemaps)
# Sitemap XML is scanned with regexes. Each value pattern tolerates an
# optional <![CDATA[ ... ]]> wrapper and surrounding whitespace, and
# captures the bare value in group 1.
# <loc> value (the page URL).
_LOC_RE = re.compile(r"<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</loc>",
                     re.IGNORECASE | re.DOTALL)
# One whole <url> ... </url> entry (group 1 = its inner XML).
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
# <lastmod> value of an entry.
_LASTMOD_RE = re.compile(r"<lastmod>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</lastmod>",
                         re.IGNORECASE | re.DOTALL)
def _slug_from_url(url: str) -> str:
return url.rstrip("/").rsplit("/", 1)[-1].lower()
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return [{crop, url, slug, lastmod}] for in-scope row-crop varieties
    by walking the per-crop child sitemaps under /sitemap.xml.

    We fetch each known child sitemap directly (their names are stable
    All-in-One-SEO conventions) rather than trusting the index ordering,
    but we still confirm against the index so a renamed sitemap is caught.

    Args:
        http: Session used for all sitemap requests.
        only_crop: When set, only this key of ``CROP_SITEMAPS`` is walked.

    Raises:
        requests.RequestException: If a child sitemap cannot be fetched or
            answers with an error status other than 404 (a 404 only skips
            that crop). A failure to read the index is logged, not raised.
    """
    # Pull the sitemap index once so we can warn if a crop sitemap is
    # missing/renamed (defensive; we still target the known names).
    index_locs: set[str] = set()
    try:
        idx = http.get(SITEMAP_INDEX)
        idx.raise_for_status()
        index_locs = {m.strip() for m in _LOC_RE.findall(idx.text)}
    except requests.RequestException as exc:
        log.warning("could not read sitemap index %s: %s (continuing with "
                    "known child sitemap names)", SITEMAP_INDEX, exc)
    records: list[dict] = []
    for crop, child in CROP_SITEMAPS.items():
        if only_crop and crop != only_crop:
            continue
        child_url = f"{BASE}/{child}"
        if index_locs and child_url not in index_locs:
            log.warning("crop sitemap %s not listed in the index — site may "
                        "have renamed it; trying anyway", child_url)
        r = http.get(child_url)
        if r.status_code == 404:
            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
            continue
        r.raise_for_status()
        prefix = CROP_PATH[crop]
        seen: set[str] = set()  # slugs already recorded for this crop
        n = 0
        for block in _URL_BLOCK_RE.findall(r.text):
            loc_m = _LOC_RE.search(block)
            if not loc_m:
                continue
            url = loc_m.group(1).strip()
            if prefix not in url:
                continue  # category/archive page leaked into the sitemap
            if not url.partition(prefix)[2].strip("/"):
                # The crop archive root itself (e.g. ".../corn-hybrids/")
                # contains the prefix but has no variety slug after it; it
                # must not be recorded as a variety named after the crop.
                continue
            slug = _slug_from_url(url)
            if not slug or slug in seen:
                continue
            seen.add(slug)
            lm_m = _LASTMOD_RE.search(block)
            records.append({
                "crop": crop,
                "url": url,
                "slug": slug,
                "lastmod": lm_m.group(1).strip() if lm_m else None,
            })
            n += 1
        log.info("crop sitemap %-22s (%s): %d varieties", child, crop, n)
    log.info("total varieties discovered: %d", len(records))
    return records
# --------------------------------------------------------------------- detail parse
def _clean(s: str) -> str:
return re.sub(r"\s+", " ", s or "").strip()
def _direct_text(el: Tag) -> str:
    """Return the whitespace-normalised text of *el*'s own string children.

    Text nested inside child tags is not included.
    """
    own_strings = [child for child in el.children
                   if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))
def _br_lines(el: Tag) -> list[str]:
    """Text of an element with <br> treated as a line break.

    Returns the stripped, non-empty lines. The element passed in is left
    unmodified.
    """
    import copy

    # Work on a copy so the original tree (used by other parsers) stays
    # intact: replace_with() below would otherwise remove the <br> tags
    # from the live document.
    clone = copy.copy(el)
    for br in clone.find_all("br"):
        br.replace_with("\n")
    return [ln.strip() for ln in clone.get_text("\n").split("\n") if ln.strip()]
def _product_name(article: Tag, slug: str) -> str:
    """Return the variety name taken from the article's ``<h1>`` headings.

    The first non-empty ``<h1>`` whose text is not the site logo
    ("1st Choice Seeds", compared case-insensitively) wins. Without such a
    heading the slug is tidied instead (upper-cased, dashes -> spaces).
    """
    headings = (_clean(h1.get_text(" ", strip=True))
                for h1 in article.find_all("h1"))
    name = next(
        (text for text in headings
         if text and text.lower() != "1st choice seeds"),
        None,
    )
    if name is not None:
        return name
    return slug.upper().replace("-", " ")
def _trait_stack(slug: str, crop: str) -> list[str]:
    """Derive trait labels from the slug tokens (e.g. fc-8455-vt2p -> VT2P,
    fb-3545-c-sts -> Conventional + STS).

    Every dash-separated token is looked up in ``TRAIT_LABELS``; model
    tokens (fc / 8455 / 20rw36 ...) are simply not in the table and so
    contribute nothing. Labels are de-duplicated and kept in left-to-right
    order. ``crop`` is accepted but not consulted.
    """
    labels: list[str] = []
    for token in slug.split("-"):
        label = TRAIT_LABELS.get(token.lower())
        if label is None or label in labels:
            continue
        labels.append(label)
    return labels
def _split_corn_line(ln: str) -> tuple[str, str]:
    """Split one "Hybrid Characteristics" line into ``(label, value)``.

    The first bullet-like glyph wins; failing that, the first colon; a line
    with neither is returned whole as the label with an empty value.
    """
    # U+2022 bullet, U+00B7 middle dot, U+25CF black circle, and U+0095
    # (what a cp1252 bullet byte becomes when decoded as Latin-1). They are
    # written as escapes so the separator can never degrade to "" — an empty
    # separator matches every line and makes str.partition raise ValueError.
    hit = re.search("[\u2022\u00b7\u25cf\u0095]", ln)
    if hit is not None:
        k, val = ln[:hit.start()], ln[hit.end():]
    elif ":" in ln:
        k, _, val = ln.partition(":")
    else:
        k, val = ln, ""
    return _clean(k), _clean(val)


def _parse_corn(article: Tag, v: FCVariety) -> None:
    """Populate corn ratings on *v* from the detail-page *article*.

    Three page regions are read:

    * "Hybrid Characteristics" — a ``<p>`` of ``label • value`` lines; labels
      found in ``_CORN_DISEASE_LABELS`` go to the disease bucket, the rest
      to the agronomic bucket.
    * "Management Tips" — a ``<p>`` of ``label: value`` lines.
    * every ``<table>`` — Relative Maturity / Degree Days rows plus the
      low/medium/high planting-population rows.

    Sets ``v.relative_maturity`` when a Relative Maturity row carries digits
    and appends the non-empty buckets to ``v.groups``. Returns nothing.
    """
    agronomic: list[dict] = []
    disease: list[dict] = []
    management: list[dict] = []

    def section_paragraph(title: str):
        """Return the ``<p>`` directly following the ``<h2>`` titled *title*."""
        heading = next((h for h in article.find_all("h2")
                        if _clean(h.get_text()) == title), None)
        if heading is None:
            return None
        sib = heading.find_next_sibling()
        return sib if sib is not None and sib.name == "p" else None

    # Hybrid Characteristics: a <p> of "label • value" lines.
    hc_p = section_paragraph("Hybrid Characteristics")
    if hc_p is not None:
        for ln in _br_lines(hc_p):
            k, val = _split_corn_line(ln)
            if not k:
                continue
            item = {"characteristic": k, "value": val}
            if k.lower() in _CORN_DISEASE_LABELS:
                disease.append(item)
            else:
                agronomic.append(item)

    # Management Tips: "label: value" lines (Corn-On-Corn / Productivity /
    # Silage Rating).
    mt_p = section_paragraph("Management Tips")
    if mt_p is not None:
        for ln in _br_lines(mt_p):
            if ":" not in ln:
                continue
            k, _, val = ln.partition(":")
            k, val = _clean(k), _clean(val)
            # Footer noise (address / © line) has no useful colon form.
            if (k and val and not k.startswith("©")
                    and "rights reserved" not in ln.lower()):
                management.append({"characteristic": k, "value": val})

    # Side table: Relative Maturity / Degree Days + planting populations.
    pop_rows: list[str] = []
    for tbl in article.find_all("table"):
        for tr in tbl.find_all("tr"):
            cells = [_clean(c.get_text(" ", strip=True))
                     for c in tr.find_all(["td", "th"])]
            cells = [c for c in cells if c]
            if not cells:
                continue
            first = cells[0].lower()
            joined = " ".join(cells).lower()
            if first.startswith("relative maturity") and len(cells) >= 2:
                m = re.search(r"(\d+)", cells[1])
                if m:
                    v.relative_maturity = int(m.group(1))
                # The verbatim cell is recorded even when it has no digits,
                # mirroring how the Degree Days row is kept as-is.
                agronomic.insert(0, {"characteristic": "Relative Maturity",
                                     "value": cells[1]})
            elif first.startswith("degree days") and len(cells) >= 2:
                agronomic.append({"characteristic": "Degree Days (GDU)",
                                  "value": cells[1]})
            elif joined.startswith("low") and ("medium" in joined or "high" in joined):
                pop_rows.append(" / ".join(cells))
    if pop_rows:
        management.append({"characteristic": "Recommended Planting Population",
                           "value": "; ".join(pop_rows)})

    if agronomic:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": agronomic})
    if disease:
        v.groups.append({"label": "DISEASE RATINGS", "items": disease})
    if management:
        v.groups.append({"label": "MANAGEMENT", "items": management})
def _parse_soy(article: Tag, v: FCVariety) -> None:
    """Fill *v* with soybean data read from the detail-page *article*.

    * "Field Notes" ``<ul>`` -> ``v.strengths``; the first note also becomes
      ``v.positioning`` when none is set yet.
    * "Variety Description" ``<div><b>Label:</b> value</div>`` rows -> an
      AGRONOMIC CHARACTERISTICS group; the "Maturity" row additionally sets
      ``v.maturity_group`` when its value contains a parseable number.
    """
    def heading(title: str):
        """First ``<h2>`` under *article* whose cleaned text equals *title*."""
        for h2 in article.find_all("h2"):
            if _clean(h2.get_text()) == title:
                return h2
        return None

    # Field Notes -> strengths (and positioning from the first one).
    notes_h = heading("Field Notes")
    notes_ul = notes_h.find_next_sibling() if notes_h is not None else None
    if notes_ul is not None and notes_ul.name == "ul":
        cleaned = (_clean(li.get_text(" ", strip=True))
                   for li in notes_ul.find_all("li"))
        v.strengths = [note for note in cleaned if note]
        if v.strengths and not v.positioning:
            v.positioning = v.strengths[0]

    # Variety Description -> [{characteristic, value}] rows.
    rows: list[dict] = []
    desc_h = heading("Variety Description")
    if desc_h is not None:
        # Class names marking the action buttons / right-nav / footer region.
        stop_classes = ("btn", "right-bar", "right-navigation",
                        "address", "wrapper")
        for node in desc_h.find_all_next():
            if node.name == "h2" and node is not desc_h:
                break
            if not isinstance(node, Tag) or node.name != "div":
                continue  # only <div> elements can stop the scan or hold a row
            classes = node.get("class") or []
            if any(c in classes for c in stop_classes):
                break
            bold = node.find("b", recursive=False)
            if bold is None:
                continue
            label = _clean(bold.get_text(" ", strip=True)).rstrip(":")
            if not label:
                continue
            value = _direct_text(node)
            if label.lower() != "maturity":
                rows.append({"characteristic": label, "value": value})
                continue
            found = re.search(r"[\d.]+", value)
            if found is not None:
                try:
                    v.maturity_group = float(found.group(0))
                except ValueError:
                    pass  # e.g. a lone "." — keep the verbatim row only
            rows.append({"characteristic": "Maturity Group", "value": value})

    if rows:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": rows})
def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
    """Fetch the detail page for *rec* and return the populated variety.

    *rec* must carry ``crop``, ``slug`` and ``url``; ``lastmod`` is optional.
    Corn and soybean pages get their spec sections parsed; any other crop
    (wheat) yields an identity-only record. Raises whatever
    ``raise_for_status()`` raises for a failed fetch.
    """
    crop, slug, url = rec["crop"], rec["slug"], rec["url"]
    variety = FCVariety(
        source_key=f"firstchoice-{slug}",
        source_url=url,
        crop=crop,
        trait_stack=_trait_stack(slug, crop),
        sitemap_last_modified=rec.get("lastmod"),
    )

    response = http.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Fall back to the whole document when the content <article> is absent.
    article = soup.find("article", class_="content") or soup
    variety.product_name = _product_name(article, slug)

    # wheat: thin pages — identity only (no spec sections to parse).
    section_parser = {"corn": _parse_corn, "soybeans": _parse_soy}.get(crop)
    if section_parser is not None:
        section_parser(article, variety)
    return variety
# --------------------------------------------------------------------- render
def render_markdown(v: FCVariety) -> str:
    """Render *v* as the markdown body stored in the corpus.

    Layout: title, a bullet list of identity facts, the optional
    positioning line, optional field notes, then one ``##`` section per
    characteristics group. A wheat record without groups gets an
    explanatory note instead.
    """
    known_crops = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = known_crops.get(v.crop, v.crop.title())

    out: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** 1st Choice Seeds (independent, employee-owned)",
        "- **Brand:** 1st Choice Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # Crop-specific maturity / class facts.
    if v.relative_maturity is not None and v.crop == "corn":
        out.append(f"- **Relative maturity:** {v.relative_maturity} day")
    if v.maturity_group is not None and v.crop == "soybeans":
        out.append(f"- **Maturity group:** {v.maturity_group}")
    if v.wheat_class and v.crop == "wheat":
        out.append(f"- **Wheat class:** {v.wheat_class}")
    if v.trait_stack:
        out.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")

    out.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** 1st Choice Seeds dealer network — "
        "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN",
        "",
    ])

    if v.positioning:
        out.extend(["---", "", f"_{v.positioning}_", ""])
    if v.strengths:
        out.extend(["---", "", "## Field Notes", ""])
        out.extend(f"- {note}" for note in v.strengths)
        out.append("")
    out.extend(["---", ""])

    for group in v.groups:
        out.extend([f"## {group['label'].title()}", ""])
        for item in group["items"]:
            out.append(f"- **{item['characteristic']}:** {item['value'] or ''}")
        out.append("")

    if v.crop == "wheat" and not v.groups:
        out.extend([
            "_Identity record only — 1st Choice wheat is private-label "
            "and the catalog page carries no agronomic spec block._",
            "",
        ])
    return "\n".join(out)
def write_variety(v: FCVariety, body_md: str) -> None:
    """Write the markdown body and the JSON sidecar for *v*.

    Both files land in ``CORPUS_DIR`` (created on demand) and are named
    after ``v.source_key``: ``<key>.md`` holds *body_md*, ``<key>.json`` the
    structured sidecar.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"
    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "1st Choice Seeds dealer network "
                             "(Eastern Corn Belt — IN/OH/KY/TN)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    sidecar = {
        "source": "first_choice",
        "source_key": v.source_key,
        "vendor": "1st Choice Seeds",
        "brand": "1st Choice Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": v.wheat_class,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": v.strengths,
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False)
    json_path.write_text(payload + "\n", encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover, fetch, and write the 1st Choice catalog.

    *limit* caps how many discovered records are visited (skipped ones
    count), *force* re-fetches varieties whose markdown already exists,
    *only_crop* / *only_product* narrow the record set. Returns 2 when
    ``only_product`` matches nothing, otherwise 0.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    records = discover(http, only_crop=only_crop)

    if only_product:
        wanted = only_product.lower()
        records = [r for r in records
                   if wanted in (f"firstchoice-{r['slug']}", r["slug"])]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    total = len(records)
    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    # A limit of zero or below visits nothing at all.
    batch = records if limit is None else records[:max(limit, 0)]
    processed = 0
    for processed, rec in enumerate(batch, start=1):
        source_key = f"firstchoice-{rec['slug']}"
        already_there = (CORPUS_DIR / f"{source_key}.md").exists()
        if already_there and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue

        try:
            v = parse_detail(http, rec)
        except requests.HTTPError as exc:
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue

        if not v.groups:
            counts["empty"] += 1
            suffix = "; thin wheat page" if v.crop == "wheat" else ""
            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
                        processed, total, source_key, suffix)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 v.relative_maturity or v.maturity_group or "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], total)
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the 1st Choice scraper."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.first_choice",
        description=(
            "Scrape 1st Choice Seeds (independent, employee-owned — "
            "Rushville, IN) — corn / soybeans / wheat via sitemaps "
            "+ detail pages."
        ),
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_SITEMAPS),
        help="Limit to one crop (corn / soybeans / wheat).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.")
    # The LOG_LEVEL environment variable supplies the default verbosity.
    parser.add_argument(
        "--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, set up stderr logging, run the scrape.

    Returns the exit status produced by ``run``.
    """
    opts = _build_argparser().parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )
    return run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
    )
# Script entry: exit the interpreter with main()'s return value as status.
if __name__ == "__main__":
    raise SystemExit(main())
+594
View File
@@ -0,0 +1,594 @@
"""Latham Hi-Tech Seeds scraper — independent family-owned brand (Alexander, IA).
Source: ``www.lathamseeds.com`` — WordPress site exposing a public,
no-auth REST API. robots.txt is permissive (only ``/wp-admin/``
disallowed; the catalog + ``/wp-json/`` are open, no Crawl-delay).
Independent Upper-Midwest seed company (the self-styled "Latham
Country" — IA / MN / WI / IL / ND / SD / NE); corn + soybeans only
(an Alfalfa crop term exists in the taxonomy but has zero published
varieties — no wheat).
Two-step ingestion (mirrors the ProHarvest scraper):
1. **Enumerate** via the WP REST API. ``/wp/v2/varieties`` is the
variety custom-post-type (~265 records, ``X-WP-Total: 265``).
``/wp/v2/variety_crop`` is the crop taxonomy (Corn=2013,
Soybean=2029, Alfalfa=2159/empty); ``/wp/v2/variety_trait`` is the
trait taxonomy (Enlist E3, VT2 PRO RIB, Smart Stax, XtendFlex, …).
The REST payload gives the canonical id / slug / title / permalink
and taxonomy term IDs, plus a human-readable ``class_list`` (e.g.
``variety_crop-soybean``, ``variety_trait-enlist-e3``). ``acf`` is
``[]`` and ``content.rendered`` is EMPTY in REST, so the ratings
have to come from the detail page.
2. **Parse the detail page.** Each ``/products/<slug>/`` page
server-renders the agronomic data as ``<h3>`` spec sections, each a
run of ``<li><span>label</span><span>value</span></li>`` rows up to
the next section header:
- Corn: "Agronomic Characteristics" (Early Vigor / Stalk Strength
/ Root Strength / Stay Green / Drydown / Test Weight / Drought
Tolerance / Foliar Fungicide / Corn-on-Corn), "Plant
Characteristics" (Ear Height / Ear Type), "Disease Ratings"
(Goss's Wilt / Northern Leaf Blight / Anthracnose Stalk Rot /
Gray Leaf Spot / tar spot etc).
- Soybean: "Plant Characteristics" (Relative Maturity / Emergence
/ Plant Height / Plant Type / Flower Color / Pubescence / Pod
Color / Hilum Color), "Defensive Characteristics & Disease
Ratings" (SCN Resistance source / Iron Chlorosis / Stress
Tolerance / Phytophthora Rps gene / Brown Stem Rot / White Mold
/ Sudden Death). "Herbicide Tolerance" + "Placement" sections
are present but carry no ``<li>`` rows.
The relative maturity also sits in a "Key Features" ``Maturity``
row ("113.00 RM" / "3.60 RM"); we read RM/MG from the per-crop
spec section first and fall back to that.
Rating scale: **numeric, LOWER = BETTER** (1 = best / most
tolerant / most resistant). No explicit on-page legend, so the
direction was confirmed by cross-referencing the Product Overview
prose against the published values across ~12 corn varieties:
hybrids described "very good / superior / excellent stalks and roots"
carry Stalk/Root Strength 1.0-1.5, weaker traits run 3.0-3.5, and no
value approaches 9 (observed range ~1.0-3.5). The soybean disease
panel (Iron Chlorosis / Brown Stem Rot / White Mold / Sudden Death /
Stress Tolerance) reads the same direction (lower = more resistant).
A handful of values are categorical rather than numeric and pass
through verbatim: SCN Resistance source ("PI 88788"), Phytophthora
"Rps 1k", Anthracnose "ASR", plant descriptors ("Medium Tall",
"Flex"). ``NA`` / blank = not rated.
Unlike the Ebbert's scraper (which left ``characteristics_groups``
empty and relied on a verbatim body), we parse the spec sections into
structured ``characteristics_groups`` so the numeric + categorical
ratings land in the embedded chunk and are actually retrievable. The
soybean "Defensive Characteristics & Disease Ratings" section maps to
the DISEASE RATINGS bucket; corn "Agronomic Characteristics" +
"Plant Characteristics" map to AGRONOMIC CHARACTERISTICS.
Output:
corpus/latham/<source_key>.md
corpus/latham/<source_key>.json
source_key: ``latham-<slug>`` lowercased, e.g. ``latham-l-3632-e3``.
CLI:
python -m scrape.sources.latham --crop corn --limit 5
python -m scrape.sources.latham --force
python -m scrape.sources.latham --product latham-l-3632-e3
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
# Version stamp written into every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site origin; also used to build a fallback detail URL from a slug.
BASE = "https://www.lathamseeds.com"
# Root of the site's WordPress REST API (wp/v2 namespace).
WP = f"{BASE}/wp-json/wp/v2"
# variety_crop taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. "alfalfa"
# is in the taxonomy but has zero published varieties; everything not
# listed here is out of scope for the row-crop advisor. (No wheat.)
CROP_TYPES = {
"corn": "corn",
"soybean": "soybeans",
}
# robots.txt declares no Crawl-delay and only blocks /wp-admin/; we
# stay polite. ~265 detail pages at 1.5 s/req finishes in ~7 min.
# This is the minimum spacing, in seconds, between consecutive requests.
REQ_INTERVAL_SEC = 1.5
# Human-readable statement of the rating direction; emitted verbatim in
# the markdown header and in the sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = (
"numeric ratings ~1-9 where LOWER = BETTER (1 = best / most "
"tolerant / most resistant); confirmed by cross-referencing "
"Product Overview prose vs values (top-rated stalks/roots cluster "
"1.0-1.5, weak traits 3.0-3.5, none approach 9). Categorical "
"values pass through verbatim (SCN source 'PI 88788', "
"Phytophthora 'Rps 1k', Anthracnose 'ASR', 'Medium Tall', 'Flex'). "
"NA/blank = not rated."
)
# Detail-page spec section headers (<h3>) -> characteristics_groups
# label. DISEASE RATINGS -> disease framing, AGRONOMIC CHARACTERISTICS
# -> agronomic framing in the chunker; anything else passes through as
# its own titled section. Both corn and soy headers are covered. The
# soybean "Defensive Characteristics & Disease Ratings" panel mixes
# disease 1-9 ratings with categorical resistance source/genes — we
# bucket the whole panel as DISEASE so it embeds under disease framing.
# Keys are matched against the lower-cased, whitespace-collapsed header
# text; headers not listed here are ignored by _parse_groups.
SPEC_SECTIONS = {
"agronomic characteristics": "AGRONOMIC CHARACTERISTICS",
"plant characteristics": "AGRONOMIC CHARACTERISTICS",
"disease ratings": "DISEASE RATINGS",
"defensive characteristics & disease ratings": "DISEASE RATINGS",
"defensive characteristics and disease ratings": "DISEASE RATINGS",
}
# Two directory levels above the folder containing this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# The CORPUS_ROOT environment variable, when set and non-empty, overrides
# the default <repo>/corpus output root.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# All Latham markdown + sidecar files are written here.
CORPUS_DIR = CORPUS_ROOT / "latham"
log = logging.getLogger("scrape.latham")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session with request spacing and retry/backoff.

    Latham's catalog is ~265 detail pages so 1.5 s/req finishes the full
    scrape in ~7 min. Every request waits until at least ``interval``
    seconds have passed since the previous one.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the scraper's User-Agent."""
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # monotonic timestamp of the most recent request

    def _wait(self) -> None:
        """Sleep just long enough to honor ``interval``, then stamp the time."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Issue one request, retrying network errors and 429/5xx replies.

        At least one attempt is always made, and at most ``max_retries``.
        Between attempts the call backs off exponentially with jitter
        (capped at 30 s), or for the number of seconds given by an integer
        ``Retry-After`` header. No backoff is slept after the final attempt.

        Returns the first response that is neither 429 nor 5xx; when every
        attempt is used up, returns the last response received. Raises the
        last ``requests.RequestException`` only when the final attempt
        itself failed at the network level.
        """
        attempts = max(1, max_retries)  # guard: zero/negative still tries once
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                resp, last_exc = None, exc
                if is_last:
                    break  # nothing left to retry; skip the pointless sleep
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A reply arrived, so any earlier network error is now stale.
            last_exc = None
            if resp.status_code != 429 and not 500 <= resp.status_code < 600:
                return resp
            if is_last:
                break
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(
                30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)
        if resp is None and last_exc is not None:
            raise last_exc
        # Retries exhausted on 429/5xx: hand back the last reply so the
        # caller's raise_for_status() can report it.
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET *url*, raise on an HTTP error status, and decode the JSON body."""
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()
# --------------------------------------------------------------------- model
@dataclass
class LathamVariety:
    """One Latham catalog variety assembled from REST data and its detail page.

    Attributes:
        source_key: Corpus key, ``latham-<slug>``.
        source_url: Detail-page URL the record was parsed from.
        crop: Chunker crop value (``corn`` / ``soybeans``).
        product_name: Display name, e.g. ``"L 3632 E3"``.
        relative_maturity: Corn relative maturity in days.
        maturity_group: Soybean maturity group.
        release_year: Name of the first ``variety_year`` term, if any.
        trait_stack: Trait names resolved from the trait taxonomy.
        positioning: Marketing blurb, if one was found.
        groups: ``[{label, items: [{characteristic, value}]}]`` — the
            chunker's source of truth.
    """

    source_key: str
    source_url: str
    crop: str
    product_name: str = ""
    relative_maturity: int | None = None
    maturity_group: float | None = None
    release_year: str | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    groups: list[dict] = field(default_factory=list)
# --------------------------------------------------------------------- discovery (REST)
def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Return ``term_id -> name`` for the WP taxonomy *taxonomy*.

    Pages through the REST collection 100 terms at a time. A term's name
    falls back to its slug, then to its id as a string. Paging ends on an
    HTTP 400 (requested past the last page), an empty page, or a short page.
    """
    names: dict[int, str] = {}
    page = 0
    while True:
        page += 1
        response = http.get(
            f"{WP}/{taxonomy}?per_page=100&page={page}&_fields=id,name,slug")
        if response.status_code == 400:  # past last page
            break
        response.raise_for_status()
        terms = response.json()
        if not terms:
            break
        names.update(
            {term["id"]: term.get("name") or term.get("slug") or str(term["id"])
             for term in terms})
        if len(terms) < 100:
            break
    return names
def _crop_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``slug -> term id`` for the ``variety_crop`` taxonomy.

    Reads a single page of up to 100 terms.
    """
    terms = http.get_json(f"{WP}/variety_crop?per_page=100&_fields=id,slug")
    return {term["slug"]: term["id"] for term in terms}
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return the REST variety records for the in-scope row crops.

    Walks each crop listed in ``CROP_TYPES`` (restricted to *only_crop* when
    given), pages through its varieties 100 at a time, and tags every record
    with its chunker crop value under ``"_crop"``. A variety id seen earlier
    is not added again.
    """
    crop_ids = _crop_slug_to_id(http)
    found: list[dict] = []
    known_ids: set[int] = set()
    wanted = [(slug, crop) for slug, crop in CROP_TYPES.items()
              if not only_crop or crop == only_crop]

    for crop_slug, crop in wanted:
        term_id = crop_ids.get(crop_slug)
        if term_id is None:
            log.warning("variety_crop %r not found in taxonomy — skipping", crop_slug)
            continue

        page = 0
        while True:
            page += 1
            response = http.get(
                f"{WP}/varieties?variety_crop={term_id}&per_page=100&page={page}"
                "&_fields=id,slug,title,link,variety_trait,variety_year")
            if response.status_code == 400:
                break
            response.raise_for_status()
            batch = response.json()
            if not batch:
                break
            for variety in batch:
                if variety["id"] not in known_ids:
                    known_ids.add(variety["id"])
                    variety["_crop"] = crop
                    found.append(variety)
            if len(batch) < 100:
                break

        log.info("variety_crop %-8s (%s): cumulative %d", crop_slug, crop, len(found))
    return found
# --------------------------------------------------------------------- detail parse
# First run of digits with an optional decimal part, captured as group 1;
# used to pull the number out of maturity values such as "113.00 RM".
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
def _clean(s: str) -> str:
return re.sub(r"\s+", " ", s or "").strip()
def _two_span(li: Tag) -> tuple[str, str] | None:
    """Return ``(label, value)`` for a spec-row ``<li>``, else ``None``.

    A spec row has exactly two ``<span>`` descendants and both carry
    non-empty cleaned text.
    """
    texts = [_clean(span.get_text(" ", strip=True))
             for span in li.find_all("span")]
    if len(texts) != 2:
        return None
    label, value = texts
    if not (label and value):
        return None
    return label, value
def _section_rows(header: Tag) -> list[tuple[str, str]]:
    """Gather the ``(label, value)`` spec rows that belong to *header*.

    Scans forward in document order from *header* and stops at the next
    ``<h2>`` / ``<h3>``; every ``<li>`` on the way that qualifies as a
    two-span row is collected.
    """
    collected: list[tuple[str, str]] = []
    for node in header.find_all_next():
        if node.name in ("h2", "h3") and node is not header:
            break
        if not (isinstance(node, Tag) and node.name == "li"):
            continue
        pair = _two_span(node)
        if pair:
            collected.append(pair)
    return collected
def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Parse the known spec sections into ``{label, items}`` groups.

    Every ``<h2>`` / ``<h3>`` whose lower-cased text is a ``SPEC_SECTIONS``
    key is read; sections without rows are dropped. Sections mapping to the
    same label (corn maps both Agronomic + Plant Characteristics ->
    AGRONOMIC) are merged, so the chunker sees one coherent bucket per
    label. Groups appear in order of each label's first occurrence.
    """
    items_by_label: dict[str, list[dict]] = {}
    for header in soup.find_all(["h2", "h3"]):
        title = _clean(header.get_text(" ", strip=True)).lower()
        label = SPEC_SECTIONS.get(title)
        if not label:
            continue
        rows = _section_rows(header)
        if not rows:
            continue
        bucket = items_by_label.setdefault(label, [])
        bucket.extend({"characteristic": name, "value": value}
                      for name, value in rows)
    return [{"label": label, "items": items}
            for label, items in items_by_label.items()]
def _parse_maturity_from_groups(groups: list[dict], crop: str,
                                ) -> tuple[int | None, float | None]:
    """Pull RM (corn) / MG (soy) out of already-parsed *groups*.

    The first item named "Relative Maturity" or "Maturity" (ignoring case
    and surrounding whitespace) whose value contains a number decides the
    result: ``(int, None)`` for corn, ``(None, float)`` for any other crop.
    Returns ``(None, None)`` when no such item exists.
    """
    wanted = ("relative maturity", "maturity")
    candidates = (item for group in groups for item in group["items"])
    for item in candidates:
        if item["characteristic"].strip().lower() not in wanted:
            continue
        match = _MATURITY_RE.search(item["value"])
        if match is None:
            continue
        number = float(match.group(1))
        if crop == "corn":
            return int(number), None
        return None, number
    return None, None
def _parse_maturity_keyfeatures(soup: BeautifulSoup, crop: str,
                                ) -> tuple[int | None, float | None]:
    """Fallback maturity lookup over every two-span ``<li>`` on the page.

    Looks for a row labelled "Maturity" (e.g. '113.00 RM' / '3.60 RM') and
    uses the first one whose value contains a number: ``(int, None)`` for
    corn, ``(None, float)`` otherwise. ``(None, None)`` when nothing fits.
    """
    for li in soup.find_all("li"):
        pair = _two_span(li)
        if not pair:
            continue
        label, value = pair
        if label.strip().lower() != "maturity":
            continue
        match = _MATURITY_RE.search(value)
        if match is None:
            continue
        number = float(match.group(1))
        return (int(number), None) if crop == "corn" else (None, number)
    return None, None
def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Return the best-effort marketing blurb for the page, or ``None``.

    That is the first ``<p>`` of at least 40 cleaned characters found after
    a "Product Overview" / "Hybrid Advantages" heading and before the next
    ``<h2>`` / ``<h3>``.
    """
    blurb_titles = ("product overview", "hybrid advantages")
    for header in soup.find_all(["h2", "h3"]):
        title = _clean(header.get_text(" ", strip=True)).lower()
        if title not in blurb_titles:
            continue
        for node in header.find_all_next():
            if node.name in ("h2", "h3") and node is not header:
                break  # left this section without finding a paragraph
            if not (isinstance(node, Tag) and node.name == "p"):
                continue
            text = _clean(node.get_text(" ", strip=True))
            if len(text) >= 40:
                return text
    return None
def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str],
                 year_names: dict[int, str]) -> LathamVariety:
    """Fetch the detail page of REST record *rec* and build its variety.

    *trait_names* / *year_names* translate the record's taxonomy term ids
    into names; ids missing from those tables are skipped. Raises whatever
    ``raise_for_status()`` raises for a failed fetch.
    """
    crop, slug = rec["_crop"], rec["slug"]
    url = rec.get("link") or f"{BASE}/products/{slug}/"
    title = (rec.get("title") or {}).get("rendered", "")
    name = _clean(title) or slug.upper()

    response = http.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Remove non-content elements before any text is extracted.
    for junk in soup(["script", "style", "noscript"]):
        junk.decompose()

    groups = _parse_groups(soup)
    # Spec sections first; the Key Features row is only a fallback.
    rm, mg = _parse_maturity_from_groups(groups, crop)
    if rm is None and mg is None:
        rm, mg = _parse_maturity_keyfeatures(soup, crop)
    positioning = _parse_positioning(soup)

    def resolve(key: str, table: dict[int, str]) -> list[str]:
        """Names for the term ids stored under *key* that *table* knows."""
        return [table[term_id] for term_id in (rec.get(key) or [])
                if term_id in table]

    traits = resolve("variety_trait", trait_names)
    years = resolve("variety_year", year_names)

    return LathamVariety(
        source_key=f"latham-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        release_year=years[0] if years else None,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )
# --------------------------------------------------------------------- render
def render_markdown(v: LathamVariety) -> str:
    """Render *v* as the markdown body stored in the corpus.

    Layout: title, a bullet list of identity facts, the optional
    positioning line, then one ``##`` section per characteristics group.
    """
    known_crops = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = known_crops.get(v.crop, v.crop.title())

    out: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Latham Hi-Tech Seeds (independent family-owned, Alexander, IA)",
        "- **Brand:** Latham Hi-Tech Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # Crop-specific maturity facts.
    if v.relative_maturity is not None and v.crop == "corn":
        out.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.maturity_group is not None and v.crop == "soybeans":
        out.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        out.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")

    out.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Latham dealer network — Upper Midwest "
        "(IA/MN/WI/IL/ND/SD/NE)",
        "",
    ])

    if v.positioning:
        out.extend(["---", "", f"_{v.positioning}_", ""])
    out.extend(["---", ""])

    for group in v.groups:
        out.extend([f"## {group['label'].title()}", ""])
        for item in group["items"]:
            out.append(f"- **{item['characteristic']}:** {item['value'] or ''}")
        out.append("")
    return "\n".join(out)
def write_variety(v: LathamVariety, body_md: str) -> None:
    """Write the markdown body and the JSON sidecar for *v*.

    Both files land in ``CORPUS_DIR`` (created on demand) and are named
    after ``v.source_key``: ``<key>.md`` holds *body_md*, ``<key>.json`` the
    structured sidecar.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"
    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "Latham dealer network (Upper Midwest — "
                             "IA/MN/WI/IL/ND/SD/NE)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    sidecar = {
        "source": "latham",
        "source_key": v.source_key,
        "vendor": "Latham Hi-Tech Seeds",
        "brand": "Latham Hi-Tech Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False)
    json_path.write_text(payload + "\n", encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover, fetch, and write the Latham catalog.

    Args:
        limit: Visit at most this many discovered records (records skipped
            because their markdown exists still count); ``None`` = all.
        force: Re-fetch varieties whose markdown file already exists.
        only_crop: Restrict discovery to one chunker crop value.
        only_product: Restrict to one variety, matched case-insensitively
            against its source_key or slug.

    Returns:
        2 when ``only_product`` matches nothing, otherwise 0.

    A detail page that fails to fetch — HTTP error status or a network
    error that survived the session's retries — is logged, counted under
    ``failed`` and skipped, so one bad page cannot abort the whole scrape.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "variety_trait")
    year_names = _taxonomy_map(http, "variety_year")
    records = discover(http, only_crop=only_crop)
    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"latham-{r['slug'].lower()}" == key
                   or r["slug"].lower() == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"latham-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names, year_names)
        except requests.RequestException as exc:
            # RequestException is the base of HTTPError and of the
            # connection/timeout errors the session re-raises after retries.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, len(records), source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks: a maturity value of 0 is real data, not "-".
        maturity = next((m for m in (v.relative_maturity, v.maturity_group)
                         if m is not None), "-")
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop, maturity,
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], len(records))
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Latham scraper."""
    crop_choices = sorted(set(CROP_TYPES.values()))
    parser = argparse.ArgumentParser(
        prog="scrape.sources.latham",
        description=(
            "Scrape Latham Hi-Tech Seeds (independent Upper-Midwest "
            "brand) — corn / soybeans via the WP REST API + detail pages."
        ),
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=crop_choices,
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.")
    # The LOG_LEVEL environment variable supplies the default verbosity.
    parser.add_argument(
        "--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser
def main(argv: list[str] | None = None) -> int:
    """Parse *argv*, configure logging to stderr, and run the scraper.

    Returns the status code reported by ``run``.
    """
    options = _build_argparser().parse_args(argv)
    log_format = "%(asctime)s %(levelname)s %(name)s %(message)s"
    logging.basicConfig(level=options.log_level.upper(),
                        format=log_format,
                        stream=sys.stderr)
    status = run(limit=options.limit,
                 force=options.force,
                 only_crop=options.crop,
                 only_product=options.product)
    return status
# Script entry: exit the interpreter with main()'s return value as status.
if __name__ == "__main__":
    raise SystemExit(main())
+767
View File
@@ -0,0 +1,767 @@
"""Stine Seed Company scraper — independent family-owned breeder (Adel, IA).
Source: ``www.stineseed.com`` — a custom PHP site (NOT WordPress;
``/wp-json/`` 404s). robots.txt returns 404 (none published); the
``/legal/`` page carries only a standard copyright / no-reproduction
clause (no anti-automation term — same posture as the other corpus
vendors). ``sitemap.xml`` (~499 URLs) lists every live product page,
so it is our canonical enumeration source.
Stine is the largest privately-owned seed company in the US; it
breeds and sells **corn + soybeans** only (no wheat). The catalog is
~58 corn hybrids + ~159 soybean varieties.
Two-step ingestion:
1. **Enumerate** the current catalog from ``sitemap.xml``. A product
*detail* URL has the shape ``/{crop}/traits/{trait-slug}/{code}/``
(four path segments); the bare ``/{crop}/traits/{trait-slug}/``
landing pages are skipped. This yields exactly the live catalog
(58 corn + 159 soy), unlike the comparison ajax endpoint which
also returns thousands of discontinued/historical entries.
Fallback enumeration (``--enumerate ajax``) hits the comparison
ajax fragments:
- corn: POST ``/ajax/corn-comparison/filter_products.php``
- soy: POST ``/ajax/soybean-comparison/filter_products.php``
with ``sel1=&sel2=&sel3=`` (empty = all). Each ``<li>`` carries a
numeric product id + the canonical detail URL.
2. **Parse the detail page.** Each ``/{crop}/traits/{slug}/{code}/``
page server-renders all agronomic data (no JS needed) as
``<section class="agronomic-details">`` →
``<ul class="agronomy-chart"> <li> <strong>label</strong>
<span class="value">value</span> </li> …``. The variety code +
brand mark live in the ``<h1>`` (``Stine ® 9444-22 Brand``).
Rating scales differ by crop and are preserved verbatim (the chunker
never fabricates a value):
- **Corn** publishes an on-page legend:
``9: Excellent, 8: Very Good, 7: Good, 6: Average,
5: Below Average`` — a **1-9 numeric** scale, **HIGHER = BETTER /
more tolerant** (same direction as Bayer/NK, so no flip). Applies
to the agronomic performance panel (Drydown/Root/Stalk/Stress/
Cold Emergence/Test Weight) and the disease panel (Tar Spot/Gray
Leaf Spot/Eye Spot/N.C. Leaf Blight/Goss' Wilt/Common Rust/…).
Plant descriptors / soil placement / herbicide rows are
qualitative (Tall, Highly Recommended, Yes/No) and pass through.
- **Soybeans** are entirely **qualitative** (Excellent / Very Good
/ Good / … and Resistant / Strong / Good / Susceptible for
disease; "higher/'Resistant' = better"). There is no numeric
legend on soy pages. SCN (Soybean Cyst Nematode) and RPS Gene
rows carry the *source/gene* (e.g. Peking, 3a) rather than a
rating.
We parse the chart into structured ``characteristics_groups`` — a
DISEASE RATINGS group, an AGRONOMIC CHARACTERISTICS group, and a few
pass-through groups (PLANT DESCRIPTION / SOIL & PLACEMENT / HERBICIDE
TOLERANCE / SEED TREATMENT NOTES) — so every rating lands in the
embedded chunk and is actually retrievable.
Output:
corpus/stine/<source_key>.md
corpus/stine/<source_key>.json
source_key: ``stine-<productcode>`` lowercased, e.g.
``stine-9444-22`` (corn) or ``stine-22r32`` (soy).
CLI:
python -m scrape.sources.stine --crop corn --limit 2 --force
python -m scrape.sources.stine --crop soybeans --limit 2 --force
python -m scrape.sources.stine --force
python -m scrape.sources.stine --product stine-9444-22
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import warnings
import requests
from bs4 import BeautifulSoup
try: # bs4>=4.11 raises this when html.parser sees an XML doc (the sitemap)
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
except Exception: # pragma: no cover — older bs4 without the warning class
pass
# Scraper release tag; written into every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# User-Agent header set on the shared requests session.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root; the sitemap and ajax endpoints below are built from it.
BASE = "https://www.stineseed.com"
SITEMAP = f"{BASE}/sitemap.xml"
# Comparison ajax endpoints used for fallback enumeration, keyed by crop.
AJAX = {
    "corn": f"{BASE}/ajax/corn-comparison/filter_products.php",
    "soybeans": f"{BASE}/ajax/soybean-comparison/filter_products.php",
}
# Stine site path segment -> chunker crop value (chunker keys on the
# PLURAL "soybeans" for the MG branch). Stine has no wheat.
# The keys also serve as the --crop CLI choices.
CROP_PATHS = {
    "corn": "corn",
    "soybeans": "soybeans",
}
# No robots.txt (404) and no Crawl-delay; stay polite at 1.5 s/req.
# ~217 detail pages -> a full run finishes in ~6 min.
REQ_INTERVAL_SEC = 1.5
# Human-readable description of the vendor's rating scales. Emitted verbatim
# in the markdown header ("Rating scale") and in the JSON sidecar under
# "_scale_direction"; it is descriptive text only and is never parsed here.
RATING_SCALE_DIRECTION = (
    "corn agronomic+disease 1-9 numeric, 9=Excellent/best/most-tolerant, "
    "8=Very Good, 7=Good, 6=Average, 5=Below Average (higher=better, same "
    "direction as Bayer/NK; blank/'-'=not rated); soybeans qualitative "
    "(Excellent/Very Good/Good for vigor; Resistant/Strong/Good/Susceptible "
    "for disease, Resistant/Strong=best); SCN row gives source (e.g. Peking) "
    "and RPS Gene gives the gene, not a rating; plant/soil/herbicide rows "
    "qualitative (Tall, Highly Recommended/Recommended, Yes/No)"
)
# ---- Chart-label classification -------------------------------------
# The agronomy chart is a flat run of label/value <li>s mixing identity,
# performance ratings, disease ratings, plant descriptors, soil/placement,
# and herbicide rows. We bucket by label into characteristics_groups the
# chunker understands (DISEASE RATINGS -> disease framing, AGRONOMIC
# CHARACTERISTICS -> agronomic framing; the rest pass through titled).
# All sets below hold LOWERCASE labels: _bucket() lowercases and strips the
# chart label before the membership test. Labels found in none of the sets
# for the page's crop fall through to "OTHER CHARACTERISTICS".
# Identity rows already captured into RM/MG/dedicated facts — not repeated
# as a generic characteristic.
_IDENTITY_LABELS = {"maturity", "maturity end"}
# Corn 1-9 performance ratings -> AGRONOMIC CHARACTERISTICS.
_CORN_AGRONOMIC = {
    "gdd", "mn maturity", "drydown", "root", "stalk", "stress",
    "cold emergence", "test weight", "harvest population",
}
# Corn disease ratings -> DISEASE RATINGS. Set kept generous because the
# disease list varies per page (some add S.C. Leaf Blight / Anthracnose).
_CORN_DISEASE = {
    "tar spot", "gray leaf spot", "eye spot", "n.c. leaf blight",
    "s.c. leaf blight", "anthracnose", "goss' wilt", "goss wilt",
    "common rust", "northern corn leaf blight", "southern corn leaf blight",
    "diplodia", "fusarium", "head smut",
}
# Corn plant descriptors -> PLANT DESCRIPTION.
_CORN_PLANT = {"plant height", "ear placement", "ear flex", "cob color"}
# Corn soil/placement -> SOIL & PLACEMENT.
_CORN_SOIL = {
    "corn-on-corn", "sand", "loam", "clay", "wide rows", "narrow rows",
    'population % in 30" or wider rows', "population % in narrow rows",
    "population", "drought tolerance",
}
# Corn herbicide -> HERBICIDE TOLERANCE.
_CORN_HERBICIDE = {"glyphosate tolerant", "glufosinate tolerant"}
# Soy vigor/standability -> AGRONOMIC CHARACTERISTICS.
_SOY_AGRONOMIC = {"emergence", "standability", "shattering", "lodging"}
# Soy disease + nematode + gene rows -> DISEASE RATINGS (SCN/RPS carry a
# source/gene rather than a rating; that's still the disease panel).
_SOY_DISEASE = {
    "phytophthora root rot", "rps gene", "iron deficiency chlorosis",
    "brown stem rot", "sudden death syndrome", "soybean cyst nematode",
    "frogeye leafspot", "frogeye leaf spot", "sclerotinia white mold",
    "white mold", "stem canker", "root knot nematode", "soybean rust",
}
# Soy plant descriptors / quality -> PLANT DESCRIPTION.
_SOY_PLANT = {
    "height", "flower", "pubescence", "hilum", "chloride", "pod color",
    "canopy", "protein", "oil",
}
# Soy herbicide/trait management -> HERBICIDE TOLERANCE.
_SOY_HERBICIDE = {"sulfonylurea tolerance", "sts", "glyphosate tolerant"}
# Third ancestor directory of this file. NOTE(review): presumably the repo
# root, given the module is run as scrape.sources.stine — confirm layout.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus root: the CORPUS_ROOT environment variable when set and non-empty,
# otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory receiving this scraper's <source_key>.md / <source_key>.json.
CORPUS_DIR = CORPUS_ROOT / "stine"
# Module logger; the handler is configured in main() via logging.basicConfig.
log = logging.getLogger("scrape.stine")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session: minimum spacing between calls plus retries.

    Stine's live catalog is ~217 detail pages, so 1.5 s/req still finishes
    in a few minutes.

    Every request first waits until at least ``interval`` seconds have
    elapsed since the previous one. Network errors, HTTP 429 and HTTP 5xx
    responses are retried with exponential backoff (capped at 30 s, plus
    jitter); a numeric ``Retry-After`` header overrides the computed backoff.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading taken when the last request was released.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` s between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff for a 0-based *attempt*: 2**attempt + jitter, max 30 s."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one request, retrying transient failures.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1 so that one attempt is always made.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``requests.Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx. If every attempt
            is exhausted and the final one produced a 429/5xx response, that
            response is returned so the caller's ``raise_for_status()`` can
            report it.

        Raises:
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Track only the outcome of the most recent attempt so a
                # stale response/exception is never reported at the end.
                resp, last_exc = None, exc
                if is_last:
                    # No retry follows, so sleeping here would be pure delay.
                    log.warning("network error on %s %s: %s — giving up "
                                "after %d attempts", method, url, exc, attempts)
                    break
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # isdecimal() (not isdigit()) so float() cannot be handed
                # digit-like characters such as superscripts it rejects.
                backoff = float(ra) if (ra and ra.isdecimal()) else (
                    self._backoff(attempt))
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if resp is None and last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("POST", url, **kw)``."""
        return self.request("POST", url, **kw)
# --------------------------------------------------------------------- model
@dataclass
class StineVariety:
    """One parsed Stine product, ready to be rendered to markdown + JSON."""

    # "stine-<product code lowercased>"; also the output file stem.
    source_key: str
    # Detail page URL the record was parsed from.
    source_url: str
    crop: str                                   # chunker value: corn / soybeans
    product_name: str = ""                      # "9444-22", "22R32"
    relative_maturity: int | None = None        # corn (representative RM days)
    maturity_group: float | None = None         # soy MG
    # Display trait names; parse_detail() fills at most one entry.
    trait_stack: list[str] = field(default_factory=list)
    # Written to the sidecar as "positioning_statement"; parse_detail()
    # always leaves it None.
    positioning: str | None = None
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    # <lastmod> text from the sitemap entry; None when enumerated via ajax.
    sitemap_last_modified: str | None = None
# --------------------------------------------------------------------- discovery
# Matches a product *detail* URL on stineseed.com (http or https, optional
# "www.", optional trailing slash), case-insensitively:
#   /{corn|soybeans}/traits/{trait-slug}/{code}/
# Groups: 1 = crop path segment, 2 = trait slug, 3 = product code.
# The bare /{crop}/traits/{trait-slug}/ landing pages lack the code segment
# and therefore do not match.
_DETAIL_RE = re.compile(
    r"^https?://(?:www\.)?stineseed\.com/(corn|soybeans)/traits/"
    r"([^/]+)/([^/]+)/?$",
    re.IGNORECASE,
)
@dataclass
class DiscoveredURL:
    """One product detail page found during enumeration (sitemap or ajax)."""

    # Canonical detail URL (normalised to end with a slash).
    url: str
    # Chunker crop value looked up in CROP_PATHS: "corn" or "soybeans".
    crop: str
    # Third path segment of the detail URL (the trait family slug).
    trait_slug: str
    # Fourth path segment of the detail URL (the product code as in the URL).
    code: str
    # Sitemap <lastmod> text; stays None for ajax-enumerated entries.
    lastmod: str | None = None
def _norm_url(url: str) -> str:
"""Canonical product URL has a trailing slash."""
url = url.strip()
if not url.endswith("/"):
url += "/"
return url
def discover_sitemap(http: RateLimitedSession, *,
                     only_crop: str | None) -> list[DiscoveredURL]:
    """Enumerate live product detail pages from ``sitemap.xml``.

    Only URLs of the four-segment shape ``/{crop}/traits/{slug}/{code}/``
    are kept; the ``/{crop}/traits/{slug}/`` landing pages are ignored.
    When *only_crop* is given, other crops are filtered out. The result is
    de-duplicated on canonical URL and sorted by ``(crop, code)``.
    """
    resp = http.get(SITEMAP)
    resp.raise_for_status()
    # html.parser is used because an lxml/xml backend is not a guaranteed
    # dependency. It lowercases tag names, which is harmless here since
    # <url>/<loc>/<lastmod> are lowercase in the sitemap anyway.
    tree = BeautifulSoup(resp.text, "html.parser")
    found: list[DiscoveredURL] = []
    visited: set[str] = set()
    for entry in tree.find_all("url"):
        loc_tag = entry.find("loc")
        if not loc_tag:
            continue
        address = loc_tag.get_text(strip=True)
        match = _DETAIL_RE.match(address)
        if not match:
            continue
        crop = CROP_PATHS.get(match.group(1).lower())
        if not crop or (only_crop and crop != only_crop):
            continue
        canonical = _norm_url(address)
        if canonical in visited:
            continue
        visited.add(canonical)
        lastmod_tag = entry.find("lastmod")
        modified = lastmod_tag.get_text(strip=True) if lastmod_tag else None
        found.append(DiscoveredURL(canonical, crop, match.group(2),
                                   match.group(3), modified))
    ordered = sorted(found, key=lambda item: (item.crop, item.code))
    crop_note = f" (crop={only_crop})" if only_crop else ""
    log.info("sitemap: discovered %d product detail pages%s",
             len(ordered), crop_note)
    return ordered
def discover_ajax(http: RateLimitedSession, *,
                  only_crop: str | None) -> list[DiscoveredURL]:
    """Enumerate product detail pages from the comparison ajax fragments.

    This is the fallback to the sitemap. The endpoints answer with the FULL
    historical product set (thousands of discontinued entries, including
    code duplicates that point at the same slug), so results are
    de-duplicated on canonical URL. The sitemap stays the preferred source
    because it lists only the current live catalog.
    """
    found: list[DiscoveredURL] = []
    visited: set[str] = set()
    # Empty selectors ask the endpoint for every product.
    form = {"sel1": "", "sel2": "", "sel3": ""}
    for crop, endpoint in AJAX.items():
        if only_crop and crop != only_crop:
            continue
        resp = http.post(endpoint, data=form)
        resp.raise_for_status()
        fragment = BeautifulSoup(resp.text, "html.parser")
        for anchor in fragment.select("ul.comparison-list a[href]"):
            href = anchor.get("href") or ""
            # Site-relative links are resolved against the site root.
            absolute = href if href.startswith("http") else BASE + href
            match = _DETAIL_RE.match(absolute)
            if not match:
                continue
            link_crop = CROP_PATHS.get(match.group(1).lower())
            if not link_crop:
                continue
            if only_crop and link_crop != only_crop:
                continue
            canonical = _norm_url(absolute)
            if canonical in visited:
                continue
            visited.add(canonical)
            found.append(DiscoveredURL(canonical, link_crop,
                                       match.group(2), match.group(3)))
    ordered = sorted(found, key=lambda item: (item.crop, item.code))
    crop_note = f" (crop={only_crop})" if only_crop else ""
    log.info("ajax: discovered %d product detail pages%s",
             len(ordered), crop_note)
    return ordered
# --------------------------------------------------------------------- parse
def _clean(s: str) -> str:
return re.sub(r"\s+", " ", s or "").strip()
def _slug_to_trait(slug: str) -> str:
"""Humanize a trait-slug into a display trait name.
``duracade-refuge-renew`` -> ``DuraCade Refuge Renew``;
``enlist-e3-soybeans`` -> ``Enlist E3``; ``stine-gt-`` ->
``Stine GT``; ``vt-double-pro-technology`` -> ``VT Double Pro``;
``conventional-corn`` -> ``Conventional``.
"""
words = [w for w in re.split(r"[-_]+", slug) if w]
drop_tail = {"soybeans", "soybean", "corn", "technology"}
while words and words[-1].lower() in drop_tail:
words.pop()
if not words:
return slug
# Known acronyms / brand casings.
acronyms = {"gt": "GT", "vt": "VT", "e3": "E3", "rnai": "RNAi",
"sts": "STS", "ll": "LL", "rr2": "RR2", "3010": "3010",
"3110": "3110", "3110a": "3110A"}
out: list[str] = []
for w in words:
lw = w.lower()
if lw in acronyms:
out.append(acronyms[lw])
elif lw == "duracade":
out.append("DuraCade")
elif lw == "viptera":
out.append("Viptera")
elif lw == "smartstax":
out.append("SmartStax")
elif lw == "xtendflex":
out.append("XtendFlex")
elif lw == "trecepta":
out.append("Trecepta")
elif lw == "agrisure":
out.append("Agrisure")
elif lw == "gt27":
out.append("GT27")
else:
out.append(w.capitalize())
return " ".join(out)
def _extract_code(h1_text: str, fallback: str) -> str:
    """Extract the product code from an H1 like ``Stine ® 9444-22 Brand``.

    Trademark symbols and the words "Stine", "Brand" and an upper-case
    "NEW" are blanked out; the first remaining token is the code, provided
    it contains a digit. Otherwise the URL code segment *fallback* is
    returned upper-cased.
    """
    # (pattern, flags) applied in order; each hit is replaced by a space.
    noise = (
        (r"®|™", 0),
        (r"\bStine\b", re.I),
        (r"\bBrand\b", re.I),
        (r"\bNEW\b", 0),
    )
    text = h1_text
    for pattern, flags in noise:
        text = re.sub(pattern, " ", text, flags=flags)
    text = _clean(text)
    # After _clean() tokens are separated by single spaces, so everything
    # before the first space is the leading token ("" for empty text).
    first = text.partition(" ")[0]
    if any(ch.isdigit() for ch in first):
        return first
    return fallback.upper()
def _parse_corn_maturity(value: str) -> int | None:
"""Corn 'Maturity' is an RM range like '98 - 100' or a single '99'.
Store the representative integer (mean of the range, rounded)."""
nums = [int(n) for n in re.findall(r"\d+", value or "")]
if not nums:
return None
if len(nums) == 1:
return nums[0]
return round(sum(nums[:2]) / 2)
def _parse_soy_mg(value: str) -> float | None:
"""Soy 'Maturity' is the RM expressed as a 2- or 3-digit code where
MG = value/10 for 2-digit codes ('21' -> 2.1, '50' -> 5.0) and
value/100 for 3-digit leading-zero codes ('008' -> 0.08). For a
range ('008 - 009') take the start value."""
m = re.match(r"\s*(\d+)", value or "")
if not m:
return None
raw = m.group(1)
n = int(raw)
if len(raw) >= 3:
return round(n / 100.0, 2)
return round(n / 10.0, 2)
def _bucket(crop: str, label: str) -> str:
    """Return the characteristics_groups title a chart label belongs to.

    Identity labels (maturity rows) return ``""`` because they are stored
    as dedicated facts. Any crop other than ``"corn"`` is treated as
    soybeans. Labels not listed for the crop land in
    ``"OTHER CHARACTERISTICS"``.
    """
    key = label.lower().strip()
    if key in _IDENTITY_LABELS:
        return ""  # handled as a dedicated fact, not a generic item
    if crop == "corn":
        rules = (
            (_CORN_DISEASE, "DISEASE RATINGS"),
            (_CORN_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_CORN_PLANT, "PLANT DESCRIPTION"),
            (_CORN_SOIL, "SOIL & PLACEMENT"),
            (_CORN_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    else:  # soybeans
        rules = (
            (_SOY_DISEASE, "DISEASE RATINGS"),
            (_SOY_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_SOY_PLANT, "PLANT DESCRIPTION"),
            (_SOY_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    # Rules are checked in order; the first set containing the label wins.
    for known_labels, title in rules:
        if key in known_labels:
            return title
    return "OTHER CHARACTERISTICS"
def _parse_chart(crop: str, chart) -> tuple[list[dict], list[tuple[str, str]]]:
    """Turn the ``ul.agronomy-chart`` element into grouped characteristics.

    Returns ``(groups, raw_pairs)``:

    * ``groups`` — non-empty characteristics_groups in a fixed title order,
      each ``{"label": ..., "items": [{"characteristic", "value"}, ...]}``
      with items in page order;
    * ``raw_pairs`` — every ``(label, value)`` pair found, identity rows
      included (the caller reads RM/MG from it).
    """
    # Fixed rendering order of the group titles.
    titles = ("AGRONOMIC CHARACTERISTICS", "DISEASE RATINGS",
              "PLANT DESCRIPTION", "SOIL & PLACEMENT",
              "HERBICIDE TOLERANCE", "OTHER CHARACTERISTICS")
    by_title: dict[str, list[dict]] = {title: [] for title in titles}
    pairs: list[tuple[str, str]] = []
    emitted: set[tuple[str, str]] = set()
    # Only direct <li> children are chart rows.
    for row in chart.find_all("li", recursive=False):
        label_el = row.find("strong")
        if not label_el:
            continue
        label = _clean(label_el.get_text(" ", strip=True))
        if not label:
            continue
        value_el = row.find("span", class_="value")
        value = _clean(value_el.get_text(" ", strip=True)) if value_el else ""
        pairs.append((label, value))
        title = _bucket(crop, label)
        if not title:
            continue
        # The soy page lists "Maturity" twice; those rows are dropped above
        # as identity labels. Any other exact repeat (same label and value,
        # ignoring case) is emitted only once.
        fingerprint = (label.lower(), value.lower())
        if fingerprint in emitted:
            continue
        emitted.add(fingerprint)
        by_title[title].append({"characteristic": label, "value": value})
    groups = [{"label": title, "items": by_title[title]}
              for title in titles if by_title[title]]
    return groups, pairs
def _prepend_maturity_item(groups: list[dict], characteristic: str,
                           text: str) -> None:
    """Put a maturity item first in the AGRONOMIC CHARACTERISTICS group.

    If *groups* has no such group yet, a new one holding only this item is
    inserted at the front of *groups*. Mutates *groups* in place.
    """
    item = {"characteristic": characteristic, "value": text}
    for group in groups:
        if group["label"] == "AGRONOMIC CHARACTERISTICS":
            group["items"].insert(0, item)
            return
    groups.insert(0, {"label": "AGRONOMIC CHARACTERISTICS", "items": [item]})


def parse_detail(http: RateLimitedSession, d: DiscoveredURL) -> StineVariety:
    """Fetch one product detail page and parse it into a ``StineVariety``.

    The product code comes from the page ``<h1>`` (falling back to the URL
    code), the characteristics from ``section.agronomic-details >
    ul.agronomy-chart``, and the trait name from the URL trait slug.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on an error
    status.
    """
    page = http.get(d.url)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    heading = soup.find("h1")
    heading_text = _clean(heading.get_text(" ", strip=True)) if heading else ""
    code = _extract_code(heading_text, d.code)

    section = soup.find("section", class_="agronomic-details")
    chart = section.find("ul", class_="agronomy-chart") if section else None
    if chart:
        groups, raw_pairs = _parse_chart(d.crop, chart)
    else:
        groups, raw_pairs = [], []

    # The first "Maturity" row carries the RM (corn) / MG code (soy).
    mat_text = next(
        (value for label, value in raw_pairs if label.lower() == "maturity"),
        "",
    )
    rm: int | None = None
    mg: float | None = None
    if d.crop == "corn":
        rm = _parse_corn_maturity(mat_text)
        maturity_label = "Maturity (RM range)"
    else:
        mg = _parse_soy_mg(mat_text)
        maturity_label = "Maturity (RM)"
    if mat_text:
        # Keep the verbatim maturity text retrievable alongside the parsed
        # number.
        _prepend_maturity_item(groups, maturity_label, mat_text)

    trait = _slug_to_trait(d.trait_slug)
    if trait.lower() == "conventional":
        trait_stack = ["Conventional"]
    elif trait:
        trait_stack = [trait]
    else:
        trait_stack = []

    return StineVariety(
        source_key=f"stine-{code.lower()}",
        source_url=d.url,
        crop=d.crop,
        product_name=code,
        relative_maturity=rm,
        maturity_group=mg,
        trait_stack=trait_stack,
        positioning=None,
        groups=groups,
        sitemap_last_modified=d.lastmod,
    )
# --------------------------------------------------------------------- render
def render_markdown(v: StineVariety) -> str:
    """Render a variety as the markdown body written to ``<source_key>.md``.

    The document is a title, a bullet list of identity facts (vendor,
    brand, crop, maturity, traits, source, rating scale, service area), a
    horizontal rule, then one ``##`` section per characteristics group
    with one bullet per item.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = crop_names.get(v.crop, v.crop.title())
    lines: list[str] = [
        f"# Stine {v.product_name}",
        "",
        "- **Vendor:** Stine Seed Company (independent family-owned breeder, Adel, IA)",
        "- **Brand:** Stine",
        f"- **Crop:** {crop_label}",
    ]
    # Only the maturity fact matching the crop is emitted, and only if known.
    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(
            f"- **Relative maturity:** {v.relative_maturity} days (representative)")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Stine dealer network — Corn Belt (IA/IL/IN/MN/NE/MO etc.)",
        "",
        "---",
        "",
    ])
    for group in v.groups:
        lines.append(f"## {group['label'].title()}")
        lines.append("")
        for entry in group["items"]:
            name = entry["characteristic"]
            shown = entry["value"] or ""
            lines.append(f"- **{name}:** {shown}")
        lines.append("")
    return "\n".join(lines)
def write_variety(v: StineVariety, body_md: str) -> None:
    """Write the markdown body and its JSON sidecar into ``CORPUS_DIR``.

    Produces ``<source_key>.md`` (``body_md`` verbatim) and
    ``<source_key>.json`` (structured metadata, UTF-8, 2-space indent,
    trailing newline). The directory is created if missing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"

    md_path.write_text(body_md, encoding="utf-8")

    # Stine has a single dealer-network "region"; the per-agronomist fields
    # are left empty.
    service_area = {
        "product_list_name": "Stine dealer network (Corn Belt — IA/IL/IN/MN/NE/MO etc.)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    sidecar = {
        "source": "stine",
        "source_key": v.source_key,
        "vendor": "Stine Seed Company",
        "brand": "Stine",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [service_area],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path.write_text(payload, encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None, enumerate_via: str) -> int:
    """Enumerate the Stine catalog, then fetch, parse and write each variety.

    Args:
        limit: Stop after this many discovered entries have been considered
            (skipped and failed entries count); ``None`` means no limit.
        force: Re-fetch even when the variety's markdown file exists.
        only_crop: Restrict enumeration to ``"corn"`` or ``"soybeans"``.
        only_product: Keep only the entry whose URL code (optionally
            prefixed with ``stine-``) matches, case-insensitively.
        enumerate_via: ``"ajax"`` to use the comparison endpoints; any other
            value uses the sitemap, falling back to ajax if it is empty.

    Returns:
        ``2`` when ``only_product`` matched nothing, otherwise ``0``.
        Per-variety failures are logged and counted but do not change the
        return value.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    if enumerate_via == "ajax":
        discovered = discover_ajax(http, only_crop=only_crop)
    else:
        discovered = discover_sitemap(http, only_crop=only_crop)
        if not discovered:
            log.warning("sitemap yielded nothing — falling back to ajax")
            discovered = discover_ajax(http, only_crop=only_crop)
    if only_product:
        key = only_product.lower()
        discovered = [d for d in discovered
                      if f"stine-{d.code.lower()}" == key
                      or d.code.lower() == key]
        if not discovered:
            log.error("no variety matched --product=%s", only_product)
            return 2
    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    total = len(discovered)
    for d in discovered:
        if limit is not None and processed >= limit:
            break
        processed += 1
        # The skip check keys on the URL code, the only code known before
        # the page is fetched.
        source_key = f"stine-{d.code.lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, d)
        except requests.RequestException as exc:
            # RequestException (the base class of HTTPError) also covers
            # connection errors and timeouts re-raised once retries are
            # exhausted; those are fetch failures, not parse failures.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        except Exception as exc:  # noqa: BLE001 — keep the run going
            counts["failed"] += 1
            # An unexpected exception here points at a parser bug, so keep
            # the traceback in the log.
            log.error("[%d/%d] %s parse failed: %s",
                      processed, total, source_key, exc, exc_info=True)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no chart groups parsed (still writing identity)",
                        processed, total, source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 v.relative_maturity if v.crop == "corn" else v.maturity_group,
                 len(v.groups), ",".join(v.trait_stack) or "-")
    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"],
             counts["empty"], counts["failed"], total)
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Stine scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product``,
    ``--enumerate`` (stored as ``enumerate_via``) and ``--log-level``
    (default taken from the ``LOG_LEVEL`` environment variable, else
    ``INFO``).
    """
    summary = ("Scrape Stine Seed Company (independent Corn Belt breeder) — "
               "corn + soybeans via sitemap enumeration + detail pages.")
    default_level = os.environ.get("LOG_LEVEL", "INFO")

    parser = argparse.ArgumentParser(prog="scrape.sources.stine",
                                     description=summary)
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_PATHS),
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or product code.")
    parser.add_argument(
        "--enumerate", dest="enumerate_via", default="sitemap",
        choices=["sitemap", "ajax"],
        help="Enumeration source (default: sitemap; ajax = full historical set).")
    parser.add_argument("--log-level", default=default_level)
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, configure logging, run the scrape.

    ``argv`` is handed to ``parse_args`` unchanged (``None`` makes argparse
    read ``sys.argv``). Returns whatever ``run`` returns, for use as the
    process exit status.
    """
    opts = _build_argparser().parse_args(argv)
    # Log to stderr at the level requested on the command line.
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=opts.log_level.upper(),
    )
    status = run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
        enumerate_via=opts.enumerate_via,
    )
    return status
if __name__ == "__main__":
    # Executed as a script: exit with main()'s return value as the status.
    raise SystemExit(main())