Files
seed-mcp/scrape/sources/latham.py
T
claude 84ad2b1de6
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Add 4 independent seed brands: Latham + Stine + 1st Choice + Burrus (+623 varieties) (#17)
Co-authored-by: claude <claude@jpaul.io>
Co-committed-by: claude <claude@jpaul.io>
2026-06-04 21:58:07 -04:00

595 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Latham Hi-Tech Seeds scraper — independent family-owned brand (Alexander, IA).
Source: ``www.lathamseeds.com`` — WordPress site exposing a public,
no-auth REST API. robots.txt is permissive (only ``/wp-admin/``
disallowed; the catalog + ``/wp-json/`` are open, no Crawl-delay).
Independent Upper-Midwest seed company (the self-styled "Latham
Country" — IA / MN / WI / IL / ND / SD / NE); corn + soybeans only
(an Alfalfa crop term exists in the taxonomy but has zero published
varieties — no wheat).
Two-step ingestion (mirrors the ProHarvest scraper):
1. **Enumerate** via the WP REST API. ``/wp/v2/varieties`` is the
variety custom-post-type (~265 records, ``X-WP-Total: 265``).
``/wp/v2/variety_crop`` is the crop taxonomy (Corn=2013,
Soybean=2029, Alfalfa=2159/empty); ``/wp/v2/variety_trait`` is the
trait taxonomy (Enlist E3, VT2 PRO RIB, Smart Stax, XtendFlex, …).
The REST payload gives the canonical id / slug / title / permalink
and taxonomy term IDs, plus a human-readable ``class_list`` (e.g.
``variety_crop-soybean``, ``variety_trait-enlist-e3``). ``acf`` is
``[]`` and ``content.rendered`` is EMPTY in REST, so the ratings
have to come from the detail page.
2. **Parse the detail page.** Each ``/products/<slug>/`` page
server-renders the agronomic data as ``<h3>`` spec sections, each a
run of ``<li><span>label</span><span>value</span></li>`` rows up to
the next section header:
- Corn: "Agronomic Characteristics" (Early Vigor / Stalk Strength
/ Root Strength / Stay Green / Drydown / Test Weight / Drought
Tolerance / Foliar Fungicide / Corn-on-Corn), "Plant
Characteristics" (Ear Height / Ear Type), "Disease Ratings"
(Goss's Wilt / Northern Leaf Blight / Anthracnose Stalk Rot /
Gray Leaf Spot / tar spot etc).
- Soybean: "Plant Characteristics" (Relative Maturity / Emergence
/ Plant Height / Plant Type / Flower Color / Pubescence / Pod
Color / Hilum Color), "Defensive Characteristics & Disease
Ratings" (SCN Resistance source / Iron Chlorosis / Stress
Tolerance / Phytophthora Rps gene / Brown Stem Rot / White Mold
/ Sudden Death). "Herbicide Tolerance" + "Placement" sections
are present but carry no ``<li>`` rows.
The relative maturity also sits in a "Key Features" ``Maturity``
row ("113.00 RM" / "3.60 RM"); we read RM/MG from the per-crop
spec section first and fall back to that.
Rating scale: **numeric, LOWER = BETTER** (1 = best / most
tolerant / most resistant). No explicit on-page legend, so the
direction was confirmed by cross-referencing the Product Overview
prose against the published values across ~12 corn varieties:
hybrids described "very good / superior / excellent stalks and roots"
carry Stalk/Root Strength 1.0-1.5, weaker traits run 3.0-3.5, and no
value approaches 9 (observed range ~1.0-3.5). The soybean disease
panel (Iron Chlorosis / Brown Stem Rot / White Mold / Sudden Death /
Stress Tolerance) reads the same direction (lower = more resistant).
A handful of values are categorical rather than numeric and pass
through verbatim: SCN Resistance source ("PI 88788"), Phytophthora
"Rps 1k", Anthracnose "ASR", plant descriptors ("Medium Tall",
"Flex"). ``NA`` / blank = not rated.
Unlike the Ebbert's scraper (which left ``characteristics_groups``
empty and relied on a verbatim body), we parse the spec sections into
structured ``characteristics_groups`` so the numeric + categorical
ratings land in the embedded chunk and are actually retrievable. The
soybean "Defensive Characteristics & Disease Ratings" section maps to
the DISEASE RATINGS bucket; corn "Agronomic Characteristics" +
"Plant Characteristics" map to AGRONOMIC CHARACTERISTICS.
Output:
corpus/latham/<source_key>.md
corpus/latham/<source_key>.json
source_key: ``latham-<slug>`` lowercased, e.g. ``latham-l-3632-e3``.
CLI:
python -m scrape.sources.latham --crop corn --limit 5
python -m scrape.sources.latham --force
python -m scrape.sources.latham --product latham-l-3632-e3
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
# Version stamp recorded in every sidecar's "scraper_version" field.
SCRAPER_VERSION = "0.1.0"
# User-Agent header set on the session created by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Site root, and the WP REST v2 base every discovery URL is built from.
BASE = "https://www.lathamseeds.com"
WP = f"{BASE}/wp-json/wp/v2"

# variety_crop taxonomy slug -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. "alfalfa"
# is in the taxonomy but has zero published varieties; everything not
# listed here is out of scope for the row-crop advisor. (No wheat.)
CROP_TYPES = {
    "corn": "corn",
    "soybean": "soybeans",
}

# robots.txt declares no Crawl-delay and only blocks /wp-admin/; we
# stay polite. ~265 detail pages at 1.5 s/req finishes in ~7 min.
# Used as the default minimum spacing (seconds) between requests.
REQ_INTERVAL_SEC = 1.5

# Human-readable legend for the rating scale. It is rendered into the
# markdown header ("Rating scale") and stored in the sidecar as
# "_scale_direction".
RATING_SCALE_DIRECTION = (
    "numeric ratings ~1-9 where LOWER = BETTER (1 = best / most "
    "tolerant / most resistant); confirmed by cross-referencing "
    "Product Overview prose vs values (top-rated stalks/roots cluster "
    "1.0-1.5, weak traits 3.0-3.5, none approach 9). Categorical "
    "values pass through verbatim (SCN source 'PI 88788', "
    "Phytophthora 'Rps 1k', Anthracnose 'ASR', 'Medium Tall', 'Flex'). "
    "NA/blank = not rated."
)

# Detail-page spec section headers (<h3>) -> characteristics_groups
# label. DISEASE RATINGS -> disease framing, AGRONOMIC CHARACTERISTICS
# -> agronomic framing in the chunker; anything else passes through as
# its own titled section. Both corn and soy headers are covered. The
# soybean "Defensive Characteristics & Disease Ratings" panel mixes
# disease 1-9 ratings with categorical resistance source/genes — we
# bucket the whole panel as DISEASE so it embeds under disease framing.
# Keys are matched against the lower-cased, whitespace-collapsed header
# text, so they must stay lower-case; "&" and "and" spellings are both
# listed.
SPEC_SECTIONS = {
    "agronomic characteristics": "AGRONOMIC CHARACTERISTICS",
    "plant characteristics": "AGRONOMIC CHARACTERISTICS",
    "disease ratings": "DISEASE RATINGS",
    "defensive characteristics & disease ratings": "DISEASE RATINGS",
    "defensive characteristics and disease ratings": "DISEASE RATINGS",
}

# Third ancestor of this file's resolved path (presumably the repo
# root, given the module lives under scrape/sources/ — confirm if the
# file is moved).
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus root: the CORPUS_ROOT environment variable when set and
# non-empty, otherwise <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this scraper's <source_key>.md / .json pairs.
CORPUS_DIR = CORPUS_ROOT / "latham"
# Module logger; main() configures the handler/level.
log = logging.getLogger("scrape.latham")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session: enforces a minimum spacing between
    requests and retries network errors, HTTP 429 and HTTP 5xx with
    backoff.

    Latham's catalog is ~265 detail pages so 1.5 s/req finishes the full
    scrape in ~7 min.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading taken right before the previous request.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between
        requests, then stamp the current time."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying transient failures.

        Network errors (``requests.RequestException``), HTTP 429 and HTTP
        5xx are retried up to ``max_retries`` attempts in total. A numeric
        ``Retry-After`` header is honoured; otherwise the delay is
        exponential with jitter, capped at 30 s.

        Args:
            method: HTTP method name.
            url: absolute URL.
            max_retries: total number of attempts; values below 1 are
                treated as 1 so at least one request is always made.
            timeout: per-request timeout in seconds.
            **kw: forwarded to ``requests.Session.request``.

        Returns:
            The first non-retryable response, or the last retryable
            (429/5xx) response when the final attempt still got one.
            The status is NOT checked here — callers decide.

        Raises:
            requests.RequestException: when the final attempt failed at
                the network level.
        """
        attempts = max(1, max_retries)
        # Outcome of the most recent attempt only: exactly one of the two
        # is set after each failed attempt, so a network error from an
        # early attempt can never mask a later HTTP response.
        last_exc: Exception | None = None
        last_resp: requests.Response | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, last_resp = exc, None
                if final:
                    # No retry follows, so sleeping would only waste time.
                    log.warning("network error on %s %s: %s — giving up "
                                "after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                last_exc, last_resp = None, resp
                if final:
                    log.warning("HTTP %d on %s %s — giving up after %d "
                                "attempt(s)",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        # attempts >= 1 and no exception pending => a response was stored.
        assert last_resp is not None
        return last_resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """GET ``url`` through :meth:`request`."""
        return self.request("GET", url, **kw)

    def get_json(self, url: str, **kw: Any) -> Any:
        """GET ``url``, raise on an HTTP error status, and return the
        decoded JSON body."""
        r = self.get(url, **kw)
        r.raise_for_status()
        return r.json()
# --------------------------------------------------------------------- model
@dataclass
class LathamVariety:
    """Everything scraped for one variety: identity, maturity, traits,
    marketing blurb and the parsed spec groups."""

    source_key: str
    source_url: str
    # Chunker crop value: "corn" or "soybeans".
    crop: str
    # Display name, e.g. "L 3632 E3".
    product_name: str = ""
    # Corn only: relative maturity in days.
    relative_maturity: int | None = None
    # Soybeans only: maturity group.
    maturity_group: float | None = None
    release_year: str | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    # Chunker source of truth, shaped as
    # [{label, items: [{characteristic, value}]}].
    groups: list[dict] = field(default_factory=list)
# --------------------------------------------------------------------- discovery (REST)
def _taxonomy_map(http: RateLimitedSession, taxonomy: str) -> dict[int, str]:
    """Build a ``term_id -> display name`` map for one WP taxonomy.

    Pages through the REST endpoint 100 terms at a time. Paging stops on
    an HTTP 400 (requested page is past the end), on an empty page, or
    on a short page. A term's name falls back to its slug, then to its
    id rendered as a string.
    """
    names: dict[int, str] = {}
    page_no = 1
    while True:
        resp = http.get(
            f"{WP}/{taxonomy}?per_page=100&page={page_no}&_fields=id,name,slug")
        if resp.status_code == 400:  # past last page
            break
        resp.raise_for_status()
        batch = resp.json()
        if not batch:
            break
        for term in batch:
            term_id = term["id"]
            names[term_id] = term.get("name") or term.get("slug") or str(term_id)
        if len(batch) < 100:
            break
        page_no += 1
    return names
def _crop_slug_to_id(http: RateLimitedSession) -> dict[str, int]:
    """Return ``slug -> term id`` for the ``variety_crop`` taxonomy.

    Reads a single page of up to 100 terms.
    """
    terms = http.get_json(f"{WP}/variety_crop?per_page=100&_fields=id,slug")
    return {term["slug"]: term["id"] for term in terms}
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Enumerate the REST variety records for the in-scope row crops.

    For each entry of ``CROP_TYPES`` (optionally narrowed to the chunker
    crop value ``only_crop``) the ``varieties`` endpoint is paged 100 at
    a time, filtered by the crop's taxonomy term id. Each returned record
    is tagged with a ``"_crop"`` key holding the chunker crop value, and
    a record id is only ever returned once.
    """
    term_ids = _crop_slug_to_id(http)
    found: list[dict] = []
    seen_ids: set[int] = set()
    for tax_slug, chunker_crop in CROP_TYPES.items():
        if only_crop and only_crop != chunker_crop:
            continue
        term_id = term_ids.get(tax_slug)
        if term_id is None:
            log.warning("variety_crop %r not found in taxonomy — skipping", tax_slug)
            continue
        page_no = 1
        while True:
            resp = http.get(
                f"{WP}/varieties?variety_crop={term_id}&per_page=100&page={page_no}"
                "&_fields=id,slug,title,link,variety_trait,variety_year")
            if resp.status_code == 400:
                # Requested page is past the last one.
                break
            resp.raise_for_status()
            batch = resp.json()
            if not batch:
                break
            for record in batch:
                if record["id"] not in seen_ids:
                    seen_ids.add(record["id"])
                    record["_crop"] = chunker_crop
                    found.append(record)
            if len(batch) < 100:
                # Short page means this was the last one.
                break
            page_no += 1
        log.info("variety_crop %-8s (%s): cumulative %d", tax_slug, chunker_crop, len(found))
    return found
# --------------------------------------------------------------------- detail parse
_MATURITY_RE = re.compile(r"([0-9]+(?:\.[0-9]+)?)")
def _clean(s: str) -> str:
return re.sub(r"\s+", " ", s or "").strip()
def _two_span(li: Tag) -> tuple[str, str] | None:
    """Read an ``<li>`` as a ``(label, value)`` spec row.

    Returns the pair only when the ``<li>`` has exactly two ``<span>``
    descendants and both have non-empty cleaned text; otherwise ``None``.
    """
    texts = []
    for span in li.find_all("span"):
        texts.append(_clean(span.get_text(" ", strip=True)))
    if len(texts) != 2:
        return None
    label, value = texts
    if not (label and value):
        return None
    return label, value
def _section_rows(header: Tag) -> list[tuple[str, str]]:
    """Gather the ``(label, value)`` rows that belong to one section.

    Walks the elements after ``header`` in document order and stops at
    the next ``<h2>``/``<h3>``; every ``<li>`` on the way that reads as a
    two-span row is kept.
    """
    collected: list[tuple[str, str]] = []
    for node in header.find_all_next():
        if node is not header and node.name in ("h2", "h3"):
            # Next section starts here.
            break
        if not isinstance(node, Tag) or node.name != "li":
            continue
        row = _two_span(node)
        if row:
            collected.append(row)
    return collected
def _parse_groups(soup: BeautifulSoup) -> list[dict]:
    """Turn the known spec sections of a detail page into
    ``{label, items: [{characteristic, value}]}`` groups.

    Headers (``<h2>``/``<h3>``) are matched against ``SPEC_SECTIONS`` by
    their lower-cased, cleaned text. Sections without rows are dropped.
    Sections that map to the same label (corn maps both Agronomic and
    Plant Characteristics to AGRONOMIC) are merged into one group so the
    chunker sees a single coherent bucket. Groups come back in order of
    first appearance.
    """
    # Dicts keep insertion order, so this doubles as the ordered result.
    by_label: dict[str, dict] = {}
    for heading in soup.find_all(["h2", "h3"]):
        title = _clean(heading.get_text(" ", strip=True)).lower()
        label = SPEC_SECTIONS.get(title)
        if not label:
            continue
        rows = _section_rows(heading)
        if not rows:
            continue
        if label not in by_label:
            by_label[label] = {"label": label, "items": []}
        by_label[label]["items"].extend(
            {"characteristic": name, "value": value} for name, value in rows)
    return list(by_label.values())
def _maturity_from_text(value: str, crop: str,
                        ) -> tuple[int | None, float | None] | None:
    """Convert one maturity cell into an ``(RM, MG)`` pair.

    Takes the first number in ``value``. For ``crop == "corn"`` it is
    returned as an integer RM (fractional part truncated) with MG
    ``None``; for any other crop it is returned as a float MG with RM
    ``None``. Returns ``None`` when ``value`` contains no number, so the
    caller can keep looking.
    """
    m = _MATURITY_RE.search(value)
    if not m:
        return None
    number = float(m.group(1))
    if crop == "corn":
        return int(number), None
    return None, number


def _parse_maturity_from_groups(groups: list[dict], crop: str,
                                ) -> tuple[int | None, float | None]:
    """Pull RM (corn) / MG (soy) from the parsed groups. Corn carries
    'Maturity' under the page's Key Features and 'Relative Maturity' is
    soy-side under Plant Characteristics.

    The first item labelled 'relative maturity' or 'maturity' whose value
    contains a number wins. Returns ``(None, None)`` when nothing matches.
    """
    keys = ("relative maturity", "maturity")
    for g in groups:
        for it in g["items"]:
            if it["characteristic"].strip().lower() not in keys:
                continue
            parsed = _maturity_from_text(it["value"], crop)
            if parsed is not None:
                return parsed
    return None, None


def _parse_maturity_keyfeatures(soup: BeautifulSoup, crop: str,
                                ) -> tuple[int | None, float | None]:
    """Fallback: the 'Key Features' block carries a 'Maturity' row
    ('113.00 RM' / '3.60 RM').

    Scans every ``<li>`` on the page for a two-span row labelled
    'maturity' and returns the first one whose value contains a number;
    ``(None, None)`` when there is none.
    """
    for li in soup.find_all("li"):
        pair = _two_span(li)
        if pair and pair[0].strip().lower() == "maturity":
            parsed = _maturity_from_text(pair[1], crop)
            if parsed is not None:
                return parsed
    return None, None
def _parse_positioning(soup: BeautifulSoup) -> str | None:
    """Best-effort marketing blurb for a variety.

    Finds an ``<h2>``/``<h3>`` titled 'Product Overview' or 'Hybrid
    Advantages' and returns the first ``<p>`` after it (before the next
    ``<h2>``/``<h3>``) whose cleaned text is at least 40 characters long.
    Returns ``None`` when no such paragraph exists.
    """
    wanted = ("product overview", "hybrid advantages")
    for heading in soup.find_all(["h2", "h3"]):
        title = _clean(heading.get_text(" ", strip=True)).lower()
        if title in wanted:
            for node in heading.find_all_next():
                if node is not heading and node.name in ("h2", "h3"):
                    # Ran into the next section without a usable paragraph.
                    break
                if not isinstance(node, Tag) or node.name != "p":
                    continue
                text = _clean(node.get_text(" ", strip=True))
                if len(text) >= 40:
                    return text
    return None
def parse_detail(http: RateLimitedSession, rec: dict,
                 trait_names: dict[int, str],
                 year_names: dict[int, str]) -> LathamVariety:
    """Fetch one variety's detail page and assemble its LathamVariety.

    Args:
        http: session used to fetch the detail page.
        rec: REST variety record; must carry ``"_crop"`` and ``"slug"``,
            and may carry ``"link"``, ``"title"``, ``"variety_trait"``
            and ``"variety_year"``.
        trait_names: trait term id -> name; unknown ids are ignored.
        year_names: year term id -> name; the first known one becomes
            the release year.

    Returns:
        The populated variety. RM/MG comes from the parsed spec groups
        first and from the page-wide 'Maturity' row as a fallback.

    Raises:
        requests.HTTPError: when the detail page returns an error status.
    """
    # Function-scope import keeps this edit self-contained.
    from html import unescape

    crop = rec["_crop"]
    slug = rec["slug"]
    url = rec.get("link") or f"{BASE}/products/{slug}/"
    # The REST ``title.rendered`` field is display HTML, so characters
    # such as "&" or dashes can arrive as entities ("&#038;", "&#8211;").
    # Decode them so the product name is plain text in the corpus.
    raw_title = (rec.get("title") or {}).get("rendered", "")
    name = _clean(unescape(raw_title or "")) or slug.upper()
    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # Drop noise so footer/nav text never bleeds into positioning.
    for t in soup(["script", "style", "noscript"]):
        t.decompose()
    groups = _parse_groups(soup)
    rm, mg = _parse_maturity_from_groups(groups, crop)
    if rm is None and mg is None:
        rm, mg = _parse_maturity_keyfeatures(soup, crop)
    positioning = _parse_positioning(soup)
    traits = [trait_names[t] for t in (rec.get("variety_trait") or [])
              if t in trait_names]
    years = [year_names[t] for t in (rec.get("variety_year") or [])
             if t in year_names]
    release_year = years[0] if years else None
    return LathamVariety(
        source_key=f"latham-{slug.lower()}",
        source_url=url,
        crop=crop,
        product_name=name,
        relative_maturity=rm,
        maturity_group=mg,
        release_year=release_year,
        trait_stack=traits,
        positioning=positioning,
        groups=groups,
    )
# --------------------------------------------------------------------- render
def render_markdown(v: LathamVariety) -> str:
    """Render a variety as the markdown body stored in the corpus.

    Layout: an H1 with the product name, a bullet list of identity
    fields (vendor, brand, crop, RM for corn or MG for soybeans when
    known, traits when present, source URL, rating scale, service area),
    the positioning blurb in italics when present, then one H2 section
    per characteristics group with a bullet per item. An empty item
    value is shown as an em dash.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Latham Hi-Tech Seeds (independent family-owned, Alexander, IA)",
        "- **Brand:** Latham Hi-Tech Seeds",
        f"- **Crop:** {crop_label}",
    ]

    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")

    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Latham dealer network — Upper Midwest "
        "(IA/MN/WI/IL/ND/SD/NE)",
        "",
    ])

    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    lines.extend(["---", ""])

    for group in v.groups:
        lines.append(f"## {group['label'].title()}")
        lines.append("")
        for item in group["items"]:
            label = item["characteristic"]
            shown = item["value"] or "—"
            lines.append(f"- **{label}:** {shown}")
        lines.append("")

    return "\n".join(lines)
def _atomic_write_text(path: Path, text: str) -> None:
    """Write ``text`` (UTF-8) to ``path`` via a sibling temp file and a
    rename, so ``path`` never exists in a half-written state."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    tmp.replace(path)


def write_variety(v: LathamVariety, body_md: str) -> None:
    """Persist one variety as ``<source_key>.json`` + ``<source_key>.md``
    under ``CORPUS_DIR``.

    Args:
        v: the parsed variety; supplies the file stem and sidecar fields.
        body_md: markdown body to store.

    The JSON sidecar is written first and the markdown last. The
    pipeline treats an existing ``.md`` as "already scraped", so the
    markdown must only appear once the sidecar is complete — otherwise an
    interrupted run would leave a variety that is skipped forever without
    its sidecar. Both files are written atomically for the same reason.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "source": "latham",
        "source_key": v.source_key,
        "vendor": "Latham Hi-Tech Seeds",
        "brand": "Latham Hi-Tech Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Latham dealer network (Upper Midwest — "
                                  "IA/MN/WI/IL/ND/SD/NE)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _atomic_write_text(
        CORPUS_DIR / f"{v.source_key}.json",
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n")
    # Sentinel last: its presence is what makes later runs skip this key.
    _atomic_write_text(CORPUS_DIR / f"{v.source_key}.md", body_md)
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Run the full scrape: taxonomy lookup, discovery, detail parsing
    and corpus writes.

    Args:
        limit: stop after this many records have been considered
            (skipped and failed records count too); ``None`` = all.
        force: re-fetch even when the markdown file already exists.
        only_crop: restrict discovery to one chunker crop value.
        only_product: restrict to one variety, matched case-insensitively
            against either the source_key or the bare slug.

    Returns:
        ``2`` when ``only_product`` matched nothing, otherwise ``0``.
        Per-variety fetch failures are logged and counted but do not
        change the return value.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    trait_names = _taxonomy_map(http, "variety_trait")
    year_names = _taxonomy_map(http, "variety_year")
    records = discover(http, only_crop=only_crop)
    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"latham-{r['slug'].lower()}" == key
                   or r["slug"].lower() == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2
    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    total = len(records)
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"latham-{rec['slug'].lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, rec, trait_names, year_names)
        except requests.RequestException as exc:
            # RequestException covers HTTPError (bad status) as well as
            # the connection/timeout errors the session re-raises once
            # its retries are exhausted; one bad page must not abort the
            # remaining varieties.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (still writing identity)",
                        processed, total, source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks: a maturity group of 0.0 is a real value
        # and must not be reported as missing.
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 maturity if maturity is not None else "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")
    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d "
             "failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], total)
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper.

    Options: ``--limit``, ``--force``, ``--crop`` (one of the chunker
    crop values), ``--product`` and ``--log-level`` (defaults to the
    ``LOG_LEVEL`` environment variable, else ``INFO``).
    """
    crop_choices = sorted(set(CROP_TYPES.values()))
    default_level = os.environ.get("LOG_LEVEL", "INFO")

    parser = argparse.ArgumentParser(
        prog="scrape.sources.latham",
        description="Scrape Latham Hi-Tech Seeds (independent Upper-Midwest "
                    "brand) — corn / soybeans via the WP REST API + detail pages.",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=crop_choices,
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.")
    parser.add_argument("--log-level", default=default_level)
    return parser
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv``, set up logging to stderr at the
    requested level, run the pipeline and return its exit code."""
    parser = _build_argparser()
    args = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=args.log_level.upper(),
    )
    exit_code = run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
    return exit_code


if __name__ == "__main__":
    sys.exit(main())