84ad2b1de6
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
768 lines
29 KiB
Python
768 lines
29 KiB
Python
"""Stine Seed Company scraper — independent family-owned breeder (Adel, IA).

Source: ``www.stineseed.com`` — a custom PHP site (NOT WordPress;
``/wp-json/`` 404s). robots.txt returns 404 (none published); the
``/legal/`` page carries only a standard copyright / no-reproduction
clause (no anti-automation term — same posture as the other corpus
vendors). ``sitemap.xml`` (~499 URLs) lists every live product page,
so it is our canonical enumeration source.

Stine is the largest privately-owned seed company in the US; it
breeds and sells **corn + soybeans** only (no wheat). The catalog is
~58 corn hybrids + ~159 soybean varieties.

Two-step ingestion:

1. **Enumerate** the current catalog from ``sitemap.xml``. A product
   *detail* URL has the shape ``/{crop}/traits/{trait-slug}/{code}/``
   (four path segments); the bare ``/{crop}/traits/{trait-slug}/``
   landing pages are skipped. This yields exactly the live catalog
   (58 corn + 159 soy), unlike the comparison ajax endpoint which
   also returns thousands of discontinued/historical entries.

   Fallback enumeration (``--enumerate ajax``) hits the comparison
   ajax fragments:

   - corn: POST ``/ajax/corn-comparison/filter_products.php``
   - soy: POST ``/ajax/soybean-comparison/filter_products.php``

   with ``sel1=&sel2=&sel3=`` (empty = all). Each ``<li>`` carries a
   numeric product id + the canonical detail URL.

2. **Parse the detail page.** Each ``/{crop}/traits/{slug}/{code}/``
   page server-renders all agronomic data (no JS needed) as
   ``<section class="agronomic-details">`` →
   ``<ul class="agronomy-chart"> <li> <strong>label</strong>
   <span class="value">value</span> </li> …``. The variety code +
   brand mark live in the ``<h1>`` (``Stine ® 9444-22 Brand``).

Rating scales differ by crop and are preserved verbatim (the chunker
never fabricates a value):

- **Corn** publishes an on-page legend:
  ``9: Excellent, 8: Very Good, 7: Good, 6: Average,
  5: Below Average`` — a **1-9 numeric** scale, **HIGHER = BETTER /
  more tolerant** (same direction as Bayer/NK, so no flip). Applies
  to the agronomic performance panel (Drydown/Root/Stalk/Stress/
  Cold Emergence/Test Weight) and the disease panel (Tar Spot/Gray
  Leaf Spot/Eye Spot/N.C. Leaf Blight/Goss' Wilt/Common Rust/…).
  Plant descriptors / soil placement / herbicide rows are
  qualitative (Tall, Highly Recommended, Yes/No) and pass through.
- **Soybeans** are entirely **qualitative** (Excellent / Very Good
  / Good / … and Resistant / Strong / Good / Susceptible for
  disease; "higher/'Resistant' = better"). There is no numeric
  legend on soy pages. SCN (Soybean Cyst Nematode) and RPS Gene
  rows carry the *source/gene* (e.g. Peking, 3a) rather than a
  rating.

We parse the chart into structured ``characteristics_groups`` — a
DISEASE RATINGS group, an AGRONOMIC CHARACTERISTICS group, and a few
pass-through groups (PLANT DESCRIPTION / SOIL & PLACEMENT / HERBICIDE
TOLERANCE / SEED TREATMENT NOTES) — so every rating lands in the
embedded chunk and is actually retrievable.

Output:
    corpus/stine/<source_key>.md
    corpus/stine/<source_key>.json

source_key: ``stine-<productcode>`` lowercased, e.g.
``stine-9444-22`` (corn) or ``stine-22r32`` (soy).

CLI:
    python -m scrape.sources.stine --crop corn --limit 2 --force
    python -m scrape.sources.stine --crop soybeans --limit 2 --force
    python -m scrape.sources.stine --force
    python -m scrape.sources.stine --product stine-9444-22
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import logging
|
||
import os
|
||
import random
|
||
import re
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass, field
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import warnings
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
try: # bs4>=4.11 raises this when html.parser sees an XML doc (the sitemap)
|
||
from bs4 import XMLParsedAsHTMLWarning
|
||
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
||
except Exception: # pragma: no cover — older bs4 without the warning class
|
||
pass
|
||
|
||
SCRAPER_VERSION = "0.1.0"
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

# Site root plus the two enumeration entry points that hang off it.
BASE = "https://www.stineseed.com"
SITEMAP = BASE + "/sitemap.xml"
AJAX = dict(
    corn=BASE + "/ajax/corn-comparison/filter_products.php",
    soybeans=BASE + "/ajax/soybean-comparison/filter_products.php",
)

# Stine site path segment -> chunker crop value (chunker keys on the
# PLURAL "soybeans" for the MG branch). Stine has no wheat.
CROP_PATHS = {segment: segment for segment in ("corn", "soybeans")}

# No robots.txt (404) and no Crawl-delay; stay polite at 1.5 s/req.
# ~217 detail pages -> a full run finishes in ~6 min.
REQ_INTERVAL_SEC = 1.5

# Human-readable description of every rating scale on the site; emitted
# verbatim into both the markdown header and the JSON sidecar.
RATING_SCALE_DIRECTION = (
    "corn agronomic+disease 1-9 numeric, 9=Excellent/best/most-tolerant, "
    "8=Very Good, 7=Good, 6=Average, 5=Below Average (higher=better, same "
    "direction as Bayer/NK; blank/'-'=not rated); soybeans qualitative "
    "(Excellent/Very Good/Good for vigor; Resistant/Strong/Good/Susceptible "
    "for disease, Resistant/Strong=best); SCN row gives source (e.g. Peking) "
    "and RPS Gene gives the gene, not a rating; plant/soil/herbicide rows "
    "qualitative (Tall, Highly Recommended/Recommended, Yes/No)"
)
|
||
|
||
# ---- Chart-label classification -------------------------------------
|
||
# The agronomy chart is a flat run of label/value <li>s mixing identity,
|
||
# performance ratings, disease ratings, plant descriptors, soil/placement,
|
||
# and herbicide rows. We bucket by label into characteristics_groups the
|
||
# chunker understands (DISEASE RATINGS -> disease framing, AGRONOMIC
|
||
# CHARACTERISTICS -> agronomic framing; the rest pass through titled).
|
||
|
||
# Identity rows already captured into RM/MG/dedicated facts — not repeated
|
||
# as a generic characteristic.
|
||
_IDENTITY_LABELS = {"maturity", "maturity end"}
|
||
|
||
# Corn 1-9 performance ratings -> AGRONOMIC CHARACTERISTICS.
|
||
_CORN_AGRONOMIC = {
|
||
"gdd", "mn maturity", "drydown", "root", "stalk", "stress",
|
||
"cold emergence", "test weight", "harvest population",
|
||
}
|
||
# Corn disease ratings -> DISEASE RATINGS. Set kept generous because the
|
||
# disease list varies per page (some add S.C. Leaf Blight / Anthracnose).
|
||
_CORN_DISEASE = {
|
||
"tar spot", "gray leaf spot", "eye spot", "n.c. leaf blight",
|
||
"s.c. leaf blight", "anthracnose", "goss' wilt", "goss’ wilt",
|
||
"common rust", "northern corn leaf blight", "southern corn leaf blight",
|
||
"diplodia", "fusarium", "head smut",
|
||
}
|
||
# Corn plant descriptors -> PLANT DESCRIPTION.
|
||
_CORN_PLANT = {"plant height", "ear placement", "ear flex", "cob color"}
|
||
# Corn soil/placement -> SOIL & PLACEMENT.
|
||
_CORN_SOIL = {
|
||
"corn-on-corn", "sand", "loam", "clay", "wide rows", "narrow rows",
|
||
'population % in 30" or wider rows', "population % in narrow rows",
|
||
"population", "drought tolerance",
|
||
}
|
||
# Corn herbicide -> HERBICIDE TOLERANCE.
|
||
_CORN_HERBICIDE = {"glyphosate tolerant", "glufosinate tolerant"}
|
||
|
||
# Soy vigor/standability -> AGRONOMIC CHARACTERISTICS.
|
||
_SOY_AGRONOMIC = {"emergence", "standability", "shattering", "lodging"}
|
||
# Soy disease + nematode + gene rows -> DISEASE RATINGS (SCN/RPS carry a
|
||
# source/gene rather than a rating; that's still the disease panel).
|
||
_SOY_DISEASE = {
|
||
"phytophthora root rot", "rps gene", "iron deficiency chlorosis",
|
||
"brown stem rot", "sudden death syndrome", "soybean cyst nematode",
|
||
"frogeye leafspot", "frogeye leaf spot", "sclerotinia white mold",
|
||
"white mold", "stem canker", "root knot nematode", "soybean rust",
|
||
}
|
||
# Soy plant descriptors / quality -> PLANT DESCRIPTION.
|
||
_SOY_PLANT = {
|
||
"height", "flower", "pubescence", "hilum", "chloride", "pod color",
|
||
"canopy", "protein", "oil",
|
||
}
|
||
# Soy herbicide/trait management -> HERBICIDE TOLERANCE.
|
||
_SOY_HERBICIDE = {"sulfonylurea tolerance", "sts", "glyphosate tolerant"}
|
||
|
||
# Repository root is two directories above this module's own directory.
REPO_ROOT = Path(__file__).resolve().parents[2]

# CORPUS_ROOT env var (when set and non-empty) overrides the in-repo
# ``corpus/`` directory; every Stine artifact lands in its ``stine/`` child.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or (REPO_ROOT / "corpus"))
CORPUS_DIR = CORPUS_ROOT / "stine"

log = logging.getLogger("scrape.stine")
|
||
|
||
|
||
# --------------------------------------------------------------------- HTTP
|
||
|
||
|
||
class RateLimitedSession:
    """Polite HTTP session: minimum spacing between requests plus retries.

    Stine's live catalog is ~217 detail pages, so 1.5 s/req still finishes
    in a few minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying ``requests`` session.

        Args:
            interval: Minimum number of seconds between two outgoing requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0  # time.monotonic() stamp of the previous request slot

    def _wait(self) -> None:
        """Block until ``interval`` seconds have elapsed since the last slot."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Exponential backoff with jitter for a 0-based attempt, capped at 30 s."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send one request, retrying network errors and HTTP 429/5xx.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1 so the call always tries at least once.
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded unchanged to ``requests.Session.request``.

        Returns:
            The first response that is not 429/5xx, or the response of the
            final attempt when every attempt was answered with 429/5xx
            (callers decide via ``raise_for_status``).

        Raises:
            requests.RequestException: when the final attempt fails at the
                network level.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if is_last:
                    # The outcome of the FINAL attempt is what the caller
                    # sees; no point sleeping a backoff before giving up.
                    raise
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if is_last:
                log.warning("HTTP %d on %s %s — giving up after %d attempts",
                            resp.status_code, method, url, attempts)
                return resp
            # Honour a numeric Retry-After; otherwise back off exponentially.
            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)

        # The final iteration always returns or raises.
        raise RuntimeError("retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Rate-limited, retried GET; see :meth:`request`."""
        return self.request("GET", url, **kw)

    def post(self, url: str, **kw: Any) -> requests.Response:
        """Rate-limited, retried POST; see :meth:`request`."""
        return self.request("POST", url, **kw)
|
||
|
||
|
||
# --------------------------------------------------------------------- model
|
||
|
||
|
||
@dataclass
class StineVariety:
    """One parsed product detail page, ready to be rendered and written."""

    # Identity ---------------------------------------------------------
    source_key: str            # "stine-<code>" lowercased; also the file stem
    source_url: str            # canonical detail-page URL
    crop: str                  # chunker value: corn / soybeans
    product_name: str = ""     # "9444-22", "22R32"

    # Maturity: only the field matching ``crop`` is populated ----------
    relative_maturity: int | None = None    # corn (representative RM days)
    maturity_group: float | None = None     # soy MG

    # Descriptive content ----------------------------------------------
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)

    # <lastmod> carried over from the sitemap entry, when there was one.
    sitemap_last_modified: str | None = None
|
||
|
||
|
||
# --------------------------------------------------------------------- discovery
|
||
|
||
|
||
_DETAIL_RE = re.compile(
|
||
r"^https?://(?:www\.)?stineseed\.com/(corn|soybeans)/traits/"
|
||
r"([^/]+)/([^/]+)/?$",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
|
||
@dataclass
class DiscoveredURL:
    """One product detail page found during enumeration."""

    url: str                    # canonical detail URL (trailing slash)
    crop: str                   # chunker crop value: corn / soybeans
    trait_slug: str             # third path segment of the detail URL
    code: str                   # fourth path segment (product code)
    lastmod: str | None = None  # sitemap <lastmod>; the ajax path leaves it None
|
||
|
||
|
||
def _norm_url(url: str) -> str:
|
||
"""Canonical product URL has a trailing slash."""
|
||
url = url.strip()
|
||
if not url.endswith("/"):
|
||
url += "/"
|
||
return url
|
||
|
||
|
||
def discover_sitemap(http: RateLimitedSession, *,
                     only_crop: str | None) -> list[DiscoveredURL]:
    """Enumerate live product detail pages from ``sitemap.xml``.

    Only URLs with FOUR path segments (``/{crop}/traits/{slug}/{code}/``)
    match; the bare ``/{crop}/traits/{slug}/`` landing pages do not.

    Args:
        http: Session used to fetch the sitemap.
        only_crop: When set, keep only entries for this crop value.

    Returns:
        De-duplicated entries sorted by ``(crop, code)``.

    Raises:
        requests.HTTPError: if the sitemap response is an error status.
    """
    resp = http.get(SITEMAP)
    resp.raise_for_status()
    # html.parser is used because the lxml/xml backend isn't a guaranteed
    # dependency. It lowercases tag names, which is harmless here since
    # <url>/<loc>/<lastmod> are already lowercase.
    doc = BeautifulSoup(resp.text, "html.parser")

    # Keyed by canonical URL: insertion order is kept and repeats collapse.
    found: dict[str, DiscoveredURL] = {}
    for entry in doc.find_all("url"):
        loc_tag = entry.find("loc")
        if not loc_tag:
            continue
        location = loc_tag.get_text(strip=True)
        match = _DETAIL_RE.match(location)
        if not match:
            continue
        crop = CROP_PATHS.get(match.group(1).lower())
        if not crop or (only_crop and crop != only_crop):
            continue
        canonical = _norm_url(location)
        if canonical in found:
            continue
        lastmod_tag = entry.find("lastmod")
        found[canonical] = DiscoveredURL(
            url=canonical,
            crop=crop,
            trait_slug=match.group(2),
            code=match.group(3),
            lastmod=lastmod_tag.get_text(strip=True) if lastmod_tag else None,
        )

    results = sorted(found.values(), key=lambda item: (item.crop, item.code))
    crop_note = f" (crop={only_crop})" if only_crop else ""
    log.info("sitemap: discovered %d product detail pages%s",
             len(results), crop_note)
    return results
|
||
|
||
|
||
def discover_ajax(http: RateLimitedSession, *,
                  only_crop: str | None) -> list[DiscoveredURL]:
    """Fallback enumeration via the comparison ajax fragments.

    NOTE: these endpoints return the FULL historical product set
    (thousands of discontinued entries, with code dupes pointing at the
    same slug), so we de-dupe on canonical URL. The sitemap is preferred
    because it reflects only the current live catalog.

    Args:
        http: Session used to POST to each crop's comparison endpoint.
        only_crop: When set, query and keep only this crop.

    Returns:
        De-duplicated entries sorted by ``(crop, code)``; ``lastmod`` is
        always ``None`` because the fragments carry no modification date.

    Raises:
        requests.HTTPError: if an endpoint answers with an error status.
    """
    from urllib.parse import urljoin

    out: list[DiscoveredURL] = []
    seen: set[str] = set()
    for crop, endpoint in AJAX.items():
        if only_crop and crop != only_crop:
            continue
        r = http.post(endpoint, data={"sel1": "", "sel2": "", "sel3": ""})
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        for a in soup.select("ul.comparison-list a[href]"):
            href = (a.get("href") or "").strip()
            if not href:
                continue
            # Resolve against the site root so absolute, root-relative,
            # slash-less relative and protocol-relative hrefs all become a
            # full URL (plain ``BASE + href`` only handled the first two).
            loc = urljoin(BASE + "/", href)
            m = _DETAIL_RE.match(loc)
            if not m:
                continue
            mcrop = CROP_PATHS.get(m.group(1).lower())
            if not mcrop or (only_crop and mcrop != only_crop):
                continue
            canon = _norm_url(loc)
            if canon in seen:
                continue
            seen.add(canon)
            out.append(DiscoveredURL(canon, mcrop, m.group(2), m.group(3)))
    out.sort(key=lambda d: (d.crop, d.code))
    log.info("ajax: discovered %d product detail pages%s",
             len(out), f" (crop={only_crop})" if only_crop else "")
    return out
|
||
|
||
|
||
# --------------------------------------------------------------------- parse
|
||
|
||
|
||
def _clean(s: str) -> str:
|
||
return re.sub(r"\s+", " ", s or "").strip()
|
||
|
||
|
||
def _slug_to_trait(slug: str) -> str:
|
||
"""Humanize a trait-slug into a display trait name.
|
||
|
||
``duracade-refuge-renew`` -> ``DuraCade Refuge Renew``;
|
||
``enlist-e3-soybeans`` -> ``Enlist E3``; ``stine-gt-`` ->
|
||
``Stine GT``; ``vt-double-pro-technology`` -> ``VT Double Pro``;
|
||
``conventional-corn`` -> ``Conventional``.
|
||
"""
|
||
words = [w for w in re.split(r"[-_]+", slug) if w]
|
||
drop_tail = {"soybeans", "soybean", "corn", "technology"}
|
||
while words and words[-1].lower() in drop_tail:
|
||
words.pop()
|
||
if not words:
|
||
return slug
|
||
# Known acronyms / brand casings.
|
||
acronyms = {"gt": "GT", "vt": "VT", "e3": "E3", "rnai": "RNAi",
|
||
"sts": "STS", "ll": "LL", "rr2": "RR2", "3010": "3010",
|
||
"3110": "3110", "3110a": "3110A"}
|
||
out: list[str] = []
|
||
for w in words:
|
||
lw = w.lower()
|
||
if lw in acronyms:
|
||
out.append(acronyms[lw])
|
||
elif lw == "duracade":
|
||
out.append("DuraCade")
|
||
elif lw == "viptera":
|
||
out.append("Viptera")
|
||
elif lw == "smartstax":
|
||
out.append("SmartStax")
|
||
elif lw == "xtendflex":
|
||
out.append("XtendFlex")
|
||
elif lw == "trecepta":
|
||
out.append("Trecepta")
|
||
elif lw == "agrisure":
|
||
out.append("Agrisure")
|
||
elif lw == "gt27":
|
||
out.append("GT27")
|
||
else:
|
||
out.append(w.capitalize())
|
||
return " ".join(out)
|
||
|
||
|
||
def _extract_code(h1_text: str, fallback: str) -> str:
|
||
"""Pull the product code from the ``Stine ® 9444-22 Brand`` H1.
|
||
Falls back to the URL code segment (uppercased) if the H1 is odd."""
|
||
t = h1_text
|
||
t = re.sub(r"®|™", " ", t)
|
||
t = re.sub(r"\bStine\b", " ", t, flags=re.I)
|
||
t = re.sub(r"\bBrand\b", " ", t, flags=re.I)
|
||
t = re.sub(r"\bNEW\b", " ", t)
|
||
t = _clean(t)
|
||
# Code is the first non-space token; keep it if it has a digit.
|
||
tok = t.split(" ")[0] if t else ""
|
||
if tok and any(ch.isdigit() for ch in tok):
|
||
return tok
|
||
return fallback.upper()
|
||
|
||
|
||
def _parse_corn_maturity(value: str) -> int | None:
|
||
"""Corn 'Maturity' is an RM range like '98 - 100' or a single '99'.
|
||
Store the representative integer (mean of the range, rounded)."""
|
||
nums = [int(n) for n in re.findall(r"\d+", value or "")]
|
||
if not nums:
|
||
return None
|
||
if len(nums) == 1:
|
||
return nums[0]
|
||
return round(sum(nums[:2]) / 2)
|
||
|
||
|
||
def _parse_soy_mg(value: str) -> float | None:
|
||
"""Soy 'Maturity' is the RM expressed as a 2- or 3-digit code where
|
||
MG = value/10 for 2-digit codes ('21' -> 2.1, '50' -> 5.0) and
|
||
value/100 for 3-digit leading-zero codes ('008' -> 0.08). For a
|
||
range ('008 - 009') take the start value."""
|
||
m = re.match(r"\s*(\d+)", value or "")
|
||
if not m:
|
||
return None
|
||
raw = m.group(1)
|
||
n = int(raw)
|
||
if len(raw) >= 3:
|
||
return round(n / 100.0, 2)
|
||
return round(n / 10.0, 2)
|
||
|
||
|
||
def _bucket(crop: str, label: str) -> str:
    """Return the characteristics_groups label a chart row belongs to.

    Identity rows (maturity) map to ``""`` because they are stored as
    dedicated facts. Anything not recognised for the crop falls into
    ``"OTHER CHARACTERISTICS"``. Any crop other than ``"corn"`` is
    classified with the soybean tables.
    """
    key = label.lower().strip()
    if key in _IDENTITY_LABELS:
        return ""  # handled as a dedicated fact, not a generic item

    # (label set, group name) pairs, checked in order.
    if crop == "corn":
        routes = (
            (_CORN_DISEASE, "DISEASE RATINGS"),
            (_CORN_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_CORN_PLANT, "PLANT DESCRIPTION"),
            (_CORN_SOIL, "SOIL & PLACEMENT"),
            (_CORN_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    else:  # soybeans
        routes = (
            (_SOY_DISEASE, "DISEASE RATINGS"),
            (_SOY_AGRONOMIC, "AGRONOMIC CHARACTERISTICS"),
            (_SOY_PLANT, "PLANT DESCRIPTION"),
            (_SOY_HERBICIDE, "HERBICIDE TOLERANCE"),
        )
    for members, group in routes:
        if key in members:
            return group
    return "OTHER CHARACTERISTICS"
|
||
|
||
|
||
def _parse_chart(crop: str, chart) -> tuple[list[dict], list[tuple[str, str]]]:
    """Turn a ``ul.agronomy-chart`` element into grouped characteristics.

    Args:
        crop: Chunker crop value used to pick the label tables.
        chart: The parsed ``<ul>`` element; only its direct ``<li>``
            children are read.

    Returns:
        ``(groups, raw_pairs)``. ``groups`` is the characteristics_groups
        list in a fixed group order, omitting empty groups, with items in
        page order. ``raw_pairs`` is every ``(label, value)`` pair seen,
        identity rows included (used to pull RM/MG).
    """
    # Stable group order for rendering.
    display_order = (
        "AGRONOMIC CHARACTERISTICS",
        "DISEASE RATINGS",
        "PLANT DESCRIPTION",
        "SOIL & PLACEMENT",
        "HERBICIDE TOLERANCE",
        "OTHER CHARACTERISTICS",
    )
    by_group: dict[str, list[dict]] = {name: [] for name in display_order}
    pairs: list[tuple[str, str]] = []
    emitted: set[tuple[str, str]] = set()

    for row in chart.find_all("li", recursive=False):
        label_el = row.find("strong")
        if not label_el:
            continue
        label = _clean(label_el.get_text(" ", strip=True))
        if not label:
            continue
        value_el = row.find("span", class_="value")
        value = _clean(value_el.get_text(" ", strip=True)) if value_el else ""
        pairs.append((label, value))

        group = _bucket(crop, label)
        if not group:
            continue
        # The soy page repeats "Maturity" twice and those are dropped via
        # _IDENTITY_LABELS; any other exact repeat is de-duplicated here.
        fingerprint = (label.lower(), value.lower())
        if fingerprint in emitted:
            continue
        emitted.add(fingerprint)
        by_group[group].append({"characteristic": label, "value": value})

    groups = [
        {"label": name, "items": by_group[name]}
        for name in display_order
        if by_group[name]
    ]
    return groups, pairs
|
||
|
||
|
||
def parse_detail(http: RateLimitedSession, d: DiscoveredURL) -> StineVariety:
    """Fetch one product detail page and parse it into a ``StineVariety``.

    Args:
        http: Session used to fetch ``d.url``.
        d: The enumerated entry (URL, crop, trait slug, URL code).

    Returns:
        The parsed variety. ``groups`` is empty when the page has no
        agronomy chart.

    Raises:
        requests.HTTPError: if the detail page answers with an error status.
    """
    resp = http.get(d.url)
    resp.raise_for_status()
    page = BeautifulSoup(resp.text, "html.parser")

    # Product code: from the <h1>, falling back to the URL code segment.
    heading = page.find("h1")
    heading_text = _clean(heading.get_text(" ", strip=True)) if heading else ""
    code = _extract_code(heading_text, d.code)

    section = page.find("section", class_="agronomic-details")
    chart = section.find("ul", class_="agronomy-chart") if section else None
    if chart:
        groups, raw_pairs = _parse_chart(d.crop, chart)
    else:
        groups, raw_pairs = [], []

    # Maturity text comes from the first "Maturity" row on the page.
    mat_text = next(
        (value for label, value in raw_pairs if label.lower() == "maturity"),
        "",
    )
    is_corn = d.crop == "corn"
    rm = _parse_corn_maturity(mat_text) if is_corn else None
    mg = None if is_corn else _parse_soy_mg(mat_text)

    if mat_text:
        # Keep the verbatim maturity text as the first agronomic item so it
        # is retrievable alongside the parsed number; create the agronomic
        # group at the front when the chart produced none.
        maturity_item = {
            "characteristic": "Maturity (RM range)" if is_corn else "Maturity (RM)",
            "value": mat_text,
        }
        agronomic = next(
            (g for g in groups if g["label"] == "AGRONOMIC CHARACTERISTICS"),
            None,
        )
        if agronomic is not None:
            agronomic["items"].insert(0, maturity_item)
        else:
            groups.insert(0, {"label": "AGRONOMIC CHARACTERISTICS",
                              "items": [maturity_item]})

    trait = _slug_to_trait(d.trait_slug)
    if trait.lower() == "conventional":
        trait_stack = ["Conventional"]
    elif trait:
        trait_stack = [trait]
    else:
        trait_stack = []

    return StineVariety(
        source_key=f"stine-{code.lower()}",
        source_url=d.url,
        crop=d.crop,
        product_name=code,
        relative_maturity=rm,
        maturity_group=mg,
        trait_stack=trait_stack,
        positioning=None,
        groups=groups,
        sitemap_last_modified=d.lastmod,
    )
|
||
|
||
|
||
# --------------------------------------------------------------------- render
|
||
|
||
|
||
def render_markdown(v: StineVariety) -> str:
    """Render the markdown body for one variety.

    Layout: an H1 title, a bullet list of identity facts, a ``---`` rule,
    then one ``##`` section per characteristics group with one bullet per
    item. Empty item values are rendered as an em dash.
    """
    known_crops = {"corn": "Corn", "soybeans": "Soybeans"}
    crop_label = known_crops.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# Stine {v.product_name}",
        "",
        "- **Vendor:** Stine Seed Company (independent family-owned breeder, Adel, IA)",
        "- **Brand:** Stine",
        f"- **Crop:** {crop_label}",
    ]
    # Only the maturity figure that belongs to the crop is shown.
    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days (representative)")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Stine dealer network — Corn Belt (IA/IL/IN/MN/NE/MO etc.)",
        "",
        "---",
        "",
    ])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        for item in group["items"]:
            name = item["characteristic"]
            shown = item["value"] or "—"
            lines.append(f"- **{name}:** {shown}")
        lines.append("")
    return "\n".join(lines)
|
||
|
||
|
||
def write_variety(v: StineVariety, body_md: str) -> None:
    """Write ``<source_key>.json`` and ``<source_key>.md`` under CORPUS_DIR.

    The JSON sidecar is written FIRST and the markdown LAST: ``run`` treats
    an existing ``.md`` as "already scraped", so the markdown must only
    appear once the sidecar is safely on disk. Each file is written to a
    ``.tmp`` sibling and then renamed over the target, so an interrupted
    run never leaves a truncated file behind.

    Args:
        v: The parsed variety.
        body_md: The rendered markdown body.

    Raises:
        OSError: if the directory or either file cannot be written.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    def _atomic_write(target: Path, text: str) -> None:
        # Write to a sibling temp file, then rename it into place.
        tmp = target.with_name(target.name + ".tmp")
        tmp.write_text(text, encoding="utf-8")
        tmp.replace(target)

    sidecar = {
        "source": "stine",
        "source_key": v.source_key,
        "vendor": "Stine Seed Company",
        "brand": "Stine",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Stine dealer network (Corn Belt — IA/IL/IN/MN/NE/MO etc.)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _atomic_write(
        CORPUS_DIR / f"{v.source_key}.json",
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    # Markdown last: its presence is the completion marker for skip checks.
    _atomic_write(CORPUS_DIR / f"{v.source_key}.md", body_md)
|
||
|
||
|
||
# --------------------------------------------------------------------- pipeline
|
||
|
||
|
||
def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None, enumerate_via: str) -> int:
    """Enumerate the catalog, then fetch, parse and write each variety.

    Args:
        limit: Stop after this many enumerated entries have been visited
            (skipped entries count towards the limit); ``None`` = all.
        force: Re-fetch even when the markdown file already exists.
        only_crop: Restrict enumeration to one crop value.
        only_product: Restrict to one variety, given as source_key
            (``stine-9444-22``) or bare product code; case-insensitive.
        enumerate_via: ``"ajax"`` for the comparison endpoints; any other
            value uses the sitemap, falling back to ajax when it is empty.

    Returns:
        Process exit code: 0 on completion (individual failures are
        counted and logged, not fatal), 2 when ``only_product`` matched
        nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    if enumerate_via == "ajax":
        discovered = discover_ajax(http, only_crop=only_crop)
    else:
        discovered = discover_sitemap(http, only_crop=only_crop)
        if not discovered:
            log.warning("sitemap yielded nothing — falling back to ajax")
            discovered = discover_ajax(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        discovered = [d for d in discovered
                      if f"stine-{d.code.lower()}" == key
                      or d.code.lower() == key]
        if not discovered:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    total = len(discovered)
    for d in discovered:
        if limit is not None and processed >= limit:
            break
        processed += 1
        # NOTE(review): this key comes from the URL code, while parse_detail
        # builds the written file name from the <h1> code. If the two ever
        # differ the skip check below will not see the file — confirm.
        source_key = f"stine-{d.code.lower()}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, source_key)
            continue
        try:
            v = parse_detail(http, d)
        except requests.RequestException as exc:
            # Covers HTTP error statuses AND network failures that survived
            # the session's retries (ConnectionError, Timeout, ...), which
            # are not HTTPError and used to be reported as parse failures.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, total, source_key, exc)
            continue
        except Exception as exc:  # noqa: BLE001 — keep the run going
            counts["failed"] += 1
            # A parse failure is a bug in this scraper or a site change;
            # keep the traceback so it can be diagnosed from the log.
            log.error("[%d/%d] %s parse failed: %s",
                      processed, total, source_key, exc, exc_info=True)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no chart groups parsed (still writing identity)",
                        processed, total, source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, total, source_key, v.crop,
                 v.relative_maturity if v.crop == "corn" else v.maturity_group,
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"],
             counts["empty"], counts["failed"], total)
    return 0
|
||
|
||
|
||
# --------------------------------------------------------------------- CLI
|
||
|
||
|
||
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Stine scraper."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.stine",
        description="Scrape Stine Seed Company (independent Corn Belt breeder) — "
                    "corn + soybeans via sitemap enumeration + detail pages.",
    )
    # (flag, add_argument keyword arguments), registered in display order.
    options = (
        ("--limit", dict(
            type=int, default=None,
            help="Stop after processing N varieties (default: all).")),
        ("--force", dict(
            action="store_true",
            help="Re-fetch even if the markdown file already exists.")),
        ("--crop", dict(
            default=None, choices=sorted(CROP_PATHS),
            help="Limit to one crop (corn / soybeans).")),
        ("--product", dict(
            default=None,
            help="Process a single variety by source_key or product code.")),
        ("--enumerate", dict(
            dest="enumerate_via", default="sitemap",
            choices=["sitemap", "ajax"],
            help="Enumeration source (default: sitemap; ajax = full historical set).")),
        # Default log level comes from the LOG_LEVEL env var, else INFO.
        ("--log-level", dict(
            default=os.environ.get("LOG_LEVEL", "INFO"))),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    return parser
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, configure logging, run the scrape.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by ``run``.
    """
    options = _build_argparser().parse_args(argv)
    # Logs go to stderr.
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=options.log_level.upper(),
    )
    return run(
        limit=options.limit,
        force=options.force,
        only_crop=options.crop,
        only_product=options.product,
        enumerate_via=options.enumerate_via,
    )


if __name__ == "__main__":
    raise SystemExit(main())
|