84ad2b1de6
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
562 lines
21 KiB
Python
562 lines
21 KiB
Python
"""Burrus Seed scraper — independent family-owned company (Arenzville, IL).
|
|
|
|
Source: Burrus Hybrids ("Burrus Seed"), an independent family company
|
|
founded **1935** in Arenzville, Illinois — NOT owned by any of the
|
|
multinationals (Bayer / Corteva / Syngenta / BASF). It markets corn under
|
|
the **Burrus** and **Power Plus** brands and soybeans under the **Burrus**
|
|
and **DONMARIO** brands, sold through a dealer network across IL / IN / IA
|
|
/ MO / WI.
|
|
|
|
Unlike the ProHarvest scraper (which parses HTML detail pages), Burrus
|
|
publishes its full agronomic dataset through the **Seedware** catalog
|
|
widget's JSON-over-JSONP API (the backend for the product finder on
|
|
``burrusseed.com/products/{corn,soybeans}``). So this scraper does TWO
|
|
list calls and maps JSON fields straight into ``characteristics_groups``;
|
|
there is no per-variety page fetch.
|
|
|
|
Seedware API
|
|
------------
|
|
``GET https://burrus25.seedware.net/app/_queries/crop_varieties.php
|
|
?crop_pkey=101&callback=cb`` -> CORN (JSONP)
|
|
``crop_pkey=102`` -> SOYBEANS
|
|
|
|
Both require:
|
|
* a ``callback`` query param (WITHOUT it the endpoint returns ``[]``),
|
|
* a ``Referer: https://burrusseed.com/`` header.
|
|
The response is ``cb([...]);`` — strip the JSONP wrapper to get a JSON
|
|
array of ~38 corn + ~26 soy records. Each record has ~44 fields:
|
|
``id`` (variety code, e.g. ``8J697AM``), ``description`` (brand + code,
|
|
e.g. ``Power Plus 8J697AM``), ``pkey`` (Seedware row id), ``maturity``
|
|
(RM for corn / MG for soy, as a string like ``"97.00"`` / ``"2.00"``),
|
|
``released`` (year int), ``trait`` / ``trait_platform``, a per-record
|
|
brand in ``stat_corn_brand`` / ``stat_soybean_brand``, and many
|
|
``stat_*`` agronomic / disease / herbicide-tolerance ratings.
|
|
|
|
Rating scales (confirmed from the live data, Jun 2026)
|
|
------------------------------------------------------
|
|
* **Numeric agronomic + disease ratings: 1-10, 10 = best / most
|
|
tolerant** (observed values 4-10; standard Seedware/seed-industry
|
|
high-is-better scale). Soy agronomic stats arrive as ``"8.000"`` —
the trailing zeros are stripped to ``"8"``. ``NR`` / ``None`` /
blank / ``-`` / ``0`` = not rated and are SKIPPED (never coerced to a
value).
|
|
* **Herbicide tolerance + insect-protection packages: Yes / No**
|
|
(verbatim). ``glyphosate`` / ``glufosinate`` / ``2,4-D choline`` /
|
|
``FOPs`` / ``dicamba`` tolerances and the Bt insect packages
|
|
(corn borer / rootworm / etc.) are categorical Yes/No, not numeric.
|
|
* **Categorical agronomic notes** (corn-on-corn suitability, refuge
|
|
structure) pass through verbatim.
|
|
|
|
Output:
|
|
corpus/burrus/<source_key>.md
|
|
corpus/burrus/<source_key>.json
|
|
|
|
source_key: ``burrus-<id>`` lowercased + slugified, e.g.
|
|
``burrus-8j697am``. The variety ``id`` (the catalog code) is stable.
|
|
|
|
CLI:
|
|
python -m scrape.sources.burrus --crop corn --limit 2 --force
|
|
python -m scrape.sources.burrus --crop soybeans
|
|
python -m scrape.sources.burrus --force
|
|
python -m scrape.sources.burrus --product burrus-8j697am
|
|
|
|
ROBOTS / UA: burrusseed.com robots.txt blocks ~33 NAMED AI/scraper bots
|
|
(Scrapy, CCBot, Bytespider, Diffbot, ...) and declares ``Crawl-delay: 10``
|
|
+ ``Content-signal: ai-train=no``; ``User-agent: *`` is allowed. The
|
|
operator has chosen to include this source. We use a non-blacklisted UA
|
|
and honour the 10-second crawl delay (the API call count is tiny — two
|
|
list calls — so this is cheap).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import requests
|
|
|
|
# Version stamp recorded in every JSON sidecar ("scraper_version").
SCRAPER_VERSION = "0.1.0"

# NOT any blacklisted bot name — robots.txt allows User-agent: *.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

# Public site, and the Seedware catalog backend behind its product finder.
SITE = "https://burrusseed.com"
SEEDWARE = "https://burrus25.seedware.net"
API = SEEDWARE + "/app/_queries/crop_varieties.php"
# Sent as the Referer header on every request (the API requires it).
REFERER = SITE + "/"

# crop name (also the chunker crop value) ->
#     (Seedware crop_pkey, public product page slug).
CROP_PKEYS = {
    "corn": (101, "corn"),
    "soybeans": (102, "soybeans"),
}

# robots.txt declares Crawl-delay: 10 for burrusseed.com / seedware.net.
# Honour it — the catalog is only two list calls so this is cheap.
REQ_INTERVAL_SEC = 10.0

# Human-readable description of the rating scales; emitted verbatim into
# each markdown header and JSON sidecar.
RATING_SCALE_DIRECTION = " ".join([
    "numeric agronomic + disease ratings 1-10, 10=best/most-tolerant",
    "(observed 4-10; higher is better); NR/blank/0/'-' = not rated (omitted).",
    "Herbicide tolerances and Bt insect-protection packages are Yes/No",
    "(verbatim, not numeric). Corn-on-corn suitability and refuge structure",
    "are categorical.",
])
|
|
|
|
# ----- stat_* field -> (group label, human characteristic name) -----------
|
|
#
|
|
# Group labels match the chunker's buckets in rag/chunk.py:
|
|
# "DISEASE RATINGS" -> disease framing
|
|
# "AGRONOMIC CHARACTERISTICS" -> agronomic framing
|
|
# "HERBICIDE TOLERANCE" -> falls into the chunker's MANAGEMENT
|
|
# bucket ("HERBICIDE" is a recognised label),
|
|
# so it renders as "Management notes".
|
|
# Fields intentionally NOT mapped: stat_corn_brand / stat_soybean_brand
|
|
# (used for the per-record brand), stat_herbicide_tolerance (always blank
|
|
# in the live data — the per-chemistry stats carry the real signal).
|
|
|
|
# Disease ratings (numeric), keyed by Seedware stat_* field name.
DISEASE_FIELDS = {
    # corn
    "stat_gray_leaf_spot_tolerance": "Gray leaf spot tolerance",
    "stat_tar_spot_tolerance": "Tar spot tolerance",
    # soy
    "stat_brown_stem_rot": "Brown stem rot (BSR) tolerance",
    "stat_sds": "Sudden death syndrome (SDS) tolerance",
    "stat_phytophthora_root_rot": "Phytophthora root rot tolerance",
    "stat_prr_phytophthora_root_rot": "Phytophthora root rot (PRR) tolerance",
}

# Agronomic ratings — numeric 1-10 (corn) and "8.000"-style (soy).
AGRONOMIC_NUMERIC_FIELDS = {
    # corn
    "stat_drought_tolerance": "Drought tolerance",
    "stat_greensnap_tolerance": "Greensnap tolerance",
    "stat_root_strength": "Root strength",
    "stat_stalk_strength": "Stalk strength",
    "stat_standability": "Standability",
    "stat_black_cutworm": "Black cutworm tolerance",
    # soy
    "stat_emergence": "Emergence",
    "stat_canopy_width": "Canopy width",
    "stat_plant_height": "Plant height",
}

# Agronomic categorical / Yes-No notes (insect protection + placement).
AGRONOMIC_CATEGORICAL_FIELDS = {
    "stat_corn_corn": "Corn-on-corn suitability",
    "stat_refuge": "Refuge structure",
    "stat_corn_borer": "Corn borer protection (Bt)",
    "stat_corn_rootworm": "Corn rootworm protection (Bt)",
    "stat_corn_earworm": "Corn earworm protection (Bt)",
    "stat_nematode": "Nematode protection",
    "stat_wireworm": "Wireworm protection",
}

# Herbicide tolerance — Yes/No per chemistry.
HERBICIDE_FIELDS = {
    "stat_glyphosate_tolerance": "Glyphosate tolerance",
    "stat_glufosinate_tolerance": "Glufosinate tolerance",
    "stat_24d_choline_tolerance": "2,4-D choline tolerance",
    "stat_dicamba_tolerance": "Dicamba tolerance",
    "stat_fops_tolerance": "FOPs (fop herbicide) tolerance",
}

# Numeric agronomic ratings first, categorical notes after them, in one map.
_AGRONOMIC_FIELDS = dict(AGRONOMIC_NUMERIC_FIELDS)
_AGRONOMIC_FIELDS.update(AGRONOMIC_CATEGORICAL_FIELDS)

# Output order of the characteristic groups: (group label, field map).
GROUP_ORDER = [
    ("DISEASE RATINGS", DISEASE_FIELDS),
    ("AGRONOMIC CHARACTERISTICS", _AGRONOMIC_FIELDS),
    ("HERBICIDE TOLERANCE", HERBICIDE_FIELDS),
]
|
|
|
|
# Values that mean "not rated" — never coerced into a chunk.
|
|
_NOT_RATED = {"", "-", "--", "n/a", "na", "nr", "none", "0", "0.000", "0.00"}
|
|
|
|
# Repository root: two directory levels above the folder holding this file.
REPO_ROOT = Path(__file__).resolve().parents[2]

# Corpus location: $CORPUS_ROOT when set and non-empty, else <repo>/corpus.
_corpus_override = os.environ.get("CORPUS_ROOT")
if _corpus_override:
    CORPUS_ROOT = Path(_corpus_override)
else:
    CORPUS_ROOT = Path(REPO_ROOT / "corpus")
# All Burrus output (markdown + JSON sidecars) lands in this folder.
CORPUS_DIR = CORPUS_ROOT / "burrus"

log = logging.getLogger("scrape.burrus")
|
|
|
|
|
|
# --------------------------------------------------------------------- HTTP
|
|
|
|
|
|
class RateLimitedSession:
    """Polite HTTP session with a minimum request interval and backoff.

    Honours burrusseed.com's ``Crawl-delay: 10`` (>= 10 s between requests
    to seedware.net / burrusseed.com). The Burrus catalog is two list calls
    total, so the delay is cheap.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying ``requests.Session`` with default headers.

        Args:
            interval: Minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.s.headers["Referer"] = REFERER
        self.s.headers["Accept"] = "*/*"
        self.interval = interval
        # Monotonic timestamp of the previous request; 0.0 = none sent yet.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last call."""
        delta = time.monotonic() - self._last
        # The very first request (``_last`` still 0.0) goes out immediately.
        if self._last and delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a rate-limited request, retrying transient failures.

        Network errors (``requests.RequestException``) and HTTP 429 / 5xx
        responses are retried with exponential backoff plus jitter, capped
        at 30 s. A ``Retry-After`` header is honoured when it is a plain
        integer number of seconds; any other form falls back to the
        exponential backoff.

        Args:
            method: HTTP method name, e.g. ``"GET"``.
            url: Absolute URL to request.
            max_retries: Total number of attempts (must be >= 1).
            timeout: Per-attempt timeout in seconds.
            **kw: Passed through to ``requests.Session.request``.

        Returns:
            The first non-retryable response, or — when every attempt got a
            retryable status — the response of the final attempt, so the
            caller can inspect it or call ``raise_for_status()``.

        Raises:
            ValueError: If ``max_retries`` is less than 1.
            requests.RequestException: If the final attempt failed with a
                network error.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(max_retries):
            final_attempt = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Track only the most recent outcome: a response from an
                # earlier attempt must not outlive this failure.
                last_exc, resp = exc, None
                if final_attempt:
                    log.warning("network error on %s %s: %s — giving up",
                                method, url, exc)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            # A response arrived, so any earlier network error is stale.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final_attempt:
                    # No point sleeping when no further attempt follows.
                    log.warning("HTTP %d on %s %s — giving up",
                                resp.status_code, method, url)
                    break
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp

        if last_exc is not None:
            raise last_exc
        # The loop ran at least once and did not end on an exception, so
        # ``resp`` holds the final (retryable-status) response.
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
|
|
|
|
|
|
def _strip_jsonp(text: str) -> Any:
|
|
"""Strip a ``cb( ... );`` JSONP wrapper and parse the JSON inside."""
|
|
s = text.strip()
|
|
m = re.match(r"^[^(]*\((.*)\)\s*;?\s*$", s, re.S)
|
|
body = m.group(1) if m else s
|
|
return json.loads(body)
|
|
|
|
|
|
# --------------------------------------------------------------------- model
|
|
|
|
|
|
@dataclass
class BurrusVariety:
    """One catalog variety, mapped from a Seedware record.

    Instances are built by ``map_record`` and consumed by the markdown
    renderer and the JSON sidecar writer.
    """

    # Corpus file stem, e.g. "burrus-8j697am".
    source_key: str
    # Chunker crop value: "corn" or "soybeans".
    crop: str
    # Display name, e.g. "Power Plus 8J697AM".
    product_name: str
    # Catalog code, e.g. "8J697AM".
    product_id: str
    # "Burrus" | "Power Plus" | "DONMARIO".
    brand: str
    # Corn relative maturity; left as None for soybeans.
    relative_maturity: int | None = None
    # Soybean maturity group; left as None for corn.
    maturity_group: float | None = None
    release_year: int | None = None
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    # Characteristic groups: [{"label": ..., "items": [...]}, ...].
    groups: list[dict] = field(default_factory=list)
    source_url: str = ""
|
|
|
|
|
|
# --------------------------------------------------------------------- fetch
|
|
|
|
|
|
def fetch_crop(http: RateLimitedSession, crop_pkey: int) -> list[dict]:
    """Fetch and decode the JSONP variety array for one ``crop_pkey``.

    Args:
        http: Session used to issue the GET request.
        crop_pkey: Seedware crop key (see ``CROP_PKEYS``).

    Returns:
        The decoded JSON array of variety records.

    Raises:
        requests.HTTPError: If the response status is an error.
        ValueError: If the body is not valid JSON or is not a JSON array.
    """
    response = http.get(f"{API}?crop_pkey={crop_pkey}&callback=cb")
    response.raise_for_status()
    payload = _strip_jsonp(response.text)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"unexpected payload for crop_pkey={crop_pkey}: "
                     f"{type(payload).__name__}")
|
|
|
|
|
|
# --------------------------------------------------------------------- mapping
|
|
|
|
|
|
def _slug(s: str) -> str:
|
|
s = (s or "").strip().lower()
|
|
s = re.sub(r"[^a-z0-9]+", "-", s)
|
|
return re.sub(r"-+", "-", s).strip("-")
|
|
|
|
|
|
def _is_rated(v: Any) -> bool:
    """Return True when ``v`` carries an actual rating.

    ``None``, any token listed in ``_NOT_RATED`` (compared stripped and
    lower-cased) and any numeric zero count as "not rated". The zero check
    is numeric rather than textual so that spellings missing from the token
    list — ``"0.0"``, ``"0.0000"``, float ``0.0`` — are caught as well.
    """
    if v is None:
        return False
    token = str(v).strip().lower()
    if token in _NOT_RATED:
        return False
    # Plain decimal numbers only; "Yes"/"No"/"False" etc. are never parsed.
    if re.fullmatch(r"-?\d+(?:\.\d+)?", token) and float(token) == 0:
        return False
    return True
|
|
|
|
|
|
def _clean_value(v: Any) -> str:
|
|
"""Normalise a stat value for display. Numeric soy stats arrive as
|
|
'8.000' — strip the trailing zeros to '8'. Everything else passes
|
|
through verbatim (Yes / No / Suitable / Integrated refuge / ...)."""
|
|
s = str(v).strip()
|
|
# numeric like "8.000" / "8.00" / "97.00" -> "8" / "97"
|
|
if re.fullmatch(r"-?\d+(?:\.\d+)?", s):
|
|
f = float(s)
|
|
return str(int(f)) if f == int(f) else (f"{f:g}")
|
|
return s
|
|
|
|
|
|
def _maturity(rec: dict, crop: str) -> tuple[int | None, float | None]:
|
|
raw = rec.get("maturity")
|
|
if raw is None or str(raw).strip() == "":
|
|
return None, None
|
|
try:
|
|
f = float(str(raw).strip())
|
|
except ValueError:
|
|
return None, None
|
|
if crop == "corn":
|
|
return int(round(f)), None
|
|
return None, round(f, 1)
|
|
|
|
|
|
def _brand(rec: dict) -> str:
|
|
"""Per-record brand. corn -> stat_corn_brand (Burrus / Power Plus);
|
|
soy -> stat_soybean_brand (Burrus / DONMARIO). Falls back to the
|
|
leading token of the description, else 'Burrus'."""
|
|
b = rec.get("stat_corn_brand") or rec.get("stat_soybean_brand")
|
|
if b and str(b).strip():
|
|
return str(b).strip()
|
|
desc = (rec.get("description") or "").strip()
|
|
code = (rec.get("id") or "").strip()
|
|
if desc and code and desc.lower().endswith(code.lower()):
|
|
lead = desc[: len(desc) - len(code)].strip()
|
|
if lead:
|
|
return lead
|
|
return "Burrus"
|
|
|
|
|
|
def _traits(rec: dict) -> list[str]:
|
|
out: list[str] = []
|
|
for key in ("trait", "trait_platform"):
|
|
v = rec.get(key)
|
|
if v and str(v).strip():
|
|
# strip stray trailing punctuation seen in the data
|
|
# ("Conventional." / "AM`")
|
|
t = str(v).strip().rstrip(".`")
|
|
if t and t not in out:
|
|
out.append(t)
|
|
return out
|
|
|
|
|
|
def _build_groups(rec: dict) -> list[dict]:
    """Map a record's ``stat_*`` fields into ordered characteristic groups.

    Walks ``GROUP_ORDER``; within each group every rated field becomes a
    ``{"characteristic", "value"}`` item. Groups without any rated field
    are left out.
    """
    built: list[dict] = []
    for label, fields in GROUP_ORDER:
        rated = [
            {"characteristic": human, "value": _clean_value(rec.get(stat_key))}
            for stat_key, human in fields.items()
            if _is_rated(rec.get(stat_key))
        ]
        if rated:
            built.append({"label": label, "items": rated})
    return built
|
|
|
|
|
|
def map_record(rec: dict, crop: str) -> BurrusVariety:
    """Convert one Seedware record into a ``BurrusVariety``.

    Args:
        rec: Raw record from the catalog API.
        crop: Key of ``CROP_PKEYS`` the record was fetched for.

    Returns:
        The mapped variety. ``source_key`` is derived from the variety
        ``id``; when that is blank it falls back to ``pkey-<pkey>`` and
        then to the description.

    Raises:
        KeyError: If ``crop`` is not a key of ``CROP_PKEYS``.
    """
    # str() keeps numeric ids/descriptions from crashing .strip().
    code = str(rec.get("id") or "").strip()
    description = str(rec.get("description") or "").strip()
    pkey = rec.get("pkey")
    key_seed = code or (f"pkey-{pkey}" if pkey else description)
    source_key = f"burrus-{_slug(key_seed)}"
    # A whitespace-only description falls through to the code.
    name = description or code or key_seed
    rm, mg = _maturity(rec, crop)
    page_slug = CROP_PKEYS[crop][1]

    # ``released`` is normally an int; also accept an all-digit string and
    # reject bool, which is an int subclass but never a year.
    released = rec.get("released")
    release_year: int | None = None
    if isinstance(released, bool):
        release_year = None
    elif isinstance(released, int):
        release_year = released
    elif isinstance(released, str):
        digits = released.strip()
        if digits.isascii() and digits.isdigit():
            release_year = int(digits)

    return BurrusVariety(
        source_key=source_key,
        crop=crop,
        product_name=name,
        product_id=code or name,
        brand=_brand(rec),
        relative_maturity=rm,
        maturity_group=mg,
        release_year=release_year,
        trait_stack=_traits(rec),
        # The Seedware records carry no marketing blurb; leave positioning
        # null rather than fabricate one.
        positioning=None,
        groups=_build_groups(rec),
        source_url=f"{SITE}/products/{page_slug}",
    )
|
|
|
|
|
|
# --------------------------------------------------------------------- render
|
|
|
|
|
|
def render_markdown(v: BurrusVariety) -> str:
    """Render a variety as a markdown document.

    Layout: an H1 title, a bullet list of identity fields, a ``---`` rule,
    then one H2 section per characteristic group with one bullet per item.
    Lines are joined with ``\\n``.
    """
    if v.crop == "corn":
        crop_label = "Corn"
    elif v.crop == "soybeans":
        crop_label = "Soybeans"
    else:
        crop_label = v.crop.title()

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** Burrus Seed (Burrus Hybrids — independent family "
        "company, Arenzville, IL, since 1935)",
        f"- **Brand:** {v.brand}",
        f"- **Crop:** {crop_label}",
    ]

    # Maturity is crop-specific: days for corn, group number for soybeans.
    if v.crop == "corn" and v.relative_maturity is not None:
        lines.append(f"- **Relative maturity:** {v.relative_maturity} days")
    if v.crop == "soybeans" and v.maturity_group is not None:
        lines.append(f"- **Maturity group:** {v.maturity_group}")
    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    if v.release_year:
        lines.append(f"- **Released:** {v.release_year}")

    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** Burrus dealer network "
        "(IL / IN / IA / MO / WI)",
        "",
        "---",
        "",
    ])

    for group in v.groups:
        lines.append(f"## {group['label'].title()}")
        lines.append("")
        lines.extend(
            f"- **{item['characteristic']}:** {item['value'] or '—'}"
            for item in group["items"]
        )
        lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def _write_text_atomic(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` via a sibling temp file and a rename."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    tmp.replace(path)


def write_variety(v: BurrusVariety, body_md: str) -> None:
    """Write the JSON sidecar and the markdown file for one variety.

    Both files go to ``CORPUS_DIR`` as ``<source_key>.json`` and
    ``<source_key>.md``. The sidecar is written first and the markdown
    last: ``run`` treats an existing markdown file as "already scraped",
    so the markdown must not appear before its sidecar is complete. Each
    file is written to a temp file and renamed into place, so an
    interrupted run does not leave a truncated file behind.

    Args:
        v: The variety to persist.
        body_md: Rendered markdown for ``v``.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    sidecar = {
        "source": "burrus",
        "source_key": v.source_key,
        "vendor": "Burrus Seed",
        "brand": v.brand,
        "product_name": v.product_name,
        "product_id": v.product_id,
        "crop": v.crop,
        "release_year": v.release_year,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": None,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": [],
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "Burrus dealer network (IL/IN/IA/MO/WI)",
             "agronomist": None, "agronomist_email": None, "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        CORPUS_DIR / f"{v.source_key}.json",
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n")
    # Markdown last: its presence marks the variety as fully written.
    _write_text_atomic(CORPUS_DIR / f"{v.source_key}.md", body_md)
|
|
|
|
|
|
# --------------------------------------------------------------------- pipeline
|
|
|
|
|
|
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Fetch the catalog, map every record and write the corpus files.

    Args:
        limit: Stop after processing this many varieties (skipped ones
            count); ``None`` processes all.
        force: Re-write varieties whose markdown file already exists.
        only_crop: Restrict the run to one key of ``CROP_PKEYS``.
        only_product: Restrict the run to one variety, matched against the
            source_key, the id, or the slug of the id.

    Returns:
        Process exit code: ``0`` on success, ``1`` when no crop could be
        fetched at all, ``2`` when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    crops = [only_crop] if only_crop else list(CROP_PKEYS)
    records: list[tuple[str, dict]] = []
    fetch_failures = 0
    for crop in crops:
        crop_pkey = CROP_PKEYS[crop][0]
        try:
            raw = fetch_crop(http, crop_pkey)
        # RequestException covers HTTPError as well as the connection
        # errors and timeouts re-raised once the retries are exhausted.
        except (requests.RequestException, ValueError) as exc:
            fetch_failures += 1
            log.error("fetch failed for crop=%s (pkey=%d): %s",
                      crop, crop_pkey, exc)
            continue
        log.info("crop=%-9s pkey=%d: %d records", crop, crop_pkey, len(raw))
        for rec in raw:
            if not isinstance(rec, dict):
                # map_record needs a mapping; skip anything else.
                log.warning("crop=%s: skipping non-object record of type %s",
                            crop, type(rec).__name__)
                continue
            records.append((crop, rec))

    if fetch_failures == len(crops):
        # Nothing was fetched; do not report success.
        log.error("no catalog data could be fetched")
        return 1

    varieties = [map_record(rec, crop) for crop, rec in records]

    if only_product:
        key = only_product.lower()
        varieties = [v for v in varieties
                     if v.source_key == key or v.product_id.lower() == key
                     or _slug(v.product_id) == _slug(key)]
        if not varieties:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0}
    processed = 0
    total = len(varieties)
    for v in varieties:
        if limit is not None and processed >= limit:
            break
        processed += 1
        md_path = CORPUS_DIR / f"{v.source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, total, v.source_key)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no rating groups (still writing identity)",
                        processed, total, v.source_key)
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks so a maturity group of 0.0 is still shown.
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | brand=%s crop=%s rm/mg=%s groups=%d "
                 "traits=%s", processed, total, v.source_key, v.brand, v.crop,
                 maturity if maturity is not None else "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             total)
    return 0
|
|
|
|
|
|
# --------------------------------------------------------------------- CLI
|
|
|
|
|
|
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the Burrus scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (default taken from ``$LOG_LEVEL``, else ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.burrus",
        description="Scrape Burrus Seed (independent family company, "
                    "Arenzville IL) — corn / soybeans via the Seedware "
                    "JSON-over-JSONP catalog API.",
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).")
    parser.add_argument(
        "--force", action="store_true",
        help="Re-write even if the markdown file already exists.")
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_PKEYS),
        help="Limit to one crop (corn / soybeans).")
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or id.")
    parser.add_argument(
        "--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return parser
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, set up logging, run the scrape.

    Args:
        argv: Argument list; ``None`` makes argparse read ``sys.argv``.

    Returns:
        The exit code produced by ``run``.
    """
    opts = _build_argparser().parse_args(argv)
    # Log to stderr at the requested level.
    logging.basicConfig(
        stream=sys.stderr,
        level=opts.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    return run(
        limit=opts.limit,
        force=opts.force,
        only_crop=opts.crop,
        only_product=opts.product,
    )
|
|
|
|
|
|
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())
|