Files
claude 84ad2b1de6
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Add 4 independent seed brands: Latham + Stine + 1st Choice + Burrus (+623 varieties) (#17)
Co-authored-by: claude <claude@jpaul.io>
Co-committed-by: claude <claude@jpaul.io>
2026-06-04 21:58:07 -04:00

672 lines
27 KiB
Python

"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).
Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
seed company in Rushville, Indiana, serving the Eastern Corn Belt
(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
that is out of scope for the row-crop advisor).
Discovery is by **sitemap**, NOT the WP REST API: the catalog custom
post types (corn-hybrids / soybeans / wheat) are NOT exposed to
``/wp-json/`` (every variety route returns ``rest_no_route``). Instead we
fetch ``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
per-crop child sitemaps:
- ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/`` (~52 URLs)
- ``/soybeans-sitemap.xml`` -> ``/soybeans/<slug>/`` (~22 URLs)
- ``/wheat-sitemap.xml`` -> ``/wheat/<slug>/`` (~4 URLs)
robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
between requests.
Detail-page DOM (server-rendered, no JS needed for the text):
* Product name: the second ``<h1>`` inside ``article.content`` (the
first is the site logo "1st Choice Seeds").
* Corn — three ``<h2>`` sections + a side table:
- "Hybrid Characteristics": a single ``<p>`` of ``label • value``
lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
Placement, Root Rating, Stalk Rating, Foliar Health, Drydown,
Ear Length/Girth/Flex, Test Weight). Some hybrids only publish
Seedling Vigor (genuinely thin pages — still written).
- "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
(the numeric 0-10 bars are drawn client-side by d3 and are NOT
in the HTML). The legend IS the scale: 0-4 Below Average … 9-10
Superior, so higher = better.
- "Management Tips": ``label: value`` lines (Corn-On-Corn,
Productivity / soil guidance, Silage Rating).
- A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
the Low/Medium/High recommended planting populations.
* Soybeans — three ``<h2>`` sections:
- "Field Notes": a ``<ul>`` of strengths (often includes SCN
source / PRR gene call-outs).
- "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
- "Variety Description": ``div`` blocks of ``<b>Label:</b> value``
pairs (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower
Color, Pubescence, Pod, Hilum).
* Wheat — thin (title + date only; wheat is private-label). We still
write an identity record so the variety is discoverable.
Rating scale: the published legend is **0-10, higher = better**
("Below Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8,
Superior 9-10"). 1st Choice publishes the *qualitative* word
(Excellent / Very Good / …) in the HTML — those map directly onto that
legend — while the numeric bar is d3-rendered and absent from the
markup. NA / blank = not rated.
Output:
corpus/first_choice/<source_key>.md
corpus/first_choice/<source_key>.json
source_key: ``firstchoice-<slug>`` lowercased, e.g.
``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.
CLI:
python -m scrape.sources.first_choice --crop corn --limit 5
python -m scrape.sources.first_choice --force
python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
# Version string recorded in every JSON sidecar ("scraper_version").
SCRAPER_VERSION = "0.1.0"
# Descriptive User-Agent sent on every request made through RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://www.1stchoiceseeds.com"
SITEMAP_INDEX = f"{BASE}/sitemap.xml"
# Per-crop child sitemap -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. The
# cover-crops sitemap is intentionally omitted (out of scope for the
# row-crop advisor).
CROP_SITEMAPS = {
    "corn": "corn-hybrids-sitemap.xml",
    "soybeans": "soybeans-sitemap.xml",
    "wheat": "wheat-sitemap.xml",
}
# URL path prefix that confirms a sitemap entry is a variety detail page
# (vs. a category/archive page that can sneak into a child sitemap).
CROP_PATH = {
    "corn": "/corn-hybrids/",
    "soybeans": "/soybeans/",
    "wheat": "/wheat/",
}
# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple min.
REQ_INTERVAL_SEC = 1.2
# Human-readable description of the rating legend; emitted verbatim into
# the markdown header and the JSON sidecar ("_scale_direction").
RATING_SCALE_DIRECTION = (
    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
    "publishes the qualitative word in HTML (the numeric bar is "
    "d3-rendered, not in markup); blank/NA = not rated"
)
# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}
# Trait-suffix -> human label, looked up per "-"-separated slug token.
# Best-effort: `_trait_stack` only emits tokens present in this table, so
# an unmapped suffix is skipped (it is NOT title-cased or passed through).
# The table is shared across crops; the per-crop grouping below is for
# readability only and is not enforced at lookup time.
TRAIT_LABELS = {
    # corn
    "vt2p": "VT Double PRO (VT2P)",
    "gt": "Glyphosate Tolerant (GT)",
    "c": "Conventional",
    "pc": "PowerCore (PC)",
    "tre": "Trecepta (TRE)",
    "ss": "SmartStax (SS)",
    "v": "VT (V)",
    "dv": "Double VT (DV)",
    "aa": "Agrisure Artesian (AA)",
    # soybeans
    "en": "Enlist E3 (EN)",
    "xf": "XtendFlex (XF)",
    "sts": "STS",
    # wheat
    "b": "Bin-run / branded (B)",
    "s": "Soft (S)",
}
# Output location: $CORPUS_ROOT if set (and non-empty), otherwise
# <repo root>/corpus, where the repo root is two directories above this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "first_choice"
log = logging.getLogger("scrape.first_choice")
# --------------------------------------------------------------------- HTTP
class RateLimitedSession:
    """Polite ``requests`` session with a minimum request spacing and backoff.

    The 1st Choice row-crop catalog is small (~78 detail pages + 4
    sitemaps) so 1.2 s/req still finishes in a couple minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and set the descriptive User-Agent.

        Args:
            interval: minimum number of seconds between two requests.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep until ``interval`` seconds have passed since the last request."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying network errors and 429/5xx responses.

        Args:
            method: HTTP method name passed to ``requests``.
            url: absolute URL to fetch.
            max_retries: total number of attempts; values below 1 are
                treated as a single attempt.
            timeout: per-attempt timeout in seconds.
            **kw: forwarded to ``requests.Session.request``.

        Returns:
            The first response that is not 429/5xx, or, when every attempt
            was retryable, the response of the final attempt (the caller
            decides via ``raise_for_status``).

        Raises:
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        # At least one attempt, so the function can never fall through
        # without either a response or an exception to report.
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Track only the most recent outcome: a stale response
                # from an earlier attempt must not mask this failure.
                last_exc, resp = exc, None
                if is_last:
                    log.warning("network error on %s %s: %s — giving up "
                                "after %d attempts", method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so any earlier network error is obsolete.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    # No point sleeping a backoff nobody will use.
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured;
                # an HTTP-date value falls back to exponential backoff.
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if resp is None:
            raise last_exc if last_exc is not None else RuntimeError(
                "no request attempt was made")
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
# --------------------------------------------------------------------- model
@dataclass
class FCVariety:
    """One scraped 1st Choice variety, as filled in by ``parse_detail``.

    ``render_markdown`` and ``write_variety`` read these fields to produce
    the ``.md`` body and the ``.json`` sidecar.
    """
    # "firstchoice-<slug>"; also the output file stem.
    source_key: str
    # Detail-page URL taken from the crop sitemap.
    source_url: str
    crop: str  # chunker value: corn / soybeans / wheat
    product_name: str = ""  # "FC 8455 VT2P"
    relative_maturity: int | None = None  # corn (days)
    maturity_group: float | None = None  # soy
    # Never assigned by the parsers in this module; stays None unless a
    # caller sets it.
    wheat_class: str | None = None  # wheat
    # Human-readable trait labels derived from the slug.
    trait_stack: list[str] = field(default_factory=list)
    # Soy only: first Field Notes bullet, when one exists.
    positioning: str | None = None
    # Soy only: Field Notes bullets.
    strengths: list[str] = field(default_factory=list)
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    # Raw <lastmod> text from the sitemap entry, if present.
    sitemap_last_modified: str | None = None
# --------------------------------------------------------------------- discovery (sitemaps)
# Sitemap XML is parsed with regexes rather than an XML parser.
# <loc>…</loc> payload, tolerating surrounding whitespace and an optional
# <![CDATA[…]]> wrapper.
_LOC_RE = re.compile(r"<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</loc>",
                     re.IGNORECASE | re.DOTALL)
# One <url>…</url> entry, so <loc> and <lastmod> can be paired per entry.
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
# <lastmod>…</lastmod> payload, same whitespace/CDATA tolerance as _LOC_RE.
_LASTMOD_RE = re.compile(r"<lastmod>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</lastmod>",
                         re.IGNORECASE | re.DOTALL)
def _slug_from_url(url: str) -> str:
return url.rstrip("/").rsplit("/", 1)[-1].lower()
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return ``[{crop, url, slug, lastmod}]`` for in-scope row-crop varieties.

    Walks the per-crop child sitemaps under /sitemap.xml. Each known child
    sitemap is fetched directly (their names are stable All-in-One-SEO
    conventions) rather than trusting the index ordering, but the index is
    still consulted so a renamed sitemap produces a warning.

    Args:
        http: rate-limited session used for every fetch.
        only_crop: when set, restrict discovery to that ``CROP_SITEMAPS`` key.

    Returns:
        One record per unique slug per crop, in sitemap order.

    Raises:
        requests.RequestException: if a child sitemap cannot be fetched or
            answers with an error status other than 404 (a 404 is logged
            and that crop is skipped). A failure to read the *index* is
            only logged.
    """
    # Pull the sitemap index once so we can warn if a crop sitemap is
    # missing/renamed (defensive; we still target the known names).
    index_locs: set[str] = set()
    try:
        idx = http.get(SITEMAP_INDEX)
        idx.raise_for_status()
        index_locs = {m.strip() for m in _LOC_RE.findall(idx.text)}
    except requests.RequestException as exc:
        log.warning("could not read sitemap index %s: %s (continuing with "
                    "known child sitemap names)", SITEMAP_INDEX, exc)
    records: list[dict] = []
    for crop, child in CROP_SITEMAPS.items():
        if only_crop and crop != only_crop:
            continue
        child_url = f"{BASE}/{child}"
        if index_locs and child_url not in index_locs:
            log.warning("crop sitemap %s not listed in the index — site may "
                        "have renamed it; trying anyway", child_url)
        r = http.get(child_url)
        if r.status_code == 404:
            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
            continue
        r.raise_for_status()
        prefix = CROP_PATH[crop]
        seen: set[str] = set()
        n = 0
        for block in _URL_BLOCK_RE.findall(r.text):
            loc_m = _LOC_RE.search(block)
            if not loc_m:
                continue
            url = loc_m.group(1).strip()
            _, found, tail = url.partition(prefix)
            if not found:
                continue  # category/archive page leaked into the sitemap
            if not tail.strip("/"):
                # The crop archive itself (".../corn-hybrids/") contains
                # the prefix but has no variety slug after it; without
                # this check it would be recorded as a variety whose slug
                # is the archive name.
                continue
            slug = _slug_from_url(url)
            if not slug or slug in seen:
                continue
            seen.add(slug)
            lm_m = _LASTMOD_RE.search(block)
            records.append({
                "crop": crop,
                "url": url,
                "slug": slug,
                "lastmod": lm_m.group(1).strip() if lm_m else None,
            })
            n += 1
        log.info("crop sitemap %-22s (%s): %d varieties", child, crop, n)
    log.info("total varieties discovered: %d", len(records))
    return records
# --------------------------------------------------------------------- detail parse
def _clean(s: str) -> str:
return re.sub(r"\s+", " ", s or "").strip()
def _direct_text(el: Tag) -> str:
    """Return the cleaned text of *el*'s immediate string children only.

    Text nested inside child tags is ignored.
    """
    own_strings = [child for child in el.children
                   if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))
def _br_lines(el: Tag) -> list[str]:
    """Return the non-blank, stripped text lines of *el*, split at ``<br>``.

    The element is only read, never modified, so the caller's tree (used
    by other parsers) stays intact. ``get_text("\\n")`` puts a newline
    between every pair of adjacent text nodes, and a ``<br>`` always sits
    between two text nodes, so no ``<br>`` substitution is needed to get
    one line per ``<br>``-separated run. Note that text inside nested
    inline tags is likewise placed on its own line.
    """
    return [ln.strip() for ln in el.get_text("\n").split("\n") if ln.strip()]
def _product_name(article: Tag, slug: str) -> str:
    """Pick the variety name from the ``<h1>`` headings inside *article*.

    The first non-empty ``<h1>`` whose text is not the site-logo heading
    "1st Choice Seeds" (compared case-insensitively) wins. When no such
    heading exists, the slug is upper-cased with dashes turned into spaces.
    """
    heading_texts = (_clean(h1.get_text(" ", strip=True))
                     for h1 in article.find_all("h1"))
    name = next((text for text in heading_texts
                 if text and text.lower() != "1st choice seeds"), None)
    if name is not None:
        return name
    return slug.upper().replace("-", " ")
def _trait_stack(slug: str, crop: str) -> list[str]:
    """Map the slug's dash-separated tokens to human-readable trait labels.

    Every token (lowercased) that is a key of ``TRAIT_LABELS`` contributes
    its label, e.g. ``fc-8455-vt2p`` -> VT2P label and ``fb-3545-c-sts`` ->
    Conventional + STS. Tokens not in the table, such as the model
    identifier (fc-8455 / fb-2733 / fw-2035 / 20rw36), are skipped. Labels
    keep left-to-right slug order and repeats are dropped. *crop* is
    accepted for call-site symmetry but is not consulted.
    """
    tokens = (part.lower() for part in slug.split("-"))
    labels = [TRAIT_LABELS[tok] for tok in tokens if tok in TRAIT_LABELS]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(labels))
def _parse_corn(article: Tag, v: FCVariety) -> None:
    """Fill *v* with corn data scraped from *article*.

    Reads the "Hybrid Characteristics" and "Management Tips" paragraphs
    plus every table row (Relative Maturity / Degree Days / planting
    populations), then appends the non-empty groups to ``v.groups`` in the
    order agronomic, disease, management. Also sets
    ``v.relative_maturity`` when the table publishes a number.
    """
    agronomic: list[dict] = []
    disease: list[dict] = []
    management: list[dict] = []
    headings = article.find_all("h2")

    def section_paragraph(title: str):
        # The section body must be a <p> that directly follows the <h2>.
        heading = next((h for h in headings
                        if _clean(h.get_text()) == title), None)
        if heading is None:
            return None
        following = heading.find_next_sibling()
        if following is None or following.name != "p":
            return None
        return following

    # Hybrid Characteristics: "label • value" lines ("label: value" as a
    # fallback; a line with neither separator becomes a bare label).
    traits_p = section_paragraph("Hybrid Characteristics")
    if traits_p is not None:
        for line in _br_lines(traits_p):
            separator = "•" if "•" in line else ":"
            raw_label, _, raw_value = line.partition(separator)
            label = _clean(raw_label)
            rating = _clean(raw_value)
            if not label:
                continue
            entry = {"characteristic": label, "value": rating}
            bucket = disease if label.lower() in _CORN_DISEASE_LABELS else agronomic
            bucket.append(entry)

    # Management Tips: "label: value" lines (Corn-On-Corn / Productivity /
    # Silage Rating); footer-like lines (© / "rights reserved") are dropped.
    tips_p = section_paragraph("Management Tips")
    if tips_p is not None:
        for line in _br_lines(tips_p):
            if ":" not in line:
                continue
            raw_label, _, raw_value = line.partition(":")
            label = _clean(raw_label)
            tip = _clean(raw_value)
            is_footer = label.startswith("©") or "rights reserved" in line.lower()
            if label and tip and not is_footer:
                management.append({"characteristic": label, "value": tip})

    # Side table: Relative Maturity / Degree Days + planting populations.
    population_rows: list[str] = []
    for table in article.find_all("table"):
        for row in table.find_all("tr"):
            texts = (_clean(cell.get_text(" ", strip=True))
                     for cell in row.find_all(["td", "th"]))
            cells = [text for text in texts if text]
            if not cells:
                continue
            first = cells[0].lower()
            row_text = " ".join(cells).lower()
            has_value = len(cells) >= 2
            if first.startswith("relative maturity") and has_value:
                number = re.search(r"(\d+)", cells[1])
                if number:
                    v.relative_maturity = int(number.group(1))
                    agronomic.insert(0, {"characteristic": "Relative Maturity",
                                         "value": cells[1]})
            elif first.startswith("degree days") and has_value:
                agronomic.append({"characteristic": "Degree Days (GDU)",
                                  "value": cells[1]})
            elif row_text.startswith("low") and (
                    "medium" in row_text or "high" in row_text):
                population_rows.append(" / ".join(cells))
    if population_rows:
        management.append({"characteristic": "Recommended Planting Population",
                           "value": "; ".join(population_rows)})

    for group_label, items in (
            ("AGRONOMIC CHARACTERISTICS", agronomic),
            ("DISEASE RATINGS", disease),
            ("MANAGEMENT", management)):
        if items:
            v.groups.append({"label": group_label, "items": items})
def _parse_soy(article: Tag, v: FCVariety) -> None:
    """Fill *v* with soybean data scraped from *article*.

    "Field Notes" bullets become ``v.strengths`` (the first one doubles as
    ``v.positioning`` when none is set). "Variety Description"
    ``<b>Label:</b> value`` divs become one AGRONOMIC CHARACTERISTICS
    group, and the "Maturity" entry also sets ``v.maturity_group``.
    """
    headings = article.find_all("h2")

    def heading_named(title: str):
        return next((h for h in headings if _clean(h.get_text()) == title), None)

    # Field Notes -> strengths (and positioning from the first one).
    notes_heading = heading_named("Field Notes")
    if notes_heading is not None:
        bullet_list = notes_heading.find_next_sibling()
        if bullet_list is not None and bullet_list.name == "ul":
            bullets = (_clean(li.get_text(" ", strip=True))
                       for li in bullet_list.find_all("li"))
            v.strengths = [text for text in bullets if text]
            if v.strengths and not v.positioning:
                v.positioning = v.strengths[0]

    # Variety Description -> [{characteristic, value}] from <b>Label:</b> value.
    agronomic: list[dict] = []
    description = heading_named("Variety Description")
    if description is not None:
        # Classes marking the action buttons / right-nav / footer region.
        stop_classes = ("btn", "right-bar", "right-navigation",
                        "address", "wrapper")
        for node in description.find_all_next():
            if node.name == "h2" and node is not description:
                break  # next section reached
            if not isinstance(node, Tag):
                continue
            if node.name != "div":
                continue  # only <div> blocks carry label/value pairs
            classes = node.get("class") or []
            if any(c in classes for c in stop_classes):
                break
            bold = node.find("b", recursive=False)
            if bold is None:
                continue
            label = _clean(bold.get_text(" ", strip=True)).rstrip(":")
            value = _direct_text(node)
            if not label:
                continue
            if label.lower() == "maturity":
                number = re.search(r"[\d.]+", value)
                if number is not None:
                    try:
                        v.maturity_group = float(number.group(0))
                    except ValueError:
                        pass  # e.g. a lone "." — keep the text, skip the number
                characteristic = "Maturity Group"
            else:
                characteristic = label
            agronomic.append({"characteristic": characteristic, "value": value})

    if agronomic:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": agronomic})
def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
    """Fetch one variety detail page and parse it into an ``FCVariety``.

    Args:
        http: rate-limited session used for the fetch.
        rec: discovery record with ``crop``, ``slug``, ``url`` and an
            optional ``lastmod``.

    Raises:
        requests.HTTPError: when the page answers with an error status.
    """
    crop, slug, url = rec["crop"], rec["slug"], rec["url"]
    variety = FCVariety(
        source_key=f"firstchoice-{slug}",
        source_url=url,
        crop=crop,
        trait_stack=_trait_stack(slug, crop),
        sitemap_last_modified=rec.get("lastmod"),
    )
    response = http.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Fall back to the whole document when the expected wrapper is missing.
    article = soup.find("article", class_="content") or soup
    variety.product_name = _product_name(article, slug)
    # wheat: thin pages — identity only (no spec sections to parse).
    crop_parser = {"corn": _parse_corn, "soybeans": _parse_soy}.get(crop)
    if crop_parser is not None:
        crop_parser(article, variety)
    return variety
# --------------------------------------------------------------------- render
def render_markdown(v: FCVariety) -> str:
    """Render *v* as the markdown body written to the corpus.

    Layout: title, a bulleted identity header, the optional positioning
    line, the optional Field Notes list, then one ``##`` section per
    characteristics group. A wheat record without groups gets an explicit
    identity-only note.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = []
    lines.append(f"# {v.product_name}")
    lines.append("")
    lines.append("- **Vendor:** 1st Choice Seeds (independent, employee-owned)")
    lines.append("- **Brand:** 1st Choice Seeds")
    lines.append(f"- **Crop:** {crop_label}")

    # At most one crop-specific maturity/class bullet applies.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            lines.append(f"- **Relative maturity:** {v.relative_maturity} day")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            lines.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_class:
            lines.append(f"- **Wheat class:** {v.wheat_class}")

    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** 1st Choice Seeds dealer network — "
        "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN",
        "",
    ])

    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    if v.strengths:
        lines.extend(["---", "", "## Field Notes", ""])
        lines.extend(f"- {strength}" for strength in v.strengths)
        lines.append("")
    lines.extend(["---", ""])

    for group in v.groups:
        lines.append(f"## {group['label'].title()}")
        lines.append("")
        for item in group["items"]:
            name = item["characteristic"]
            shown = item["value"] or "—"  # blank value renders as a dash
            lines.append(f"- **{name}:** {shown}")
        lines.append("")

    if v.crop == "wheat" and not v.groups:
        lines.append("_Identity record only — 1st Choice wheat is private-label "
                     "and the catalog page carries no agronomic spec block._")
        lines.append("")
    return "\n".join(lines)
def write_variety(v: FCVariety, body_md: str) -> None:
    """Write ``<source_key>.md`` and ``<source_key>.json`` under ``CORPUS_DIR``.

    The markdown file receives *body_md* unchanged; the JSON sidecar holds
    the structured fields of *v* plus fetch metadata. The directory is
    created when missing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"

    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "1st Choice Seeds dealer network "
                             "(Eastern Corn Belt — IN/OH/KY/TN)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    payload = {
        "source": "first_choice",
        "source_key": v.source_key,
        "vendor": "1st Choice Seeds",
        "brand": "1st Choice Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": v.wheat_class,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": v.strengths,
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [dealer_network],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    json_path.write_text(serialized + "\n", encoding="utf-8")
# --------------------------------------------------------------------- pipeline
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties, then fetch, parse and write each one.

    Args:
        limit: stop after this many records have been visited (skipped
            records count); ``None`` means no limit.
        force: re-fetch even when the markdown file already exists.
        only_crop: restrict discovery to one crop key.
        only_product: restrict to one variety, given as a source_key
            (``firstchoice-<slug>``) or a bare slug.

    Returns:
        ``0`` on completion (per-variety fetch failures are counted and
        logged, not fatal); ``2`` when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    records = discover(http, only_crop=only_crop)
    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"firstchoice-{r['slug']}" == key or r["slug"] == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2
    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"firstchoice-{rec['slug']}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec)
        except requests.RequestException as exc:
            # RequestException covers HTTPError (bad status) as well as the
            # connection errors / timeouts the session re-raises once its
            # retries are exhausted; one unreachable page must not abort
            # the rest of the crawl.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
                        processed, len(records), source_key,
                        "; thin wheat page" if v.crop == "wheat" else "")
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        # Explicit None checks so a legitimate 0 / 0.0 is still logged.
        maturity = (v.relative_maturity if v.relative_maturity is not None
                    else v.maturity_group)
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 maturity if maturity is not None else "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")
    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], len(records))
    return 0
# --------------------------------------------------------------------- CLI
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the 1st Choice scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (default taken from ``$LOG_LEVEL``, else ``INFO``).
    """
    summary = ("Scrape 1st Choice Seeds (independent, employee-owned — "
               "Rushville, IN) — corn / soybeans / wheat via sitemaps "
               "+ detail pages.")
    parser = argparse.ArgumentParser(
        prog="scrape.sources.first_choice",
        description=summary,
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_SITEMAPS),
        help="Limit to one crop (corn / soybeans / wheat).",
    )
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.",
    )
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser
def main(argv: list[str] | None = None) -> int:
    """Parse *argv*, configure logging to stderr, and run the scraper.

    Returns the exit code produced by ``run``.
    """
    options = _build_argparser().parse_args(argv)
    level_name = options.log_level.upper()
    logging.basicConfig(
        level=level_name,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=options.limit,
        force=options.force,
        only_crop=options.crop,
        only_product=options.product,
    )
# Script entry point: exit the process with main()'s return code.
if __name__ == "__main__":
    raise SystemExit(main())