"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).

Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
seed company in Rushville, Indiana, serving the Eastern Corn Belt
(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
that is out of scope for the row-crop advisor).

Discovery is by **sitemap**, NOT the WP REST API: the catalog custom post
types (corn-hybrids / soybeans / wheat) are NOT exposed to ``/wp-json/``
(every variety route returns ``rest_no_route``). Instead we fetch
``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
per-crop child sitemaps:

  - ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/``  (~52 URLs)
  - ``/soybeans-sitemap.xml``     -> ``/soybeans/<slug>/``      (~22 URLs)
  - ``/wheat-sitemap.xml``        -> ``/wheat/<slug>/``         (~4 URLs)

robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
between requests.

Detail-page DOM (server-rendered, no JS needed for the text):

  * Product name: the second ``<h1>`` inside ``article.content`` (the first
    is the site logo "1st Choice Seeds").
  * Corn — three ``<h2>`` sections + a side table:
      - "Hybrid Characteristics": a single ``<p>`` of ``label • value``
        lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
        Placement, Root Rating, Stalk Rating, Foliar Health, Drydown, Ear
        Length/Girth/Flex, Test Weight). Some hybrids only publish Seedling
        Vigor (genuinely thin pages — still written).
      - "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
        (the numeric 0-10 bars are drawn client-side by d3 and are NOT in
        the HTML). The legend IS the scale: 0-4 Below Average … 9-10
        Superior, so higher = better.
      - "Management Tips": ``label: value`` lines (Corn-On-Corn,
        Productivity / soil guidance, Silage Rating).
      - A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
        the Low/Medium/High recommended planting populations.
  * Soybeans — three ``<h2>`` sections:
      - "Field Notes": a ``<ul>`` of strengths (often includes SCN source /
        PRR gene call-outs).
      - "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
      - "Variety Description": ``div`` blocks of ``Label: value`` pairs
        (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower Color,
        Pubescence, Pod, Hilum).
  * Wheat — thin (title + date only; wheat is private-label). We still
    write an identity record so the variety is discoverable.

Rating scale: the published legend is **0-10, higher = better** ("Below
Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8, Superior 9-10").
1st Choice publishes the *qualitative* word (Excellent / Very Good / …) in
the HTML — those map directly onto that legend — while the numeric bar is
d3-rendered and absent from the markup. NA / blank = not rated.

Output:
    corpus/first_choice/<source_key>.md
    corpus/first_choice/<source_key>.json

source_key: ``firstchoice-<slug>`` lowercased, e.g.
``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.

CLI:
    python -m scrape.sources.first_choice --crop corn --limit 5
    python -m scrape.sources.first_choice --force
    python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import random
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup, NavigableString, Tag

# Written into every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Descriptive User-Agent sent on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"

# Site origin; the sitemap index and every child sitemap URL are built on it.
BASE = "https://www.1stchoiceseeds.com"
SITEMAP_INDEX = f"{BASE}/sitemap.xml"

# Per-crop child sitemap -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. The
# cover-crops sitemap is intentionally omitted (out of scope for the
# row-crop advisor).
CROP_SITEMAPS = {
    "corn": "corn-hybrids-sitemap.xml",
    "soybeans": "soybeans-sitemap.xml",
    "wheat": "wheat-sitemap.xml",
}

# URL path prefix that confirms a sitemap entry is a variety detail page
# (vs. a category/archive page that can sneak into a child sitemap).
CROP_PATH = {
    "corn": "/corn-hybrids/",
    "soybeans": "/soybeans/",
    "wheat": "/wheat/",
}

# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple min.
REQ_INTERVAL_SEC = 1.2

RATING_SCALE_DIRECTION = (
    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
    "publishes the qualitative word in HTML (the numeric bar is "
    "d3-rendered, not in markup); blank/NA = not rated"
)

# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}

# Trait-suffix -> human label, matched against the "-"-separated slug
# tokens. Best-effort: a token that is not listed here is ignored by
# _trait_stack (model identifiers such as "fc" / "8455" fall out this way).
TRAIT_LABELS = {
    # corn
    "vt2p": "VT Double PRO (VT2P)",
    "gt": "Glyphosate Tolerant (GT)",
    "c": "Conventional",
    "pc": "PowerCore (PC)",
    "tre": "Trecepta (TRE)",
    "ss": "SmartStax (SS)",
    "v": "VT (V)",
    "dv": "Double VT (DV)",
    "aa": "Agrisure Artesian (AA)",
    # soybeans
    "en": "Enlist E3 (EN)",
    "xf": "XtendFlex (XF)",
    "sts": "STS",
    # wheat
    "b": "Bin-run / branded (B)",
    "s": "Soft (S)",
}

# The repo root is two directories above this module's directory. Fall back
# to the module's own directory when the file sits at a shallower path so
# that merely importing the module can never raise IndexError.
_MODULE_PATH = Path(__file__).resolve()
REPO_ROOT = (_MODULE_PATH.parents[2] if len(_MODULE_PATH.parents) > 2
             else _MODULE_PATH.parent)
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "first_choice"

log = logging.getLogger("scrape.first_choice")


# --------------------------------------------------------------------- HTTP

class RateLimitedSession:
    """Polite session with backoff.

    Wraps a ``requests.Session`` so that consecutive requests are spaced at
    least ``interval`` seconds apart and transient failures (network errors,
    HTTP 429, HTTP 5xx) are retried with exponential backoff.

    The 1st Choice row-crop catalog is small (~78 detail pages + 4 sitemaps)
    so 1.2 s/req still finishes in a couple minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the previous request (0.0 = none yet).
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` s between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Send a request, retrying transient failures.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Absolute URL.
            max_retries: Total number of attempts (must be >= 1).
            timeout: Per-attempt timeout in seconds.
            **kw: Forwarded to ``requests.Session.request``.

        Returns:
            The first response that is not 429/5xx, or — when every attempt
            was answered with 429/5xx — the last such response, so callers
            can still ``raise_for_status()`` on it.

        Raises:
            ValueError: ``max_retries`` is smaller than 1.
            requests.RequestException: the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be >= 1")

        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(max_retries):
            is_final = attempt == max_retries - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                resp = None
                if is_final:
                    break  # no point sleeping when nothing follows
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            # A response arrived, so an earlier network error is stale and
            # must not mask this attempt's outcome.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_final:
                    break
                ra = resp.headers.get("Retry-After")
                # Honour a numeric Retry-After; otherwise back off
                # exponentially with jitter, capped at 30 s.
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp

        if last_exc is not None:
            raise last_exc
        # Retries exhausted on 429/5xx: hand the last response to the caller.
        return resp

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model

@dataclass
class FCVariety:
    """One scraped variety; rendered to markdown and a JSON sidecar."""

    source_key: str
    source_url: str
    crop: str                                   # chunker value: corn / soybeans / wheat
    product_name: str = ""                      # "FC 8455 VT2P"
    relative_maturity: int | None = None        # corn (days)
    maturity_group: float | None = None         # soy
    wheat_class: str | None = None              # wheat
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    strengths: list[str] = field(default_factory=list)
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    sitemap_last_modified: str | None = None


# --------------------------------------------------------------------- discovery (sitemaps)

# Sitemap XML is picked apart with regexes. Group 1 is the element text,
# with an optional CDATA wrapper excluded from the capture.
_LOC_RE = re.compile(
    r"<loc>\s*(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?\s*</loc>",
    re.IGNORECASE | re.DOTALL)
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
_LASTMOD_RE = re.compile(
    r"<lastmod>\s*(?:<!\[CDATA\[)?(.*?)(?:\]\]>)?\s*</lastmod>",
    re.IGNORECASE | re.DOTALL)


def _slug_from_url(url: str) -> str:
    """Return the lowercased last path segment of *url*."""
    return url.rstrip("/").rsplit("/", 1)[-1].lower()


def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return [{crop, url, slug, lastmod}] for in-scope row-crop varieties
    by walking the per-crop child sitemaps under /sitemap.xml.

    We fetch each known child sitemap directly (their names are stable
    All-in-One-SEO conventions) rather than trusting the index ordering,
    but we still confirm against the index so a renamed sitemap is caught.

    Args:
        http: Session used for every fetch.
        only_crop: When set, restrict discovery to this ``CROP_SITEMAPS`` key.

    Raises:
        requests.RequestException: a child sitemap could not be fetched, or
            answered with a non-404 error status.
    """
    # Pull the sitemap index once so we can warn if a crop sitemap is
    # missing/renamed (defensive; we still target the known names).
    index_locs: set[str] = set()
    try:
        idx = http.get(SITEMAP_INDEX)
        idx.raise_for_status()
        index_locs = {m.strip() for m in _LOC_RE.findall(idx.text)}
    except requests.RequestException as exc:
        log.warning("could not read sitemap index %s: %s (continuing with "
                    "known child sitemap names)", SITEMAP_INDEX, exc)

    records: list[dict] = []
    for crop, child in CROP_SITEMAPS.items():
        if only_crop and crop != only_crop:
            continue
        child_url = f"{BASE}/{child}"
        if index_locs and child_url not in index_locs:
            log.warning("crop sitemap %s not listed in the index — site may "
                        "have renamed it; trying anyway", child_url)
        r = http.get(child_url)
        if r.status_code == 404:
            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
            continue
        r.raise_for_status()

        prefix = CROP_PATH[crop]
        seen: set[str] = set()
        n = 0
        for block in _URL_BLOCK_RE.findall(r.text):
            loc_m = _LOC_RE.search(block)
            if not loc_m:
                continue
            url = loc_m.group(1).strip()
            if prefix not in url:
                continue  # category/archive page leaked into the sitemap
            # The crop archive itself (".../corn-hybrids/") contains the
            # prefix but has no variety segment after it; it is not a
            # detail page either.
            if not url.split(prefix, 1)[1].strip("/"):
                continue
            slug = _slug_from_url(url)
            if not slug or slug in seen:
                continue
            seen.add(slug)
            lm_m = _LASTMOD_RE.search(block)
            records.append({
                "crop": crop,
                "url": url,
                "slug": slug,
                "lastmod": lm_m.group(1).strip() if lm_m else None,
            })
            n += 1
        log.info("crop sitemap %-22s (%s): %d varieties", child, crop, n)
    log.info("total varieties discovered: %d", len(records))
    return records


# --------------------------------------------------------------------- detail parse

def _clean(s: str) -> str:
    """Collapse whitespace runs to single spaces and trim; None-safe."""
    return re.sub(r"\s+", " ", s or "").strip()


def _direct_text(el: Tag) -> str:
    """Cleaned text of *el*'s own string children (child tags excluded)."""
    return _clean("".join(c for c in el.children
                          if isinstance(c, NavigableString)))


def _br_lines(el: Tag) -> list[str]:
    """Text of an element with <br> treated as a line break.

    Returns the non-blank, stripped lines. The element passed in is left
    untouched.
    """
    import copy  # local import: only needed here

    # Work on a copy so the original tree (used by other parsers) stays
    # intact; copy.copy() of a Tag yields a detached copy of the subtree.
    clone = copy.copy(el)
    for br in clone.find_all("br"):
        br.replace_with("\n")
    return [ln.strip() for ln in clone.get_text("\n").split("\n") if ln.strip()]


def _product_name(article: Tag, slug: str) -> str:
    """The variety name is the 2nd <h1> in article.content (the 1st is the
    site-logo "1st Choice Seeds"). Fall back to a tidied slug."""
    for h1 in article.find_all("h1"):
        txt = _clean(h1.get_text(" ", strip=True))
        if txt and txt.lower() != "1st choice seeds":
            return txt
    return slug.upper().replace("-", " ")


def _trait_stack(slug: str, crop: str) -> list[str]:
    """Derive trait labels from the slug tokens (e.g. fc-8455-vt2p -> VT2P,
    fb-3545-c-sts -> Conventional + STS).

    Every "-"-separated token is looked up in ``TRAIT_LABELS``; tokens that
    are not listed there (the model identifier such as fc-8455 / 20rw36) are
    skipped. Labels are de-duplicated and kept in left-to-right order.
    *crop* is accepted for interface stability and is not consulted.
    """
    tokens = (tok.lower() for tok in slug.split("-"))
    # dict.fromkeys gives an order-preserving de-duplication.
    return list(dict.fromkeys(TRAIT_LABELS[t] for t in tokens
                              if t in TRAIT_LABELS))


def _h2_by_title(article: Tag, title: str) -> Tag | None:
    """Return the first <h2> whose cleaned text equals *title*, else None."""
    return next((h for h in article.find_all("h2")
                 if _clean(h.get_text()) == title), None)


def _section_body(article: Tag, title: str, tag_name: str) -> Tag | None:
    """Return the element right after the *title* <h2> if it is a
    <tag_name>; None when the heading or the expected sibling is missing."""
    heading = _h2_by_title(article, title)
    if heading is None:
        return None
    sib = heading.find_next_sibling()
    if sib is not None and sib.name == tag_name:
        return sib
    return None


def _parse_corn(article: Tag, v: FCVariety) -> None:
    """Populate corn ratings from Hybrid Characteristics + Management Tips
    + the Relative Maturity / Degree Days side table.

    Mutates *v*: sets ``relative_maturity`` and appends up to three groups
    (AGRONOMIC CHARACTERISTICS, DISEASE RATINGS, MANAGEMENT).
    """
    agronomic: list[dict] = []
    disease: list[dict] = []
    management: list[dict] = []

    # Hybrid Characteristics: a <p> of "label • value" lines.
    hc_p = _section_body(article, "Hybrid Characteristics", "p")
    if hc_p is not None:
        for ln in _br_lines(hc_p):
            # split on bullet (•) or fall back to first colon
            if "•" in ln:
                k, _, val = ln.partition("•")
            elif ":" in ln:
                k, _, val = ln.partition(":")
            else:
                k, val = ln, ""
            k, val = _clean(k), _clean(val)
            if not k:
                continue
            item = {"characteristic": k, "value": val}
            if k.lower() in _CORN_DISEASE_LABELS:
                disease.append(item)
            else:
                agronomic.append(item)

    # Management Tips: "label: value" lines (Corn-On-Corn / Productivity /
    # Silage Rating).
    mt_p = _section_body(article, "Management Tips", "p")
    if mt_p is not None:
        for ln in _br_lines(mt_p):
            if ":" not in ln:
                continue
            k, _, val = ln.partition(":")
            k, val = _clean(k), _clean(val)
            # Footer noise (address / © line) has no useful colon form.
            if (k and val and not k.startswith("©")
                    and "rights reserved" not in ln.lower()):
                management.append({"characteristic": k, "value": val})

    # Side table: Relative Maturity / Degree Days + planting populations.
    pop_rows: list[str] = []
    for tbl in article.find_all("table"):
        for tr in tbl.find_all("tr"):
            cells = [_clean(c.get_text(" ", strip=True))
                     for c in tr.find_all(["td", "th"])]
            cells = [c for c in cells if c]
            if not cells:
                continue
            first = cells[0].lower()
            joined = " ".join(cells).lower()
            if first.startswith("relative maturity") and len(cells) >= 2:
                m = re.search(r"(\d+)", cells[1])
                if m:
                    v.relative_maturity = int(m.group(1))
                # Keep the raw cell text even when no number was found.
                agronomic.insert(0, {"characteristic": "Relative Maturity",
                                     "value": cells[1]})
            elif first.startswith("degree days") and len(cells) >= 2:
                agronomic.append({"characteristic": "Degree Days (GDU)",
                                  "value": cells[1]})
            elif joined.startswith("low") and ("medium" in joined
                                               or "high" in joined):
                pop_rows.append(" / ".join(cells))
    if pop_rows:
        management.append({"characteristic": "Recommended Planting Population",
                           "value": "; ".join(pop_rows)})

    if agronomic:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS",
                         "items": agronomic})
    if disease:
        v.groups.append({"label": "DISEASE RATINGS", "items": disease})
    if management:
        v.groups.append({"label": "MANAGEMENT", "items": management})


def _parse_soy(article: Tag, v: FCVariety) -> None:
    """Populate soy MG + agronomic descriptors + field-note strengths.

    Mutates *v*: sets ``strengths``, ``positioning`` (first strength, if not
    already set), ``maturity_group`` and appends an AGRONOMIC
    CHARACTERISTICS group when any descriptor was found.
    """
    # Field Notes -> strengths (and positioning from the first one).
    notes_ul = _section_body(article, "Field Notes", "ul")
    if notes_ul is not None:
        notes = [_clean(li.get_text(" ", strip=True))
                 for li in notes_ul.find_all("li")]
        v.strengths = [n for n in notes if n]
        if v.strengths and not v.positioning:
            v.positioning = v.strengths[0]

    # Variety Description -> [{characteristic, value}] from <b>Label:</b> value.
    agronomic: list[dict] = []
    vd = _h2_by_title(article, "Variety Description")
    if vd is not None:
        stop_classes = ("btn", "right-bar", "right-navigation",
                        "address", "wrapper")
        for el in vd.find_all_next():
            if el.name == "h2" and el is not vd:
                break  # next section starts
            if not isinstance(el, Tag):
                continue
            # Stop at the action buttons / right-nav / footer region.
            cls = el.get("class") or []
            if el.name == "div" and any(c in cls for c in stop_classes):
                break
            b = el.find("b", recursive=False) if el.name == "div" else None
            if b is None:
                continue
            k = _clean(b.get_text(" ", strip=True)).rstrip(":")
            val = _direct_text(el)
            if not k:
                continue
            if k.lower() == "maturity":
                num = re.search(r"[\d.]+", val)
                if num is not None:
                    try:
                        v.maturity_group = float(num.group(0))
                    except ValueError:
                        pass  # e.g. a lone "." — keep the raw text only
                agronomic.append({"characteristic": "Maturity Group",
                                  "value": val})
            else:
                agronomic.append({"characteristic": k, "value": val})
    if agronomic:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS",
                         "items": agronomic})


def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
    """Fetch one detail page and parse it into an ``FCVariety``.

    Args:
        http: Session used for the fetch.
        rec: A record produced by ``discover`` (crop / url / slug / lastmod).

    Raises:
        requests.RequestException: the page could not be fetched or
            answered with an error status.
    """
    crop = rec["crop"]
    slug = rec["slug"]
    url = rec["url"]
    v = FCVariety(
        source_key=f"firstchoice-{slug}",
        source_url=url,
        crop=crop,
        trait_stack=_trait_stack(slug, crop),
        sitemap_last_modified=rec.get("lastmod"),
    )
    r = http.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    # Fall back to the whole document when the article wrapper is absent.
    article = soup.find("article", class_="content") or soup
    v.product_name = _product_name(article, slug)

    if crop == "corn":
        _parse_corn(article, v)
    elif crop == "soybeans":
        _parse_soy(article, v)
    # wheat: thin pages — identity only (no spec sections to parse).
    return v


# --------------------------------------------------------------------- render

def render_markdown(v: FCVariety) -> str:
    """Render *v* as the markdown body written to ``<source_key>.md``."""
    crop_label = {"corn": "Corn", "soybeans": "Soybeans",
                  "wheat": "Wheat"}.get(v.crop, v.crop.title())
    head: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** 1st Choice Seeds (independent, employee-owned)",
        "- **Brand:** 1st Choice Seeds",
        f"- **Crop:** {crop_label}",
    ]
    if v.crop == "corn" and v.relative_maturity is not None:
        head.append(f"- **Relative maturity:** {v.relative_maturity} day")
    if v.crop == "soybeans" and v.maturity_group is not None:
        head.append(f"- **Maturity group:** {v.maturity_group}")
    if v.crop == "wheat" and v.wheat_class:
        head.append(f"- **Wheat class:** {v.wheat_class}")
    if v.trait_stack:
        head.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    head.append(f"- **Source:** {v.source_url}")
    head.append(f"- **Rating scale:** {RATING_SCALE_DIRECTION}")
    head.append("- **Service area:** 1st Choice Seeds dealer network — "
                "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN")
    head.append("")

    if v.positioning:
        head += ["---", "", f"_{v.positioning}_", ""]

    if v.strengths:
        head += ["---", "", "## Field Notes", ""]
        head += [f"- {s}" for s in v.strengths]
        head.append("")

    # Separator ahead of the characteristic groups.
    head += ["---", ""]
    for g in v.groups:
        head.append(f"## {g['label'].title()}")
        head.append("")
        for it in g["items"]:
            ch = it["characteristic"]
            val = it["value"] or "—"  # blank value = not rated
            head.append(f"- **{ch}:** {val}")
        head.append("")

    if not v.groups and v.crop == "wheat":
        head += ["_Identity record only — 1st Choice wheat is private-label "
                 "and the catalog page carries no agronomic spec block._", ""]

    return "\n".join(head)


def write_variety(v: FCVariety, body_md: str) -> None:
    """Write ``<source_key>.md`` and the ``<source_key>.json`` sidecar into
    ``CORPUS_DIR`` (created on demand), both UTF-8."""
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    (CORPUS_DIR / f"{v.source_key}.md").write_text(body_md, encoding="utf-8")
    sidecar = {
        "source": "first_choice",
        "source_key": v.source_key,
        "vendor": "1st Choice Seeds",
        "brand": "1st Choice Seeds",
        "product_name": v.product_name,
        "product_id": v.product_name,
        "crop": v.crop,
        "release_year": None,
        "relative_maturity": v.relative_maturity,
        "maturity_group": v.maturity_group,
        "wheat_class": v.wheat_class,
        "trait_stack": v.trait_stack,
        "trait_descriptions": [],
        "positioning_statement": v.positioning,
        "strengths": v.strengths,
        "characteristics_groups": v.groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [
            {"product_list_name": "1st Choice Seeds dealer network "
                                  "(Eastern Corn Belt — IN/OH/KY/TN)",
             "agronomist": None, "agronomist_email": None,
             "variant_id": None},
        ],
        "image_url": None,
        "source_urls": [v.source_url],
        "sitemap_last_modified": v.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    (CORPUS_DIR / f"{v.source_key}.json").write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8")


# --------------------------------------------------------------------- pipeline

def run(*, limit: int | None, force: bool, only_crop: str | None,
        only_product: str | None) -> int:
    """Discover, fetch, parse and write varieties.

    Args:
        limit: Stop after this many discovered records have been looked at
            (records skipped because their markdown exists count too).
        force: Re-fetch even when the markdown file already exists.
        only_crop: Restrict discovery to one crop.
        only_product: Restrict to one variety, by source_key or slug.

    Returns:
        Process exit code: 0 on completion, 2 when ``only_product`` matched
        nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    records = discover(http, only_crop=only_crop)
    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"firstchoice-{r['slug']}" == key or r["slug"] == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"firstchoice-{rec['slug']}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec)
        except requests.RequestException as exc:
            # RequestException (not just HTTPError): a timeout or connection
            # error on one page must not abort the rest of the crawl.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
                        processed, len(records), source_key,
                        "; thin wheat page" if v.crop == "wheat" else "")
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 v.relative_maturity or v.maturity_group or "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"],
             counts["empty"], counts["failed"], len(records))
    return 0


# --------------------------------------------------------------------- CLI

def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.first_choice",
        description="Scrape 1st Choice Seeds (independent, employee-owned — "
                    "Rushville, IN) — corn / soybeans / wheat via sitemaps "
                    "+ detail pages.")
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-fetch even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=sorted(CROP_SITEMAPS),
                   help="Limit to one crop (corn / soybeans / wheat).")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or slug.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, configure logging to stderr, run."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr)
    return run(limit=args.limit, force=args.force, only_crop=args.crop,
               only_product=args.product)


if __name__ == "__main__":
    sys.exit(main())