84ad2b1de6
Image rebuild (skip scrape) / build (push) Successful in 4m44s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
672 lines
27 KiB
Python
672 lines
27 KiB
Python
"""1st Choice Seeds scraper — employee-owned independent (Rushville, IN).
|
|
|
|
Source: ``www.1stchoiceseeds.com`` — a plain Apache/PHP WordPress site
|
|
(All in One SEO). 1st Choice Seeds is an **independent, employee-owned**
|
|
seed company in Rushville, Indiana, serving the Eastern Corn Belt
|
|
(IN/OH/KY/TN). Corn hybrids / soybeans / wheat (plus a cover-crop line
|
|
that is out of scope for the row-crop advisor).
|
|
|
|
Discovery is by **sitemap**, NOT the WP REST API: the catalog custom
|
|
post types (corn-hybrids / soybeans / wheat) are NOT exposed to
|
|
``/wp-json/`` (every variety route returns ``rest_no_route``). Instead we
|
|
fetch ``/sitemap.xml`` (an All-in-One-SEO sitemap *index*) and follow the
|
|
per-crop child sitemaps:
|
|
|
|
- ``/corn-hybrids-sitemap.xml`` -> ``/corn-hybrids/<slug>/`` (~52 URLs)
|
|
- ``/soybeans-sitemap.xml`` -> ``/soybeans/<slug>/`` (~22 URLs)
|
|
- ``/wheat-sitemap.xml`` -> ``/wheat/<slug>/`` (~4 URLs)
|
|
|
|
robots.txt is permissive (``User-agent: *`` / ``Disallow: /wp-admin/`` /
|
|
``Allow: /wp-admin/admin-ajax.php`` + a ``Sitemap:`` line). No Crawl-delay,
|
|
no Terms-of-Use page, no bot wall. We use a descriptive UA and ~1.2 s
|
|
between requests.
|
|
|
|
Detail-page DOM (server-rendered, no JS needed for the text):
|
|
* Product name: the second ``<h1>`` inside ``article.content`` (the
|
|
first is the site logo "1st Choice Seeds").
|
|
* Corn — three ``<h2>`` sections + a side table:
|
|
- "Hybrid Characteristics": a single ``<p>`` of ``label • value``
|
|
lines split on ``<br>`` (Seedling Vigor, Plant Height, Ear
|
|
Placement, Root Rating, Stalk Rating, Foliar Health, Drydown,
|
|
Ear Length/Girth/Flex, Test Weight). Some hybrids only publish
|
|
Seedling Vigor (genuinely thin pages — still written).
|
|
- "Hybrid Ratings": a ``ul.chart-key`` legend + a ``div.d3-chart``
|
|
(the numeric 0-10 bars are drawn client-side by d3 and are NOT
|
|
in the HTML). The legend IS the scale: 0-4 Below Average … 9-10
|
|
Superior, so higher = better.
|
|
- "Management Tips": ``label: value`` lines (Corn-On-Corn,
|
|
Productivity / soil guidance, Silage Rating).
|
|
- A ``<table>`` carrying Relative Maturity, Degree Days (GDU), and
|
|
the Low/Medium/High recommended planting populations.
|
|
* Soybeans — three ``<h2>`` sections:
|
|
- "Field Notes": a ``<ul>`` of strengths (often includes SCN
|
|
source / PRR gene call-outs).
|
|
- "Soybean Ratings": ``ul.chart-key`` legend only (same d3 chart).
|
|
- "Variety Description": ``div`` blocks of ``<b>Label:</b> value``
|
|
pairs (Maturity = MG, Plant Type, Plant Height, PRR Gene, Flower
|
|
Color, Pubescence, Pod, Hilum).
|
|
* Wheat — thin (title + date only; wheat is private-label). We still
|
|
write an identity record so the variety is discoverable.
|
|
|
|
Rating scale: the published legend is **0-10, higher = better**
|
|
("Below Average 0-4, Average 5, Good 6, Very Good 7, Excellent 8,
|
|
Superior 9-10"). 1st Choice publishes the *qualitative* word
|
|
(Excellent / Very Good / …) in the HTML — those map directly onto that
|
|
legend — while the numeric bar is d3-rendered and absent from the
|
|
markup. NA / blank = not rated.
|
|
|
|
Output:
|
|
corpus/first_choice/<source_key>.md
|
|
corpus/first_choice/<source_key>.json
|
|
|
|
source_key: ``firstchoice-<slug>`` lowercased, e.g.
|
|
``firstchoice-fc-8455-vt2p`` or ``firstchoice-fb-2733-en``.
|
|
|
|
CLI:
|
|
python -m scrape.sources.first_choice --crop corn --limit 5
|
|
python -m scrape.sources.first_choice --force
|
|
python -m scrape.sources.first_choice --product firstchoice-fc-8455-vt2p
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
|
|
# Written into every JSON sidecar as "scraper_version".
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://www.1stchoiceseeds.com"
SITEMAP_INDEX = f"{BASE}/sitemap.xml"

# Per-crop child sitemap -> chunker crop value. The chunker keys on
# "soybeans" (plural) for the MG branch, so map accordingly. The
# cover-crops sitemap is intentionally omitted (out of scope for the
# row-crop advisor).
CROP_SITEMAPS = {
    "corn": "corn-hybrids-sitemap.xml",
    "soybeans": "soybeans-sitemap.xml",
    "wheat": "wheat-sitemap.xml",
}

# URL path prefix that confirms a sitemap entry is a variety detail page
# (vs. a category/archive page that can sneak into a child sitemap).
CROP_PATH = {
    "corn": "/corn-hybrids/",
    "soybeans": "/soybeans/",
    "wheat": "/wheat/",
}

# robots.txt declares no Crawl-delay; we stay polite. The full row-crop
# catalog is ~78 detail pages, so ~1.2 s/req finishes in a couple min.
REQ_INTERVAL_SEC = 1.2

# Human-readable description of the rating legend; emitted verbatim in the
# markdown header and in the sidecar's "_scale_direction" field.
RATING_SCALE_DIRECTION = (
    "0-10, higher = better (legend: 0-4 Below Average, 5 Average, "
    "6 Good, 7 Very Good, 8 Excellent, 9-10 Superior); 1st Choice "
    "publishes the qualitative word in HTML (the numeric bar is "
    "d3-rendered, not in markup); blank/NA = not rated"
)

# Corn "Hybrid Characteristics" lines that are foliar/disease in nature
# bucket into DISEASE RATINGS; the rest are agronomic/plant ratings.
# Compared against the lowercased label.
_CORN_DISEASE_LABELS = {"foliar health", "foliar rating", "foliar"}

# Trait-suffix -> human label, looked up per "-"-separated slug token by
# _trait_stack. Best-effort: a token that is not a key here is simply
# ignored by _trait_stack (it is NOT carried through verbatim), so a new
# trait suffix on the site needs an entry added to this table.
TRAIT_LABELS = {
    # corn
    "vt2p": "VT Double PRO (VT2P)",
    "gt": "Glyphosate Tolerant (GT)",
    "c": "Conventional",
    "pc": "PowerCore (PC)",
    "tre": "Trecepta (TRE)",
    "ss": "SmartStax (SS)",
    "v": "VT (V)",
    "dv": "Double VT (DV)",
    "aa": "Agrisure Artesian (AA)",
    # soybeans
    "en": "Enlist E3 (EN)",
    "xf": "XtendFlex (XF)",
    "sts": "STS",
    # wheat
    "b": "Bin-run / branded (B)",
    "s": "Soft (S)",
}

# Output location: $CORPUS_ROOT if set (and non-empty), otherwise
# <repo root>/corpus, where the repo root is two directories above this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "first_choice"

log = logging.getLogger("scrape.first_choice")
|
|
|
|
|
|
# --------------------------------------------------------------------- HTTP
|
|
|
|
|
|
class RateLimitedSession:
    """Polite ``requests`` session: minimum spacing between requests plus
    retry with exponential backoff.

    The 1st Choice row-crop catalog is small (~78 detail pages + 4
    sitemaps) so 1.2 s/req still finishes in a couple minutes.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session.

        Args:
            interval: Minimum number of seconds between two consecutive
                requests issued through this object.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request slot (0.0 = none yet).
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(self, method: str, url: str, *, max_retries: int = 4,
                timeout: float = 30.0, **kw: Any) -> requests.Response:
        """Issue a request, retrying network errors and 429 / 5xx responses.

        Args:
            method: HTTP method name.
            url: Absolute URL to request.
            max_retries: Total number of attempts. Values below 1 are
                treated as 1 so the call always tries once.
            timeout: Per-attempt timeout in seconds, passed to ``requests``.
            **kw: Extra keyword arguments forwarded to ``Session.request``.

        Returns:
            The first response whose status is not 429 / 5xx, or — when
            every attempt was retryable — the response from the final
            attempt (callers are expected to ``raise_for_status()``).

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                # Forget any earlier response: the outcome of the call is
                # whatever the most recent attempt produced.
                last_exc, resp = exc, None
                if final:
                    log.warning("network error on %s %s: %s — giving up after %d attempt(s)",
                                method, url, exc, attempts)
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived, so an exception from an earlier attempt is stale.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    # No point sleeping when there is no further attempt.
                    log.warning("HTTP %d on %s %s — giving up after %d attempt(s)",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Honour an integer-seconds Retry-After; otherwise back off
                # exponentially with jitter, capped at 30 s.
                backoff = float(ra) if (ra and ra.isdigit()) else min(
                    30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if resp is not None:
            return resp
        if last_exc is not None:
            raise last_exc
        # Unreachable: at least one attempt always runs and sets one of the two.
        raise RuntimeError(f"no response and no error for {method} {url}")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
|
|
|
|
|
|
# --------------------------------------------------------------------- model
|
|
|
|
|
|
@dataclass
class FCVariety:
    """One scraped variety: identity, maturity, traits and rating groups.

    Instances are created by ``parse_detail`` and consumed by
    ``render_markdown`` / ``write_variety``.
    """

    source_key: str                            # "firstchoice-<slug>"
    source_url: str                            # detail-page URL from the sitemap
    crop: str                                  # chunker value: corn / soybeans / wheat
    product_name: str = ""                     # "FC 8455 VT2P"
    relative_maturity: int | None = None       # corn (days)
    maturity_group: float | None = None        # soy
    wheat_class: str | None = None             # wheat
    trait_stack: list[str] = field(default_factory=list)
    positioning: str | None = None
    strengths: list[str] = field(default_factory=list)
    # [{label, items:[{characteristic, value}]}] — chunker source of truth
    groups: list[dict] = field(default_factory=list)
    sitemap_last_modified: str | None = None   # raw <lastmod> text, if any
|
|
|
|
|
|
# --------------------------------------------------------------------- discovery (sitemaps)
|
|
|
|
|
|
# Sitemap XML is picked apart with regexes rather than an XML parser.
# <loc> / <lastmod> capture the element text, tolerating an optional CDATA
# wrapper and surrounding whitespace; <url> captures one whole entry so its
# <loc> and <lastmod> can be paired.
_LOC_RE = re.compile(r"<loc>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</loc>",
                     re.IGNORECASE | re.DOTALL)
_URL_BLOCK_RE = re.compile(r"<url>(.*?)</url>", re.IGNORECASE | re.DOTALL)
_LASTMOD_RE = re.compile(r"<lastmod>\s*(?:<!\[CDATA\[)?\s*(.*?)\s*(?:\]\]>)?\s*</lastmod>",
                         re.IGNORECASE | re.DOTALL)
|
|
|
|
|
|
def _slug_from_url(url: str) -> str:
|
|
return url.rstrip("/").rsplit("/", 1)[-1].lower()
|
|
|
|
|
|
def discover(http: RateLimitedSession, *, only_crop: str | None) -> list[dict]:
    """Return [{crop, url, slug, lastmod}] for in-scope row-crop varieties
    by walking the per-crop child sitemaps under /sitemap.xml.

    We fetch each known child sitemap directly (their names are stable
    All-in-One-SEO conventions) rather than trusting the index ordering,
    but we still confirm against the index so a renamed sitemap is caught.

    Args:
        http: Session used for every fetch.
        only_crop: If given, restrict discovery to this ``CROP_SITEMAPS`` key.

    Returns:
        One dict per variety URL, in sitemap order, de-duplicated by slug
        within each crop. ``lastmod`` is the raw ``<lastmod>`` text or None.

    Raises:
        requests.RequestException: If a child sitemap cannot be fetched
            (a 404 is tolerated and skips that crop; an unreadable index
            only produces a warning).
    """
    # Pull the sitemap index once so we can warn if a crop sitemap is
    # missing/renamed (defensive; we still target the known names).
    index_locs: set[str] = set()
    try:
        idx = http.get(SITEMAP_INDEX)
        idx.raise_for_status()
        index_locs = {m.strip() for m in _LOC_RE.findall(idx.text)}
    except requests.RequestException as exc:
        log.warning("could not read sitemap index %s: %s (continuing with "
                    "known child sitemap names)", SITEMAP_INDEX, exc)

    records: list[dict] = []
    for crop, child in CROP_SITEMAPS.items():
        if only_crop and crop != only_crop:
            continue
        child_url = f"{BASE}/{child}"
        if index_locs and child_url not in index_locs:
            log.warning("crop sitemap %s not listed in the index — site may "
                        "have renamed it; trying anyway", child_url)
        r = http.get(child_url)
        if r.status_code == 404:
            log.warning("crop sitemap %s -> 404; skipping %s", child_url, crop)
            continue
        r.raise_for_status()
        prefix = CROP_PATH[crop]
        seen: set[str] = set()
        n = 0
        for block in _URL_BLOCK_RE.findall(r.text):
            loc_m = _LOC_RE.search(block)
            if not loc_m:
                continue
            url = loc_m.group(1).strip()
            _, found, tail = url.partition(prefix)
            if not found:
                continue  # category/archive page leaked into the sitemap
            if not tail.strip("/"):
                # The crop archive itself (e.g. ".../corn-hybrids/") contains
                # the prefix but has no variety slug after it; without this
                # check it would be recorded as a variety named after the crop.
                continue
            slug = _slug_from_url(url)
            if not slug or slug in seen:
                continue
            seen.add(slug)
            lm_m = _LASTMOD_RE.search(block)
            records.append({
                "crop": crop,
                "url": url,
                "slug": slug,
                "lastmod": lm_m.group(1).strip() if lm_m else None,
            })
            n += 1
        log.info("crop sitemap %-22s (%s): %d varieties", child, crop, n)
    log.info("total varieties discovered: %d", len(records))
    return records
|
|
|
|
|
|
# --------------------------------------------------------------------- detail parse
|
|
|
|
|
|
def _clean(s: str) -> str:
|
|
return re.sub(r"\s+", " ", s or "").strip()
|
|
|
|
|
|
def _direct_text(el: Tag) -> str:
    """Return the cleaned text of *el*'s own string children only.

    Text inside child tags is excluded; the direct string children are
    concatenated without a separator and then whitespace-normalised.
    """
    own_strings = [child for child in el.children
                   if isinstance(child, NavigableString)]
    return _clean("".join(own_strings))
|
|
|
|
|
|
def _br_lines(el: Tag) -> list[str]:
|
|
"""Text of an element with <br> treated as a line break."""
|
|
# Work on a copy so the original tree (used by other parsers) stays intact.
|
|
for br in el.find_all("br"):
|
|
br.replace_with("\n")
|
|
return [ln.strip() for ln in el.get_text("\n").split("\n") if ln.strip()]
|
|
|
|
|
|
def _product_name(article: Tag, slug: str) -> str:
    """Pick the variety name from the page's ``<h1>`` elements.

    Returns the text of the first ``<h1>`` under *article* that is
    non-empty and is not the site-logo text "1st Choice Seeds"
    (case-insensitive). When no such heading exists, falls back to the
    slug upper-cased with hyphens turned into spaces.
    """
    headings = (_clean(h1.get_text(" ", strip=True))
                for h1 in article.find_all("h1"))
    name = next((text for text in headings
                 if text and text.lower() != "1st choice seeds"), None)
    if name is not None:
        return name
    return slug.upper().replace("-", " ")
|
|
|
|
|
|
def _trait_stack(slug: str, crop: str) -> list[str]:
    """Map slug tokens to trait labels (e.g. fc-8455-vt2p -> VT2P label,
    fb-3545-c-sts -> Conventional + STS).

    Every ``-``-separated token of *slug* is lowercased and looked up in
    ``TRAIT_LABELS``; tokens without an entry (model identifiers such as
    fc / 8455 / 20rw36, or unknown suffixes) contribute nothing. Labels
    are returned in left-to-right order of first appearance, without
    duplicates. *crop* is accepted for interface symmetry but not used.
    """
    tokens = slug.lower().split("-")
    # dict.fromkeys keeps first-seen order while dropping repeated labels.
    unique_labels = dict.fromkeys(
        TRAIT_LABELS[token] for token in tokens if token in TRAIT_LABELS)
    return list(unique_labels)
|
|
|
|
|
|
def _parse_corn(article: Tag, v: FCVariety) -> None:
    """Populate corn ratings from Hybrid Characteristics + Management Tips
    + the Relative Maturity / Degree Days side table.

    Mutates *v* in place: may set ``v.relative_maturity`` and appends up to
    three groups to ``v.groups`` — AGRONOMIC CHARACTERISTICS, DISEASE
    RATINGS, MANAGEMENT, in that order, each only when it has items.
    """
    agronomic: list[dict] = []
    disease: list[dict] = []
    management: list[dict] = []

    # Hybrid Characteristics: a <p> of "label • value" lines. Only the
    # element immediately following the heading is read, and only if it
    # is a <p>.
    hc = next((h for h in article.find_all("h2")
               if _clean(h.get_text()) == "Hybrid Characteristics"), None)
    if hc is not None:
        sib = hc.find_next_sibling()
        if sib is not None and sib.name == "p":
            for ln in _br_lines(sib):
                # split on bullet (•) or fall back to first colon
                if "•" in ln:
                    k, _, val = ln.partition("•")
                elif ":" in ln:
                    k, _, val = ln.partition(":")
                else:
                    # No separator at all: keep the line as a label with
                    # an empty value rather than dropping it.
                    k, val = ln, ""
                k, val = _clean(k), _clean(val)
                if not k:
                    continue
                item = {"characteristic": k, "value": val}
                if k.lower() in _CORN_DISEASE_LABELS:
                    disease.append(item)
                else:
                    agronomic.append(item)

    # Management Tips: "label: value" lines (Corn-On-Corn / Productivity /
    # Silage Rating). Only the <p> directly after the heading is read;
    # lines without a colon are skipped.
    mt = next((h for h in article.find_all("h2")
               if _clean(h.get_text()) == "Management Tips"), None)
    if mt is not None:
        sib = mt.find_next_sibling()
        if sib is not None and sib.name == "p":
            for ln in _br_lines(sib):
                if ":" not in ln:
                    continue
                k, _, val = ln.partition(":")
                k, val = _clean(k), _clean(val)
                # Footer noise (address / © line) has no useful colon form.
                if k and val and not k.startswith("©") and "rights reserved" not in ln.lower():
                    management.append({"characteristic": k, "value": val})

    # Side table: Relative Maturity / Degree Days + planting populations.
    # Every <table> under the article is scanned row by row.
    pop_rows: list[str] = []
    for tbl in article.find_all("table"):
        for tr in tbl.find_all("tr"):
            cells = [_clean(c.get_text(" ", strip=True))
                     for c in tr.find_all(["td", "th"])]
            cells = [c for c in cells if c]
            if not cells:
                continue
            joined = " ".join(cells).lower()
            if cells[0].lower().startswith("relative maturity") and len(cells) >= 2:
                # The first run of digits becomes the integer RM; the raw
                # cell text is still recorded, at the head of the list.
                m = re.search(r"(\d+)", cells[1])
                if m:
                    v.relative_maturity = int(m.group(1))
                agronomic.insert(0, {"characteristic": "Relative Maturity",
                                     "value": cells[1]})
            elif cells[0].lower().startswith("degree days") and len(cells) >= 2:
                agronomic.append({"characteristic": "Degree Days (GDU)",
                                  "value": cells[1]})
            elif joined.startswith("low") and ("medium" in joined or "high" in joined):
                # Population row(s): keep the cells joined as one string.
                pop_rows.append(" / ".join(cells))
    if pop_rows:
        management.append({"characteristic": "Recommended Planting Population",
                           "value": "; ".join(pop_rows)})

    if agronomic:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": agronomic})
    if disease:
        v.groups.append({"label": "DISEASE RATINGS", "items": disease})
    if management:
        v.groups.append({"label": "MANAGEMENT", "items": management})
|
|
|
|
|
|
def _parse_soy(article: Tag, v: FCVariety) -> None:
    """Fill *v* with soybean field notes, maturity group and descriptors.

    * "Field Notes": when the heading is directly followed by a ``<ul>``,
      its non-empty ``<li>`` texts become ``v.strengths``; the first one is
      also used as ``v.positioning`` if none is set yet.
    * "Variety Description": each following ``<div>`` with a direct ``<b>``
      child yields a ``{characteristic, value}`` pair, until the next
      ``<h2>`` or a ``<div>`` carrying one of the stop classes. The
      "Maturity" pair is renamed "Maturity Group" and, when numeric, also
      stored as ``v.maturity_group``.

    A single AGRONOMIC CHARACTERISTICS group is appended to ``v.groups``
    when at least one pair was found.
    """
    stop_classes = ("btn", "right-bar", "right-navigation", "address", "wrapper")

    def first_heading(title: str) -> Tag | None:
        # First <h2> whose whitespace-normalised text equals *title*.
        for h2 in article.find_all("h2"):
            if _clean(h2.get_text()) == title:
                return h2
        return None

    # Field Notes -> strengths (and positioning from the first one).
    notes_heading = first_heading("Field Notes")
    notes_list = notes_heading.find_next_sibling() if notes_heading is not None else None
    if notes_list is not None and notes_list.name == "ul":
        cleaned = (_clean(li.get_text(" ", strip=True))
                   for li in notes_list.find_all("li"))
        v.strengths = [text for text in cleaned if text]
        if v.strengths and not v.positioning:
            v.positioning = v.strengths[0]

    # Variety Description -> [{characteristic, value}] from <b>Label:</b> value.
    pairs: list[dict] = []
    description = first_heading("Variety Description")
    if description is not None:
        for node in description.find_all_next():
            if node.name == "h2" and node is not description:
                break
            if not isinstance(node, Tag) or node.name != "div":
                continue
            # Stop at the action buttons / right-nav / footer region.
            classes = node.get("class") or []
            if any(c in classes for c in stop_classes):
                break
            bold = node.find("b", recursive=False)
            if bold is None:
                continue
            label = _clean(bold.get_text(" ", strip=True)).rstrip(":")
            value = _direct_text(node)
            if not label:
                continue
            if label.lower() != "maturity":
                pairs.append({"characteristic": label, "value": value})
                continue
            number = re.search(r"[\d.]+", value)
            if number is not None:
                try:
                    v.maturity_group = float(number.group(0))
                except ValueError:
                    # e.g. a lone "." — leave the maturity group unset.
                    pass
            pairs.append({"characteristic": "Maturity Group", "value": value})
    if pairs:
        v.groups.append({"label": "AGRONOMIC CHARACTERISTICS", "items": pairs})
|
|
|
|
|
|
def parse_detail(http: RateLimitedSession, rec: dict) -> FCVariety:
    """Fetch one variety page and turn it into an ``FCVariety``.

    *rec* is a discovery record with ``crop``, ``slug`` and ``url`` keys
    (``lastmod`` optional). The page is fetched through *http*; a non-2xx
    status raises via ``raise_for_status()``. Corn and soybean pages get
    their spec sections parsed; wheat pages only receive identity fields.
    """
    crop, slug, url = rec["crop"], rec["slug"], rec["url"]
    variety = FCVariety(
        source_key=f"firstchoice-{slug}",
        source_url=url,
        crop=crop,
        trait_stack=_trait_stack(slug, crop),
        sitemap_last_modified=rec.get("lastmod"),
    )

    response = http.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Fall back to the whole document when the expected wrapper is absent.
    article = soup.find("article", class_="content") or soup
    variety.product_name = _product_name(article, slug)

    # wheat: thin pages — identity only (no spec sections to parse).
    section_parser = {"corn": _parse_corn, "soybeans": _parse_soy}.get(crop)
    if section_parser is not None:
        section_parser(article, variety)
    return variety
|
|
|
|
|
|
# --------------------------------------------------------------------- render
|
|
|
|
|
|
def render_markdown(v: FCVariety) -> str:
    """Render *v* as the markdown body written to ``<source_key>.md``.

    Layout: title, a bullet list of identity facts, the optional
    positioning line and field notes, one ``##`` section per rating group,
    and — for wheat records without groups — an identity-only note. Lines
    are joined with ``\\n``.
    """
    crop_names = {"corn": "Corn", "soybeans": "Soybeans", "wheat": "Wheat"}
    crop_label = crop_names.get(v.crop, v.crop.title())

    lines: list[str] = [
        f"# {v.product_name}",
        "",
        "- **Vendor:** 1st Choice Seeds (independent, employee-owned)",
        "- **Brand:** 1st Choice Seeds",
        f"- **Crop:** {crop_label}",
    ]

    # Exactly one crop-specific maturity/class bullet can apply.
    if v.crop == "corn":
        if v.relative_maturity is not None:
            lines.append(f"- **Relative maturity:** {v.relative_maturity} day")
    elif v.crop == "soybeans":
        if v.maturity_group is not None:
            lines.append(f"- **Maturity group:** {v.maturity_group}")
    elif v.crop == "wheat":
        if v.wheat_class:
            lines.append(f"- **Wheat class:** {v.wheat_class}")

    if v.trait_stack:
        lines.append(f"- **Trait(s):** {', '.join(v.trait_stack)}")
    lines.extend([
        f"- **Source:** {v.source_url}",
        f"- **Rating scale:** {RATING_SCALE_DIRECTION}",
        "- **Service area:** 1st Choice Seeds dealer network — "
        "Eastern Corn Belt (IN/OH/KY/TN), Rushville, IN",
        "",
    ])

    if v.positioning:
        lines.extend(["---", "", f"_{v.positioning}_", ""])
    if v.strengths:
        lines.extend(["---", "", "## Field Notes", ""])
        lines.extend(f"- {strength}" for strength in v.strengths)
        lines.append("")
    lines.extend(["---", ""])

    for group in v.groups:
        lines.extend([f"## {group['label'].title()}", ""])
        for entry in group["items"]:
            # An empty value is shown as an em dash.
            shown = entry["value"] or "—"
            lines.append(f"- **{entry['characteristic']}:** {shown}")
        lines.append("")

    if v.crop == "wheat" and not v.groups:
        lines.extend(["_Identity record only — 1st Choice wheat is private-label "
                      "and the catalog page carries no agronomic spec block._", ""])
    return "\n".join(lines)
|
|
|
|
|
|
def write_variety(v: FCVariety, body_md: str) -> None:
    """Write the markdown body and its JSON sidecar for *v* into CORPUS_DIR.

    Creates the directory if needed, then writes ``<source_key>.md``
    followed by ``<source_key>.json`` (UTF-8, 2-space indent, non-ASCII
    kept as-is, trailing newline). ``fetched_at`` is the current UTC time.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{v.source_key}.md"
    json_path = CORPUS_DIR / f"{v.source_key}.json"

    md_path.write_text(body_md, encoding="utf-8")

    dealer_network = {
        "product_list_name": "1st Choice Seeds dealer network "
                             "(Eastern Corn Belt — IN/OH/KY/TN)",
        "agronomist": None,
        "agronomist_email": None,
        "variant_id": None,
    }
    sidecar = dict(
        source="first_choice",
        source_key=v.source_key,
        vendor="1st Choice Seeds",
        brand="1st Choice Seeds",
        product_name=v.product_name,
        product_id=v.product_name,
        crop=v.crop,
        release_year=None,
        relative_maturity=v.relative_maturity,
        maturity_group=v.maturity_group,
        wheat_class=v.wheat_class,
        trait_stack=v.trait_stack,
        trait_descriptions=[],
        positioning_statement=v.positioning,
        strengths=v.strengths,
        characteristics_groups=v.groups,
        _scale_direction=RATING_SCALE_DIRECTION,
        regional_recommendations=[dealer_network],
        image_url=None,
        source_urls=[v.source_url],
        sitemap_last_modified=v.sitemap_last_modified,
        fetched_at=datetime.now(timezone.utc).isoformat(),
        scraper_version=SCRAPER_VERSION,
    )
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"
    json_path.write_text(payload, encoding="utf-8")
|
|
|
|
|
|
# --------------------------------------------------------------------- pipeline
|
|
|
|
|
|
def run(*, limit: int | None, force: bool,
        only_crop: str | None, only_product: str | None) -> int:
    """Discover varieties, then fetch, parse and write each one.

    Args:
        limit: Stop after this many varieties have been visited (skipped
            and failed ones count); ``None`` means no limit.
        force: Re-fetch even when the markdown file already exists.
        only_crop: Restrict discovery to one crop key, or ``None`` for all.
        only_product: Restrict to one variety, matched (case-insensitively)
            against the source_key or the bare slug.

    Returns:
        0 on completion (per-variety fetch failures are logged and
        counted, not fatal); 2 when ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()
    records = discover(http, only_crop=only_crop)

    if only_product:
        key = only_product.lower()
        records = [r for r in records
                   if f"firstchoice-{r['slug']}" == key or r["slug"] == key]
        if not records:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "empty": 0, "failed": 0}
    processed = 0
    for rec in records:
        if limit is not None and processed >= limit:
            break
        processed += 1
        source_key = f"firstchoice-{rec['slug']}"
        md_path = CORPUS_DIR / f"{source_key}.md"
        if md_path.exists() and not force:
            counts["skipped"] += 1
            log.info("[%d/%d] %s skipped", processed, len(records), source_key)
            continue
        try:
            v = parse_detail(http, rec)
        except requests.RequestException as exc:
            # RequestException, not just HTTPError: the session re-raises
            # connection errors / timeouts once its retries are exhausted,
            # and one unreachable page must not abort the remaining ones.
            counts["failed"] += 1
            log.error("[%d/%d] %s detail fetch failed: %s",
                      processed, len(records), source_key, exc)
            continue
        if not v.groups:
            # Still written below: thin pages get an identity-only record.
            counts["empty"] += 1
            log.warning("[%d/%d] %s — no spec groups parsed (writing identity%s)",
                        processed, len(records), source_key,
                        "; thin wheat page" if v.crop == "wheat" else "")
        write_variety(v, render_markdown(v))
        counts["written"] += 1
        log.info("[%d/%d] %s written | crop=%s rm/mg=%s groups=%d traits=%s",
                 processed, len(records), source_key, v.crop,
                 v.relative_maturity or v.maturity_group or "-",
                 len(v.groups), ",".join(v.trait_stack) or "-")

    log.info("done: processed=%d written=%d skipped=%d empty_groups=%d failed=%d (of %d)",
             processed, counts["written"], counts["skipped"], counts["empty"],
             counts["failed"], len(records))
    return 0
|
|
|
|
|
|
# --------------------------------------------------------------------- CLI
|
|
|
|
|
|
def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper.

    Options: ``--limit``, ``--force``, ``--crop`` (one of the
    ``CROP_SITEMAPS`` keys), ``--product`` and ``--log-level`` (default
    taken from the ``LOG_LEVEL`` environment variable, else ``INFO``).
    """
    description = ("Scrape 1st Choice Seeds (independent, employee-owned — "
                   "Rushville, IN) — corn / soybeans / wheat via sitemaps "
                   "+ detail pages.")
    parser = argparse.ArgumentParser(
        prog="scrape.sources.first_choice",
        description=description,
    )
    parser.add_argument(
        "--limit", type=int, default=None,
        help="Stop after processing N varieties (default: all).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop", default=None, choices=sorted(CROP_SITEMAPS),
        help="Limit to one crop (corn / soybeans / wheat).",
    )
    parser.add_argument(
        "--product", default=None,
        help="Process a single variety by source_key or slug.",
    )
    default_level = os.environ.get("LOG_LEVEL", "INFO")
    parser.add_argument("--log-level", default=default_level)
    return parser
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv*, configure logging, run the scrape.

    *argv* defaults to the process arguments when ``None``. Log records go
    to stderr. Returns the exit code produced by ``run``.
    """
    parser = _build_argparser()
    args = parser.parse_args(argv)
    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=args.log_level.upper(),
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
|
|
|
|
|
|
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|