"""LG Seeds scraper — AgReliant Genetics brand. Source: ``www.lgseeds.com`` — WordPress site. Empty robots.txt (no Disallow). Catalog covers 4 crops: corn, soybeans, alfalfa, sorghum. Two-layer fetch: 1. **Listing page** (one per crop): inline JavaScript variable ``products = [{...}, ...]`` carries the full variety summary — Variety code, Maturity, Traits[], Bullets[], CropType. No per-variety HTTP needed for identity. 2. **Detail page** (``/products//``): rich plant characteristics + disease tolerance + management ratings, rendered as ``
`` blocks with ```` where N ∈ 1-9 is the rating. Same convention as Bayer/Golden Harvest (9 = best). LG Seeds is a regional brand (Eastern Corn Belt focus) under AgReliant Genetics, the same parent as AgriGold. Brand voice is distinct so we keep them in separate scrapers. Rating scale: ``1-9 (9 = best)`` — verified empirically on the bar-N markup; matches Bayer / Golden Harvest convention. Output: corpus/lg_seeds/.md corpus/lg_seeds/.json source_key: ``lg-`` lowercased, e.g. ``lg-lg5701``, ``lg-c3400`` (soybean — codes don't use LG prefix), ``lg-7c300`` (alfalfa), ``lg-silo-max-100`` (sorghum). CLI: python -m scrape.sources.lg_seeds --crop corn --limit 5 python -m scrape.sources.lg_seeds --force """ from __future__ import annotations import argparse import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.lgseeds.com" # Crops listed in nav. Each has a listing page at /products/ # with an inline `var products = [...]` JSON blob. 
LISTING_PATHS = {
    "corn": "/products/corn",
    "soybeans": "/products/soybeans",
    "alfalfa": "/products/alfalfa",
    "sorghum": "/products/sorghum",
}

RATING_SCALE_DIRECTION = "1-9 (9 = best)"

REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "lg_seeds"

REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.lg_seeds")


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """``requests.Session`` wrapper that spaces out calls and retries.

    Consecutive requests are kept at least ``interval`` seconds apart.
    Network errors, HTTP 429 and HTTP 5xx are retried with backoff.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to honour ``interval`` since the last call."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send a request, retrying transient failures.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Absolute URL to fetch.
            max_retries: Total number of attempts; values below 1 are
                treated as a single attempt.
            timeout: Per-attempt timeout in seconds.
            **kw: Extra keyword arguments forwarded to ``requests``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when every
            attempt was retryable — the response of the final attempt (the
            caller decides whether to ``raise_for_status``).

        Raises:
            requests.RequestException: If the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            final = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc = exc
                if final:
                    # No further attempt follows, so sleeping would only
                    # delay the failure.
                    break
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning(
                    "network error on %s %s: %s — retry in %.1fs",
                    method, url, exc, backoff,
                )
                time.sleep(backoff)
                continue
            # A response arrived, so an error from an earlier attempt is
            # stale and must not be raised at the end.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final:
                    break
                ra = resp.headers.get("Retry-After")
                backoff = (
                    float(ra)
                    if (ra and ra.isdigit())
                    else min(30.0, (2 ** attempt) + random.random())
                )
                log.warning(
                    "HTTP %d on %s %s — retry in %.1fs",
                    resp.status_code, method, url, backoff,
                )
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class LGProduct:
    """One LG Seeds variety: listing-page identity plus detail-page ratings."""

    source_key: str
    source_url: str
    crop: str
    product_name: str = ""
    product_id: int | None = None
    maturity_raw: str | None = None  # corn RM days / soy MG / alfalfa FD / sorghum days
    fall_dormancy: str | None = None  # alfalfa only
    trait_descriptions: list[str] = field(default_factory=list)
    bullets: list[str] = field(default_factory=list)
    characteristics_groups: list[dict] = field(default_factory=list)


# --------------------------------------------------------------------- discovery

_VAR_RE = re.compile(
    r'var\s+\w+\s*=\s*(\[\{"Variety":.+?\}\]);',
    re.S,
)


def _variety_of(summary: dict) -> str:
    """Return the entry's ``Variety`` as a string ("" when missing or null)."""
    return str(summary.get("Variety") or "")


def discover_varieties(
    http: RateLimitedSession,
    *,
    only_crop: str | None = None,
) -> list[tuple[str, dict]]:
    """Return ``[(crop, summary_dict), ...]`` from each listing page's inline JSON.

    Summary dict has Variety / Id / Maturity / Traits / Bullets / CropType /
    FallDormancy. Entries without a ``Variety`` are dropped: they cannot be
    given a source key or a detail URL.

    Args:
        http: Session used to fetch the listing pages.
        only_crop: When set, only this key of ``LISTING_PATHS`` is fetched.

    Raises:
        requests.HTTPError: If a listing page responds with an error status.
    """
    out: list[tuple[str, dict]] = []
    for crop, path in LISTING_PATHS.items():
        if only_crop and crop != only_crop:
            continue
        log.info("fetching listing %s%s", BASE, path)
        r = http.get(f"{BASE}{path}")
        r.raise_for_status()
        m = _VAR_RE.search(r.text)
        if not m:
            log.warning("no products array in %s", path)
            continue
        try:
            items = json.loads(m.group(1))
        except json.JSONDecodeError as exc:
            log.error("JSON parse failed for %s: %s", path, exc)
            continue
        usable = [
            it for it in items if isinstance(it, dict) and it.get("Variety")
        ]
        if len(usable) != len(items):
            log.warning(
                "skipping %d listing entries without a Variety in %s",
                len(items) - len(usable), path,
            )
        log.info("  %s: %d varieties", crop, len(usable))
        out.extend((crop, it) for it in usable)
    log.info("total varieties discovered: %d", len(out))
    return out


# --------------------------------------------------------------------- helpers


def source_key_for(variety: str) -> str:
    """Slugify the variety code into a stable source_key."""
    slug = re.sub(r"[^a-zA-Z0-9-]+", "-", variety).strip("-").lower()
    return f"lg-{slug}"


_BAR_CLASS_RE = re.compile(r"^bar-(\d)$")
_SECTION_CLASS_RE = re.compile(r"product-section")

# (substring of the section's CSS classes, label) — checked in this order
# when a section has no usable heading.
_SECTION_CLASS_HINTS = (
    ("disease", "Disease Tolerance"),
    ("management", "Management"),
    ("plantcharacteristics", "Characteristics"),
)


def _parse_bar_value(span_classes: list[str]) -> int | None:
    """Extract the integer rating from a ``bar-N`` CSS class."""
    for c in span_classes or []:
        m = _BAR_CLASS_RE.match(c)
        if m:
            return int(m.group(1))
    return None


# --------------------------------------------------------------------- detail


def _section_label(section: Any) -> str:
    """Return the section's first non-empty h2/h3/h4 text, else a CSS-class hint."""
    for heading in section.find_all(["h2", "h3", "h4"]):
        text = heading.get_text(strip=True)
        if text:
            return text
    css = " ".join(section.get("class", [])).lower()
    for needle, label in _SECTION_CLASS_HINTS:
        if needle in css:
            return label
    return ""


def _bar_items(bars: Any) -> list[dict]:
    """Turn ``.characteristics-bar`` elements into characteristic/value dicts."""
    items: list[dict] = []
    for bar in bars:
        name_el = bar.find(class_="product-name")
        name = (name_el.get_text(" ", strip=True) if name_el else "").strip()
        if not name:
            continue
        value_span = bar.find("span", class_=_BAR_CLASS_RE)
        rating = _parse_bar_value(value_span.get("class") if value_span else [])
        if rating is not None:
            items.append({"characteristic": name, "value": str(rating)})
            continue
        # Some "bars" are actually qualitative (e.g. "Tar Spot Susceptible",
        # "Fungicide Response High"). For those we keep the remaining text
        # as the value rather than a missing rating.
        inner_text = bar.get_text(" ", strip=True)
        if inner_text.startswith(name):
            inner_text = inner_text[len(name):].strip()
        items.append({"characteristic": name, "value": inner_text or "-"})
    return items


def _parse_characteristics(html: str) -> list[dict]:
    """Parse a detail page into ``characteristics_groups`` entries.

    Each ``div`` whose class matches ``product-section`` and that contains
    ``.characteristics-bar`` rows becomes one group. Common LG section
    labels: "Characteristics" / "Management" / "Disease Tolerance".
    """
    soup = BeautifulSoup(html, "html.parser")
    groups: list[dict] = []
    for section in soup.find_all("div", class_=_SECTION_CLASS_RE):
        bars = section.find_all("div", class_="characteristics-bar")
        if not bars:
            continue
        items = _bar_items(bars)
        if not items:
            continue
        label = _section_label(section) or "Characteristics"
        groups.append({"label": label.upper(), "type": "bars", "items": items})
    return groups


def fetch_product_detail(
    http: RateLimitedSession,
    summary: dict,
    crop: str,
) -> LGProduct:
    """Fetch the detail page and merge characteristics into an LGProduct
    seeded by the listing-page summary.

    Args:
        http: Session used for the detail request.
        summary: One listing-page entry (see ``discover_varieties``).
        crop: Crop key; becomes the ``/products/<crop>/`` URL segment.

    Returns:
        The product. If the detail request fails, the product carries only
        the listing-page identity and ``characteristics_groups`` stays empty.
    """
    from urllib.parse import quote

    variety = _variety_of(summary)
    # LG's detail URL is /products/<crop>/<variety>. The Variety in the
    # listing JSON appears in correct case; LG seems to accept any case
    # but we use what's published. The segment is percent-encoded so that
    # names containing spaces still give a valid stored source_url.
    url = f"{BASE}/products/{crop}/{quote(variety, safe='')}"
    maturity = summary.get("Maturity")
    fall_dormancy = summary.get("FallDormancy")
    prod = LGProduct(
        source_key=source_key_for(variety),
        source_url=url,
        crop=crop,
        product_name=variety,
        product_id=summary.get("Id"),
        maturity_raw=str(maturity) if maturity is not None else None,
        fall_dormancy=str(fall_dormancy) if fall_dormancy else None,
        trait_descriptions=list(summary.get("Traits") or []),
        bullets=list(summary.get("Bullets") or []),
    )

    try:
        r = http.get(url)
        r.raise_for_status()
    except requests.RequestException as exc:
        log.warning("detail fetch failed for %s: %s", variety, exc)
        return prod  # identity-only fallback

    prod.characteristics_groups = _parse_characteristics(r.text)
    return prod


# --------------------------------------------------------------------- render

_CROP_LABELS = {
    "corn": "Corn",
    "soybeans": "Soybeans",
    "alfalfa": "Alfalfa",
    "sorghum": "Sorghum",
}

# Header bullet used for ``maturity_raw``, per crop.
_MATURITY_LABELS = {
    "corn": "Relative maturity",
    "soybeans": "Maturity group",
    "alfalfa": "Fall dormancy / maturity",
    "sorghum": "Days to maturity",
}


def _md_cell(value: Any) -> str:
    """Make *value* safe inside a one-line Markdown table cell."""
    return " ".join(str(value).split()).replace("|", "\\|")


def render_markdown(p: LGProduct) -> str:
    """Render a product as Markdown.

    Output is a header bullet list, a ``---`` rule, an optional
    "Strengths" list, and one table per characteristics group. The rule is
    followed by a blank line before the first section, and table cells
    have ``|`` escaped and whitespace collapsed.
    """
    title = p.product_name or p.source_key
    crop_label = _CROP_LABELS.get(p.crop, p.crop.title())
    head: list[str] = [
        f"# {title}",
        "",
        "- **Vendor:** AgReliant Genetics",
        "- **Brand:** LG Seeds",
        f"- **Crop:** {crop_label}",
    ]
    maturity_label = _MATURITY_LABELS.get(p.crop)
    if p.maturity_raw and maturity_label:
        head.append(f"- **{maturity_label}:** {p.maturity_raw}")
    if p.trait_descriptions:
        head.append(f"- **Traits:** {', '.join(p.trait_descriptions)}")
    head.append(f"- **Source:** {p.source_url}")
    head.append(f"- **Rating scale (LG Seeds):** {RATING_SCALE_DIRECTION}")
    head.extend(["", "---", ""])

    sections: list[str] = []
    if p.bullets:
        bullets = "\n".join(f"- {b}" for b in p.bullets)
        sections.append("## Strengths\n\n" + bullets + "\n")

    for g in p.characteristics_groups:
        label = (g.get("label") or "Characteristics").title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )

    header = "\n".join(head)
    if not sections:
        return header
    # ``header`` ends with "---\n"; the extra newline separates the rule
    # from the first section heading.
    return header + "\n" + "\n".join(sections)


# --------------------------------------------------------------------- write


def _atomic_write_text(path: Path, text: str) -> None:
    """Write *text* to a sibling temp file, then rename it over *path*."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    os.replace(tmp, path)


def write_product(prod: LGProduct, body_md: str) -> None:
    """Write ``<source_key>.json`` and ``<source_key>.md`` under ``CORPUS_DIR``.

    The Markdown file is written last because ``process_product`` treats its
    existence as "already scraped"; an interrupted run therefore never
    leaves a Markdown file without its JSON sidecar.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "lg_seeds",
        "source_key": prod.source_key,
        "vendor": "AgReliant Genetics",
        "brand": "LG Seeds",
        "product_name": prod.product_name,
        "product_id": prod.product_id,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": prod.crop,
        "release_year": None,
        # Maturity routing: corn = RM days, soy = MG, alfalfa = FD,
        # sorghum = days-to-maturity. Stored in the canonical fields
        # so the chunker's crop-aware preamble works.
        "relative_maturity": prod.maturity_raw if prod.crop in ("corn", "sorghum") else None,
        "maturity_group": prod.maturity_raw if prod.crop == "soybeans" else None,
        "fall_dormancy": prod.maturity_raw if prod.crop == "alfalfa" else prod.fall_dormancy,
        "wheat_class": None,
        "trait_stack": prod.trait_descriptions,  # LG publishes full names, not codes
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": None,
        "strengths": prod.bullets,
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": None,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _atomic_write_text(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    _atomic_write_text(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def process_product(
    http: RateLimitedSession,
    summary: dict,
    crop: str,
    *,
    force: bool,
) -> tuple[str, LGProduct | None]:
    """Scrape and write one variety.

    Returns:
        ``("skipped", None)`` when the Markdown file already exists and
        ``force`` is false, ``("failed", None)`` when fetching or parsing
        raised, otherwise ``("written", product)``.
    """
    variety = _variety_of(summary)
    source_key = source_key_for(variety)
    md_path = CORPUS_DIR / f"{source_key}.md"
    if md_path.exists() and not force:
        return "skipped", None

    try:
        prod = fetch_product_detail(http, summary, crop)
    except Exception as exc:  # noqa: BLE001
        log.error("variety %s failed: %s", variety, exc)
        return "failed", None

    body = render_markdown(prod)
    write_product(prod, body)
    return "written", prod


def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_product: str | None,
) -> int:
    """Discover varieties and process them.

    Args:
        limit: Maximum number of varieties to process (skips count too);
            ``None`` means all.
        force: Re-fetch varieties whose Markdown file already exists.
        only_crop: Restrict discovery to one crop.
        only_product: Restrict to one variety, matched against the source
            key exactly or the Variety code case-insensitively.

    Returns:
        Exit code: 0 on success, 1 if any variety failed, 2 if
        ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_varieties(http, only_crop=only_crop)
    if only_product:
        wanted = only_product.lower()
        targets = [
            (c, s)
            for (c, s) in targets
            if source_key_for(_variety_of(s)) == only_product
            or _variety_of(s).lower() == wanted
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    limit_label = str(limit) if limit is not None else "all"
    counts = {"written": 0, "skipped": 0, "failed": 0}
    processed = 0
    for crop, summary in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(http, summary, crop, force=force)
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | crop=%s maturity=%s traits=%d groups=%d",
                processed, limit_label,
                prod.source_key, status, prod.crop,
                prod.maturity_raw or "-",
                len(prod.trait_descriptions),
                len(prod.characteristics_groups),
            )
        else:
            log.info(
                "[%d/%s] %s %s",
                processed, limit_label,
                source_key_for(_variety_of(summary)), status,
            )

    log.info(
        "done: processed=%d written=%d skipped=%d failed=%d (of %d candidates)",
        processed, counts["written"], counts["skipped"], counts["failed"],
        len(targets),
    )
    return 0 if counts["failed"] == 0 else 1


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for the scraper."""
    p = argparse.ArgumentParser(
        prog="scrape.sources.lg_seeds",
        description="Scrape LG Seeds (AgReliant Genetics) — corn / "
                    "soybeans / alfalfa / sorghum.",
    )
    p.add_argument("--limit", type=int, default=None,
                   help="Stop after processing N varieties (default: all).")
    p.add_argument("--force", action="store_true",
                   help="Re-fetch even if the markdown file already exists.")
    p.add_argument("--crop", default=None, choices=list(LISTING_PATHS),
                   help="Limit to one crop.")
    p.add_argument("--product", default=None,
                   help="Process a single variety by source_key or Variety code.")
    p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
    return p


def main(argv: list[str] | None = None) -> int:
    """Parse *argv*, configure logging to stderr, and return ``run``'s exit code."""
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )


if __name__ == "__main__":
    sys.exit(main())