"""NK (Syngenta) seed scraper — corn + soybeans. Source: ``syngenta-us.com`` — ASP.NET WebForms catalog with an ASMX-style JSON endpoint for the seed-finder UI, plus tech-sheet PDFs on the Syngenta CDN at ``assets.syngenta-us.com/pdf/techsheets/_YYMMDD.pdf``. Expected count: 29 varieties (12 corn + 17 soy on 2026-05-25). No wheat. Discovery: the HTML catalog pages (``/corn/nk/products``, ``/soybeans/nk/products``) load product cards via JS. The JS calls POST /NKSeeds/CornProductFinder.aspx/GetProducts POST /NKSeeds/SoyProductFinder.aspx/GetProducts Both endpoints return ASP.NET's ``{"d": "..."}`` wrapper where ``d`` is a string of HTML fragments separated by `` @ `` containing one ``
`` per variety. Each card carries: - product code (e.g. ``NK8005`` / ``NK008-P8XF``) - RM days (corn) / MG decimal (soy) in a ```` next to the title - "Brands Available" line listing trait variants (NK8005-V, NK8005-GT/LL — these are trait-specific SKUs) - positioning slogan + bullet-list of strengths - tech-sheet PDF URL Per-variety disease ratings live ONLY in the PDF tech sheets (the HTML cards have marketing text but no rating numbers). We extract disease ratings via ``pdfplumber`` text extraction — they appear as "Label Number" lines that we parse with a regex. **Rating-scale direction**: NK explicitly publishes ``1-9 Scale: 1 = Best, Tallest or Highest; 9 = Worst, Shortest or Lowest`` on every tech sheet — REVERSED from Bayer/Golden Harvest. The chunker preserves values verbatim and the sidecar's ``_scale_direction`` field declares this so the LLM correctly interprets the chunk preamble. **Agronomic ratings**: rendered as horizontal bar charts in the PDF; pdfplumber's text extraction captures the LABELS (Emergence, Stalk Strength, Drought, etc.) but NOT the bar values. Surfacing those would require either OCR of the bar positions or pdfplumber's geometric layout parsing — deferred. For now the chunk records the labels and an explicit "agronomic ratings rendered as chart bars in the source PDF — values not currently extracted" annotation so the agent knows to direct the farmer at the tech-sheet PDF for those numbers. Tech-sheet PDF URLs come from the API response (live URL is correct; the assets-host filenames include a YYMMDD that changes). Output: corpus/nk/.md corpus/nk/.json source_key convention: ``nk-`` lowercased, e.g. ``nk-nk8005`` or ``nk-nk008-p8xf``. CLI: python -m scrape.sources.nk --limit 5 python -m scrape.sources.nk --crop corn --limit 12 python -m scrape.sources.nk --force """ from __future__ import annotations import argparse import io import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup import pdfplumber SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.syngenta-us.com" CORN_LIST_URL = f"{BASE}/corn/nk/products" SOY_LIST_URL = f"{BASE}/soybeans/nk/products" CORN_API = f"{BASE}/NKSeeds/CornProductFinder.aspx/GetProducts" SOY_API = f"{BASE}/NKSeeds/SoyProductFinder.aspx/GetProducts" # NK + AgriPro both use the "1 = best, lower = more resistant" convention. # Confirmed by tech-sheet footer: "1-9 Scale: 1 = Best...; 9 = Worst..." RATING_SCALE_DIRECTION = "1-9 (1 = best, lower = more resistant)" REPO_ROOT = Path(__file__).resolve().parents[2] CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "nk" REQ_INTERVAL_SEC = 1.0 log = logging.getLogger("scrape.nk") # --------------------------------------------------------------------- HTTP class RateLimitedSession: def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: self.s = requests.Session() self.s.headers["User-Agent"] = USER_AGENT self.interval = interval self._last = 0.0 def _wait(self) -> None: delta = time.monotonic() - self._last if delta < self.interval: time.sleep(self.interval - delta) self._last = time.monotonic() def request( self, method: str, url: str, *, max_retries: int = 4, timeout: float = 30.0, **kw: Any, ) -> requests.Response: last_exc: Exception | None = None for attempt in range(max_retries): self._wait() try: resp = self.s.request(method, url, timeout=timeout, **kw) except requests.RequestException as exc: last_exc = exc backoff = min(30.0, (2 ** attempt) + random.random()) log.warning("network error on %s %s: %s — retry in %.1fs", method, url, exc, backoff) time.sleep(backoff) continue if resp.status_code == 429 or 500 <= resp.status_code < 600: ra = resp.headers.get("Retry-After") backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) log.warning("HTTP %d on %s %s — retry in %.1fs", resp.status_code, method, url, backoff) time.sleep(backoff) continue return resp if last_exc: raise last_exc return resp # type: ignore[return-value] def get(self, url: str, **kw: Any) -> requests.Response: return self.request("GET", url, **kw) def post(self, url: str, **kw: Any) -> requests.Response: return self.request("POST", url, **kw) # --------------------------------------------------------------------- model @dataclass class NKProduct: source_key: str source_url: str # the brand catalog page (closest thing to a per-variety URL) crop: str # "corn" | "soybeans" product_code: str = "" # NK8005 / NK008-P8XF relative_maturity: str | None = None # corn maturity_group: str | None = None # soy brand_variants: list[str] = field(default_factory=list) # ["NK8005-V", "NK8005-GT/LL"] trait_codes: list[str] = field(default_factory=list) trait_descriptions: list[str] = field(default_factory=list) positioning_statement: str | None = None strengths: list[str] = field(default_factory=list) techsheet_url: str | None = None characteristics_groups: list[dict] = field(default_factory=list) # --------------------------------------------------------------------- discovery def _api_payload_corn(rm_low: str, rm_high: str) -> str: """Payload for ``CornProductFinder.aspx/GetProducts``.""" return json.dumps({ "cornCount": "1", "rmLowerRange": rm_low, "rmUpperRange": rm_high, "brands": "NK", "agisuraTraits": "", "insectResistance": "", "herbicideTolerance": "", "waterOptimization": "", "reducedRefuge": "", "diseaseResistence": "", "silage": "", "path": "false", "currentUrl": CORN_LIST_URL, "fieldForged": "", "newProduct": "", }) def _api_payload_soy(rm_low: str, rm_high: str) -> str: return json.dumps({ "soyaBeanCount": "1", "rmLowerRange": rm_low, "rmUpperRange": rm_high, "herbicideTolerance": "", "diseaseFilter": "", "nematodeFilter": "", "agroPlantCharFilter": "", "plantHeightFilter": "", "brands": "NK", "browserURL": SOY_LIST_URL, "fieldForged": "", "newProduct": "", }) def _parse_card(html_chunk: str, crop: str) -> NKProduct | None: """Parse one ``
`` card from the API response into an NKProduct.""" soup = BeautifulSoup(html_chunk, "html.parser") title_el = soup.find(class_="sf-result-title") if not title_el: return None # Title contains code + RM tail code = (title_el.contents[0].strip() if title_el.contents else "").strip() if not code: return None rm_str: str | None = None span = title_el.find("span") if span: # span text is like "RM\n80" — strip to digits/decimal text = span.get_text(" ", strip=True) m = re.search(r"(\d+(?:\.\d+)?)", text) if m: rm_str = m.group(1) prod = NKProduct( source_key=f"nk-{code.lower()}", # NK doesn't expose per-variety URLs; the brand catalog is the # nearest equivalent. lookup_variety / get_page will still work # via source_key. source_url=CORN_LIST_URL if crop == "corn" else SOY_LIST_URL, crop=crop, product_code=code, ) if rm_str is not None: if crop == "corn": prod.relative_maturity = rm_str else: prod.maturity_group = rm_str # Brands Available (trait variants). inner = soup.find(class_="sf-result-content-inner") if inner: # The first with "Brands available:" or # "Herbicide Tolerant Trait(s):" sets the trait context. for strong in inner.find_all("strong"): text = strong.get_text(" ", strip=True) if text.lower().startswith("brands available"): rest = text.split(":", 1)[1] if ":" in text else "" for v in rest.split("|"): v = v.strip() if v: prod.brand_variants.append(v) elif text.lower().startswith("herbicide tolerant trait"): rest = text.split(":", 1)[1] if ":" in text else "" for t in rest.split(","): t = t.strip() if t: prod.trait_codes.append(t) else: # Positioning slogan is also rendered as a bare . if not prod.positioning_statement and len(text) > 12: prod.positioning_statement = text # Bullet strengths ul = inner.find("ul") if ul: for li in ul.find_all("li"): t = li.get_text(" ", strip=True) if t: prod.strengths.append(t) # Tech-sheet PDF URL. for a in soup.find_all("a", href=True): h = a["href"] if "assets.syngenta-us.com/pdf/techsheets/" in h and h.lower().endswith(".pdf"): prod.techsheet_url = h break return prod def discover_products( http: RateLimitedSession, *, only_crop: str | None = None, ) -> list[NKProduct]: """Hit the corn + soy product-finder APIs and parse the returned HTML cards into NKProducts. Returns identity-level data only; ratings come from the per-variety tech-sheet PDF in ``enrich_with_pdf``.""" # Warm the session cookie (some Syngenta deployments need it). http.get(CORN_LIST_URL) out: list[NKProduct] = [] headers = { "Content-Type": "application/json; charset=utf-8", "X-Requested-With": "XMLHttpRequest", } def _parse_response(html_blob: str, crop: str) -> int: """Parse the API response's inner HTML into NKProducts. The endpoint emits one ``
`` per variety, each wrapped in a ``
`` column. Strip the leading ``@`` markers and let BeautifulSoup tokenize the whole blob — no per-chunk split (the API doesn't actually delimit with ``@`` reliably, despite appearances). """ n = 0 # Strip leading " @ " noise (rendered by the JS when filters # change, not a structural delimiter). cleaned = html_blob.replace("@", "").strip() soup = BeautifulSoup(cleaned, "html.parser") for card in soup.find_all("div", class_="sf-result"): prod = _parse_card(str(card), crop) if prod: out.append(prod) n += 1 return n if only_crop in (None, "corn"): log.info("fetching NK corn product list") r = http.post( CORN_API, data=_api_payload_corn("75", "120"), headers={**headers, "Referer": CORN_LIST_URL}, ) r.raise_for_status() n = _parse_response(r.json().get("d") or "", "corn") log.info("corn cards parsed: %d", n) if only_crop in (None, "soybeans"): log.info("fetching NK soy product list") r = http.post( SOY_API, data=_api_payload_soy("0", "9.9"), headers={**headers, "Referer": SOY_LIST_URL}, ) r.raise_for_status() n = _parse_response(r.json().get("d") or "", "soybeans") log.info("soy cards parsed: %d", n) log.info("total: %d NK varieties", len(out)) return out # --------------------------------------------------------------------- PDF def _extract_disease_ratings(text: str) -> list[dict]: """Pull disease-tolerance ratings out of the tech-sheet PDF text. The PDF renders disease ratings as a left-column-label / right- column-number layout. pdfplumber's ``extract_text`` interleaves the agronomic-chart labels (no number) with the disease-rating labels + numbers, so we just look for lines ending in a numeric rating or a literal ``-`` (not available). Returns a list of ``{characteristic, value}``. Values are preserved as strings (including ``-`` for "not available"). """ # The disease list per tech sheet is small (~10 conditions) and # the labels are stable. We anchor on the known label set rather # than try to guess by layout. known_diseases = [ "Gray Leaf Spot", "Northern Corn Leaf Blight", "Goss's Wilt", "Goss's wilt", "Bacterial Leaf Streak", "Bacterial Corn Leaf Streak", "Southern Corn Leaf Blight", "Anthracnose Stalk Rot", "Anthracnose Leaf Blight", "Tar Spot", "Fusarium Crown Rot", "Common Rust", "Southern Rust", "Eye Spot", "Stewart's Bacterial Wilt", # Soybean "Brown Stem Rot", "Charcoal Rot", "Frogeye Leaf Spot", "Iron Deficiency Chlorosis", "Phytophthora Root Rot", "Sclerotinia White Mold", "White Mold", "Soybean Cyst Nematode", "Sudden Death Syndrome", "Southern Stem Canker", "Stem Canker", "Soybean Mosaic Virus", ] items: list[dict] = [] for line in text.splitlines(): line = line.strip() if not line: continue # Match "