Merge pull request 'golden_harvest: implement scraper (~175 Syngenta corn + soy)' (#4) from golden-harvest-scraper into main
Image rebuild (skip scrape) / build (push) Failing after 6s
Image rebuild (skip scrape) / build (push) Failing after 6s
This commit was merged in pull request #4.
This commit is contained in:
@@ -1,42 +1,679 @@
|
||||
"""Golden Harvest (Syngenta) seed scraper — corn + soybeans.

Source: ``www.goldenharvestseeds.com`` — ASP.NET WebForms site,
server-rendered HTML (no Next.js / SPA), so no headless browser is
required. robots.txt is permissive (no Disallow for /products/).

Discovery: ``/sitemap-ghs-hybrids.xml`` lists ~175 product URLs
(expected: 89 corn + 86 soy; no wheat) under ``/products/corn/`` and
``/products/soybean/``. The sitemap also references thousands of
regional plot-report pages (``/plot-report/<state>/<year>/<id>``,
~7,800 regional yield-trial records) that we are NOT indexing. Those
are head-to-head trial results: useful, and a high-value future ingest
for regional placement recommendations, but a separate corpus from
variety identity. Out of scope for v1 — defer to a future
``gh_plot_reports`` source.

A subset of the sitemap-listed product URLs 302-redirect to the
generic ``/<crop>/product-finder/`` page — those are discontinued
varieties Golden Harvest still lists in the sitemap. We do NOT
follow redirects; 302 → skip.

Per-variety data lives in the page HTML in two shapes:

1. **Tables** — ``<table>`` elements with two columns
   (label, value). For corn pages: plant description, maturity
   (RM days / GDU), planting rate. For soy pages: plant description,
   seed quality + herbicide responses, Phytophthora / SCN genes.

2. **Bar charts** — ``<div class="bar-row">`` elements inside
   ``#dvDiseaseTolerance`` and ``#dvAgronomicChar``. Each bar's
   ``data-percentage="N"`` value encodes the rating: percent / 10
   = rating on the 1-9 scale (9 = best, same as Bayer). Empty
   ``<div class="bar-wrapper">`` content means "no data".

Rating scale: per CLAUDE.md the recon described GH ratings as a
"9-to-1 reversed" scale (9 = best, 1 = worst, to be normalized at
chunk time), but inspection of the rendered bars + the published
"rating 9 = best" convention shows GH uses the canonical 1-9
(9 = best) direction — same as Bayer/NK/AgriPro. No flip needed. The
sidecar's ``_scale_direction`` field declares this so the chunker can
be forward-compatible if a future vendor genuinely reverses.

Tech-sheet PDFs: a link to ``assets.syngentaebiz.com/pdf/techsheets/
<CODE>_YYMMDD.pdf`` (the Syngenta CDN) appears in the product HTML.
The sitemap's ``sitemap-ghs-techsheets.xml`` has STALE date stamps
(250331 — the sitemap was generated 2025-03-31 and never updated), so
we always read the live URL from the product page, never the sitemap.
PDFs aren't ingested yet (recon flagged they're 14MB each, large)
but the URL is captured in the sidecar for the chunker / future
enrichment. When they are ingested, reuse the PDF-fetch helper that
NK uses — same fetcher pattern.

Output:
    corpus/golden_harvest/<source_key>.md      LLM-visible body
    corpus/golden_harvest/<source_key>.json    sidecar metadata

source_key convention: ``golden_harvest-<sku>`` lowercased, e.g.
``golden_harvest-e085z5`` or ``golden_harvest-gh00864xf``.

CLI:
    python -m scrape.sources.golden_harvest --limit 5
    python -m scrape.sources.golden_harvest --crop corn --limit 20
    python -m scrape.sources.golden_harvest --force
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Written into every sidecar as ``scraper_version``.
SCRAPER_VERSION = "0.1.0"
# Sent as the User-Agent header on every request made by RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
BASE = "https://www.goldenharvestseeds.com"
# Sitemap that discover_products() reads to enumerate product pages.
SITEMAP_HYBRIDS = f"{BASE}/sitemap-ghs-hybrids.xml"

# Schema crop name -> URL path fragment used to recognise product pages.
CROP_PATHS = {
    "corn": "/products/corn/",
    "soybeans": "/products/soybean/",  # URL uses "soybean", schema uses "soybeans"
}

# Bayer + Golden Harvest publish on identical 1-9 (9 = best) ratings
# despite recon mentioning "9-to-1" — the direction descriptor referred
# to the visual chart order, not the numeric meaning. Verified empirically.
RATING_SCALE_DIRECTION = "1-9 (9 = best)"

# Trait suffix → full name. Best-effort mapping from product-code
# suffix, since GH's HTML doesn't expose trait stack as a structured
# field. Maps verified against tech-sheet PDFs + public marketing.
TRAIT_SUFFIX_MAP = {
    # Corn
    "VIP3": "Agrisure Viptera® 3220 E-Z Refuge®",
    "VIP4": "Agrisure Viptera® 4 Trecepta®",
    "GT": "Agrisure GT (glyphosate tolerance)",
    "Z": "Agrisure Duracade® 5222 E-Z Refuge® (above + below-ground)",
    # Soy
    "XF": "XtendFlex® (Roundup Ready 2 Xtend + dicamba + glufosinate)",
    "E3": "Enlist E3® (2,4-D + glyphosate + glufosinate)",
}

# Two directories above this file's own directory.
REPO_ROOT = Path(__file__).resolve().parents[2]
# The CORPUS_ROOT environment variable, when set and non-empty, overrides
# the default ``<repo>/corpus`` output location.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "golden_harvest"

# Minimum spacing, in seconds, between consecutive HTTP requests.
REQ_INTERVAL_SEC = 1.0

log = logging.getLogger("scrape.golden_harvest")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- HTTP
|
||||
|
||||
|
||||
class RateLimitedSession:
    """Thin ``requests.Session`` wrapper with sleep-based rate limiting.

    Same shape as bayer_seeds' session. Requests are spaced at least
    ``interval`` seconds apart, and 429 / 5xx responses and network
    errors are retried with exponential backoff. We do NOT follow
    redirects by default: 302 from a product page → discontinued
    variety, skip.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # Monotonic timestamp of the most recent request; 0.0 means the
        # first request is never delayed.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between requests."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        allow_redirects: bool = False,
        **kw: Any,
    ) -> requests.Response:
        """Send one request, retrying on network errors and 429 / 5xx.

        Args:
            method: HTTP verb passed to ``requests.Session.request``.
            url: Target URL.
            max_retries: Total number of attempts; values below 1 are
                treated as 1 so at least one request is always made.
            timeout: Per-attempt timeout in seconds.
            allow_redirects: Forwarded to ``requests``; off by default.
            **kw: Extra keyword arguments forwarded to ``requests``.

        Returns:
            The first response that is not 429 / 5xx, or — when every
            attempt was exhausted — the response of the final attempt
            (callers are expected to ``raise_for_status()``).

        Raises:
            requests.RequestException: if the final attempt failed at the
                network level.
        """
        attempts = max(1, max_retries)
        resp: requests.Response | None = None
        last_exc: Exception | None = None
        for attempt in range(attempts):
            final_attempt = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(
                    method, url, timeout=timeout,
                    allow_redirects=allow_redirects, **kw,
                )
            except requests.RequestException as exc:
                # Only the outcome of the most recent attempt matters once
                # retries are exhausted, so forget any earlier response.
                last_exc, resp = exc, None
                if final_attempt:
                    break  # nothing left to retry; don't sleep for no reason
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if final_attempt:
                    break
                # Honour a numeric Retry-After header; otherwise back off
                # exponentially with jitter, capped at 30 s.
                ra = resp.headers.get("Retry-After")
                backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random())
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- model
|
||||
|
||||
|
||||
@dataclass
class GHProduct:
    # One scraped Golden Harvest variety: filled in by fetch_product_detail(),
    # rendered by render_markdown() and serialised by write_product().
    source_key: str  # "golden_harvest-<url tail, lowercased>"
    source_url: str  # product page URL taken from the sitemap
    crop: str                                   # "corn" | "soybeans"
    product_name: str = ""                      # e.g. "E085Z5"
    positioning_statement: str | None = None  # from <meta name="Description">
    relative_maturity: str | None = None        # corn (string of int)
    maturity_group: str | None = None           # soy (string of decimal)
    trait_codes: list[str] = field(default_factory=list)  # from derive_traits()
    trait_descriptions: list[str] = field(default_factory=list)  # parallel to trait_codes
    # Each group is {"label": str, "type": "chart" | "list" | "table",
    # "items": [{"characteristic": str, "value": str}, ...]}.
    characteristics_groups: list[dict] = field(default_factory=list)
    techsheet_url: str | None = None  # live PDF link found in the product HTML
    sitemap_last_modified: str | None = None  # <lastmod> value from the sitemap, if any
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- discovery
|
||||
|
||||
|
||||
def discover_products(
    http: RateLimitedSession,
    *,
    only_crop: str | None = None,
) -> list[tuple[str, str, str]]:
    """Return ``[(url, crop, lastmod), ...]`` for every GH product page in
    the hybrids sitemap.

    Args:
        http: Session used to download ``SITEMAP_HYBRIDS``.
        only_crop: When given, keep only URLs of that crop (a key of
            ``CROP_PATHS``).

    Returns:
        One tuple per product URL; ``lastmod`` is ``""`` when the sitemap
        entry carries no ``<lastmod>``.

    Raises:
        requests.HTTPError: if the sitemap request returns an error status.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from html import unescape

    log.info("fetching sitemap %s", SITEMAP_HYBRIDS)
    r = http.get(SITEMAP_HYBRIDS, allow_redirects=True)
    r.raise_for_status()
    entries = re.findall(
        r"<url>\s*<loc>([^<]+)</loc>\s*(?:<lastmod>([^<]+)</lastmod>)?",
        r.text,
    )
    out: list[tuple[str, str, str]] = []
    for raw_url, raw_lastmod in entries:
        # Sitemap <loc> values are XML text: entity-escaped and possibly
        # padded with whitespace. Decode them before using them as URLs.
        url = unescape(raw_url).strip()
        lastmod = raw_lastmod.strip()
        for crop, path in CROP_PATHS.items():
            if only_crop and crop != only_crop:
                continue
            # ">= 5" slashes means there is a path segment below
            # /products/<crop>/, i.e. an individual variety page.
            if path in url and url.rstrip("/").count("/") >= 5:
                tail = url.rstrip("/").rsplit("/", 1)[-1]
                if not tail or tail in ("corn", "soybean"):
                    continue
                out.append((url, crop, lastmod or ""))
                break
    by_crop: dict[str, int] = {}
    for _, c, _ in out:
        by_crop[c] = by_crop.get(c, 0) + 1
    log.info("variety URLs found: %s (total=%d)",
             ", ".join(f"{k}={v}" for k, v in sorted(by_crop.items())),
             len(out))
    return out
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- helpers
|
||||
|
||||
|
||||
def source_key_for(url: str) -> str:
    """Build the corpus key for a product URL.

    ``.../products/corn/e085z5`` → ``golden_harvest-e085z5``. Trailing
    slashes are ignored and the final path segment is lowercased.
    """
    trimmed = url.rstrip("/")
    # rpartition yields the whole string as the last element when no "/"
    # is present, matching rsplit("/", 1)[-1].
    _, _, last_segment = trimmed.rpartition("/")
    return "golden_harvest-" + last_segment.lower()
|
||||
|
||||
|
||||
# Trailing trait token of a product code (matched case-insensitively).
_TRAIT_SUFFIX_RE = re.compile(r"(VIP3|VIP4|VIP|E3|XF|GT)$", re.I)


def derive_traits(product_code: str) -> tuple[list[str], list[str]]:
    """Infer the trait stack from a product code's suffix.

    Returns ``(codes, descriptions)`` — two single-element lists, or two
    empty lists when the code is empty or carries no recognized suffix.
    The description is ``""`` when the token has no entry in
    ``TRAIT_SUFFIX_MAP``.
    """
    if not product_code:
        return [], []

    normalized = product_code.upper()
    suffix = _TRAIT_SUFFIX_RE.search(normalized)
    if suffix is not None:
        token = suffix.group(0).upper()
    elif re.search(r"[A-Z]\d+Z\d+$", normalized):
        # The "Z" suffix encodes Duracade-class above + below ground
        # protection on Golden Harvest's corn naming convention.
        # E085Z5 → Z is the Duracade tag.
        token = "Z"
    else:
        return [], []

    return [token], [TRAIT_SUFFIX_MAP.get(token, "")]
|
||||
|
||||
|
||||
def _table_to_items(tbl) -> list[dict]:
    """Turn a two-column (label, value) table into characteristic items.

    Only the first two cells of each ``<tr>`` are read. Rows with fewer
    than two cells, or with an empty label or value, are dropped.
    """
    collected: list[dict] = []
    for row in tbl.find_all("tr"):
        columns = row.find_all(["th", "td"])
        if len(columns) >= 2:
            name = columns[0].get_text(" ", strip=True)
            reading = columns[1].get_text(" ", strip=True)
            if name and reading:
                collected.append({"characteristic": name, "value": reading})
    return collected
|
||||
|
||||
|
||||
def _percentage_to_rating(text: str) -> int | None:
    """Convert a ``data-percentage`` string to a rating (percent / 10, truncated).

    Accepts integers, decimals ("90.0") and a trailing percent sign
    ("90%"). Returns ``None`` when the text is not a finite number.
    """
    try:
        return int(float(text.rstrip("%").strip()) / 10)
    except (TypeError, ValueError, OverflowError):
        # ValueError also covers NaN; OverflowError covers infinity.
        return None


def _bars_to_items(container) -> list[dict]:
    """Read the ``bar-row`` elements of a chart section into items.

    Each bar's ``data-percentage`` encodes the rating as percent / 10.
    A row without a ``bar-label`` is skipped. A missing bar or an
    empty / absent percentage is reported as ``"-"`` (no data). A
    percentage that cannot be parsed as a number is passed through
    unchanged so nothing is silently lost.
    """
    items: list[dict] = []
    for row in container.find_all("div", class_="bar-row"):
        label_el = row.find("div", class_="bar-label")
        if label_el is None:
            continue
        label = label_el.get_text(" ", strip=True)
        bar = row.find("div", class_="bar")
        pct = bar.get("data-percentage") if bar is not None else None
        text = "" if pct is None else str(pct).strip()
        if not text:
            items.append({"characteristic": label, "value": "-"})
            continue
        rating = _percentage_to_rating(text)
        value = str(pct) if rating is None else str(rating)
        items.append({"characteristic": label, "value": value})
    return items
|
||||
|
||||
|
||||
# Bar-chart containers read by fetch_product_detail(): each entry pairs the
# group label stored in the sidecar with the id of the <div> holding the bars.
CHART_SECTIONS = [
    # (label_for_sidecar, div_id)
    ("DISEASE RATINGS", "dvDiseaseTolerance"),
    ("AGRONOMIC CHARACTERISTICS", "dvAgronomicChar"),
]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- detail
|
||||
|
||||
|
||||
def _parse_product_name(soup) -> str:
    """Return the product code: the <h1> text, else the part of <title> after the last "|"."""
    h1 = soup.find("h1")
    name = h1.get_text(strip=True) if h1 else ""
    if name:
        return name
    title = soup.find("title")
    if title:
        txt = title.get_text(strip=True)
        if "|" in txt:
            return txt.rsplit("|", 1)[-1].strip()
    return ""


def _parse_positioning(soup, product_name: str) -> str | None:
    """Return the meta Description text with a leading "<product_name>." removed."""
    meta = soup.find("meta", attrs={"name": "Description"})
    if not (meta and meta.get("content")):
        return None
    desc = meta["content"].strip()
    if product_name:
        prefix = product_name + "."
        if desc.startswith(prefix):
            desc = desc[len(prefix):].strip()
    return desc or None


def _collect_table_groups(soup) -> list[dict]:
    """Capture every non-empty two-column table, labeled by the nearest preceding heading."""
    groups: list[dict] = []
    for tbl in soup.find_all("table"):
        items = _table_to_items(tbl)
        if not items:
            continue
        label = None
        cur = tbl
        # Walk back over at most 8 heading-like elements, stopping at
        # the first one that has any text.
        for _ in range(8):
            cur = cur.find_previous(["h2", "h3", "h4", "strong"])
            if cur is None:
                break
            text = cur.get_text(strip=True)
            if text:
                label = text
                break
        label = label or "PRODUCT DATA"
        groups.append({
            "label": label.upper(),
            "type": "table",
            "items": items,
        })
    return groups


def _collect_chart_groups(soup) -> list[dict]:
    """Collect the bar-chart sections plus the recommended-management list."""
    groups: list[dict] = []
    for label, div_id in CHART_SECTIONS:
        container = soup.find(id=div_id)
        if not container:
            continue
        items = _bars_to_items(container)
        if items:
            groups.append({
                "label": label,
                "type": "chart",
                "items": items,
            })

    # Recommended environments / management ("AgronomicMange" — typo
    # in upstream class name). Rendered as a flat list of strings.
    am = soup.find(class_="AgronomicMange")
    if am:
        recs = [t.strip() for t in am.stripped_strings if t.strip()]
        if recs:
            groups.append({
                "label": "RECOMMENDED MANAGEMENT",
                "type": "list",
                "items": [{"characteristic": x, "value": "✓"} for x in recs],
            })
    return groups


def _parse_hero_maturity(soup) -> str | None:
    """Return the text of the product-label hero block with the "RM" prefix stripped."""
    pl = soup.find(class_="product-label")
    right = pl.find(class_="right") if pl else None
    if not right:
        return None
    # The <span>RM</span> sits before the value; get_text drops
    # the span boundary, so strip the literal "RM" prefix.
    text = right.get_text(" ", strip=True)
    text = re.sub(r"^RM\s*", "", text).strip()
    return text or None


def _relative_maturity_from_groups(groups: list[dict]) -> str | None:
    """Return the first 2-3 digit value of a "relative maturity" item, if any."""
    for grp in groups:
        for it in grp.get("items") or []:
            if "relative maturity" in (it.get("characteristic") or "").lower():
                m = re.match(r"^(\d{2,3})", (it.get("value") or "").strip())
                if m:
                    return m.group(1)
    return None


def _find_techsheet_url(soup, html: str) -> str | None:
    """Return the live tech-sheet PDF URL: an <a href> first, else a raw-HTML match."""
    ts = soup.find("a", href=re.compile(r"assets\.syngentaebiz\.com/pdf/techsheets/"))
    if ts:
        return ts["href"]
    m = re.search(
        r'(https?://assets\.syngentaebiz\.com/pdf/techsheets/[^"\s<>]+\.pdf)',
        html,
    )
    return m.group(1) if m else None


def fetch_product_detail(
    http: RateLimitedSession, url: str, crop: str, lastmod: str
) -> GHProduct | None:
    """Fetch + parse one product page.

    Args:
        http: Session used for the single GET (redirects are not followed).
        url: Product page URL from the sitemap.
        crop: ``"corn"`` or ``"soybeans"``; decides how the maturity is stored.
        lastmod: Sitemap ``<lastmod>`` value, or ``""``.

    Returns:
        The populated ``GHProduct``, or ``None`` for discontinued
        varieties (302 → product-finder).

    Raises:
        requests.HTTPError: if the page returns a non-redirect error status.
    """
    r = http.get(url, allow_redirects=False)
    if r.status_code in (301, 302, 303, 307, 308):
        log.info("skip discontinued (redirect): %s → %s",
                 url, r.headers.get("Location"))
        return None
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    prod = GHProduct(
        source_key=source_key_for(url),
        source_url=url,
        crop=crop,
        sitemap_last_modified=lastmod or None,
    )

    # Product name (the code) — prefer <h1>, fall back to <title>.
    prod.product_name = _parse_product_name(soup)

    # Positioning — meta name="Description".
    prod.positioning_statement = _parse_positioning(soup, prod.product_name)

    # Traits inferred from product code.
    prod.trait_codes, prod.trait_descriptions = derive_traits(prod.product_name)

    # Chart/list groups come first, then the generic tables.
    table_groups = _collect_table_groups(soup)
    chart_groups = _collect_chart_groups(soup)
    prod.characteristics_groups = chart_groups + table_groups

    # Maturity routing per crop. The canonical place GH publishes the
    # maturity number is the product-label hero block:
    #   <div class="product-label"><div class="right"><span>RM</span>NN</div></div>
    # — same DOM shape on corn and soybean pages, just different units
    # (integer days for corn, MG decimal for soy). The maturity table
    # (corn only) is a useful fallback.
    label_rm = _parse_hero_maturity(soup)
    if label_rm:
        if prod.crop == "corn":
            m = re.match(r"^(\d{2,3})", label_rm)
            if m:
                prod.relative_maturity = m.group(1)
        elif prod.crop == "soybeans":
            m = re.match(r"^(\d+(?:\.\d+)?)", label_rm)
            if m:
                prod.maturity_group = m.group(1)

    # Corn-table fallback if the hero header was missing.
    if prod.crop == "corn" and prod.relative_maturity is None:
        prod.relative_maturity = _relative_maturity_from_groups(
            prod.characteristics_groups
        )

    # Tech-sheet PDF link.
    prod.techsheet_url = _find_techsheet_url(soup, r.text)

    return prod
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- render
|
||||
|
||||
|
||||
def _md_cell(value: Any) -> str:
    """Make text safe for a Markdown table cell.

    Line breaks end a table row and a bare ``|`` starts a new cell, so
    line breaks are folded into single spaces and pipes are
    backslash-escaped.
    """
    text = re.sub(r"\s*[\r\n]+\s*", " ", str(value))
    return text.replace("|", "\\|")


def render_markdown(p: GHProduct) -> str:
    """Render the LLM-visible Markdown body for one product.

    The body is a header bullet list (vendor, brand, crop, maturity,
    traits, source, tech sheet, rating scale), a ``---`` rule, an
    optional "Positioning" section, and one two-column table per
    non-empty characteristics group.
    """
    title = p.product_name or p.source_key
    crop_label = "Corn" if p.crop == "corn" else "Soybeans"
    maturity_lines: list[str] = []
    if p.relative_maturity and p.crop == "corn":
        maturity_lines.append(f"- **Relative maturity:** {p.relative_maturity}")
    if p.maturity_group and p.crop == "soybeans":
        maturity_lines.append(f"- **Maturity group:** {p.maturity_group}")

    trait_line = ""
    if p.trait_codes:
        codes = ", ".join(p.trait_codes)
        # Descriptions may be empty strings for unmapped codes; only add
        # the parenthetical when at least one is non-empty.
        if p.trait_descriptions and any(p.trait_descriptions):
            trait_line = f"- **Traits:** {codes} ({'; '.join(p.trait_descriptions)})"
        else:
            trait_line = f"- **Traits:** {codes}"

    head = [
        f"# {title}",
        "",
        "- **Vendor:** Syngenta",
        "- **Brand:** Golden Harvest",
        f"- **Crop:** {crop_label}",
        *maturity_lines,
    ]
    if trait_line:
        head.append(trait_line)
    head.append(f"- **Source:** {p.source_url}")
    if p.techsheet_url:
        head.append(f"- **Tech sheet (PDF):** {p.techsheet_url}")
    head.append(f"- **Rating scale (Golden Harvest):** {RATING_SCALE_DIRECTION}")
    head.append("")
    head.append("---")
    head.append("")

    sections: list[str] = []
    if p.positioning_statement:
        sections.append("## Positioning\n\n" + p.positioning_statement.strip() + "\n")

    for g in p.characteristics_groups:
        label = (g.get("label") or "Characteristics").title()
        items = g.get("items") or []
        if not items:
            continue
        rows = "\n".join(
            f"| {_md_cell(it['characteristic'])} | {_md_cell(it['value'])} |"
            for it in items
        )
        sections.append(
            f"## {label}\n\n"
            "| Characteristic | Value |\n"
            "|---|---|\n"
            f"{rows}\n"
        )
    return "\n".join(head) + "\n".join(sections)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- write
|
||||
|
||||
|
||||
def _write_text_atomic(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` via a sibling temp file and an atomic rename."""
    tmp = path.with_name(path.name + ".tmp")
    try:
        tmp.write_text(text, encoding="utf-8")
        tmp.replace(path)
    finally:
        # No-op after a successful rename; removes the leftover otherwise.
        tmp.unlink(missing_ok=True)


def write_product(prod: GHProduct, body_md: str) -> None:
    """Write the Markdown body and JSON sidecar for one product.

    Files land in ``CORPUS_DIR`` as ``<source_key>.md`` and
    ``<source_key>.json``. The sidecar is written first and the
    Markdown last: process_product() treats an existing ``.md`` as
    "already scraped", so the marker must only appear once the sidecar
    is safely on disk.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    sidecar = {
        "source": "golden_harvest",
        "source_key": prod.source_key,
        "vendor": "Syngenta",
        "brand": "Golden Harvest",
        "product_name": prod.product_name,
        "product_id": None,
        "hybrid_prefix": prod.product_name,
        "hybrid_suffix": None,
        "crop": prod.crop,
        "release_year": None,
        "relative_maturity": prod.relative_maturity,
        "maturity_group": prod.maturity_group,
        "wheat_class": None,
        "trait_stack": prod.trait_codes,
        "trait_descriptions": prod.trait_descriptions,
        "positioning_statement": prod.positioning_statement,
        "strengths": [],
        "characteristics_groups": prod.characteristics_groups,
        "_scale_direction": RATING_SCALE_DIRECTION,
        "regional_recommendations": [],
        "image_url": None,
        "techsheet_url": prod.techsheet_url,
        "source_urls": [prod.source_url],
        "sitemap_last_modified": prod.sitemap_last_modified,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    _write_text_atomic(
        json_path,
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
    )
    _write_text_atomic(md_path, body_md)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- pipeline
|
||||
|
||||
|
||||
def process_product(
    http: RateLimitedSession,
    *,
    url: str,
    crop: str,
    lastmod: str,
    force: bool,
) -> tuple[str, GHProduct | None]:
    """Scrape, render and persist one variety.

    Returns ``(status, product)`` where status is one of ``"skipped"``
    (Markdown already on disk and ``force`` is off), ``"failed"`` (the
    detail fetch raised), ``"discontinued"`` (the page redirected) or
    ``"written"``. The product is only returned for ``"written"``.
    """
    existing_md = CORPUS_DIR / (source_key_for(url) + ".md")
    if not force and existing_md.exists():
        return "skipped", None

    try:
        product = fetch_product_detail(http, url, crop, lastmod)
    except Exception as exc:  # noqa: BLE001
        # Boundary handler: one bad page must not abort the whole run.
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None

    if product is None:
        return "discontinued", None

    write_product(product, render_markdown(product))
    return "written", product
|
||||
|
||||
|
||||
def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_product: str | None,
) -> int:
    """Discover product pages and process them in sitemap order.

    Args:
        limit: Stop after processing this many candidates (skipped and
            failed ones count too); ``None`` means no limit.
        force: Re-fetch products whose Markdown already exists.
        only_crop: Restrict discovery to one crop.
        only_product: Restrict to one variety, given as a source_key or
            a URL tail; matched case-insensitively.

    Returns:
        Process exit code: 0 on success, 1 if any product failed, 2 if
        ``only_product`` matched nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    targets = discover_products(http, only_crop=only_crop)
    if only_product:
        # source_key_for() always yields a lowercased key, so compare both
        # accepted spellings (full key or bare URL tail) in lowercase.
        wanted = only_product.strip().lower()
        targets = [
            (u, c, lm) for (u, c, lm) in targets
            if source_key_for(u) == wanted
            or u.rstrip("/").rsplit("/", 1)[-1].lower() == wanted
        ]
        if not targets:
            log.error("no variety matched --product=%s", only_product)
            return 2

    counts = {"written": 0, "skipped": 0, "discontinued": 0, "failed": 0}
    processed = 0
    for url, crop, lastmod in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_product(
            http, url=url, crop=crop, lastmod=lastmod, force=force,
        )
        counts[status] = counts.get(status, 0) + 1
        if prod is not None:
            log.info(
                "[%d/%s] %s %s | crop=%s rm/mg=%s traits=%s groups=%d techsheet=%s",
                processed, str(limit) if limit else "all",
                prod.source_key, status, prod.crop,
                prod.relative_maturity or prod.maturity_group or "-",
                ",".join(prod.trait_codes) or "-",
                len(prod.characteristics_groups),
                "y" if prod.techsheet_url else "n",
            )
        else:
            log.info("[%d/%s] %s %s",
                     processed, str(limit) if limit else "all",
                     source_key_for(url), status)

    log.info(
        "done: processed=%d written=%d skipped=%d discontinued=%d failed=%d "
        "(of %d candidates)",
        processed, counts["written"], counts["skipped"],
        counts["discontinued"], counts["failed"], len(targets),
    )
    return 0 if counts["failed"] == 0 else 1
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- CLI
|
||||
|
||||
|
||||
def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for the scraper.

    Options: ``--limit``, ``--force``, ``--crop``, ``--product`` and
    ``--log-level`` (whose default comes from the ``LOG_LEVEL``
    environment variable, falling back to ``INFO``).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.golden_harvest",
        description="Scrape Golden Harvest (Syngenta) corn + soybean varieties.",
    )
    # Declared as data and registered in order, so --help lists the
    # options in this sequence.
    options: tuple[tuple[str, dict], ...] = (
        ("--limit", {
            "type": int,
            "default": None,
            "help": "Stop after processing N varieties (default: all).",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists.",
        }),
        ("--crop", {
            "default": None,
            "choices": ("corn", "soybeans"),
            "help": "Limit to one crop.",
        }),
        ("--product", {
            "default": None,
            "help": "Process a single variety by source_key or URL tail.",
        }),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO"),
        }),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    return parser
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse arguments, configure logging, run the scrape.

    Args:
        argv: Argument list without the program name; ``None`` makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The exit code produced by ``run()``.
    """
    # The former "not implemented yet" stub (print + ``return 2``) sat
    # here and made everything below unreachable; it has been removed.
    args = _build_argparser().parse_args(argv)
    logging.basicConfig(
        level=args.log_level.upper(),
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        stream=sys.stderr,
    )
    return run(
        limit=args.limit,
        force=args.force,
        only_crop=args.crop,
        only_product=args.product,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # main() falls back to sys.argv[1:] when called without arguments, so
    # a single call is enough; the second sys.exit() was unreachable.
    sys.exit(main())
|
||||
|
||||
Reference in New Issue
Block a user