"""Ohio Corn & Soybean Performance Test (OCPT/OSPT) — independent, cross-vendor yield trials (data_type=trial). Source: ``ohiocroptest.cfaes.osu.edu`` — The Ohio State University / CFAES extension publishes the annual Ohio Corn Performance Test and Ohio Soybean Performance Trials as full-report PDFs with a real text layer. These are *third-party* university trials: every brand that pays the entry fee is evaluated head-to-head at the SAME sites, so a single report ranks CHANNEL, DEKALB, NK, Golden Harvest, LG Seeds, Augusta, Ebberts, Seed Consultants, etc. against each other — the highest-value class of trial data because no vendor controls it. This is the FOURTH ``data_type: "trial"`` source family after the vendor plot reports (gh / lg / agrigold / proharvest). Unlike those — which are ONE plot per report — the OCPT report is ONE PDF carrying a dozen multi-site tables, each table laying out several SITES as side-by-side column groups (Hebron | Washington CH | South Charleston | Covington | Summary). We split each report into ONE sidecar per SITE (and one per regional Summary), so the corpus's per-site shape matches the vendor plot reports and the trial chunker's shared ``_render_gh_plot_chunk`` renderer handles it with NO chunk.py edit (we emit ``results: [{rank, brand, product, traits, metrics}]`` with a canonical ``"Yield"`` metric key). PDF layout (corn, e.g. CountryJournal2025.pdf): Table 1E/1L/4E/4L/7E/7L : single-year per-site tables. A site-name line names the column groups; each hybrid row carries 5 numbers (Yield, Harv.Mst, Stk.Ldg, Final Std, Emergence) PER site, then a single trailing TW (test weight, summary-level). These are the gold per-site data — we emit one sidecar per real site (+ Summary). Table 2/3/5/6/8/9 : multi-year / combined regional summaries — same column-group structure (year/region groups). We emit these as region-level summary sidecars (the Summary group of the matching single-year table already covers the site mean, so these add the 2-yr / 3-yr / statewide aggregates). Table 10 : hybrid -> technology-traits lookup (RR,CB,TRE,...). Parsed into a traits map and joined onto every result's ``traits`` field. PDF layout (soybeans, e.g. 2025OCJwithproteinandoil.pdf): a different column order — ``Variety | Brand | Type | Seed Treatment | RM | | Mean``. Region (North/Central/South) x maturity (Early/Late) tables, with site codes (N1/N2, C1/C2, S1/S2) defined in a Site-Descriptions table (Table 1). We emit one sidecar per region x maturity, keyed by the region (the soy report does not break per-site yields into separate ranked tables the way corn does — the per-site columns are within the regional table, so a region sidecar carries the full ranked field with each site's yield in the metrics). SANITY GATE: every parsed row must have a real (non-numeric, len>1) brand, a product code, and a plausible Yield (1 < y < 400). Rows that fail (stat rows High/Average/Low/LSD, wrapped lines, footnotes) are dropped + counted. If a whole table won't parse into >=2 good rows it is SKIPPED + logged + counted — never emitted with mis-assigned rows. Scope: 2025 (latest) + 2024 baseline. ``--include-old`` pulls the u.osu.edu archive years. Output: corpus/ohio_ocpt_trials/.md corpus/ohio_ocpt_trials/.json source_key: ``ocpt---`` e.g. ``ocpt-corn-2025-sw-hebron``, ``ocpt-soybeans-2025-north-early``. CLI: python -m scrape.sources.ohio_ocpt_trials --year 2025 --crop corn python -m scrape.sources.ohio_ocpt_trials --force python -m scrape.sources.ohio_ocpt_trials --include-old """ from __future__ import annotations import argparse import io import json import logging import os import random import re import sys import time from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any import pdfplumber import requests SCRAPER_VERSION = "0.1.0" USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://ohiocroptest.cfaes.osu.edu" ARCHIVE = "https://u.osu.edu/perf/archive/" # The publisher labels (per the trial sidecar contract). VENDOR = "The Ohio State University" BRAND_AGGREGATOR = "Ohio Corn/Soybean Performance Test publishes" PUBLISHER_BRAND = "Ohio Crop Performance Test" TOS_NOTE = ("© OSU on the report; explicit no-endorsement clause; public " "CFAES extension publication; attribute Ohio Corn/Soybean " "Performance Test, OSU CFAES.") BASELINE_YEARS = [2024, 2025] OLD_YEARS = [2018, 2019, 2020, 2021, 2022, 2023] REQ_INTERVAL_SEC = 2.0 # polite, low rate against a university host REPO_ROOT = Path(__file__).resolve().parents[2] CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "ohio_ocpt_trials" log = logging.getLogger("scrape.ohio_ocpt_trials") # --------------------------------------------------------------------- HTTP class RateLimitedSession: def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: self.s = requests.Session() self.s.headers["User-Agent"] = USER_AGENT self.interval = interval self._last = 0.0 def _wait(self) -> None: delta = time.monotonic() - self._last if delta < self.interval: time.sleep(self.interval - delta) self._last = time.monotonic() def request(self, method: str, url: str, *, max_retries: int = 4, timeout: float = 60.0, **kw: Any) -> requests.Response: last_exc: Exception | None = None for attempt in range(max_retries): self._wait() try: resp = self.s.request(method, url, timeout=timeout, **kw) except requests.RequestException as exc: last_exc = exc backoff = min(30.0, (2 ** attempt) + random.random()) log.warning("network error on %s %s: %s — retry in %.1fs", method, url, exc, backoff) time.sleep(backoff) continue if resp.status_code == 429 or 500 <= resp.status_code < 600: ra = resp.headers.get("Retry-After") backoff = float(ra) if (ra and ra.isdigit()) else min( 30.0, (2 ** attempt) + random.random()) log.warning("HTTP %d on %s %s — retry in %.1fs", resp.status_code, method, url, backoff) time.sleep(backoff) continue return resp if last_exc: raise last_exc return resp # type: ignore[return-value] def get(self, url: str, **kw: Any) -> requests.Response: return self.request("GET", url, **kw) # --------------------------------------------------------------------- model @dataclass class SiteTrial: """One per-site (or per-region-summary) ranked trial, ready to write.""" source_key: str crop: str # "corn" | "soybeans" year: int region: str # SW / NW / NE region or county site: str | None # site town / county / "Summary" pdf_url: str table_label: str = "" # e.g. "Table 1E (early maturity)" # per-site agronomic footnote metadata (corn single-year tables) soil_type: str | None = None previous_crop: str | None = None planting_date: str | None = None harvest_date: str | None = None tillage: str | None = None fungicide: str | None = None cooperator: str | None = None county: str | None = None results: list[dict] = field(default_factory=list) # --------------------------------------------------------------------- discovery _PDF_HREF_RE = re.compile(r'href="([^"]+\.pdf)"', re.I) def _discover_pdf_hrefs(http: RateLimitedSession, index_url: str) -> list[str]: """Fetch an index page and return absolute hrefs to every .pdf linked.""" from urllib.parse import urljoin try: r = http.get(index_url) r.raise_for_status() except requests.RequestException as exc: log.warning("index fetch failed %s: %s", index_url, exc) return [] out: list[str] = [] seen: set[str] = set() for href in _PDF_HREF_RE.findall(r.text): full = urljoin(index_url, href) if full not in seen: seen.add(full) out.append(full) return out # The full-report PDF (corn) is "CountryJournal" / (soy) the # "OCJwithproteinandoil" file. Weather / seed-quality / pollinator / # seed-composition PDFs carry no head-to-head yield table → ignore them. def _pick_report_pdf(hrefs: list[str], crop: str, year: int) -> str | None: yr = str(year) if crop == "corn": for h in hrefs: base = h.rsplit("/", 1)[-1].lower() if "countryjournal" in base and yr in base: return h # fallback: any non-weather pdf with the year for h in hrefs: base = h.rsplit("/", 1)[-1].lower() if yr in base and "weather" not in base: return h else: # soybeans for h in hrefs: base = h.rsplit("/", 1)[-1].lower() if "ocj" in base and "protein" in base and yr in base: return h for h in hrefs: base = h.rsplit("/", 1)[-1].lower() if yr in base and "seed" not in base and "pollinator" not in base: return h return None def discover_report_pdf(http: RateLimitedSession, crop: str, year: int) -> str | None: """Find the full-report PDF for a (crop, year) by walking the live index pages and extracting PDF hrefs (no hardcoded filename).""" if crop == "corn": # The corn index is /corntrials/ ; older years via ?year=NNNN. indexes = [f"{BASE}/corntrials/default.asp?year={year}", f"{BASE}/corntrials/"] else: # Soy index lives at /soyNNNN/ (year in path). indexes = [f"{BASE}/soy{year}/", f"{BASE}/soy{year}"] for idx in indexes: hrefs = _discover_pdf_hrefs(http, idx) pdf = _pick_report_pdf(hrefs, crop, year) if pdf: log.info("%s %d report PDF: %s", crop, year, pdf) return pdf log.warning("no report PDF discovered for %s %d", crop, year) return None # --------------------------------------------------------------------- helpers _NUM_TOKEN = re.compile(r"^-?\d+(?:\.\d+)?$") # A yield-significance marker the soy tables append to per-site yields. _SIG_RE = re.compile(r"[*]+$") def _to_num(s: str) -> float | int | None: s = (s or "").strip() s = _SIG_RE.sub("", s) if not s or not _NUM_TOKEN.match(s): return None f = float(s) return int(f) if f.is_integer() else f def _slug(s: str) -> str: return re.sub(r"[^a-z0-9]+", "-", (s or "").lower()).strip("-") # Lines that begin a non-data (stat / footnote) row — never a hybrid row. _STAT_PREFIXES = ( "high", "average", "low", "lsd", "mean", "soil type", "soil test", "previous crop", "planting", "harvest", "tillage", "fertilizer", "fungicide", "cooperator", "county", "entry", "brand", "variety", "bu/a", "lbs.", "harv.", "stk.", "final", "table", ) # --------------------------------------------------------------------- CORN parse # # Corn region codes by region phrase in the table caption. _CORN_REGION = ( ("SOUTHWESTERN", "SW"), ("NORTHWESTERN", "NW"), ("NORTH CENTRAL", "NE"), ("NORTHEASTERN", "NE"), ) # Known corn seed brands (ALL-CAPS publisher labels). Longest match first # so multi-word brands aren't split. Built from the distinct brands seen # across the SW/NW/NE regions; matched case-insensitively at line start. CORN_BRANDS = [ "1ST CHOICE SEEDS", "AGRIGOLD HYBRIDS", "AGRIGOLD", "AUGUSTA SEED", "AXIS SEED", "BA GENETICS", "CHANNEL", "DEKALB", "DYNA-GRO", "EBBERTS", "FS INVISION", "GOLDEN HARVEST", "GREAT HEART SEED", "GRO-MOR", "LG SEEDS", "NK", "PC SEED", "SEED CONSULTANTS", "SEED GENETICS DIRECT", "SEEDWAY", "SHUR GROW", "VIKING / BLUE RIVER", "VIKING/BLUE RIVER", "PIONEER", "BREVANT", "STINE", "BECK'S", "BECKS", "WYFFELS", "CROPLAN", "MASTERS CHOICE", "HOEGEMEYER", "LOCAL SEED", "MYCOGEN", ] _CORN_BRANDS_SORTED = sorted(CORN_BRANDS, key=len, reverse=True) def _corn_region(caption: str) -> str: up = caption.upper() for phrase, code in _CORN_REGION: if phrase in up: return code return "OH" def _corn_split_brand_product(text: str) -> tuple[str, str] | None: """Split a row's leading text (brand + hybrid) into (brand, hybrid). Brands are the ALL-CAPS publisher labels in CORN_BRANDS; match the longest known phrase at the start, the rest is the hybrid code. """ up = text.upper() for b in _CORN_BRANDS_SORTED: if up.startswith(b + " ") or up == b: brand = text[:len(b)].strip() product = text[len(b):].strip() if product: return brand, product return None # brand with no hybrid → not a data row return None def _corn_site_groups(site_line: str) -> list[str]: """Parse the site-name line into ordered column-group labels. The names are multi-word and space-separated on one line, e.g. 'Hebron Washington Court House South Charleston Covington Summary'. We can't split purely on whitespace, so we match against the known OCPT site vocabulary (longest first) plus 'Summary'. """ # Known multi-word OCPT site/group names (longest-match-first). Single- # word sites (Hebron, Greenville, Covington, Bucyrus…) need no vocabulary # entry — anything not matched as a multi-word phrase is its own group. known_multi = [ "Washington Court House", "South Charleston", "Upper Sandusky", "Western Ohio", "Statewide All Regions", "Van Wert", ] s = re.sub(r"\s+", " ", site_line).strip() groups: list[str] = [] tokens = s.split(" ") i = 0 while i < len(tokens): # Fold a parenthetical annotation like "(7 Sites)" into the prev group. if groups and (tokens[i].startswith("(") or tokens[i].endswith(")")): groups[-1] = (groups[-1] + " " + tokens[i]).strip() i += 1 continue matched = None for name in sorted(known_multi, key=lambda n: -len(n.split())): nlen = len(name.split()) if " ".join(tokens[i:i + nlen]).lower() == name.lower(): matched = name break if matched: groups.append(matched) i += len(matched.split()) else: # Single-token site name (Hebron / Greenville / Summary / …). groups.append(tokens[i]) i += 1 return groups # Metric labels per corn site group, in column order. _CORN_METRICS = ["Yield", "Harv. Moisture", "Stalk Lodging", "Final Stand", "Emergence"] def _corn_parse_table(lines: list[str], header_idx: int, caption: str, site_line: str) -> tuple[list[str], list[dict]] | None: """Parse one corn table body into ``(group_labels, rows)`` where each row is ``{brand, product, by_group: {label: {metric: val}}, tw}``.""" groups = _corn_site_groups(site_line) if not groups: return None # The header carries one 'Yield' token per column group — the # authoritative group count. If the parsed site-name count disagrees # (an unrecognized layout), SKIP the table rather than mis-assign # numerics to the wrong sites. n_header = lines[header_idx].lower().count("yield") if n_header and n_header != len(groups): log.warning("site-count mismatch (sites=%d, header Yield=%d) — " "skipping table: %s", len(groups), n_header, caption[:60]) return None n = len(groups) rows: list[dict] = [] for line in lines[header_idx + 1:]: stripped = line.strip() if not stripped: continue low = stripped.lower() if low.startswith(_STAT_PREFIXES): # A stat/footnote row, or the next table's header — stop if we # hit a new caption, else just skip the stat row. if low.startswith("table"): break continue # Identify trailing numeric run. toks = stripped.split() j = len(toks) while j > 0 and _NUM_TOKEN.match(toks[j - 1]): j -= 1 nums = [_to_num(t) for t in toks[j:]] lead = " ".join(toks[:j]) bp = _corn_split_brand_product(lead) if bp is None: continue brand, product = bp # Expected numeric count: 5 per group, + optional trailing TW. per = 5 * n tw = None if len(nums) == per + 1: tw = nums[-1] nums = nums[:per] elif len(nums) == per: pass else: # Wrapped / short / malformed numeric run → skip (sanity gate). continue by_group: dict[str, dict] = {} for gi, gname in enumerate(groups): chunk = nums[gi * 5:(gi + 1) * 5] if len(chunk) < 5: continue m: dict[str, Any] = {} for label, v in zip(_CORN_METRICS, chunk): if v is not None: m[label] = v by_group[gname] = m rows.append({"brand": brand, "product": product, "by_group": by_group, "tw": tw}) return groups, rows # Per-site footnote fields we lift from the single-year table footers. _FOOTNOTE_FIELDS = { "Soil Type": "soil_type", "Previous Crop": "previous_crop", "Tillage": "tillage", "Cooperator": "cooperator", "County": "county", } def build_corn_footnotes(pages: list) -> dict[str, dict[str, str]]: """Word-position footnote extractor. pdfplumber's ``extract_text`` collapses the column whitespace, so a footnote line like ``Cooperator Parrish Farms Sollars Farm ...`` can't be split back into per-site values from text alone. Instead we read WORDS with x-coordinates: the site-name header words give each column's x-anchor, and we bucket every footnote word under the nearest column to the LEFT-or-at its x. Returns ``{site_name: {field: value}}`` so values are never mis-assigned. """ out: dict[str, dict[str, str]] = {} for page in pages: words = page.extract_words(use_text_flow=False) # Group words into lines by their 'top' coordinate. lines: dict[float, list[dict]] = {} for w in words: key = round(w["top"] / 2.0) * 2.0 # ~2px bucket lines.setdefault(key, []).append(w) ordered = [sorted(ws, key=lambda w: w["x0"]) for _, ws in sorted(lines.items())] # Find site-header lines: a row whose words are all known site names. for li, lw in enumerate(ordered): joined = " ".join(w["text"] for w in lw) site_groups = _corn_site_groups(joined) # require at least 2 real sites + the words to BE those sites real = [g for g in site_groups if not g.lower().startswith("summary")] if len(real) < 2: continue # Build column x-anchors: first word x0 of each site phrase. anchors = _site_anchor_xs(lw, site_groups) if not anchors: continue # Scan the following lines for footnote labels until next site hdr. # The footnote block sits below 50-100+ hybrid data rows, so the # window must be generous (it self-terminates at the next site # header anyway). for fl in ordered[li + 1:li + 130]: ftxt = " ".join(w["text"] for w in fl) if _corn_site_groups(ftxt) and len( [g for g in _corn_site_groups(ftxt) if not g.lower().startswith("summary")]) >= 2: break # next table's site header label, key = _match_footnote_label(fl) if not label: continue # Words after the label, bucketed by nearest anchor. lbl_words = label.split() value_words = fl[len(lbl_words):] buckets = _bucket_by_anchor(value_words, anchors) for site_name, val in buckets.items(): if val: out.setdefault(site_name, {})[key] = val return out def _site_anchor_xs(header_words: list[dict], groups: list[str]) -> list[tuple[float, str]]: """Return ``[(x0, site_name), ...]`` for the real (non-Summary) sites, using the first word of each multi-word site phrase as its x-anchor.""" anchors: list[tuple[float, str]] = [] i = 0 for g in groups: glen = len(g.split()) if i < len(header_words): x0 = header_words[i]["x0"] if not g.lower().startswith("summary"): anchors.append((x0, g)) i += glen return anchors def _match_footnote_label(line_words: list[dict]) -> tuple[str | None, str | None]: """If a footnote line starts with a known label, return (label, key).""" txt = " ".join(w["text"] for w in line_words) for label, key in _FOOTNOTE_FIELDS.items(): if txt.lower().startswith(label.lower()): return label, key return None, None def _bucket_by_anchor(value_words: list[dict], anchors: list[tuple[float, str]]) -> dict[str, str]: """Assign each value word to the site whose x-anchor is nearest at or to the left of the word's x0 (footnote values sit roughly under their column header). Joins the words per site preserving order.""" if not anchors: return {} xs = [a[0] for a in anchors] out: dict[str, list[str]] = {a[1]: [] for a in anchors} for w in value_words: x = w["x0"] # nearest anchor by absolute distance best_i = min(range(len(xs)), key=lambda i: abs(xs[i] - x)) out[anchors[best_i][1]].append(w["text"]) return {site: " ".join(ws).strip(" ,") for site, ws in out.items()} # --------------------------------------------------------------------- CORN build def build_corn_sites(text: str, year: int, pdf_url: str, footnotes_by_site: dict[str, dict[str, str]] | None = None ) -> list[SiteTrial]: """Parse the whole corn report text into per-site SiteTrial objects. We process the SINGLE-YEAR per-site tables (1E/1L/4E/4L/7E/7L) as the gold per-site data, and the multi-year/combined tables (2/3/5/6/8/9) as region-level summary sidecars. Within a region+year, the Early and Full-season tables for the same SITE are merged into one site sidecar. ``footnotes_by_site`` (from ``build_corn_footnotes``) supplies per-site soil/cooperator/county metadata extracted by word x-position. """ footnotes_by_site = footnotes_by_site or {} lines = text.splitlines() # First parse Table 10 -> hybrid->traits. traits_map = _corn_traits_map(lines) # Index every (caption_idx, site_line_idx, header_idx) triple. blocks: list[tuple[int, str, str, int]] = [] i = 0 while i < len(lines): m = re.match(r"^\s*(TABLE|Table)\s+(\d+)([EL]?)\.?[:\s]", lines[i]) if m: tbl_no = int(m.group(2)) tbl_sfx = m.group(3) caption = lines[i].strip() # Find the next 'Brand ... Hybrid ...' header within a few lines. header_idx = None site_line = "" for k in range(i + 1, min(i + 8, len(lines))): lk = lines[k].strip() if lk.lower().startswith("brand") and "hybrid" in lk.lower(): header_idx = k break # The site-name line is the first non-empty, non-unit line # after the caption that isn't the 'Harv. Stk. Final' line. if lk and not site_line and not lk.lower().startswith( ("harv.", "stk.", "final", "bu/a")): site_line = lk if header_idx is not None: blocks.append((tbl_no, tbl_sfx, caption, header_idx)) # also stash the site_line on a parallel structure blocks[-1] = (tbl_no, tbl_sfx, caption, header_idx) # remember site line via attribute on a dict below _BLOCK_SITELINE[header_idx] = site_line i = header_idx if header_idx else i + 1 i += 1 # Aggregate per (region, table-kind, site). site_acc: dict[str, SiteTrial] = {} for tbl_no, tbl_sfx, caption, header_idx in blocks: site_line = _BLOCK_SITELINE.get(header_idx, "") parsed = _corn_parse_table(lines, header_idx, caption, site_line) if not parsed: continue groups, rows = parsed if not rows: continue region = _corn_region(caption) single_year = tbl_sfx in ("E", "L") # 1E/1L/4E/4L/7E/7L maturity = {"E": "early", "L": "full-season"}.get(tbl_sfx, "") for gi, gname in enumerate(groups): is_summary = gname.lower().startswith("summary") # Build results list for this group. results: list[dict] = [] for r in rows: m = dict(r["by_group"].get(gname, {})) if not m or "Yield" not in m: continue # Test weight (TW) is a summary-level number → attach to the # Summary group only. if is_summary and r.get("tw") is not None: m["Test Wt."] = r["tw"] results.append({ "brand": r["brand"], "product": r["product"], "traits": traits_map.get(_norm_hybrid(r["product"])), "metrics": m, }) # Sanity-gate + rank. results = _finalize_results(results) if len(results) < 2: continue if single_year: if is_summary: site = "Summary" key = f"ocpt-corn-{year}-{region.lower()}-summary" else: site = gname key = f"ocpt-corn-{year}-{region.lower()}-{_slug(gname)}" else: # multi-year / combined → region (or statewide) summary site = gname tag = {2: "2yr", 3: "3yr", 5: "2yr", 6: "3yr", 8: "2yr", 9: "combined"}.get(tbl_no, f"t{tbl_no}") key = (f"ocpt-corn-{year}-{region.lower()}-{tag}-{_slug(gname)}" if not is_summary else f"ocpt-corn-{year}-{region.lower()}-{tag}-summary") st = site_acc.get(key) if st is None: st = SiteTrial( source_key=key, crop="corn", year=year, region=region, site=site, pdf_url=pdf_url, table_label=caption) # attach footnote agronomic metadata for real sites # (word-position keyed by site name → never mis-assigned) if single_year and not is_summary: fn = footnotes_by_site.get(gname, {}) st.soil_type = fn.get("soil_type") or None st.previous_crop = fn.get("previous_crop") or None st.tillage = fn.get("tillage") or None st.cooperator = fn.get("cooperator") or None st.county = fn.get("county") or None site_acc[key] = st # Merge: append new hybrids (early + full season tables of the # same site land in the same sidecar). Dedup by (brand, product). seen = {(r["brand"], r["product"]) for r in st.results} for r in results: if (r["brand"], r["product"]) not in seen: st.results.append(r) seen.add((r["brand"], r["product"])) if maturity and single_year: st.table_label = caption # last caption wins (informational) # Re-rank each merged sidecar by Yield desc. out = list(site_acc.values()) for st in out: st.results = _finalize_results(st.results) return [st for st in out if len(st.results) >= 2] _BLOCK_SITELINE: dict[int, str] = {} def _norm_hybrid(h: str) -> str: return re.sub(r"\s+", " ", (h or "").strip()).upper() def _corn_traits_map(lines: list[str]) -> dict[str, str]: """Parse Table 10 (Seed source / Hybrid No. / Technology Traits) into a ``hybrid -> traits`` map. The table has a 'Hybrid No.' header and a 'Technology Traits' column; rows are messy (brand wraps across lines), so we anchor on the trait token vocabulary.""" out: dict[str, str] = {} trait_vocab = re.compile( r"\b(RR|GT|CB|RW|LL|TRE|WBC|CEW|BCW|VIP|VT2P?|SmartStax|Enlist|" r"NON-GMO|Conv|STS|PWE)\b", re.I) in_t10 = False for line in lines: if re.match(r"^\s*TABLE\s+10\.", line): in_t10 = True continue if in_t10 and re.match(r"^\s*TABLE\s+11", line): break if not in_t10: continue s = line.strip() if not s or not trait_vocab.search(s): continue # The hybrid number precedes the table-no column and the traits. # Pattern: ... m = re.search(r"((?:[A-Z]{1,3}\s*)?[A-Z0-9][A-Z0-9\- ]*?)\s+" r"(?:\d+[EL]?(?:,\s*)?)+\s+([A-Za-z0-9,\- ]*" + r"(?:RR|GT|CB|RW|LL|TRE|WBC|CEW|BCW|VIP|VT2|" r"Enlist|NON-GMO|Conv|STS|PWE)[A-Za-z0-9,\- ]*)", s) if not m: continue hybrid = _norm_hybrid(m.group(1)) traits = re.sub(r"\s+", " ", m.group(2).strip()).rstrip(",") traits = _trim_trait_codes(traits) if hybrid and traits and len(hybrid) <= 30: out.setdefault(hybrid, traits) return out # Technology-trait code vocabulary (Table 10 'Technology Traits' column). # Everything after this comma-list is the Fungicide / Seed-Treatment column, # which we drop so ``traits`` carries ONLY the genetic trait stack. _TRAIT_CODES = { "RR", "GT", "CB", "RW", "LL", "TRE", "WBC", "CEW", "BCW", "VIP", "VT2", "VT2P", "VT2PRO", "DG", "DT", "ENLIST", "NON-GMO", "CONV", "STS", "PWE", "SMARTSTAX", "RW2", "SS", } def _trim_trait_codes(traits: str) -> str: """Keep only the leading comma-separated technology-trait codes. The Table 10 trait column reads e.g. ``RR,CB,LL, Enlist Lumiscend PRO`` where ``Lumiscend PRO`` is the Fungicide/Seed-Treatment column bleeding in. The trait codes are comma-joined; the first SPACE-separated token that isn't itself a trait code marks the end of the trait stack.""" # Normalize: split on commas first to get the code tokens, but the codes # themselves are space-or-comma separated. Walk tokens; stop at the first # token that (a) follows a space (not a comma) and (b) is not a code. kept: list[str] = [] # Tokenize keeping comma adjacency info. parts = re.findall(r"[A-Za-z0-9\-]+|,", traits) prev_was_comma = True # treat start as if a comma preceded for tok in parts: if tok == ",": kept.append(",") prev_was_comma = True continue if tok.upper() in _TRAIT_CODES: kept.append(tok) prev_was_comma = False continue # A non-code token: only allowed if it directly follows a comma # (e.g. ", Enlist" already handled above; this guards odd spacing). if prev_was_comma and tok.upper() in _TRAIT_CODES: kept.append(tok) prev_was_comma = False continue break # reached the Fungicide column → stop s = "".join( (t if t == "," else (" " + t if i and kept[i - 1] != "," else t)) for i, t in enumerate(kept)) return s.strip().strip(",").strip() # --------------------------------------------------------------------- SOY parse # # Soy region by caption phrase. _SOY_REGION = (("NORTH", "North"), ("CENTRAL", "Central"), ("SOUTH", "South")) # Soy seed brands — Brand is the SECOND column. Multi-word, longest first. # Includes year-to-year string variants (the report is inconsistent: # "Axis" vs "Axis Seed", "Dyna-Gro" vs "Dyna-Gro Seed"). SOY_BRANDS = [ "Seed Consultants, Inc.", "Ebberts Field Seeds", "Seed Genetics Direct", "Great Heart Seed Co.", "Viking|Blue River", "GROWMARK, INC.", "Albert Lea Seed", "Dyna-Gro Seed", "Shur Grow", "Dyna-Gro", "Gro Mor", "Seedway", "Xitavo", "Asgrow", "DonMario", "Golden Harvest", "Axis Seed", "Axis", "NK Seeds", "Confluence Genetics", "FS HiSOY", "Benson Hill", "Bayer", "Beck's", "Stine", "Pioneer", "Brevant", "Channel", "LG Seeds", "Hoegemeyer", ] _SOY_BRANDS_SORTED = sorted(SOY_BRANDS, key=len, reverse=True) # Soy "Type" column values that mark the boundary between the # Variety+Brand columns and the Seed-Treatment column. The Type column is # strictly one of these herbicide-trait classes (EN=Enlist, CV=conventional, # XF=XtendFlex, STS=sulfonylurea-tolerant) and may be a comma-compound # ("EN, STS"). NOT E3/RR2X/LL — those appear INSIDE variety names # ("E3190 E3") and would split the row in the wrong place. A trailing # comma signals a compound type whose continuation we consume. _SOY_TYPE_TOKENS = {"EN", "CV", "XF", "STS"} def _soy_type_at(toks: list[str], i: int) -> int | None: """If a Type column begins at index i, return the index AFTER it (so the Seed-Treatment column starts there). Handles bare 'EN' and the comma-compound 'EN, STS' (where toks[i]=='EN,' then 'STS').""" t = toks[i] bare = t.rstrip(",") if bare not in _SOY_TYPE_TOKENS: return None end = i + 1 # consume continuation tokens of a comma-compound type while t.endswith(",") and end < len(toks) and \ toks[end].rstrip(",") in _SOY_TYPE_TOKENS: t = toks[end] end += 1 return end def _soy_region(caption: str) -> tuple[str, str]: up = caption.upper() region = "OH" for phrase, code in _SOY_REGION: if phrase in up: region = code break maturity = "late" if "LATE" in up else ("early" if "EARLY" in up else "") return region, maturity def _soy_site_codes(text: str) -> dict[str, str]: """Parse Table 1 (Site Descriptions) -> {code: 'County Co.'}. The header line lists codes (N1 N2 C1 C2 S1 S2) and the next line lists the matching counties.""" lines = text.splitlines() for i, line in enumerate(lines): codes = re.findall(r"\b([NCS]\d)\b", line) if len(codes) >= 4 and i + 1 < len(lines): # county line follows county_line = lines[i + 1].strip() counties = re.split(r"\s{2,}", county_line) counties = [c.strip() for c in counties if c.strip()] if len(counties) >= len(codes): return dict(zip(codes, counties[:len(codes)])) # fall back: single-space split cs = county_line.split() # rebuild "X Co." pairs merged: list[str] = [] j = 0 while j < len(cs): if j + 1 < len(cs) and cs[j + 1].lower().startswith("co"): merged.append(f"{cs[j]} {cs[j+1]}") j += 2 else: merged.append(cs[j]) j += 1 if len(merged) >= len(codes): return dict(zip(codes, merged[:len(codes)])) return {} def _soy_parse_table(lines: list[str], header_idx: int, site_codes_in_header: list[str]) -> list[dict]: """Parse a soy region table body. Header columns: Variety | Brand | Type | Seed Treatment | RM | | Mean[s] Returns rows of ``{brand, product, traits, metrics}`` (metrics keyed by site code yield + '25 Mean' + optional 2-yr mean).""" rows: list[dict] = [] for line in lines[header_idx + 1:]: s = line.strip() if not s: continue low = s.lower() if low.startswith(("table", "average", "lsd", "mean", "cv", "high", "low", "variety", "entry")): if low.startswith("table"): break continue toks = s.split() # Trailing numeric run = RM + per-site yields + means. j = len(toks) while j > 0 and _NUM_TOKEN.match(_SIG_RE.sub("", toks[j - 1])): j -= 1 nums = toks[j:] if len(nums) < 3: # need RM + >=1 site + mean continue lead = toks[:j] if len(lead) < 3: continue # Find the Type column to split Variety+Brand | SeedTreatment. # Skip the variety (token 0) and require at least variety+brand # before the Type column (type_pos >= 2). type_pos = None type_end = None for ti in range(2, len(lead)): end = _soy_type_at(lead, ti) if end is not None: type_pos, type_end = ti, end break if type_pos is None: continue soy_type = " ".join(lead[type_pos:type_end]).rstrip(",") vb = " ".join(lead[:type_pos]) # variety + brand seed_treatment = " ".join(lead[type_end:]) # Split Variety | Brand: brand is a known phrase at the END of vb. brand, variety = _soy_split_variety_brand(vb) if not brand or not variety: continue # Numeric run: nums[0] is RM (a 2.x-4.x decimal); rest are yields. rm_val = _to_num(nums[0]) yields = [_to_num(x) for x in nums[1:]] metrics: dict[str, Any] = {} if rm_val is not None: metrics["RM"] = rm_val # Map per-site yields by header code; the last 1-2 numbers are # the regional mean(s). primary "Yield" = '25 mean (the regional # mean for the harvest year). site_yields = yields[:len(site_codes_in_header)] for code, yv in zip(site_codes_in_header, site_yields): if yv is not None: metrics[f"Yield {code}"] = yv # Means: whatever's left after the site columns. rest = yields[len(site_codes_in_header):] if rest and rest[0] is not None: metrics["Yield"] = rest[0] # '25 regional mean — primary if len(rest) >= 2 and rest[1] is not None: metrics["Yield 2yr Mean"] = rest[1] if "Yield" not in metrics: # No regional mean column → use the best available site yield. site_vals = [v for v in site_yields if v is not None] if site_vals: metrics["Yield"] = max(site_vals) rows.append({ "brand": brand, "product": variety, "traits": soy_type if soy_type not in ("",) else None, "_seed_treatment": seed_treatment or None, "metrics": metrics, }) return rows def _soy_split_variety_brand(vb: str) -> tuple[str | None, str | None]: """``'30B4 Viking|Blue River'`` -> (brand='Viking|Blue River', variety='30B4'). Brand is a known SOY_BRANDS phrase; it sits AFTER the variety token(s).""" low = vb.lower() for b in _SOY_BRANDS_SORTED: bl = b.lower() if low.endswith(" " + bl) or low == bl: variety = vb[:len(vb) - len(b)].strip() if variety: return b, variety return b, None # Fallback: first token = variety, rest = brand (best effort). parts = vb.split() if len(parts) >= 2: return " ".join(parts[1:]), parts[0] return None, None def build_soy_regions(text: str, year: int, pdf_url: str) -> list[SiteTrial]: """Parse the soy report text into per-region x maturity SiteTrials.""" lines = text.splitlines() site_codes = _soy_site_codes(text) # {N1: 'Henry Co.', ...} out: dict[str, SiteTrial] = {} i = 0 while i < len(lines): m = re.match(r"^\s*(TABLE|Table)\s+(\d+)\s*[:\.]", lines[i]) if not m: i += 1 continue tbl_no = int(m.group(2)) caption = lines[i].strip() # Only the regional yield tables (3-8) carry Variety/Brand rows. if tbl_no < 3 or tbl_no > 8: i += 1 continue # Find the column header and the per-site codes it lists. header_idx = None for k in range(i + 1, min(i + 6, len(lines))): lk = lines[k].strip() if lk.lower().startswith("variety") and "brand" in lk.lower(): header_idx = k break if header_idx is None: i += 1 continue header = lines[header_idx] codes_in_header = re.findall(r"\b([NCS]\d)\b", header) rows = _soy_parse_table(lines, header_idx, codes_in_header) rows = _finalize_results(rows) if len(rows) < 2: i = header_idx + 1 continue region, maturity = _soy_region(caption) key = f"ocpt-soybeans-{year}-{region.lower()}-{maturity or 't'+str(tbl_no)}" # Build a per-site mapping for the sidecar's site list. site_towns = "; ".join( f"{c}={site_codes.get(c, c)}" for c in codes_in_header) st = out.get(key) if st is None: st = SiteTrial( source_key=key, crop="soybeans", year=year, region=region, site=(maturity + " maturity").strip() or None, pdf_url=pdf_url, table_label=caption) st.cooperator = site_towns or None # repurpose for site map note out[key] = st seen = {(r["brand"], r["product"]) for r in st.results} for r in rows: if (r["brand"], r["product"]) not in seen: st.results.append(r) seen.add((r["brand"], r["product"])) i = header_idx + 1 res = list(out.values()) for st in res: st.results = _finalize_results(st.results) return [st for st in res if len(st.results) >= 2] # --------------------------------------------------------------------- sanity + rank def _row_ok(r: dict) -> bool: brand = (r.get("brand") or "").strip() if not brand or len(brand) <= 1: return False # A purely-numeric brand is junk (a stat row leaked through). if _NUM_TOKEN.match(brand): return False if not (r.get("product") or "").strip(): return False y = (r.get("metrics") or {}).get("Yield") if not isinstance(y, (int, float)) or not (1 < y < 400): return False return True def _finalize_results(results: list[dict]) -> list[dict]: """Drop junk rows, sort by Yield desc, assign 1-based rank.""" good = [r for r in results if _row_ok(r)] good.sort(key=lambda r: -float(r["metrics"]["Yield"])) for idx, r in enumerate(good, start=1): r["rank"] = idx # drop internal-only key if present r.pop("_seed_treatment", None) # canonical key order: rank, brand, product, traits, metrics return [{"rank": r["rank"], "brand": r["brand"], "product": r["product"], "traits": r.get("traits"), "metrics": r["metrics"]} for r in good] # --------------------------------------------------------------------- render def render_markdown(st: SiteTrial) -> str: crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get( st.crop, st.crop.title()) where = ", ".join(filter(None, [st.site, "OH"])) head: list[str] = [ f"# {crop_label} yield trial — {where} ({st.region}, {st.year})", "", f"- **Publisher:** {PUBLISHER_BRAND} (independent third-party," f" all brands)", f"- **Vendor:** {VENDOR} (CFAES extension)", f"- **Crop:** {crop_label}", f"- **Year:** {st.year}", f"- **Region:** {st.region}", ] if st.site: head.append(f"- **Site:** {st.site}") if st.county: head.append(f"- **County:** {st.county}") if st.table_label: head.append(f"- **Source table:** {st.table_label}") if st.cooperator: head.append(f"- **Cooperator / sites:** {st.cooperator}") if st.soil_type: head.append(f"- **Soil type:** {st.soil_type}") if st.previous_crop: head.append(f"- **Previous crop:** {st.previous_crop}") if st.tillage: head.append(f"- **Tillage:** {st.tillage}") head += [f"- **Source PDF:** {st.pdf_url}", f"- **Note:** {TOS_NOTE}", "", "---", "", "## Results (by yield rank)", ""] # Discover metric columns from the first result. metric_keys: list[str] = [] for r in st.results: if r.get("metrics"): metric_keys = list(r["metrics"].keys()) break headers = ["Rank", "Brand", "Hybrid/Variety", "Traits"] + metric_keys head.append("| " + " | ".join(headers) + " |") head.append("|" + "|".join(["---"] * len(headers)) + "|") for r in st.results: row = [str(r.get("rank", "-")), r.get("brand") or "-", r.get("product") or "-", r.get("traits") or "-"] m = r.get("metrics") or {} for k in metric_keys: v = m.get(k) row.append("-" if v is None else str(v)) head.append("| " + " | ".join(row) + " |") head.append("") return "\n".join(head) def write_trial(st: SiteTrial, body_md: str) -> None: CORPUS_DIR.mkdir(parents=True, exist_ok=True) (CORPUS_DIR / f"{st.source_key}.md").write_text(body_md, encoding="utf-8") sidecar = { "source": "ohio_ocpt_trials", "source_key": st.source_key, "data_type": "trial", "vendor": VENDOR, "brand_aggregator": BRAND_AGGREGATOR, "brand": PUBLISHER_BRAND, "crop": st.crop, "state": "OH", "state_abbrev": "oh", "year": st.year, "region": st.region, "site": st.site, "cooperator": st.cooperator, "county": st.county, "soil_type": st.soil_type, "previous_crop": st.previous_crop, "tillage": st.tillage, "table_label": st.table_label, "results": st.results, "n_results": len(st.results), "tos_note": TOS_NOTE, "source_urls": [st.pdf_url], "fetched_at": datetime.now(timezone.utc).isoformat(), "scraper_version": SCRAPER_VERSION, } (CORPUS_DIR / f"{st.source_key}.json").write_text( json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") # --------------------------------------------------------------------- pipeline def _extract_pdf(http: RateLimitedSession, pdf_url: str ) -> tuple[str, dict[str, dict[str, str]]]: """Return ``(full_text, corn_footnotes_by_site)``. The footnote map is only meaningful for corn; soy ignores it.""" r = http.get(pdf_url) r.raise_for_status() with pdfplumber.open(io.BytesIO(r.content)) as pdf: text = "\n".join((p.extract_text() or "") for p in pdf.pages) try: footnotes = build_corn_footnotes(pdf.pages) except Exception as exc: # noqa: BLE001 — footnotes are enrichment log.warning("footnote extraction failed (%s): %s", pdf_url, exc) footnotes = {} return text, footnotes def process_report(http: RateLimitedSession, *, crop: str, year: int, force: bool, limit: int | None, counts: dict) -> int: pdf_url = discover_report_pdf(http, crop, year) if not pdf_url: counts["no_pdf"] += 1 return 0 try: text, footnotes = _extract_pdf(http, pdf_url) except Exception as exc: # noqa: BLE001 log.error("%s %d PDF fetch/parse failed: %s", crop, year, exc) counts["failed"] += 1 return 0 if not text or len(text) < 1000: log.warning("%s %d: PDF text layer too thin (%d chars) — skip", crop, year, len(text)) counts["image_skip"] += 1 return 0 _BLOCK_SITELINE.clear() if crop == "corn": trials = build_corn_sites(text, year, pdf_url, footnotes_by_site=footnotes) else: trials = build_soy_regions(text, year, pdf_url) if not trials: log.warning("%s %d: no per-site tables parsed cleanly — skip", crop, year) counts["table_skip"] += 1 return 0 written = 0 for st in trials: if limit is not None and counts["written"] >= limit: break md_path = CORPUS_DIR / f"{st.source_key}.md" if md_path.exists() and not force: counts["skipped"] += 1 continue write_trial(st, render_markdown(st)) counts["written"] += 1 written += 1 brands = sorted({r["brand"] for r in st.results}) log.info("%s | %d results | %d brands (%s%s)", st.source_key, len(st.results), len(brands), ", ".join(brands[:4]), "…" if len(brands) > 4 else "") log.info("%s %d: %d sidecars written (%d candidate tables)", crop, year, written, len(trials)) return written def run(*, crops: list[str], years: list[int], limit: int | None, force: bool) -> int: CORPUS_DIR.mkdir(parents=True, exist_ok=True) http = RateLimitedSession() counts = {"written": 0, "skipped": 0, "no_pdf": 0, "failed": 0, "image_skip": 0, "table_skip": 0} for crop in crops: for year in years: if limit is not None and counts["written"] >= limit: break process_report(http, crop=crop, year=year, force=force, limit=limit, counts=counts) log.info("done: written=%d skipped=%d no_pdf=%d failed=%d " "image_skip=%d table_skip=%d", counts["written"], counts["skipped"], counts["no_pdf"], counts["failed"], counts["image_skip"], counts["table_skip"]) return 0 # --------------------------------------------------------------------- CLI def _build_argparser() -> argparse.ArgumentParser: p = argparse.ArgumentParser( prog="scrape.sources.ohio_ocpt_trials", description="Scrape the Ohio Corn/Soybean Performance Test report " "PDFs into per-site cross-vendor yield trials.") p.add_argument("--year", type=int, default=None, help="Scrape a single year (default: 2024+2025 baseline).") p.add_argument("--include-old", action="store_true", help="Also scrape the u.osu.edu archive years (2018–2023).") p.add_argument("--limit", type=int, default=None, help="Stop after writing N sidecars (default: all).") p.add_argument("--force", action="store_true", help="Re-fetch even if the markdown file already exists.") p.add_argument("--crop", default=None, choices=("corn", "soybeans"), help="Limit to one crop (default: both).") p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) return p def main(argv: list[str] | None = None) -> int: args = _build_argparser().parse_args(argv) logging.basicConfig( level=args.log_level.upper(), format="%(asctime)s %(levelname)s %(name)s %(message)s", stream=sys.stderr) crops = [args.crop] if args.crop else ["corn", "soybeans"] if args.year is not None: years = [args.year] elif args.include_old: years = OLD_YEARS + BASELINE_YEARS else: years = BASELINE_YEARS return run(crops=crops, years=years, limit=args.limit, force=args.force) if __name__ == "__main__": sys.exit(main())