From c737871c4ccb4f8f122354ac1bed3ef97bb036c6 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Mon, 25 May 2026 15:19:03 -0400 Subject: [PATCH] Trial-data scrapers: gh_plot_reports + agripro_trials + search_trials tool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces TRIAL data — yield-performance results from real field trials — as a SEPARATE data type alongside variety identity. The two are complementary: search_docs → "What's the disease resistance of DKC62-08RIB?" (variety identity — what it IS) search_trials → "Which corn hybrid won the IA 2024 trials?" (performance data — how it PERFORMED) scrape/sources/gh_plot_reports.py — Golden Harvest plot reports - 4,618 expected (2024+2025; 2023 deferred to a backfill pass). - URL: //plot-report/// - Cross-vendor: each plot lists products from multiple brands (NK / DEKALB / Golden Harvest / Enogen / Pioneer / Channel) side by side at one cooperator's field — the kind of independent comparison data Bayer doesn't publish itself. - Generic per-column metrics dict (Yield/MST/Test Weight/$/Ac for corn+soy, Ton/Acre + Milk + Beef columns for silage). - Politeness: 1 req/sec, retries on 429/5xx, no redirect-follow. scrape/sources/agripro_trials.py — AgriPro regional trial PDFs - 14 unique PDFs (38 sitemap links deduped) at /trials-data - pdfplumber text extraction, region/year detection from filename - Verbatim PDF text preserved in chunk body so variety + yield number adjacency drives retrieval (AP Iliad's Aberdeen ID yield matches a query about "AP Iliad Idaho yield") rag/chunk.py — chunks_from_trial() dispatching by source - Plot reports: identity preamble + Top-5 by primary metric + full ranking table. Metric labels chosen from the data (corn/soy use "Yield", silage uses "Ton/Acre"). - AgriPro PDFs: identity preamble + verbatim trial body inline so per-location yields surface for region+variety queries. 
- Variety chunks get data_type="variety" metadata; trial chunks get data_type="trial". Single Chroma collection; the tool router filters by data_type rather than maintaining two collections. rag/index.py — dispatch by sidecar's data_type field rag/bm25.py — new filter columns (data_type, year, state) docs_mcp/server.py — sixth MCP tool: search_trials(crop?, state?, year?, product?, k=10) - Filters trial chunks via where={"data_type": "trial", ...} - Optional product substring post-filter for "DKC62-08RIB Iowa 2024" style searches - search_docs now defaults to data_type="variety" so trial chunks don't bleed into variety identity queries - Tool docstring routes the agent: "use lookup_variety to verify identity details on any trial winner you surface" NK trial endpoint (/NKSeeds/wsProxy.asmx/GetPlotResult) is documented as deferred — the ASMX-SOAP shape returned empty XML on the initial probe. Bayer per-variety yield data is not publicly indexed at all — documented in the trial-scope note (DEKALB/Asgrow trial data flows through Channel reps, not the web). AgRevival research books exist as 10 large annual PDFs but are deferred (low ROI per parse). Initial corpus shipped in this PR: 14 AgriPro trial PDFs. The 4,618 Golden Harvest plot reports are still being scraped in the background and will be added in a follow-up corpus-snapshot PR (~70 min ETA). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agripro_trials/agt-2024-pnw-combined.json | 44 + .../agripro_trials/agt-2024-pnw-combined.md | 54 ++ .../agt-2025-np-perf-data-sd-web.json | 35 + .../agt-2025-np-perf-data-sd-web.md | 104 +++ .../agt-2025-np-perf-data-web-east.json | 36 + .../agt-2025-np-perf-data-web-east.md | 114 +++ .../agt-2025-np-perf-data-web-west.json | 36 + .../agt-2025-np-perf-data-web-west.md | 113 +++ .../agt-central-plains-dryland-2025-r1.json | 33 + .../agt-central-plains-dryland-2025-r1.md | 56 ++ .../agripro_trials/agt-montana-2025-web.json | 32 + corpus/agripro_trials/agt-montana-2025-web.md | 53 ++ .../agripro_trials/agt-ne-colorado-2025.json | 33 + corpus/agripro_trials/agt-ne-colorado-2025.md | 58 ++ .../agt-plains-irrigated-2025.json | 33 + .../agt-plains-irrigated-2025.md | 52 ++ .../agt-sc-ks-nc-ok-2024-0.json | 34 + .../agripro_trials/agt-sc-ks-nc-ok-2024-0.md | 54 ++ .../agripro_trials/agt-south-dakota-2025.json | 36 + .../agripro_trials/agt-south-dakota-2025.md | 52 ++ .../agt-southern-idaho-2025.json | 40 + .../agripro_trials/agt-southern-idaho-2025.md | 49 ++ .../agt-washington-n-idaho-2025.json | 40 + .../agt-washington-n-idaho-2025.md | 48 ++ .../agt-western-plains-dryland-2025-0.json | 33 + .../agt-western-plains-dryland-2025-0.md | 58 ++ .../agt-wheat-after-soy-2025.json | 33 + .../agt-wheat-after-soy-2025.md | 55 ++ docs_mcp/server.py | 243 +++++- rag/bm25.py | 21 +- rag/chunk.py | 253 ++++++ rag/index.py | 25 +- scrape/sources/agripro_trials.py | 483 +++++++++++ scrape/sources/gh_plot_reports.py | 781 ++++++++++++++++++ sources.json | 103 ++- 35 files changed, 3302 insertions(+), 25 deletions(-) create mode 100644 corpus/agripro_trials/agt-2024-pnw-combined.json create mode 100644 corpus/agripro_trials/agt-2024-pnw-combined.md create mode 100644 corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json create mode 100644 corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md create mode 100644 
corpus/agripro_trials/agt-2025-np-perf-data-web-east.json create mode 100644 corpus/agripro_trials/agt-2025-np-perf-data-web-east.md create mode 100644 corpus/agripro_trials/agt-2025-np-perf-data-web-west.json create mode 100644 corpus/agripro_trials/agt-2025-np-perf-data-web-west.md create mode 100644 corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json create mode 100644 corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md create mode 100644 corpus/agripro_trials/agt-montana-2025-web.json create mode 100644 corpus/agripro_trials/agt-montana-2025-web.md create mode 100644 corpus/agripro_trials/agt-ne-colorado-2025.json create mode 100644 corpus/agripro_trials/agt-ne-colorado-2025.md create mode 100644 corpus/agripro_trials/agt-plains-irrigated-2025.json create mode 100644 corpus/agripro_trials/agt-plains-irrigated-2025.md create mode 100644 corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json create mode 100644 corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md create mode 100644 corpus/agripro_trials/agt-south-dakota-2025.json create mode 100644 corpus/agripro_trials/agt-south-dakota-2025.md create mode 100644 corpus/agripro_trials/agt-southern-idaho-2025.json create mode 100644 corpus/agripro_trials/agt-southern-idaho-2025.md create mode 100644 corpus/agripro_trials/agt-washington-n-idaho-2025.json create mode 100644 corpus/agripro_trials/agt-washington-n-idaho-2025.md create mode 100644 corpus/agripro_trials/agt-western-plains-dryland-2025-0.json create mode 100644 corpus/agripro_trials/agt-western-plains-dryland-2025-0.md create mode 100644 corpus/agripro_trials/agt-wheat-after-soy-2025.json create mode 100644 corpus/agripro_trials/agt-wheat-after-soy-2025.md create mode 100644 scrape/sources/agripro_trials.py create mode 100644 scrape/sources/gh_plot_reports.py diff --git a/corpus/agripro_trials/agt-2024-pnw-combined.json b/corpus/agripro_trials/agt-2024-pnw-combined.json new file mode 100644 index 00000000..eb319481 --- /dev/null +++ 
b/corpus/agripro_trials/agt-2024-pnw-combined.json @@ -0,0 +1,44 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2024-pnw-combined", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2024 Pacific Northwest Combined Summary, Three-Year Data", + "filename": "2024%20PNW%20Combined.pdf", + "region": "Pacific Northwest", + "wheat_class_section": null, + "year": 2024, + "years_covered": [ + 2024 + ], + "varieties_found": [ + "AP Olympia", + "AP Exceed", + "SY Ovation", + "SY Dayton", + "SY Assure", + "AP Iliad", + "LCS Shine", + "LCS Artdeco", + "Norwest Duet", + "LCS Hulk", + "PNW Hailey", + "LCS Sonic", + "Norwest Tandem", + "UI Magic CL+", + "LCS Blackjack", + "LCS Drive", + "LCS Jefe", + "LCS Kamiack" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2024-09/2024%20PNW%20Combined.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2024-09/2024%20PNW%20Combined.pdf" + ], + "page_text_chars": 2613, + "fetched_at": "2026-05-25T19:11:04.196638+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2024-pnw-combined.md b/corpus/agripro_trials/agt-2024-pnw-combined.md new file mode 100644 index 00000000..7f38fb06 --- /dev/null +++ b/corpus/agripro_trials/agt-2024-pnw-combined.md @@ -0,0 +1,54 @@ +# 2024 Pacific Northwest Combined Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Pacific Northwest +- **Year:** 2024 +- **PDF:** https://agriprowheat.com/sites/default/files/2024-09/2024%20PNW%20Combined.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Olympia, AP Exceed, SY Ovation, SY Dayton, SY Assure, AP Iliad, LCS Shine, LCS Artdeco, Norwest Duet, LCS Hulk, PNW Hailey, LCS Sonic, Norwest Tandem, UI Magic CL+, LCS Blackjack, LCS Drive, LCS Jefe, 
LCS Kamiack + +--- + +## Trial data (verbatim from PDF) + +``` +2024 Pacific Northwest Combined Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2022-2024 +3-Yr Combined 2-Yr Combined Combined Moses Lake, Walla Walla, Aberdeen, Craigmont, Twin Falls, +Variety (2022-2024) (2023-2024) (2024) WA WA ID ID ID +Soft White Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield Yield +Winter Wheat Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A Bu/A +AP Olympia 153.6 63.7 146.5 63.5 154.7 65.4 201.6 189.5 113.8 124.5 144.2 +AP Exceed 150.5 63.2 147.2 63.2 156.6 65.2 188.0 191.2 124.2 124.7 154.8 +SY Ovation 148.5 63.2 146.3 63.1 148.9 64.8 186.4 179.4 117.1 120.1 141.3 +SY Dayton 146.4 63.2 143.1 63.2 155.4 65.1 192.6 189.4 115.4 128.9 150.6 +SY Assure 141.6 64.0 138.7 63.9 141.3 65.4 144.6 172.6 119.7 133.4 136.1 +AP Iliad 144.7 63.1 150.7 65.1 173.2 175.3 116.3 135.4 153.2 +LCS Shine 150.5 62.9 146.0 62.8 156.1 64.4 182.0 169.0 137.3 125.8 166.5 +LCS Artdeco 149.4 62.5 145.9 62.6 158.9 64.2 197.9 183.8 139.9 122.3 150.3 +Norwest Duet 148.5 62.3 148.7 62.0 162.1 64.3 220.0 196.5 129.3 123.9 140.8 +LCS Hulk 147.9 64.0 145.1 63.6 156.3 65.3 204.1 174.1 133.5 123.5 146.4 +PNW Hailey 147.3 63.9 142.0 63.9 152.1 65.4 188.6 192.7 112.1 126.1 141.0 +LCS Sonic 146.8 62.8 143.7 62.7 150.3 64.7 190.3 186.6 124.2 125.6 125.0 +Norwest Tandem 144.2 62.4 143.2 62.3 156.1 64.6 196.1 169.8 137.5 130.1 146.8 +UI Magic CL+ 143.4 63.7 139.4 63.4 150.5 65.4 179.5 183.6 124.2 123.2 142.1 +LCS Blackjack 147.7 60.5 162.0 63.1 203.8 198.0 122.3 131.9 154.0 +LCS Drive 141.7 61.3 153.0 63.8 181.4 181.0 125.8 122.9 153.8 +LCS Jefe 158.0 65.1 208.3 191.0 113.4 123.0 154.3 +LCS Kamiack 151.1 65.1 181.1 178.9 122.9 133.2 139.6 +Mean General 148.7 63.2 146.4 62.9 156.0 64.8 192.3 186.6 126.6 127.6 147.1 +LSD General (5%) EE 7.8 0.7 9.2 1.0 13.0 1.3 18.5 ns 4.5 8.9 ns +CV (Effective) 6.5 1.8 6.5 1.9 5.4 1.5 4.7 6.7 1.8 3.4 6.7 +Boldfaced numbers are within confidence 
interval at specific locations and combined years of yield data. +NS = Non Significant +Locations +2022—Colfax, WA; Aberdeen, Genesee, and Twin Falls, ID +2023—Moses Lake and Walla Walla, WA; Craigmont, Genesee, and Twin Falls, ID +2024—Moses Lake and Walla Walla, WA; Aberdeen, Craigmont, and Twin Falls, ID +© 2023 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. 8-30-24 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json new file mode 100644 index 00000000..b4c31a4d --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json @@ -0,0 +1,35 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2025-np-perf-data-sd-web", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Performance Summary, Syngenta Data", + "filename": "2025%20NP%20Perf%20Data%20SD%20web.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Valda", + "AP Dagr", + "AP Iconic", + "AP Elevate", + "AP Murdock", + "AP Gunsmoke CL2", + "SY Ingmar", + "AP Revolution", + "LCS Trigger" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20SD%20web.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20SD%20web.pdf" + ], + "page_text_chars": 5882, + "fetched_at": "2026-05-25T19:11:13.388036+00:00", + "scraper_version": "0.1.0" +} diff --git 
a/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md new file mode 100644 index 00000000..8709f83b --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md @@ -0,0 +1,104 @@ +# 2025 Performance Summary, Syngenta Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20SD%20web.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Valda, AP Dagr, AP Iconic, AP Elevate, AP Murdock, AP Gunsmoke CL2, SY Ingmar, AP Revolution, LCS Trigger + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Performance Summary, Syngenta Data +South Dakota +2025 Yield bu/ac +South Dakota Protein Test Wt. Heading Height +Variety Avg. Agar Miller Northville Selby % lbs/bu 1-9 1-9 +AgriPro HY141 59.4 35.8 57.6 72.0 72.1 15.9 57.1 5 6 +SY Valda 56.8 35.3 63.3 61.2 67.3 16.0 58.2 5 5 +AP Dagr 56.4 29.7 54.5 69.8 71.5 16.0 55.6 6 4 +AgriPro HY155 56.2 31.1 57.3 65.6 70.9 16.9 57.6 5 6 +AgriPro HY162 54.8 30.4 52.2 68.2 68.5 16.2 56.7 5 6 +AP Iconic 54.1 29.0 52.0 68.9 66.4 17.0 56.3 5 6 +AP Elevate 53.3 27.1 51.1 71.5 63.8 16.8 56.2 6 4 +AP Murdock 48.5 29.8 46.6 51.0 66.7 17.5 57.1 4 4 +AP Gunsmoke CL2 47.6 30.1 44.8 64.5 51.2 17.8 55.7 5 5 +SY Ingmar 44.3 23.1 40.5 59.3 54.3 17.8 57.2 5 5 +AP Revolution 43.0 31.5 35.7 38.9 66.1 17.3 57.9 4 4 +LCS Trigger 63.5 34.0 67.6 77.8 74.6 15.2 58.1 6 6 +ND Stampede 60.2 47.3 54.3 66.5 72.6 17.1 58.2 5 5 +Brawn-SD 58.8 42.1 55.4 69.0 68.9 15.4 59.5 5 NA +WB9641 57.0 32.2 58.1 71.6 66.0 16.0 56.8 6 5 +MN-Torgy 56.2 34.0 54.0 67.4 69.4 17.9 58.9 6 6 +WB9645 55.9 37.5 51.3 67.2 67.6 16.1 56.8 7 6 +Ascend-SD 54.7 33.2 51.1 66.3 68.3 16.9 57.7 6 8 +Driver 52.0 30.9 44.4 70.2 62.5 16.7 57.4 5 7 +WB9642 49.4 34.7 30.0 65.8 67.0 16.4 59.1 6 4 +WB9590 
42.6 41.3 25.8 50.4 52.9 17.9 57.1 4 4 +Mean 53.7 33.7 50.3 64.4 66.3 16.7 57.4 +LSD (5%) 8.4 7.1 9.9 5.8 1.0 2.4 +CV (%) 9.7 9.9 9.0 13.0 4.3 2.6 1.2 +No. of Locs. 4 2 2 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= early; Height: 1 = short; Lodging: 1 = no lodging; Disease 1 = tolerant +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause +variations within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific +evaluations. Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific +evaluation inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly +100% hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. NP - 10/2025 + +Three-Year Performance Summary, Syngenta Data (2023-2025) +South Dakota +Yield Average bu/ac Economic Return1 Agronomics and Disease +Protein Test Wt. 
Gross Heading Height Lodging BLS FHB +Variety 3-yr 2-yr 2025 % lb/bu $/A Rank 1-9 1-9 1-9 1-9 1-9 +AgriPro HY141 65.0 65.7 59.4 15.4 60.2 370.0 3 5 6 6 5 4 +AP Dagr 63.1 62.5 56.4 15.6 59.8 364.2 10 6 4 5 4 5 +AgriPro HY162 62.7 61.4 54.8 15.7 59.9 364.5 9 5 6 5 5 4 +AP Iconic 62.2 62.1 54.1 16.2 60.1 368.4 5 5 6 3 4 4 +SY Valda 62.2 62.5 56.8 16.1 60.5 368.3 6 5 5 5 4 4 +AgriPro HY155 61.8 61.5 56.2 16.5 60.3 365.8 7 5 6 5 5 4 +AP Elevate 61.6 60.9 53.3 16.3 60.2 364.9 8 6 4 3 4 4 +AP Gunsmoke CL2 56.9 55.6 47.6 16.8 60.0 336.9 12 5 5 3 5 4 +AP Murdock 53.1 54.7 48.5 16.9 60.2 314.3 13 4 4 4 4 4 +AP Revolution 52.8 53.2 43.0 16.9 60.7 312.9 14 4 4 4 3 3 +SY Ingmar 52.4 51.7 44.3 17.2 60.7 309.9 15 5 5 3 3 3 +LCS Trigger 71.9 70.9 63.5 14.3 60.9 376.2 2 6 6 NA 4 NA +Brawn-SD 64.5 62.1 58.8 15.5 61.7 369.3 4 5 NA NA 4 5 +Ascend-SD 64.4 62.5 54.7 16.7 60.8 381.2 1 6 8 7 3 4 +Driver 60.5 58.5 52.0 16.3 60.5 358.1 11 5 7 NA NA NA +ND Stampede 60.2 5 5 5 4 5 +WB9641 57.0 6 5 4 6 5 +MN-Torgy 56.2 6 6 5 3 3 +WB9645 55.9 7 6 5 6 5 +WB9642 49.4 6 4 6 6 4 +WB9590 42.6 4 4 2 6 6 +Mean 61.0 60.4 53.7 16.2 60.4 +LSD (5%) 4.3 4.5 8.4 0.7 0.4 +CV (%) 7.7 7.9 9.74 0.8 2.2 +No. of Locs. 10 8 4 6 6 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= early; Height: 1 = short; Lodging: 1 = no lodging; Disease 1 = tolerant +2025 Locations: Agar, Miller, Northville, and Selby SD +2024 Locations: Agar, Roscoe, Northville, and Selby, SD +2023 Locations: Selby, and Webster, SD +1 Economic return calculated by using the three-year yield average multiplied by the average grain price ($5.12/bu). (+) 8 cents per 1/5th premium over 14% protein, (-) 10 cents +per 1/5 discount under 14% protein +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025crop year. Specific conditions may cause variations +within those characteristics. 
These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific evaluations. +Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific evaluation +inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. NP - 10/2025 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-east.json b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.json new file mode 100644 index 00000000..ecc20dab --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.json @@ -0,0 +1,36 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2025-np-perf-data-web-east", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Performance Summary, Syngenta Data", + "filename": "2025%20NP%20Perf%20Data%20web%20East.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Iconic", + "AP Elevate", + "SY Valda", + "AP Murdock", + "AP Smith", + "AP Dagr", + "AP Gunsmoke CL2", + "SY", + "SY Ingmar", + "LCS Boom" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20East.pdf", + "source_urls": [ + 
"https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20East.pdf" + ], + "page_text_chars": 7194, + "fetched_at": "2026-05-25T19:11:10.521992+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-east.md b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.md new file mode 100644 index 00000000..b9d1ea9a --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.md @@ -0,0 +1,114 @@ +# 2025 Performance Summary, Syngenta Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20East.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Iconic, AP Elevate, SY Valda, AP Murdock, AP Smith, AP Dagr, AP Gunsmoke CL2, SY, SY Ingmar, LCS Boom + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Performance Summary, Syngenta Data +Eastern North Dakota and Minnesota +2025 Yield bu/ac +Test +Minnesota North Dakota +Prot. Wt. +Variety Avg. 
Crookston Glyndon Warren Wolverton Cando Casselton Langdon McVille Park River Thompson % lbs/bu +AgriPro HY141 90.8 87.2 75.8 85.7 90.7 88.4 103.5 108.6 92.7 82.0 93.9 13.5 59.5 +AgriPro HY155 90.8 93.7 74.7 93.3 88.3 84.6 91.8 101.3 99.1 85.3 96.1 14.0 60.2 +AgriPro HY162 86.7 85.8 73.1 88.9 92.5 75.8 86.3 103.8 90.1 82.3 88.6 13.5 60.2 +AP Iconic 83.5 92.2 69.0 75.1 76.9 86.8 85.5 104.1 86.6 73.6 85.5 13.5 58.7 +AP Elevate 81.9 84.3 66.1 74.2 74.9 78.1 81.2 104.7 93.9 73.2 87.9 13.5 59.7 +SY Valda 81.2 74.6 71.6 94.7 73.2 81.0 82.8 91.0 82.5 72.9 87.3 13.9 58.4 +AP Murdock 80.2 90.0 64.7 85.2 78.6 75.7 82.3 96.2 87.4 55.3 87.1 13.5 59.5 +AP Smith 80.2 80.8 64.9 83.0 70.3 71.3 85.0 91.5 95.1 71.6 88.6 14.4 59.1 +AP Dagr 79.0 79.2 68.1 90.5 71.2 75.5 82.1 88.1 79.5 59.0 97.0 13.2 57.6 +AP Gunsmoke CL2 77.4 81.7 63.1 79.7 67.8 82.3 83.7 93.5 88.1 49.9 83.9 13.3 58.1 +SY 611 CL2 76.7 78.0 63.1 73.2 71.1 74.4 77.3 100.8 83.3 62.7 82.8 14.5 58.9 +SY Ingmar 74.8 70.0 62.1 80.8 66.6 69.4 85.0 89.0 73.1 65.0 86.7 14.5 59.1 +ND Stampede 87.1 86.8 69.0 90.3 88.7 77.2 91.0 100.5 100.9 76.2 90.4 15.1 58.8 +WB9645 85.7 96.2 72.7 92.2 73.9 74.2 95.1 103.0 99.2 69.1 81.4 13.3 58.9 +Ascend-SD 85.5 95.6 69.0 88.8 90.5 79.7 88.3 93.1 90.6 78.0 81.7 14.1 60.5 +WB9641 85.5 74.7 64.2 93.8 75.0 88.1 92.1 104.8 98.3 65.4 98.9 12.7 58.6 +WB9590 84.8 81.3 63.8 88.1 76.9 77.0 86.2 103.1 93.0 77.0 101.4 14.8 59.5 +WB9606 83.8 87.4 70.8 90.6 69.7 78.8 96.4 95.5 94.2 75.3 79.0 12.9 60.5 +Faller 82.5 88.9 62.2 92.1 69.4 83.8 83.0 98.6 87.6 66.5 92.6 14.0 58.9 +WB9642 81.9 83.7 65.7 92.0 77.8 81.1 80.8 91.9 85.2 73.2 88.0 13.6 60.9 +MN-Torgy 80.9 89.4 65.1 84.6 80.9 82.0 76.0 92.6 94.5 70.3 73.7 14.6 60.5 +MN-Rothsay 80.8 82.2 65.8 91.1 69.7 82.2 80.5 96.1 91.1 60.7 88.3 13.9 58.8 +WB9719 75.9 84.6 60.3 88.8 63.8 72.7 79.1 97.8 78.4 42.9 90.7 13.3 57.5 +LCS Boom 74.3 84.0 52.5 77.5 67.6 79.3 76.5 91.7 91.0 46.4 76.9 14.2 60.6 +ND Thresher 73.3 81.9 56.3 86.9 71.9 73.7 70.5 86.7 70.2 56.2 78.9 
14.4 56.6 +Mean 82.2 84.8 66.4 86.9 76.6 79.2 85.2 97.5 89.2 68.3 87.6 13.8 59.2 +LSD (5%) 4.8 11.5 4.7 13.2 13.1 11.0 — 10.6 10.3 9.2 11.4 0.5 0.7 +CV (%) 6.5 6.6 4.3 7.3 8.2 8.4 12.1 6.6 7.0 7.8 6.4 6.9 2.1 +No. of Locs. 10 8 10 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= Early, Height: 1 = Short +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause variations +within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific evaluations. +Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific evaluation +inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. NP - 10/2025 + +Three-Year Performance Summary, Syngenta Data (2023-2025) +Eastern North Dakota and Minnesota +Yield Average bu/ac Economic Return1 Agronomics and Disease +Protein Test Wt. 
Heading Height Lodging BLS FHB +Variety 3-yr 2-yr 2025 % lb/bu Gross $/A Rank 1-9 1-9 1-9 1-9 1-9 +AgriPro HY155 92.4 91.5 90.8 14.4 60.2 488.8 1 5 6 5 5 4 +AgriPro HY141 91.7 90.6 90.8 13.8 59.8 461.9 5 5 6 6 5 4 +AgriPro HY162 91.0 89.3 86.7 13.6 60.3 450.0 8 5 6 5 5 4 +AP Iconic 87.5 87.1 83.5 14.0 59.4 446.3 9 5 6 3 4 4 +AP Elevate 85.9 86.0 81.9 14.5 60.1 458.5 6 6 4 3 4 4 +AP Dagr 85.7 82.6 79.0 13.7 58.7 426.2 17 6 4 5 4 5 +SY Valda 85.5 84.5 81.2 14.2 58.8 446.2 10 5 5 5 4 4 +AP Smith 82.6 82.0 80.2 14.8 59.6 450.3 7 6 4 2 3 4 +AP Murdock 82.4 82.4 80.2 14.3 60.0 431.5 16 4 4 4 4 4 +AP Gunsmoke CL2 81.2 80.9 77.4 14.6 58.9 434.8 15 5 5 3 5 4 +SY 611 CL2 80.7 79.5 76.7 14.9 59.3 443.4 11 5 4 4 4 3 +SY Ingmar 79.8 79.7 74.8 15.0 59.7 440.1 13 5 5 3 3 3 +WB9606 87.7 85.8 83.8 13.8 60.9 438.9 14 5 6 5 6 5 +MN-Rothsay 86.3 84.3 80.8 14.8 59.9 468.5 3 7 3 3 4 5 +WB9590 85.5 84.4 84.8 15.1 59.3 476.8 2 4 4 2 6 6 +Faller 85.0 82.9 82.5 14.2 59.3 443.3 12 6 7 7 3 3 +MN-Torgy 84.0 82.4 80.9 15.1 60.9 466.3 4 6 6 5 3 3 +WB9719 81.6 77.9 75.9 14.0 58.6 417.1 18 6 5 2 5 6 +Ascend-SD 84.3 85.5 6 8 7 3 4 +ND Thresher 74.4 73.3 6 5 7 3 5 +ND Stampede 87.1 5 5 5 4 5 +WB9645 85.7 7 6 5 6 5 +WB9641 85.5 6 5 4 6 5 +WB9642 81.9 6 4 6 6 4 +LCS Boom 74.3 3 5 4 6 4 +Mean 85.4 83.6 82.5 14.6 59.7 +LSD (5%) 3.3 4.0 4.8 0.5 0.8 +CV (%) 6.7 6.9 6.5 2.8 2.1 +No. of Locs. 23 16 10 22 21 +Numbers in bold type are in the top yielding group and considered statistically similar. 
+Numerical ratings: Heading: 1= early; Height: 1 = short; Disease: 1 = no disease +2025 Locations: Crookston, Glyndon, Warren, and Wolverton MN; Cando, Casselton, Langdon, McVille, Park River, and Thompson ND +2024 Locations: Casselton, Drayton, Langdon, and Park River, ND; Crookston and Warren, MN +2023 Locations: Casselton, McVille, Park River, and Valley City, ND; Crookston, Glyndon, and Warren, MN +1 Economic return calculated by using the three-year yield average multiplied by the average grain price ($5.12/bu). (+) 8 cents per 1/5th premium over 14% protein, (-) 10 cents +per 1/5 discount under 14% protein. +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause +variations within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific +evaluations. Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific +evaluation inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. 
NP - 10/2025 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-west.json b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.json new file mode 100644 index 00000000..20609df4 --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.json @@ -0,0 +1,36 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2025-np-perf-data-web-west", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Performance Summary, Syngenta Data", + "filename": "2025%20NP%20Perf%20Data%20web%20west.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Iconic", + "AP Elevate", + "SY", + "SY Valda", + "AP Smith", + "AP Dagr", + "AP Gunsmoke CL2", + "AP Murdock", + "SY Ingmar", + "LCS Boom" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20west.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20west.pdf" + ], + "page_text_chars": 6380, + "fetched_at": "2026-05-25T19:11:11.464402+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-west.md b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.md new file mode 100644 index 00000000..77804829 --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.md @@ -0,0 +1,113 @@ +# 2025 Performance Summary, Syngenta Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20west.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Iconic, AP Elevate, SY, SY Valda, AP Smith, AP Dagr, AP Gunsmoke CL2, AP 
Murdock, SY Ingmar, LCS Boom + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Performance Summary, Syngenta Data +Western North Dakota +2025 Yield bu/ac +North Dakota +Prot. Test Wt. +Variety Avg. Berthold Coleharbor Harvey New Leipzig Steele Velva % lbs/bu +AgriPro HY162 78.9 45.9 110.4 89.1 67.3 61.1 99.3 14.6 60.2 +AgriPro HY155 77.5 45.4 110.9 85.1 65.2 60.3 98.0 15.3 59.7 +AP Iconic 75.8 50.2 100.4 88.9 67.4 55.2 92.7 14.5 59.9 +AgriPro HY141 75.1 46.8 101.8 75.8 68.4 59.1 98.6 14.3 60.0 +AP Elevate 74.6 52.8 104.9 81.6 57.1 61.4 89.6 14.9 60.7 +SY 611 CL2 73.3 54.0 98.7 74.3 66.3 55.9 90.5 15.5 60.3 +SY Valda 72.5 56.7 94.3 78.3 62.1 52.0 91.7 14.5 58.9 +AP Smith 72.1 56.9 98.2 81.9 46.9 58.8 90.1 15.2 59.9 +AP Dagr 70.7 59.2 94.3 63.9 60.9 53.3 92.4 14.4 59.6 +AP Gunsmoke CL2 70.5 51.2 92.5 77.9 61.0 53.5 87.0 15.5 60.5 +AP Murdock 69.2 50.7 97.5 69.7 64.7 48.1 84.2 15.0 60.8 +SY Ingmar 65.8 49.0 95.8 68.5 57.3 42.7 81.4 15.5 58.7 +ND Stampede 78.1 51.0 107.3 86.8 62.0 68.5 93.2 16.1 60.3 +WB9641 76.8 58.8 104.9 87.2 51.6 62.8 95.4 14.1 59.8 +Faller 76.5 47.1 98.6 85.1 63.6 68.1 96.6 15.2 60.4 +WB9645 75.6 51.3 100.9 76.0 63.4 58.9 102.8 14.1 60.6 +MN-Torgy 75.1 58.5 93.0 83.0 68.5 63.2 84.6 15.8 61.8 +LCS Boom 75.0 49.2 100.7 87.7 70.8 54.1 87.7 15.0 62.5 +WB9606 74.0 49.1 109.1 84.1 52.7 55.5 93.3 14.4 60.6 +Ascend-SD 73.0 50.7 98.9 84.4 50.3 68.4 85.0 15.1 62.1 +WB9590 72.2 52.6 104.4 74.8 59.4 49.8 92.1 15.6 60.4 +WB9642 71.8 48.0 96.3 66.9 72.3 57.1 90.4 14.9 60.4 +WB9719 71.7 50.4 98.6 71.1 69.1 53.2 87.9 15.0 59.7 +MN-Rothsay 71.0 48.0 100.7 89.1 36.5 61.4 90.5 13.5 60.4 +ND Thresher 68.4 49.1 93.5 68.8 58.3 55.6 85.0 15.8 59.2 +Mean 73.7 51.3 100.8 79.1 61.3 57.8 91.7 14.9 60.3 +LSD (5%) 6.6 10.4 12.5 8.9 — 6.2 1.1 1.6 +CV (%) 7.7 11.8 6.3 9.6 8.7 — 3.2 5.1 2.3 +No. of Locs. 6 5 6 +Numbers in bold type are in the top yielding group and considered statistically similar. 
+Numerical ratings: Heading: 1= Early, Height: 1 = Short +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause variations +within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific evaluations. +Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific evaluation +inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. NP - 10/2025 + +Three-Year Performance Summary, Syngenta Data (2023-2025) +Western North Dakota +Yield Average bu/ac Economic Return1 Agronomics and Disease +Protein Test Wt. 
Heading Height Lodging BLS FHB +Variety 3-yr 2-yr 2025 % lb/bu Gross $/A Rank 1-9 1-9 1-9 1-9 1-9 +AgriPro HY162 84.9 81.3 78.9 14.2 60.0 440.5 4 5 6 5 5 4 +AP Iconic 83.4 78.5 75.8 14.3 59.9 438.3 7 5 6 3 4 4 +AgriPro HY141 83.3 79.6 75.1 14.1 59.9 430.5 11 5 6 6 5 4 +AgriPro HY155 82.5 79.4 77.5 14.8 59.4 449.9 2 5 6 5 5 4 +AP Elevate 80.4 76.9 74.6 14.8 60.3 438.6 6 6 4 3 4 4 +AP Dagr 79.8 73.4 70.7 14.1 59.5 412.3 16 6 4 5 4 5 +SY Valda 79.5 73.4 72.5 14.3 59.1 416.3 15 5 5 5 4 4 +SY 611 CL2 78.2 73.5 73.3 15.1 59.7 435.7 8 5 4 4 4 3 +AP Smith 77.9 74.0 72.1 15.0 59.8 431.3 10 6 4 2 3 4 +AP Gunsmoke CL2 76.9 72.8 70.5 15.4 60.1 435.7 9 5 5 3 5 4 +AP Murdock 76.6 74.1 69.2 14.9 60.5 418.6 14 4 4 4 4 4 +SY Ingmar 74.7 71.4 65.8 15.4 59.5 424.5 13 5 5 3 3 3 +WB9606 82.9 78.6 74.0 14.1 60.2 429.1 12 5 6 5 6 5 +Faller 81.9 76.9 76.5 14.7 59.8 443.0 3 6 7 7 3 3 +WB9719 80.6 75.7 71.7 14.9 59.4 440.1 5 6 5 2 5 6 +WB9590 78.4 75.7 72.2 15.7 59.9 453.6 1 4 4 2 6 6 +MN-Torgy 77.2 75.1 6 6 5 3 3 +Ascend-SD 75.0 73.0 6 8 7 3 4 +MN-Rothsay 74.0 71.0 7 3 3 4 5 +ND Thresher 69.2 68.4 6 5 7 3 5 +ND Stampede 78.1 5 5 5 4 5 +WB9641 76.8 6 5 4 6 5 +WB9645 75.6 7 6 5 6 5 +LCS Boom 75.0 3 5 4 6 4 +WB9642 71.8 6 4 6 6 4 +Mean 80.1 75.5 73.7 14.7 59.8 +LSD (5%) 4.3 5.7 6.6 0.5 1.2 +CV (%) 6.9 8.1 7.7 5.9 2.2 +No. of Locs. 13 9 6 8 9 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= early; Height: 1 = short; Disease: 1 = no disease +2025 Locations: Berthold, Coleharbor, Harvey, New Leipzig, Steele, and Velva ND +2024 Locations: Coleharbor, Harvey, and Velva, ND +2023 Locations: Berthold, Coleharbor, New Leipzig, and Velva, ND +1 Economic return calculated by using the three-year yield average multiplied by the average grain price ($5.12/bu). (+) 8 cents per 1/5th premium over 14% protein, (-) 10 cents +per 1/5 discount under 14% protein. 
+These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause +variations within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific +evaluations. Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific +evaluation inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. 
NP - 10/2025 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json new file mode 100644 index 00000000..115f73ec --- /dev/null +++ b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-central-plains-dryland-2025-r1", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Central Plains Dryland Summary, Three-Year Data", + "filename": "Central%20Plains%20Dryland%202025%20r1.pdf", + "region": "Central Plains", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Wolverine", + "AP Roadrunner", + "AP Sunbird", + "AP Bigfoot", + "AP Prolific", + "SY Monument", + "LCS Atomic AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Central%20Plains%20Dryland%202025%20r1.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/Central%20Plains%20Dryland%202025%20r1.pdf" + ], + "page_text_chars": 2530, + "fetched_at": "2026-05-25T19:11:09.410687+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md new file mode 100644 index 00000000..d7698f4b --- /dev/null +++ b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md @@ -0,0 +1,56 @@ +# 2025 Central Plains Dryland Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Central Plains +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Central%20Plains%20Dryland%202025%20r1.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY 
Wolverine, AP Roadrunner, AP Sunbird, AP Bigfoot, AP Prolific, SY Monument, LCS Atomic AX + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Central Plains Dryland Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Belleville, Junction City, Pratt, Salina, +Variety (2023-2025) (2024-2025) (2025) KS* KS KS KS +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A +AP24 AX 63.9 58.8 70.7 58.9 73.8 55.3 64.8 94.7 59.0 76.7 +SY Wolverine 62.0 60.6 66.2 60.6 66.1 56.9 58.2 86.2 67.9 51.9 +AP Roadrunner 61.3 57.8 64.9 57.4 61.3 53.9 54.1 79.7 52.9 58.5 +AP Sunbird 61.2 61.0 67.2 61.3 66.6 57.9 59.0 86.7 54.3 66.5 +AP Bigfoot 60.0 61.3 65.2 61.5 64.7 59.0 53.1 86.6 53.3 65.9 +AP Prolific 59.6 60.4 64.1 60.4 61.8 58.2 46.0 86.2 45.8 69.2 +SY Monument 59.2 59.6 62.5 59.8 56.8 55.9 48.2 71.6 48.9 58.6 +Bob Dole 56.4 59.1 60.2 58.9 57.7 55.2 47.0 68.3 50.9 64.6 +Showdown 62.9 60.1 69.2 60.1 65.2 55.7 56.4 83.0 54.7 66.6 +Rockstar 61.7 58.8 67.4 59.1 65.5 55.8 54.0 83.8 55.9 68.3 +KS Providence 61.1 60.4 66.3 60.3 65.2 57.2 52.5 88.9 59.5 59.7 +WB4401 60.4 60.3 65.5 60.2 61.5 56.1 51.7 78.8 57.0 58.5 +LCS Atomic AX 59.4 61.7 65.2 61.8 65.1 59.2 52.3 88.0 50.0 70.1 +WB4523 58.2 59.5 62.8 59.6 57.5 56.2 40.6 68.2 60.0 61.2 +WB4422 69.6 59.7 71.0 56.3 62.5 87.3 63.6 70.6 +KS Bill Snyder 73.1 59.0 59.3 94.6 57.4 81.2 +WB4699 64.0 55.3 51.0 84.2 56.9 64.1 +High Cotton 63.8 58.2 43.8 90.9 59.4 61.2 +Polansky Goldenhawk 63.0 56.4 52.7 86.5 55.9 56.8 +Doublestop CLP 60.7 57.7 53.9 74.0 51.3 63.4 +Mean General 60.6 60.0 65.9 60.0 63.7 56.7 51.7 83.6 54.3 65.4 +LSD General (5%) EE 3.9 1.1 4.6 1.2 8.3 2.7 9.5 10.7 8.1 9.9 +CV (Effective) 9.0 2.4 8.8 2.5 9.2 3.3 11.2 7.8 9.1 9.2 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. 
+* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. +Locations +2023 — Belleville, Conway Springs, Junction City, and Salina, KS; Carrier, OK +2024 — Belleville, Conway Springs, Junction City, Palco, Pratt, and Salina, KS; Carrier, OK +2025 — Belleville, Junction City, Pratt, and Salina, KS +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-montana-2025-web.json b/corpus/agripro_trials/agt-montana-2025-web.json new file mode 100644 index 00000000..5da370fd --- /dev/null +++ b/corpus/agripro_trials/agt-montana-2025-web.json @@ -0,0 +1,32 @@ +{ + "source": "agripro_trials", + "source_key": "agt-montana-2025-web", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Montana Summary, Three-Year Data", + "filename": "Montana%202025%20web.pdf", + "region": "Montana", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Solid", + "SY Monument", + "AP Sunbird", + "SY", + "LCS Steel AX", + "LCS Julep" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-08/Montana%202025%20web.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-08/Montana%202025%20web.pdf" + ], + "page_text_chars": 1922, + "fetched_at": "2026-05-25T19:11:12.271213+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-montana-2025-web.md b/corpus/agripro_trials/agt-montana-2025-web.md new file mode 100644 
index 00000000..4f1e8ad6 --- /dev/null +++ b/corpus/agripro_trials/agt-montana-2025-web.md @@ -0,0 +1,53 @@ +# 2025 Montana Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Montana +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-08/Montana%202025%20web.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Solid, SY Monument, AP Sunbird, SY, LCS Steel AX, LCS Julep + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Montana Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Fort Benton, Conrad, Billings, Sawfly +Variety (2023-2025) (2024-2025) (2025) MT MT MT Protein Damage +Hard Red Yield Yield Yield TWT Yield Yield Yield % 1-9 +Winter Wheat Bu/A Bu/A Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP24 AX 63.4 62.1 71.8 61.3 59.5 47.0 109.0 12.2 5 +AP Solid 63.4 61.9 71.6 63.9 58.7 43.0 113.1 13.4 4 +SY Monument 62.5 62.5 72.9 61.6 58.9 42.4 117.3 12.1 5 +AP Sunbird 61.5 61.8 70.7 61.1 61.7 38.9 111.4 12.6 6 +AP18 AX 59.0 58.3 68.5 62.9 56.3 41.2 108.1 12.7 6 +SY 517 CL2 53.4 53.1 63.4 56.3 55.4 33.7 101.2 13.9 5 +Keldin 65.6 63.5 75.0 61.9 69.5 39.4 115.9 12.9 5 +Bobcat 63.5 63.2 73.2 62.5 68.5 47.8 103.4 13.1 2 +MT WarCat 61.4 59.5 69.7 62.9 58.6 42.3 108.4 13.2 3 +Warhorse 57.9 55.3 64.5 54.8 57.1 32.8 103.6 14 2 +WB4523 63.8 76.9 62.0 69.8 42.4 118.6 11.6 4 +WB4483 60.0 68.4 54.8 61.0 35.9 108.1 13.8 5 +StandClear CLP 59.2 68.2 62.6 61.2 39.7 103.8 13.2 5 +Scorpio 56.9 68.1 53.6 61.9 34.2 108.2 12.5 6 +WB4733 CLP 56.3 64.5 62.3 55.2 38.3 99.9 13.9 4 +DG Ramsay 76.9 62.0 73.4 37.5 119.9 13 5 +LCS Steel AX 75.2 62.7 62.0 44.4 119.2 12.2 5 +4739AX 73.9 62.8 67.6 43.2 110.8 13.3 4 +WB4510CLP 72.3 63.2 64.1 34.5 118.4 12.2 6 +LCS Julep 69.9 63.2 62.8 39.1 107.9 13.4 5 +Sawfly: Locations +1-2 = Excellent 2023 — 
Conrad and Fort Benton, MT +3-4 = Very Good 2024 — Billings, Chester, Conrad, and Fort Benton, MT +5 = Good 2025 — Billings, Conrad, and Fort Benton, MT +6-7 = Fair +8-9 = Poor +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-ne-colorado-2025.json b/corpus/agripro_trials/agt-ne-colorado-2025.json new file mode 100644 index 00000000..6875c558 --- /dev/null +++ b/corpus/agripro_trials/agt-ne-colorado-2025.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-ne-colorado-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Northeast Colorado Dryland Summary, Three-Year Data", + "filename": "NE%20Colorado%202025.pdf", + "region": "NE Colorado", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Sunbird", + "AP Bigfoot", + "SY Wolverine", + "AP Solid", + "AP Roadrunner", + "SY Monument", + "WB-Grainfield" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/NE%20Colorado%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/NE%20Colorado%202025.pdf" + ], + "page_text_chars": 2389, + "fetched_at": "2026-05-25T19:11:01.256427+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-ne-colorado-2025.md b/corpus/agripro_trials/agt-ne-colorado-2025.md new file mode 100644 index 00000000..0ff5f105 --- /dev/null +++ b/corpus/agripro_trials/agt-ne-colorado-2025.md @@ -0,0 +1,58 @@ +# 2025 Northeast Colorado 
Dryland Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** NE Colorado +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/NE%20Colorado%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Sunbird, AP Bigfoot, SY Wolverine, AP Solid, AP Roadrunner, SY Monument, WB-Grainfield + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Northeast Colorado Dryland Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Crook, Julesburg, Yuma, +Variety (2023-2025) (2024-2025) (2025) CO CO* CO +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A +AP Sunbird 81.5 59.5 82.7 60.0 86.5 57.9 61.6 102.9 95.1 +AP24 AX 79.2 56.8 79.8 57.1 86.7 54.9 63.7 97.9 98.5 +AP18 AX 78.5 57.2 78.7 57.4 85.0 55.2 59.5 101.2 94.3 +AP Bigfoot 76.5 59.0 76.6 58.9 79.0 56.6 60.4 86.6 90.0 +SY Wolverine 75.5 59.1 77.1 59.7 80.5 57.2 61.9 91.0 88.5 +AP Solid 73.3 59.2 73.9 59.4 76.9 56.4 62.4 86.0 82.4 +AP Roadrunner 72.3 56.8 73.6 56.7 80.1 54.6 63.9 90.5 86.0 +SY Monument 67.5 58.2 65.4 58.6 70.3 56.7 62.4 67.7 80.7 +AG Golden 78.3 56.4 79.0 56.7 86.1 54.9 71.2 93.6 93.4 +Langin 78.2 57.4 81.4 57.7 86.7 55.8 62.6 95.5 102.0 +WB4422 77.3 59.5 78.4 59.9 83.2 57.7 65.8 90.7 92.9 +KS Dallas 76.3 59.7 72.9 59.5 76.0 57.4 55.0 82.3 90.8 +WB4595 75.8 60.2 76.9 60.2 81.4 57.3 63.7 90.1 90.4 +High Country 75.4 59.2 75.1 59.1 79.2 57.0 60.4 89.6 87.7 +Amplify SF 73.7 58.8 72.1 59.0 73.8 56.5 61.6 77.7 82.2 +KS Hamilton 72.2 58.5 71.1 58.5 76.3 56.1 54.5 78.0 96.3 +TAM 115 65.5 60.4 63.8 60.2 70.5 58.3 49.8 85.0 76.6 +Kivari AX 78.1 57.7 83.3 55.9 63.3 87.1 99.5 +WB-Grainfield 86.3 57.3 59.0 99.8 100.0 +Canvas 85.1 55.9 67.4 94.4 93.7 +KS Bill Snyder 80.4 56.9 63.2 92.9 85.3 +KS Mako 
78.4 57.2 58.1 84.6 92.3 +Mean General 75.2 58.6 75.5 58.7 79.5 56.4 60.4 88.8 89.4 +LSD General (5%) EE 5.5 1.2 6.2 1.5 9.6 0.0 8.3 16.0 13.0 +CV (Effective) 8.1 1.9 8.6 2.0 9.9 2.4 8.3 11.0 8.9 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. +Locations +2023 — Julesburg and Yuma, CO; Colby, KS +2024 — Crook and Julesburg, CO; Ingalls, KS +2025 — Crook, Julesburg, and Yuma, CO +© 2024 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-plains-irrigated-2025.json b/corpus/agripro_trials/agt-plains-irrigated-2025.json new file mode 100644 index 00000000..110b7a40 --- /dev/null +++ b/corpus/agripro_trials/agt-plains-irrigated-2025.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-plains-irrigated-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Plains Irrigated Summary, Three-Year Data", + "filename": "Plains%20Irrigated%202025.pdf", + "region": "Plains Irrigated", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Wolverine", + "AP Sunbird", + "AP Prolific", + "AP Bigfoot", + "SY Grit", + "AP Roadrunner", + "SY Monument" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Plains%20Irrigated%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + 
"https://agriprowheat.com/sites/default/files/2025-07/Plains%20Irrigated%202025.pdf" + ], + "page_text_chars": 2164, + "fetched_at": "2026-05-25T19:11:03.219401+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-plains-irrigated-2025.md b/corpus/agripro_trials/agt-plains-irrigated-2025.md new file mode 100644 index 00000000..a876b9be --- /dev/null +++ b/corpus/agripro_trials/agt-plains-irrigated-2025.md @@ -0,0 +1,52 @@ +# 2025 Plains Irrigated Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Plains Irrigated +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Plains%20Irrigated%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Wolverine, AP Sunbird, AP Prolific, AP Bigfoot, SY Grit, AP Roadrunner, SY Monument + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Plains Irrigated Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Hugoton, Scott City, Imperial, Dalhart, +Variety (2023-2025) (2024-2025) (2025) KS KS NE TX +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A +SY Wolverine 95.0 57.6 94.8 58.6 94.5 59.9 99.9 102.4 95.4 80.3 +AP Sunbird 93.9 58.8 94.7 59.1 89.5 59.7 74.8 107.8 96.8 78.5 +AP Prolific 93.6 59.8 92.6 60.1 92.3 61.3 104.9 95.7 92.6 76.0 +AP Bigfoot 90.9 59.1 90.5 59.2 87.3 59.7 67.6 111.3 91.7 78.7 +SY Grit 89.2 58.1 90.9 58.7 91.8 58.9 96.4 94.8 96.9 79.0 +AP Roadrunner 88.7 56.7 89.3 57.2 81.7 57.1 57.6 108.2 83.8 77.1 +SY Monument 83.8 57.1 83.0 57.7 79.3 59.5 77.3 79.6 83.7 76.6 +WB4422 94.9 59.2 95.8 60.0 90.9 61.1 107.0 98.5 89.0 69.1 +TAM 114 91.7 60.7 93.6 61.2 89.1 61.7 77.6 107.9 92.5 78.3 +Canvas 89.6 58.0 92.2 59.2 84.5 59.4 68.1 101.9 86.1 81.7 
+WB4792 89.3 56.8 92.1 58.0 88.3 57.9 94.6 104.7 73.0 80.8 +Epoch 88.9 58.4 88.5 58.9 88.6 59.6 101.9 97.4 87.0 68.0 +Langin 88.5 58.8 90.8 59.4 84.9 60.1 62.9 99.3 91.9 85.5 +TAM 115 76.3 59.2 76.2 59.7 76.5 62.0 63.8 93.4 75.4 73.5 +WB4523 93.5 58.4 90.9 111.5 93.4 78.1 +WB4303 93.1 57.4 102.9 93.8 95.7 80.2 +KS Mako 90.1 60.7 88.9 101.2 96.8 73.3 +Mean General 88.9 57.8 90.0 58.5 87.5 59.2 85.9 99.3 87.6 77.1 +LSD General (5%) EE 8.0 1.8 9.2 1.9 13.2 2.3 18.8 9.4 8.8 8.6 +CV (Effective) 10.2 3.5 10.6 3.0 8.5 2.6 13.4 5.8 6.1 6.8 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +Locations +2023 — Ingalls, KS; Imperial, NE +2024 — Dalhart, TX; Hugoton and Ingalls, KS; Imperial, NE +2025 — Hugoton and Scott City, KS; Imperial, NE; Dalhart, TX +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. 
+``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json new file mode 100644 index 00000000..705efecb --- /dev/null +++ b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json @@ -0,0 +1,34 @@ +{ + "source": "agripro_trials", + "source_key": "agt-sc-ks-nc-ok-2024-0", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2024 South-Central Kansas, North-Central Oklahoma Summary, Three-Year Data", + "filename": "SC%20KS%20NC%20OK%202024_0.pdf", + "region": null, + "wheat_class_section": null, + "year": 2024, + "years_covered": [ + 2024 + ], + "varieties_found": [ + "SY Monument", + "AP Prolific", + "AP Roadrunner", + "SY Wolverine", + "AP Sunbird", + "AP EverRock", + "AP Bigfoot", + "LCS Atomic AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2024-07/SC%20KS%20NC%20OK%202024_0.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2024-07/SC%20KS%20NC%20OK%202024_0.pdf" + ], + "page_text_chars": 2157, + "fetched_at": "2026-05-25T19:11:08.283651+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md new file mode 100644 index 00000000..1377c89f --- /dev/null +++ b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md @@ -0,0 +1,54 @@ +# 2024 South-Central Kansas, North-Central Oklahoma Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2024 +- **PDF:** https://agriprowheat.com/sites/default/files/2024-07/SC%20KS%20NC%20OK%202024_0.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Monument, AP Prolific, AP Roadrunner, SY Wolverine, AP Sunbird, AP EverRock, AP Bigfoot, LCS Atomic AX + +--- + +## Trial data 
(verbatim from PDF) + +``` +2024 South-Central Kansas, North-Central Oklahoma Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2022-2024 +3-Yr Combined 2-Yr Combined Combined Conway Pratt, Carrier, +Variety (2022-2024) (2023-2024) (2024) Springs, KS KS OK +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP24 AX 57.6 62.7 54.1 62.4 62.6 63.7 75.2 36.5 76.2 +SY Monument 57.5 63.7 56.9 63.7 63.2 64.8 63.8 46.9 79.0 +AP Prolific 57.3 64.1 55.7 63.7 61.8 64.7 61.5 42.5 81.4 +AP Roadrunner 56.4 61.4 54.0 60.6 59.0 61.0 63.8 34.8 78.3 +SY Wolverine 55.8 64.5 52.8 64.2 58.7 65.6 56.6 44.6 75.0 +AP18 AX 55.4 63.0 52.8 62.9 62.2 64.2 70.7 37.3 78.5 +AP Sunbird 53.7 64.0 52.0 64.1 62.0 66.1 69.5 45.5 70.9 +Bob Dole 53.6 62.9 53.5 62.5 60.6 63.6 62.0 40.1 79.7 +AP EverRock 52.4 63.6 50.8 63.2 58.4 64.6 60.0 37.5 77.7 +AP Bigfoot 52.0 63.9 51.3 63.9 61.0 65.6 61.7 45.3 76.0 +WB4401 53.6 63.7 52.3 64.4 59.7 65.9 63.2 31.7 84.1 +Showdown 59.7 64.4 69.7 65.6 77.0 47.2 85.1 +Rockstar 56.5 62.4 64.2 63.1 62.9 49.3 80.4 +KS Providence 56.0 63.4 64.4 64.4 71.1 38.7 83.4 +LCS Atomic AX 54.3 64.8 62.3 66.2 65.8 42.9 78.2 +WB4523 52.9 63.7 60.5 65.0 56.2 39.9 85.4 +KS Hatchett 52.9 63.5 60.8 64.5 65.8 39.5 77.0 +WB4422 61.3 64.6 63.5 43.5 77.0 +KS Mako 59.0 65.5 58.7 49.9 68.3 +Mean General 55.0 63.4 53.8 63.5 59.7 64.0 62.2 41.2 75.7 +LSD General (5%) EE NS 1.4 NS 1.9 8.5 3.7 7.7 8.6 9.3 +CV (Effective) 7.9 1.7 8.7 2.0 8.8 2.2 7.6 12.8 7.5 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +NS = Non Significant +Locations +2022 — Conway Springs and Partridge, KS; Carrier, OK +2023 — Conway Springs, KS; Carrier, OK +2024 — Conway Springs and Pratt, KS; Carrier, OK +© 2024 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. 
Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-south-dakota-2025.json b/corpus/agripro_trials/agt-south-dakota-2025.json new file mode 100644 index 00000000..49a338f4 --- /dev/null +++ b/corpus/agripro_trials/agt-south-dakota-2025.json @@ -0,0 +1,36 @@ +{ + "source": "agripro_trials", + "source_key": "agt-south-dakota-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 South Dakota Winter Wheat Summary, Three-Year Data", + "filename": "South%20Dakota%202025.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Prolific", + "AP Sunbird", + "AP Bigfoot", + "SY", + "SY Wolverine", + "SY WOLF", + "SY Monument", + "AP Clair", + "AP Solid", + "LCS Helix AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-08/South%20Dakota%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-08/South%20Dakota%202025.pdf" + ], + "page_text_chars": 1916, + "fetched_at": "2026-05-25T19:11:14.252920+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-south-dakota-2025.md b/corpus/agripro_trials/agt-south-dakota-2025.md new file mode 100644 index 00000000..fcf47211 --- /dev/null +++ b/corpus/agripro_trials/agt-south-dakota-2025.md @@ -0,0 +1,52 @@ +# 2025 South Dakota Winter Wheat Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** 
https://agriprowheat.com/sites/default/files/2025-08/South%20Dakota%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Prolific, AP Sunbird, AP Bigfoot, SY, SY Wolverine, SY WOLF, SY Monument, AP Clair, AP Solid, LCS Helix AX + +--- + +## Trial data (verbatim from PDF) + +``` +2025 South Dakota Winter Wheat Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Agar, Presho, +Variety (2023-2025) (2024-2025) (2025) SD SD +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A +AP18 AX 57.3 59.3 64.7 58.6 57.2 54.2 52.5 61.9 +AP24 AX 56.8 59.0 64.2 57.8 61.2 53.9 59.9 62.5 +AP Prolific 56.1 59.5 62.9 58.8 58.6 54.6 56.1 61.1 +AP Sunbird 55.9 59.5 58.5 59.1 58.4 56.1 56.8 59.9 +AP Bigfoot 55.6 59.6 62.9 58.9 64.1 55.9 62.2 66.1 +SY 517 CL2 52.5 60.8 56.6 60.3 59.9 57.8 60.0 59.7 +SY Wolverine 52.3 58.5 56.9 58.5 53.5 55.4 47.5 59.5 +SY WOLF 51.4 59.1 55.5 57.9 53.5 53.7 49.4 57.6 +SY Monument 50.1 57.3 56.1 55.9 56.6 51.9 54.7 58.6 +AP Clair 49.2 59.7 57.8 59.5 52.9 56.2 50.6 55.1 +AP Solid 54.7 56.8 47.8 61.7 +WB4422 58.2 59.9 63.8 58.4 65.3 55.5 63.5 67.1 +LCS Helix AX 57.2 61.1 66.0 60.6 65.2 57.9 65.0 65.3 +Winner 56.5 60.5 66.7 59.7 65.9 57.0 68.7 63.1 +SD Andes 55.7 60.6 60.9 59.8 54.2 55.8 50.3 58.1 +SD Midland 54.6 59.8 59.4 58.7 54.4 54.1 49.7 59.0 +Kivari AX 54.3 56.8 50.7 53.2 44.5 56.9 +SD Pheasant 54.1 56.4 49.7 58.5 +Mean General 54.6 59.6 60.4 58.5 57.7 55.2 55.2 60.3 +LSD General (5%) EE 5.4 1.2 7.7 1.5 7.4 1.8 13.0 6.1 +CV (Effective) 10.3 1.8 8.7 1.9 11.2 2.0 14.3 6.2 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +Locations +2023 — Hayes and Ideal, SD +2024 — Hayes and Ideal, SD +2025 — Agar and Presho, SD +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. 
Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-southern-idaho-2025.json b/corpus/agripro_trials/agt-southern-idaho-2025.json new file mode 100644 index 00000000..940e2300 --- /dev/null +++ b/corpus/agripro_trials/agt-southern-idaho-2025.json @@ -0,0 +1,40 @@ +{ + "source": "agripro_trials", + "source_key": "agt-southern-idaho-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Southern Idaho Summary, Three-Year Data", + "filename": "Southern%20Idaho%202025.pdf", + "region": "Southern Idaho", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Exceed", + "AP Olympia", + "SY Dayton", + "SY Assure", + "SY Ovation", + "AP Iliad", + "SY Raptor", + "LCS Shine", + "LCS Hulk", + "LCS Artdeco", + "Norwest Duet", + "Norwest Tandem", + "LCS Jefe", + "LCS Kamiack" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-09/Southern%20Idaho%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-09/Southern%20Idaho%202025.pdf" + ], + "page_text_chars": 2095, + "fetched_at": "2026-05-25T19:11:06.237182+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-southern-idaho-2025.md b/corpus/agripro_trials/agt-southern-idaho-2025.md new file mode 100644 index 00000000..497c60d0 --- /dev/null +++ b/corpus/agripro_trials/agt-southern-idaho-2025.md @@ -0,0 +1,49 @@ +# 2025 Southern Idaho Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** 
trial +- **Region:** Southern Idaho +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-09/Southern%20Idaho%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Exceed, AP Olympia, SY Dayton, SY Assure, SY Ovation, AP Iliad, SY Raptor, LCS Shine, LCS Hulk, LCS Artdeco, Norwest Duet, Norwest Tandem, LCS Jefe, LCS Kamiack + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Southern Idaho Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Blackfoot, Nampa, Rupert, Twin Falls, +Variety (2023-2025) (2024-2025) (2025) ID ID ID ID +Soft White Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield +Winter Wheat Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A +AP Exceed 166.9 62.2 165.3 62.6 177.8 62.1 179.0 176.4 141.9 213.7 +AP Olympia 164.9 61.0 163.8 61.4 181.2 60.8 183.6 163.9 156.7 220.5 +SY Dayton 161.5 61.2 160.4 61.4 174.9 60.5 176.4 177.4 146.4 199.5 +SY Assure 161.5 62.0 158.7 61.9 174.7 61.5 180.2 165.2 155.6 197.7 +SY Ovation 160.2 61.1 155.7 60.8 169.2 59.9 178.2 175.0 121.7 202.0 +AP Iliad 160.1 61.4 157.0 61.6 167.9 60.7 177.6 175.6 108.9 209.4 +SY Raptor 153.1 59.6 150.9 59.7 160.7 58.9 175.5 165.3 130.5 171.5 +LCS Shine 164.0 61.3 164.6 61.5 170.7 61.0 165.9 160.6 166.9 189.5 +LCS Hulk 161.7 62.6 159.9 62.8 171.1 62.8 164.2 181.5 146.9 192.0 +LCS Artdeco 157.6 61.5 156.5 61.5 164.2 60.9 164.6 182.7 131.1 178.3 +Norwest Duet 154.8 61.2 155.1 61.5 166.2 61.1 170.4 159.8 139.6 194.9 +Norwest Tandem 152.2 60.8 149.6 61.2 155.0 60.3 151.6 152.2 121.8 194.2 +LCS Jefe 165.5 62.6 180.8 61.8 174.2 160.8 169.7 218.5 +LCS Kamiack 159.8 61.9 174.9 61.5 169.3 176.8 143.5 209.9 +Mean General 160.9 61.4 159.4 61.6 170.9 61.0 171.0 169.4 141.4 201.8 +LSD General (5%) EE ns 1.1 ns 1.1 14.1 1.3 12.5 ns ns ns +CV (Effective) 6.9 2.1 7.2 1.8 8.4 1.7 3.5 5.5 14.5 7.6 +Boldfaced numbers are within confidence interval at 
specific locations and combined years of yield data. +Locations +2023 — Twin Falls, ID +2024 — Blackfoot and Twin Falls, ID +2025 — Blackfoot, Nampa, Rupert, and Twin Falls, ID +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-washington-n-idaho-2025.json b/corpus/agripro_trials/agt-washington-n-idaho-2025.json new file mode 100644 index 00000000..e7f4eeb7 --- /dev/null +++ b/corpus/agripro_trials/agt-washington-n-idaho-2025.json @@ -0,0 +1,40 @@ +{ + "source": "agripro_trials", + "source_key": "agt-washington-n-idaho-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Washington/Northern Idaho Summary, Three-Year Data", + "filename": "Washington%3AN%20Idaho%202025.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Raptor", + "AP Olympia", + "AP Exceed", + "SY Ovation", + "SY Dayton", + "AP Iliad", + "SY Assure", + "Norwest Duet", + "LCS Artdeco", + "LCS Shine", + "LCS Hulk", + "Norwest Tandem", + "LCS Kamiack", + "LCS Jefe" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-08/Washington%3AN%20Idaho%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-08/Washington%3AN%20Idaho%202025.pdf" + ], + "page_text_chars": 1892, + "fetched_at": "2026-05-25T19:11:05.110770+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-washington-n-idaho-2025.md b/corpus/agripro_trials/agt-washington-n-idaho-2025.md 
new file mode 100644 index 00000000..48bc4020 --- /dev/null +++ b/corpus/agripro_trials/agt-washington-n-idaho-2025.md @@ -0,0 +1,48 @@ +# 2025 Washington/Northern Idaho Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-08/Washington%3AN%20Idaho%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Raptor, AP Olympia, AP Exceed, SY Ovation, SY Dayton, AP Iliad, SY Assure, Norwest Duet, LCS Artdeco, LCS Shine, LCS Hulk, Norwest Tandem, LCS Kamiack, LCS Jefe + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Washington/Northern Idaho Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Genesee, Walla Walla, +Variety (2023-2025) (2024-2025) (2025) ID WA +Soft White Yield TWT Yield TWT Yield TWT Yield Yield +Winter Wheat Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A +SY Raptor 146.1 63.0 157.6 64.8 156.9 65.0 157.4 156.3 +AP Olympia 142.2 64.2 154.3 66.1 151.1 65.8 159.6 142.5 +AP Exceed 141.8 63.6 155.3 65.4 152.6 65.3 160.0 145.2 +SY Ovation 140.8 63.3 148.7 65.4 147.4 65.7 154.9 139.8 +SY Dayton 140.0 63.6 158.4 65.6 157.7 65.8 172.3 143.1 +AP Iliad 139.8 63.4 151.0 65.3 147.0 65.2 149.4 144.6 +SY Assure 138.7 64.3 152.0 66.3 150.9 66.0 149.6 152.1 +Norwest Duet 144.1 62.3 157.4 64.7 155.3 65.0 157.5 153.1 +LCS Artdeco 140.3 62.6 155.5 64.4 157.5 64.8 160.9 154.0 +LCS Shine 139.3 63.1 148.6 64.7 150.7 65.2 146.1 155.4 +LCS Hulk 138.4 63.9 152.4 65.8 156.6 65.9 167.2 146.0 +Norwest Tandem 137.0 62.7 152.9 65.0 157.3 65.3 166.1 148.5 +LCS Kamiack 165.0 66.5 174.3 66.2 188.8 159.7 +LCS Jefe 160.1 65.2 167.4 66.0 178.9 155.9 +Mean General 142.5 63.3 156.7 65.3 157.3 65.5 162.0 152.7 +LSD General (5%) EE 8.6 1.0 ns 1.4 ns ns ns ns +CV (Effective) 6.1 1.6 5.4 
1.1 7.5 1.1 8.4 6.5 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +Locations +2023 — Craigmont and Genesee, ID; Moses Lake and Walla Walla, WA +2024 — Craigmont, ID; Walla Walla, WA +2025 — Genesee, ID; Walla Walla, WA +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-western-plains-dryland-2025-0.json b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.json new file mode 100644 index 00000000..a5010b2b --- /dev/null +++ b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-western-plains-dryland-2025-0", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Western Plains Dryland Summary, Three-Year Data", + "filename": "Western%20Plains%20Dryland%202025_0.pdf", + "region": "Western Plains", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Sunbird", + "AP Bigfoot", + "SY Wolverine", + "AP Roadrunner", + "AP Solid", + "SY Monument", + "WB-Grainfield" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Western%20Plains%20Dryland%202025_0.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/Western%20Plains%20Dryland%202025_0.pdf" + ], + "page_text_chars": 2395, + "fetched_at": "2026-05-25T19:11:02.169343+00:00", + "scraper_version": "0.1.0" +} diff --git 
a/corpus/agripro_trials/agt-western-plains-dryland-2025-0.md b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.md new file mode 100644 index 00000000..d43c029a --- /dev/null +++ b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.md @@ -0,0 +1,58 @@ +# 2025 Western Plains Dryland Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Western Plains +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Western%20Plains%20Dryland%202025_0.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Sunbird, AP Bigfoot, SY Wolverine, AP Roadrunner, AP Solid, SY Monument, WB-Grainfield + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Western Plains Dryland Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Crook, Julesburg, Yuma, +Variety (2023-2025) (2024-2025) (2025) CO CO* CO +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP18 AX 74.2 57.6 75.8 57.8 85.0 55.2 59.5 101.2 94.3 +AP Sunbird 74.1 59.5 77.2 59.8 86.5 57.9 61.6 102.9 95.1 +AP24 AX 73.2 56.7 74.0 56.8 86.7 54.9 63.7 97.9 98.5 +AP Bigfoot 71.2 58.9 73.0 58.9 79.0 56.6 60.4 86.6 90.0 +SY Wolverine 68.9 59.1 71.4 59.5 80.5 57.2 61.9 91.0 88.5 +AP Roadrunner 67.6 56.0 70.3 55.7 80.1 54.6 63.9 90.5 86.0 +AP Solid 66.4 58.4 67.7 58.4 76.9 56.4 62.4 86.0 82.4 +SY Monument 61.7 57.8 60.1 57.9 70.3 56.7 62.4 67.7 80.7 +Langin 72.9 57.3 77.3 57.6 86.7 55.8 62.6 95.5 102.0 +WB4422 70.6 59.1 73.8 59.4 83.2 57.7 65.8 90.7 92.9 +AG Golden 70.3 55.5 72.3 55.5 86.1 54.9 71.2 93.6 93.4 +High Country 69.4 59.1 72.0 58.9 79.2 57.0 60.4 89.6 87.7 +WB4595 69.2 59.8 72.2 59.6 81.4 57.3 63.7 90.1 90.4 +KS Dallas 68.3 59.1 66.5 58.7 76.0 57.4 55.0 82.3 90.8 +Amplify SF 66.8 57.9 66.4 
57.8 73.8 56.5 61.6 77.7 82.2 +KS Hamilton 66.6 58.3 68.5 58.3 76.3 56.1 54.5 78.0 96.3 +TAM 115 60.7 60.1 60.1 59.8 70.5 58.3 49.8 85.0 76.6 +Kivari AX 73.7 57.0 83.3 55.9 63.3 87.1 99.5 +WB-Grainfield 86.3 57.3 59.0 99.8 100.0 +Canvas 85.1 55.9 67.4 94.4 93.7 +KS Bill Snyder 80.4 56.9 63.2 92.9 85.3 +KS Mako 78.4 57.2 58.1 84.6 92.3 +Mean General 69.1 58.3 71.0 58.2 79.5 56.4 60.4 88.8 89.4 +LSD General (5%) EE 5.1 1.4 6.2 1.7 9.6 ns 8.3 16.0 13.0 +CV (Effective) 9.0 2.4 9.4 2.5 9.9 2.4 8.3 11.0 8.9 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. +Locations +2023 — Julesburg and Yuma, CO; Colby, KS +2024 — Crook and Julesburg, CO; Ingalls, KS +2025 — Crook, Julesburg, and Yuma, CO +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. 
+``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-wheat-after-soy-2025.json b/corpus/agripro_trials/agt-wheat-after-soy-2025.json new file mode 100644 index 00000000..ffb143b3 --- /dev/null +++ b/corpus/agripro_trials/agt-wheat-after-soy-2025.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-wheat-after-soy-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Wheat Following Soybeans Summary, Three-Year Data", + "filename": "Wheat%20after%20Soy%202025.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Sunbird", + "AP Roadrunner", + "SY Wolverine", + "AP Bigfoot", + "AP Prolific", + "SY Monument", + "LCS Atomic AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Wheat%20after%20Soy%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/Wheat%20after%20Soy%202025.pdf" + ], + "page_text_chars": 2367, + "fetched_at": "2026-05-25T19:11:07.269403+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-wheat-after-soy-2025.md b/corpus/agripro_trials/agt-wheat-after-soy-2025.md new file mode 100644 index 00000000..b7da4eb3 --- /dev/null +++ b/corpus/agripro_trials/agt-wheat-after-soy-2025.md @@ -0,0 +1,55 @@ +# 2025 Wheat Following Soybeans Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Wheat%20after%20Soy%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Sunbird, AP Roadrunner, SY Wolverine, AP Bigfoot, AP Prolific, SY Monument, LCS Atomic AX + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Wheat Following 
Soybeans Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Belleville, Junction City, Salina, +Variety (2023-2025) (2024-2025) (2025) KS* KS KS +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP24 AX 66.1 58.8 76.6 59.0 78.7 55.3 64.8 94.7 76.7 +AP Sunbird 62.3 61.0 70.7 60.9 70.7 57.7 59.0 86.7 66.5 +AP Roadrunner 62.2 57.5 68.5 56.9 64.1 53.4 54.1 79.7 58.5 +SY Wolverine 61.0 60.3 66.9 60.1 65.4 56.5 58.2 86.2 51.9 +AP Bigfoot 60.3 61.3 67.5 61.2 68.6 58.7 53.1 86.6 65.9 +AP Prolific 59.2 60.0 66.3 60.0 67.2 57.9 46.0 86.2 69.2 +SY Monument 58.4 59.3 63.9 59.8 59.5 56.1 48.2 71.6 58.6 +Bob Dole 56.7 58.5 63.0 58.1 60.0 54.6 47.0 68.3 64.6 +Showdown 62.8 59.6 71.9 59.6 68.7 55.6 56.4 83.0 66.6 +KS Providence 61.9 60.4 70.0 60.4 67.0 56.8 52.5 88.9 59.7 +Rockstar 61.0 58.1 70.0 58.5 68.7 55.6 54.0 83.8 68.3 +WB4401 59.5 59.5 66.7 59.2 63.0 55.1 51.7 78.8 58.5 +LCS Atomic AX 59.1 61.3 67.7 61.2 70.2 58.7 52.3 88.0 70.1 +WB4523 57.4 58.9 63.9 59.0 56.7 55.5 40.6 68.2 61.2 +WB4422 72.5 58.6 73.5 55.2 62.5 87.3 70.6 +KS Bill Snyder 78.4 58.8 59.3 94.6 81.2 +WB4699 66.4 55.2 51.0 84.2 64.1 +Polansky Goldenhawk 65.3 56.1 52.7 86.5 56.8 +High Cotton 65.3 57.4 43.8 90.9 61.2 +Doublestop CLP 63.8 56.9 53.9 74.0 63.4 +Mean General 60.6 59.7 68.4 59.5 66.9 56.3 51.7 83.6 65.4 +LSD General (5%) EE 4.6 1.4 5.6 1.7 9.2 0.0 9.5 10.7 9.9 +CV (Effective) 8.6 2.5 8.3 2.7 9.2 3.8 11.2 7.8 9.2 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. 
+Locations +2023 — Belleville, Conway Springs, Junction City, and Salina, KS +2024 — Belleville, Conway Springs, Junction City, and Salina, KS +2025 — Belleville, Conway Springs, Junction City, and Salina, KS +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/docs_mcp/server.py b/docs_mcp/server.py index e33eb7e8..8162c9ef 100644 --- a/docs_mcp/server.py +++ b/docs_mcp/server.py @@ -201,9 +201,15 @@ def _build_where( vendor: str | None, source: str | None, source_key: str | None, + *, + data_type: str | None = None, + state: str | None = None, + year: int | None = None, ) -> dict | None: """Translate filter args into a Chroma `where` clause.""" conds: list[dict] = [] + if data_type: + conds.append({"data_type": data_type}) if crop: conds.append({"crop": crop.lower()}) if brand: @@ -214,6 +220,10 @@ def _build_where( conds.append({"source": source}) if source_key: conds.append({"source_key": source_key}) + if state: + conds.append({"state": state.upper() if len(state) <= 3 else state}) + if year: + conds.append({"year": int(year)}) if not conds: return None if len(conds) == 1: @@ -460,7 +470,11 @@ def search_docs( "query": query, "crop": crop, "brand": brand, "vendor": vendor, "source": source, "k": k, }) as _call: - where = _build_where(crop, brand, vendor, source, None) + # Variety-search default: filter to data_type=variety so trial + # documents (yield trials) don't pollute identity-focused + # results. To search trials, use search_trials(). 
def _fetch_missing_trial_docs(
    col,
    ids: list[str],
    id_to_doc: dict[str, str],
    id_to_meta: dict[str, dict],
) -> None:
    """Load documents/metadata for ``ids`` that are not in ``id_to_doc`` yet.

    Dense hits arrive with their documents; ids that only the BM25 side
    returned do not. This fills both maps in place with one ``col.get``.
    Failures are logged and swallowed (best effort) — callers treat an
    id that is still missing afterwards as having an empty document.
    """
    missing = [cid for cid in ids if cid not in id_to_doc]
    if not missing:
        return
    try:
        extra = col.get(ids=missing, include=["documents", "metadatas"])
        for cid, doc, meta in zip(
            extra.get("ids") or [],
            extra.get("documents") or [],
            extra.get("metadatas") or [],
        ):
            id_to_doc[cid] = doc
            id_to_meta[cid] = meta
    except Exception as exc:  # noqa: BLE001
        log.warning("get-by-id for BM25-only hits failed: %s", exc)


@mcp.tool()
def search_trials(
    query: Annotated[str, Field(description=(
        "Natural-language query about yield trials. Mention crop, "
        "region or state, year, soil/conditions, and any specific "
        "variety codes you want compared. Examples: "
        "'best corn hybrid 2024 Iowa heavy clay'; "
        "'AP Iliad yield Idaho stripe rust'; "
        "'DKC65-20 vs NK1748 head to head Alabama 2023'."
    ))],
    crop: Annotated[
        str | None,
        Field(description="OPTIONAL: corn, soybeans, silage, or wheat."),
    ] = None,
    state: Annotated[
        str | None,
        Field(description=(
            "OPTIONAL state filter. 2-letter abbrev (IA, IL, NE...) "
            "for Golden Harvest plot reports; full or partial region "
            "name (e.g. 'Pacific Northwest', 'Montana') for AgriPro "
            "trial PDFs."
        )),
    ] = None,
    year: Annotated[
        int | None,
        Field(description="OPTIONAL year filter (e.g. 2024).", ge=2010, le=2030),
    ] = None,
    product: Annotated[
        str | None,
        Field(description=(
            "OPTIONAL variety/hybrid filter — substring match against "
            "the product field. Example: 'DKC62' surfaces trials "
            "containing any DKC62-* hybrid."
        )),
    ] = None,
    k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10,
) -> str:
    """Search yield-trial data — head-to-head results from real field
    trials. SEPARATE from variety-identity search.

    Use this when the user wants to know HOW PRODUCTS PERFORMED, not
    what they ARE. Trial data complements `search_docs`:

    * `search_docs` answers: "What's the disease resistance profile
      of DKC62-08RIB?" (variety identity)
    * `search_trials` answers: "Which corn hybrid actually won the
      yield trials in central Iowa in 2024?" (performance data)

    Data sources:

    * **Golden Harvest plot reports** (4,000+ trials) — per-site
      head-to-head comparing products from MULTIPLE BRANDS at one
      cooperator's field. NK, DEKALB, Golden Harvest, sometimes
      others all compete at the same site. Cross-vendor data Bayer
      itself doesn't publish.
    * **AgriPro regional trial PDFs** (~14 PDFs) — multi-year
      multi-location wheat performance for Northern Plains / PNW /
      Plains regions.

    A typical workflow: call this to identify top performers in a
    region/year, then call `lookup_variety(source_key=...)` on the
    leaders to verify identity details (RM, traits, disease ratings).
    """
    with TimedCall("search_trials", {
        "query": query, "crop": crop, "state": state, "year": year,
        "product": product, "k": k,
    }) as _call:
        # NOTE(review): `state` is forwarded as a metadata condition on the
        # `state` key. AgriPro trial chunks appear to carry `region` rather
        # than `state`, so the region names advertised in the parameter
        # description may match nothing — confirm against the indexer.
        where = _build_where(
            crop, None, None, None, None,
            data_type="trial",
            state=state,
            year=year,
        )
        pool_size = max(k * 3, RERANK_POOL)

        try:
            col = _collection()
        except Exception as exc:  # noqa: BLE001
            _call.set(error_dense=str(exc), hits_returned=0)
            return (
                "_(retrieval unavailable — Chroma collection not found. "
                "Has the indexer run? `python -m rag.index --rebuild`.)_"
            )

        # Append the product code to the query so both the dense and the
        # BM25 retriever get lexical signal for it.
        full_query = f"{query} {product}" if product else query

        try:
            dense = col.query(
                query_texts=[full_query],
                n_results=pool_size,
                where=where,
            )
        except Exception as exc:  # noqa: BLE001
            _call.set(error_dense=str(exc), hits_returned=0)
            return f"_(trial retrieval failed: {exc})_"

        dense_ids: list[str] = (dense.get("ids") or [[]])[0]
        dense_docs: list[str] = (dense.get("documents") or [[]])[0]
        dense_metas: list[dict] = (dense.get("metadatas") or [[]])[0]
        dense_dists: list[float] = (dense.get("distances") or [[]])[0]

        id_to_doc = dict(zip(dense_ids, dense_docs))
        id_to_meta = dict(zip(dense_ids, dense_metas))
        id_to_dist = dict(zip(dense_ids, dense_dists))

        # Candidate order: fused dense + BM25 ranking when hybrid search is
        # on and BM25 returned something, otherwise the dense ranking.
        fuzzy_ids = dense_ids
        used_hybrid = False
        if HYBRID_SEARCH:
            bm25 = _bm25_index()
            if bm25 is not None:
                bm25_hits = bm25.query(full_query, n=pool_size, where=where)
                bm25_ids = [h[0] for h in bm25_hits]
                if bm25_ids:
                    fuzzy_ids = _rrf_fuse([dense_ids, bm25_ids])
                    used_hybrid = True

        # De-duplicate while keeping rank order.
        candidates = list(dict.fromkeys(fuzzy_ids))

        if product:
            # The product filter is a case-insensitive substring check on
            # the chunk text. Hydrate BM25-only candidates BEFORE filtering
            # and truncating: filtering after the cut to k (as before)
            # returned fewer than k hits whenever unfetched candidates in
            # the top k turned out not to contain the product.
            needle = product.lower()
            _fetch_missing_trial_docs(col, candidates, id_to_doc, id_to_meta)
            candidates = [
                cid for cid in candidates
                if needle in (id_to_doc.get(cid) or "").lower()
            ]

        final_ids = candidates[:k]
        # Without a product filter only the returned hits need hydrating.
        _fetch_missing_trial_docs(col, final_ids, id_to_doc, id_to_meta)

        _call.set(
            hits_returned=len(final_ids),
            hybrid=used_hybrid,
            pool_size=pool_size,
            data_type="trial",
        )

        if not final_ids:
            return (
                "_(no trials matched. Try widening — drop the state, "
                "year, or product filter. `list_versions()` shows "
                "which trial sources are indexed.)_"
            )

        # Distances are only shown for pure dense results; after rank
        # fusion the dense distance no longer reflects the ordering.
        blocks = [
            _format_trial_hit(
                id_to_doc.get(cid) or "",
                id_to_meta.get(cid) or {},
                None if used_hybrid else id_to_dist.get(cid),
            )
            for cid in final_ids
        ]

        header = (
            f"# Trial search results — {len(final_ids)} trial document"
            f"{'s' if len(final_ids) != 1 else ''}"
            f"{' (dense + BM25 hybrid)' if used_hybrid else ' (dense only)'}\n"
            f"_Use `get_page(source=..., source_key=...)` to read the "
            f"full trial body. Use `lookup_variety(source_key=...)` on "
            f"any product code to verify its identity (RM, traits, "
            f"disease ratings)._\n\n---\n\n"
        )
        return header + "\n---\n\n".join(blocks)


def _format_trial_hit(doc: str, meta: dict, distance: float | None = None) -> str:
    """Render one trial hit as a markdown block.

    The heading is built from crop, region (falling back to state) and
    year, or from the source key when none of those are present. Below
    it come the ``source::source_key`` handle, vendor / brand and the
    source URL, then the chunk text.

    Args:
        doc: chunk text of the hit.
        meta: chunk metadata; missing keys render as empty strings.
        distance: dense distance to display, or ``None`` to omit it.
    """
    src_url = meta.get("source_url") or ""
    src_key = meta.get("source_key") or ""
    src = meta.get("source") or ""
    crop = meta.get("crop") or ""
    state = meta.get("state") or ""
    year = meta.get("year") or ""
    region = meta.get("region") or ""

    title_bits = [b for b in [crop.title(), region or state, str(year) if year else ""] if b]
    title = " · ".join(title_bits) if title_bits else src_key

    # Two trailing spaces before "\n" are a Markdown hard line break.
    header = (
        f"### Trial: {title}  \n"
        f"`{src}::{src_key}` — {meta.get('vendor', '')} / {meta.get('brand', '')}  \n"
        f"<{src_url}>"
    )
    if distance is not None:
        header += f"  \n_(distance={distance:.4f})_"
    return f"{header}\n\n{(doc or '').strip()}\n"
FTS5's parser chokes on @@ -131,8 +136,9 @@ class BM25Index: con.executescript(self._schema_sql()) con.executemany( "INSERT INTO chunks_meta " - "(id, source, vendor, brand, crop, source_key, ordinal) " - "VALUES (?, ?, ?, ?, ?, ?, ?)", + "(id, source, vendor, brand, crop, source_key, " + " data_type, year, state, ordinal) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", [ ( r["id"], @@ -141,6 +147,9 @@ class BM25Index: r["metadata"].get("brand") or "", r["metadata"].get("crop") or "", r["metadata"].get("source_key") or "", + r["metadata"].get("data_type") or "variety", + int(r["metadata"]["year"]) if isinstance(r["metadata"].get("year"), int) else None, + r["metadata"].get("state") or "", int(r["metadata"].get("ordinal") or 0), ) for r in records @@ -216,12 +225,18 @@ class BM25Index: brand TEXT, crop TEXT, source_key TEXT, + data_type TEXT, + year INTEGER, + state TEXT, ordinal INTEGER ); CREATE INDEX idx_meta_source ON chunks_meta(source); CREATE INDEX idx_meta_crop ON chunks_meta(crop); CREATE INDEX idx_meta_brand ON chunks_meta(brand); CREATE INDEX idx_meta_source_key ON chunks_meta(source_key); + CREATE INDEX idx_meta_data_type ON chunks_meta(data_type); + CREATE INDEX idx_meta_year ON chunks_meta(year); + CREATE INDEX idx_meta_state ON chunks_meta(state); CREATE VIRTUAL TABLE chunks_fts USING fts5( text, diff --git a/rag/chunk.py b/rag/chunk.py index ad04240a..9f01d793 100644 --- a/rag/chunk.py +++ b/rag/chunk.py @@ -253,6 +253,7 @@ def _flat_metadata(sidecar: dict) -> dict: md: dict = { "source": sidecar.get("source") or "", "source_key": sidecar.get("source_key") or "", + "data_type": "variety", "vendor": sidecar.get("vendor") or "", "brand": (sidecar.get("brand") or "").upper(), "crop": (sidecar.get("crop") or "").lower(), @@ -304,6 +305,258 @@ def chunks_from_variety( } +# =========================================================================== +# Trial chunker — for sidecars with data_type="trial" +# 
# ===========================================================================
# Trial chunker — for sidecars with data_type="trial"
# ===========================================================================
#
# Trial documents are a different shape from variety identity:
#   - GH plot reports: per-site head-to-head yield comparison across brands
#   - AgriPro trial PDFs: regional multi-year multi-location summary
#
# Both produce ONE chunk per document with a preamble that emphasizes
# the trial's location/year/top performers so the embedder gets clean
# signal for queries like "best corn for sandy soil Iowa 2024".

# Metric labels tried first when picking the ranking metric: corn/soy
# reports use "Yield", silage reports use a tons-per-acre column.
_PRIMARY_METRIC_KEYS = ("Yield", "Ton/Acre", "Tons/Acre")

# Column order synthesized for old sidecars that have no ``metrics`` dict.
_LEGACY_METRIC_KEYS = ["Yield", "%MST", "Test Weight", "Gross Revenue"]


def _primary_metric(result: dict) -> tuple[str, float | None]:
    """Return ``(label, value)`` of the metric a result row is ranked by.

    Preference order: the legacy ``yield_bu_ac`` field when the row has
    no ``metrics`` dict, then the well-known labels, then the first
    numeric metric of any name. ``("", None)`` when nothing is numeric.
    """
    metrics = result.get("metrics") or {}
    if not metrics and result.get("yield_bu_ac") is not None:
        return ("Yield", result["yield_bu_ac"])
    for key in _PRIMARY_METRIC_KEYS:
        value = metrics.get(key)
        if isinstance(value, (int, float)):
            return (key, value)
    for key, value in metrics.items():
        if isinstance(value, (int, float)):
            return (key, value)
    return ("", None)


def _legacy_metrics(result: dict) -> dict:
    """Map the flat legacy result fields onto the metric-column labels."""
    return {
        "Yield": result.get("yield_bu_ac"),
        "%MST": result.get("mst_pct"),
        "Test Weight": result.get("test_weight"),
        "Gross Revenue": result.get("gross_revenue_dol_ac"),
    }


def _format_metric_cell(key: str, value) -> str:
    """Format one ranking-table cell: ``-`` for missing, ``$x.xx`` for
    numeric revenue/dollar columns, ``str(value)`` otherwise."""
    if value is None:
        return "-"
    if isinstance(value, (int, float)) and ("Revenue" in key or "$" in key):
        return f"${value:.2f}"
    return str(value)


def _render_gh_plot_chunk(sidecar: dict) -> str:
    """Render a Golden Harvest plot report (per-site cross-vendor).

    Output is a heading, one sentence of site facts, a "Top N" summary
    line (first five result rows, in sidecar order) and a full ranking
    table with every metric column, followed by the first source URL.
    """
    lines: list[str] = []
    crop = (sidecar.get("crop") or "").lower()
    crop_label = {"corn": "Corn", "soybeans": "Soybean", "silage": "Silage"}.get(crop, crop.title())
    state = sidecar.get("state") or sidecar.get("state_abbrev") or ""
    year = sidecar.get("year") or ""
    cooperator = sidecar.get("cooperator") or ""

    lines.append(f"# {crop_label} yield trial — {state}, {year}")
    lines.append("")

    facts = ["Golden Harvest plot report (cross-vendor)"]
    if cooperator:
        facts.append(f"cooperator {cooperator}")
    if sidecar.get("planted_date"):
        facts.append(f"planted {sidecar['planted_date']}")
    if sidecar.get("harvested_date"):
        facts.append(f"harvested {sidecar['harvested_date']}")
    population = sidecar.get("population_seeds_per_acre")
    if population:
        # The "," thousands separator is only valid for numbers; a value
        # scraped as text would raise ValueError, so print it as-is.
        if isinstance(population, (int, float)):
            facts.append(f"population {population:,} seeds/acre")
        else:
            facts.append(f"population {population} seeds/acre")
    if sidecar.get("row_width_in"):
        facts.append(f"{sidecar['row_width_in']}\" rows")
    lines.append(". ".join(facts) + ".")
    lines.append("")

    results = sidecar.get("results") or []
    if results:
        # Summary line: the leading rows with their primary metric.
        top = results[:5]
        top_parts: list[str] = []
        for i, r in enumerate(top):
            label, val = _primary_metric(r)
            piece = f"#{r.get('rank') or i+1} {r.get('brand','?')} {r.get('product','?')}"
            if r.get("traits"):
                piece += f" {r['traits']}"
            if val is not None:
                piece += f" — {val} {label}"
            top_parts.append(piece)
        if top_parts:
            lines.append(
                f"Top {len(top)} ({crop_label}, {state} {year}): "
                + ", ".join(top_parts) + "."
            )
            lines.append("")

        # Column order comes from the first row that has a metrics dict;
        # old sidecars with only flat fields get the legacy columns.
        metric_keys: list[str] = []
        for r in results:
            metrics = r.get("metrics") or {}
            if metrics:
                metric_keys = list(metrics.keys())
                break
        if not metric_keys and any(
            r.get("yield_bu_ac") is not None for r in results
        ):
            metric_keys = list(_LEGACY_METRIC_KEYS)

        # Full ranking — preserves every datapoint verbatim.
        col_headers = ["rank", "brand", "product", "traits"] + metric_keys
        lines.append("Full ranking (" + " | ".join(col_headers) + "):")
        for r in results:
            metrics = r.get("metrics") or _legacy_metrics(r)
            row = [
                f"#{r.get('rank') or '-'}",
                r.get("brand") or "-",
                r.get("product") or "-",
                r.get("traits") or "-",
            ]
            row.extend(_format_metric_cell(k, metrics.get(k)) for k in metric_keys)
            lines.append(" " + " | ".join(row))
        lines.append("")

    urls = sidecar.get("source_urls") or []
    if urls:
        lines.append(f"Source: {urls[0]}")
    return "\n".join(lines).strip() + "\n"


def _render_agripro_trial_chunk(sidecar: dict) -> str:
    """Render the preamble of an AgriPro regional trial chunk.

    Produces the title, a facts sentence (region, wheat class, year or
    year range), the list of varieties and a closing "Trial data
    (verbatim from PDF):" marker. The verbatim text itself is not in
    the sidecar; ``_render_trial_chunk`` appends it from the markdown
    body.
    """
    lines: list[str] = []
    title = sidecar.get("title") or sidecar.get("filename") or sidecar.get("source_key", "")
    lines.append(f"# {title}")
    lines.append("")

    facts = ["AgriPro / Syngenta regional wheat trial"]
    if sidecar.get("region"):
        facts.append(f"region {sidecar['region']}")
    if sidecar.get("wheat_class_section"):
        facts.append(f"class {sidecar['wheat_class_section']}")
    years = sidecar.get("years_covered") or []
    if len(years) > 1:
        # Do not rely on the list being sorted when printing the range.
        try:
            first, last = min(years), max(years)
        except TypeError:
            first, last = years[0], years[-1]
        facts.append(f"years {first}–{last}")
    elif sidecar.get("year"):
        facts.append(f"year {sidecar['year']}")
    lines.append(". ".join(facts) + ".")
    lines.append("")

    varieties = sidecar.get("varieties_found") or []
    if varieties:
        lines.append("Varieties listed: " + ", ".join(varieties) + ".")
        lines.append("")

    # Marker for the verbatim table that the caller appends, so variety
    # names and yield numbers stay adjacent in the chunk text.
    lines.append("Trial data (verbatim from PDF):")
    lines.append("")
    return "\n".join(lines).strip() + "\n"


def _render_trial_chunk(sidecar: dict, md_text: str | None = None) -> str:
    """Dispatch to the right trial renderer by source. Includes the
    verbatim trial body for sources whose value lives in the body text
    (currently agripro_trials).

    Args:
        sidecar: parsed trial sidecar.
        md_text: markdown body of the trial, if available. Only used
            for ``agripro_trials``.
    """
    source = sidecar.get("source")
    if source == "gh_plot_reports":
        return _render_gh_plot_chunk(sidecar)
    if source == "agripro_trials":
        header = _render_agripro_trial_chunk(sidecar)
        if not md_text:
            return header
        # Keep only what follows the verbatim-data heading so the
        # markdown preamble is not duplicated; if the heading is absent
        # the whole markdown text is used.
        body = md_text
        sep = "## Trial data (verbatim from PDF)"
        if sep in body:
            body = body.split(sep, 1)[1].strip()
        # Drop the code-fence markers around the table.
        body = body.replace("```", "").strip()
        return header + "\n" + body + "\n"
    # Fallback: unknown trial sources are rendered with the plot-report
    # layout. NOTE(review): that layout labels the chunk as a Golden
    # Harvest plot report — confirm this is intended for new sources.
    return _render_gh_plot_chunk(sidecar)


def _flat_trial_metadata(sidecar: dict) -> dict:
    """Chroma-safe metadata for trial chunks. Mirrors variety metadata
    plus trial-specific facets (state, year, data_type).

    ``source_url`` prefers the sidecar's ``pdf_url``: for AgriPro
    sidecars the first ``source_urls`` entry is the shared index page,
    which would make every trial hit cite the same generic URL.
    Optional keys are only set when the sidecar provides a value.
    """
    urls = sidecar.get("source_urls") or [""]
    md: dict = {
        "source": sidecar.get("source") or "",
        "source_key": sidecar.get("source_key") or "",
        "data_type": sidecar.get("data_type") or "trial",
        "vendor": sidecar.get("vendor") or "",
        "brand": (sidecar.get("brand") or "").upper(),
        "crop": (sidecar.get("crop") or "").lower(),
        "source_url": sidecar.get("pdf_url") or urls[0],
    }
    year = sidecar.get("year")
    if isinstance(year, str) and year.strip().isdigit():
        # Accept "2025" as well as 2025 so the year filter still works.
        year = int(year.strip())
    if isinstance(year, int) and not isinstance(year, bool):
        md["year"] = year
    state = sidecar.get("state_abbrev") or sidecar.get("state")
    if state:
        # Short values are treated as abbreviations and upper-cased;
        # longer names are stored as given.
        md["state"] = state.upper() if len(state) <= 3 else state
        md["state_abbrev"] = (sidecar.get("state_abbrev") or "").upper()
    if sidecar.get("region"):
        md["region"] = sidecar["region"]
    if sidecar.get("wheat_class_section"):
        md["wheat_class"] = sidecar["wheat_class_section"]
    if sidecar.get("plot_id"):
        md["plot_id"] = sidecar["plot_id"]
    if isinstance(sidecar.get("n_results"), int):
        md["n_results"] = sidecar["n_results"]
    return md


def chunks_from_trial(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield chunk dict(s) for one trial document. Emits exactly one
    chunk per trial.

    Args:
        sidecar_path: path to the trial's JSON sidecar.
        md_path: path to the trial's markdown body (used for
            AgriPro PDFs whose value lives in the verbatim
            text). If omitted we infer it from sidecar_path
            by swapping the suffix for ``.md``.

    Yields:
        A dict with ``id`` (``<source>::<source_key>::0``), ``text``
        and ``metadata`` (flat trial metadata plus ``ordinal`` 0).
    """
    sc_path = Path(sidecar_path)
    sidecar = json.loads(sc_path.read_text(encoding="utf-8"))

    md_text: str | None = None
    md_p = Path(md_path) if md_path else sc_path.with_suffix(".md")
    if md_p.exists():
        md_text = md_p.read_text(encoding="utf-8")

    text = _render_trial_chunk(sidecar, md_text=md_text)
    meta = _flat_trial_metadata(sidecar)
    chunk_id = f"{meta['source']}::{meta['source_key']}::0"
    yield {
        "id": chunk_id,
        "text": text,
        "metadata": {**meta, "ordinal": 0},
    }
+ """ if not CORPUS.exists(): log.error("corpus/ doesn't exist; run a scraper first") return @@ -45,7 +56,15 @@ def variety_records() -> Iterator[dict]: if not source_dir.is_dir() or source_dir.name.startswith("."): continue for sidecar_path in sorted(source_dir.glob("*.json")): - yield from chunks_from_variety(sidecar_path) + try: + head = json.loads(sidecar_path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError) as exc: + log.warning("skipping unreadable sidecar %s: %s", sidecar_path, exc) + continue + if head.get("data_type") == "trial": + yield from chunks_from_trial(sidecar_path) + else: + yield from chunks_from_variety(sidecar_path) def upsert_to_chroma(records: list[dict]) -> int: diff --git a/scrape/sources/agripro_trials.py b/scrape/sources/agripro_trials.py new file mode 100644 index 00000000..f29a4254 --- /dev/null +++ b/scrape/sources/agripro_trials.py @@ -0,0 +1,483 @@ +"""AgriPro trial-PDF scraper. + +Source: ``agriprowheat.com/trials-data`` — a single page listing +~38 PDF links to regional wheat trial summary documents. Each PDF +is a multi-year multi-location performance test comparing AgriPro +varieties against competitors (LCS, Norwest, PNW, UI, etc.). + +Discovery: walk ``/trials-data``, collect every ``href="*.pdf"``. + +Per-PDF content (parsed via pdfplumber): + - First line: usually the title (e.g. + "2024 Pacific Northwest Combined Summary, Three-Year Data") + - A multi-column table with one row per variety. Columns vary by + PDF but typically include: 3-yr combined yield, 2-yr combined, + most-recent-year yield, plus per-location yields with location + names in the header. + - Footer notes: locations covered, LSD/CV statistical caveats, + copyright. + +Trial PDFs are stable text-extractable (no charts). We capture the +full per-page text verbatim in the chunk body — preserving +variety-name + yield-number adjacency for the embedder — plus +metadata derived from the title (region, year, crop class). 
This is +a deliberate trade-off: perfect table parsing across the PDF +variants would be brittle; verbatim text preserves every data point +and the embedder + BM25 between them can match queries like +"AP Iliad yield Aberdeen Idaho" reliably. + +Output: + corpus/agripro_trials/.md + corpus/agripro_trials/.json + +source_key convention: ``agt-`` lowercased, +e.g. ``agt-2024-pnw-combined``. + +CLI: + python -m scrape.sources.agripro_trials --limit 5 + python -m scrape.sources.agripro_trials --force +""" + +from __future__ import annotations + +import argparse +import io +import json +import logging +import os +import random +import re +import sys +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup +import pdfplumber + +SCRAPER_VERSION = "0.1.0" +USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" +BASE = "https://agriprowheat.com" +LIST_URL = f"{BASE}/trials-data" + +REPO_ROOT = Path(__file__).resolve().parents[2] +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "agripro_trials" + +REQ_INTERVAL_SEC = 1.0 + +log = logging.getLogger("scrape.agripro_trials") + +# Region name patterns we recognize in PDF filenames / titles. The +# value is a human-readable normalized region. +REGION_PATTERNS = ( + (re.compile(r"\bPNW\b|Pacific Northwest", re.I), "Pacific Northwest"), + (re.compile(r"\bNE Colorado\b|Northeast Colorado", re.I), "NE Colorado"), + (re.compile(r"\bSC KS\b|South Central Kansas", re.I), "SC Kansas / N Central OK"), + (re.compile(r"\bWestern Plains\b", re.I), "Western Plains"), + (re.compile(r"\bCentral Plains\b", re.I), "Central Plains"), + (re.compile(r"\bPlains Irrigated\b", re.I), "Plains Irrigated"), + (re.compile(r"\bWashington[/:]?N? *Idaho\b", re.I), "WA / N. 
class RateLimitedSession:
    """``requests.Session`` wrapper that paces requests and retries transient failures.

    Every attempt first waits until at least ``interval`` seconds have
    elapsed since the previous attempt. Network errors, HTTP 429 and
    HTTP 5xx responses are retried with exponential back-off plus
    jitter (capped at 30 s); an all-digit ``Retry-After`` header
    overrides the computed back-off.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading of the most recent attempt; 0.0 = none yet.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between attempts."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Back-off in seconds for the 0-based *attempt*: 2**attempt + jitter, max 30."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 60.0,
        **kw: Any,
    ) -> requests.Response:
        """Send one request, retrying network errors and 429/5xx responses.

        Args:
            method: HTTP verb passed to ``Session.request``.
            url: target URL.
            max_retries: total number of attempts. Values below 1 are
                treated as 1 so the call always makes an attempt.
            timeout: per-attempt timeout in seconds.
            **kw: forwarded unchanged to ``Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when
            every attempt was retryable — the response of the final
            attempt (callers decide via ``raise_for_status``).

        Raises:
            requests.RequestException: when the final attempt failed at
                the network level.
        """
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, resp = exc, None
                if is_last:
                    # Nothing follows the final attempt, so don't sleep.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    break
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived: an earlier network error is no longer
            # the outcome of this call.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured;
                # anything else falls back to the computed back-off.
                backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        # last_exc is None here, so the final attempt produced a response.
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
def source_key_for(filename: str) -> str:
    """Build the corpus ``source_key`` for a trial-PDF filename.

    ``2024 PNW Combined.pdf`` → ``agt-2024-pnw-combined``.

    The filename may be the raw last path segment of an href, so it can
    be percent-encoded and can carry a ``?query`` or ``#fragment``
    (link discovery accepts ``.pdf?...`` hrefs). Query and fragment are
    dropped first; otherwise a dot inside them would be taken for the
    extension separator and the real ``.pdf`` would leak into the slug.

    Args:
        filename: last path segment of the PDF link.

    Returns:
        ``agt-`` followed by the lower-cased stem with every run of
        non-alphanumeric characters collapsed to a single ``-``.
    """
    from urllib.parse import unquote

    # Cut at the first '?' or '#' BEFORE unquoting, so an encoded %3F/%23
    # that belongs to the name itself is not mistaken for a delimiter.
    bare = re.split(r"[?#]", filename, maxsplit=1)[0]
    stem = unquote(bare).rsplit(".", 1)[0]
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", stem).strip("-").lower()
    return f"agt-{slug}"
def _detect_years(text: str) -> list[int]:
    """Return the sorted, de-duplicated years found near the top of *text*.

    Only the first 600 characters are scanned. A year counts when it is
    a four-digit number from 2010 to 2030 inclusive that is not glued
    to other letters or digits, which keeps page numbers and table
    values out. Underscores count as separators, so filenames such as
    ``Montana_2023_Summary.pdf`` are recognised.

    Args:
        text: title and/or filename text to scan.

    Returns:
        Ascending list of distinct years; empty when none are found.
    """
    first_year, last_year = 2010, 2030
    # Explicit alphanumeric look-arounds instead of \b: \b treats "_" as a
    # word character and would therefore miss "..._2023_...".
    candidates = re.findall(
        r"(?<![A-Za-z0-9])(20[1-3]\d)(?![A-Za-z0-9])", text[:600]
    )
    return sorted({
        year for year in map(int, candidates)
        if first_year <= year <= last_year
    })
# A line is treated as a variety row when it starts with one of these
# brand/program prefixes as a whole word.
_VARIETY_LINE_RE = re.compile(
    r"^(?:AP|SY|LCS|UI|PNW|Norwest|WB|Stine|Pioneer)\b[A-Za-z0-9 \-+]*",
)


def _detect_varieties(text: str) -> list[str]:
    """List the distinct variety names found in *text*, in first-seen order.

    A line contributes a name when, after stripping, it begins with a
    known prefix (see ``_VARIETY_LINE_RE``). The name is the text before
    the first whitespace-then-digit boundary when the line has one;
    otherwise it is the whole prefix match. Empty names and names
    longer than 40 characters are discarded.
    """
    ordered: dict[str, None] = {}
    for raw in text.splitlines():
        row = raw.strip()
        prefix_hit = _VARIETY_LINE_RE.match(row) if row else None
        if prefix_hit is None:
            continue
        # Prefer the part of the row that precedes the first numeric column.
        before_number = re.match(r"^([A-Za-z][A-Za-z0-9 \-+]*?)\s+\d", row)
        if before_number:
            candidate = before_number.group(1).strip()
        else:
            candidate = prefix_hit.group(0).strip()
        if candidate and len(candidate) <= 40:
            # dict keeps insertion order and ignores repeats.
            ordered.setdefault(candidate, None)
    return list(ordered)
def render_markdown(p: TrialPDF) -> str:
    """Render one parsed trial PDF as its corpus markdown document.

    Layout: an H1 (the PDF title, or the filename when there is no
    title), a bullet list of identity facts in which empty optional
    facts are omitted, a horizontal rule, and finally the extracted PDF
    text inside a fenced block under ``## Trial data (verbatim from PDF)``.
    At most the first 30 varieties are listed; an ellipsis marks a
    truncated list.
    """
    optional_facts: list[str] = []
    if p.region:
        optional_facts.append(f"- **Region:** {p.region}")
    if p.wheat_class_section:
        optional_facts.append(f"- **Wheat class:** {p.wheat_class_section}")
    if p.year:
        optional_facts.append(f"- **Year:** {p.year}")
    # A span is only worth showing when more than one year was detected.
    if p.years_covered and len(p.years_covered) > 1:
        optional_facts.append(
            f"- **Years covered:** {p.years_covered[0]}–{p.years_covered[-1]}"
        )

    variety_fact: list[str] = []
    if p.varieties_found:
        shown = ", ".join(p.varieties_found[:30])
        overflow = "…" if len(p.varieties_found) > 30 else ""
        variety_fact.append(f"- **Varieties listed:** {shown}" + overflow)

    document = [
        f"# {p.title or p.filename}",
        "",
        "- **Source:** AgriPro (Syngenta) regional trial PDF",
        "- **Vendor:** Syngenta",
        "- **Brand:** AgriPro",
        "- **Crop:** Wheat",
        "- **Data type:** trial",
        *optional_facts,
        f"- **PDF:** {p.pdf_url}",
        f"- **Index page:** {p.source_url}",
        *variety_fact,
        "",
        "---",
        "",
        "## Trial data (verbatim from PDF)",
        "",
        "```",
        p.page_text,
        "```",
    ]
    return "\n".join(document)
def process_pdf(
    http: RateLimitedSession,
    *,
    pdf_url: str,
    filename: str,
    force: bool,
) -> tuple[str, TrialPDF | None]:
    """Fetch, parse and persist one trial PDF.

    Args:
        http: rate-limited session used for the download.
        pdf_url: absolute URL of the PDF.
        filename: last path segment of the link; determines the
            ``source_key`` and therefore the output file names.
        force: re-fetch even when the output files already exist.

    Returns:
        ``(status, parsed)`` where ``status`` is one of

        - ``"skipped"`` — markdown and sidecar both exist and *force* is off
        - ``"failed"``  — ``fetch_pdf_detail`` raised
        - ``"missing"`` — ``fetch_pdf_detail`` returned ``None`` (it does so
          for HTTP 404 and for PDFs that cannot be parsed)
        - ``"written"`` — files were written

        ``parsed`` is the ``TrialPDF`` only for ``"written"``.
    """
    sk = source_key_for(filename)
    md_path = CORPUS_DIR / f"{sk}.md"
    json_path = CORPUS_DIR / f"{sk}.json"
    # Skip only when BOTH halves are on disk. write_pdf writes the .md
    # before the .json sidecar, and the indexer discovers documents by
    # globbing *.json — so a run interrupted between the two writes would
    # otherwise be "skipped" forever while never being indexed.
    if not force and md_path.exists() and json_path.exists():
        return "skipped", None
    try:
        prod = fetch_pdf_detail(http, pdf_url, filename)
    except Exception as exc:  # noqa: BLE001
        # Best-effort pipeline: one bad PDF must not stop the whole run.
        log.error("PDF fetch/parse failed for %s: %s", pdf_url, exc)
        return "failed", None
    if prod is None:
        return "missing", None
    body = render_markdown(prod)
    write_pdf(prod, body)
    return "written", prod
def _build_argparser() -> argparse.ArgumentParser:
    """Create the command-line parser for the AgriPro trial scraper.

    Options: ``--limit N`` (int, default: no limit), ``--force`` (flag)
    and ``--log-level`` (defaults to the ``LOG_LEVEL`` environment
    variable, or ``INFO`` when that is unset).
    """
    parser = argparse.ArgumentParser(
        prog="scrape.sources.agripro_trials",
        description="Scrape AgriPro regional trial PDFs.",
    )
    # (flag, keyword arguments) pairs, registered in declaration order.
    option_table = (
        ("--limit", {
            "type": int,
            "default": None,
            "help": "Stop after processing N PDFs (default: all).",
        }),
        ("--force", {
            "action": "store_true",
            "help": "Re-fetch even if the markdown file already exists.",
        }),
        ("--log-level", {
            "default": os.environ.get("LOG_LEVEL", "INFO"),
        }),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)
    return parser
Each document is one head-to-head yield trial at a +specific state/year/site, comparing products across brands (NK, +DEKALB, Golden Harvest, sometimes Pioneer/Channel etc. listed as +competitor entries) — i.e. **third-party-feeling cross-vendor data +that Bayer doesn't publish itself**. + +Source: ``goldenharvestseeds.com`` — same site as ``golden_harvest`` +variety scraper. ``/sitemap-ghs-hybrids.xml`` (already walked for +the variety scraper) lists 8,237 plot reports across: + + Year Corn Soy Silage Total + 2023 1,832 1,614 173 3,619 + 2024 1,432 1,277 137 2,846 + 2025 973 703 96 1,772 + +Initial scrape: 2024 + 2025 (4,618 reports). 2023 is older data +that's still informative but lower priority. Defer 2023 to a later +backfill pass via ``--include-2023``. + +URL shape: + /<crop>/plot-report/<state>/<year>/<plot_id> + e.g. /corn/plot-report/al/2023/2374765 + +Per-report data (server-rendered HTML): + - Cooperator name (h1 area) + - State (full name, e.g. "Alabama") + - Planted date / Harvested date + - Population (seeds/acre), Row Width + - One <table> with columns: + Rank | Brand | Product | Traits | Yield (BU/Acre) | %MST | + Test Weight | Gross Revenue | Entry # + +Each row in the results table can be from any seed brand — the +trial is the test, not the catalog. Brand and product are the join +keys back to the per-variety corpus (lookup_variety can pull the +identity record if we have the same brand/product). + +Output: + corpus/gh_plot_reports/<source_key>.md LLM-visible body + corpus/gh_plot_reports/<source_key>.json sidecar metadata + +source_key convention: ``ghpr-<crop>-<state>-<year>-<plot_id>`` +e.g. ``ghpr-corn-al-2023-2374765``. 
+ +CLI: + python -m scrape.sources.gh_plot_reports --limit 5 + python -m scrape.sources.gh_plot_reports --crop corn --state ia --year 2024 + python -m scrape.sources.gh_plot_reports --include-2023 --force +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import random +import re +import sys +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup + +SCRAPER_VERSION = "0.1.0" +USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" +BASE = "https://www.goldenharvestseeds.com" +SITEMAP_HYBRIDS = f"{BASE}/sitemap-ghs-hybrids.xml" + +REPO_ROOT = Path(__file__).resolve().parents[2] +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "gh_plot_reports" + +REQ_INTERVAL_SEC = 1.0 + +log = logging.getLogger("scrape.gh_plot_reports") + +# State name normalization: URL gives a 2-letter abbrev; sidecar keeps +# both forms so search filters can use either. 
class RateLimitedSession:
    """``requests.Session`` wrapper that paces requests and retries transient failures.

    Every attempt first waits until at least ``interval`` seconds have
    elapsed since the previous attempt. Network errors, HTTP 429 and
    HTTP 5xx responses are retried with exponential back-off plus
    jitter (capped at 30 s); an all-digit ``Retry-After`` header
    overrides the computed back-off.
    """

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        self.interval = interval
        # time.monotonic() reading of the most recent attempt; 0.0 = none yet.
        self._last = 0.0

    def _wait(self) -> None:
        """Sleep just long enough to keep ``interval`` seconds between attempts."""
        delta = time.monotonic() - self._last
        if delta < self.interval:
            time.sleep(self.interval - delta)
        self._last = time.monotonic()

    @staticmethod
    def _backoff(attempt: int) -> float:
        """Back-off in seconds for the 0-based *attempt*: 2**attempt + jitter, max 30."""
        return min(30.0, (2 ** attempt) + random.random())

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Send one request, retrying network errors and 429/5xx responses.

        Args:
            method: HTTP verb passed to ``Session.request``.
            url: target URL.
            max_retries: total number of attempts. Values below 1 are
                treated as 1 so the call always makes an attempt.
            timeout: per-attempt timeout in seconds.
            **kw: forwarded unchanged to ``Session.request``.

        Returns:
            The first response that is neither 429 nor 5xx, or — when
            every attempt was retryable — the response of the final
            attempt (callers decide via ``raise_for_status``).

        Raises:
            requests.RequestException: when the final attempt failed at
                the network level.
        """
        attempts = max(1, max_retries)
        last_exc: Exception | None = None
        resp: requests.Response | None = None
        for attempt in range(attempts):
            is_last = attempt == attempts - 1
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                last_exc, resp = exc, None
                if is_last:
                    # Nothing follows the final attempt, so don't sleep.
                    log.warning("network error on %s %s: %s — giving up after %d attempts",
                                method, url, exc, attempts)
                    break
                backoff = self._backoff(attempt)
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue
            # A response arrived: an earlier network error is no longer
            # the outcome of this call.
            last_exc = None
            if resp.status_code == 429 or 500 <= resp.status_code < 600:
                if is_last:
                    log.warning("HTTP %d on %s %s — giving up after %d attempts",
                                resp.status_code, method, url, attempts)
                    break
                ra = resp.headers.get("Retry-After")
                # Only the delta-seconds form of Retry-After is honoured;
                # anything else falls back to the computed back-off.
                backoff = float(ra) if (ra and ra.isdigit()) else self._backoff(attempt)
                log.warning("HTTP %d on %s %s — retry in %.1fs",
                            resp.status_code, method, url, backoff)
                time.sleep(backoff)
                continue
            return resp
        if last_exc is not None:
            raise last_exc
        # last_exc is None here, so the final attempt produced a response.
        return resp  # type: ignore[return-value]

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)
+ Silage: Ton/Acre.""" + for k in ("Yield", "Ton/Acre", "Tons/Acre"): + v = self.metrics.get(k) + if isinstance(v, (int, float)): + return (k, v) + # Fallback to first numeric metric + for k, v in self.metrics.items(): + if isinstance(v, (int, float)): + return (k, v) + return ("", None) + + +@dataclass +class PlotReport: + source_key: str + source_url: str + crop: str # "corn" / "soybeans" / "silage" + state_abbrev: str # "al" + state_name: str # "Alabama" + year: int + plot_id: str + + cooperator: str | None = None + planted_date: str | None = None # ISO date + harvested_date: str | None = None # ISO date + population: int | None = None + row_width: int | None = None + + results: list[TrialResult] = field(default_factory=list) + + +# --------------------------------------------------------------------- discovery + + +_PLOT_URL_RE = re.compile( + r".*?/(?Pcorn|soybean|silage)/plot-report/" + r"(?P[a-z]{2})/(?P\d{4})/(?P\d+)" +) + + +def discover_plots( + http: RateLimitedSession, + *, + crops: set[str], + states: set[str] | None, + years: set[int], +) -> list[tuple[str, str, str, int, str]]: + """Walk the hybrids sitemap and return matching plot URLs as + ``[(url, crop, state, year, plot_id), ...]`` tuples. ``crop`` is + normalized to the schema's terms (soybean → soybeans).""" + log.info("fetching sitemap %s", SITEMAP_HYBRIDS) + r = http.get(SITEMAP_HYBRIDS) + r.raise_for_status() + entries = re.findall(r"([^<]+)", r.text) + log.info("sitemap parsed: %d total locs", len(entries)) + + out: list[tuple[str, str, str, int, str]] = [] + for url in entries: + m = _PLOT_URL_RE.match(url) + if not m: + continue + crop_url = m.group("crop") + # Normalize "soybean" → "soybeans" to match the rest of the corpus. 
+ crop = "soybeans" if crop_url == "soybean" else crop_url + state = m.group("state").lower() + year = int(m.group("year")) + plot = m.group("plot") + if crops and crop not in crops: + continue + if states and state not in states: + continue + if years and year not in years: + continue + out.append((url, crop, state, year, plot)) + + log.info("after filters: %d plot URLs", len(out)) + return out + + +# --------------------------------------------------------------------- helpers + + +def source_key_for(crop: str, state: str, year: int, plot_id: str) -> str: + return f"ghpr-{crop}-{state}-{year}-{plot_id}" + + +def _parse_date_mdy(s: str) -> str | None: + """``04/06/23`` → ``2023-04-06``. Two-digit years are assumed to + be 20xx (sane for current-century trial data).""" + s = (s or "").strip() + m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", s) + if not m: + return None + mo, dy, yr = m.group(1), m.group(2), m.group(3) + if len(yr) == 2: + yr = "20" + yr + try: + return f"{int(yr):04d}-{int(mo):02d}-{int(dy):02d}" + except ValueError: + return None + + +def _parse_int(s: str | None) -> int | None: + if not s: + return None + s = re.sub(r"[,$]", "", str(s).strip()) + try: + return int(s) + except ValueError: + return None + + +def _parse_float(s: str | None) -> float | None: + if not s: + return None + s = re.sub(r"[,$]", "", str(s).strip()) + try: + return float(s) + except ValueError: + return None + + +# --------------------------------------------------------------------- detail + + +def fetch_plot_detail( + http: RateLimitedSession, + url: str, + crop: str, + state: str, + year: int, + plot_id: str, +) -> PlotReport | None: + """Fetch one plot-report page and parse it.""" + r = http.get(url) + if r.status_code == 404: + return None + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + + prod = PlotReport( + source_key=source_key_for(crop, state, year, plot_id), + source_url=url, + crop=crop, + state_abbrev=state, + 
state_name=STATE_NAMES.get(state, state.upper()), + year=year, + plot_id=plot_id, + ) + + # Pull metadata from the header area. The page renders cooperator + # name + state + key fields as text following the h1. + h1 = soup.find("h1") + if h1: + # Walk up to a parent that includes the metadata strip + container = h1.parent + while container is not None and not container.find("table"): + parent = container.parent + if parent is None: + break + container = parent + if container: + text = container.get_text(" | ", strip=True) + # Cooperator is usually the segment right after the H1. + # Pattern: "Corn Plot Results | | | Planted: | ..." + parts = [p.strip() for p in text.split("|") if p.strip()] + # Drop the title segment + if parts and parts[0].lower().startswith(("corn plot", "soybean plot", "silage plot")): + parts = parts[1:] + if parts: + # First segment that doesn't match a state name is the cooperator + cand = parts[0] + if cand and cand != prod.state_name and not cand.endswith(":"): + prod.cooperator = cand + + # Walk the page text for known labeled fields. + page_text = soup.get_text(" ", strip=True) + m = re.search(r"Planted:\s*(\d{1,2}/\d{1,2}/\d{2,4})", page_text) + if m: + prod.planted_date = _parse_date_mdy(m.group(1)) + m = re.search(r"Harvested:\s*(\d{1,2}/\d{1,2}/\d{2,4})", page_text) + if m: + prod.harvested_date = _parse_date_mdy(m.group(1)) + m = re.search(r"Population:\s*([\d,]+)", page_text) + if m: + prod.population = _parse_int(m.group(1)) + m = re.search(r"Row Width:\s*(\d+)", page_text) + if m: + prod.row_width = _parse_int(m.group(1)) + + # Parse the results table. The HTML uses ONE merged cell for + # "Brand Product Traits" (despite the header containing all + # three labels); subsequent cells are Yield, %MST, Test Weight, + # Gross Revenue, Entry #. We split the merged cell using a + # known-brand prefix match. 
+ table = soup.find("table") + if not table: + return prod + rows = table.find_all("tr") + if not rows: + return prod + + header_cells = [c.get_text(" ", strip=True) for c in rows[0].find_all(["th", "td"])] + + def col_idx(*names: str) -> int | None: + for n in names: + for i, h in enumerate(header_cells): + if n.lower() in h.lower(): + return i + return None + + # Position of the merged identity cell, by header containing "Brand". + i_identity = col_idx("Brand") + i_rank = col_idx("Rank") + i_entry = col_idx("Entry") + + # Build a list of (header, index) for the OTHER columns (the + # metric columns). Skips Rank, Brand-merge-cell, and Entry #. + metric_columns: list[tuple[str, int]] = [] + skip_idx = {i_identity, i_rank, i_entry} + for i, h in enumerate(header_cells): + if i in skip_idx: + continue + h_clean = h.strip() + if h_clean: + metric_columns.append((h_clean, i)) + + for row in rows[1:]: + cells = [c.get_text(" ", strip=True) for c in row.find_all(["td", "th"])] + if len(cells) < 2: + continue + def cell(i: int | None) -> str: + return cells[i] if i is not None and 0 <= i < len(cells) else "" + + identity = cell(i_identity).strip() + if any(k in identity.lower() for k in ("plot average", "trial average", "average")): + continue + + brand, product, traits = _split_identity(identity) + + # Collect every metric column verbatim. Numeric where parseable, + # else preserve the raw string (e.g. "ns" for not-significant). 
# Known seed brands that can appear in plot-report identity cells.
_BRAND_NAMES = (
    "Golden Harvest", "WestBred", "AgriPro", "DEKALB", "Pioneer",
    "Channel", "Asgrow", "NK", "Becks", "Beck's", "Brevant",
    "Stine", "Renk", "Wyffels", "LG Seeds", "Croplan", "FS",
    "Local Choice", "Mycogen", "AgriGold", "Hoegemeyer",
)
# The alternation is built longest-first so that a longer brand always
# wins over a shorter brand that is a prefix of it, whatever order the
# tuple above is edited into.
_BRAND_RE = re.compile(
    r"^(?:"
    + "|".join(
        re.escape(b) for b in sorted(_BRAND_NAMES, key=len, reverse=True)
    )
    + r")\b",
    re.I,
)


def _split_identity(identity: str) -> tuple[str, str, str]:
    """Split a plot-report identity cell into ``(brand, product, traits)``.

    The HTML emits one merged cell like "NK NK1748-3110 Agrisure ®"
    or "Golden Harvest G16Q82-DV DuracadeViptera™" or just
    "DEKALB DKC65-20". Steps:

    1. Collapse every whitespace run (including non-breaking spaces) to
       a single space, so multi-word brands match however the cell was
       spaced.
    2. Match a known brand (case-insensitively) at the start.
    3. The token immediately after the brand is the product.
    4. Anything remaining is the trait stack (free text).

    When no known brand matches, the first token is taken as the brand,
    the second as the product and the rest as traits.

    Args:
        identity: text of the merged identity cell; may be empty.

    Returns:
        ``(brand, product, traits)``; missing parts are ``""``. Empty or
        whitespace-only input yields ``("", "", "")``.
    """
    # str.split() with no argument splits on any Unicode whitespace.
    s = " ".join(identity.split()) if identity else ""
    if not s:
        return "", "", ""
    m = _BRAND_RE.match(s)
    if not m:
        # Unknown brand prefix — best-effort positional split, padded so
        # that one- and two-token cells still yield a 3-tuple.
        parts = s.split(maxsplit=2)
        parts += [""] * (3 - len(parts))
        return parts[0], parts[1], parts[2]
    brand = m.group(0)
    rest = s[len(brand):].strip()
    parts = rest.split(maxsplit=1)
    product = parts[0] if parts else ""
    traits = parts[1].strip() if len(parts) > 1 else ""
    return brand, product, traits
def _md_cell(value: object) -> str:
    """Return *value* as text that is safe inside a markdown table cell."""
    # An unescaped pipe would be read as a column separator.
    return str(value).replace("|", "\\|")


def render_markdown(p: PlotReport) -> str:
    """Render one plot report as a markdown document.

    The document has two parts:

    * a header: title plus a bullet list of plot identity fields; the
      optional fields (cooperator, planted/harvested dates, population,
      row width) are emitted only when truthy;
    * when ``p.results`` is non-empty, a results table with one row per
      result and one column per metric key seen in any result, followed by
      a one-line "Top N by <metric>" summary built from the first five
      results.

    Returns the markdown text. Nothing is written to disk here.
    """
    crop_label = {
        "corn": "Corn", "soybeans": "Soybean", "silage": "Silage",
    }.get(p.crop, p.crop.title())

    head: list[str] = [
        f"# {crop_label} yield trial — {p.state_name}, {p.year}",
        "",
        "- **Source:** Golden Harvest plot report (cross-vendor head-to-head)",
        f"- **Crop:** {crop_label}",
        f"- **State:** {p.state_name} ({p.state_abbrev.upper()})",
        f"- **Year:** {p.year}",
        f"- **Plot ID:** {p.plot_id}",
    ]
    if p.cooperator:
        head.append(f"- **Cooperator:** {p.cooperator}")
    if p.planted_date:
        head.append(f"- **Planted:** {p.planted_date}")
    if p.harvested_date:
        head.append(f"- **Harvested:** {p.harvested_date}")
    if p.population:
        head.append(f"- **Population:** {p.population:,} seeds/acre")
    if p.row_width:
        head.append(f"- **Row width:** {p.row_width}\"")
    head.extend([f"- **URL:** {p.source_url}", "", "---", ""])

    sections: list[str] = []
    if p.results:
        # Union of metric columns across all results, in first-seen order,
        # so each crop keeps the header set its own rows carry.
        metric_keys = list(
            dict.fromkeys(k for r in p.results for k in r.metrics)
        )

        columns = ["Rank", "Brand", "Product", "Traits"]
        columns += [_md_cell(k) for k in metric_keys]
        sections.append("## Results (top-down by rank)")
        sections.append("")
        sections.append("| " + " | ".join(columns) + " |")
        sections.append("|" + "|".join(["---"] * len(columns)) + "|")

        for r in p.results:
            row = [
                str(r.rank) if r.rank is not None else "-",
                _md_cell(r.brand or "-"),
                _md_cell(r.product or "-"),
                _md_cell(r.traits or "-"),
            ]
            for k in metric_keys:
                v = r.metrics.get(k)
                if v is None:
                    row.append("-")
                elif isinstance(v, (int, float)) and ("Revenue" in k or "$" in k):
                    # Dollar columns rendered with $ prefix.
                    row.append(f"${v:.2f}")
                else:
                    row.append(_md_cell(v))
            sections.append("| " + " | ".join(row) + " |")
        sections.append("")

        # Compact text summary for embedder signal. Only rows that actually
        # carry a primary-metric value are listed, and the "Top N" figure
        # counts those listed rows so the sentence never overstates N.
        listed = [r for r in p.results[:5] if r.primary_metric[1] is not None]
        if listed:
            primary_label = listed[0].primary_metric[0]
            if primary_label:
                summary = ", ".join(
                    f"{r.product or '?'} ({r.brand or '?'}) {r.primary_metric[1]}"
                    for r in listed
                )
                sections.append(
                    f"Top {len(listed)} by {primary_label}: {summary}."
                )
                sections.append("")

    return "\n".join(head) + "\n".join(sections)
def write_plot(prod: PlotReport, body_md: str) -> None:
    """Persist one plot report as ``<source_key>.json`` + ``<source_key>.md``.

    Both files are written under ``CORPUS_DIR`` (created if missing). The
    JSON sidecar carries the plot's identity fields, every result row with
    its metrics verbatim, and fetch provenance.

    The sidecar is written first and the markdown last: ``process_plot``
    treats an existing ``.md`` as "already scraped", so the markdown must
    only appear once the sidecar is on disk. Otherwise an interrupted run
    could leave a plot that is skipped on every later run with no sidecar.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{prod.source_key}.md"
    json_path = CORPUS_DIR / f"{prod.source_key}.json"

    result_rows = [
        {
            "rank": r.rank,
            "brand": r.brand,
            "product": r.product,
            "traits": r.traits,
            # All per-column metrics verbatim. Corn/soy: Yield,
            # %MST, Test Weight, Gross Revenue. Silage: Ton/Acre,
            # Milk Per Acre, Milk Per Ton, Beef Per Acre, Beef Per
            # Ton. (Plus any other column the source publishes.)
            "metrics": r.metrics,
            "entry_num": r.entry_num,
        }
        for r in prod.results
    ]
    sidecar = {
        "source": "gh_plot_reports",
        "source_key": prod.source_key,
        "data_type": "trial",
        "vendor": "Syngenta",  # Golden Harvest publishes the trial
        "brand": "Golden Harvest",
        "crop": prod.crop,
        "state": prod.state_name,
        "state_abbrev": prod.state_abbrev,
        "year": prod.year,
        "plot_id": prod.plot_id,
        "cooperator": prod.cooperator,
        "planted_date": prod.planted_date,
        "harvested_date": prod.harvested_date,
        "population_seeds_per_acre": prod.population,
        "row_width_in": prod.row_width,
        "results": result_rows,
        "n_results": len(prod.results),
        "source_urls": [prod.source_url],
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }

    json_path.write_text(
        json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    # Markdown goes last: its presence is the "done" marker.
    md_path.write_text(body_md, encoding="utf-8")


# --------------------------------------------------------------------- pipeline


def process_plot(
    http: RateLimitedSession,
    *,
    url: str,
    crop: str,
    state: str,
    year: int,
    plot_id: str,
    force: bool,
) -> tuple[str, PlotReport | None]:
    """Fetch, render and write a single plot report.

    Returns ``(status, report)`` where ``status`` is one of:

    * ``"skipped"`` — the markdown file already exists and ``force`` is
      false; nothing is fetched;
    * ``"failed"``  — ``fetch_plot_detail`` raised; the error is logged;
    * ``"missing"`` — ``fetch_plot_detail`` returned ``None``;
    * ``"written"`` — the report was rendered and written to disk.

    ``report`` is the parsed ``PlotReport`` only for ``"written"``; it is
    ``None`` for every other status.
    """
    key = source_key_for(crop, state, year, plot_id)
    already_on_disk = (CORPUS_DIR / f"{key}.md").exists()
    if already_on_disk and not force:
        return "skipped", None

    try:
        report = fetch_plot_detail(http, url, crop, state, year, plot_id)
    except Exception as exc:  # noqa: BLE001
        # Best-effort by design: one bad page must not abort the crawl.
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None

    if report is None:
        return "missing", None

    write_plot(report, render_markdown(report))
    return "written", report
counts.get(status, 0) + 1 + if prod is not None and processed <= 5 or processed % 100 == 0: + log.info( + "[%d/%s] %s %s | results=%d coop=%s", + processed, str(limit) if limit else len(targets), + source_key_for(crop, state, year, plot_id), status, + len(prod.results) if prod else 0, + (prod.cooperator if prod else "-") or "-", + ) + + log.info( + "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)", + processed, counts["written"], counts["skipped"], + counts["missing"], counts["failed"], len(targets), + ) + return 0 if counts["failed"] == 0 else 1 + + +# --------------------------------------------------------------------- CLI + + +def _build_argparser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="scrape.sources.gh_plot_reports", + description="Scrape Golden Harvest cross-vendor plot reports (yield trials).", + ) + p.add_argument("--limit", type=int, default=None, + help="Stop after processing N plots (default: all).") + p.add_argument("--force", action="store_true", + help="Re-fetch even if the markdown file already exists.") + p.add_argument("--crop", default=None, + choices=("corn", "soybeans", "silage"), + help="Limit to one crop.") + p.add_argument("--state", default=None, + help="Limit to one state (2-letter abbrev: ia, il, ne, ...).") + p.add_argument("--year", type=int, default=None, choices=(2023, 2024, 2025), + help="Limit to one year.") + p.add_argument("--include-2023", action="store_true", + help="Include 2023 plot reports (default: 2024-2025 only).") + p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) + return p + + +def main(argv: list[str] | None = None) -> int: + args = _build_argparser().parse_args(argv) + logging.basicConfig( + level=args.log_level.upper(), + format="%(asctime)s %(levelname)s %(name)s %(message)s", + stream=sys.stderr, + ) + return run( + limit=args.limit, + force=args.force, + only_crop=args.crop, + only_state=args.state.lower() if args.state else 
None, + only_year=args.year, + include_2023=args.include_2023, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sources.json b/sources.json index 8ad18a31..3ca18f8d 100644 --- a/sources.json +++ b/sources.json @@ -5,8 +5,16 @@ { "name": "bayer_seeds", "vendor": "Bayer", - "brands": ["DEKALB", "Asgrow", "WestBred"], - "crops": ["corn", "soybeans", "wheat"], + "brands": [ + "DEKALB", + "Asgrow", + "WestBred" + ], + "crops": [ + "corn", + "soybeans", + "wheat" + ], "verdict": "green", "expected_count": 475, "base_url": "https://cropscience.bayer.us", @@ -17,65 +25,124 @@ { "name": "golden_harvest", "vendor": "Syngenta", - "brands": ["Golden Harvest"], - "crops": ["corn", "soybeans"], + "brands": [ + "Golden Harvest" + ], + "crops": [ + "corn", + "soybeans" + ], "verdict": "green", "expected_count": 175, "base_url": "https://www.goldenharvestseeds.com", "scope_filter": "All sitemap-listed corn + soybean varieties.", "tos_check_date": "2026-05-25", - "schema_notes": "Disease ratings published on 9-to-1 scale (9 = best). Normalize to 1-9 (9 = best) at chunk time to match Bayer/NK/AgriPro convention. Note original direction in chunk_0 preamble. Tech-sheet PDF URLs in the sitemap are stale (250331) — resolve live URL from product HTML, not sitemap entry." + "schema_notes": "Disease ratings published on 9-to-1 scale (9 = best). Normalize to 1-9 (9 = best) at chunk time to match Bayer/NK/AgriPro convention. Note original direction in chunk_0 preamble. Tech-sheet PDF URLs in the sitemap are stale (250331) \u2014 resolve live URL from product HTML, not sitemap entry." }, { "name": "nk", "vendor": "Syngenta", - "brands": ["NK"], - "crops": ["corn", "soybeans"], + "brands": [ + "NK" + ], + "crops": [ + "corn", + "soybeans" + ], "verdict": "green", "expected_count": 29, "base_url": "https://www.syngenta-us.com", "pdf_cdn": "https://assets.syngentaebiz.com/pdf/techsheets/", "scope_filter": "All NK corn + soy varieties. 
No wheat (NK doesn't sell wheat in US).", "tos_check_date": "2026-05-24", - "schema_notes": "Disease + agronomic ratings live in tech-sheet PDFs only — need pdfplumber. PDF URLs share format `_YYMMDD.pdf` with Golden Harvest, so the same fetcher works for both." + "schema_notes": "Disease + agronomic ratings live in tech-sheet PDFs only \u2014 need pdfplumber. PDF URLs share format `_YYMMDD.pdf` with Golden Harvest, so the same fetcher works for both." }, { "name": "agripro", "vendor": "Syngenta", - "brands": ["AgriPro"], - "crops": ["wheat", "barley"], + "brands": [ + "AgriPro" + ], + "crops": [ + "wheat", + "barley" + ], "verdict": "green", "expected_count": 24, "base_url": "https://www.agriprowheat.com", - "scope_filter": "All wheat classes (HRW/HRS/HWS/SWW/SWS) + barley. NO SRW — Syngenta's SRW lives at GrowProGenetics.com under a separate brand.", + "scope_filter": "All wheat classes (HRW/HRS/HWS/SWW/SWS) + barley. NO SRW \u2014 Syngenta's SRW lives at GrowProGenetics.com under a separate brand.", "tos_check_date": "2026-05-24", "schema_notes": "Drupal Views form; server-rendered HTML. CoAXium trait flag is implicit in product family; Clearfield/CL2 trait IS in this catalog." }, { "name": "becks_pfr", "vendor": "Beck's Hybrids", - "brands": ["Beck's PFR"], - "crops": ["corn", "soybeans", "wheat"], + "brands": [ + "Beck's PFR" + ], + "crops": [ + "corn", + "soybeans", + "wheat" + ], "verdict": "yellow", "expected_count": 2089, "base_url": "https://www.beckshybrids.com", "api_base": "https://mc8v24rf.api.sanity.io", - "scope_filter": "All Practical Farm Research publications since 2015. PFR is head-to-head agronomy trials — fungicide timing, planting-date studies, hybrid-by-population, etc.", + "scope_filter": "All Practical Farm Research publications since 2015. 
PFR is head-to-head agronomy trials \u2014 fungicide timing, planting-date studies, hybrid-by-population, etc.", "tos_check_date": "2026-05-24", - "schema_notes": "Public Sanity GROQ API, no auth required. Records have title/year/crop/key-findings/full-text. Treat PFR docs as a research corpus, not variety records — the chunk_0 includes the study's tl;dr finding." + "schema_notes": "Public Sanity GROQ API, no auth required. Records have title/year/crop/key-findings/full-text. Treat PFR docs as a research corpus, not variety records \u2014 the chunk_0 includes the study's tl;dr finding." }, { "name": "becks_products", "vendor": "Beck's Hybrids", - "brands": ["Beck's"], - "crops": ["corn", "soybeans", "wheat"], + "brands": [ + "Beck's" + ], + "crops": [ + "corn", + "soybeans", + "wheat" + ], "verdict": "yellow", "expected_count": 860, "base_url": "https://www.beckshybrids.com", "api_base": "https://mc8v24rf.api.sanity.io", - "scope_filter": "All Beck's product records — corn + soy + wheat. Identity + RM/MG only.", + "scope_filter": "All Beck's product records \u2014 corn + soy + wheat. Identity + RM/MG only.", "tos_check_date": "2026-05-24", "schema_notes": "Sanity GROQ exposes identity (name, RM/MG, basic traits) but agronomic + disease ratings are SeedIQ-gated (requires browser cookie). Deferred until the SeedIQ XHR endpoint is captured from a logged-in browser session. Without ratings, products are reference-only; the MCP can confirm 'Beck's has hybrid X at RM 112 with Enlist trait' but not 'rate it against drought'." + }, + { + "name": "gh_plot_reports", + "vendor": "Syngenta", + "brand_aggregator": "Golden Harvest publishes", + "crops": [ + "corn", + "soybeans", + "silage" + ], + "verdict": "green", + "expected_count": 4618, + "base_url": "https://www.goldenharvestseeds.com", + "scope_filter": "sitemap-listed plot reports 2024 and 2025 (4,618 reports). 
2023 (3,619 reports) deferred to a future pass \u2014 most recent data is most relevant for current decisions.", + "tos_check_date": "2026-05-25", + "schema_notes": "Cross-vendor head-to-head yield trials at specific state/year/site. Each report lists products from multiple brands (NK, DEKALB, GH, etc.) with rank, yield, %MST, test weight, gross revenue. URL: //plot-report///. Same site/auth as golden_harvest variety scraper.", + "data_type": "trial" + }, + { + "name": "agripro_trials", + "vendor": "Syngenta", + "brand_aggregator": "AgriPro publishes", + "crops": [ + "wheat" + ], + "verdict": "green", + "expected_count": 38, + "base_url": "https://agriprowheat.com", + "scope_filter": "PDF trial summaries linked from /trials-data. Regional wheat performance (PNW, Western Plains, NE Colorado, etc.).", + "tos_check_date": "2026-05-25", + "schema_notes": "PDF tables of varieties tested per region per year. pdfplumber for table extraction.", + "data_type": "trial" } ], "_excluded_sources": [