diff --git a/corpus/agripro_trials/agt-2024-pnw-combined.json b/corpus/agripro_trials/agt-2024-pnw-combined.json new file mode 100644 index 00000000..eb319481 --- /dev/null +++ b/corpus/agripro_trials/agt-2024-pnw-combined.json @@ -0,0 +1,44 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2024-pnw-combined", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2024 Pacific Northwest Combined Summary, Three-Year Data", + "filename": "2024%20PNW%20Combined.pdf", + "region": "Pacific Northwest", + "wheat_class_section": null, + "year": 2024, + "years_covered": [ + 2024 + ], + "varieties_found": [ + "AP Olympia", + "AP Exceed", + "SY Ovation", + "SY Dayton", + "SY Assure", + "AP Iliad", + "LCS Shine", + "LCS Artdeco", + "Norwest Duet", + "LCS Hulk", + "PNW Hailey", + "LCS Sonic", + "Norwest Tandem", + "UI Magic CL+", + "LCS Blackjack", + "LCS Drive", + "LCS Jefe", + "LCS Kamiack" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2024-09/2024%20PNW%20Combined.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2024-09/2024%20PNW%20Combined.pdf" + ], + "page_text_chars": 2613, + "fetched_at": "2026-05-25T19:11:04.196638+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2024-pnw-combined.md b/corpus/agripro_trials/agt-2024-pnw-combined.md new file mode 100644 index 00000000..7f38fb06 --- /dev/null +++ b/corpus/agripro_trials/agt-2024-pnw-combined.md @@ -0,0 +1,54 @@ +# 2024 Pacific Northwest Combined Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Pacific Northwest +- **Year:** 2024 +- **PDF:** https://agriprowheat.com/sites/default/files/2024-09/2024%20PNW%20Combined.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Olympia, AP Exceed, SY 
Ovation, SY Dayton, SY Assure, AP Iliad, LCS Shine, LCS Artdeco, Norwest Duet, LCS Hulk, PNW Hailey, LCS Sonic, Norwest Tandem, UI Magic CL+, LCS Blackjack, LCS Drive, LCS Jefe, LCS Kamiack + +--- + +## Trial data (verbatim from PDF) + +``` +2024 Pacific Northwest Combined Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2022-2024 +3-Yr Combined 2-Yr Combined Combined Moses Lake, Walla Walla, Aberdeen, Craigmont, Twin Falls, +Variety (2022-2024) (2023-2024) (2024) WA WA ID ID ID +Soft White Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield Yield +Winter Wheat Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A Bu/A +AP Olympia 153.6 63.7 146.5 63.5 154.7 65.4 201.6 189.5 113.8 124.5 144.2 +AP Exceed 150.5 63.2 147.2 63.2 156.6 65.2 188.0 191.2 124.2 124.7 154.8 +SY Ovation 148.5 63.2 146.3 63.1 148.9 64.8 186.4 179.4 117.1 120.1 141.3 +SY Dayton 146.4 63.2 143.1 63.2 155.4 65.1 192.6 189.4 115.4 128.9 150.6 +SY Assure 141.6 64.0 138.7 63.9 141.3 65.4 144.6 172.6 119.7 133.4 136.1 +AP Iliad 144.7 63.1 150.7 65.1 173.2 175.3 116.3 135.4 153.2 +LCS Shine 150.5 62.9 146.0 62.8 156.1 64.4 182.0 169.0 137.3 125.8 166.5 +LCS Artdeco 149.4 62.5 145.9 62.6 158.9 64.2 197.9 183.8 139.9 122.3 150.3 +Norwest Duet 148.5 62.3 148.7 62.0 162.1 64.3 220.0 196.5 129.3 123.9 140.8 +LCS Hulk 147.9 64.0 145.1 63.6 156.3 65.3 204.1 174.1 133.5 123.5 146.4 +PNW Hailey 147.3 63.9 142.0 63.9 152.1 65.4 188.6 192.7 112.1 126.1 141.0 +LCS Sonic 146.8 62.8 143.7 62.7 150.3 64.7 190.3 186.6 124.2 125.6 125.0 +Norwest Tandem 144.2 62.4 143.2 62.3 156.1 64.6 196.1 169.8 137.5 130.1 146.8 +UI Magic CL+ 143.4 63.7 139.4 63.4 150.5 65.4 179.5 183.6 124.2 123.2 142.1 +LCS Blackjack 147.7 60.5 162.0 63.1 203.8 198.0 122.3 131.9 154.0 +LCS Drive 141.7 61.3 153.0 63.8 181.4 181.0 125.8 122.9 153.8 +LCS Jefe 158.0 65.1 208.3 191.0 113.4 123.0 154.3 +LCS Kamiack 151.1 65.1 181.1 178.9 122.9 133.2 139.6 +Mean General 148.7 63.2 146.4 62.9 156.0 64.8 192.3 186.6 126.6 
127.6 147.1 +LSD General (5%) EE 7.8 0.7 9.2 1.0 13.0 1.3 18.5 ns 4.5 8.9 ns +CV (Effective) 6.5 1.8 6.5 1.9 5.4 1.5 4.7 6.7 1.8 3.4 6.7 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +NS = Non Significant +Locations +2022—Colfax, WA; Aberdeen, Genesee, and Twin Falls, ID +2023—Moses Lake and Walla Walla, WA; Craigmont, Genesee, and Twin Falls, ID +2024—Moses Lake and Walla Walla, WA; Aberdeen, Craigmont, and Twin Falls, ID +© 2023 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. 8-30-24 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json new file mode 100644 index 00000000..b4c31a4d --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.json @@ -0,0 +1,35 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2025-np-perf-data-sd-web", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Performance Summary, Syngenta Data", + "filename": "2025%20NP%20Perf%20Data%20SD%20web.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Valda", + "AP Dagr", + "AP Iconic", + "AP Elevate", + "AP Murdock", + "AP Gunsmoke CL2", + "SY Ingmar", + "AP Revolution", + "LCS Trigger" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20SD%20web.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + 
"https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20SD%20web.pdf" + ], + "page_text_chars": 5882, + "fetched_at": "2026-05-25T19:11:13.388036+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md new file mode 100644 index 00000000..8709f83b --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-sd-web.md @@ -0,0 +1,104 @@ +# 2025 Performance Summary, Syngenta Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20SD%20web.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Valda, AP Dagr, AP Iconic, AP Elevate, AP Murdock, AP Gunsmoke CL2, SY Ingmar, AP Revolution, LCS Trigger + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Performance Summary, Syngenta Data +South Dakota +2025 Yield bu/ac +South Dakota Protein Test Wt. Heading Height +Variety Avg. 
Agar Miller Northville Selby % lbs/bu 1-9 1-9 +AgriPro HY141 59.4 35.8 57.6 72.0 72.1 15.9 57.1 5 6 +SY Valda 56.8 35.3 63.3 61.2 67.3 16.0 58.2 5 5 +AP Dagr 56.4 29.7 54.5 69.8 71.5 16.0 55.6 6 4 +AgriPro HY155 56.2 31.1 57.3 65.6 70.9 16.9 57.6 5 6 +AgriPro HY162 54.8 30.4 52.2 68.2 68.5 16.2 56.7 5 6 +AP Iconic 54.1 29.0 52.0 68.9 66.4 17.0 56.3 5 6 +AP Elevate 53.3 27.1 51.1 71.5 63.8 16.8 56.2 6 4 +AP Murdock 48.5 29.8 46.6 51.0 66.7 17.5 57.1 4 4 +AP Gunsmoke CL2 47.6 30.1 44.8 64.5 51.2 17.8 55.7 5 5 +SY Ingmar 44.3 23.1 40.5 59.3 54.3 17.8 57.2 5 5 +AP Revolution 43.0 31.5 35.7 38.9 66.1 17.3 57.9 4 4 +LCS Trigger 63.5 34.0 67.6 77.8 74.6 15.2 58.1 6 6 +ND Stampede 60.2 47.3 54.3 66.5 72.6 17.1 58.2 5 5 +Brawn-SD 58.8 42.1 55.4 69.0 68.9 15.4 59.5 5 NA +WB9641 57.0 32.2 58.1 71.6 66.0 16.0 56.8 6 5 +MN-Torgy 56.2 34.0 54.0 67.4 69.4 17.9 58.9 6 6 +WB9645 55.9 37.5 51.3 67.2 67.6 16.1 56.8 7 6 +Ascend-SD 54.7 33.2 51.1 66.3 68.3 16.9 57.7 6 8 +Driver 52.0 30.9 44.4 70.2 62.5 16.7 57.4 5 7 +WB9642 49.4 34.7 30.0 65.8 67.0 16.4 59.1 6 4 +WB9590 42.6 41.3 25.8 50.4 52.9 17.9 57.1 4 4 +Mean 53.7 33.7 50.3 64.4 66.3 16.7 57.4 +LSD (5%) 8.4 7.1 9.9 5.8 1.0 2.4 +CV (%) 9.7 9.9 9.0 13.0 4.3 2.6 1.2 +No. of Locs. 4 2 2 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= early; Height: 1 = short; Lodging: 1 = no lodging; Disease 1 = tolerant +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause +variations within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific +evaluations. 
Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific +evaluation inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly +100% hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. NP - 10/2025 + +Three-Year Performance Summary, Syngenta Data (2023-2025) +South Dakota +Yield Average bu/ac Economic Return1 Agronomics and Disease +Protein Test Wt. Gross Heading Height Lodging BLS FHB +Variety 3-yr 2-yr 2025 % lb/bu $/A Rank 1-9 1-9 1-9 1-9 1-9 +AgriPro HY141 65.0 65.7 59.4 15.4 60.2 370.0 3 5 6 6 5 4 +AP Dagr 63.1 62.5 56.4 15.6 59.8 364.2 10 6 4 5 4 5 +AgriPro HY162 62.7 61.4 54.8 15.7 59.9 364.5 9 5 6 5 5 4 +AP Iconic 62.2 62.1 54.1 16.2 60.1 368.4 5 5 6 3 4 4 +SY Valda 62.2 62.5 56.8 16.1 60.5 368.3 6 5 5 5 4 4 +AgriPro HY155 61.8 61.5 56.2 16.5 60.3 365.8 7 5 6 5 5 4 +AP Elevate 61.6 60.9 53.3 16.3 60.2 364.9 8 6 4 3 4 4 +AP Gunsmoke CL2 56.9 55.6 47.6 16.8 60.0 336.9 12 5 5 3 5 4 +AP Murdock 53.1 54.7 48.5 16.9 60.2 314.3 13 4 4 4 4 4 +AP Revolution 52.8 53.2 43.0 16.9 60.7 312.9 14 4 4 4 3 3 +SY Ingmar 52.4 51.7 44.3 17.2 60.7 309.9 15 5 5 3 3 3 +LCS Trigger 71.9 70.9 63.5 14.3 60.9 376.2 2 6 6 NA 4 NA +Brawn-SD 64.5 62.1 58.8 15.5 61.7 369.3 4 5 NA NA 4 5 +Ascend-SD 64.4 62.5 54.7 16.7 60.8 381.2 1 6 8 7 3 4 +Driver 60.5 58.5 52.0 16.3 60.5 358.1 11 5 7 NA NA NA +ND Stampede 60.2 5 5 5 4 5 +WB9641 57.0 6 5 4 6 5 +MN-Torgy 56.2 6 6 5 3 
3 +WB9645 55.9 7 6 5 6 5 +WB9642 49.4 6 4 6 6 4 +WB9590 42.6 4 4 2 6 6 +Mean 61.0 60.4 53.7 16.2 60.4 +LSD (5%) 4.3 4.5 8.4 0.7 0.4 +CV (%) 7.7 7.9 9.74 0.8 2.2 +No. of Locs. 10 8 4 6 6 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= early; Height: 1 = short; Lodging: 1 = no lodging; Disease 1 = tolerant +2025 Locations: Agar, Miller, Northville, and Selby SD +2024 Locations: Agar, Roscoe, Northville, and Selby, SD +2023 Locations: Selby, and Webster, SD +1 Economic return calculated by using the three-year yield average multiplied by the average grain price ($5.12/bu). (+) 8 cents per 1/5th premium over 14% protein, (-) 10 cents +per 1/5 discount under 14% protein +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025crop year. Specific conditions may cause variations +within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific evaluations. +Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific evaluation +inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. 
NP - 10/2025 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-east.json b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.json new file mode 100644 index 00000000..ecc20dab --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.json @@ -0,0 +1,36 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2025-np-perf-data-web-east", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Performance Summary, Syngenta Data", + "filename": "2025%20NP%20Perf%20Data%20web%20East.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Iconic", + "AP Elevate", + "SY Valda", + "AP Murdock", + "AP Smith", + "AP Dagr", + "AP Gunsmoke CL2", + "SY", + "SY Ingmar", + "LCS Boom" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20East.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20East.pdf" + ], + "page_text_chars": 7194, + "fetched_at": "2026-05-25T19:11:10.521992+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-east.md b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.md new file mode 100644 index 00000000..b9d1ea9a --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-east.md @@ -0,0 +1,114 @@ +# 2025 Performance Summary, Syngenta Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20East.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Iconic, AP Elevate, SY Valda, AP Murdock, AP Smith, AP Dagr, AP Gunsmoke CL2, 
SY, SY Ingmar, LCS Boom + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Performance Summary, Syngenta Data +Eastern North Dakota and Minnesota +2025 Yield bu/ac +Test +Minnesota North Dakota +Prot. Wt. +Variety Avg. Crookston Glyndon Warren Wolverton Cando Casselton Langdon McVille Park River Thompson % lbs/bu +AgriPro HY141 90.8 87.2 75.8 85.7 90.7 88.4 103.5 108.6 92.7 82.0 93.9 13.5 59.5 +AgriPro HY155 90.8 93.7 74.7 93.3 88.3 84.6 91.8 101.3 99.1 85.3 96.1 14.0 60.2 +AgriPro HY162 86.7 85.8 73.1 88.9 92.5 75.8 86.3 103.8 90.1 82.3 88.6 13.5 60.2 +AP Iconic 83.5 92.2 69.0 75.1 76.9 86.8 85.5 104.1 86.6 73.6 85.5 13.5 58.7 +AP Elevate 81.9 84.3 66.1 74.2 74.9 78.1 81.2 104.7 93.9 73.2 87.9 13.5 59.7 +SY Valda 81.2 74.6 71.6 94.7 73.2 81.0 82.8 91.0 82.5 72.9 87.3 13.9 58.4 +AP Murdock 80.2 90.0 64.7 85.2 78.6 75.7 82.3 96.2 87.4 55.3 87.1 13.5 59.5 +AP Smith 80.2 80.8 64.9 83.0 70.3 71.3 85.0 91.5 95.1 71.6 88.6 14.4 59.1 +AP Dagr 79.0 79.2 68.1 90.5 71.2 75.5 82.1 88.1 79.5 59.0 97.0 13.2 57.6 +AP Gunsmoke CL2 77.4 81.7 63.1 79.7 67.8 82.3 83.7 93.5 88.1 49.9 83.9 13.3 58.1 +SY 611 CL2 76.7 78.0 63.1 73.2 71.1 74.4 77.3 100.8 83.3 62.7 82.8 14.5 58.9 +SY Ingmar 74.8 70.0 62.1 80.8 66.6 69.4 85.0 89.0 73.1 65.0 86.7 14.5 59.1 +ND Stampede 87.1 86.8 69.0 90.3 88.7 77.2 91.0 100.5 100.9 76.2 90.4 15.1 58.8 +WB9645 85.7 96.2 72.7 92.2 73.9 74.2 95.1 103.0 99.2 69.1 81.4 13.3 58.9 +Ascend-SD 85.5 95.6 69.0 88.8 90.5 79.7 88.3 93.1 90.6 78.0 81.7 14.1 60.5 +WB9641 85.5 74.7 64.2 93.8 75.0 88.1 92.1 104.8 98.3 65.4 98.9 12.7 58.6 +WB9590 84.8 81.3 63.8 88.1 76.9 77.0 86.2 103.1 93.0 77.0 101.4 14.8 59.5 +WB9606 83.8 87.4 70.8 90.6 69.7 78.8 96.4 95.5 94.2 75.3 79.0 12.9 60.5 +Faller 82.5 88.9 62.2 92.1 69.4 83.8 83.0 98.6 87.6 66.5 92.6 14.0 58.9 +WB9642 81.9 83.7 65.7 92.0 77.8 81.1 80.8 91.9 85.2 73.2 88.0 13.6 60.9 +MN-Torgy 80.9 89.4 65.1 84.6 80.9 82.0 76.0 92.6 94.5 70.3 73.7 14.6 60.5 +MN-Rothsay 80.8 82.2 65.8 91.1 69.7 82.2 80.5 96.1 91.1 60.7 88.3 
13.9 58.8 +WB9719 75.9 84.6 60.3 88.8 63.8 72.7 79.1 97.8 78.4 42.9 90.7 13.3 57.5 +LCS Boom 74.3 84.0 52.5 77.5 67.6 79.3 76.5 91.7 91.0 46.4 76.9 14.2 60.6 +ND Thresher 73.3 81.9 56.3 86.9 71.9 73.7 70.5 86.7 70.2 56.2 78.9 14.4 56.6 +Mean 82.2 84.8 66.4 86.9 76.6 79.2 85.2 97.5 89.2 68.3 87.6 13.8 59.2 +LSD (5%) 4.8 11.5 4.7 13.2 13.1 11.0 — 10.6 10.3 9.2 11.4 0.5 0.7 +CV (%) 6.5 6.6 4.3 7.3 8.2 8.4 12.1 6.6 7.0 7.8 6.4 6.9 2.1 +No. of Locs. 10 8 10 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= Early, Height: 1 = Short +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause variations +within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific evaluations. +Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific evaluation +inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. 
NP - 10/2025 + +Three-Year Performance Summary, Syngenta Data (2023-2025) +Eastern North Dakota and Minnesota +Yield Average bu/ac Economic Return1 Agronomics and Disease +Protein Test Wt. Heading Height Lodging BLS FHB +Variety 3-yr 2-yr 2025 % lb/bu Gross $/A Rank 1-9 1-9 1-9 1-9 1-9 +AgriPro HY155 92.4 91.5 90.8 14.4 60.2 488.8 1 5 6 5 5 4 +AgriPro HY141 91.7 90.6 90.8 13.8 59.8 461.9 5 5 6 6 5 4 +AgriPro HY162 91.0 89.3 86.7 13.6 60.3 450.0 8 5 6 5 5 4 +AP Iconic 87.5 87.1 83.5 14.0 59.4 446.3 9 5 6 3 4 4 +AP Elevate 85.9 86.0 81.9 14.5 60.1 458.5 6 6 4 3 4 4 +AP Dagr 85.7 82.6 79.0 13.7 58.7 426.2 17 6 4 5 4 5 +SY Valda 85.5 84.5 81.2 14.2 58.8 446.2 10 5 5 5 4 4 +AP Smith 82.6 82.0 80.2 14.8 59.6 450.3 7 6 4 2 3 4 +AP Murdock 82.4 82.4 80.2 14.3 60.0 431.5 16 4 4 4 4 4 +AP Gunsmoke CL2 81.2 80.9 77.4 14.6 58.9 434.8 15 5 5 3 5 4 +SY 611 CL2 80.7 79.5 76.7 14.9 59.3 443.4 11 5 4 4 4 3 +SY Ingmar 79.8 79.7 74.8 15.0 59.7 440.1 13 5 5 3 3 3 +WB9606 87.7 85.8 83.8 13.8 60.9 438.9 14 5 6 5 6 5 +MN-Rothsay 86.3 84.3 80.8 14.8 59.9 468.5 3 7 3 3 4 5 +WB9590 85.5 84.4 84.8 15.1 59.3 476.8 2 4 4 2 6 6 +Faller 85.0 82.9 82.5 14.2 59.3 443.3 12 6 7 7 3 3 +MN-Torgy 84.0 82.4 80.9 15.1 60.9 466.3 4 6 6 5 3 3 +WB9719 81.6 77.9 75.9 14.0 58.6 417.1 18 6 5 2 5 6 +Ascend-SD 84.3 85.5 6 8 7 3 4 +ND Thresher 74.4 73.3 6 5 7 3 5 +ND Stampede 87.1 5 5 5 4 5 +WB9645 85.7 7 6 5 6 5 +WB9641 85.5 6 5 4 6 5 +WB9642 81.9 6 4 6 6 4 +LCS Boom 74.3 3 5 4 6 4 +Mean 85.4 83.6 82.5 14.6 59.7 +LSD (5%) 3.3 4.0 4.8 0.5 0.8 +CV (%) 6.7 6.9 6.5 2.8 2.1 +No. of Locs. 23 16 10 22 21 +Numbers in bold type are in the top yielding group and considered statistically similar. 
+Numerical ratings: Heading: 1= early; Height: 1 = short; Disease: 1 = no disease +2025 Locations: Crookston, Glyndon, Warren, and Wolverton MN; Cando, Casselton, Langdon, McVille, Park River, and Thompson ND +2024 Locations: Casselton, Drayton, Langdon, and Park River, ND; Crookston and Warren, MN +2023 Locations: Casselton, McVille, Park River, and Valley City, ND; Crookston, Glyndon, and Warren, MN +1 Economic return calculated by using the three-year yield average multiplied by the average grain price ($5.12/bu). (+) 8 cents per 1/5th premium over 14% protein, (-) 10 cents +per 1/5 discount under 14% protein. +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause +variations within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific +evaluations. Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific +evaluation inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. 
NP - 10/2025 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-west.json b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.json new file mode 100644 index 00000000..20609df4 --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.json @@ -0,0 +1,36 @@ +{ + "source": "agripro_trials", + "source_key": "agt-2025-np-perf-data-web-west", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Performance Summary, Syngenta Data", + "filename": "2025%20NP%20Perf%20Data%20web%20west.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Iconic", + "AP Elevate", + "SY", + "SY Valda", + "AP Smith", + "AP Dagr", + "AP Gunsmoke CL2", + "AP Murdock", + "SY Ingmar", + "LCS Boom" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20west.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20west.pdf" + ], + "page_text_chars": 6380, + "fetched_at": "2026-05-25T19:11:11.464402+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-2025-np-perf-data-web-west.md b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.md new file mode 100644 index 00000000..77804829 --- /dev/null +++ b/corpus/agripro_trials/agt-2025-np-perf-data-web-west.md @@ -0,0 +1,113 @@ +# 2025 Performance Summary, Syngenta Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-11/2025%20NP%20Perf%20Data%20web%20west.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Iconic, AP Elevate, SY, SY Valda, AP Smith, AP Dagr, AP Gunsmoke CL2, AP 
Murdock, SY Ingmar, LCS Boom + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Performance Summary, Syngenta Data +Western North Dakota +2025 Yield bu/ac +North Dakota +Prot. Test Wt. +Variety Avg. Berthold Coleharbor Harvey New Leipzig Steele Velva % lbs/bu +AgriPro HY162 78.9 45.9 110.4 89.1 67.3 61.1 99.3 14.6 60.2 +AgriPro HY155 77.5 45.4 110.9 85.1 65.2 60.3 98.0 15.3 59.7 +AP Iconic 75.8 50.2 100.4 88.9 67.4 55.2 92.7 14.5 59.9 +AgriPro HY141 75.1 46.8 101.8 75.8 68.4 59.1 98.6 14.3 60.0 +AP Elevate 74.6 52.8 104.9 81.6 57.1 61.4 89.6 14.9 60.7 +SY 611 CL2 73.3 54.0 98.7 74.3 66.3 55.9 90.5 15.5 60.3 +SY Valda 72.5 56.7 94.3 78.3 62.1 52.0 91.7 14.5 58.9 +AP Smith 72.1 56.9 98.2 81.9 46.9 58.8 90.1 15.2 59.9 +AP Dagr 70.7 59.2 94.3 63.9 60.9 53.3 92.4 14.4 59.6 +AP Gunsmoke CL2 70.5 51.2 92.5 77.9 61.0 53.5 87.0 15.5 60.5 +AP Murdock 69.2 50.7 97.5 69.7 64.7 48.1 84.2 15.0 60.8 +SY Ingmar 65.8 49.0 95.8 68.5 57.3 42.7 81.4 15.5 58.7 +ND Stampede 78.1 51.0 107.3 86.8 62.0 68.5 93.2 16.1 60.3 +WB9641 76.8 58.8 104.9 87.2 51.6 62.8 95.4 14.1 59.8 +Faller 76.5 47.1 98.6 85.1 63.6 68.1 96.6 15.2 60.4 +WB9645 75.6 51.3 100.9 76.0 63.4 58.9 102.8 14.1 60.6 +MN-Torgy 75.1 58.5 93.0 83.0 68.5 63.2 84.6 15.8 61.8 +LCS Boom 75.0 49.2 100.7 87.7 70.8 54.1 87.7 15.0 62.5 +WB9606 74.0 49.1 109.1 84.1 52.7 55.5 93.3 14.4 60.6 +Ascend-SD 73.0 50.7 98.9 84.4 50.3 68.4 85.0 15.1 62.1 +WB9590 72.2 52.6 104.4 74.8 59.4 49.8 92.1 15.6 60.4 +WB9642 71.8 48.0 96.3 66.9 72.3 57.1 90.4 14.9 60.4 +WB9719 71.7 50.4 98.6 71.1 69.1 53.2 87.9 15.0 59.7 +MN-Rothsay 71.0 48.0 100.7 89.1 36.5 61.4 90.5 13.5 60.4 +ND Thresher 68.4 49.1 93.5 68.8 58.3 55.6 85.0 15.8 59.2 +Mean 73.7 51.3 100.8 79.1 61.3 57.8 91.7 14.9 60.3 +LSD (5%) 6.6 10.4 12.5 8.9 — 6.2 1.1 1.6 +CV (%) 7.7 11.8 6.3 9.6 8.7 — 3.2 5.1 2.3 +No. of Locs. 6 5 6 +Numbers in bold type are in the top yielding group and considered statistically similar. 
+Numerical ratings: Heading: 1= Early, Height: 1 = Short +These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause variations +within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific evaluations. +Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific evaluation +inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. NP - 10/2025 + +Three-Year Performance Summary, Syngenta Data (2023-2025) +Western North Dakota +Yield Average bu/ac Economic Return1 Agronomics and Disease +Protein Test Wt. 
Heading Height Lodging BLS FHB +Variety 3-yr 2-yr 2025 % lb/bu Gross $/A Rank 1-9 1-9 1-9 1-9 1-9 +AgriPro HY162 84.9 81.3 78.9 14.2 60.0 440.5 4 5 6 5 5 4 +AP Iconic 83.4 78.5 75.8 14.3 59.9 438.3 7 5 6 3 4 4 +AgriPro HY141 83.3 79.6 75.1 14.1 59.9 430.5 11 5 6 6 5 4 +AgriPro HY155 82.5 79.4 77.5 14.8 59.4 449.9 2 5 6 5 5 4 +AP Elevate 80.4 76.9 74.6 14.8 60.3 438.6 6 6 4 3 4 4 +AP Dagr 79.8 73.4 70.7 14.1 59.5 412.3 16 6 4 5 4 5 +SY Valda 79.5 73.4 72.5 14.3 59.1 416.3 15 5 5 5 4 4 +SY 611 CL2 78.2 73.5 73.3 15.1 59.7 435.7 8 5 4 4 4 3 +AP Smith 77.9 74.0 72.1 15.0 59.8 431.3 10 6 4 2 3 4 +AP Gunsmoke CL2 76.9 72.8 70.5 15.4 60.1 435.7 9 5 5 3 5 4 +AP Murdock 76.6 74.1 69.2 14.9 60.5 418.6 14 4 4 4 4 4 +SY Ingmar 74.7 71.4 65.8 15.4 59.5 424.5 13 5 5 3 3 3 +WB9606 82.9 78.6 74.0 14.1 60.2 429.1 12 5 6 5 6 5 +Faller 81.9 76.9 76.5 14.7 59.8 443.0 3 6 7 7 3 3 +WB9719 80.6 75.7 71.7 14.9 59.4 440.1 5 6 5 2 5 6 +WB9590 78.4 75.7 72.2 15.7 59.9 453.6 1 4 4 2 6 6 +MN-Torgy 77.2 75.1 6 6 5 3 3 +Ascend-SD 75.0 73.0 6 8 7 3 4 +MN-Rothsay 74.0 71.0 7 3 3 4 5 +ND Thresher 69.2 68.4 6 5 7 3 5 +ND Stampede 78.1 5 5 5 4 5 +WB9641 76.8 6 5 4 6 5 +WB9645 75.6 7 6 5 6 5 +LCS Boom 75.0 3 5 4 6 4 +WB9642 71.8 6 4 6 6 4 +Mean 80.1 75.5 73.7 14.7 59.8 +LSD (5%) 4.3 5.7 6.6 0.5 1.2 +CV (%) 6.9 8.1 7.7 5.9 2.2 +No. of Locs. 13 9 6 8 9 +Numbers in bold type are in the top yielding group and considered statistically similar. +Numerical ratings: Heading: 1= early; Height: 1 = short; Disease: 1 = no disease +2025 Locations: Berthold, Coleharbor, Harvey, New Leipzig, Steele, and Velva ND +2024 Locations: Coleharbor, Harvey, and Velva, ND +2023 Locations: Berthold, Coleharbor, New Leipzig, and Velva, ND +1 Economic return calculated by using the three-year yield average multiplied by the average grain price ($5.12/bu). (+) 8 cents per 1/5th premium over 14% protein, (-) 10 cents +per 1/5 discount under 14% protein. 
+These agronomic assessments are made by Syngenta scientists and reflect each variety’s relative performance within these characteristics through the 2025 crop year. Specific conditions may cause +variations within those characteristics. These relative protection values are based on current pest and disease populations. These have been known to shift periodically and may cause changes in specific +evaluations. Resistance to many other diseases and pests is sensitive to environmental conditions, plant development stages and the presence and intensity of other diseases which may result in specific +evaluation inconsistencies. This chart is updated annually to reflect the most current trends. +AgriPro hybrid wheat seed sold commercially contains 75-95% hybrid seed, as required by the Federal Seed Act. Plot trial data for AgriPro hybrids represents performance using seed lots with nearly 100% +hybrid seed. +© 2025 Syngenta. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. Some or all of the varieties may be protected under one or more of the following: Plant Variety +Protection, United States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. 
NP - 10/2025 +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json new file mode 100644 index 00000000..115f73ec --- /dev/null +++ b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-central-plains-dryland-2025-r1", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Central Plains Dryland Summary, Three-Year Data", + "filename": "Central%20Plains%20Dryland%202025%20r1.pdf", + "region": "Central Plains", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Wolverine", + "AP Roadrunner", + "AP Sunbird", + "AP Bigfoot", + "AP Prolific", + "SY Monument", + "LCS Atomic AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Central%20Plains%20Dryland%202025%20r1.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/Central%20Plains%20Dryland%202025%20r1.pdf" + ], + "page_text_chars": 2530, + "fetched_at": "2026-05-25T19:11:09.410687+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md new file mode 100644 index 00000000..d7698f4b --- /dev/null +++ b/corpus/agripro_trials/agt-central-plains-dryland-2025-r1.md @@ -0,0 +1,56 @@ +# 2025 Central Plains Dryland Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Central Plains +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Central%20Plains%20Dryland%202025%20r1.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY 
Wolverine, AP Roadrunner, AP Sunbird, AP Bigfoot, AP Prolific, SY Monument, LCS Atomic AX + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Central Plains Dryland Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Belleville, Junction City, Pratt, Salina, +Variety (2023-2025) (2024-2025) (2025) KS* KS KS KS +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A +AP24 AX 63.9 58.8 70.7 58.9 73.8 55.3 64.8 94.7 59.0 76.7 +SY Wolverine 62.0 60.6 66.2 60.6 66.1 56.9 58.2 86.2 67.9 51.9 +AP Roadrunner 61.3 57.8 64.9 57.4 61.3 53.9 54.1 79.7 52.9 58.5 +AP Sunbird 61.2 61.0 67.2 61.3 66.6 57.9 59.0 86.7 54.3 66.5 +AP Bigfoot 60.0 61.3 65.2 61.5 64.7 59.0 53.1 86.6 53.3 65.9 +AP Prolific 59.6 60.4 64.1 60.4 61.8 58.2 46.0 86.2 45.8 69.2 +SY Monument 59.2 59.6 62.5 59.8 56.8 55.9 48.2 71.6 48.9 58.6 +Bob Dole 56.4 59.1 60.2 58.9 57.7 55.2 47.0 68.3 50.9 64.6 +Showdown 62.9 60.1 69.2 60.1 65.2 55.7 56.4 83.0 54.7 66.6 +Rockstar 61.7 58.8 67.4 59.1 65.5 55.8 54.0 83.8 55.9 68.3 +KS Providence 61.1 60.4 66.3 60.3 65.2 57.2 52.5 88.9 59.5 59.7 +WB4401 60.4 60.3 65.5 60.2 61.5 56.1 51.7 78.8 57.0 58.5 +LCS Atomic AX 59.4 61.7 65.2 61.8 65.1 59.2 52.3 88.0 50.0 70.1 +WB4523 58.2 59.5 62.8 59.6 57.5 56.2 40.6 68.2 60.0 61.2 +WB4422 69.6 59.7 71.0 56.3 62.5 87.3 63.6 70.6 +KS Bill Snyder 73.1 59.0 59.3 94.6 57.4 81.2 +WB4699 64.0 55.3 51.0 84.2 56.9 64.1 +High Cotton 63.8 58.2 43.8 90.9 59.4 61.2 +Polansky Goldenhawk 63.0 56.4 52.7 86.5 55.9 56.8 +Doublestop CLP 60.7 57.7 53.9 74.0 51.3 63.4 +Mean General 60.6 60.0 65.9 60.0 63.7 56.7 51.7 83.6 54.3 65.4 +LSD General (5%) EE 3.9 1.1 4.6 1.2 8.3 2.7 9.5 10.7 8.1 9.9 +CV (Effective) 9.0 2.4 8.8 2.5 9.2 3.3 11.2 7.8 9.1 9.2 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. 
+* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. +Locations +2023 — Belleville, Conway Springs, Junction City, and Salina, KS; Carrier, OK +2024 — Belleville, Conway Springs, Junction City, Palco, Pratt, and Salina, KS; Carrier, OK +2025 — Belleville, Junction City, Pratt, and Salina, KS +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-montana-2025-web.json b/corpus/agripro_trials/agt-montana-2025-web.json new file mode 100644 index 00000000..5da370fd --- /dev/null +++ b/corpus/agripro_trials/agt-montana-2025-web.json @@ -0,0 +1,32 @@ +{ + "source": "agripro_trials", + "source_key": "agt-montana-2025-web", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Montana Summary, Three-Year Data", + "filename": "Montana%202025%20web.pdf", + "region": "Montana", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Solid", + "SY Monument", + "AP Sunbird", + "SY", + "LCS Steel AX", + "LCS Julep" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-08/Montana%202025%20web.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-08/Montana%202025%20web.pdf" + ], + "page_text_chars": 1922, + "fetched_at": "2026-05-25T19:11:12.271213+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-montana-2025-web.md b/corpus/agripro_trials/agt-montana-2025-web.md new file mode 100644 
index 00000000..4f1e8ad6 --- /dev/null +++ b/corpus/agripro_trials/agt-montana-2025-web.md @@ -0,0 +1,53 @@ +# 2025 Montana Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Montana +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-08/Montana%202025%20web.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Solid, SY Monument, AP Sunbird, SY, LCS Steel AX, LCS Julep + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Montana Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Fort Benton, Conrad, Billings, Sawfly +Variety (2023-2025) (2024-2025) (2025) MT MT MT Protein Damage +Hard Red Yield Yield Yield TWT Yield Yield Yield % 1-9 +Winter Wheat Bu/A Bu/A Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP24 AX 63.4 62.1 71.8 61.3 59.5 47.0 109.0 12.2 5 +AP Solid 63.4 61.9 71.6 63.9 58.7 43.0 113.1 13.4 4 +SY Monument 62.5 62.5 72.9 61.6 58.9 42.4 117.3 12.1 5 +AP Sunbird 61.5 61.8 70.7 61.1 61.7 38.9 111.4 12.6 6 +AP18 AX 59.0 58.3 68.5 62.9 56.3 41.2 108.1 12.7 6 +SY 517 CL2 53.4 53.1 63.4 56.3 55.4 33.7 101.2 13.9 5 +Keldin 65.6 63.5 75.0 61.9 69.5 39.4 115.9 12.9 5 +Bobcat 63.5 63.2 73.2 62.5 68.5 47.8 103.4 13.1 2 +MT WarCat 61.4 59.5 69.7 62.9 58.6 42.3 108.4 13.2 3 +Warhorse 57.9 55.3 64.5 54.8 57.1 32.8 103.6 14 2 +WB4523 63.8 76.9 62.0 69.8 42.4 118.6 11.6 4 +WB4483 60.0 68.4 54.8 61.0 35.9 108.1 13.8 5 +StandClear CLP 59.2 68.2 62.6 61.2 39.7 103.8 13.2 5 +Scorpio 56.9 68.1 53.6 61.9 34.2 108.2 12.5 6 +WB4733 CLP 56.3 64.5 62.3 55.2 38.3 99.9 13.9 4 +DG Ramsay 76.9 62.0 73.4 37.5 119.9 13 5 +LCS Steel AX 75.2 62.7 62.0 44.4 119.2 12.2 5 +4739AX 73.9 62.8 67.6 43.2 110.8 13.3 4 +WB4510CLP 72.3 63.2 64.1 34.5 118.4 12.2 6 +LCS Julep 69.9 63.2 62.8 39.1 107.9 13.4 5 +Sawfly: Locations +1-2 = Excellent 2023 — 
Conrad and Fort Benton, MT +3-4 = Very Good 2024 — Billings, Chester, Conrad, and Fort Benton, MT +5 = Good 2025 — Billings, Conrad, and Fort Benton, MT +6-7 = Fair +8-9 = Poor +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-ne-colorado-2025.json b/corpus/agripro_trials/agt-ne-colorado-2025.json new file mode 100644 index 00000000..6875c558 --- /dev/null +++ b/corpus/agripro_trials/agt-ne-colorado-2025.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-ne-colorado-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Northeast Colorado Dryland Summary, Three-Year Data", + "filename": "NE%20Colorado%202025.pdf", + "region": "NE Colorado", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Sunbird", + "AP Bigfoot", + "SY Wolverine", + "AP Solid", + "AP Roadrunner", + "SY Monument", + "WB-Grainfield" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/NE%20Colorado%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/NE%20Colorado%202025.pdf" + ], + "page_text_chars": 2389, + "fetched_at": "2026-05-25T19:11:01.256427+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-ne-colorado-2025.md b/corpus/agripro_trials/agt-ne-colorado-2025.md new file mode 100644 index 00000000..0ff5f105 --- /dev/null +++ b/corpus/agripro_trials/agt-ne-colorado-2025.md @@ -0,0 +1,58 @@ +# 2025 Northeast Colorado 
Dryland Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** NE Colorado +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/NE%20Colorado%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Sunbird, AP Bigfoot, SY Wolverine, AP Solid, AP Roadrunner, SY Monument, WB-Grainfield + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Northeast Colorado Dryland Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Crook, Julesburg, Yuma, +Variety (2023-2025) (2024-2025) (2025) CO CO* CO +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A +AP Sunbird 81.5 59.5 82.7 60.0 86.5 57.9 61.6 102.9 95.1 +AP24 AX 79.2 56.8 79.8 57.1 86.7 54.9 63.7 97.9 98.5 +AP18 AX 78.5 57.2 78.7 57.4 85.0 55.2 59.5 101.2 94.3 +AP Bigfoot 76.5 59.0 76.6 58.9 79.0 56.6 60.4 86.6 90.0 +SY Wolverine 75.5 59.1 77.1 59.7 80.5 57.2 61.9 91.0 88.5 +AP Solid 73.3 59.2 73.9 59.4 76.9 56.4 62.4 86.0 82.4 +AP Roadrunner 72.3 56.8 73.6 56.7 80.1 54.6 63.9 90.5 86.0 +SY Monument 67.5 58.2 65.4 58.6 70.3 56.7 62.4 67.7 80.7 +AG Golden 78.3 56.4 79.0 56.7 86.1 54.9 71.2 93.6 93.4 +Langin 78.2 57.4 81.4 57.7 86.7 55.8 62.6 95.5 102.0 +WB4422 77.3 59.5 78.4 59.9 83.2 57.7 65.8 90.7 92.9 +KS Dallas 76.3 59.7 72.9 59.5 76.0 57.4 55.0 82.3 90.8 +WB4595 75.8 60.2 76.9 60.2 81.4 57.3 63.7 90.1 90.4 +High Country 75.4 59.2 75.1 59.1 79.2 57.0 60.4 89.6 87.7 +Amplify SF 73.7 58.8 72.1 59.0 73.8 56.5 61.6 77.7 82.2 +KS Hamilton 72.2 58.5 71.1 58.5 76.3 56.1 54.5 78.0 96.3 +TAM 115 65.5 60.4 63.8 60.2 70.5 58.3 49.8 85.0 76.6 +Kivari AX 78.1 57.7 83.3 55.9 63.3 87.1 99.5 +WB-Grainfield 86.3 57.3 59.0 99.8 100.0 +Canvas 85.1 55.9 67.4 94.4 93.7 +KS Bill Snyder 80.4 56.9 63.2 92.9 85.3 +KS Mako 
78.4 57.2 58.1 84.6 92.3 +Mean General 75.2 58.6 75.5 58.7 79.5 56.4 60.4 88.8 89.4 +LSD General (5%) EE 5.5 1.2 6.2 1.5 9.6 0.0 8.3 16.0 13.0 +CV (Effective) 8.1 1.9 8.6 2.0 9.9 2.4 8.3 11.0 8.9 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. +Locations +2023 — Julesburg and Yuma, CO; Colby, KS +2024 — Crook and Julesburg, CO; Ingalls, KS +2025 — Crook, Julesburg, and Yuma, CO +© 2024 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-plains-irrigated-2025.json b/corpus/agripro_trials/agt-plains-irrigated-2025.json new file mode 100644 index 00000000..110b7a40 --- /dev/null +++ b/corpus/agripro_trials/agt-plains-irrigated-2025.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-plains-irrigated-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Plains Irrigated Summary, Three-Year Data", + "filename": "Plains%20Irrigated%202025.pdf", + "region": "Plains Irrigated", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Wolverine", + "AP Sunbird", + "AP Prolific", + "AP Bigfoot", + "SY Grit", + "AP Roadrunner", + "SY Monument" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Plains%20Irrigated%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + 
"https://agriprowheat.com/sites/default/files/2025-07/Plains%20Irrigated%202025.pdf" + ], + "page_text_chars": 2164, + "fetched_at": "2026-05-25T19:11:03.219401+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-plains-irrigated-2025.md b/corpus/agripro_trials/agt-plains-irrigated-2025.md new file mode 100644 index 00000000..a876b9be --- /dev/null +++ b/corpus/agripro_trials/agt-plains-irrigated-2025.md @@ -0,0 +1,52 @@ +# 2025 Plains Irrigated Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Plains Irrigated +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Plains%20Irrigated%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Wolverine, AP Sunbird, AP Prolific, AP Bigfoot, SY Grit, AP Roadrunner, SY Monument + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Plains Irrigated Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Hugoton, Scott City, Imperial, Dalhart, +Variety (2023-2025) (2024-2025) (2025) KS KS NE TX +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A +SY Wolverine 95.0 57.6 94.8 58.6 94.5 59.9 99.9 102.4 95.4 80.3 +AP Sunbird 93.9 58.8 94.7 59.1 89.5 59.7 74.8 107.8 96.8 78.5 +AP Prolific 93.6 59.8 92.6 60.1 92.3 61.3 104.9 95.7 92.6 76.0 +AP Bigfoot 90.9 59.1 90.5 59.2 87.3 59.7 67.6 111.3 91.7 78.7 +SY Grit 89.2 58.1 90.9 58.7 91.8 58.9 96.4 94.8 96.9 79.0 +AP Roadrunner 88.7 56.7 89.3 57.2 81.7 57.1 57.6 108.2 83.8 77.1 +SY Monument 83.8 57.1 83.0 57.7 79.3 59.5 77.3 79.6 83.7 76.6 +WB4422 94.9 59.2 95.8 60.0 90.9 61.1 107.0 98.5 89.0 69.1 +TAM 114 91.7 60.7 93.6 61.2 89.1 61.7 77.6 107.9 92.5 78.3 +Canvas 89.6 58.0 92.2 59.2 84.5 59.4 68.1 101.9 86.1 81.7 
+WB4792 89.3 56.8 92.1 58.0 88.3 57.9 94.6 104.7 73.0 80.8 +Epoch 88.9 58.4 88.5 58.9 88.6 59.6 101.9 97.4 87.0 68.0 +Langin 88.5 58.8 90.8 59.4 84.9 60.1 62.9 99.3 91.9 85.5 +TAM 115 76.3 59.2 76.2 59.7 76.5 62.0 63.8 93.4 75.4 73.5 +WB4523 93.5 58.4 90.9 111.5 93.4 78.1 +WB4303 93.1 57.4 102.9 93.8 95.7 80.2 +KS Mako 90.1 60.7 88.9 101.2 96.8 73.3 +Mean General 88.9 57.8 90.0 58.5 87.5 59.2 85.9 99.3 87.6 77.1 +LSD General (5%) EE 8.0 1.8 9.2 1.9 13.2 2.3 18.8 9.4 8.8 8.6 +CV (Effective) 10.2 3.5 10.6 3.0 8.5 2.6 13.4 5.8 6.1 6.8 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +Locations +2023 — Ingalls, KS; Imperial, NE +2024 — Dalhart, TX; Hugoton and Ingalls, KS; Imperial, NE +2025 — Hugoton and Scott City, KS; Imperial, NE; Dalhart, TX +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. 
+``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json new file mode 100644 index 00000000..705efecb --- /dev/null +++ b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.json @@ -0,0 +1,34 @@ +{ + "source": "agripro_trials", + "source_key": "agt-sc-ks-nc-ok-2024-0", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2024 South-Central Kansas, North-Central Oklahoma Summary, Three-Year Data", + "filename": "SC%20KS%20NC%20OK%202024_0.pdf", + "region": null, + "wheat_class_section": null, + "year": 2024, + "years_covered": [ + 2024 + ], + "varieties_found": [ + "SY Monument", + "AP Prolific", + "AP Roadrunner", + "SY Wolverine", + "AP Sunbird", + "AP EverRock", + "AP Bigfoot", + "LCS Atomic AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2024-07/SC%20KS%20NC%20OK%202024_0.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2024-07/SC%20KS%20NC%20OK%202024_0.pdf" + ], + "page_text_chars": 2157, + "fetched_at": "2026-05-25T19:11:08.283651+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md new file mode 100644 index 00000000..1377c89f --- /dev/null +++ b/corpus/agripro_trials/agt-sc-ks-nc-ok-2024-0.md @@ -0,0 +1,54 @@ +# 2024 South-Central Kansas, North-Central Oklahoma Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2024 +- **PDF:** https://agriprowheat.com/sites/default/files/2024-07/SC%20KS%20NC%20OK%202024_0.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Monument, AP Prolific, AP Roadrunner, SY Wolverine, AP Sunbird, AP EverRock, AP Bigfoot, LCS Atomic AX + +--- + +## Trial data 
(verbatim from PDF) + +``` +2024 South-Central Kansas, North-Central Oklahoma Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2022-2024 +3-Yr Combined 2-Yr Combined Combined Conway Pratt, Carrier, +Variety (2022-2024) (2023-2024) (2024) Springs, KS KS OK +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP24 AX 57.6 62.7 54.1 62.4 62.6 63.7 75.2 36.5 76.2 +SY Monument 57.5 63.7 56.9 63.7 63.2 64.8 63.8 46.9 79.0 +AP Prolific 57.3 64.1 55.7 63.7 61.8 64.7 61.5 42.5 81.4 +AP Roadrunner 56.4 61.4 54.0 60.6 59.0 61.0 63.8 34.8 78.3 +SY Wolverine 55.8 64.5 52.8 64.2 58.7 65.6 56.6 44.6 75.0 +AP18 AX 55.4 63.0 52.8 62.9 62.2 64.2 70.7 37.3 78.5 +AP Sunbird 53.7 64.0 52.0 64.1 62.0 66.1 69.5 45.5 70.9 +Bob Dole 53.6 62.9 53.5 62.5 60.6 63.6 62.0 40.1 79.7 +AP EverRock 52.4 63.6 50.8 63.2 58.4 64.6 60.0 37.5 77.7 +AP Bigfoot 52.0 63.9 51.3 63.9 61.0 65.6 61.7 45.3 76.0 +WB4401 53.6 63.7 52.3 64.4 59.7 65.9 63.2 31.7 84.1 +Showdown 59.7 64.4 69.7 65.6 77.0 47.2 85.1 +Rockstar 56.5 62.4 64.2 63.1 62.9 49.3 80.4 +KS Providence 56.0 63.4 64.4 64.4 71.1 38.7 83.4 +LCS Atomic AX 54.3 64.8 62.3 66.2 65.8 42.9 78.2 +WB4523 52.9 63.7 60.5 65.0 56.2 39.9 85.4 +KS Hatchett 52.9 63.5 60.8 64.5 65.8 39.5 77.0 +WB4422 61.3 64.6 63.5 43.5 77.0 +KS Mako 59.0 65.5 58.7 49.9 68.3 +Mean General 55.0 63.4 53.8 63.5 59.7 64.0 62.2 41.2 75.7 +LSD General (5%) EE NS 1.4 NS 1.9 8.5 3.7 7.7 8.6 9.3 +CV (Effective) 7.9 1.7 8.7 2.0 8.8 2.2 7.6 12.8 7.5 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +NS = Non Significant +Locations +2022 — Conway Springs and Partridge, KS; Carrier, OK +2023 — Conway Springs, KS; Carrier, OK +2024 — Conway Springs and Pratt, KS; Carrier, OK +© 2024 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. 
Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-south-dakota-2025.json b/corpus/agripro_trials/agt-south-dakota-2025.json new file mode 100644 index 00000000..49a338f4 --- /dev/null +++ b/corpus/agripro_trials/agt-south-dakota-2025.json @@ -0,0 +1,36 @@ +{ + "source": "agripro_trials", + "source_key": "agt-south-dakota-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 South Dakota Winter Wheat Summary, Three-Year Data", + "filename": "South%20Dakota%202025.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Prolific", + "AP Sunbird", + "AP Bigfoot", + "SY", + "SY Wolverine", + "SY WOLF", + "SY Monument", + "AP Clair", + "AP Solid", + "LCS Helix AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-08/South%20Dakota%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-08/South%20Dakota%202025.pdf" + ], + "page_text_chars": 1916, + "fetched_at": "2026-05-25T19:11:14.252920+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-south-dakota-2025.md b/corpus/agripro_trials/agt-south-dakota-2025.md new file mode 100644 index 00000000..fcf47211 --- /dev/null +++ b/corpus/agripro_trials/agt-south-dakota-2025.md @@ -0,0 +1,52 @@ +# 2025 South Dakota Winter Wheat Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** 
https://agriprowheat.com/sites/default/files/2025-08/South%20Dakota%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Prolific, AP Sunbird, AP Bigfoot, SY, SY Wolverine, SY WOLF, SY Monument, AP Clair, AP Solid, LCS Helix AX + +--- + +## Trial data (verbatim from PDF) + +``` +2025 South Dakota Winter Wheat Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Agar, Presho, +Variety (2023-2025) (2024-2025) (2025) SD SD +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A +AP18 AX 57.3 59.3 64.7 58.6 57.2 54.2 52.5 61.9 +AP24 AX 56.8 59.0 64.2 57.8 61.2 53.9 59.9 62.5 +AP Prolific 56.1 59.5 62.9 58.8 58.6 54.6 56.1 61.1 +AP Sunbird 55.9 59.5 58.5 59.1 58.4 56.1 56.8 59.9 +AP Bigfoot 55.6 59.6 62.9 58.9 64.1 55.9 62.2 66.1 +SY 517 CL2 52.5 60.8 56.6 60.3 59.9 57.8 60.0 59.7 +SY Wolverine 52.3 58.5 56.9 58.5 53.5 55.4 47.5 59.5 +SY WOLF 51.4 59.1 55.5 57.9 53.5 53.7 49.4 57.6 +SY Monument 50.1 57.3 56.1 55.9 56.6 51.9 54.7 58.6 +AP Clair 49.2 59.7 57.8 59.5 52.9 56.2 50.6 55.1 +AP Solid 54.7 56.8 47.8 61.7 +WB4422 58.2 59.9 63.8 58.4 65.3 55.5 63.5 67.1 +LCS Helix AX 57.2 61.1 66.0 60.6 65.2 57.9 65.0 65.3 +Winner 56.5 60.5 66.7 59.7 65.9 57.0 68.7 63.1 +SD Andes 55.7 60.6 60.9 59.8 54.2 55.8 50.3 58.1 +SD Midland 54.6 59.8 59.4 58.7 54.4 54.1 49.7 59.0 +Kivari AX 54.3 56.8 50.7 53.2 44.5 56.9 +SD Pheasant 54.1 56.4 49.7 58.5 +Mean General 54.6 59.6 60.4 58.5 57.7 55.2 55.2 60.3 +LSD General (5%) EE 5.4 1.2 7.7 1.5 7.4 1.8 13.0 6.1 +CV (Effective) 10.3 1.8 8.7 1.9 11.2 2.0 14.3 6.2 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +Locations +2023 — Hayes and Ideal, SD +2024 — Hayes and Ideal, SD +2025 — Agar and Presho, SD +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. 
Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-southern-idaho-2025.json b/corpus/agripro_trials/agt-southern-idaho-2025.json new file mode 100644 index 00000000..940e2300 --- /dev/null +++ b/corpus/agripro_trials/agt-southern-idaho-2025.json @@ -0,0 +1,40 @@ +{ + "source": "agripro_trials", + "source_key": "agt-southern-idaho-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Southern Idaho Summary, Three-Year Data", + "filename": "Southern%20Idaho%202025.pdf", + "region": "Southern Idaho", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Exceed", + "AP Olympia", + "SY Dayton", + "SY Assure", + "SY Ovation", + "AP Iliad", + "SY Raptor", + "LCS Shine", + "LCS Hulk", + "LCS Artdeco", + "Norwest Duet", + "Norwest Tandem", + "LCS Jefe", + "LCS Kamiack" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-09/Southern%20Idaho%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-09/Southern%20Idaho%202025.pdf" + ], + "page_text_chars": 2095, + "fetched_at": "2026-05-25T19:11:06.237182+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-southern-idaho-2025.md b/corpus/agripro_trials/agt-southern-idaho-2025.md new file mode 100644 index 00000000..497c60d0 --- /dev/null +++ b/corpus/agripro_trials/agt-southern-idaho-2025.md @@ -0,0 +1,49 @@ +# 2025 Southern Idaho Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** 
trial +- **Region:** Southern Idaho +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-09/Southern%20Idaho%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Exceed, AP Olympia, SY Dayton, SY Assure, SY Ovation, AP Iliad, SY Raptor, LCS Shine, LCS Hulk, LCS Artdeco, Norwest Duet, Norwest Tandem, LCS Jefe, LCS Kamiack + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Southern Idaho Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Blackfoot, Nampa, Rupert, Twin Falls, +Variety (2023-2025) (2024-2025) (2025) ID ID ID ID +Soft White Yield TWT Yield TWT Yield TWT Yield Yield Yield Yield +Winter Wheat Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A Bu/A +AP Exceed 166.9 62.2 165.3 62.6 177.8 62.1 179.0 176.4 141.9 213.7 +AP Olympia 164.9 61.0 163.8 61.4 181.2 60.8 183.6 163.9 156.7 220.5 +SY Dayton 161.5 61.2 160.4 61.4 174.9 60.5 176.4 177.4 146.4 199.5 +SY Assure 161.5 62.0 158.7 61.9 174.7 61.5 180.2 165.2 155.6 197.7 +SY Ovation 160.2 61.1 155.7 60.8 169.2 59.9 178.2 175.0 121.7 202.0 +AP Iliad 160.1 61.4 157.0 61.6 167.9 60.7 177.6 175.6 108.9 209.4 +SY Raptor 153.1 59.6 150.9 59.7 160.7 58.9 175.5 165.3 130.5 171.5 +LCS Shine 164.0 61.3 164.6 61.5 170.7 61.0 165.9 160.6 166.9 189.5 +LCS Hulk 161.7 62.6 159.9 62.8 171.1 62.8 164.2 181.5 146.9 192.0 +LCS Artdeco 157.6 61.5 156.5 61.5 164.2 60.9 164.6 182.7 131.1 178.3 +Norwest Duet 154.8 61.2 155.1 61.5 166.2 61.1 170.4 159.8 139.6 194.9 +Norwest Tandem 152.2 60.8 149.6 61.2 155.0 60.3 151.6 152.2 121.8 194.2 +LCS Jefe 165.5 62.6 180.8 61.8 174.2 160.8 169.7 218.5 +LCS Kamiack 159.8 61.9 174.9 61.5 169.3 176.8 143.5 209.9 +Mean General 160.9 61.4 159.4 61.6 170.9 61.0 171.0 169.4 141.4 201.8 +LSD General (5%) EE ns 1.1 ns 1.1 14.1 1.3 12.5 ns ns ns +CV (Effective) 6.9 2.1 7.2 1.8 8.4 1.7 3.5 5.5 14.5 7.6 +Boldfaced numbers are within confidence interval at 
specific locations and combined years of yield data. +Locations +2023 — Twin Falls, ID +2024 — Blackfoot and Twin Falls, ID +2025 — Blackfoot, Nampa, Rupert, and Twin Falls, ID +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-washington-n-idaho-2025.json b/corpus/agripro_trials/agt-washington-n-idaho-2025.json new file mode 100644 index 00000000..e7f4eeb7 --- /dev/null +++ b/corpus/agripro_trials/agt-washington-n-idaho-2025.json @@ -0,0 +1,40 @@ +{ + "source": "agripro_trials", + "source_key": "agt-washington-n-idaho-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Washington/Northern Idaho Summary, Three-Year Data", + "filename": "Washington%3AN%20Idaho%202025.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "SY Raptor", + "AP Olympia", + "AP Exceed", + "SY Ovation", + "SY Dayton", + "AP Iliad", + "SY Assure", + "Norwest Duet", + "LCS Artdeco", + "LCS Shine", + "LCS Hulk", + "Norwest Tandem", + "LCS Kamiack", + "LCS Jefe" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-08/Washington%3AN%20Idaho%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-08/Washington%3AN%20Idaho%202025.pdf" + ], + "page_text_chars": 1892, + "fetched_at": "2026-05-25T19:11:05.110770+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-washington-n-idaho-2025.md b/corpus/agripro_trials/agt-washington-n-idaho-2025.md 
new file mode 100644 index 00000000..48bc4020 --- /dev/null +++ b/corpus/agripro_trials/agt-washington-n-idaho-2025.md @@ -0,0 +1,48 @@ +# 2025 Washington/Northern Idaho Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-08/Washington%3AN%20Idaho%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** SY Raptor, AP Olympia, AP Exceed, SY Ovation, SY Dayton, AP Iliad, SY Assure, Norwest Duet, LCS Artdeco, LCS Shine, LCS Hulk, Norwest Tandem, LCS Kamiack, LCS Jefe + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Washington/Northern Idaho Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Genesee, Walla Walla, +Variety (2023-2025) (2024-2025) (2025) ID WA +Soft White Yield TWT Yield TWT Yield TWT Yield Yield +Winter Wheat Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A +SY Raptor 146.1 63.0 157.6 64.8 156.9 65.0 157.4 156.3 +AP Olympia 142.2 64.2 154.3 66.1 151.1 65.8 159.6 142.5 +AP Exceed 141.8 63.6 155.3 65.4 152.6 65.3 160.0 145.2 +SY Ovation 140.8 63.3 148.7 65.4 147.4 65.7 154.9 139.8 +SY Dayton 140.0 63.6 158.4 65.6 157.7 65.8 172.3 143.1 +AP Iliad 139.8 63.4 151.0 65.3 147.0 65.2 149.4 144.6 +SY Assure 138.7 64.3 152.0 66.3 150.9 66.0 149.6 152.1 +Norwest Duet 144.1 62.3 157.4 64.7 155.3 65.0 157.5 153.1 +LCS Artdeco 140.3 62.6 155.5 64.4 157.5 64.8 160.9 154.0 +LCS Shine 139.3 63.1 148.6 64.7 150.7 65.2 146.1 155.4 +LCS Hulk 138.4 63.9 152.4 65.8 156.6 65.9 167.2 146.0 +Norwest Tandem 137.0 62.7 152.9 65.0 157.3 65.3 166.1 148.5 +LCS Kamiack 165.0 66.5 174.3 66.2 188.8 159.7 +LCS Jefe 160.1 65.2 167.4 66.0 178.9 155.9 +Mean General 142.5 63.3 156.7 65.3 157.3 65.5 162.0 152.7 +LSD General (5%) EE 8.6 1.0 ns 1.4 ns ns ns ns +CV (Effective) 6.1 1.6 5.4 
1.1 7.5 1.1 8.4 6.5 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +Locations +2023 — Craigmont and Genesee, ID; Moses Lake and Walla Walla, WA +2024 — Craigmont, ID; Walla Walla, WA +2025 — Genesee, ID; Walla Walla, WA +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-western-plains-dryland-2025-0.json b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.json new file mode 100644 index 00000000..a5010b2b --- /dev/null +++ b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-western-plains-dryland-2025-0", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Western Plains Dryland Summary, Three-Year Data", + "filename": "Western%20Plains%20Dryland%202025_0.pdf", + "region": "Western Plains", + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Sunbird", + "AP Bigfoot", + "SY Wolverine", + "AP Roadrunner", + "AP Solid", + "SY Monument", + "WB-Grainfield" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Western%20Plains%20Dryland%202025_0.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/Western%20Plains%20Dryland%202025_0.pdf" + ], + "page_text_chars": 2395, + "fetched_at": "2026-05-25T19:11:02.169343+00:00", + "scraper_version": "0.1.0" +} diff --git 
a/corpus/agripro_trials/agt-western-plains-dryland-2025-0.md b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.md new file mode 100644 index 00000000..d43c029a --- /dev/null +++ b/corpus/agripro_trials/agt-western-plains-dryland-2025-0.md @@ -0,0 +1,58 @@ +# 2025 Western Plains Dryland Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Region:** Western Plains +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Western%20Plains%20Dryland%202025_0.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Sunbird, AP Bigfoot, SY Wolverine, AP Roadrunner, AP Solid, SY Monument, WB-Grainfield + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Western Plains Dryland Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Crook, Julesburg, Yuma, +Variety (2023-2025) (2024-2025) (2025) CO CO* CO +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP18 AX 74.2 57.6 75.8 57.8 85.0 55.2 59.5 101.2 94.3 +AP Sunbird 74.1 59.5 77.2 59.8 86.5 57.9 61.6 102.9 95.1 +AP24 AX 73.2 56.7 74.0 56.8 86.7 54.9 63.7 97.9 98.5 +AP Bigfoot 71.2 58.9 73.0 58.9 79.0 56.6 60.4 86.6 90.0 +SY Wolverine 68.9 59.1 71.4 59.5 80.5 57.2 61.9 91.0 88.5 +AP Roadrunner 67.6 56.0 70.3 55.7 80.1 54.6 63.9 90.5 86.0 +AP Solid 66.4 58.4 67.7 58.4 76.9 56.4 62.4 86.0 82.4 +SY Monument 61.7 57.8 60.1 57.9 70.3 56.7 62.4 67.7 80.7 +Langin 72.9 57.3 77.3 57.6 86.7 55.8 62.6 95.5 102.0 +WB4422 70.6 59.1 73.8 59.4 83.2 57.7 65.8 90.7 92.9 +AG Golden 70.3 55.5 72.3 55.5 86.1 54.9 71.2 93.6 93.4 +High Country 69.4 59.1 72.0 58.9 79.2 57.0 60.4 89.6 87.7 +WB4595 69.2 59.8 72.2 59.6 81.4 57.3 63.7 90.1 90.4 +KS Dallas 68.3 59.1 66.5 58.7 76.0 57.4 55.0 82.3 90.8 +Amplify SF 66.8 57.9 66.4 
57.8 73.8 56.5 61.6 77.7 82.2 +KS Hamilton 66.6 58.3 68.5 58.3 76.3 56.1 54.5 78.0 96.3 +TAM 115 60.7 60.1 60.1 59.8 70.5 58.3 49.8 85.0 76.6 +Kivari AX 73.7 57.0 83.3 55.9 63.3 87.1 99.5 +WB-Grainfield 86.3 57.3 59.0 99.8 100.0 +Canvas 85.1 55.9 67.4 94.4 93.7 +KS Bill Snyder 80.4 56.9 63.2 92.9 85.3 +KS Mako 78.4 57.2 58.1 84.6 92.3 +Mean General 69.1 58.3 71.0 58.2 79.5 56.4 60.4 88.8 89.4 +LSD General (5%) EE 5.1 1.4 6.2 1.7 9.6 ns 8.3 16.0 13.0 +CV (Effective) 9.0 2.4 9.4 2.5 9.9 2.4 8.3 11.0 8.9 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. +Locations +2023 — Julesburg and Yuma, CO; Colby, KS +2024 — Crook and Julesburg, CO; Ingalls, KS +2025 — Crook, Julesburg, and Yuma, CO +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. 
+``` \ No newline at end of file diff --git a/corpus/agripro_trials/agt-wheat-after-soy-2025.json b/corpus/agripro_trials/agt-wheat-after-soy-2025.json new file mode 100644 index 00000000..ffb143b3 --- /dev/null +++ b/corpus/agripro_trials/agt-wheat-after-soy-2025.json @@ -0,0 +1,33 @@ +{ + "source": "agripro_trials", + "source_key": "agt-wheat-after-soy-2025", + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": "2025 Wheat Following Soybeans Summary, Three-Year Data", + "filename": "Wheat%20after%20Soy%202025.pdf", + "region": null, + "wheat_class_section": null, + "year": 2025, + "years_covered": [ + 2025 + ], + "varieties_found": [ + "AP Sunbird", + "AP Roadrunner", + "SY Wolverine", + "AP Bigfoot", + "AP Prolific", + "SY Monument", + "LCS Atomic AX" + ], + "pdf_url": "https://agriprowheat.com/sites/default/files/2025-07/Wheat%20after%20Soy%202025.pdf", + "source_urls": [ + "https://agriprowheat.com/trials-data", + "https://agriprowheat.com/sites/default/files/2025-07/Wheat%20after%20Soy%202025.pdf" + ], + "page_text_chars": 2367, + "fetched_at": "2026-05-25T19:11:07.269403+00:00", + "scraper_version": "0.1.0" +} diff --git a/corpus/agripro_trials/agt-wheat-after-soy-2025.md b/corpus/agripro_trials/agt-wheat-after-soy-2025.md new file mode 100644 index 00000000..b7da4eb3 --- /dev/null +++ b/corpus/agripro_trials/agt-wheat-after-soy-2025.md @@ -0,0 +1,55 @@ +# 2025 Wheat Following Soybeans Summary, Three-Year Data + +- **Source:** AgriPro (Syngenta) regional trial PDF +- **Vendor:** Syngenta +- **Brand:** AgriPro +- **Crop:** Wheat +- **Data type:** trial +- **Year:** 2025 +- **PDF:** https://agriprowheat.com/sites/default/files/2025-07/Wheat%20after%20Soy%202025.pdf +- **Index page:** https://agriprowheat.com/trials-data +- **Varieties listed:** AP Sunbird, AP Roadrunner, SY Wolverine, AP Bigfoot, AP Prolific, SY Monument, LCS Atomic AX + +--- + +## Trial data (verbatim from PDF) + +``` +2025 Wheat Following 
Soybeans Summary, Three-Year Data +Syngenta Commercial Variety Wheat Performance Test, 2023-2025 +3-Yr Combined 2-Yr Combined Combined Belleville, Junction City, Salina, +Variety (2023-2025) (2024-2025) (2025) KS* KS KS +Hard Winter Wheat Yield TWT Yield TWT Yield TWT Yield Yield Yield +Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Lb/Bu Bu/A Bu/A Bu/A +AP24 AX 66.1 58.8 76.6 59.0 78.7 55.3 64.8 94.7 76.7 +AP Sunbird 62.3 61.0 70.7 60.9 70.7 57.7 59.0 86.7 66.5 +AP Roadrunner 62.2 57.5 68.5 56.9 64.1 53.4 54.1 79.7 58.5 +SY Wolverine 61.0 60.3 66.9 60.1 65.4 56.5 58.2 86.2 51.9 +AP Bigfoot 60.3 61.3 67.5 61.2 68.6 58.7 53.1 86.6 65.9 +AP Prolific 59.2 60.0 66.3 60.0 67.2 57.9 46.0 86.2 69.2 +SY Monument 58.4 59.3 63.9 59.8 59.5 56.1 48.2 71.6 58.6 +Bob Dole 56.7 58.5 63.0 58.1 60.0 54.6 47.0 68.3 64.6 +Showdown 62.8 59.6 71.9 59.6 68.7 55.6 56.4 83.0 66.6 +KS Providence 61.9 60.4 70.0 60.4 67.0 56.8 52.5 88.9 59.7 +Rockstar 61.0 58.1 70.0 58.5 68.7 55.6 54.0 83.8 68.3 +WB4401 59.5 59.5 66.7 59.2 63.0 55.1 51.7 78.8 58.5 +LCS Atomic AX 59.1 61.3 67.7 61.2 70.2 58.7 52.3 88.0 70.1 +WB4523 57.4 58.9 63.9 59.0 56.7 55.5 40.6 68.2 61.2 +WB4422 72.5 58.6 73.5 55.2 62.5 87.3 70.6 +KS Bill Snyder 78.4 58.8 59.3 94.6 81.2 +WB4699 66.4 55.2 51.0 84.2 64.1 +Polansky Goldenhawk 65.3 56.1 52.7 86.5 56.8 +High Cotton 65.3 57.4 43.8 90.9 61.2 +Doublestop CLP 63.8 56.9 53.9 74.0 63.4 +Mean General 60.6 59.7 68.4 59.5 66.9 56.3 51.7 83.6 65.4 +LSD General (5%) EE 4.6 1.4 5.6 1.7 9.2 0.0 9.5 10.7 9.9 +CV (Effective) 8.6 2.5 8.3 2.7 9.2 3.8 11.2 7.8 9.2 +Boldfaced numbers are within confidence interval at specific locations and combined years of yield data. +* Location was affected by a Wheat Streak Mosaic Virus infestation, which resulted in reduced yield of susceptible varieties. 
+Locations +2023 — Belleville, Conway Springs, Junction City, and Salina, KS +2024 — Belleville, Conway Springs, Junction City, and Salina, KS +2025 — Belleville, Conway Springs, Junction City, and Salina, KS +© 2025 Syngenta. All rights reserved. Reproduction expressly prohibited without written permission. Some or all of the varieties may be protected under one or more of the following: Plant Variety Protection, United +States Plant Patents and/or Utility Patents and may not be propagated or reproduced without authorization. AgriPro® and the Syngenta logo are trademarks of a Syngenta Group Company. +``` \ No newline at end of file diff --git a/docs_mcp/server.py b/docs_mcp/server.py index e33eb7e8..8162c9ef 100644 --- a/docs_mcp/server.py +++ b/docs_mcp/server.py @@ -201,9 +201,15 @@ def _build_where( vendor: str | None, source: str | None, source_key: str | None, + *, + data_type: str | None = None, + state: str | None = None, + year: int | None = None, ) -> dict | None: """Translate filter args into a Chroma `where` clause.""" conds: list[dict] = [] + if data_type: + conds.append({"data_type": data_type}) if crop: conds.append({"crop": crop.lower()}) if brand: @@ -214,6 +220,10 @@ def _build_where( conds.append({"source": source}) if source_key: conds.append({"source_key": source_key}) + if state: + conds.append({"state": state.upper() if len(state) <= 3 else state}) + if year: + conds.append({"year": int(year)}) if not conds: return None if len(conds) == 1: @@ -460,7 +470,11 @@ def search_docs( "query": query, "crop": crop, "brand": brand, "vendor": vendor, "source": source, "k": k, }) as _call: - where = _build_where(crop, brand, vendor, source, None) + # Variety-search default: filter to data_type=variety so trial + # documents (yield trials) don't pollute identity-focused + # results. To search trials, use search_trials(). 
+ where = _build_where(crop, brand, vendor, source, None, + data_type="variety") pool_size = max(k * 3, RERANK_POOL) # Exact-code pre-filter. Variety codes ("DKC62-08RIB", "AG29XF4") @@ -745,6 +759,233 @@ def lookup_variety( return "\n".join(out) +@mcp.tool() +def search_trials( + query: Annotated[str, Field(description=( + "Natural-language query about yield trials. Mention crop, " + "region or state, year, soil/conditions, and any specific " + "variety codes you want compared. Examples: " + "'best corn hybrid 2024 Iowa heavy clay'; " + "'AP Iliad yield Idaho stripe rust'; " + "'DKC65-20 vs NK1748 head to head Alabama 2023'." + ))], + crop: Annotated[ + str | None, + Field(description="OPTIONAL: corn, soybeans, silage, or wheat."), + ] = None, + state: Annotated[ + str | None, + Field(description=( + "OPTIONAL state filter. 2-letter abbrev (IA, IL, NE...) " + "for Golden Harvest plot reports; full or partial region " + "name (e.g. 'Pacific Northwest', 'Montana') for AgriPro " + "trial PDFs." + )), + ] = None, + year: Annotated[ + int | None, + Field(description="OPTIONAL year filter (e.g. 2024).", ge=2010, le=2030), + ] = None, + product: Annotated[ + str | None, + Field(description=( + "OPTIONAL variety/hybrid filter — substring match against " + "the product field. Example: 'DKC62' surfaces trials " + "containing any DKC62-* hybrid." + )), + ] = None, + k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10, +) -> str: + """Search yield-trial data — head-to-head results from real field + trials. SEPARATE from variety-identity search. + + Use this when the user wants to know HOW PRODUCTS PERFORMED, not + what they ARE. Trial data complements `search_docs`: + + * `search_docs` answers: "What's the disease resistance profile + of DKC62-08RIB?" (variety identity) + * `search_trials` answers: "Which corn hybrid actually won the + yield trials in central Iowa in 2024?" 
(performance data) + + Data sources: + + * **Golden Harvest plot reports** (4,000+ trials) — per-site + head-to-head comparing products from MULTIPLE BRANDS at one + cooperator's field. NK, DEKALB, Golden Harvest, sometimes + others all compete at the same site. Cross-vendor data Bayer + itself doesn't publish. + * **AgriPro regional trial PDFs** (~14 PDFs) — multi-year + multi-location wheat performance for Northern Plains / PNW / + Plains regions. + + A typical workflow: call this to identify top performers in a + region/year, then call `lookup_variety(source_key=...)` on the + leaders to verify identity details (RM, traits, disease ratings). + """ + with TimedCall("search_trials", { + "query": query, "crop": crop, "state": state, "year": year, + "product": product, "k": k, + }) as _call: + where = _build_where( + crop, None, None, None, None, + data_type="trial", + state=state, + year=year, + ) + pool_size = max(k * 3, RERANK_POOL) + + try: + col = _collection() + except Exception as exc: # noqa: BLE001 + _call.set(error_dense=str(exc), hits_returned=0) + return ( + "_(retrieval unavailable — Chroma collection not found. " + "Has the indexer run? `python -m rag.index --rebuild`.)_" + ) + + # If a product filter is set, augment the query with the + # product code so BM25 + dense both have signal. 
+ full_query = query + if product: + full_query = f"{query} {product}" + + try: + dense = col.query( + query_texts=[full_query], + n_results=pool_size, + where=where, + ) + except Exception as exc: # noqa: BLE001 + _call.set(error_dense=str(exc), hits_returned=0) + return f"_(trial retrieval failed: {exc})_" + + dense_ids: list[str] = (dense.get("ids") or [[]])[0] + dense_docs: list[str] = (dense.get("documents") or [[]])[0] + dense_metas: list[dict] = (dense.get("metadatas") or [[]])[0] + dense_dists: list[float] = (dense.get("distances") or [[]])[0] + + id_to_doc = dict(zip(dense_ids, dense_docs)) + id_to_meta = dict(zip(dense_ids, dense_metas)) + id_to_dist = dict(zip(dense_ids, dense_dists)) + + used_hybrid = False + if HYBRID_SEARCH: + bm25 = _bm25_index() + if bm25 is not None: + bm25_hits = bm25.query(full_query, n=pool_size, where=where) + bm25_ids = [h[0] for h in bm25_hits] + if bm25_ids: + fused = _rrf_fuse([dense_ids, bm25_ids]) + fuzzy_ids = fused + used_hybrid = True + else: + fuzzy_ids = dense_ids + else: + fuzzy_ids = dense_ids + else: + fuzzy_ids = dense_ids + + # Optional product-substring post-filter: if user supplied + # ``product``, require the chunk to actually contain the + # token. This re-checks the bytes since BM25 only sees stems. + if product: + needle = product.lower() + def _has_product(cid: str) -> bool: + doc = id_to_doc.get(cid, "") + if needle in doc.lower(): + return True + # Not yet fetched — defer; the get-by-id below will fix. 
+ return cid not in id_to_doc + + fuzzy_ids = [cid for cid in fuzzy_ids if _has_product(cid)] + + final_ids: list[str] = [] + seen: set[str] = set() + for cid in fuzzy_ids: + if cid in seen: + continue + seen.add(cid) + final_ids.append(cid) + if len(final_ids) >= k: + break + + missing = [i for i in final_ids if i not in id_to_doc] + if missing: + try: + extra = col.get(ids=missing, include=["documents", "metadatas"]) + for cid, doc, meta in zip( + extra.get("ids") or [], + extra.get("documents") or [], + extra.get("metadatas") or [], + ): + id_to_doc[cid] = doc + id_to_meta[cid] = meta + except Exception as exc: # noqa: BLE001 + log.warning("get-by-id for BM25-only hits failed: %s", exc) + + # Apply product filter once we have docs from the get-by-id pass. + if product: + needle = product.lower() + final_ids = [cid for cid in final_ids if needle in id_to_doc.get(cid, "").lower()] + + _call.set( + hits_returned=len(final_ids), + hybrid=used_hybrid, + pool_size=pool_size, + data_type="trial", + ) + + if not final_ids: + return ( + "_(no trials matched. Try widening — drop the state, " + "year, or product filter. `list_versions()` shows " + "which trial sources are indexed.)_" + ) + + blocks: list[str] = [] + for cid in final_ids: + doc = id_to_doc.get(cid, "") + meta = id_to_meta.get(cid, {}) + dist = id_to_dist.get(cid) if not used_hybrid else None + blocks.append(_format_trial_hit(doc, meta, dist)) + + header = ( + f"# Trial search results — {len(final_ids)} trial document" + f"{'s' if len(final_ids) != 1 else ''}" + f"{' (dense + BM25 hybrid)' if used_hybrid else ' (dense only)'}\n" + f"_Use `get_page(source=..., source_key=...)` to read the " + f"full trial body. 
Use `lookup_variety(source_key=...)` on " + f"any product code to verify its identity (RM, traits, " + f"disease ratings)._\n\n---\n\n" + ) + return header + "\n---\n\n".join(blocks) + + +def _format_trial_hit(doc: str, meta: dict, distance: float | None = None) -> str: + """Trial-specific result header. Highlights crop/state/year and + sources URL (vs variety hits which emphasize brand + product + identity).""" + src_url = meta.get("source_url") or "" + src_key = meta.get("source_key") or "" + src = meta.get("source") or "" + crop = meta.get("crop") or "" + state = meta.get("state") or "" + year = meta.get("year") or "" + region = meta.get("region") or "" + + title_bits = [b for b in [crop.title(), region or state, str(year) if year else ""] if b] + title = " · ".join(title_bits) if title_bits else src_key + + header = ( + f"### Trial: {title} \n" + f"`{src}::{src_key}` — {meta.get('vendor', '')} / {meta.get('brand', '')} \n" + f"<{src_url}>" + ) + if distance is not None: + header += f" \n_(distance={distance:.4f})_" + return f"{header}\n\n{doc.strip()}\n" + + @mcp.tool() def crop_seed_api_lessons( topic: Annotated[ diff --git a/rag/bm25.py b/rag/bm25.py index 79507371..e09bb968 100644 --- a/rag/bm25.py +++ b/rag/bm25.py @@ -42,7 +42,12 @@ DEFAULT_DB_NAME = "crop_seed_docs.db" # Columns we expose as filterable metadata. Mirrors what # ``docs_mcp.server._build_where`` accepts so the same filter dict # works for both Chroma and BM25 without per-retriever translation. -FILTER_COLUMNS = ("source", "vendor", "brand", "crop", "source_key", "ordinal") +# data_type / year / state / region are trial-specific facets; variety +# chunks leave them empty. +FILTER_COLUMNS = ( + "source", "vendor", "brand", "crop", "source_key", + "data_type", "year", "state", "ordinal", +) # Allowlist tokenizer for free-text queries. 
FTS5's parser chokes on @@ -131,8 +136,9 @@ class BM25Index: con.executescript(self._schema_sql()) con.executemany( "INSERT INTO chunks_meta " - "(id, source, vendor, brand, crop, source_key, ordinal) " - "VALUES (?, ?, ?, ?, ?, ?, ?)", + "(id, source, vendor, brand, crop, source_key, " + " data_type, year, state, ordinal) " + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", [ ( r["id"], @@ -141,6 +147,9 @@ class BM25Index: r["metadata"].get("brand") or "", r["metadata"].get("crop") or "", r["metadata"].get("source_key") or "", + r["metadata"].get("data_type") or "variety", + int(r["metadata"]["year"]) if isinstance(r["metadata"].get("year"), int) else None, + r["metadata"].get("state") or "", int(r["metadata"].get("ordinal") or 0), ) for r in records @@ -216,12 +225,18 @@ class BM25Index: brand TEXT, crop TEXT, source_key TEXT, + data_type TEXT, + year INTEGER, + state TEXT, ordinal INTEGER ); CREATE INDEX idx_meta_source ON chunks_meta(source); CREATE INDEX idx_meta_crop ON chunks_meta(crop); CREATE INDEX idx_meta_brand ON chunks_meta(brand); CREATE INDEX idx_meta_source_key ON chunks_meta(source_key); + CREATE INDEX idx_meta_data_type ON chunks_meta(data_type); + CREATE INDEX idx_meta_year ON chunks_meta(year); + CREATE INDEX idx_meta_state ON chunks_meta(state); CREATE VIRTUAL TABLE chunks_fts USING fts5( text, diff --git a/rag/chunk.py b/rag/chunk.py index ad04240a..9f01d793 100644 --- a/rag/chunk.py +++ b/rag/chunk.py @@ -253,6 +253,7 @@ def _flat_metadata(sidecar: dict) -> dict: md: dict = { "source": sidecar.get("source") or "", "source_key": sidecar.get("source_key") or "", + "data_type": "variety", "vendor": sidecar.get("vendor") or "", "brand": (sidecar.get("brand") or "").upper(), "crop": (sidecar.get("crop") or "").lower(), @@ -304,6 +305,258 @@ def chunks_from_variety( } +# =========================================================================== +# Trial chunker — for sidecars with data_type="trial" +# 
=========================================================================== +# +# Trial documents are a different shape from variety identity: +# - GH plot reports: per-site head-to-head yield comparison across brands +# - AgriPro trial PDFs: regional multi-year multi-location summary +# +# Both produce ONE chunk per document with a preamble that emphasizes +# the trial's location/year/top performers so the embedder gets clean +# signal for queries like "best corn for sandy soil Iowa 2024". + + +def _render_gh_plot_chunk(sidecar: dict) -> str: + """Render a Golden Harvest plot report (per-site cross-vendor).""" + lines: list[str] = [] + crop = (sidecar.get("crop") or "").lower() + crop_label = {"corn": "Corn", "soybeans": "Soybean", "silage": "Silage"}.get(crop, crop.title()) + state = sidecar.get("state") or sidecar.get("state_abbrev") or "" + year = sidecar.get("year") or "" + cooperator = sidecar.get("cooperator") or "" + + lines.append(f"# {crop_label} yield trial — {state}, {year}") + lines.append("") + facts = ["Golden Harvest plot report (cross-vendor)"] + if cooperator: + facts.append(f"cooperator {cooperator}") + if sidecar.get("planted_date"): + facts.append(f"planted {sidecar['planted_date']}") + if sidecar.get("harvested_date"): + facts.append(f"harvested {sidecar['harvested_date']}") + if sidecar.get("population_seeds_per_acre"): + facts.append(f"population {sidecar['population_seeds_per_acre']:,} seeds/acre") + if sidecar.get("row_width_in"): + facts.append(f"{sidecar['row_width_in']}\" rows") + lines.append(". ".join(facts) + ".") + lines.append("") + + results = sidecar.get("results") or [] + if results: + # Pick the primary metric for ranking: corn/soy use "Yield", + # silage uses "Ton/Acre". Find the first metric key with a + # numeric value in the top result. + def _primary(r: dict) -> tuple[str, float | None]: + metrics = r.get("metrics") or {} + # Back-compat: old sidecars had yield_bu_ac directly. 
+ if not metrics and r.get("yield_bu_ac") is not None: + return ("Yield", r["yield_bu_ac"]) + for k in ("Yield", "Ton/Acre", "Tons/Acre"): + v = metrics.get(k) + if isinstance(v, (int, float)): + return (k, v) + for k, v in metrics.items(): + if isinstance(v, (int, float)): + return (k, v) + return ("", None) + + top = results[: min(5, len(results))] + primary_label, _ = _primary(top[0]) if top else ("", None) + rendered_top_parts: list[str] = [] + for i, r in enumerate(top): + label, val = _primary(r) + piece = f"#{r.get('rank') or i+1} {r.get('brand','?')} {r.get('product','?')}" + if r.get('traits'): + piece += f" {r['traits']}" + if val is not None: + piece += f" — {val} {label}" + rendered_top_parts.append(piece) + if rendered_top_parts: + lines.append( + f"Top {len(top)} ({crop_label}, {state} {year}): " + + ", ".join(rendered_top_parts) + "." + ) + lines.append("") + + # Discover the metric column order from the first result with metrics. + metric_keys: list[str] = [] + for r in results: + metrics = r.get("metrics") or {} + if metrics: + metric_keys = list(metrics.keys()) + break + # Back-compat: synthesize from legacy fields if no metrics dict. + if not metric_keys and any( + r.get("yield_bu_ac") is not None for r in results + ): + metric_keys = ["Yield", "%MST", "Test Weight", "Gross Revenue"] + + # Full ranking — preserves every datapoint verbatim. 
+ col_headers = ["rank", "brand", "product", "traits"] + metric_keys + lines.append("Full ranking (" + " | ".join(col_headers) + "):") + for r in results: + row = [ + f"#{r.get('rank') or '-'}", + r.get("brand") or "-", + r.get("product") or "-", + r.get("traits") or "-", + ] + metrics = r.get("metrics") or {} + # Back-compat shim + if not metrics: + metrics = { + "Yield": r.get("yield_bu_ac"), + "%MST": r.get("mst_pct"), + "Test Weight": r.get("test_weight"), + "Gross Revenue": r.get("gross_revenue_dol_ac"), + } + for k in metric_keys: + v = metrics.get(k) + if v is None: + row.append("-") + elif isinstance(v, (int, float)): + if "Revenue" in k or "$" in k: + row.append(f"${v:.2f}") + else: + row.append(str(v)) + else: + row.append(str(v)) + lines.append(" " + " | ".join(row)) + lines.append("") + + urls = sidecar.get("source_urls") or [] + if urls: + lines.append(f"Source: {urls[0]}") + return "\n".join(lines).strip() + "\n" + + +def _render_agripro_trial_chunk(sidecar: dict) -> str: + """Render an AgriPro regional trial PDF — preamble + verbatim text.""" + lines: list[str] = [] + title = sidecar.get("title") or sidecar.get("filename") or sidecar.get("source_key", "") + lines.append(f"# {title}") + lines.append("") + + facts = ["AgriPro / Syngenta regional wheat trial"] + if sidecar.get("region"): + facts.append(f"region {sidecar['region']}") + if sidecar.get("wheat_class_section"): + facts.append(f"class {sidecar['wheat_class_section']}") + if sidecar.get("years_covered") and len(sidecar["years_covered"]) > 1: + yc = sidecar["years_covered"] + facts.append(f"years {yc[0]}–{yc[-1]}") + elif sidecar.get("year"): + facts.append(f"year {sidecar['year']}") + lines.append(". 
".join(facts) + ".") + lines.append("") + + varieties = sidecar.get("varieties_found") or [] + if varieties: + lines.append("Varieties listed: " + ", ".join(varieties) + ".") + lines.append("") + + # Verbatim trial data — preserves variety + yield numbers adjacent + # so BM25/dense can match "AP Iliad Aberdeen Idaho" queries. + lines.append("Trial data (verbatim from PDF):") + lines.append("") + # The actual text was in the .md body but isn't in the sidecar + # JSON. We render a brief marker; full text goes in the .md file + # that get_page returns. For embedding signal, the title + + # varieties + region is usually enough. + # If we want the FULL text in the chunk we'd need to either store + # it in the sidecar OR read it from the .md path at chunk time. + # Read from the .md path: + return "\n".join(lines).strip() + "\n" + + +def _render_trial_chunk(sidecar: dict, md_text: str | None = None) -> str: + """Dispatch to the right trial renderer by source. Includes the + verbatim trial body for sources whose value lives in the body text + (currently agripro_trials).""" + source = sidecar.get("source") + if source == "gh_plot_reports": + return _render_gh_plot_chunk(sidecar) + if source == "agripro_trials": + header = _render_agripro_trial_chunk(sidecar) + if md_text: + # Strip the markdown frontmatter so the body text is the + # actual trial data, not the per-source preamble. + body = md_text + sep = "## Trial data (verbatim from PDF)" + if sep in body: + body = body.split(sep, 1)[1].strip() + # Strip fence markers + body = re.sub(r"```", "", body).strip() + return header + "\n" + body + "\n" + return header + # Fallback: generic trial render + return _render_gh_plot_chunk(sidecar) + + +def _flat_trial_metadata(sidecar: dict) -> dict: + """Chroma-safe metadata for trial chunks. 
Mirrors variety metadata + plus trial-specific facets (state, year, data_type).""" + md: dict = { + "source": sidecar.get("source") or "", + "source_key": sidecar.get("source_key") or "", + "data_type": sidecar.get("data_type") or "trial", + "vendor": sidecar.get("vendor") or "", + "brand": (sidecar.get("brand") or "").upper(), + "crop": (sidecar.get("crop") or "").lower(), + "source_url": (sidecar.get("source_urls") or [""])[0], + } + year = sidecar.get("year") + if isinstance(year, int): + md["year"] = year + state = sidecar.get("state_abbrev") or sidecar.get("state") + if state: + md["state"] = state.upper() if len(state) <= 3 else state + md["state_abbrev"] = (sidecar.get("state_abbrev") or "").upper() + if sidecar.get("region"): + md["region"] = sidecar["region"] + if sidecar.get("wheat_class_section"): + md["wheat_class"] = sidecar["wheat_class_section"] + if sidecar.get("plot_id"): + md["plot_id"] = sidecar["plot_id"] + if isinstance(sidecar.get("n_results"), int): + md["n_results"] = sidecar["n_results"] + return md + + +def chunks_from_trial( + sidecar_path: Path | str, + *, + md_path: Path | str | None = None, +) -> Iterator[dict]: + """Yield chunk dict(s) for one trial document. Emits exactly one + chunk per trial. + + Args: + sidecar_path: path to the trial's JSON sidecar. + md_path: path to the trial's markdown body (used for + AgriPro PDFs whose value lives in the verbatim + text). If omitted we infer it from sidecar_path. 
+ """ + sc_path = Path(sidecar_path) + sidecar = json.loads(sc_path.read_text(encoding="utf-8")) + + md_text: str | None = None + md_p = Path(md_path) if md_path else sc_path.with_suffix(".md") + if md_p.exists(): + md_text = md_p.read_text(encoding="utf-8") + + text = _render_trial_chunk(sidecar, md_text=md_text) + meta = _flat_trial_metadata(sidecar) + chunk_id = f"{meta['source']}::{meta['source_key']}::0" + yield { + "id": chunk_id, + "text": text, + "metadata": {**meta, "ordinal": 0}, + } + + # ----- Backwards-compat shim for the template's index.py ------------------- # # The template's ``rag.index.page_records`` calls diff --git a/rag/index.py b/rag/index.py index 91bb9412..a453b5ba 100644 --- a/rag/index.py +++ b/rag/index.py @@ -12,6 +12,7 @@ Override via the PRODUCT_NAME env var. from __future__ import annotations import argparse +import json import logging import os import time @@ -21,7 +22,7 @@ from typing import Iterator import chromadb from chromadb.config import Settings -from .chunk import chunks_from_variety +from .chunk import chunks_from_variety, chunks_from_trial from .embeddings import embedding_function log = logging.getLogger(__name__) @@ -37,7 +38,17 @@ COLLECTION = f"{PRODUCT_NAME}_docs" def variety_records() -> Iterator[dict]: """Walk ``corpus//.json``, yield one chunk per - variety.""" + document. + + Dispatches by the sidecar's ``data_type`` field: + - ``"trial"`` → chunks_from_trial (gh_plot_reports, agripro_trials) + - anything else (or absent) → chunks_from_variety (default) + + The output shape (id/text/metadata) is identical for both — only + the chunk text composition and metadata keys differ. Chroma + BM25 + can index both into the same collection; downstream tools filter + by the ``data_type`` metadata field. 
+ """ if not CORPUS.exists(): log.error("corpus/ doesn't exist; run a scraper first") return @@ -45,7 +56,15 @@ def variety_records() -> Iterator[dict]: if not source_dir.is_dir() or source_dir.name.startswith("."): continue for sidecar_path in sorted(source_dir.glob("*.json")): - yield from chunks_from_variety(sidecar_path) + try: + head = json.loads(sidecar_path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError) as exc: + log.warning("skipping unreadable sidecar %s: %s", sidecar_path, exc) + continue + if head.get("data_type") == "trial": + yield from chunks_from_trial(sidecar_path) + else: + yield from chunks_from_variety(sidecar_path) def upsert_to_chroma(records: list[dict]) -> int: diff --git a/scrape/sources/agripro_trials.py b/scrape/sources/agripro_trials.py new file mode 100644 index 00000000..f29a4254 --- /dev/null +++ b/scrape/sources/agripro_trials.py @@ -0,0 +1,483 @@ +"""AgriPro trial-PDF scraper. + +Source: ``agriprowheat.com/trials-data`` — a single page listing +~38 PDF links to regional wheat trial summary documents. Each PDF +is a multi-year multi-location performance test comparing AgriPro +varieties against competitors (LCS, Norwest, PNW, UI, etc.). + +Discovery: walk ``/trials-data``, collect every ``href="*.pdf"``. + +Per-PDF content (parsed via pdfplumber): + - First line: usually the title (e.g. + "2024 Pacific Northwest Combined Summary, Three-Year Data") + - A multi-column table with one row per variety. Columns vary by + PDF but typically include: 3-yr combined yield, 2-yr combined, + most-recent-year yield, plus per-location yields with location + names in the header. + - Footer notes: locations covered, LSD/CV statistical caveats, + copyright. + +Trial PDFs are stable text-extractable (no charts). We capture the +full per-page text verbatim in the chunk body — preserving +variety-name + yield-number adjacency for the embedder — plus +metadata derived from the title (region, year, crop class). 
This is +a deliberate trade-off: perfect table parsing across the PDF +variants would be brittle; verbatim text preserves every data point +and the embedder + BM25 between them can match queries like +"AP Iliad yield Aberdeen Idaho" reliably. + +Output: + corpus/agripro_trials/.md + corpus/agripro_trials/.json + +source_key convention: ``agt-`` lowercased, +e.g. ``agt-2024-pnw-combined``. + +CLI: + python -m scrape.sources.agripro_trials --limit 5 + python -m scrape.sources.agripro_trials --force +""" + +from __future__ import annotations + +import argparse +import io +import json +import logging +import os +import random +import re +import sys +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup +import pdfplumber + +SCRAPER_VERSION = "0.1.0" +USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" +BASE = "https://agriprowheat.com" +LIST_URL = f"{BASE}/trials-data" + +REPO_ROOT = Path(__file__).resolve().parents[2] +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "agripro_trials" + +REQ_INTERVAL_SEC = 1.0 + +log = logging.getLogger("scrape.agripro_trials") + +# Region name patterns we recognize in PDF filenames / titles. The +# value is a human-readable normalized region. +REGION_PATTERNS = ( + (re.compile(r"\bPNW\b|Pacific Northwest", re.I), "Pacific Northwest"), + (re.compile(r"\bNE Colorado\b|Northeast Colorado", re.I), "NE Colorado"), + (re.compile(r"\bSC KS\b|South Central Kansas", re.I), "SC Kansas / N Central OK"), + (re.compile(r"\bWestern Plains\b", re.I), "Western Plains"), + (re.compile(r"\bCentral Plains\b", re.I), "Central Plains"), + (re.compile(r"\bPlains Irrigated\b", re.I), "Plains Irrigated"), + (re.compile(r"\bWashington[/:]?N? *Idaho\b", re.I), "WA / N. 
Idaho"), + (re.compile(r"\bSouthern Idaho\b", re.I), "Southern Idaho"), + (re.compile(r"\bMontana\b", re.I), "Montana"), + (re.compile(r"\bNP Perf Data\b|Northern Plains", re.I), "Northern Plains"), + (re.compile(r"\bWheat after Soy\b", re.I), "Wheat-after-Soy rotation"), +) + + +# --------------------------------------------------------------------- HTTP + + +class RateLimitedSession: + def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: + self.s = requests.Session() + self.s.headers["User-Agent"] = USER_AGENT + self.interval = interval + self._last = 0.0 + + def _wait(self) -> None: + delta = time.monotonic() - self._last + if delta < self.interval: + time.sleep(self.interval - delta) + self._last = time.monotonic() + + def request( + self, + method: str, + url: str, + *, + max_retries: int = 4, + timeout: float = 60.0, + **kw: Any, + ) -> requests.Response: + last_exc: Exception | None = None + for attempt in range(max_retries): + self._wait() + try: + resp = self.s.request(method, url, timeout=timeout, **kw) + except requests.RequestException as exc: + last_exc = exc + backoff = min(30.0, (2 ** attempt) + random.random()) + log.warning("network error on %s %s: %s — retry in %.1fs", + method, url, exc, backoff) + time.sleep(backoff) + continue + if resp.status_code == 429 or 500 <= resp.status_code < 600: + ra = resp.headers.get("Retry-After") + backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) + log.warning("HTTP %d on %s %s — retry in %.1fs", + resp.status_code, method, url, backoff) + time.sleep(backoff) + continue + return resp + if last_exc: + raise last_exc + return resp # type: ignore[return-value] + + def get(self, url: str, **kw: Any) -> requests.Response: + return self.request("GET", url, **kw) + + +# --------------------------------------------------------------------- model + + +@dataclass +class TrialPDF: + source_key: str + source_url: str + pdf_url: str + filename: str + title: str | None = 
None + year: int | None = None + years_covered: list[int] = field(default_factory=list) + region: str | None = None + wheat_class_section: str | None = None # e.g. "Soft White Winter Wheat" — derived from PDF text + page_text: str = "" + varieties_found: list[str] = field(default_factory=list) + + +# --------------------------------------------------------------------- discovery + + +def discover_pdfs(http: RateLimitedSession) -> list[tuple[str, str, str, str]]: + """Return ``[(pdf_url, filename, section_heading, section_anchor), ...]`` + for every PDF on /trials-data. + + De-duplicates by pdf_url — multiple section headings may link to + the same PDF (e.g. a multi-state summary). + """ + log.info("fetching trials index %s", LIST_URL) + r = http.get(LIST_URL) + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + seen: dict[str, tuple[str, str, str, str]] = {} + for a in soup.find_all("a", href=re.compile(r"\.pdf(?:$|\?)", re.I)): + href = a["href"] + from urllib.parse import urljoin + full = urljoin(LIST_URL, href) + fn = href.rsplit("/", 1)[-1] + # Section context — closest preceding h2/h3/h4 + section = "" + parent = a.parent + for _ in range(10): + if parent is None: + break + head = parent.find_previous(["h2", "h3", "h4"]) + if head: + section = head.get_text(strip=True) + break + parent = parent.parent + if full not in seen: + seen[full] = (full, fn, section, href) + out = list(seen.values()) + log.info("trial PDFs found: %d (deduped from %d total links)", + len(out), + sum(1 for a in soup.find_all("a", href=re.compile(r"\.pdf", re.I)))) + return out + + +# --------------------------------------------------------------------- helpers + + +def source_key_for(filename: str) -> str: + """``2024 PNW Combined.pdf`` → ``agt-2024-pnw-combined``.""" + from urllib.parse import unquote + stem = unquote(filename).rsplit(".", 1)[0] + slug = re.sub(r"[^a-zA-Z0-9]+", "-", stem).strip("-").lower() + return f"agt-{slug}" + + +def _detect_region(text: str) -> 
str | None: + for pat, label in REGION_PATTERNS: + if pat.search(text): + return label + return None + + +def _detect_years(text: str) -> list[int]: + """Return sorted years found in the PDF title / first lines. + Filters to 2010-2030 to ignore page numbers / table values.""" + years = sorted({ + int(y) for y in re.findall(r"\b(20[1-3]\d)\b", text[:600]) + }) + return years + + +def _detect_wheat_class_section(text: str) -> str | None: + """The trial PDFs typically have a class label line like + 'Soft White Winter Wheat' near the top of the table.""" + for label in ( + "Hard Red Winter Wheat", "Hard Red Spring Wheat", + "Hard White Spring Wheat", "Hard White Winter Wheat", + "Soft White Winter Wheat", "Soft White Spring Wheat", + "Soft Red Winter Wheat", "Durum", + ): + if re.search(r"\b" + re.escape(label) + r"\b", text[:1500], re.I): + return label + return None + + +# Variety name patterns we expect to see in AgriPro trial PDFs. +# AgriPro varieties = AP , SY ; competitors include +# LCS , UI , PNW , Norwest . 
+_VARIETY_LINE_RE = re.compile( + r"^(?:AP|SY|LCS|UI|PNW|Norwest|WB|Stine|Pioneer)\b[A-Za-z0-9 \-+]*", +) + + +def _detect_varieties(text: str) -> list[str]: + out: list[str] = [] + seen: set[str] = set() + for line in text.splitlines(): + line = line.strip() + if not line: + continue + m = _VARIETY_LINE_RE.match(line) + if m: + # Up to first run of digits / spaces — variety name only + name_match = re.match(r"^([A-Za-z][A-Za-z0-9 \-+]*?)\s+\d", line) + name = name_match.group(1).strip() if name_match else m.group(0).strip() + # Trim trailing single tokens that are clearly stats + if name and name not in seen and len(name) <= 40: + seen.add(name) + out.append(name) + return out + + +# --------------------------------------------------------------------- detail + + +def fetch_pdf_detail( + http: RateLimitedSession, + pdf_url: str, + filename: str, +) -> TrialPDF | None: + """Download + parse one trial PDF.""" + r = http.get(pdf_url) + if r.status_code == 404: + return None + r.raise_for_status() + try: + with pdfplumber.open(io.BytesIO(r.content)) as pdf: + pages_text = [] + for p in pdf.pages: + t = p.extract_text() or "" + pages_text.append(t) + text = "\n\n".join(pages_text).strip() + except Exception as exc: # noqa: BLE001 + log.warning("PDF parse failed for %s: %s", pdf_url, exc) + return None + + title = "" + if text: + # First non-empty line is usually the title. 
+ for line in text.splitlines(): + line = line.strip() + if line: + title = line + break + + region = _detect_region(filename) or _detect_region(title or "") + years = _detect_years(title + "\n" + filename) + wheat_class_section = _detect_wheat_class_section(text) + varieties = _detect_varieties(text) + + return TrialPDF( + source_key=source_key_for(filename), + source_url=LIST_URL, + pdf_url=pdf_url, + filename=filename, + title=title or None, + year=years[-1] if years else None, + years_covered=years, + region=region, + wheat_class_section=wheat_class_section, + page_text=text, + varieties_found=varieties, + ) + + +# --------------------------------------------------------------------- render + + +def render_markdown(p: TrialPDF) -> str: + head: list[str] = [ + f"# {p.title or p.filename}", + "", + "- **Source:** AgriPro (Syngenta) regional trial PDF", + "- **Vendor:** Syngenta", + "- **Brand:** AgriPro", + "- **Crop:** Wheat", + "- **Data type:** trial", + ] + if p.region: + head.append(f"- **Region:** {p.region}") + if p.wheat_class_section: + head.append(f"- **Wheat class:** {p.wheat_class_section}") + if p.year: + head.append(f"- **Year:** {p.year}") + if p.years_covered and len(p.years_covered) > 1: + head.append(f"- **Years covered:** {p.years_covered[0]}–{p.years_covered[-1]}") + head.append(f"- **PDF:** {p.pdf_url}") + head.append(f"- **Index page:** {p.source_url}") + if p.varieties_found: + head.append( + f"- **Varieties listed:** {', '.join(p.varieties_found[:30])}" + + ("…" if len(p.varieties_found) > 30 else "") + ) + head.append("") + head.append("---") + head.append("") + head.append("## Trial data (verbatim from PDF)") + head.append("") + head.append("```") + head.append(p.page_text) + head.append("```") + return "\n".join(head) + + +# --------------------------------------------------------------------- write + + +def write_pdf(prod: TrialPDF, body_md: str) -> None: + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + md_path = CORPUS_DIR / 
f"{prod.source_key}.md" + json_path = CORPUS_DIR / f"{prod.source_key}.json" + + md_path.write_text(body_md, encoding="utf-8") + sidecar = { + "source": "agripro_trials", + "source_key": prod.source_key, + "data_type": "trial", + "vendor": "Syngenta", + "brand": "AgriPro", + "crop": "wheat", + "title": prod.title, + "filename": prod.filename, + "region": prod.region, + "wheat_class_section": prod.wheat_class_section, + "year": prod.year, + "years_covered": prod.years_covered, + "varieties_found": prod.varieties_found, + "pdf_url": prod.pdf_url, + "source_urls": [prod.source_url, prod.pdf_url], + "page_text_chars": len(prod.page_text), + "fetched_at": datetime.now(timezone.utc).isoformat(), + "scraper_version": SCRAPER_VERSION, + } + json_path.write_text( + json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + +# --------------------------------------------------------------------- pipeline + + +def process_pdf( + http: RateLimitedSession, + *, + pdf_url: str, + filename: str, + force: bool, +) -> tuple[str, TrialPDF | None]: + sk = source_key_for(filename) + md_path = CORPUS_DIR / f"{sk}.md" + if md_path.exists() and not force: + return "skipped", None + try: + prod = fetch_pdf_detail(http, pdf_url, filename) + except Exception as exc: # noqa: BLE001 + log.error("PDF fetch/parse failed for %s: %s", pdf_url, exc) + return "failed", None + if prod is None: + return "missing", None + body = render_markdown(prod) + write_pdf(prod, body) + return "written", prod + + +def run(*, limit: int | None, force: bool) -> int: + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + http = RateLimitedSession() + targets = discover_pdfs(http) + + counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0} + processed = 0 + for pdf_url, filename, _section, _href in targets: + if limit is not None and processed >= limit: + break + processed += 1 + status, prod = process_pdf( + http, pdf_url=pdf_url, filename=filename, force=force, + ) + counts[status] = 
counts.get(status, 0) + 1 + log.info( + "[%d/%d] %s %s | region=%s year=%s varieties=%d chars=%d", + processed, len(targets), + source_key_for(filename), status, + (prod.region if prod else "-") or "-", + prod.year if prod else "-", + len(prod.varieties_found) if prod else 0, + len(prod.page_text) if prod else 0, + ) + + log.info( + "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d PDFs)", + processed, counts["written"], counts["skipped"], + counts["missing"], counts["failed"], len(targets), + ) + return 0 if counts["failed"] == 0 else 1 + + +# --------------------------------------------------------------------- CLI + + +def _build_argparser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="scrape.sources.agripro_trials", + description="Scrape AgriPro regional trial PDFs.", + ) + p.add_argument("--limit", type=int, default=None, + help="Stop after processing N PDFs (default: all).") + p.add_argument("--force", action="store_true", + help="Re-fetch even if the markdown file already exists.") + p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) + return p + + +def main(argv: list[str] | None = None) -> int: + args = _build_argparser().parse_args(argv) + logging.basicConfig( + level=args.log_level.upper(), + format="%(asctime)s %(levelname)s %(name)s %(message)s", + stream=sys.stderr, + ) + return run(limit=args.limit, force=args.force) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scrape/sources/gh_plot_reports.py b/scrape/sources/gh_plot_reports.py new file mode 100644 index 00000000..fa026598 --- /dev/null +++ b/scrape/sources/gh_plot_reports.py @@ -0,0 +1,781 @@ +"""Golden Harvest plot-report scraper — cross-vendor yield trials. + +This is the FIRST source in the seed-mcp corpus with ``data_type: +"trial"`` rather than the per-variety identity records all other +scrapers emit. 
Each document is one head-to-head yield trial at a +specific state/year/site, comparing products across brands (NK, +DEKALB, Golden Harvest, sometimes Pioneer/Channel etc. listed as +competitor entries) — i.e. **third-party-feeling cross-vendor data +that Bayer doesn't publish itself**. + +Source: ``goldenharvestseeds.com`` — same site as ``golden_harvest`` +variety scraper. ``/sitemap-ghs-hybrids.xml`` (already walked for +the variety scraper) lists 8,237 plot reports across: + + Year Corn Soy Silage Total + 2023 1,832 1,614 173 3,619 + 2024 1,432 1,277 137 2,846 + 2025 973 703 96 1,772 + +Initial scrape: 2024 + 2025 (4,618 reports). 2023 is older data +that's still informative but lower priority. Defer 2023 to a later +backfill pass via ``--include-2023``. + +URL shape: + /<crop>/plot-report/<state>/<year>/<plot_id> + e.g. /corn/plot-report/al/2023/2374765 + +Per-report data (server-rendered HTML): + - Cooperator name (h1 area) + - State (full name, e.g. "Alabama") + - Planted date / Harvested date + - Population (seeds/acre), Row Width + - One <table> with columns: + Rank | Brand | Product | Traits | Yield (BU/Acre) | %MST | + Test Weight | Gross Revenue | Entry # + +Each row in the results table can be from any seed brand — the +trial is the test, not the catalog. Brand and product are the join +keys back to the per-variety corpus (lookup_variety can pull the +identity record if we have the same brand/product). + +Output: + corpus/gh_plot_reports/<source_key>.md LLM-visible body + corpus/gh_plot_reports/<source_key>.json sidecar metadata + +source_key convention: ``ghpr-<crop>-<state>-<year>-<plot_id>`` +e.g. ``ghpr-corn-al-2023-2374765``. 
+ +CLI: + python -m scrape.sources.gh_plot_reports --limit 5 + python -m scrape.sources.gh_plot_reports --crop corn --state ia --year 2024 + python -m scrape.sources.gh_plot_reports --include-2023 --force +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import random +import re +import sys +import time +from dataclasses import dataclass, field +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup + +SCRAPER_VERSION = "0.1.0" +USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)" +BASE = "https://www.goldenharvestseeds.com" +SITEMAP_HYBRIDS = f"{BASE}/sitemap-ghs-hybrids.xml" + +REPO_ROOT = Path(__file__).resolve().parents[2] +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_DIR = CORPUS_ROOT / "gh_plot_reports" + +REQ_INTERVAL_SEC = 1.0 + +log = logging.getLogger("scrape.gh_plot_reports") + +# State name normalization: URL gives a 2-letter abbrev; sidecar keeps +# both forms so search filters can use either. 
+STATE_NAMES = { + "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas", + "ca": "California", "co": "Colorado", "ct": "Connecticut", + "de": "Delaware", "fl": "Florida", "ga": "Georgia", "hi": "Hawaii", + "id": "Idaho", "il": "Illinois", "in": "Indiana", "ia": "Iowa", + "ks": "Kansas", "ky": "Kentucky", "la": "Louisiana", "me": "Maine", + "md": "Maryland", "ma": "Massachusetts", "mi": "Michigan", + "mn": "Minnesota", "ms": "Mississippi", "mo": "Missouri", + "mt": "Montana", "ne": "Nebraska", "nv": "Nevada", "nh": "New Hampshire", + "nj": "New Jersey", "nm": "New Mexico", "ny": "New York", + "nc": "North Carolina", "nd": "North Dakota", "oh": "Ohio", + "ok": "Oklahoma", "or": "Oregon", "pa": "Pennsylvania", + "ri": "Rhode Island", "sc": "South Carolina", "sd": "South Dakota", + "tn": "Tennessee", "tx": "Texas", "ut": "Utah", "vt": "Vermont", + "va": "Virginia", "wa": "Washington", "wv": "West Virginia", + "wi": "Wisconsin", "wy": "Wyoming", +} + + +# --------------------------------------------------------------------- HTTP + + +class RateLimitedSession: + def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None: + self.s = requests.Session() + self.s.headers["User-Agent"] = USER_AGENT + self.interval = interval + self._last = 0.0 + + def _wait(self) -> None: + delta = time.monotonic() - self._last + if delta < self.interval: + time.sleep(self.interval - delta) + self._last = time.monotonic() + + def request( + self, + method: str, + url: str, + *, + max_retries: int = 4, + timeout: float = 30.0, + **kw: Any, + ) -> requests.Response: + last_exc: Exception | None = None + for attempt in range(max_retries): + self._wait() + try: + resp = self.s.request(method, url, timeout=timeout, **kw) + except requests.RequestException as exc: + last_exc = exc + backoff = min(30.0, (2 ** attempt) + random.random()) + log.warning("network error on %s %s: %s — retry in %.1fs", + method, url, exc, backoff) + time.sleep(backoff) + continue + if resp.status_code == 
429 or 500 <= resp.status_code < 600: + ra = resp.headers.get("Retry-After") + backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random()) + log.warning("HTTP %d on %s %s — retry in %.1fs", + resp.status_code, method, url, backoff) + time.sleep(backoff) + continue + return resp + if last_exc: + raise last_exc + return resp # type: ignore[return-value] + + def get(self, url: str, **kw: Any) -> requests.Response: + return self.request("GET", url, **kw) + + +# --------------------------------------------------------------------- model + + +@dataclass +class TrialResult: + rank: int | None = None + brand: str = "" + product: str = "" + traits: str = "" + # Generic per-column metrics — keyed by the header from the table + # (e.g. "Yield" / "%MST" / "Ton/Acre" / "Milk Per Acre" / + # "Beef Per Ton"). Corn + soy use Yield/MST/Test Weight/Gross + # Revenue; silage uses Ton/Acre + Milk + Beef columns. Storing as + # an open dict keeps the scraper robust across crop types. + metrics: dict[str, float | str | None] = field(default_factory=dict) + entry_num: int | None = None + + # Convenience accessors — back-compat for the chunker that looks + # up these specific keys. + @property + def yield_bu_ac(self) -> float | None: + v = self.metrics.get("Yield") + return v if isinstance(v, (int, float)) else None + + @property + def mst_pct(self) -> float | None: + v = self.metrics.get("%MST") + return v if isinstance(v, (int, float)) else None + + @property + def test_weight(self) -> float | None: + v = self.metrics.get("Test Weight") + return v if isinstance(v, (int, float)) else None + + @property + def gross_revenue_dol_ac(self) -> float | None: + v = self.metrics.get("Gross Revenue") + return v if isinstance(v, (int, float)) else None + + @property + def primary_metric(self) -> tuple[str, float | None]: + """The first numeric metric — used as the canonical 'yield' + for ranking in the chunk preamble. Corn/soy: Yield (BU/Ac). 
+ Silage: Ton/Acre.""" + for k in ("Yield", "Ton/Acre", "Tons/Acre"): + v = self.metrics.get(k) + if isinstance(v, (int, float)): + return (k, v) + # Fallback to first numeric metric + for k, v in self.metrics.items(): + if isinstance(v, (int, float)): + return (k, v) + return ("", None) + + +@dataclass +class PlotReport: + source_key: str + source_url: str + crop: str # "corn" / "soybeans" / "silage" + state_abbrev: str # "al" + state_name: str # "Alabama" + year: int + plot_id: str + + cooperator: str | None = None + planted_date: str | None = None # ISO date + harvested_date: str | None = None # ISO date + population: int | None = None + row_width: int | None = None + + results: list[TrialResult] = field(default_factory=list) + + +# --------------------------------------------------------------------- discovery + + +_PLOT_URL_RE = re.compile( + r".*?/(?Pcorn|soybean|silage)/plot-report/" + r"(?P[a-z]{2})/(?P\d{4})/(?P\d+)" +) + + +def discover_plots( + http: RateLimitedSession, + *, + crops: set[str], + states: set[str] | None, + years: set[int], +) -> list[tuple[str, str, str, int, str]]: + """Walk the hybrids sitemap and return matching plot URLs as + ``[(url, crop, state, year, plot_id), ...]`` tuples. ``crop`` is + normalized to the schema's terms (soybean → soybeans).""" + log.info("fetching sitemap %s", SITEMAP_HYBRIDS) + r = http.get(SITEMAP_HYBRIDS) + r.raise_for_status() + entries = re.findall(r"([^<]+)", r.text) + log.info("sitemap parsed: %d total locs", len(entries)) + + out: list[tuple[str, str, str, int, str]] = [] + for url in entries: + m = _PLOT_URL_RE.match(url) + if not m: + continue + crop_url = m.group("crop") + # Normalize "soybean" → "soybeans" to match the rest of the corpus. 
+ crop = "soybeans" if crop_url == "soybean" else crop_url + state = m.group("state").lower() + year = int(m.group("year")) + plot = m.group("plot") + if crops and crop not in crops: + continue + if states and state not in states: + continue + if years and year not in years: + continue + out.append((url, crop, state, year, plot)) + + log.info("after filters: %d plot URLs", len(out)) + return out + + +# --------------------------------------------------------------------- helpers + + +def source_key_for(crop: str, state: str, year: int, plot_id: str) -> str: + return f"ghpr-{crop}-{state}-{year}-{plot_id}" + + +def _parse_date_mdy(s: str) -> str | None: + """``04/06/23`` → ``2023-04-06``. Two-digit years are assumed to + be 20xx (sane for current-century trial data).""" + s = (s or "").strip() + m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", s) + if not m: + return None + mo, dy, yr = m.group(1), m.group(2), m.group(3) + if len(yr) == 2: + yr = "20" + yr + try: + return f"{int(yr):04d}-{int(mo):02d}-{int(dy):02d}" + except ValueError: + return None + + +def _parse_int(s: str | None) -> int | None: + if not s: + return None + s = re.sub(r"[,$]", "", str(s).strip()) + try: + return int(s) + except ValueError: + return None + + +def _parse_float(s: str | None) -> float | None: + if not s: + return None + s = re.sub(r"[,$]", "", str(s).strip()) + try: + return float(s) + except ValueError: + return None + + +# --------------------------------------------------------------------- detail + + +def fetch_plot_detail( + http: RateLimitedSession, + url: str, + crop: str, + state: str, + year: int, + plot_id: str, +) -> PlotReport | None: + """Fetch one plot-report page and parse it.""" + r = http.get(url) + if r.status_code == 404: + return None + r.raise_for_status() + soup = BeautifulSoup(r.text, "html.parser") + + prod = PlotReport( + source_key=source_key_for(crop, state, year, plot_id), + source_url=url, + crop=crop, + state_abbrev=state, + 
state_name=STATE_NAMES.get(state, state.upper()), + year=year, + plot_id=plot_id, + ) + + # Pull metadata from the header area. The page renders cooperator + # name + state + key fields as text following the h1. + h1 = soup.find("h1") + if h1: + # Walk up to a parent that includes the metadata strip + container = h1.parent + while container is not None and not container.find("table"): + parent = container.parent + if parent is None: + break + container = parent + if container: + text = container.get_text(" | ", strip=True) + # Cooperator is usually the segment right after the H1. + # Pattern: "Corn Plot Results | | | Planted: | ..." + parts = [p.strip() for p in text.split("|") if p.strip()] + # Drop the title segment + if parts and parts[0].lower().startswith(("corn plot", "soybean plot", "silage plot")): + parts = parts[1:] + if parts: + # First segment that doesn't match a state name is the cooperator + cand = parts[0] + if cand and cand != prod.state_name and not cand.endswith(":"): + prod.cooperator = cand + + # Walk the page text for known labeled fields. + page_text = soup.get_text(" ", strip=True) + m = re.search(r"Planted:\s*(\d{1,2}/\d{1,2}/\d{2,4})", page_text) + if m: + prod.planted_date = _parse_date_mdy(m.group(1)) + m = re.search(r"Harvested:\s*(\d{1,2}/\d{1,2}/\d{2,4})", page_text) + if m: + prod.harvested_date = _parse_date_mdy(m.group(1)) + m = re.search(r"Population:\s*([\d,]+)", page_text) + if m: + prod.population = _parse_int(m.group(1)) + m = re.search(r"Row Width:\s*(\d+)", page_text) + if m: + prod.row_width = _parse_int(m.group(1)) + + # Parse the results table. The HTML uses ONE merged cell for + # "Brand Product Traits" (despite the header containing all + # three labels); subsequent cells are Yield, %MST, Test Weight, + # Gross Revenue, Entry #. We split the merged cell using a + # known-brand prefix match. 
+ table = soup.find("table") + if not table: + return prod + rows = table.find_all("tr") + if not rows: + return prod + + header_cells = [c.get_text(" ", strip=True) for c in rows[0].find_all(["th", "td"])] + + def col_idx(*names: str) -> int | None: + for n in names: + for i, h in enumerate(header_cells): + if n.lower() in h.lower(): + return i + return None + + # Position of the merged identity cell, by header containing "Brand". + i_identity = col_idx("Brand") + i_rank = col_idx("Rank") + i_entry = col_idx("Entry") + + # Build a list of (header, index) for the OTHER columns (the + # metric columns). Skips Rank, Brand-merge-cell, and Entry #. + metric_columns: list[tuple[str, int]] = [] + skip_idx = {i_identity, i_rank, i_entry} + for i, h in enumerate(header_cells): + if i in skip_idx: + continue + h_clean = h.strip() + if h_clean: + metric_columns.append((h_clean, i)) + + for row in rows[1:]: + cells = [c.get_text(" ", strip=True) for c in row.find_all(["td", "th"])] + if len(cells) < 2: + continue + def cell(i: int | None) -> str: + return cells[i] if i is not None and 0 <= i < len(cells) else "" + + identity = cell(i_identity).strip() + if any(k in identity.lower() for k in ("plot average", "trial average", "average")): + continue + + brand, product, traits = _split_identity(identity) + + # Collect every metric column verbatim. Numeric where parseable, + # else preserve the raw string (e.g. "ns" for not-significant). 
+ metrics: dict[str, float | str | None] = {} + for h, idx in metric_columns: + raw = cell(idx).strip() + if not raw or raw == "-": + metrics[h] = None + else: + f = _parse_float(raw) + metrics[h] = f if f is not None else raw + + result = TrialResult( + rank=_parse_int(cell(i_rank)), + brand=brand, + product=product, + traits=traits, + metrics=metrics, + entry_num=_parse_int(cell(i_entry)), + ) + has_data = result.brand or result.product or any( + v is not None for v in metrics.values() + ) + if has_data: + prod.results.append(result) + + return prod + + +# Known seed brands that can appear in plot-report identity cells. +# Sorted longest-first so multi-word brands match before sub-strings. +_BRAND_NAMES = ( + "Golden Harvest", "WestBred", "AgriPro", "DEKALB", "Pioneer", + "Channel", "Asgrow", "NK", "Becks", "Beck's", "Brevant", + "Stine", "Renk", "Wyffels", "LG Seeds", "Croplan", "FS", + "Local Choice", "Mycogen", "AgriGold", "Hoegemeyer", +) +_BRAND_RE = re.compile( + r"^(?:" + "|".join(re.escape(b) for b in _BRAND_NAMES) + r")\b", + re.I, +) + + +def _split_identity(identity: str) -> tuple[str, str, str]: + """Split a plot-report identity cell into ``(brand, product, traits)``. + + The HTML emits one merged cell like "NK NK1748-3110 Agrisure ®" + or "Golden Harvest G16Q82-DV DuracadeViptera™" or just + "DEKALB DKC65-20". We: + + 1. Match the brand against a known-brand list at the start. + 2. The token immediately after the brand is the product. + 3. Anything remaining is the trait stack (free text). + """ + if not identity: + return "", "", "" + s = identity.strip() + m = _BRAND_RE.match(s) + if not m: + # Unknown brand prefix — best-effort: first token is brand, + # second is product, rest is traits. 
+ parts = s.split(maxsplit=2) + if len(parts) == 1: + return parts[0], "", "" + if len(parts) == 2: + return parts[0], parts[1], "" + return parts[0], parts[1], parts[2] + brand = m.group(0) + rest = s[len(brand):].strip() + parts = rest.split(maxsplit=1) + product = parts[0] if parts else "" + traits = parts[1].strip() if len(parts) > 1 else "" + return brand, product, traits + + +# --------------------------------------------------------------------- render + + +def render_markdown(p: PlotReport) -> str: + crop_label = { + "corn": "Corn", "soybeans": "Soybean", "silage": "Silage", + }.get(p.crop, p.crop.title()) + + head: list[str] = [ + f"# {crop_label} yield trial — {p.state_name}, {p.year}", + "", + f"- **Source:** Golden Harvest plot report (cross-vendor head-to-head)", + f"- **Crop:** {crop_label}", + f"- **State:** {p.state_name} ({p.state_abbrev.upper()})", + f"- **Year:** {p.year}", + f"- **Plot ID:** {p.plot_id}", + ] + if p.cooperator: + head.append(f"- **Cooperator:** {p.cooperator}") + if p.planted_date: + head.append(f"- **Planted:** {p.planted_date}") + if p.harvested_date: + head.append(f"- **Harvested:** {p.harvested_date}") + if p.population: + head.append(f"- **Population:** {p.population:,} seeds/acre") + if p.row_width: + head.append(f"- **Row width:** {p.row_width}\"") + head.append(f"- **URL:** {p.source_url}") + head.append("") + head.append("---") + head.append("") + + sections: list[str] = [] + if p.results: + # Discover all metric columns present across results, in + # first-seen order. This keeps corn (Yield/MST/...) and silage + # (Ton/Acre/Milk/Beef) using their own header sets. 
+ metric_keys: list[str] = [] + seen_keys: set[str] = set() + for r in p.results: + for k in r.metrics.keys(): + if k not in seen_keys: + seen_keys.add(k) + metric_keys.append(k) + + sections.append("## Results (top-down by rank)") + sections.append("") + header_cells = ["Rank", "Brand", "Product", "Traits"] + metric_keys + sections.append("| " + " | ".join(header_cells) + " |") + sections.append("|" + "|".join(["---"] * len(header_cells)) + "|") + for r in p.results: + row = [ + str(r.rank) if r.rank is not None else "-", + r.brand or "-", + r.product or "-", + r.traits or "-", + ] + for k in metric_keys: + v = r.metrics.get(k) + if v is None: + row.append("-") + elif isinstance(v, (int, float)): + # Dollar columns rendered with $ prefix + if "Revenue" in k or "$" in k: + row.append(f"${v:.2f}") + else: + row.append(str(v)) + else: + row.append(str(v)) + sections.append("| " + " | ".join(row) + " |") + sections.append("") + + # Compact text summary for embedder signal — uses the primary + # metric (Yield for corn/soy, Ton/Acre for silage). 
+ top = p.results[: min(5, len(p.results))] + if top: + primary_label, _ = top[0].primary_metric + if primary_label: + summary = ", ".join( + f"{r.product or '?'} ({r.brand or '?'}) {r.primary_metric[1]}" + for r in top + if r.primary_metric[1] is not None + ) + if summary: + sections.append(f"Top {len(top)} by {primary_label}: {summary}.") + sections.append("") + + return "\n".join(head) + "\n".join(sections) + + +# --------------------------------------------------------------------- write + + +def write_plot(prod: PlotReport, body_md: str) -> None: + CORPUS_DIR.mkdir(parents=True, exist_ok=True) + md_path = CORPUS_DIR / f"{prod.source_key}.md" + json_path = CORPUS_DIR / f"{prod.source_key}.json" + + md_path.write_text(body_md, encoding="utf-8") + sidecar = { + "source": "gh_plot_reports", + "source_key": prod.source_key, + "data_type": "trial", + "vendor": "Syngenta", # Golden Harvest publishes the trial + "brand": "Golden Harvest", + "crop": prod.crop, + "state": prod.state_name, + "state_abbrev": prod.state_abbrev, + "year": prod.year, + "plot_id": prod.plot_id, + "cooperator": prod.cooperator, + "planted_date": prod.planted_date, + "harvested_date": prod.harvested_date, + "population_seeds_per_acre": prod.population, + "row_width_in": prod.row_width, + "results": [ + { + "rank": r.rank, + "brand": r.brand, + "product": r.product, + "traits": r.traits, + # All per-column metrics verbatim. Corn/soy: Yield, + # %MST, Test Weight, Gross Revenue. Silage: Ton/Acre, + # Milk Per Acre, Milk Per Ton, Beef Per Acre, Beef Per + # Ton. (Plus any other column the source publishes.) 
def process_plot(
    http: RateLimitedSession,
    *,
    url: str,
    crop: str,
    state: str,
    year: int,
    plot_id: str,
    force: bool,
) -> tuple[str, PlotReport | None]:
    """Fetch, render and write a single plot report.

    Args:
        http: Session handed through to ``fetch_plot_detail``.
        url: Detail-page URL of the plot report.
        crop, state, year, plot_id: Identify the plot; together they form the
            source key used for the output file names.
        force: When true, process the plot even if its markdown file exists.

    Returns:
        ``(status, report)`` where ``status`` is one of:

        * ``"skipped"`` - markdown already exists and ``force`` is false
        * ``"failed"``  - the detail fetch raised (the error is logged)
        * ``"missing"`` - the detail fetch returned ``None``
        * ``"written"`` - files were written; ``report`` is the parsed plot

        ``report`` is ``None`` for every status except ``"written"``.
    """
    key = source_key_for(crop, state, year, plot_id)
    already_on_disk = (CORPUS_DIR / f"{key}.md").exists()
    if already_on_disk and not force:
        return "skipped", None

    try:
        prod = fetch_plot_detail(http, url, crop, state, year, plot_id)
    except Exception as exc:  # noqa: BLE001
        log.error("detail fetch failed for %s: %s", url, exc)
        return "failed", None

    if prod is None:
        return "missing", None

    write_plot(prod, render_markdown(prod))
    return "written", prod


def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_state: str | None,
    only_year: int | None,
    include_2023: bool,
) -> int:
    """Discover plot reports and process them one by one.

    Args:
        limit: Stop after this many targets have been processed; ``None``
            processes every discovered target.
        force: Forwarded to ``process_plot`` (re-process existing plots).
        only_crop: Restrict discovery to one crop; otherwise corn, soybeans
            and silage are all requested.
        only_state: Restrict discovery to one state; otherwise no state
            filter is passed.
        only_year: Restrict discovery to one year. Takes precedence over
            ``include_2023``.
        include_2023: Request 2023 in addition to 2024 and 2025.

    Returns:
        ``0`` when no plot ended in the ``"failed"`` state, otherwise ``1``.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    http = RateLimitedSession()

    crops = {only_crop} if only_crop else {"corn", "soybeans", "silage"}
    states = {only_state} if only_state else None
    if only_year:
        years = {only_year}
    elif include_2023:
        years = {2023, 2024, 2025}
    else:
        years = {2024, 2025}

    targets = discover_plots(http, crops=crops, states=states, years=years)

    # Number of plots this run will actually visit. A --limit larger than the
    # candidate list must not inflate the progress denominator.
    planned = len(targets) if limit is None else min(limit, len(targets))

    counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0}
    processed = 0
    for url, crop, state, year, plot_id in targets:
        if limit is not None and processed >= limit:
            break
        processed += 1
        status, prod = process_plot(
            http, url=url, crop=crop, state=state, year=year,
            plot_id=plot_id, force=force,
        )
        counts[status] = counts.get(status, 0) + 1

        # Progress sampling: the first few plots that produced a report, plus
        # every 100th plot regardless of outcome (hence the None-safe
        # arguments below). Parenthesised to make the and/or grouping explicit.
        early_success = prod is not None and processed <= 5
        if early_success or processed % 100 == 0:
            log.info(
                "[%d/%s] %s %s | results=%d coop=%s",
                processed, planned,
                source_key_for(crop, state, year, plot_id), status,
                len(prod.results) if prod else 0,
                (prod.cooperator if prod else "-") or "-",
            )

    log.info(
        "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)",
        processed, counts["written"], counts["skipped"],
        counts["missing"], counts["failed"], len(targets),
    )
    return 0 if counts["failed"] == 0 else 1
counts.get(status, 0) + 1 + if prod is not None and processed <= 5 or processed % 100 == 0: + log.info( + "[%d/%s] %s %s | results=%d coop=%s", + processed, str(limit) if limit else len(targets), + source_key_for(crop, state, year, plot_id), status, + len(prod.results) if prod else 0, + (prod.cooperator if prod else "-") or "-", + ) + + log.info( + "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)", + processed, counts["written"], counts["skipped"], + counts["missing"], counts["failed"], len(targets), + ) + return 0 if counts["failed"] == 0 else 1 + + +# --------------------------------------------------------------------- CLI + + +def _build_argparser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser( + prog="scrape.sources.gh_plot_reports", + description="Scrape Golden Harvest cross-vendor plot reports (yield trials).", + ) + p.add_argument("--limit", type=int, default=None, + help="Stop after processing N plots (default: all).") + p.add_argument("--force", action="store_true", + help="Re-fetch even if the markdown file already exists.") + p.add_argument("--crop", default=None, + choices=("corn", "soybeans", "silage"), + help="Limit to one crop.") + p.add_argument("--state", default=None, + help="Limit to one state (2-letter abbrev: ia, il, ne, ...).") + p.add_argument("--year", type=int, default=None, choices=(2023, 2024, 2025), + help="Limit to one year.") + p.add_argument("--include-2023", action="store_true", + help="Include 2023 plot reports (default: 2024-2025 only).") + p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO")) + return p + + +def main(argv: list[str] | None = None) -> int: + args = _build_argparser().parse_args(argv) + logging.basicConfig( + level=args.log_level.upper(), + format="%(asctime)s %(levelname)s %(name)s %(message)s", + stream=sys.stderr, + ) + return run( + limit=args.limit, + force=args.force, + only_crop=args.crop, + only_state=args.state.lower() if args.state else 
None, + only_year=args.year, + include_2023=args.include_2023, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/sources.json b/sources.json index 8ad18a31..3ca18f8d 100644 --- a/sources.json +++ b/sources.json @@ -5,8 +5,16 @@ { "name": "bayer_seeds", "vendor": "Bayer", - "brands": ["DEKALB", "Asgrow", "WestBred"], - "crops": ["corn", "soybeans", "wheat"], + "brands": [ + "DEKALB", + "Asgrow", + "WestBred" + ], + "crops": [ + "corn", + "soybeans", + "wheat" + ], "verdict": "green", "expected_count": 475, "base_url": "https://cropscience.bayer.us", @@ -17,65 +25,124 @@ { "name": "golden_harvest", "vendor": "Syngenta", - "brands": ["Golden Harvest"], - "crops": ["corn", "soybeans"], + "brands": [ + "Golden Harvest" + ], + "crops": [ + "corn", + "soybeans" + ], "verdict": "green", "expected_count": 175, "base_url": "https://www.goldenharvestseeds.com", "scope_filter": "All sitemap-listed corn + soybean varieties.", "tos_check_date": "2026-05-25", - "schema_notes": "Disease ratings published on 9-to-1 scale (9 = best). Normalize to 1-9 (9 = best) at chunk time to match Bayer/NK/AgriPro convention. Note original direction in chunk_0 preamble. Tech-sheet PDF URLs in the sitemap are stale (250331) — resolve live URL from product HTML, not sitemap entry." + "schema_notes": "Disease ratings published on 9-to-1 scale (9 = best). Normalize to 1-9 (9 = best) at chunk time to match Bayer/NK/AgriPro convention. Note original direction in chunk_0 preamble. Tech-sheet PDF URLs in the sitemap are stale (250331) \u2014 resolve live URL from product HTML, not sitemap entry." }, { "name": "nk", "vendor": "Syngenta", - "brands": ["NK"], - "crops": ["corn", "soybeans"], + "brands": [ + "NK" + ], + "crops": [ + "corn", + "soybeans" + ], "verdict": "green", "expected_count": 29, "base_url": "https://www.syngenta-us.com", "pdf_cdn": "https://assets.syngentaebiz.com/pdf/techsheets/", "scope_filter": "All NK corn + soy varieties. 
No wheat (NK doesn't sell wheat in US).", "tos_check_date": "2026-05-24", - "schema_notes": "Disease + agronomic ratings live in tech-sheet PDFs only — need pdfplumber. PDF URLs share format `_YYMMDD.pdf` with Golden Harvest, so the same fetcher works for both." + "schema_notes": "Disease + agronomic ratings live in tech-sheet PDFs only \u2014 need pdfplumber. PDF URLs share format `_YYMMDD.pdf` with Golden Harvest, so the same fetcher works for both." }, { "name": "agripro", "vendor": "Syngenta", - "brands": ["AgriPro"], - "crops": ["wheat", "barley"], + "brands": [ + "AgriPro" + ], + "crops": [ + "wheat", + "barley" + ], "verdict": "green", "expected_count": 24, "base_url": "https://www.agriprowheat.com", - "scope_filter": "All wheat classes (HRW/HRS/HWS/SWW/SWS) + barley. NO SRW — Syngenta's SRW lives at GrowProGenetics.com under a separate brand.", + "scope_filter": "All wheat classes (HRW/HRS/HWS/SWW/SWS) + barley. NO SRW \u2014 Syngenta's SRW lives at GrowProGenetics.com under a separate brand.", "tos_check_date": "2026-05-24", "schema_notes": "Drupal Views form; server-rendered HTML. CoAXium trait flag is implicit in product family; Clearfield/CL2 trait IS in this catalog." }, { "name": "becks_pfr", "vendor": "Beck's Hybrids", - "brands": ["Beck's PFR"], - "crops": ["corn", "soybeans", "wheat"], + "brands": [ + "Beck's PFR" + ], + "crops": [ + "corn", + "soybeans", + "wheat" + ], "verdict": "yellow", "expected_count": 2089, "base_url": "https://www.beckshybrids.com", "api_base": "https://mc8v24rf.api.sanity.io", - "scope_filter": "All Practical Farm Research publications since 2015. PFR is head-to-head agronomy trials — fungicide timing, planting-date studies, hybrid-by-population, etc.", + "scope_filter": "All Practical Farm Research publications since 2015. 
PFR is head-to-head agronomy trials \u2014 fungicide timing, planting-date studies, hybrid-by-population, etc.", "tos_check_date": "2026-05-24", - "schema_notes": "Public Sanity GROQ API, no auth required. Records have title/year/crop/key-findings/full-text. Treat PFR docs as a research corpus, not variety records — the chunk_0 includes the study's tl;dr finding." + "schema_notes": "Public Sanity GROQ API, no auth required. Records have title/year/crop/key-findings/full-text. Treat PFR docs as a research corpus, not variety records \u2014 the chunk_0 includes the study's tl;dr finding." }, { "name": "becks_products", "vendor": "Beck's Hybrids", - "brands": ["Beck's"], - "crops": ["corn", "soybeans", "wheat"], + "brands": [ + "Beck's" + ], + "crops": [ + "corn", + "soybeans", + "wheat" + ], "verdict": "yellow", "expected_count": 860, "base_url": "https://www.beckshybrids.com", "api_base": "https://mc8v24rf.api.sanity.io", - "scope_filter": "All Beck's product records — corn + soy + wheat. Identity + RM/MG only.", + "scope_filter": "All Beck's product records \u2014 corn + soy + wheat. Identity + RM/MG only.", "tos_check_date": "2026-05-24", "schema_notes": "Sanity GROQ exposes identity (name, RM/MG, basic traits) but agronomic + disease ratings are SeedIQ-gated (requires browser cookie). Deferred until the SeedIQ XHR endpoint is captured from a logged-in browser session. Without ratings, products are reference-only; the MCP can confirm 'Beck's has hybrid X at RM 112 with Enlist trait' but not 'rate it against drought'." + }, + { + "name": "gh_plot_reports", + "vendor": "Syngenta", + "brand_aggregator": "Golden Harvest publishes", + "crops": [ + "corn", + "soybeans", + "silage" + ], + "verdict": "green", + "expected_count": 4618, + "base_url": "https://www.goldenharvestseeds.com", + "scope_filter": "sitemap-listed plot reports 2024 and 2025 (4,618 reports). 
2023 (3,619 reports) deferred to a future pass \u2014 most recent data is most relevant for current decisions.", + "tos_check_date": "2026-05-25", + "schema_notes": "Cross-vendor head-to-head yield trials at specific state/year/site. Each report lists products from multiple brands (NK, DEKALB, GH, etc.) with rank, yield, %MST, test weight, gross revenue. URL: //plot-report///. Same site/auth as golden_harvest variety scraper.", + "data_type": "trial" + }, + { + "name": "agripro_trials", + "vendor": "Syngenta", + "brand_aggregator": "AgriPro publishes", + "crops": [ + "wheat" + ], + "verdict": "green", + "expected_count": 38, + "base_url": "https://agriprowheat.com", + "scope_filter": "PDF trial summaries linked from /trials-data. Regional wheat performance (PNW, Western Plains, NE Colorado, etc.).", + "tos_check_date": "2026-05-25", + "schema_notes": "PDF tables of varieties tested per region per year. pdfplumber for table extraction.", + "data_type": "trial" } ], "_excluded_sources": [