Trial-data scrapers: gh_plot_reports + agripro_trials + search_trials tool

This PR introduces TRIAL data — yield-performance results from real
field trials — as a SEPARATE data type alongside variety identity.
The two are complementary:

  search_docs  → "What's the disease resistance of DKC62-08RIB?"
                  (variety identity — what it IS)
  search_trials → "Which corn hybrid won the IA 2024 trials?"
                  (performance data — how it PERFORMED)

scrape/sources/gh_plot_reports.py — Golden Harvest plot reports
- 4,618 expected (2024+2025; 2023 deferred to a backfill pass).
- URL: /<crop>/plot-report/<state>/<year>/<plot_id>
- Cross-vendor: each plot lists products from multiple brands
  (NK / DEKALB / Golden Harvest / Enogen / Pioneer / Channel) side
  by side at one cooperator's field — the kind of independent
  comparison data Bayer doesn't publish itself.
- Generic per-column metrics dict (Yield/MST/Test Weight/$/Ac for
  corn+soy, Ton/Acre + Milk + Beef columns for silage).
- Politeness: 1 req/sec, retries on 429/5xx, no redirect-follow.

scrape/sources/agripro_trials.py — AgriPro regional trial PDFs
- 14 unique PDFs (38 sitemap links deduped) at /trials-data
- pdfplumber text extraction, region/year detection from filename
- Verbatim PDF text preserved in chunk body so variety + yield
  number adjacency drives retrieval (AP Iliad's Aberdeen ID yield
  matches a query about "AP Iliad Idaho yield")

rag/chunk.py — chunks_from_trial() dispatching by source
- Plot reports: identity preamble + Top-5 by primary metric + full
  ranking table. Metric labels chosen from the data (corn/soy use
  "Yield", silage uses "Ton/Acre").
- AgriPro PDFs: identity preamble + verbatim trial body inline so
  per-location yields surface for region+variety queries.
- Variety chunks get data_type="variety" metadata; trial chunks get
  data_type="trial". Single Chroma collection; the tool router
  filters by data_type rather than maintaining two collections.

rag/index.py — dispatch by sidecar's data_type field
rag/bm25.py — new filter columns (data_type, year, state)

docs_mcp/server.py — sixth MCP tool: search_trials(query, crop?,
state?, year?, product?, k=10)
- Filters trial chunks via where={"data_type": "trial", ...}
- Optional product substring post-filter for "DKC62-08RIB Iowa 2024"
  style searches
- search_docs now defaults to data_type="variety" so trial chunks
  don't bleed into variety identity queries
- Tool docstring routes the agent: "use lookup_variety to verify
  identity details on any trial winner you surface"

NK trial endpoint (/NKSeeds/wsProxy.asmx/GetPlotResult) is documented
as deferred — the ASMX-SOAP shape returned empty XML on initial
probe. Bayer per-variety yield data is not publicly indexed at all
— documented in the trial-scope note (DEKALB/Asgrow trial data flows
through Channel reps, not the web). AgRevival research books exist
as 10 large annual PDFs but are deferred (low ROI per parse).

Initial corpus shipped in this PR: 14 AgriPro trial PDFs. The 4,618
Golden Harvest plot reports are still being scraped in the background
and will be added in a follow-up corpus-snapshot PR (~70 min ETA).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-25 15:19:03 -04:00
parent 7b3da908e0
commit c737871c4c
35 changed files with 3302 additions and 25 deletions
+242 -1
View File
@@ -201,9 +201,15 @@ def _build_where(
vendor: str | None,
source: str | None,
source_key: str | None,
*,
data_type: str | None = None,
state: str | None = None,
year: int | None = None,
) -> dict | None:
"""Translate filter args into a Chroma `where` clause."""
conds: list[dict] = []
if data_type:
conds.append({"data_type": data_type})
if crop:
conds.append({"crop": crop.lower()})
if brand:
@@ -214,6 +220,10 @@ def _build_where(
conds.append({"source": source})
if source_key:
conds.append({"source_key": source_key})
if state:
conds.append({"state": state.upper() if len(state) <= 3 else state})
if year:
conds.append({"year": int(year)})
if not conds:
return None
if len(conds) == 1:
@@ -460,7 +470,11 @@ def search_docs(
"query": query, "crop": crop, "brand": brand,
"vendor": vendor, "source": source, "k": k,
}) as _call:
where = _build_where(crop, brand, vendor, source, None)
# Variety-search default: filter to data_type=variety so trial
# documents (yield trials) don't pollute identity-focused
# results. To search trials, use search_trials().
where = _build_where(crop, brand, vendor, source, None,
data_type="variety")
pool_size = max(k * 3, RERANK_POOL)
# Exact-code pre-filter. Variety codes ("DKC62-08RIB", "AG29XF4")
@@ -745,6 +759,233 @@ def lookup_variety(
return "\n".join(out)
@mcp.tool()
def search_trials(
    query: Annotated[str, Field(description=(
        "Natural-language query about yield trials. Mention crop, "
        "region or state, year, soil/conditions, and any specific "
        "variety codes you want compared. Examples: "
        "'best corn hybrid 2024 Iowa heavy clay'; "
        "'AP Iliad yield Idaho stripe rust'; "
        "'DKC65-20 vs NK1748 head to head Alabama 2023'."
    ))],
    crop: Annotated[
        str | None,
        Field(description="OPTIONAL: corn, soybeans, silage, or wheat."),
    ] = None,
    state: Annotated[
        str | None,
        Field(description=(
            "OPTIONAL state filter. 2-letter abbrev (IA, IL, NE...) "
            "for Golden Harvest plot reports; full or partial region "
            "name (e.g. 'Pacific Northwest', 'Montana') for AgriPro "
            "trial PDFs."
        )),
    ] = None,
    year: Annotated[
        int | None,
        Field(description="OPTIONAL year filter (e.g. 2024).", ge=2010, le=2030),
    ] = None,
    product: Annotated[
        str | None,
        Field(description=(
            "OPTIONAL variety/hybrid filter — substring match against "
            "the product field. Example: 'DKC62' surfaces trials "
            "containing any DKC62-* hybrid."
        )),
    ] = None,
    k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10,
) -> str:
    """Search yield-trial data — head-to-head results from real field
    trials. SEPARATE from variety-identity search.

    Use this when the user wants to know HOW PRODUCTS PERFORMED, not
    what they ARE. Trial data complements `search_docs`:

      * `search_docs` answers: "What's the disease resistance profile
        of DKC62-08RIB?" (variety identity)
      * `search_trials` answers: "Which corn hybrid actually won the
        yield trials in central Iowa in 2024?" (performance data)

    Data sources:

      * **Golden Harvest plot reports** (4,000+ trials) — per-site
        head-to-head comparing products from MULTIPLE BRANDS at one
        cooperator's field. NK, DEKALB, Golden Harvest, sometimes
        others all compete at the same site. Cross-vendor data Bayer
        itself doesn't publish.
      * **AgriPro regional trial PDFs** (~14 PDFs) — multi-year
        multi-location wheat performance for Northern Plains / PNW /
        Plains regions.

    A typical workflow: call this to identify top performers in a
    region/year, then call `lookup_variety(source_key=...)` on the
    leaders to verify identity details (RM, traits, disease ratings).
    """
    # Maintainer notes (kept as comments so the docstring wording, which
    # routes the agent, stays untouched):
    #   * Returns markdown text: a header plus one block per hit, or a
    #     single italic "_(...)_" message when the collection is
    #     unavailable, the dense query fails, or nothing matches.
    #   * Chroma errors are caught and reported in the returned text and
    #     on the TimedCall record rather than raised.
    #   * ``product`` is enforced as a case-insensitive substring of the
    #     chunk text, applied BEFORE truncating to ``k``.
    with TimedCall("search_trials", {
        "query": query, "crop": crop, "state": state, "year": year,
        "product": product, "k": k,
    }) as _call:
        where = _build_where(
            crop, None, None, None, None,
            data_type="trial",
            state=state,
            year=year,
        )
        pool_size = max(k * 3, RERANK_POOL)
        try:
            col = _collection()
        except Exception as exc:  # noqa: BLE001
            _call.set(error_dense=str(exc), hits_returned=0)
            return (
                "_(retrieval unavailable — Chroma collection not found. "
                "Has the indexer run? `python -m rag.index --rebuild`.)_"
            )

        # Surrounding whitespace would make the substring check below
        # silently reject every chunk, so normalise it once up front.
        product_term = product.strip() if product else ""
        needle = product_term.lower()

        # If a product filter is set, augment the query with the
        # product code so BM25 + dense both have signal.
        full_query = f"{query} {product_term}" if product_term else query

        try:
            dense = col.query(
                query_texts=[full_query],
                n_results=pool_size,
                where=where,
            )
        except Exception as exc:  # noqa: BLE001
            _call.set(error_dense=str(exc), hits_returned=0)
            return f"_(trial retrieval failed: {exc})_"

        dense_ids: list[str] = (dense.get("ids") or [[]])[0]
        dense_docs: list[str] = (dense.get("documents") or [[]])[0]
        dense_metas: list[dict] = (dense.get("metadatas") or [[]])[0]
        dense_dists: list[float] = (dense.get("distances") or [[]])[0]
        id_to_doc = dict(zip(dense_ids, dense_docs))
        id_to_meta = dict(zip(dense_ids, dense_metas))
        id_to_dist = dict(zip(dense_ids, dense_dists))

        # Dense ranking is the default; it is replaced by the fused
        # ranking only when BM25 is enabled, available, and has hits.
        fuzzy_ids = dense_ids
        used_hybrid = False
        if HYBRID_SEARCH:
            bm25 = _bm25_index()
            if bm25 is not None:
                bm25_hits = bm25.query(full_query, n=pool_size, where=where)
                bm25_ids = [h[0] for h in bm25_hits]
                if bm25_ids:
                    fuzzy_ids = _rrf_fuse([dense_ids, bm25_ids])
                    used_hybrid = True

        # Rank-preserving de-duplication.
        ranked_ids = list(dict.fromkeys(fuzzy_ids))

        # With a product filter every pooled candidate must be inspected
        # before truncation; otherwise non-matching BM25-only hits would
        # occupy top-k slots and then be discarded, under-filling the
        # result. Without a filter only the top-k bodies are needed.
        candidates = ranked_ids if needle else ranked_ids[:k]

        # BM25-only hits have no text/metadata from the dense query yet.
        missing = [cid for cid in candidates if cid not in id_to_doc]
        if missing:
            try:
                extra = col.get(ids=missing, include=["documents", "metadatas"])
                for cid, doc, meta in zip(
                    extra.get("ids") or [],
                    extra.get("documents") or [],
                    extra.get("metadatas") or [],
                ):
                    id_to_doc[cid] = doc
                    id_to_meta[cid] = meta
            except Exception as exc:  # noqa: BLE001
                # Best effort: unfetched hits keep an empty body (and are
                # dropped by the product filter, which cannot verify them).
                log.warning("get-by-id for BM25-only hits failed: %s", exc)

        # Product-substring filter: require the chunk text to actually
        # contain the token. This re-checks the bytes since BM25 only
        # sees stems. ``or ""`` guards against a stored None document.
        if needle:
            candidates = [
                cid for cid in candidates
                if needle in (id_to_doc.get(cid) or "").lower()
            ]
        final_ids: list[str] = candidates[:k]

        _call.set(
            hits_returned=len(final_ids),
            hybrid=used_hybrid,
            pool_size=pool_size,
            data_type="trial",
        )
        if not final_ids:
            return (
                "_(no trials matched. Try widening — drop the state, "
                "year, or product filter. `list_versions()` shows "
                "which trial sources are indexed.)_"
            )

        blocks: list[str] = []
        for cid in final_ids:
            doc = id_to_doc.get(cid) or ""
            meta = id_to_meta.get(cid) or {}
            # Dense distances are not comparable to a fused rank, so they
            # are only shown for dense-only results.
            dist = id_to_dist.get(cid) if not used_hybrid else None
            blocks.append(_format_trial_hit(doc, meta, dist))

        header = (
            f"# Trial search results — {len(final_ids)} trial document"
            f"{'s' if len(final_ids) != 1 else ''}"
            f"{' (dense + BM25 hybrid)' if used_hybrid else ' (dense only)'}\n"
            f"_Use `get_page(source=..., source_key=...)` to read the "
            f"full trial body. Use `lookup_variety(source_key=...)` on "
            f"any product code to verify its identity (RM, traits, "
            f"disease ratings)._\n\n---\n\n"
        )
        return header + "\n---\n\n".join(blocks)
def _format_trial_hit(doc: str, meta: dict, distance: float | None = None) -> str:
"""Trial-specific result header. Highlights crop/state/year and
sources URL (vs variety hits which emphasize brand + product
identity)."""
src_url = meta.get("source_url") or ""
src_key = meta.get("source_key") or ""
src = meta.get("source") or ""
crop = meta.get("crop") or ""
state = meta.get("state") or ""
year = meta.get("year") or ""
region = meta.get("region") or ""
title_bits = [b for b in [crop.title(), region or state, str(year) if year else ""] if b]
title = " · ".join(title_bits) if title_bits else src_key
header = (
f"### Trial: {title} \n"
f"`{src}::{src_key}` — {meta.get('vendor', '')} / {meta.get('brand', '')} \n"
f"<{src_url}>"
)
if distance is not None:
header += f" \n_(distance={distance:.4f})_"
return f"{header}\n\n{doc.strip()}\n"
@mcp.tool()
def crop_seed_api_lessons(
topic: Annotated[