a54fac240f
Image rebuild (skip scrape) / build (push) Successful in 5m54s
Co-authored-by: claude <claude@jpaul.io> Co-committed-by: claude <claude@jpaul.io>
684 lines
27 KiB
Python
684 lines
27 KiB
Python
"""Chunker for seed-variety corpus.

Each variety becomes ONE chunk by default. Variety pages are small
(typically 2-3 KB of useful signal) and nomic-embed-text handles up
to ~8 K tokens cleanly. Splitting a variety across chunks dilutes
the named-rating embeddings (e.g. "SCN resistance 7") that farmers
search by — keep them together. Chunk text is nonetheless capped at
``MAX_EMBED_CHARS`` characters before embedding (see
``_truncate_for_embed``).

The chunk text is a synthetic preamble assembled deterministically
from the sidecar JSON. Every value in the chunk text comes verbatim
from the source. The framing words ("Disease ratings (1-9, 9=best):",
"Maturity group:", etc.) are template glue — *we add structure, we
do NOT add facts*. Given the same sidecar, this chunker always
produces the same chunk text. That's the anti-hallucination
contract: the retriever can never surface a rating value that
wasn't in the source.

Metadata is flattened to Chroma-safe primitives (str/int/float/bool):

    source           "bayer_seeds"
    source_key       "dekalb-dkc075-70rib"
    vendor           "Bayer"
    brand            "DEKALB"
    crop             "corn" | "soybeans" | "wheat"
    product_name     "DKC075-70RIB BRAND BLEND"
    product_id       canonical full id
    source_url       the variety's page URL
    rm               corn RM as int when parseable (else absent)
    mg               soy MG as float when parseable (else absent)
    release_year     int when known
    trait_codes_csv  comma-separated trait codes for substring search
    rating_scale     "1-9 (9 = best)" — chunker should ALWAYS attach
                     this so downstream code can detect a flip
    ordinal          chunk index within variety (0-based)

Lists like ``regional_recommendations`` and the full per-rating dicts
do NOT fit Chroma's metadata constraints — they stay in the sidecar
JSON, surfaced by ``get_page`` / ``lookup_variety``.
"""
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
from pathlib import Path
|
||
from typing import Iterator
|
||
|
||
|
||
# Rating-group buckets.  The source publishes characteristics under a
# group label; each label below is routed to one of three sections of
# the chunk preamble so retrieval sees coherent text.  A label found in
# none of these sets is NOT dropped -- it is rendered under its own
# heading (the "other" case in ``_render_variety_chunk``).
DISEASE_GROUP_LABELS = {
    "DISEASE",                      # Channel + Deltapine (Bayer)
    "DISEASE RATINGS",              # Bayer DEKALB / Asgrow / WestBred
    "PEST AND DISEASE RESISTANCE",  # WestBred wheat
}
AGRONOMIC_GROUP_LABELS = {
    "AGRONOMIC CHARACTERISTICS",    # Channel + Deltapine
    "GROWTH",
    "HARVEST",
    "KEY CHARACTERISTICS",
    "MATURITY",                     # Channel — RM / GDU
    "PRODUCTION",
    "QUALITY",
}
MANAGEMENT_GROUP_LABELS = {
    "ADAPTATION",                   # Channel — regional placement
    "HERBICIDE",
    "HERBICIDES",                   # Channel (plural)
    "MANAGEMENT",
    "OTHER",                        # Channel — misc trait/management
    "PLANT DESCRIPTION",
    "SENSITIVITY",
}
|
||
|
||
|
||
def _parse_rm(value: object) -> int | None:
|
||
"""Best-effort RM-days int. Returns None if not a clean integer
|
||
(e.g. wheat's qualitative 'Early'/'Medium-Early' values)."""
|
||
if value is None:
|
||
return None
|
||
s = str(value).strip()
|
||
if not s:
|
||
return None
|
||
try:
|
||
# Handle floats stored as strings ("105.0") and the trailing
|
||
# tenths sometimes seen on early corn ("75").
|
||
return int(float(s))
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def _parse_mg(value: object) -> float | None:
|
||
"""Best-effort MG float. Soy MGs go from 00 to 9.0 with one decimal."""
|
||
if value is None:
|
||
return None
|
||
s = str(value).strip()
|
||
if not s:
|
||
return None
|
||
try:
|
||
return float(s)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def _format_items(items: list[dict]) -> str:
|
||
"""Render `[{characteristic, value}, ...]` to a compact inline list."""
|
||
out: list[str] = []
|
||
for it in items:
|
||
ch = (it.get("characteristic") or "").strip()
|
||
v = (it.get("value") or "").strip()
|
||
if ch and v:
|
||
out.append(f"{ch} {v}")
|
||
elif ch:
|
||
out.append(f"{ch} —")
|
||
return ", ".join(out)
|
||
|
||
|
||
def _render_variety_chunk(sidecar: dict) -> str:
|
||
"""Build the dense preamble for one variety from its sidecar JSON.
|
||
|
||
Faithful to source: every numeric/categorical *value* is verbatim
|
||
from ``sidecar``. The only generated text is the framing language.
|
||
"""
|
||
lines: list[str] = []
|
||
|
||
# ---- Identity line --------------------------------------------------
|
||
name = sidecar.get("product_name") or sidecar.get("source_key") or ""
|
||
brand = (sidecar.get("brand") or "").strip()
|
||
vendor = sidecar.get("vendor") or ""
|
||
crop = (sidecar.get("crop") or "").strip()
|
||
crop_label = crop.capitalize() if crop else ""
|
||
ident = f"# {name}"
|
||
sub = " ".join(filter(None, [
|
||
f"({brand.title()} {crop_label} variety, {vendor})" if brand and crop_label and vendor else "",
|
||
]))
|
||
lines.append(ident)
|
||
if sub:
|
||
lines.append("")
|
||
lines.append(sub)
|
||
|
||
# ---- Identity body --------------------------------------------------
|
||
facts: list[str] = []
|
||
|
||
rm = sidecar.get("relative_maturity")
|
||
mg = sidecar.get("maturity_group")
|
||
wc = sidecar.get("wheat_class")
|
||
if crop == "corn" and rm:
|
||
facts.append(f"Relative maturity {rm}")
|
||
elif crop == "soybeans" and mg:
|
||
facts.append(f"Maturity group {mg}")
|
||
elif crop == "wheat":
|
||
if rm:
|
||
facts.append(f"Maturity {rm}")
|
||
if wc:
|
||
facts.append(f"Wheat class {wc}")
|
||
|
||
traits = sidecar.get("trait_stack") or []
|
||
trait_descs = sidecar.get("trait_descriptions") or []
|
||
if traits:
|
||
if trait_descs:
|
||
facts.append(
|
||
"Trait stack: "
|
||
+ ", ".join(traits)
|
||
+ " ("
|
||
+ "; ".join(trait_descs)
|
||
+ ")"
|
||
)
|
||
else:
|
||
facts.append("Trait stack: " + ", ".join(traits))
|
||
|
||
if sidecar.get("release_year"):
|
||
facts.append(f"Released {sidecar['release_year']}")
|
||
|
||
if facts:
|
||
lines.append("")
|
||
lines.append(". ".join(facts) + ".")
|
||
|
||
# ---- Positioning ----------------------------------------------------
|
||
pos = (sidecar.get("positioning_statement") or "").strip()
|
||
if pos:
|
||
lines.append("")
|
||
lines.append(f"Positioning: {pos}")
|
||
|
||
# ---- Ratings, bucketed for retrieval --------------------------------
|
||
scale = sidecar.get("_scale_direction") or "(scale direction not declared)"
|
||
groups = sidecar.get("characteristics_groups") or []
|
||
disease: list[dict] = []
|
||
agronomic: list[dict] = []
|
||
management: list[dict] = []
|
||
other: list[tuple[str, list[dict]]] = []
|
||
for g in groups:
|
||
label = (g.get("label") or "").upper().strip()
|
||
items = g.get("items") or []
|
||
if not items:
|
||
continue
|
||
if label in DISEASE_GROUP_LABELS:
|
||
disease.extend(items)
|
||
elif label in AGRONOMIC_GROUP_LABELS:
|
||
agronomic.extend(items)
|
||
elif label in MANAGEMENT_GROUP_LABELS:
|
||
management.extend(items)
|
||
else:
|
||
other.append((g.get("label") or "Other", items))
|
||
|
||
if disease:
|
||
lines.append("")
|
||
lines.append(f"Disease ratings ({scale}): {_format_items(disease)}.")
|
||
if agronomic:
|
||
lines.append("")
|
||
lines.append(f"Agronomic ratings ({scale}): {_format_items(agronomic)}.")
|
||
if management:
|
||
lines.append("")
|
||
lines.append(f"Management notes: {_format_items(management)}.")
|
||
for label, items in other:
|
||
lines.append("")
|
||
lines.append(f"{label.title()}: {_format_items(items)}.")
|
||
|
||
# ---- Strengths narrative --------------------------------------------
|
||
strengths = sidecar.get("strengths") or []
|
||
if strengths:
|
||
lines.append("")
|
||
lines.append("Strengths and management notes:")
|
||
for s in strengths:
|
||
s = (s or "").strip()
|
||
if s:
|
||
lines.append(f"- {s}")
|
||
|
||
# ---- Regional listings (compact, not the agronomist emails) ---------
|
||
rec = sidecar.get("regional_recommendations") or []
|
||
if rec:
|
||
names = sorted({
|
||
(r.get("product_list_name") or "").strip()
|
||
for r in rec
|
||
if (r.get("product_list_name") or "").strip()
|
||
})
|
||
if names:
|
||
lines.append("")
|
||
lines.append("Listed in regional seed guides: " + "; ".join(names) + ".")
|
||
|
||
# ---- Provenance footer (must always be in the chunk text so it
|
||
# can never be lost between retrieval and LLM rendering) --------
|
||
urls = sidecar.get("source_urls") or []
|
||
if urls:
|
||
lines.append("")
|
||
lines.append(f"Source: {urls[0]}")
|
||
|
||
return "\n".join(lines).strip() + "\n"
|
||
|
||
|
||
def _flat_metadata(sidecar: dict) -> dict:
    """Reduce a variety sidecar to flat, Chroma-safe metadata.

    Always-present keys: ``source``, ``source_key``, ``data_type``
    (fixed to ``"variety"``), ``vendor``, ``brand``, ``crop``,
    ``product_name``, ``product_id``, ``source_url`` (first entry of
    ``source_urls``) and ``rating_scale``.  Missing values become ``""``.

    Optional keys, added only when available: ``rm`` and ``mg`` (when
    ``_parse_rm`` / ``_parse_mg`` return a value), ``release_year`` (only
    when stored as an int), ``trait_codes_csv`` and ``wheat_class``.
    """

    def text(key: str) -> str:
        return sidecar.get(key) or ""

    first_url = (sidecar.get("source_urls") or [""])[0]
    md: dict = {
        "source": text("source"),
        "source_key": text("source_key"),
        "data_type": "variety",
        "vendor": text("vendor"),
        # Brand is upper-cased so cross-vendor filters are case-stable
        # (vendors publish "DEKALB" vs "Golden Harvest"); the sidecar
        # JSON keeps the original casing for display.
        "brand": text("brand").upper(),
        "crop": text("crop").lower(),
        "product_name": text("product_name"),
        "product_id": text("product_id"),
        "source_url": first_url,
        "rating_scale": text("_scale_direction"),
    }

    parsed_rm = _parse_rm(sidecar.get("relative_maturity"))
    parsed_mg = _parse_mg(sidecar.get("maturity_group"))
    if parsed_rm is not None:
        md["rm"] = parsed_rm
    if parsed_mg is not None:
        md["mg"] = parsed_mg

    released = sidecar.get("release_year")
    if isinstance(released, int):
        md["release_year"] = released

    codes = sidecar.get("trait_stack") or []
    if codes:
        # Leading and trailing commas let a substring match such as
        # ",XF," hit whole trait codes only.
        md["trait_codes_csv"] = "," + ",".join(codes) + ","

    if sidecar.get("wheat_class"):
        md["wheat_class"] = sidecar["wheat_class"]
    return md
|
||
|
||
|
||
def chunks_from_variety(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield the chunk dict(s) for one variety; currently exactly one.

    Args:
        sidecar_path: path to the variety's JSON sidecar (read as UTF-8).
        md_path: unused -- the chunk is rebuilt from the sidecar alone.
            Kept in the signature in case a future split-chunker wants
            the rendered body.

    Yields:
        ``{"id", "text", "metadata"}`` where ``id`` is
        ``"<source>::<source_key>::0"`` and ``metadata`` carries
        ``ordinal=0`` (plus ``embed_truncated=True`` when the text had
        to be shortened for the embedder).
    """
    raw_json = Path(sidecar_path).read_text(encoding="utf-8")
    sidecar = json.loads(raw_json)

    # Defensive cap shared with trial chunks: variety text is usually
    # short, but long characteristics_groups can push it over the limit.
    text, was_truncated = _truncate_for_embed(_render_variety_chunk(sidecar))

    metadata = _flat_metadata(sidecar)
    if was_truncated:
        metadata["embed_truncated"] = True

    yield {
        "id": f"{metadata['source']}::{metadata['source_key']}::0",
        "text": text,
        "metadata": {**metadata, "ordinal": 0},
    }
|
||
|
||
|
||
# ===========================================================================
# Trial chunker — for sidecars with data_type="trial"
# ===========================================================================
#
# Trial documents are a different shape from variety identity:
#   - GH plot reports: per-site head-to-head yield comparison across brands
#   - AgriPro trial PDFs: regional multi-year multi-location summary
#
# Both produce ONE chunk per document with a preamble that emphasizes
# the trial's location/year/top performers so the embedder gets clean
# signal for queries like "best corn for sandy soil Iowa 2024".
|
||
|
||
|
||
def _render_gh_plot_chunk(sidecar: dict, *, include_region: bool = False) -> str:
|
||
"""Render a cross-vendor plot report (per-site head-to-head).
|
||
|
||
Originally GH-specific; now also handles ``lg_plot_reports`` and
|
||
``agrigold_plot_reports`` since they emit the same sidecar shape.
|
||
The preamble's "Source:" line uses the actual brand from the
|
||
sidecar so the LLM sees who PUBLISHED the trial (Bayer-side
|
||
queries should still find DEKALB results inside a GH or AgriGold
|
||
plot — search filters target ``brand_in_results``, not the
|
||
publisher's brand).
|
||
|
||
``include_region`` (university-trial sources) folds the
|
||
region/district into the title + facts so it's in the embedded
|
||
text — these sources publish many same-state/year tables that are
|
||
only distinguished by region (e.g. Iowa "District South"), and
|
||
without this the region lived only in metadata/the .md body.
|
||
"""
|
||
lines: list[str] = []
|
||
crop = (sidecar.get("crop") or "").lower()
|
||
crop_label = {
|
||
"corn": "Corn", "soybeans": "Soybean", "silage": "Silage",
|
||
"sorghum": "Sorghum",
|
||
}.get(crop, crop.title())
|
||
state = sidecar.get("state") or sidecar.get("state_abbrev") or ""
|
||
year = sidecar.get("year") or ""
|
||
cooperator = sidecar.get("cooperator") or ""
|
||
region = (sidecar.get("region") or "").strip() if include_region else ""
|
||
|
||
title = f"# {crop_label} yield trial — {state}, {year}"
|
||
if region:
|
||
title += f" ({region})"
|
||
lines.append(title)
|
||
lines.append("")
|
||
# Publisher label — emphasizes the source brand for retrieval.
|
||
publisher_brand = sidecar.get("brand") or "Golden Harvest"
|
||
facts = [f"{publisher_brand} {'variety trial (cross-vendor, independent third-party)' if include_region else 'plot report (cross-vendor)'}"]
|
||
if region:
|
||
facts.append(f"region {region}")
|
||
if cooperator:
|
||
facts.append(f"cooperator {cooperator}")
|
||
if sidecar.get("planted_date"):
|
||
facts.append(f"planted {sidecar['planted_date']}")
|
||
if sidecar.get("harvested_date"):
|
||
facts.append(f"harvested {sidecar['harvested_date']}")
|
||
if sidecar.get("population_seeds_per_acre"):
|
||
facts.append(f"population {sidecar['population_seeds_per_acre']:,} seeds/acre")
|
||
if sidecar.get("row_width_in"):
|
||
facts.append(f"{sidecar['row_width_in']}\" rows")
|
||
lines.append(". ".join(facts) + ".")
|
||
lines.append("")
|
||
|
||
results = sidecar.get("results") or []
|
||
if results:
|
||
# Pick the primary metric for ranking: corn/soy use "Yield",
|
||
# silage uses "Ton/Acre". Find the first metric key with a
|
||
# numeric value in the top result.
|
||
def _primary(r: dict) -> tuple[str, float | None]:
|
||
metrics = r.get("metrics") or {}
|
||
# Back-compat: old sidecars had yield_bu_ac directly.
|
||
if not metrics and r.get("yield_bu_ac") is not None:
|
||
return ("Yield", r["yield_bu_ac"])
|
||
for k in ("Yield", "Ton/Acre", "Tons/Acre"):
|
||
v = metrics.get(k)
|
||
if isinstance(v, (int, float)):
|
||
return (k, v)
|
||
for k, v in metrics.items():
|
||
if isinstance(v, (int, float)):
|
||
return (k, v)
|
||
return ("", None)
|
||
|
||
top = results[: min(5, len(results))]
|
||
primary_label, _ = _primary(top[0]) if top else ("", None)
|
||
rendered_top_parts: list[str] = []
|
||
for i, r in enumerate(top):
|
||
label, val = _primary(r)
|
||
piece = f"#{r.get('rank') or i+1} {r.get('brand','?')} {r.get('product','?')}"
|
||
if r.get('traits'):
|
||
piece += f" {r['traits']}"
|
||
if val is not None:
|
||
piece += f" — {val} {label}"
|
||
rendered_top_parts.append(piece)
|
||
if rendered_top_parts:
|
||
lines.append(
|
||
f"Top {len(top)} ({crop_label}, {state} {year}): "
|
||
+ ", ".join(rendered_top_parts) + "."
|
||
)
|
||
lines.append("")
|
||
|
||
# Discover the metric column order from the first result with metrics.
|
||
metric_keys: list[str] = []
|
||
for r in results:
|
||
metrics = r.get("metrics") or {}
|
||
if metrics:
|
||
metric_keys = list(metrics.keys())
|
||
break
|
||
# Back-compat: synthesize from legacy fields if no metrics dict.
|
||
if not metric_keys and any(
|
||
r.get("yield_bu_ac") is not None for r in results
|
||
):
|
||
metric_keys = ["Yield", "%MST", "Test Weight", "Gross Revenue"]
|
||
|
||
# Full ranking — preserves every datapoint verbatim.
|
||
col_headers = ["rank", "brand", "product", "traits"] + metric_keys
|
||
lines.append("Full ranking (" + " | ".join(col_headers) + "):")
|
||
for r in results:
|
||
row = [
|
||
f"#{r.get('rank') or '-'}",
|
||
r.get("brand") or "-",
|
||
r.get("product") or "-",
|
||
r.get("traits") or "-",
|
||
]
|
||
metrics = r.get("metrics") or {}
|
||
# Back-compat shim
|
||
if not metrics:
|
||
metrics = {
|
||
"Yield": r.get("yield_bu_ac"),
|
||
"%MST": r.get("mst_pct"),
|
||
"Test Weight": r.get("test_weight"),
|
||
"Gross Revenue": r.get("gross_revenue_dol_ac"),
|
||
}
|
||
for k in metric_keys:
|
||
v = metrics.get(k)
|
||
if v is None:
|
||
row.append("-")
|
||
elif isinstance(v, (int, float)):
|
||
if "Revenue" in k or "$" in k:
|
||
row.append(f"${v:.2f}")
|
||
else:
|
||
row.append(str(v))
|
||
else:
|
||
row.append(str(v))
|
||
lines.append(" " + " | ".join(row))
|
||
lines.append("")
|
||
|
||
urls = sidecar.get("source_urls") or []
|
||
if urls:
|
||
lines.append(f"Source: {urls[0]}")
|
||
return "\n".join(lines).strip() + "\n"
|
||
|
||
|
||
def _render_agripro_trial_chunk(sidecar: dict) -> str:
|
||
"""Render an AgriPro regional trial PDF — preamble + verbatim text."""
|
||
lines: list[str] = []
|
||
title = sidecar.get("title") or sidecar.get("filename") or sidecar.get("source_key", "")
|
||
lines.append(f"# {title}")
|
||
lines.append("")
|
||
|
||
facts = ["AgriPro / Syngenta regional wheat trial"]
|
||
if sidecar.get("region"):
|
||
facts.append(f"region {sidecar['region']}")
|
||
if sidecar.get("wheat_class_section"):
|
||
facts.append(f"class {sidecar['wheat_class_section']}")
|
||
if sidecar.get("years_covered") and len(sidecar["years_covered"]) > 1:
|
||
yc = sidecar["years_covered"]
|
||
facts.append(f"years {yc[0]}–{yc[-1]}")
|
||
elif sidecar.get("year"):
|
||
facts.append(f"year {sidecar['year']}")
|
||
lines.append(". ".join(facts) + ".")
|
||
lines.append("")
|
||
|
||
varieties = sidecar.get("varieties_found") or []
|
||
if varieties:
|
||
lines.append("Varieties listed: " + ", ".join(varieties) + ".")
|
||
lines.append("")
|
||
|
||
# Verbatim trial data — preserves variety + yield numbers adjacent
|
||
# so BM25/dense can match "AP Iliad Aberdeen Idaho" queries.
|
||
lines.append("Trial data (verbatim from PDF):")
|
||
lines.append("")
|
||
# The actual text was in the .md body but isn't in the sidecar
|
||
# JSON. We render a brief marker; full text goes in the .md file
|
||
# that get_page returns. For embedding signal, the title +
|
||
# varieties + region is usually enough.
|
||
# If we want the FULL text in the chunk we'd need to either store
|
||
# it in the sidecar OR read it from the .md path at chunk time.
|
||
# Read from the .md path:
|
||
return "\n".join(lines).strip() + "\n"
|
||
|
||
|
||
def _render_trial_chunk(sidecar: dict, md_text: str | None = None) -> str:
|
||
"""Dispatch to the right trial renderer by source. Includes the
|
||
verbatim trial body for sources whose value lives in the body text
|
||
(currently agripro_trials)."""
|
||
source = sidecar.get("source")
|
||
# Cross-vendor plot-report sources all share the gh_plot_reports
|
||
# sidecar shape (results: [{rank,brand,product,traits,metrics}]),
|
||
# so they route through the same renderer. The renderer reads
|
||
# ``brand`` from the sidecar so the publisher label is correct
|
||
# for each (Golden Harvest / LG Seeds / AgriGold).
|
||
if source in ("gh_plot_reports", "lg_plot_reports", "agrigold_plot_reports"):
|
||
return _render_gh_plot_chunk(sidecar)
|
||
if source in ("illinois_vt_trials", "iowa_icpt_trials", "ohio_ocpt_trials"):
|
||
# University-extension variety trials — same results[] shape, but
|
||
# fold region/district into the embedded text (many same-state/year
|
||
# tables) + label as an independent third-party variety trial.
|
||
return _render_gh_plot_chunk(sidecar, include_region=True)
|
||
if source == "proharvest_plots":
|
||
# Structured rows → shared cross-vendor renderer (publisher brand
|
||
# read from the sidecar). Foreign-format third-party PDFs that
|
||
# couldn't be parsed into rows carry raw_text=True and the verbatim
|
||
# table text in the .md body — embed that so they're still found.
|
||
if sidecar.get("results"):
|
||
return _render_gh_plot_chunk(sidecar)
|
||
crop = (sidecar.get("crop") or "").lower()
|
||
crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(crop, crop.title())
|
||
coop = sidecar.get("cooperator") or ""
|
||
state = sidecar.get("state") or ""
|
||
year = sidecar.get("year") or ""
|
||
head = [
|
||
f"# {crop_label} yield trial — {coop} ({state}, {year})", "",
|
||
"ProHarvest Seeds plot report (cross-vendor, verbatim from PDF).", "",
|
||
]
|
||
body = md_text or ""
|
||
sep = "## Trial data (verbatim from PDF)"
|
||
if sep in body:
|
||
body = body.split(sep, 1)[1].strip()
|
||
body = re.sub(r"```", "", body).strip()
|
||
return "\n".join(head) + "\n" + body + "\n"
|
||
if source == "agripro_trials":
|
||
header = _render_agripro_trial_chunk(sidecar)
|
||
if md_text:
|
||
# Strip the markdown frontmatter so the body text is the
|
||
# actual trial data, not the per-source preamble.
|
||
body = md_text
|
||
sep = "## Trial data (verbatim from PDF)"
|
||
if sep in body:
|
||
body = body.split(sep, 1)[1].strip()
|
||
# Strip fence markers
|
||
body = re.sub(r"```", "", body).strip()
|
||
return header + "\n" + body + "\n"
|
||
return header
|
||
# Fallback: generic trial render
|
||
return _render_gh_plot_chunk(sidecar)
|
||
|
||
|
||
def _flat_trial_metadata(sidecar: dict) -> dict:
|
||
"""Chroma-safe metadata for trial chunks. Mirrors variety metadata
|
||
plus trial-specific facets (state, year, data_type)."""
|
||
md: dict = {
|
||
"source": sidecar.get("source") or "",
|
||
"source_key": sidecar.get("source_key") or "",
|
||
"data_type": sidecar.get("data_type") or "trial",
|
||
"vendor": sidecar.get("vendor") or "",
|
||
"brand": (sidecar.get("brand") or "").upper(),
|
||
"crop": (sidecar.get("crop") or "").lower(),
|
||
"source_url": (sidecar.get("source_urls") or [""])[0],
|
||
}
|
||
year = sidecar.get("year")
|
||
if isinstance(year, int):
|
||
md["year"] = year
|
||
state = sidecar.get("state_abbrev") or sidecar.get("state")
|
||
if state:
|
||
md["state"] = state.upper() if len(state) <= 3 else state
|
||
md["state_abbrev"] = (sidecar.get("state_abbrev") or "").upper()
|
||
if sidecar.get("region"):
|
||
md["region"] = sidecar["region"]
|
||
if sidecar.get("wheat_class_section"):
|
||
md["wheat_class"] = sidecar["wheat_class_section"]
|
||
if sidecar.get("plot_id"):
|
||
md["plot_id"] = sidecar["plot_id"]
|
||
if isinstance(sidecar.get("n_results"), int):
|
||
md["n_results"] = sidecar["n_results"]
|
||
return md
|
||
|
||
|
||
# nomic-embed-text caps at 2,048 tokens (Ollama returns HTTP 400 on
|
||
# inputs that exceed this). chars/token ratio varies wildly:
|
||
# prose: ~3.5 chars/token
|
||
# numeric trial tables: ~2.4 chars/token (GH plot reports with
|
||
# full ranking tables)
|
||
# Empirically: GH plot reports failed at 5,261+ chars; agripro
|
||
# trials at 5,552 chars sometimes failed. Cap at 4,500 chars =
|
||
# ~2.2 chars/token worst-case for 2,048 tokens, leaving safe
|
||
# headroom across all source types. The FULL text stays in the
|
||
# on-disk .md so get_page returns it verbatim regardless.
|
||
MAX_EMBED_CHARS = 4500
|
||
|
||
|
||
def _truncate_for_embed(text: str) -> tuple[str, bool]:
|
||
"""Cap chunk text to fit nomic-embed-text's 2,048-token context.
|
||
|
||
Returns ``(maybe_truncated_text, was_truncated)``. The head is
|
||
preserved because high-signal content (variety identity, top
|
||
performers, ratings preamble) sits at the start of every chunk
|
||
type we produce.
|
||
"""
|
||
if len(text) <= MAX_EMBED_CHARS:
|
||
return text, False
|
||
suffix = "\n…(truncated for embedding; full text via get_page)\n"
|
||
body = text[: MAX_EMBED_CHARS - len(suffix)].rstrip()
|
||
return body + suffix, True
|
||
|
||
|
||
def chunks_from_trial(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield the chunk dict(s) for one trial document; exactly one.

    Args:
        sidecar_path: path to the trial's JSON sidecar (read as UTF-8).
        md_path: path to the trial's markdown body, used by renderers
            whose value lives in the verbatim text.  When omitted it is
            inferred by swapping the sidecar's suffix for ``.md``.  A
            body file that does not exist is simply skipped.

    Yields:
        ``{"id", "text", "metadata"}`` where ``id`` is
        ``"<source>::<source_key>::0"`` and ``metadata`` carries
        ``ordinal=0`` (plus ``embed_truncated=True`` when the text had
        to be shortened for the embedder).
    """
    sidecar_file = Path(sidecar_path)
    sidecar = json.loads(sidecar_file.read_text(encoding="utf-8"))

    body_file = Path(md_path) if md_path else sidecar_file.with_suffix(".md")
    md_text: str | None = (
        body_file.read_text(encoding="utf-8") if body_file.exists() else None
    )

    rendered = _render_trial_chunk(sidecar, md_text=md_text)
    text, was_truncated = _truncate_for_embed(rendered)

    metadata = _flat_trial_metadata(sidecar)
    if was_truncated:
        metadata["embed_truncated"] = True

    yield {
        "id": f"{metadata['source']}::{metadata['source_key']}::0",
        "text": text,
        "metadata": {**metadata, "ordinal": 0},
    }
|
||
|
||
|
||
# ----- Backwards-compat shim for the template's index.py -------------------
#
# The template's ``rag.index.page_records`` calls
# ``chunks_from_page(md, page_id, base_meta)``, a signature that knows
# nothing about sidecar JSON.  index.py has been updated to call
# ``chunks_from_variety`` directly; this shim keeps the old name
# importable so a stray caller does not break.
#
def chunks_from_page(text: str, page_id: str, metadata: dict) -> Iterator[dict]:
    """Deprecated for seed-mcp; prefer ``chunks_from_variety``.

    When ``metadata`` carries a truthy ``_sidecar_path`` the call is
    forwarded to ``chunks_from_variety`` and ``text`` / ``page_id`` are
    unused.  Otherwise a single chunk holding the raw ``text`` is yielded
    with id ``"<source or 'unknown'>::<page_id>::0"`` and the given
    metadata plus ``ordinal=0``.
    """
    sidecar_path = metadata.get("_sidecar_path")
    if not sidecar_path:
        # No sidecar to rebuild from: pass the markdown through as one
        # chunk rather than failing.
        source = metadata.get("source", "unknown")
        yield {
            "id": f"{source}::{page_id}::0",
            "text": text,
            "metadata": {**metadata, "ordinal": 0},
        }
        return
    yield from chunks_from_variety(sidecar_path)
|