"""Chunker for seed-variety corpus. Each variety becomes ONE chunk by default. Variety pages are small (typically 2-3 KB of useful signal) and nomic-embed-text handles up to ~8 K tokens cleanly. Splitting a variety across chunks dilutes the named-rating embeddings (e.g. "SCN resistance 7") that farmers search by — keep them together. The chunk text is a synthetic preamble assembled deterministically from the sidecar JSON. Every value in the chunk text comes verbatim from the source. The framing words ("Disease ratings (1-9, 9=best):", "Maturity group:", etc.) are template glue — *we add structure, we do NOT add facts*. Given the same sidecar, this chunker always produces the same chunk text. That's the anti-hallucination contract: the retriever can never surface a rating value that wasn't in the source. Metadata is flattened to Chroma-safe primitives (str/int/float/bool): source "bayer_seeds" source_key "dekalb-dkc075-70rib" vendor "Bayer" brand "DEKALB" crop "corn" | "soybeans" | "wheat" product_name "DKC075-70RIB BRAND BLEND" product_id canonical full id source_url the variety's page URL rm corn RM as int when parseable (else absent) mg soy MG as float when parseable (else absent) release_year int when known trait_codes_csv comma-separated trait codes for substring search rating_scale "1-9 (9 = best)" — chunker should ALWAYS attach this so downstream code can detect a flip ordinal chunk index within variety (0-based) Lists like ``regional_recommendations`` and the full per-rating dicts do NOT fit Chroma's metadata constraints — they stay in the sidecar JSON, surfaced by ``get_page`` / ``lookup_variety``. """ from __future__ import annotations import json import re from pathlib import Path from typing import Iterator # Rating-group classification. The source publishes characteristics # grouped by label; we map those labels to one of three buckets in # the chunk preamble so retrieval gets coherent text. 
# Group labels not listed here fall into "other" and are still emitted,
# just in their own section.
DISEASE_GROUP_LABELS = {
    "DISEASE RATINGS",
    "PEST AND DISEASE RESISTANCE",
}
AGRONOMIC_GROUP_LABELS = {
    "GROWTH",
    "HARVEST",
    "PRODUCTION",
    "KEY CHARACTERISTICS",
    "QUALITY",
}
MANAGEMENT_GROUP_LABELS = {
    "MANAGEMENT",
    "HERBICIDE",
    "SENSITIVITY",
    "PLANT DESCRIPTION",
}


def _parse_rm(value: object) -> int | None:
    """Best-effort relative-maturity (RM) days as an int.

    Returns None when ``value`` is None, blank, or not a finite number
    (e.g. wheat's qualitative 'Early'/'Medium-Early' values). Numeric
    strings with a fractional part ("105.0") are truncated toward zero.
    """
    if value is None:
        return None
    s = str(value).strip()
    if not s:
        return None
    try:
        # Go through float() so floats stored as strings ("105.0") parse.
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or "nan"; OverflowError: "inf".
        return None


def _parse_mg(value: object) -> float | None:
    """Best-effort soybean maturity-group (MG) float.

    Soy MGs go from 00 to 9.0 with one decimal. Returns None when
    ``value`` is None, blank, non-numeric, or not finite (NaN/inf are
    not meaningful metadata values).
    """
    import math  # local: only needed here; keeps the file's import block untouched

    if value is None:
        return None
    s = str(value).strip()
    if not s:
        return None
    try:
        mg = float(s)
    except ValueError:
        return None
    return mg if math.isfinite(mg) else None


def _as_text(value: object) -> str:
    """Return ``value`` as stripped text; None becomes the empty string.

    Unlike ``(value or "").strip()`` this keeps a numeric ``0`` and does
    not raise on non-string values such as integer ratings.
    """
    if value is None:
        return ""
    return str(value).strip()


def _format_items(items: list[dict]) -> str:
    """Render ``[{characteristic, value}, ...]`` to a compact inline list.

    Each item becomes ``"<characteristic> <value>"``; an item with a
    characteristic but no value becomes ``"<characteristic> —"``; items
    without a characteristic are skipped. Entries are joined with ", ".
    Values may be strings or numbers.
    """
    out: list[str] = []
    for it in items:
        ch = _as_text(it.get("characteristic"))
        v = _as_text(it.get("value"))
        if ch and v:
            out.append(f"{ch} {v}")
        elif ch:
            out.append(f"{ch} —")
    return ", ".join(out)


def _render_variety_chunk(sidecar: dict) -> str:
    """Build the dense preamble for one variety from its sidecar JSON.

    Faithful to source: every numeric/categorical *value* is taken from
    ``sidecar``. The only generated text is the framing language.

    Args:
        sidecar: parsed sidecar dict for one variety.

    Returns:
        The chunk text, ending in exactly one newline.
    """
    lines: list[str] = []

    # ---- Identity line --------------------------------------------------
    name = sidecar.get("product_name") or sidecar.get("source_key") or ""
    brand = (sidecar.get("brand") or "").strip()
    vendor = sidecar.get("vendor") or ""
    crop = (sidecar.get("crop") or "").strip()
    crop_label = crop.capitalize() if crop else ""
    # Compare case-insensitively, matching the lowercased ``crop`` that
    # _flat_metadata stores, so "Corn" and "corn" render the same facts.
    crop_key = crop.lower()

    lines.append(f"# {name}")
    if brand and crop_label and vendor:
        lines.append("")
        lines.append(f"({brand.title()} {crop_label} variety, {vendor})")

    # ---- Identity body --------------------------------------------------
    facts: list[str] = []
    rm = sidecar.get("relative_maturity")
    mg = sidecar.get("maturity_group")
    wc = sidecar.get("wheat_class")
    if crop_key == "corn" and rm:
        facts.append(f"Relative maturity {rm}")
    elif crop_key == "soybeans" and mg:
        facts.append(f"Maturity group {mg}")
    elif crop_key == "wheat":
        if rm:
            facts.append(f"Maturity {rm}")
        if wc:
            facts.append(f"Wheat class {wc}")

    traits = sidecar.get("trait_stack") or []
    trait_descs = sidecar.get("trait_descriptions") or []
    if traits:
        trait_fact = "Trait stack: " + ", ".join(traits)
        if trait_descs:
            trait_fact += " (" + "; ".join(trait_descs) + ")"
        facts.append(trait_fact)

    if sidecar.get("release_year"):
        facts.append(f"Released {sidecar['release_year']}")

    if facts:
        lines.append("")
        lines.append(". ".join(facts) + ".")

    # ---- Positioning ----------------------------------------------------
    pos = (sidecar.get("positioning_statement") or "").strip()
    if pos:
        lines.append("")
        lines.append(f"Positioning: {pos}")

    # ---- Ratings, bucketed for retrieval --------------------------------
    scale = sidecar.get("_scale_direction") or "(scale direction not declared)"
    disease: list[dict] = []
    agronomic: list[dict] = []
    management: list[dict] = []
    other: list[tuple[str, list[dict]]] = []
    for g in sidecar.get("characteristics_groups") or []:
        label = (g.get("label") or "").upper().strip()
        items = g.get("items") or []
        if not items:
            continue
        if label in DISEASE_GROUP_LABELS:
            disease.extend(items)
        elif label in AGRONOMIC_GROUP_LABELS:
            agronomic.extend(items)
        elif label in MANAGEMENT_GROUP_LABELS:
            management.extend(items)
        else:
            other.append((g.get("label") or "Other", items))

    sections: list[tuple[str, list[dict]]] = [
        (f"Disease ratings ({scale})", disease),
        (f"Agronomic ratings ({scale})", agronomic),
        ("Management notes", management),
    ]
    sections.extend((label.title(), items) for label, items in other)
    for heading, items in sections:
        rendered = _format_items(items)
        # Skip a section whose items all lack a characteristic; emitting
        # "Heading: ." would add framing with no source value behind it.
        if rendered:
            lines.append("")
            lines.append(f"{heading}: {rendered}.")

    # ---- Strengths narrative --------------------------------------------
    strengths = [
        text
        for text in (_as_text(s) for s in sidecar.get("strengths") or [])
        if text
    ]
    if strengths:
        lines.append("")
        lines.append("Strengths and management notes:")
        lines.extend(f"- {s}" for s in strengths)

    # ---- Regional listings (compact, not the agronomist emails) ---------
    rec = sidecar.get("regional_recommendations") or []
    names = sorted({
        (r.get("product_list_name") or "").strip()
        for r in rec
        if (r.get("product_list_name") or "").strip()
    })
    if names:
        lines.append("")
        lines.append("Listed in regional seed guides: " + "; ".join(names) + ".")

    # ---- Provenance footer (must always be in the chunk text so it
    #      can never be lost between retrieval and LLM rendering) --------
    urls = sidecar.get("source_urls") or []
    if urls:
        lines.append("")
        lines.append(f"Source: {urls[0]}")

    return "\n".join(lines).strip() + "\n"


def _flat_metadata(sidecar: dict) -> dict:
    """Distil sidecar into Chroma-safe metadata (primitives only).

    Always-present keys: source, source_key, data_type ("variety"),
    vendor, brand, crop, product_name, product_id, source_url,
    rating_scale. Optional keys (added only when available): rm, mg,
    release_year, trait_codes_csv, wheat_class.
    """
    # Normalize brand to uppercase so cross-vendor filter matching is
    # case-stable. Vendors are inconsistent (Bayer uses "DEKALB",
    # Syngenta uses "Golden Harvest") and the filter shouldn't have to
    # know which is which. The sidecar JSON keeps the original casing
    # for display; only Chroma metadata is normalized.
    md: dict = {
        "source": sidecar.get("source") or "",
        "source_key": sidecar.get("source_key") or "",
        "data_type": "variety",
        "vendor": sidecar.get("vendor") or "",
        "brand": (sidecar.get("brand") or "").upper(),
        "crop": (sidecar.get("crop") or "").lower(),
        "product_name": sidecar.get("product_name") or "",
        "product_id": sidecar.get("product_id") or "",
        "source_url": (sidecar.get("source_urls") or [""])[0],
        "rating_scale": sidecar.get("_scale_direction") or "",
    }

    rm = _parse_rm(sidecar.get("relative_maturity"))
    if rm is not None:
        md["rm"] = rm
    mg = _parse_mg(sidecar.get("maturity_group"))
    if mg is not None:
        md["mg"] = mg

    ry = sidecar.get("release_year")
    # bool is a subclass of int; a stray True/False is not a year.
    if isinstance(ry, int) and not isinstance(ry, bool):
        md["release_year"] = ry

    traits = sidecar.get("trait_stack") or []
    if traits:
        # Comma-delimited for partial-match / human eyeballing.
        # Comma-padded on both ends so `LIKE '%,XF,%'` finds whole tokens.
        md["trait_codes_csv"] = "," + ",".join(traits) + ","

    if sidecar.get("wheat_class"):
        md["wheat_class"] = sidecar["wheat_class"]
    return md


def chunks_from_variety(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield chunk dict(s) for one variety. Currently emits exactly one.

    Args:
        sidecar_path: path to the variety's JSON sidecar.
        md_path: ignored (the chunker rebuilds from sidecar); kept in
            the signature in case a future split-chunker wants the
            rendered body.

    Yields:
        A dict with keys ``id`` ("<source>::<source_key>::0"), ``text``
        and ``metadata`` (flat metadata plus ``ordinal`` 0, and
        ``embed_truncated`` True when the text was capped).
    """
    sidecar = json.loads(Path(sidecar_path).read_text(encoding="utf-8"))
    text = _render_variety_chunk(sidecar)
    # Same 2,048-token cap as trial chunks. Varieties are usually
    # under 3 KB so this rarely fires, but Bayer hybrids with long
    # characteristics_groups can run wide — defensive cap.
    text, truncated = _truncate_for_embed(text)
    meta = _flat_metadata(sidecar)
    if truncated:
        meta["embed_truncated"] = True
    chunk_id = f"{meta['source']}::{meta['source_key']}::0"
    yield {
        "id": chunk_id,
        "text": text,
        "metadata": {**meta, "ordinal": 0},
    }


# ===========================================================================
# Trial chunker — for sidecars with data_type="trial"
# ===========================================================================
#
# Trial documents are a different shape from variety identity:
#   - GH plot reports: per-site head-to-head yield comparison across brands
#   - AgriPro trial PDFs: regional multi-year multi-location summary
#
# Both produce ONE chunk per document with a preamble that emphasizes
# the trial's location/year/top performers so the embedder gets clean
# signal for queries like "best corn for sandy soil Iowa 2024".


def _primary_metric(result: dict) -> tuple[str, float | None]:
    """Pick the headline metric of one plot-report result row.

    Corn/soy use "Yield", silage uses "Ton/Acre"; failing those, the
    first numeric metric wins. Returns ``("", None)`` when the row has
    no numeric metric at all.
    """
    metrics = result.get("metrics") or {}
    # Back-compat: old sidecars had yield_bu_ac directly on the row.
    if not metrics and result.get("yield_bu_ac") is not None:
        return ("Yield", result["yield_bu_ac"])
    for key in ("Yield", "Ton/Acre", "Tons/Acre"):
        value = metrics.get(key)
        if isinstance(value, (int, float)):
            return (key, value)
    for key, value in metrics.items():
        if isinstance(value, (int, float)):
            return (key, value)
    return ("", None)


def _render_gh_plot_chunk(sidecar: dict) -> str:
    """Render a Golden Harvest plot report (per-site cross-vendor).

    Layout: title line, a facts sentence (cooperator, dates, population,
    row width), a "Top N" summary of the first five results, the full
    ranking table, and a ``Source:`` footer with the first source URL.

    Args:
        sidecar: parsed sidecar dict for one plot report.

    Returns:
        The chunk text, ending in exactly one newline.
    """
    lines: list[str] = []
    crop = (sidecar.get("crop") or "").lower()
    crop_label = {
        "corn": "Corn",
        "soybeans": "Soybean",
        "silage": "Silage",
    }.get(crop, crop.title())
    state = sidecar.get("state") or sidecar.get("state_abbrev") or ""
    year = sidecar.get("year") or ""
    cooperator = sidecar.get("cooperator") or ""

    lines.append(f"# {crop_label} yield trial — {state}, {year}")
    lines.append("")

    facts = ["Golden Harvest plot report (cross-vendor)"]
    if cooperator:
        facts.append(f"cooperator {cooperator}")
    if sidecar.get("planted_date"):
        facts.append(f"planted {sidecar['planted_date']}")
    if sidecar.get("harvested_date"):
        facts.append(f"harvested {sidecar['harvested_date']}")
    population = sidecar.get("population_seeds_per_acre")
    if population:
        # The ',' format option is only valid for numbers; a population
        # stored as text (e.g. "34,000") is emitted verbatim instead.
        if isinstance(population, (int, float)):
            population_text = f"{population:,}"
        else:
            population_text = str(population)
        facts.append(f"population {population_text} seeds/acre")
    if sidecar.get("row_width_in"):
        facts.append(f'{sidecar["row_width_in"]}" rows')
    lines.append(". ".join(facts) + ".")
    lines.append("")

    results = sidecar.get("results") or []
    if results:
        top = results[:5]
        rendered_top_parts: list[str] = []
        for i, r in enumerate(top):
            label, val = _primary_metric(r)
            # `or` (not a .get default) so an explicit null renders as
            # "?" rather than the string "None".
            piece = (
                f"#{r.get('rank') or i + 1} "
                f"{r.get('brand') or '?'} {r.get('product') or '?'}"
            )
            if r.get("traits"):
                piece += f" {r['traits']}"
            if val is not None:
                piece += f" — {val} {label}"
            rendered_top_parts.append(piece)
        if rendered_top_parts:
            lines.append(
                f"Top {len(top)} ({crop_label}, {state} {year}): "
                + ", ".join(rendered_top_parts)
                + "."
            )
            lines.append("")

        # Discover the metric column order from the first result with metrics.
        metric_keys: list[str] = []
        for r in results:
            metrics = r.get("metrics") or {}
            if metrics:
                metric_keys = list(metrics.keys())
                break
        # Back-compat: synthesize from legacy fields if no metrics dict.
        if not metric_keys and any(
            r.get("yield_bu_ac") is not None for r in results
        ):
            metric_keys = ["Yield", "%MST", "Test Weight", "Gross Revenue"]

        # Full ranking — one row per result, "-" for missing cells.
        col_headers = ["rank", "brand", "product", "traits"] + metric_keys
        lines.append("Full ranking (" + " | ".join(col_headers) + "):")
        for r in results:
            row = [
                f"#{r.get('rank') or '-'}",
                r.get("brand") or "-",
                r.get("product") or "-",
                r.get("traits") or "-",
            ]
            metrics = r.get("metrics") or {}
            # Back-compat shim
            if not metrics:
                metrics = {
                    "Yield": r.get("yield_bu_ac"),
                    "%MST": r.get("mst_pct"),
                    "Test Weight": r.get("test_weight"),
                    "Gross Revenue": r.get("gross_revenue_dol_ac"),
                }
            for k in metric_keys:
                v = metrics.get(k)
                if v is None:
                    row.append("-")
                elif isinstance(v, (int, float)) and ("Revenue" in k or "$" in k):
                    # Numeric money columns get a dollar sign and cents.
                    row.append(f"${v:.2f}")
                else:
                    row.append(str(v))
            lines.append(" " + " | ".join(row))
        lines.append("")

    urls = sidecar.get("source_urls") or []
    if urls:
        lines.append(f"Source: {urls[0]}")
    return "\n".join(lines).strip() + "\n"


def _render_agripro_trial_chunk(sidecar: dict) -> str:
    """Render the preamble of an AgriPro regional trial PDF.

    Emits the title, a facts sentence (region, wheat class, year or
    year range), the list of varieties found, and a trailing
    "Trial data (verbatim from PDF):" marker line. The verbatim body is
    NOT in the sidecar; ``_render_trial_chunk`` appends it from the
    ``.md`` text when that is available.

    Returns:
        The preamble text, ending in exactly one newline.
    """
    lines: list[str] = []
    title = (
        sidecar.get("title")
        or sidecar.get("filename")
        or sidecar.get("source_key", "")
    )
    lines.append(f"# {title}")
    lines.append("")

    facts = ["AgriPro / Syngenta regional wheat trial"]
    if sidecar.get("region"):
        facts.append(f"region {sidecar['region']}")
    if sidecar.get("wheat_class_section"):
        facts.append(f"class {sidecar['wheat_class_section']}")
    if sidecar.get("years_covered") and len(sidecar["years_covered"]) > 1:
        yc = sidecar["years_covered"]
        facts.append(f"years {yc[0]}–{yc[-1]}")
    elif sidecar.get("year"):
        facts.append(f"year {sidecar['year']}")
    lines.append(". ".join(facts) + ".")
    lines.append("")

    varieties = sidecar.get("varieties_found") or []
    if varieties:
        lines.append("Varieties listed: " + ", ".join(varieties) + ".")
        lines.append("")

    # Marker for the verbatim trial data. Keeping variety names and
    # yield numbers adjacent lets BM25/dense retrieval match queries
    # like "AP Iliad Aberdeen Idaho". The text itself lives in the .md
    # body, not in the sidecar JSON, so it is appended by the caller.
    lines.append("Trial data (verbatim from PDF):")
    lines.append("")
    return "\n".join(lines).strip() + "\n"


def _render_trial_chunk(sidecar: dict, md_text: str | None = None) -> str:
    """Dispatch to the right trial renderer by ``sidecar["source"]``.

    Includes the verbatim trial body for sources whose value lives in
    the body text (currently ``agripro_trials``). Unknown sources fall
    back to the Golden Harvest plot renderer.

    Args:
        sidecar: parsed trial sidecar dict.
        md_text: markdown body of the trial, or None when unavailable.
    """
    source = sidecar.get("source")
    if source == "gh_plot_reports":
        return _render_gh_plot_chunk(sidecar)
    if source == "agripro_trials":
        header = _render_agripro_trial_chunk(sidecar)
        if not md_text:
            return header
        # Keep only what follows the trial-data heading so the body is
        # the actual trial data, not the per-source preamble. If the
        # heading is absent the whole markdown text is used.
        body = md_text
        sep = "## Trial data (verbatim from PDF)"
        if sep in body:
            body = body.split(sep, 1)[1].strip()
        # Strip fence markers
        body = re.sub(r"```", "", body).strip()
        return header + "\n" + body + "\n"
    # Fallback: generic trial render
    return _render_gh_plot_chunk(sidecar)


def _flat_trial_metadata(sidecar: dict) -> dict:
    """Chroma-safe metadata for trial chunks.

    Mirrors variety metadata plus trial-specific facets. Always-present
    keys: source, source_key, data_type (default "trial"), vendor,
    brand (uppercased), crop (lowercased), source_url. Optional keys:
    year, state, state_abbrev, region, wheat_class, plot_id, n_results.
    """
    md: dict = {
        "source": sidecar.get("source") or "",
        "source_key": sidecar.get("source_key") or "",
        "data_type": sidecar.get("data_type") or "trial",
        "vendor": sidecar.get("vendor") or "",
        "brand": (sidecar.get("brand") or "").upper(),
        "crop": (sidecar.get("crop") or "").lower(),
        "source_url": (sidecar.get("source_urls") or [""])[0],
    }
    year = sidecar.get("year")
    if isinstance(year, int):
        md["year"] = year
    state = sidecar.get("state_abbrev") or sidecar.get("state")
    if state:
        # Short values are treated as abbreviations and uppercased;
        # longer values (full state names) keep their casing.
        md["state"] = state.upper() if len(state) <= 3 else state
        md["state_abbrev"] = (sidecar.get("state_abbrev") or "").upper()
    if sidecar.get("region"):
        md["region"] = sidecar["region"]
    if sidecar.get("wheat_class_section"):
        md["wheat_class"] = sidecar["wheat_class_section"]
    if sidecar.get("plot_id"):
        md["plot_id"] = sidecar["plot_id"]
    if isinstance(sidecar.get("n_results"), int):
        md["n_results"] = sidecar["n_results"]
    return md


# nomic-embed-text caps at 2,048 tokens (Ollama returns HTTP 400 on
# inputs that exceed this). chars/token ratio varies wildly:
#   prose:                ~3.5 chars/token
#   numeric trial tables: ~2.4 chars/token (GH plot reports with
#                         full ranking tables)
# Empirically: GH plot reports failed at 5,261+ chars; agripro
# trials at 5,552 chars sometimes failed. Cap at 4,500 chars =
# ~2.2 chars/token worst-case for 2,048 tokens, leaving safe
# headroom across all source types. The FULL text stays in the
# on-disk .md so get_page returns it verbatim regardless.
MAX_EMBED_CHARS = 4500


def _truncate_for_embed(text: str) -> tuple[str, bool]:
    """Cap chunk text to fit nomic-embed-text's 2,048-token context.

    Returns ``(maybe_truncated_text, was_truncated)``. The head is
    preserved because high-signal content (variety identity, top
    performers, ratings preamble) sits at the start of every chunk
    type we produce. A truncated result ends with a marker suffix and
    is never longer than ``MAX_EMBED_CHARS``.
    """
    if len(text) <= MAX_EMBED_CHARS:
        return text, False
    suffix = "\n…(truncated for embedding; full text via get_page)\n"
    # Reserve room for the suffix so the total stays within the cap.
    body = text[: MAX_EMBED_CHARS - len(suffix)].rstrip()
    return body + suffix, True


def chunks_from_trial(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield chunk dict(s) for one trial document.

    Emits exactly one chunk per trial.

    Args:
        sidecar_path: path to the trial's JSON sidecar.
        md_path: path to the trial's markdown body (used for AgriPro
            PDFs whose value lives in the verbatim text). If omitted
            we infer it from sidecar_path by swapping the suffix for
            ``.md``. A missing markdown file is not an error.

    Yields:
        A dict with keys ``id`` ("<source>::<source_key>::0"), ``text``
        and ``metadata`` (flat metadata plus ``ordinal`` 0, and
        ``embed_truncated`` True when the text was capped).
    """
    sc_path = Path(sidecar_path)
    sidecar = json.loads(sc_path.read_text(encoding="utf-8"))
    md_p = Path(md_path) if md_path else sc_path.with_suffix(".md")
    # is_file() rather than exists(): a directory at that path must not
    # be handed to read_text().
    md_text: str | None = (
        md_p.read_text(encoding="utf-8") if md_p.is_file() else None
    )
    text = _render_trial_chunk(sidecar, md_text=md_text)
    text, truncated = _truncate_for_embed(text)
    meta = _flat_trial_metadata(sidecar)
    if truncated:
        meta["embed_truncated"] = True
    chunk_id = f"{meta['source']}::{meta['source_key']}::0"
    yield {
        "id": chunk_id,
        "text": text,
        "metadata": {**meta, "ordinal": 0},
    }


# ----- Backwards-compat shim for the template's index.py -------------------
#
# The template's ``rag.index.page_records`` calls
# ``chunks_from_page(md, page_id, base_meta)`` which doesn't know about
# sidecar JSON. We accept that signature but ignore it — index.py has
# been updated to use ``chunks_from_variety`` directly, and this shim
# is here only so a stray import of the old name doesn't break.
#
def chunks_from_page(text: str, page_id: str, metadata: dict) -> Iterator[dict]:
    """Deprecated for seed-mcp; prefer ``chunks_from_variety``.

    If ``metadata`` carries a ``_sidecar_path`` the call is forwarded to
    ``chunks_from_variety``; otherwise a single chunk wrapping the raw
    ``text`` is yielded with id "<source or 'unknown'>::<page_id>::0".
    """
    # Best-effort: if metadata carries a sidecar_path, dispatch.
    sidecar_path = metadata.get("_sidecar_path")
    if sidecar_path:
        yield from chunks_from_variety(sidecar_path)
        return
    # Fallback — emit a single chunk of the raw markdown with whatever
    # metadata we have. Better than crashing if someone calls this.
    chunk_id = f"{metadata.get('source','unknown')}::{page_id}::0"
    yield {
        "id": chunk_id,
        "text": text,
        "metadata": {**metadata, "ordinal": 0},
    }