Files
claude a54fac240f
Image rebuild (skip scrape) / build (push) Successful in 5m54s
Add university-extension trials: Illinois VT + Iowa ICPT + Ohio OCPT (+123 cross-vendor trial docs) (#19)
Co-authored-by: claude <claude@jpaul.io>
Co-committed-by: claude <claude@jpaul.io>
2026-06-10 08:36:19 -04:00

684 lines
27 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Chunker for seed-variety corpus.
Each variety becomes ONE chunk by default. Variety pages are small
(typically 2-3 KB of useful signal) and nomic-embed-text handles up
to ~8 K tokens cleanly. Splitting a variety across chunks dilutes
the named-rating embeddings (e.g. "SCN resistance 7") that farmers
search by — keep them together.
The chunk text is a synthetic preamble assembled deterministically
from the sidecar JSON. Every value in the chunk text comes verbatim
from the source. The framing words ("Disease ratings (1-9, 9=best):",
"Maturity group:", etc.) are template glue — *we add structure, we
do NOT add facts*. Given the same sidecar, this chunker always
produces the same chunk text. That's the anti-hallucination
contract: the retriever can never surface a rating value that
wasn't in the source.
Metadata is flattened to Chroma-safe primitives (str/int/float/bool):
source "bayer_seeds"
source_key "dekalb-dkc075-70rib"
vendor "Bayer"
brand "DEKALB"
crop "corn" | "soybeans" | "wheat"
product_name "DKC075-70RIB BRAND BLEND"
product_id canonical full id
source_url the variety's page URL
rm corn RM as int when parseable (else absent)
mg soy MG as float when parseable (else absent)
release_year int when known
trait_codes_csv comma-separated trait codes for substring search
rating_scale "1-9 (9 = best)" — chunker should ALWAYS attach
this so downstream code can detect a flip
ordinal chunk index within variety (0-based)
Lists like ``regional_recommendations`` and the full per-rating dicts
do NOT fit Chroma's metadata constraints — they stay in the sidecar
JSON, surfaced by ``get_page`` / ``lookup_variety``.
"""
from __future__ import annotations
import json
import re
from pathlib import Path
from typing import Iterator
# Rating-group classification.
#
# Vendors publish variety characteristics under their own group headings.
# Each heading (compared upper-cased) is routed to one of three buckets in
# the chunk preamble so retrieval sees coherent text. A heading that appears
# in none of these sets is still emitted, under its own section.

# Disease / pest resistance headings.
DISEASE_GROUP_LABELS = {
    "DISEASE",                      # Channel + Deltapine (Bayer)
    "DISEASE RATINGS",              # Bayer DEKALB / Asgrow / WestBred
    "PEST AND DISEASE RESISTANCE",  # WestBred wheat
}

# Agronomic performance headings.
AGRONOMIC_GROUP_LABELS = {
    "AGRONOMIC CHARACTERISTICS",  # Channel + Deltapine
    "GROWTH",
    "HARVEST",
    "KEY CHARACTERISTICS",
    "MATURITY",                   # Channel — RM / GDU
    "PRODUCTION",
    "QUALITY",
}

# Management / placement headings.
MANAGEMENT_GROUP_LABELS = {
    "ADAPTATION",   # Channel — regional placement
    "HERBICIDE",
    "HERBICIDES",   # Channel (plural)
    "MANAGEMENT",
    "OTHER",        # Channel — misc trait/management
    "PLANT DESCRIPTION",
    "SENSITIVITY",
}
def _parse_rm(value: object) -> int | None:
"""Best-effort RM-days int. Returns None if not a clean integer
(e.g. wheat's qualitative 'Early'/'Medium-Early' values)."""
if value is None:
return None
s = str(value).strip()
if not s:
return None
try:
# Handle floats stored as strings ("105.0") and the trailing
# tenths sometimes seen on early corn ("75").
return int(float(s))
except ValueError:
return None
def _parse_mg(value: object) -> float | None:
"""Best-effort MG float. Soy MGs go from 00 to 9.0 with one decimal."""
if value is None:
return None
s = str(value).strip()
if not s:
return None
try:
return float(s)
except ValueError:
return None
def _format_items(items: list[dict]) -> str:
"""Render `[{characteristic, value}, ...]` to a compact inline list."""
out: list[str] = []
for it in items:
ch = (it.get("characteristic") or "").strip()
v = (it.get("value") or "").strip()
if ch and v:
out.append(f"{ch} {v}")
elif ch:
out.append(f"{ch} —")
return ", ".join(out)
def _render_variety_chunk(sidecar: dict) -> str:
"""Build the dense preamble for one variety from its sidecar JSON.
Faithful to source: every numeric/categorical *value* is verbatim
from ``sidecar``. The only generated text is the framing language.
"""
lines: list[str] = []
# ---- Identity line --------------------------------------------------
name = sidecar.get("product_name") or sidecar.get("source_key") or ""
brand = (sidecar.get("brand") or "").strip()
vendor = sidecar.get("vendor") or ""
crop = (sidecar.get("crop") or "").strip()
crop_label = crop.capitalize() if crop else ""
ident = f"# {name}"
sub = " ".join(filter(None, [
f"({brand.title()} {crop_label} variety, {vendor})" if brand and crop_label and vendor else "",
]))
lines.append(ident)
if sub:
lines.append("")
lines.append(sub)
# ---- Identity body --------------------------------------------------
facts: list[str] = []
rm = sidecar.get("relative_maturity")
mg = sidecar.get("maturity_group")
wc = sidecar.get("wheat_class")
if crop == "corn" and rm:
facts.append(f"Relative maturity {rm}")
elif crop == "soybeans" and mg:
facts.append(f"Maturity group {mg}")
elif crop == "wheat":
if rm:
facts.append(f"Maturity {rm}")
if wc:
facts.append(f"Wheat class {wc}")
traits = sidecar.get("trait_stack") or []
trait_descs = sidecar.get("trait_descriptions") or []
if traits:
if trait_descs:
facts.append(
"Trait stack: "
+ ", ".join(traits)
+ " ("
+ "; ".join(trait_descs)
+ ")"
)
else:
facts.append("Trait stack: " + ", ".join(traits))
if sidecar.get("release_year"):
facts.append(f"Released {sidecar['release_year']}")
if facts:
lines.append("")
lines.append(". ".join(facts) + ".")
# ---- Positioning ----------------------------------------------------
pos = (sidecar.get("positioning_statement") or "").strip()
if pos:
lines.append("")
lines.append(f"Positioning: {pos}")
# ---- Ratings, bucketed for retrieval --------------------------------
scale = sidecar.get("_scale_direction") or "(scale direction not declared)"
groups = sidecar.get("characteristics_groups") or []
disease: list[dict] = []
agronomic: list[dict] = []
management: list[dict] = []
other: list[tuple[str, list[dict]]] = []
for g in groups:
label = (g.get("label") or "").upper().strip()
items = g.get("items") or []
if not items:
continue
if label in DISEASE_GROUP_LABELS:
disease.extend(items)
elif label in AGRONOMIC_GROUP_LABELS:
agronomic.extend(items)
elif label in MANAGEMENT_GROUP_LABELS:
management.extend(items)
else:
other.append((g.get("label") or "Other", items))
if disease:
lines.append("")
lines.append(f"Disease ratings ({scale}): {_format_items(disease)}.")
if agronomic:
lines.append("")
lines.append(f"Agronomic ratings ({scale}): {_format_items(agronomic)}.")
if management:
lines.append("")
lines.append(f"Management notes: {_format_items(management)}.")
for label, items in other:
lines.append("")
lines.append(f"{label.title()}: {_format_items(items)}.")
# ---- Strengths narrative --------------------------------------------
strengths = sidecar.get("strengths") or []
if strengths:
lines.append("")
lines.append("Strengths and management notes:")
for s in strengths:
s = (s or "").strip()
if s:
lines.append(f"- {s}")
# ---- Regional listings (compact, not the agronomist emails) ---------
rec = sidecar.get("regional_recommendations") or []
if rec:
names = sorted({
(r.get("product_list_name") or "").strip()
for r in rec
if (r.get("product_list_name") or "").strip()
})
if names:
lines.append("")
lines.append("Listed in regional seed guides: " + "; ".join(names) + ".")
# ---- Provenance footer (must always be in the chunk text so it
# can never be lost between retrieval and LLM rendering) --------
urls = sidecar.get("source_urls") or []
if urls:
lines.append("")
lines.append(f"Source: {urls[0]}")
return "\n".join(lines).strip() + "\n"
def _flat_metadata(sidecar: dict) -> dict:
    """Reduce a variety sidecar to flat, Chroma-safe metadata.

    Always-present keys are strings (empty when the sidecar lacks them).
    ``rm``, ``mg``, ``release_year``, ``trait_codes_csv`` and ``wheat_class``
    are added only when the sidecar supplies a usable value.

    Brand is upper-cased and crop lower-cased so filters match regardless
    of how a vendor happens to case them (Bayer uses "DEKALB", Syngenta
    uses "Golden Harvest"). The sidecar keeps the original casing for
    display; only this metadata is normalised.
    """
    first_url = (sidecar.get("source_urls") or [""])[0]
    flat: dict = {
        "source": sidecar.get("source") or "",
        "source_key": sidecar.get("source_key") or "",
        "data_type": "variety",
        "vendor": sidecar.get("vendor") or "",
        "brand": (sidecar.get("brand") or "").upper(),
        "crop": (sidecar.get("crop") or "").lower(),
        "product_name": sidecar.get("product_name") or "",
        "product_id": sidecar.get("product_id") or "",
        "source_url": first_url,
        "rating_scale": sidecar.get("_scale_direction") or "",
    }

    # Numeric maturity facets: present only when they parse.
    parsed = {
        "rm": _parse_rm(sidecar.get("relative_maturity")),
        "mg": _parse_mg(sidecar.get("maturity_group")),
    }
    for key, number in parsed.items():
        if number is not None:
            flat[key] = number

    release_year = sidecar.get("release_year")
    if isinstance(release_year, int):
        flat["release_year"] = release_year

    trait_codes = sidecar.get("trait_stack") or []
    if trait_codes:
        # Leading and trailing commas let a substring match such as
        # `LIKE '%,XF,%'` hit whole trait codes only.
        flat["trait_codes_csv"] = f",{','.join(trait_codes)},"

    wheat_class = sidecar.get("wheat_class")
    if wheat_class:
        flat["wheat_class"] = wheat_class
    return flat
def chunks_from_variety(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield the chunk for one variety; currently always exactly one.

    Args:
        sidecar_path: path to the variety's JSON sidecar.
        md_path: unused. The chunk is rebuilt from the sidecar alone; the
            parameter stays in the signature in case a future
            split-chunker wants the rendered body.

    Yields:
        A dict with ``id`` (``"<source>::<source_key>::0"``), ``text`` and
        ``metadata`` (flattened sidecar metadata plus ``ordinal`` 0, and
        ``embed_truncated`` when the text was cut).
    """
    raw_json = Path(sidecar_path).read_text(encoding="utf-8")
    sidecar = json.loads(raw_json)

    # Defensive cap: variety text is usually short, but a sidecar with long
    # characteristics_groups can run wide.
    embed_text, was_cut = _truncate_for_embed(_render_variety_chunk(sidecar))

    metadata = _flat_metadata(sidecar)
    if was_cut:
        metadata["embed_truncated"] = True
    chunk_id = "::".join((metadata["source"], metadata["source_key"], "0"))
    metadata["ordinal"] = 0

    yield {"id": chunk_id, "text": embed_text, "metadata": metadata}
# ===========================================================================
# Trial chunker — for sidecars with data_type="trial"
# ===========================================================================
#
# Trial documents are a different shape from variety identity:
# - GH plot reports: per-site head-to-head yield comparison across brands
# - AgriPro trial PDFs: regional multi-year multi-location summary
#
# Both produce ONE chunk per document with a preamble that emphasizes
# the trial's location/year/top performers so the embedder gets clean
# signal for queries like "best corn for sandy soil Iowa 2024".
def _render_gh_plot_chunk(sidecar: dict, *, include_region: bool = False) -> str:
"""Render a cross-vendor plot report (per-site head-to-head).
Originally GH-specific; now also handles ``lg_plot_reports`` and
``agrigold_plot_reports`` since they emit the same sidecar shape.
The preamble's "Source:" line uses the actual brand from the
sidecar so the LLM sees who PUBLISHED the trial (Bayer-side
queries should still find DEKALB results inside a GH or AgriGold
plot — search filters target ``brand_in_results``, not the
publisher's brand).
``include_region`` (university-trial sources) folds the
region/district into the title + facts so it's in the embedded
text — these sources publish many same-state/year tables that are
only distinguished by region (e.g. Iowa "District South"), and
without this the region lived only in metadata/the .md body.
"""
lines: list[str] = []
crop = (sidecar.get("crop") or "").lower()
crop_label = {
"corn": "Corn", "soybeans": "Soybean", "silage": "Silage",
"sorghum": "Sorghum",
}.get(crop, crop.title())
state = sidecar.get("state") or sidecar.get("state_abbrev") or ""
year = sidecar.get("year") or ""
cooperator = sidecar.get("cooperator") or ""
region = (sidecar.get("region") or "").strip() if include_region else ""
title = f"# {crop_label} yield trial — {state}, {year}"
if region:
title += f" ({region})"
lines.append(title)
lines.append("")
# Publisher label — emphasizes the source brand for retrieval.
publisher_brand = sidecar.get("brand") or "Golden Harvest"
facts = [f"{publisher_brand} {'variety trial (cross-vendor, independent third-party)' if include_region else 'plot report (cross-vendor)'}"]
if region:
facts.append(f"region {region}")
if cooperator:
facts.append(f"cooperator {cooperator}")
if sidecar.get("planted_date"):
facts.append(f"planted {sidecar['planted_date']}")
if sidecar.get("harvested_date"):
facts.append(f"harvested {sidecar['harvested_date']}")
if sidecar.get("population_seeds_per_acre"):
facts.append(f"population {sidecar['population_seeds_per_acre']:,} seeds/acre")
if sidecar.get("row_width_in"):
facts.append(f"{sidecar['row_width_in']}\" rows")
lines.append(". ".join(facts) + ".")
lines.append("")
results = sidecar.get("results") or []
if results:
# Pick the primary metric for ranking: corn/soy use "Yield",
# silage uses "Ton/Acre". Find the first metric key with a
# numeric value in the top result.
def _primary(r: dict) -> tuple[str, float | None]:
metrics = r.get("metrics") or {}
# Back-compat: old sidecars had yield_bu_ac directly.
if not metrics and r.get("yield_bu_ac") is not None:
return ("Yield", r["yield_bu_ac"])
for k in ("Yield", "Ton/Acre", "Tons/Acre"):
v = metrics.get(k)
if isinstance(v, (int, float)):
return (k, v)
for k, v in metrics.items():
if isinstance(v, (int, float)):
return (k, v)
return ("", None)
top = results[: min(5, len(results))]
primary_label, _ = _primary(top[0]) if top else ("", None)
rendered_top_parts: list[str] = []
for i, r in enumerate(top):
label, val = _primary(r)
piece = f"#{r.get('rank') or i+1} {r.get('brand','?')} {r.get('product','?')}"
if r.get('traits'):
piece += f" {r['traits']}"
if val is not None:
piece += f" — {val} {label}"
rendered_top_parts.append(piece)
if rendered_top_parts:
lines.append(
f"Top {len(top)} ({crop_label}, {state} {year}): "
+ ", ".join(rendered_top_parts) + "."
)
lines.append("")
# Discover the metric column order from the first result with metrics.
metric_keys: list[str] = []
for r in results:
metrics = r.get("metrics") or {}
if metrics:
metric_keys = list(metrics.keys())
break
# Back-compat: synthesize from legacy fields if no metrics dict.
if not metric_keys and any(
r.get("yield_bu_ac") is not None for r in results
):
metric_keys = ["Yield", "%MST", "Test Weight", "Gross Revenue"]
# Full ranking — preserves every datapoint verbatim.
col_headers = ["rank", "brand", "product", "traits"] + metric_keys
lines.append("Full ranking (" + " | ".join(col_headers) + "):")
for r in results:
row = [
f"#{r.get('rank') or '-'}",
r.get("brand") or "-",
r.get("product") or "-",
r.get("traits") or "-",
]
metrics = r.get("metrics") or {}
# Back-compat shim
if not metrics:
metrics = {
"Yield": r.get("yield_bu_ac"),
"%MST": r.get("mst_pct"),
"Test Weight": r.get("test_weight"),
"Gross Revenue": r.get("gross_revenue_dol_ac"),
}
for k in metric_keys:
v = metrics.get(k)
if v is None:
row.append("-")
elif isinstance(v, (int, float)):
if "Revenue" in k or "$" in k:
row.append(f"${v:.2f}")
else:
row.append(str(v))
else:
row.append(str(v))
lines.append(" " + " | ".join(row))
lines.append("")
urls = sidecar.get("source_urls") or []
if urls:
lines.append(f"Source: {urls[0]}")
return "\n".join(lines).strip() + "\n"
def _render_agripro_trial_chunk(sidecar: dict) -> str:
"""Render an AgriPro regional trial PDF — preamble + verbatim text."""
lines: list[str] = []
title = sidecar.get("title") or sidecar.get("filename") or sidecar.get("source_key", "")
lines.append(f"# {title}")
lines.append("")
facts = ["AgriPro / Syngenta regional wheat trial"]
if sidecar.get("region"):
facts.append(f"region {sidecar['region']}")
if sidecar.get("wheat_class_section"):
facts.append(f"class {sidecar['wheat_class_section']}")
if sidecar.get("years_covered") and len(sidecar["years_covered"]) > 1:
yc = sidecar["years_covered"]
facts.append(f"years {yc[0]}{yc[-1]}")
elif sidecar.get("year"):
facts.append(f"year {sidecar['year']}")
lines.append(". ".join(facts) + ".")
lines.append("")
varieties = sidecar.get("varieties_found") or []
if varieties:
lines.append("Varieties listed: " + ", ".join(varieties) + ".")
lines.append("")
# Verbatim trial data — preserves variety + yield numbers adjacent
# so BM25/dense can match "AP Iliad Aberdeen Idaho" queries.
lines.append("Trial data (verbatim from PDF):")
lines.append("")
# The actual text was in the .md body but isn't in the sidecar
# JSON. We render a brief marker; full text goes in the .md file
# that get_page returns. For embedding signal, the title +
# varieties + region is usually enough.
# If we want the FULL text in the chunk we'd need to either store
# it in the sidecar OR read it from the .md path at chunk time.
# Read from the .md path:
return "\n".join(lines).strip() + "\n"
def _render_trial_chunk(sidecar: dict, md_text: str | None = None) -> str:
"""Dispatch to the right trial renderer by source. Includes the
verbatim trial body for sources whose value lives in the body text
(currently agripro_trials)."""
source = sidecar.get("source")
# Cross-vendor plot-report sources all share the gh_plot_reports
# sidecar shape (results: [{rank,brand,product,traits,metrics}]),
# so they route through the same renderer. The renderer reads
# ``brand`` from the sidecar so the publisher label is correct
# for each (Golden Harvest / LG Seeds / AgriGold).
if source in ("gh_plot_reports", "lg_plot_reports", "agrigold_plot_reports"):
return _render_gh_plot_chunk(sidecar)
if source in ("illinois_vt_trials", "iowa_icpt_trials", "ohio_ocpt_trials"):
# University-extension variety trials — same results[] shape, but
# fold region/district into the embedded text (many same-state/year
# tables) + label as an independent third-party variety trial.
return _render_gh_plot_chunk(sidecar, include_region=True)
if source == "proharvest_plots":
# Structured rows → shared cross-vendor renderer (publisher brand
# read from the sidecar). Foreign-format third-party PDFs that
# couldn't be parsed into rows carry raw_text=True and the verbatim
# table text in the .md body — embed that so they're still found.
if sidecar.get("results"):
return _render_gh_plot_chunk(sidecar)
crop = (sidecar.get("crop") or "").lower()
crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(crop, crop.title())
coop = sidecar.get("cooperator") or ""
state = sidecar.get("state") or ""
year = sidecar.get("year") or ""
head = [
f"# {crop_label} yield trial — {coop} ({state}, {year})", "",
"ProHarvest Seeds plot report (cross-vendor, verbatim from PDF).", "",
]
body = md_text or ""
sep = "## Trial data (verbatim from PDF)"
if sep in body:
body = body.split(sep, 1)[1].strip()
body = re.sub(r"```", "", body).strip()
return "\n".join(head) + "\n" + body + "\n"
if source == "agripro_trials":
header = _render_agripro_trial_chunk(sidecar)
if md_text:
# Strip the markdown frontmatter so the body text is the
# actual trial data, not the per-source preamble.
body = md_text
sep = "## Trial data (verbatim from PDF)"
if sep in body:
body = body.split(sep, 1)[1].strip()
# Strip fence markers
body = re.sub(r"```", "", body).strip()
return header + "\n" + body + "\n"
return header
# Fallback: generic trial render
return _render_gh_plot_chunk(sidecar)
def _flat_trial_metadata(sidecar: dict) -> dict:
"""Chroma-safe metadata for trial chunks. Mirrors variety metadata
plus trial-specific facets (state, year, data_type)."""
md: dict = {
"source": sidecar.get("source") or "",
"source_key": sidecar.get("source_key") or "",
"data_type": sidecar.get("data_type") or "trial",
"vendor": sidecar.get("vendor") or "",
"brand": (sidecar.get("brand") or "").upper(),
"crop": (sidecar.get("crop") or "").lower(),
"source_url": (sidecar.get("source_urls") or [""])[0],
}
year = sidecar.get("year")
if isinstance(year, int):
md["year"] = year
state = sidecar.get("state_abbrev") or sidecar.get("state")
if state:
md["state"] = state.upper() if len(state) <= 3 else state
md["state_abbrev"] = (sidecar.get("state_abbrev") or "").upper()
if sidecar.get("region"):
md["region"] = sidecar["region"]
if sidecar.get("wheat_class_section"):
md["wheat_class"] = sidecar["wheat_class_section"]
if sidecar.get("plot_id"):
md["plot_id"] = sidecar["plot_id"]
if isinstance(sidecar.get("n_results"), int):
md["n_results"] = sidecar["n_results"]
return md
# nomic-embed-text caps at 2,048 tokens (Ollama returns HTTP 400 on
# inputs that exceed this). chars/token ratio varies wildly:
# prose: ~3.5 chars/token
# numeric trial tables: ~2.4 chars/token (GH plot reports with
# full ranking tables)
# Empirically: GH plot reports failed at 5,261+ chars; agripro
# trials at 5,552 chars sometimes failed. Cap at 4,500 chars =
# ~2.2 chars/token worst-case for 2,048 tokens, leaving safe
# headroom across all source types. The FULL text stays in the
# on-disk .md so get_page returns it verbatim regardless.
MAX_EMBED_CHARS = 4500
def _truncate_for_embed(text: str) -> tuple[str, bool]:
"""Cap chunk text to fit nomic-embed-text's 2,048-token context.
Returns ``(maybe_truncated_text, was_truncated)``. The head is
preserved because high-signal content (variety identity, top
performers, ratings preamble) sits at the start of every chunk
type we produce.
"""
if len(text) <= MAX_EMBED_CHARS:
return text, False
suffix = "\n…(truncated for embedding; full text via get_page)\n"
body = text[: MAX_EMBED_CHARS - len(suffix)].rstrip()
return body + suffix, True
def chunks_from_trial(
    sidecar_path: Path | str,
    *,
    md_path: Path | str | None = None,
) -> Iterator[dict]:
    """Yield the chunk for one trial document; always exactly one.

    Args:
        sidecar_path: path to the trial's JSON sidecar.
        md_path: path to the trial's markdown body, used by sources whose
            value lives in the verbatim text. When omitted, the sidecar
            path with a ``.md`` suffix is tried instead.

    Yields:
        A dict with ``id`` (``"<source>::<source_key>::0"``), ``text`` and
        ``metadata`` (flattened trial metadata plus ``ordinal`` 0, and
        ``embed_truncated`` when the text was cut).
    """
    sc_path = Path(sidecar_path)
    sidecar = json.loads(sc_path.read_text(encoding="utf-8"))

    # The markdown body is optional: a missing file just means no body text.
    body_path = Path(md_path) if md_path else sc_path.with_suffix(".md")
    md_text: str | None = (
        body_path.read_text(encoding="utf-8") if body_path.exists() else None
    )

    rendered = _render_trial_chunk(sidecar, md_text=md_text)
    embed_text, was_cut = _truncate_for_embed(rendered)

    metadata = _flat_trial_metadata(sidecar)
    if was_cut:
        metadata["embed_truncated"] = True
    chunk_id = "::".join((metadata["source"], metadata["source_key"], "0"))
    metadata["ordinal"] = 0

    yield {"id": chunk_id, "text": embed_text, "metadata": metadata}
# ----- Backwards-compat shim for the template's index.py -------------------
#
# The template's ``rag.index.page_records`` calls
# ``chunks_from_page(md, page_id, base_meta)`` which doesn't know about
# sidecar JSON. We accept that signature but ignore it — index.py has
# been updated to use ``chunks_from_variety`` directly, and this shim
# is here only so a stray import of the old name doesn't break.
#
def chunks_from_page(text: str, page_id: str, metadata: dict) -> Iterator[dict]:
    """Deprecated for seed-mcp; prefer ``chunks_from_variety``.

    Kept so a stray import of the old name still works. When ``metadata``
    carries a ``_sidecar_path`` the call is forwarded to
    ``chunks_from_variety``; otherwise the raw markdown is emitted as one
    chunk with the metadata it was given plus ``ordinal`` 0.
    """
    sidecar_path = metadata.get("_sidecar_path")
    if not sidecar_path:
        # No sidecar to dispatch on: a single raw chunk beats crashing.
        source = metadata.get("source", "unknown")
        fallback_meta = dict(metadata)
        fallback_meta["ordinal"] = 0
        yield {
            "id": f"{source}::{page_id}::0",
            "text": text,
            "metadata": fallback_meta,
        }
        return
    yield from chunks_from_variety(sidecar_path)