"""Markdown chunker — paragraph-aware, ~400-600 token target.

Adjust the chunking strategy per product if your page format differs
significantly from prose. The output shape (id, text, metadata) is
fixed by the downstream Chroma + BM25 indexing in rag/index.py — don't
change that.

The key knob you'll tune per product is chunk-0. Dense retrieval lands
on chunk 0 first for most queries. Make it a synthetic chunk built from:

  - the page title (as natural-language H1)
  - a 1-sentence task description (you'll have to generate this — for
    pages that already have a "## Overview" or "## Introduction" the
    first sentence usually works)
  - a keyword bag of important terms (filenames, API names, error
    codes — the rare technical tokens that BM25 lights up on)

Without a rich chunk 0, dense retrieval gets dominated by the much
larger prose body, and short pages (script examples, reference cards)
get buried.
"""

from __future__ import annotations

import re
from typing import Iterator

# Approximate token estimate from char count. Tunable — set per
# embedder if the default 4 chars/token is wrong.
CHARS_PER_TOKEN = 4
TARGET_TOKENS = 500
TARGET_CHARS = TARGET_TOKENS * CHARS_PER_TOKEN
# Hard cap: nomic-embed-text's context is 2048 tokens. Anything larger
# 400s the entire embed batch. 6000 chars ≈ 1500 tokens leaves headroom.
MAX_CHARS = 6000


def _hard_split(text: str) -> list[str]:
    """Split an oversized block into pieces of at most MAX_CHARS characters.

    Splitting prefers line boundaries. A single line that is itself longer
    than MAX_CHARS (minified JSON, a huge table row, ...) is sliced by
    character count so that no returned piece can exceed the cap.

    Args:
        text: The block to split.

    Returns:
        ``[text]`` unchanged when it already fits; otherwise the pieces in
        order, each right-stripped and non-empty.
    """
    if len(text) <= MAX_CHARS:
        return [text]

    pieces: list[str] = []
    pending: list[str] = []
    pending_chars = 0

    def flush() -> None:
        nonlocal pending, pending_chars
        piece = "".join(pending).rstrip()
        # A buffer of only blank lines strips to ""; don't emit empty chunks.
        if piece:
            pieces.append(piece)
        pending, pending_chars = [], 0

    for line in text.splitlines(keepends=True):
        # Lines within the cap yield exactly one segment (the line itself);
        # overlong lines are cut into MAX_CHARS-sized slices.
        for start in range(0, len(line), MAX_CHARS):
            segment = line[start:start + MAX_CHARS]
            if pending and pending_chars + len(segment) > MAX_CHARS:
                flush()
            pending.append(segment)
            pending_chars += len(segment)
    flush()
    return pieces


def estimate_tokens(text: str) -> int:
    """Return a rough token count for *text* (never less than 1).

    Uses the CHARS_PER_TOKEN heuristic rather than a real tokenizer.
    """
    return max(1, len(text) // CHARS_PER_TOKEN)


def split_paragraphs(md: str) -> list[str]:
    """Split markdown into paragraph-ish blocks.

    Keeps fenced code blocks together (don't slice through ```).
    Headings start new paragraphs. Blank lines outside a fence end the
    current paragraph.

    Args:
        md: Markdown source text.

    Returns:
        The non-empty blocks in document order, each stripped of
        surrounding whitespace.
    """
    blocks: list[str] = []
    current: list[str] = []
    in_fence = False

    def flush() -> None:
        block = "".join(current).strip()
        if block:
            blocks.append(block)
        current.clear()

    for line in md.splitlines(keepends=True):
        stripped = line.strip()
        if stripped.startswith("```"):
            # Fence delimiter: toggle state; the delimiter stays in the block.
            in_fence = not in_fence
            current.append(line)
        elif in_fence:
            # Inside a fence everything (blank lines, '#' lines) is content.
            current.append(line)
        elif not stripped:
            flush()
        elif stripped.startswith("#"):
            flush()
            current.append(line)
        else:
            current.append(line)
    flush()
    return blocks


def _pack_paragraphs(paragraphs: list[str]) -> Iterator[str]:
    """Greedily merge consecutive paragraphs into ~TARGET_CHARS strings.

    Paragraphs are joined with a blank line. A paragraph that alone exceeds
    TARGET_CHARS is yielded on its own; the separator characters are not
    counted against the target.
    """
    group: list[str] = []
    group_chars = 0
    for para in paragraphs:
        if group and group_chars + len(para) > TARGET_CHARS:
            yield "\n\n".join(group)
            group, group_chars = [], 0
        group.append(para)
        group_chars += len(para)
    if group:
        yield "\n\n".join(group)


def chunks_from_page(
    text: str,
    page_id: str,
    metadata: dict,
) -> Iterator[dict]:
    """Yield chunk dicts ready for index.py to upsert.

    The synthetic chunk 0 is the per-product customization point. The
    default below is a simple title + body-first-paragraph; rewrite
    for richer retrieval signal (see module docstring).

    Args:
        text: Markdown source of the page.
        page_id: Page identifier; used in chunk ids and as the fallback
            title when ``metadata`` has no truthy ``"title"``.
        metadata: Page metadata copied into every chunk. Must contain
            ``"bundle_id"``.

    Yields:
        Dicts with keys ``"id"`` (``<bundle_id>::<page_id>::<ordinal>``),
        ``"text"`` and ``"metadata"`` (the input metadata plus
        ``"ordinal"``). Ordinal 0 is the synthetic anchor chunk; body
        chunks follow from 1. Nothing is yielded for a page with no
        non-blank content.

    Raises:
        KeyError: If the page has content and ``metadata`` lacks
            ``"bundle_id"``.
    """
    paragraphs = split_paragraphs(text)
    if not paragraphs:
        return

    def make_chunk(ordinal: int, body: str) -> dict:
        return {
            "id": f"{metadata['bundle_id']}::{page_id}::{ordinal}",
            "text": body,
            "metadata": {**metadata, "ordinal": ordinal},
        }

    # ----- Chunk 0: synthetic anchor for dense retrieval ---------
    title = metadata.get("title") or page_id
    # First block that is not a heading; empty if the page is headings only.
    first_para = next((p for p in paragraphs if not p.startswith("#")), "")
    chunk0_body = (
        f"# {title}\n\n"
        f"{first_para[:300]}"
        # TODO per product: append a keyword bag here (filenames,
        # API names, error codes) for BM25 + dense joint coverage.
    )
    yield make_chunk(0, chunk0_body)

    # ----- Body chunks: pack paragraphs up to TARGET_CHARS -------
    ordinal = 1
    for merged in _pack_paragraphs(paragraphs):
        # A single huge paragraph can still exceed the embedder's hard cap.
        for piece in _hard_split(merged):
            yield make_chunk(ordinal, piece)
            ordinal += 1