# hvm-docs/rag/chunk.py

"""Markdown chunker — paragraph-aware, ~400-600 token target.

Adjust the chunking strategy per product if your page format differs
significantly from prose. The output shape (id, text, metadata) is
fixed by the downstream Chroma + BM25 indexing in rag/index.py — don't
change that.

The key knob you'll tune per product is chunk-0. Dense retrieval lands
on chunk 0 first for most queries. Make it a synthetic chunk built
from:

  - the page title (as natural-language H1)
  - a 1-sentence task description (you'll have to generate this — for
    pages that already have a "## Overview" or "## Introduction" the
    first sentence usually works)
  - a keyword bag of important terms (filenames, API names, error
    codes — the rare technical tokens that BM25 lights up on)

Without a rich chunk 0, dense retrieval gets dominated by the much
larger prose body, and short pages (script examples, reference cards)
get buried.
"""
from __future__ import annotations

import re
from typing import Iterator


# Character-count stand-in for a real tokenizer. 4 chars/token is only a
# default — tune it per embedder when that ratio is off.
CHARS_PER_TOKEN = 4
TARGET_TOKENS = 500
TARGET_CHARS = CHARS_PER_TOKEN * TARGET_TOKENS
# Hard per-chunk ceiling. nomic-embed-text accepts at most 2048 tokens
# of context, and one oversized input 400s the whole embed batch. Prose
# is fine at 6000 chars, but markdown tables full of `|` separators
# tokenize roughly 1.4x denser: a 5839-char table chunk from the HVM
# qualification matrix went past 2048 tokens and crashed the rebuild.
# At 4000 chars even dense table content stays below 2048 tokens, with
# headroom left for the query side.
MAX_CHARS = 4_000


def _hard_split(text: str) -> list[str]:
    """Split an oversized block on line boundaries into MAX_CHARS pieces."""
    if len(text) <= MAX_CHARS:
        return [text]
    out: list[str] = []
    buf: list[str] = []
    buf_chars = 0
    for line in text.splitlines(keepends=True):
        if buf_chars + len(line) > MAX_CHARS and buf:
            out.append("".join(buf).rstrip())
            buf, buf_chars = [], 0
        buf.append(line)
        buf_chars += len(line)
    if buf:
        out.append("".join(buf).rstrip())
    return out


def estimate_tokens(text: str, chars_per_token: int | None = None) -> int:
    """Return a rough token count for *text* based on its length.

    The estimate is ``len(text) // chars_per_token`` and is never less
    than 1, so even an empty string counts as one token.

    Args:
        text: The text to measure.
        chars_per_token: Characters assumed per token. Defaults to the
            module-level ``CHARS_PER_TOKEN``; pass a value to estimate
            for an embedder with a different ratio.

    Raises:
        ValueError: If the effective ratio is not positive.
    """
    ratio = CHARS_PER_TOKEN if chars_per_token is None else chars_per_token
    if ratio <= 0:
        raise ValueError(f"chars_per_token must be positive, got {ratio}")
    return max(1, len(text) // ratio)


def split_paragraphs(md: str) -> list[str]:
    """Split markdown into paragraph-ish blocks.

    Rules, applied line by line:

      - A line whose stripped form starts with three backticks toggles
        "inside fence". Everything inside a fence, blank lines included,
        stays in the current block (don't slice through ```).
      - Outside a fence, a line whose stripped form starts with ``#``
        closes the current block and begins a new one with that line.
      - Outside a fence, a blank (whitespace-only) line closes the
        current block.

    Returns the blocks in document order, each stripped of surrounding
    whitespace; empty blocks are never returned.
    """
    blocks: list[str] = []
    current: list[str] = []
    in_fence = False

    def flush() -> None:
        # Close the block being accumulated; whitespace-only content is
        # discarded rather than stored as an empty block.
        block = "".join(current).strip()
        if block:
            blocks.append(block)
        current.clear()

    for line in md.splitlines(keepends=True):
        stripped = line.strip()
        if stripped.startswith("```"):
            # The fence marker itself belongs to the block on both the
            # opening and the closing line.
            in_fence = not in_fence
            current.append(line)
        elif in_fence:
            current.append(line)
        elif stripped.startswith("#"):
            flush()
            current.append(line)
        elif not stripped:
            flush()
        else:
            current.append(line)
    flush()
    return blocks


def chunks_from_page(
    text: str,
    page_id: str,
    metadata: dict,
) -> Iterator[dict]:
    """Yield the chunk dicts for one page, ready for index.py to upsert.

    Every chunk has the shape ``{"id", "text", "metadata"}``: the id is
    ``<bundle_id>::<page_id>::<ordinal>`` and the metadata is a copy of
    *metadata* with ``"ordinal"`` added. *metadata* must contain
    ``"bundle_id"``; ``"title"`` is optional and falls back to *page_id*.

    Ordinal 0 is a synthetic anchor chunk and the per-product
    customization point. By default it is just the title plus the start
    of the first non-heading paragraph; rewrite it for richer retrieval
    signal (see module docstring). Ordinals 1.. are the page body.

    A page with no paragraphs yields nothing at all.
    """
    paragraphs = split_paragraphs(text)
    if not paragraphs:
        return

    def make_chunk(number: int, body: str) -> dict:
        # Single place that fixes the output shape of a chunk.
        return {
            "id":       f"{metadata['bundle_id']}::{page_id}::{number}",
            "text":     body,
            "metadata": {**metadata, "ordinal": number},
        }

    # ----- Chunk 0: synthetic anchor for dense retrieval ---------
    title = metadata.get("title") or page_id
    lead = ""
    for para in paragraphs:
        if not para.startswith("#"):
            lead = para
            break
    # TODO per product: append a keyword bag to the anchor text
    # (filenames, API names, error codes) for BM25 + dense joint
    # coverage.
    yield make_chunk(0, f"# {title}\n\n{lead[:300]}")

    # ----- Body chunks: pack paragraphs up to TARGET_CHARS -------
    def packed_groups() -> Iterator[list[str]]:
        # Greedy packing: close the group as soon as the next paragraph
        # would push it past the target. A lone paragraph that is larger
        # than the target still becomes a group of its own.
        group: list[str] = []
        size = 0
        for para in paragraphs:
            if group and size + len(para) > TARGET_CHARS:
                yield group
                group, size = [], 0
            group.append(para)
            size += len(para)
        if group:
            yield group

    ordinal = 1
    for group in packed_groups():
        # A packed group can still exceed the hard cap; _hard_split
        # breaks it up, and each resulting piece takes its own ordinal.
        for piece in _hard_split("\n\n".join(group)):
            yield make_chunk(ordinal, piece)
            ordinal += 1