crop-chem-docs/rag/bm25.py

"""SQLite FTS5-backed BM25 retrieval over the same chunks Chroma indexes.

Hybrid retrieval (BM25 + dense + Reciprocal Rank Fusion) addresses a
limit of single-tower dense embeddings: when a query has specific
technical terms (filenames, language names, error codes, API paths),
the dense embedding doesn't bridge from the query into a short
code-focused chunk. The chunk loses to the much larger crowd of
prose chunks that semantically match the query topic.

BM25 handles this directly. Lexical overlap on rare terms ("python",
"create_vpg.py", "PROTECTED_SITE_ID", "applyUpgrade") scores those
chunks high. Fused with the dense ranking via RRF, the hybrid result
is strictly better than either alone for the queries we've seen
fail.

Why SQLite FTS5:
  - In the stdlib. Zero new deps.
  - On-disk. Same persistence model as Chroma — Docker COPY the dir,
    `rag.index --rebuild` regenerates from corpus.
  - Built-in `bm25()` ranking function. No knobs to tune that matter
    for our use case (k1=1.2, b=0.75 defaults are fine).
  - Builds 70k+ chunks in seconds. Faster than the Chroma rebuild's
    embedding step by 100×, so it adds basically nothing to the
    full-rebuild cycle.

Schema is two tables to keep filtering clean. FTS5 doesn't filter
nicely on its own columns, so the filterable metadata lives in an
ordinary table and the FTS5 table holds only the chunk text. Each
chunk is written under the same rowid in both tables, which keeps
them joinable by rowid:

    CREATE TABLE chunks_meta (
        rowid INTEGER PRIMARY KEY AUTOINCREMENT,
        id TEXT UNIQUE NOT NULL,
        bundle_id TEXT, page_id TEXT, version TEXT,
        platform TEXT, product TEXT, ordinal INTEGER
    );
    CREATE VIRTUAL TABLE chunks_fts USING fts5(
        text,
        tokenize = 'porter unicode61 remove_diacritics 2'
    );

Queries:

    SELECT m.id, bm25(chunks_fts) AS score
    FROM chunks_fts
    JOIN chunks_meta m ON m.rowid = chunks_fts.rowid
    WHERE chunks_fts MATCH ?       -- MATCH takes the unaliased table name
      AND m.version = ?            -- optional metadata filter
    ORDER BY bm25(chunks_fts)      -- lower = better in FTS5
    LIMIT ?;
"""
from __future__ import annotations

import logging
import re
import sqlite3
from contextlib import closing
from pathlib import Path
from typing import Any

log = logging.getLogger(__name__)

# On-disk layout: unless a caller supplies its own path, the index lives at
# bm25/<product>_docs.db under the repo root, as a sibling of chroma/.
# ROOT is the directory that contains this module's own directory.
ROOT = Path(__file__).resolve().parent.parent
DEFAULT_DB_NAME = "<product>_docs.db"
DEFAULT_DB_DIR = ROOT.joinpath("bm25")

# Metadata columns callers are allowed to filter on. The set mirrors what
# _build_where in docs_mcp/server.py accepts, so one filter dict can be
# handed to both Chroma and BM25 with no per-retriever translation.
FILTER_COLUMNS = (
    "bundle_id",
    "page_id",
    "version",
    "platform",
    "product",
    "ordinal",
)


# Allowlist tokenizer for free-text queries. FTS5's parser chokes on lots
# of punctuation we routinely see in user queries (".10.9", "?", "VPG's",
# em-dash, etc.). Rather than blocklist every operator, just keep word
# characters (letters, digits, underscore) plus whitespace and replace
# everything else with a space. This loses the ability to phrase-search
# ("exact match") but we don't expose that to users anyway — they ask
# natural-language questions and want the answer, not a Boolean DSL.
#
# The class is Unicode-aware (\w, not [A-Za-z0-9_]) on purpose: the index
# is tokenized with unicode61 + remove_diacritics, so a query such as
# "Müller" must reach FTS5 as one token. An ASCII-only class would split
# it into "M" and "ller", neither of which can match the indexed term.
# Non-ASCII characters are legal in FTS5 barewords, so keeping them cannot
# introduce a parse error.
_KEEP_RE = re.compile(r"[^\w\s]")
# FTS5 interprets the bare, uppercase words AND / OR / NOT / NEAR as query
# operators rather than search terms. Stripping them stops a user query
# that happens to contain one from changing the query's Boolean structure
# (or failing to parse). Lowercase forms are ordinary terms and are kept.
_BOOLEAN_KW_RE = re.compile(r"(?<!\w)(AND|OR|NOT|NEAR)(?!\w)")


def _sanitize_query(text: str) -> str:
    """Reduce a natural-language query to an FTS5 OR-of-tokens query.

    Three transformations, applied in order:

    1. Every character that is neither a word character nor whitespace
       becomes a space (drops punctuation; "10.9?" becomes "10 9").
       Lets us handle versions, parens, question marks, etc. without
       inviting FTS5 parse errors. Letters and digits from any script
       are kept intact.
    2. Standalone uppercase Boolean keywords are stripped (FTS5 reserves
       AND/OR/NOT/NEAR). Words that merely contain them ("ANDROID") are
       left alone.
    3. Tokens explicitly OR'd. FTS5's default is AND-of-tokens — for
       any non-trivial natural-language query that means zero hits
       (no chunk contains every word). OR semantics is what we want:
       BM25 already weights documents containing more query terms
       higher, so we don't lose precision, but we DO gain recall.

    Args:
        text: Raw user query.

    Returns:
        The surviving tokens joined with " OR ", or "" when nothing
        searchable is left (empty input, punctuation only, keywords only).
    """
    cleaned = _KEEP_RE.sub(" ", text)
    cleaned = _BOOLEAN_KW_RE.sub(" ", cleaned)
    # split() with no argument collapses whitespace runs and drops empties.
    tokens = cleaned.split()
    return " OR ".join(tokens)


def _where_to_sql(where: dict | None) -> tuple[str, list[Any]]:
    """Translate a Chroma-shaped filter dict into a SQL fragment + params.

    Accepts the same shapes ``docs_mcp.server._build_where`` produces:

        None                          → ("", [])
        {"version": "10.9"}           → ("AND m.version = ?", ["10.9"])
        {"$and": [{...}, {...}]}      → ("AND m.X = ? AND m.Y = ?", [...])

    Unknown keys are silently dropped (defensive — better to over-match
    than to crash on a filter we don't know).
    """
    if not where:
        return "", []
    parts: list[str] = []
    params: list[Any] = []

    def _emit_eq(cond: dict[str, Any]) -> None:
        for k, v in cond.items():
            if k in FILTER_COLUMNS:
                parts.append(f"m.{k} = ?")
                params.append(v)

    if "$and" in where:
        for sub in where["$and"]:
            _emit_eq(sub)
    else:
        _emit_eq(where)
    if not parts:
        return "", []
    return "AND " + " AND ".join(parts), params


class BM25Index:
    """Thin wrapper around an FTS5-backed sqlite db.

    Single-writer model. Reads are connection-per-call (sqlite handles
    concurrency through file locks; for our read-heavy workload that's
    fine and avoids cross-thread connection sharing issues with the MCP
    server's request handlers).
    """

    def __init__(self, db_path: Path | None = None):
        self.db_path = Path(db_path) if db_path else (DEFAULT_DB_DIR / DEFAULT_DB_NAME)

    # -- build ----------------------------------------------------------

    def build(self, records: list[dict]) -> int:
        """Rebuild the index from scratch from `records`.

        `records` is the same list ``rag.index.page_records`` produces:
        ``[{"id": ..., "text": ..., "metadata": {...}}, ...]``. Bulk
        insert wrapped in a transaction — single-digit seconds for the
        full 73k-chunk corpus.
        """
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        # Drop and recreate. Idempotent rebuild.
        if self.db_path.exists():
            self.db_path.unlink()
        with sqlite3.connect(self.db_path) as con:
            con.executescript(self._schema_sql())
            con.executemany(
                "INSERT INTO chunks_meta (id, bundle_id, page_id, version, "
                "platform, product, ordinal) VALUES (?, ?, ?, ?, ?, ?, ?)",
                [
                    (
                        r["id"],
                        r["metadata"].get("bundle_id") or "",
                        r["metadata"].get("page_id") or "",
                        r["metadata"].get("version") or "",
                        r["metadata"].get("platform") or "",
                        r["metadata"].get("product") or "",
                        int(r["metadata"].get("ordinal") or 0),
                    )
                    for r in records
                ],
            )
            # Populate the FTS5 contentless-ish table by rowid. We populated
            # chunks_meta first; rowids align with insertion order.
            con.executemany(
                "INSERT INTO chunks_fts (rowid, text) VALUES (?, ?)",
                [
                    (i + 1, r["text"])
                    for i, r in enumerate(records)
                ],
            )
            con.commit()
        log.info("bm25: indexed %d chunks → %s", len(records), self.db_path)
        return len(records)

    # -- query ----------------------------------------------------------

    def query(
        self,
        text: str,
        n: int = 200,
        where: dict | None = None,
    ) -> list[tuple[str, float]]:
        """Return up to `n` (chunk_id, bm25_score) pairs, lowest score first.

        FTS5's bm25() returns NEGATIVE numbers — more relevant docs have
        smaller (more negative) scores. We order ASC so the first row is
        the most relevant. Callers that need a "rank" should enumerate
        the returned list.
        """
        sanitized = _sanitize_query(text)
        if not sanitized:
            return []
        where_sql, params = _where_to_sql(where)
        # FTS5 MATCH wants the unaliased table name on its left, so we use
        # chunks_fts (no alias) and JOIN by rowid against chunks_meta.
        sql = (
            "SELECT m.id, bm25(chunks_fts) AS score "
            "FROM chunks_fts "
            "JOIN chunks_meta m ON m.rowid = chunks_fts.rowid "
            f"WHERE chunks_fts MATCH ? {where_sql} "
            "ORDER BY bm25(chunks_fts) "
            "LIMIT ?"
        )
        try:
            with sqlite3.connect(self.db_path) as con:
                cur = con.execute(sql, [sanitized, *params, n])
                return [(row[0], float(row[1])) for row in cur.fetchall()]
        except sqlite3.OperationalError as e:
            # FTS5 syntax error (rare after sanitization) or db missing.
            # Caller decides whether to fall back to dense-only.
            log.warning("bm25 query failed (%s); query=%r", e, sanitized[:80])
            return []

    def exists(self) -> bool:
        """Cheap probe — does the index file exist on disk?"""
        return self.db_path.exists()

    def count(self) -> int:
        """Number of chunks indexed. 0 if the db is missing or empty."""
        if not self.exists():
            return 0
        try:
            with sqlite3.connect(self.db_path) as con:
                return con.execute("SELECT COUNT(*) FROM chunks_meta").fetchone()[0]
        except sqlite3.OperationalError:
            return 0

    # -- schema ---------------------------------------------------------

    @staticmethod
    def _schema_sql() -> str:
        return """
        CREATE TABLE chunks_meta (
            rowid     INTEGER PRIMARY KEY AUTOINCREMENT,
            id        TEXT UNIQUE NOT NULL,
            bundle_id TEXT,
            page_id   TEXT,
            version   TEXT,
            platform  TEXT,
            product   TEXT,
            ordinal   INTEGER
        );
        CREATE INDEX idx_meta_version  ON chunks_meta(version);
        CREATE INDEX idx_meta_platform ON chunks_meta(platform);
        CREATE INDEX idx_meta_bundle   ON chunks_meta(bundle_id);

        CREATE VIRTUAL TABLE chunks_fts USING fts5(
            text,
            tokenize = 'porter unicode61 remove_diacritics 2'
        );
        """