"""SQLite FTS5-backed BM25 retrieval over the same chunks Chroma indexes. Hybrid retrieval (BM25 + dense + Reciprocal Rank Fusion) addresses a limit of single-tower dense embeddings: when a query has specific technical terms (filenames, language names, error codes, API paths), the dense embedding doesn't bridge from the query into a short code-focused chunk. The chunk loses to the much larger crowd of prose chunks that semantically match the query topic. BM25 handles this directly. Lexical overlap on rare terms ("python", "create_vpg.py", "PROTECTED_SITE_ID", "applyUpgrade") scores those chunks high. Fused with the dense ranking via RRF, the hybrid result is strictly better than either alone for the queries we've seen fail. Why SQLite FTS5: - In the stdlib. Zero new deps. - On-disk. Same persistence model as Chroma — Docker COPY the dir, `rag.index --rebuild` regenerates from corpus. - Built-in `bm25()` ranking function. No knobs to tune that matter for our use case (k1=1.2, b=0.75 defaults are fine). - Builds 70k+ chunks in seconds. Faster than the Chroma rebuild's embedding step by 100×, so it adds basically nothing to the full-rebuild cycle. Schema is two tables to keep filtering clean. FTS5 doesn't filter nicely on its own columns; the content_rowid pattern keeps an external metadata table joinable by rowid: CREATE TABLE chunks_meta ( rowid INTEGER PRIMARY KEY AUTOINCREMENT, id TEXT UNIQUE, bundle_id TEXT, page_id TEXT, version TEXT, platform TEXT, product TEXT, ordinal INTEGER ); CREATE VIRTUAL TABLE chunks_fts USING fts5( text, tokenize = 'porter unicode61 remove_diacritics 2', content = 'chunks_meta', content_rowid = 'rowid' ); Queries: SELECT m.id, bm25(chunks_fts) AS score FROM chunks_meta m JOIN chunks_fts f ON m.rowid = f.rowid WHERE f MATCH ? AND m.version = ? -- optional metadata filter ORDER BY bm25(chunks_fts) -- lower = better in FTS5 LIMIT ?; """ from __future__ import annotations import logging import re import sqlite3 from pathlib import Path from typing import Any log = logging.getLogger(__name__) # Default location: bm25/_docs.db at the repo root, next to chroma/. ROOT = Path(__file__).resolve().parent.parent DEFAULT_DB_DIR = ROOT / "bm25" DEFAULT_DB_NAME = "_docs.db" # Columns we expose as filterable metadata. Mirrors what _build_where in # docs_mcp/server.py accepts so the same filter dicts work for both # Chroma and BM25 without per-retriever translation in the caller. FILTER_COLUMNS = ("bundle_id", "page_id", "version", "platform", "product", "ordinal") # Allowlist tokenizer for free-text queries. FTS5's parser chokes on lots # of punctuation we routinely see in user queries (".10.9", "?", "VPG's", # em-dash, etc.). Rather than blocklist every operator, just keep # alphanumerics + a few separators and replace everything else with a # space. This loses the ability to phrase-search ("exact match") but we # don't expose that to users anyway — they ask natural-language questions # and want the answer, not a Boolean DSL. _KEEP_RE = re.compile(r"[^A-Za-z0-9_\s]") # FTS5 reserves these Boolean operator KEYWORDS at the token level — # stripping them avoids accidental phrase-query behavior when a user # query happens to contain bare "AND", "OR", "NOT", "NEAR". _BOOLEAN_KW_RE = re.compile(r"(? str: """Reduce a natural-language query to an FTS5 OR-of-tokens query. Two transformations: 1. Non-alphanumeric → space (drops punctuation; "10.9?" becomes "10 9"). Lets us handle versions, parens, question marks, etc. without inviting FTS5 parse errors. 2. Boolean keywords stripped (FTS5 reserves AND/OR/NOT/NEAR). 3. Tokens explicitly OR'd. FTS5's default is AND-of-tokens — for any non-trivial natural-language query that means zero hits (no chunk contains every word). OR semantics is what we want: BM25 already weights documents containing more query terms higher, so we don't lose precision, but we DO gain recall. """ cleaned = _KEEP_RE.sub(" ", text) cleaned = _BOOLEAN_KW_RE.sub(" ", cleaned) tokens = cleaned.split() if not tokens: return "" return " OR ".join(tokens) def _where_to_sql(where: dict | None) -> tuple[str, list[Any]]: """Translate a Chroma-shaped filter dict into a SQL fragment + params. Accepts the same shapes ``docs_mcp.server._build_where`` produces: None → ("", []) {"version": "10.9"} → ("AND m.version = ?", ["10.9"]) {"$and": [{...}, {...}]} → ("AND m.X = ? AND m.Y = ?", [...]) Unknown keys are silently dropped (defensive — better to over-match than to crash on a filter we don't know). """ if not where: return "", [] parts: list[str] = [] params: list[Any] = [] def _emit_eq(cond: dict[str, Any]) -> None: for k, v in cond.items(): if k in FILTER_COLUMNS: parts.append(f"m.{k} = ?") params.append(v) if "$and" in where: for sub in where["$and"]: _emit_eq(sub) else: _emit_eq(where) if not parts: return "", [] return "AND " + " AND ".join(parts), params class BM25Index: """Thin wrapper around an FTS5-backed sqlite db. Single-writer model. Reads are connection-per-call (sqlite handles concurrency through file locks; for our read-heavy workload that's fine and avoids cross-thread connection sharing issues with the MCP server's request handlers). """ def __init__(self, db_path: Path | None = None): self.db_path = Path(db_path) if db_path else (DEFAULT_DB_DIR / DEFAULT_DB_NAME) # -- build ---------------------------------------------------------- def build(self, records: list[dict]) -> int: """Rebuild the index from scratch from `records`. `records` is the same list ``rag.index.page_records`` produces: ``[{"id": ..., "text": ..., "metadata": {...}}, ...]``. Bulk insert wrapped in a transaction — single-digit seconds for the full 73k-chunk corpus. """ self.db_path.parent.mkdir(parents=True, exist_ok=True) # Drop and recreate. Idempotent rebuild. if self.db_path.exists(): self.db_path.unlink() with sqlite3.connect(self.db_path) as con: con.executescript(self._schema_sql()) con.executemany( "INSERT INTO chunks_meta (id, bundle_id, page_id, version, " "platform, product, ordinal) VALUES (?, ?, ?, ?, ?, ?, ?)", [ ( r["id"], r["metadata"].get("bundle_id") or "", r["metadata"].get("page_id") or "", r["metadata"].get("version") or "", r["metadata"].get("platform") or "", r["metadata"].get("product") or "", int(r["metadata"].get("ordinal") or 0), ) for r in records ], ) # Populate the FTS5 contentless-ish table by rowid. We populated # chunks_meta first; rowids align with insertion order. con.executemany( "INSERT INTO chunks_fts (rowid, text) VALUES (?, ?)", [ (i + 1, r["text"]) for i, r in enumerate(records) ], ) con.commit() log.info("bm25: indexed %d chunks → %s", len(records), self.db_path) return len(records) # -- query ---------------------------------------------------------- def query( self, text: str, n: int = 200, where: dict | None = None, ) -> list[tuple[str, float]]: """Return up to `n` (chunk_id, bm25_score) pairs, lowest score first. FTS5's bm25() returns NEGATIVE numbers — more relevant docs have smaller (more negative) scores. We order ASC so the first row is the most relevant. Callers that need a "rank" should enumerate the returned list. """ sanitized = _sanitize_query(text) if not sanitized: return [] where_sql, params = _where_to_sql(where) # FTS5 MATCH wants the unaliased table name on its left, so we use # chunks_fts (no alias) and JOIN by rowid against chunks_meta. sql = ( "SELECT m.id, bm25(chunks_fts) AS score " "FROM chunks_fts " "JOIN chunks_meta m ON m.rowid = chunks_fts.rowid " f"WHERE chunks_fts MATCH ? {where_sql} " "ORDER BY bm25(chunks_fts) " "LIMIT ?" ) try: with sqlite3.connect(self.db_path) as con: cur = con.execute(sql, [sanitized, *params, n]) return [(row[0], float(row[1])) for row in cur.fetchall()] except sqlite3.OperationalError as e: # FTS5 syntax error (rare after sanitization) or db missing. # Caller decides whether to fall back to dense-only. log.warning("bm25 query failed (%s); query=%r", e, sanitized[:80]) return [] def exists(self) -> bool: """Cheap probe — does the index file exist on disk?""" return self.db_path.exists() def count(self) -> int: """Number of chunks indexed. 0 if the db is missing or empty.""" if not self.exists(): return 0 try: with sqlite3.connect(self.db_path) as con: return con.execute("SELECT COUNT(*) FROM chunks_meta").fetchone()[0] except sqlite3.OperationalError: return 0 # -- schema --------------------------------------------------------- @staticmethod def _schema_sql() -> str: return """ CREATE TABLE chunks_meta ( rowid INTEGER PRIMARY KEY AUTOINCREMENT, id TEXT UNIQUE NOT NULL, bundle_id TEXT, page_id TEXT, version TEXT, platform TEXT, product TEXT, ordinal INTEGER ); CREATE INDEX idx_meta_version ON chunks_meta(version); CREATE INDEX idx_meta_platform ON chunks_meta(platform); CREATE INDEX idx_meta_bundle ON chunks_meta(bundle_id); CREATE VIRTUAL TABLE chunks_fts USING fts5( text, tokenize = 'porter unicode61 remove_diacritics 2' ); """