initial: docs-mcp-template — build guide + scaffolded server

Template for building hosted MCP servers over a product's public documentation. Distilled from one production build; everything product-specific has been factored out. Contents: - PLAN.md — comprehensive build guide. 13 phases from project skeleton through weekly_digest. Includes the gotchas ("fetch-depth: 0 always", reranker per-pair token limit, Cloudflare body cap, dash-not-bash on Gitea runners), the decisions worth carrying forward, and a per-product customization checklist. - CLAUDE.md — guidance for Claude Code working in a clone of this template. Phase identification table, conventions (env-gating + operator confirmation for side-effecting tools, defensive fallback for retrieval components), common commands. - README.md — quick-start summary. Scaffolded code (all signature-stable, with NotImplementedError stubs where phase-specific work is required): docs_mcp/server.py FastMCP server, stateless_http=True, with search_docs / get_page / list_versions baseline tools and commented stubs for the rest of the phase set. docs_mcp/usage.py TimedCall telemetry, JSONL, daily rotation, 90-day retention. Reusable as-is. rag/embeddings.py Ollama embedder (nomic-embed-text default), load-balanced across N URLs. Reusable. rag/chunk.py Paragraph-aware chunker with synthetic chunk 0. Per-product tunable. rag/index.py Chroma + BM25 builder. --rebuild and --bm25-only flags. rag/bm25.py SQLite FTS5 lexical index. Reusable. scrape/changelog.py --cached / --ref / --json / --history-out. Reusable. scrape/README.md What you write per-product. eval/queries.jsonl.example Curate ~25 hand-labeled queries here. eval/retrievers.py Retriever protocol + stub classes. eval/run_eval.py MRR / Recall@K / nDCG@K harness skeleton. scripts/usage_report.py Standalone log analyzer; the FOLLOW-UP CHECKS pattern noted in the module docstring. scripts/registry_gc.py Gitea container registry cleanup. Reusable. Deployment + CI: Dockerfile Python 3.12-slim; COPY corpus + chroma + bm25 last for cache efficiency. deploy/docker-compose.yml MCP + reranker sidecar + Watchtower. Templated with <placeholders>. .gitea/workflows/refresh.yml Weekly cron + manual dispatch. fetch-depth: 0, retry-on-race, three-tag image scheme. .gitea/workflows/image-only.yml Code-only ship cycle, ~18min. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 09:18:17 -04:00
commit 9ba615c8ee
26 changed files with 3280 additions and 0 deletions
@@ -0,0 +1,134 @@
+"""Build Chroma (and optionally BM25) indexes from corpus on disk.
+
+Reads `corpus/<bundle>/<page>.{md,json}`, chunks each page, upserts
+into Chroma. With --rebuild, drops + recreates the collection (clean
+state). With --bm25-only, skips Chroma and rebuilds only the FTS5
+index — useful for fast iteration when chunking didn't change.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Iterator
+
+import chromadb
+from chromadb.config import Settings
+
+from .chunk import chunks_from_page
+from .embeddings import embedding_function
+
+log = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(message)s")
+
+ROOT = Path(__file__).resolve().parent.parent
+CORPUS = ROOT / "corpus"
+CHROMA_DIR = ROOT / "chroma"
+
+# Collection name — convention: <product>_docs. Override via env if needed.
+import os
+PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct")
+COLLECTION = f"{PRODUCT_NAME}_docs"
+
+
+def page_records() -> Iterator[dict]:
+    """Walk corpus/, yield chunks for every page."""
+    if not CORPUS.exists():
+        log.error("corpus/ doesn't exist; run the scraper first")
+        return
+    for bundle_dir in sorted(CORPUS.iterdir()):
+        if not bundle_dir.is_dir() or bundle_dir.name.startswith("."):
+            continue
+        for md_path in sorted(bundle_dir.glob("*.md")):
+            page_id = md_path.stem
+            sidecar = md_path.with_suffix(".json")
+            if not sidecar.exists():
+                log.warning("skipping %s — no JSON sidecar", md_path)
+                continue
+            md = md_path.read_text()
+            meta = json.loads(sidecar.read_text())
+            # Surface common filter fields at the chunk-metadata level
+            # so Chroma's `where` filter can use them.
+            base_meta = {
+                "bundle_id": bundle_dir.name,
+                "page_id":   page_id,
+                "title":     meta.get("title") or "",
+                "version":   meta.get("version") or "",
+                "platform":  meta.get("platform") or "",
+                "product":   meta.get("product") or "",
+            }
+            yield from chunks_from_page(md, page_id, base_meta)
+
+
+def upsert_to_chroma(records: list[dict]) -> int:
+    client = chromadb.PersistentClient(
+        path=str(CHROMA_DIR),
+        settings=Settings(anonymized_telemetry=False),
+    )
+    # Drop + recreate for --rebuild semantics
+    try:
+        client.delete_collection(COLLECTION)
+    except Exception:
+        pass
+    col = client.create_collection(COLLECTION, embedding_function=embedding_function())
+
+    BATCH = 64
+    total = 0
+    for i in range(0, len(records), BATCH):
+        chunk = records[i:i + BATCH]
+        col.upsert(
+            ids=[r["id"] for r in chunk],
+            documents=[r["text"] for r in chunk],
+            metadatas=[r["metadata"] for r in chunk],
+        )
+        total += len(chunk)
+        log.info("upserted %d / %d chunks", total, len(records))
+    return total
+
+
+def main() -> int:
+    p = argparse.ArgumentParser()
+    p.add_argument("--rebuild", action="store_true",
+                   help="Drop and recreate the Chroma collection.")
+    p.add_argument("--bm25-only", action="store_true",
+                   help="Rebuild only the BM25 index, skip Chroma.")
+    p.add_argument("--bm25-db", type=Path,
+                   default=ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db",
+                   help="Path to the BM25 sqlite db.")
+    args = p.parse_args()
+
+    log.info("reading corpus from %s", CORPUS)
+    t0 = time.time()
+    records = list(page_records())
+    log.info("loaded %d chunks in %.1fs", len(records), time.time() - t0)
+
+    if args.bm25_only:
+        from .bm25 import BM25Index
+        log.info("--bm25-only: building FTS5 only")
+        BM25Index(args.bm25_db).build(records)
+        return 0
+
+    if not args.rebuild:
+        log.info("no --rebuild; nothing to do. (Use --rebuild to upsert.)")
+        return 0
+
+    t_c = time.time()
+    n = upsert_to_chroma(records)
+    log.info("chroma: %d chunks in %.1fs", n, time.time() - t_c)
+
+    # Build BM25 too — see PLAN.md Phase 8. Safe to remove this block
+    # for products that don't need hybrid retrieval.
+    try:
+        from .bm25 import BM25Index
+        t_b = time.time()
+        BM25Index(args.bm25_db).build(records)
+        log.info("bm25 done in %.1fs", time.time() - t_b)
+    except ImportError:
+        log.info("rag.bm25 not available — skipping BM25 build")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())