commit 9ba615c8ee4f0c598716d6804403e0b85f9a9b71 Author: Justin Paul Date: Fri May 22 09:18:17 2026 -0400 initial: docs-mcp-template — build guide + scaffolded server Template for building hosted MCP servers over a product's public documentation. Distilled from one production build; everything product-specific has been factored out. Contents: - PLAN.md — comprehensive build guide. 13 phases from project skeleton through weekly_digest. Includes the gotchas ("fetch-depth: 0 always", reranker per-pair token limit, Cloudflare body cap, dash-not-bash on Gitea runners), the decisions worth carrying forward, and a per-product customization checklist. - CLAUDE.md — guidance for Claude Code working in a clone of this template. Phase identification table, conventions (env-gating + operator confirmation for side-effecting tools, defensive fallback for retrieval components), common commands. - README.md — quick-start summary. Scaffolded code (all signature-stable, with NotImplementedError stubs where phase-specific work is required): docs_mcp/server.py FastMCP server, stateless_http=True, with search_docs / get_page / list_versions baseline tools and commented stubs for the rest of the phase set. docs_mcp/usage.py TimedCall telemetry, JSONL, daily rotation, 90-day retention. Reusable as-is. rag/embeddings.py Ollama embedder (nomic-embed-text default), load-balanced across N URLs. Reusable. rag/chunk.py Paragraph-aware chunker with synthetic chunk 0. Per-product tunable. rag/index.py Chroma + BM25 builder. --rebuild and --bm25-only flags. rag/bm25.py SQLite FTS5 lexical index. Reusable. scrape/changelog.py --cached / --ref / --json / --history-out. Reusable. scrape/README.md What you write per-product. eval/queries.jsonl.example Curate ~25 hand-labeled queries here. eval/retrievers.py Retriever protocol + stub classes. eval/run_eval.py MRR / Recall@K / nDCG@K harness skeleton. 
scripts/usage_report.py Standalone log analyzer; the FOLLOW-UP CHECKS pattern noted in the module docstring. scripts/registry_gc.py Gitea container registry cleanup. Reusable. Deployment + CI: Dockerfile Python 3.12-slim; COPY corpus + chroma + bm25 last for cache efficiency. deploy/docker-compose.yml MCP + reranker sidecar + Watchtower. Templated with . .gitea/workflows/refresh.yml Weekly cron + manual dispatch. fetch-depth: 0, retry-on-race, three-tag image scheme. .gitea/workflows/image-only.yml Code-only ship cycle, ~18min. Co-Authored-By: Claude Opus 4.7 (1M context) diff --git a/.gitea/workflows/image-only.yml b/.gitea/workflows/image-only.yml new file mode 100644 index 0000000..83b9b5f --- /dev/null +++ b/.gitea/workflows/image-only.yml @@ -0,0 +1,89 @@ +name: Image rebuild (skip scrape) + +# Fast path for code-only changes. Skips the scrape and goes straight to: +# rebuild indexes (from corpus already committed on main) + image build +# + push. Runtime is ~18 min vs ~40 min for the full refresh. +# +# Use when a PR only changes code/config — anything where the upstream +# corpus hasn't moved but we want the new Python in the running image. +# +# IMPORTANT: fetch-depth: 0 is required for the digest-history step +# to find commits to walk. Don't change to 1. + +on: + workflow_dispatch: + +env: + REGISTRY_PUSH: : + REGISTRY_PULL: + IMAGE: /-docs-mcp + OLLAMA_URL: http://:11434 + EMBED_MODEL: nomic-embed-text + PRODUCT_NAME: + +jobs: + build: + runs-on: docker + container: + image: catthehacker/ubuntu:act-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + # Full history (not shallow) so the digest-history step can + # walk git log up to --history-days back. 
+ fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install -q --upgrade pip + python -m pip install -q -r requirements.txt + + - name: Refresh digest history + # Cheap (a few seconds); doesn't touch corpus content. + # Without this step, a code-only deploy would ship an + # increasingly-stale digest history relative to git. + run: | + mkdir -p corpus/.digest + python -m scrape.changelog \ + --history-out corpus/.digest/history.jsonl \ + --history-days 120 + + - name: Verify committed corpus is present + run: | + test -d corpus || { echo "ERROR: corpus/ missing on this ref"; exit 1; } + echo "corpus: $(du -sh corpus | cut -f1), $(find corpus -name '*.md' | wc -l) markdown files" + + - name: Rebuild indexes from existing corpus + run: python -m rag.index --rebuild + + - name: Log in to registry (LAN endpoint) + run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u --password-stdin + + - name: Build & push image + run: | + SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12) + DATE_TAG=$(date -u +%Y.%m.%d) + docker build \ + -t "${REGISTRY_PUSH}/${IMAGE}:latest" \ + -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \ + -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \ + . + docker push "${REGISTRY_PUSH}/${IMAGE}:latest" + docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" + docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" + + - name: Prune old container versions + env: + GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }} + run: | + python scripts/registry_gc.py \ + --owner \ + --package -docs-mcp \ + --keep-days 90 \ + --keep-latest 5 diff --git a/.gitea/workflows/refresh.yml b/.gitea/workflows/refresh.yml new file mode 100644 index 0000000..ad10efe --- /dev/null +++ b/.gitea/workflows/refresh.yml @@ -0,0 +1,158 @@ +name: Weekly docs refresh + +# Runs the full pipeline: scrape upstream → rebuild indexes → push +# image. Cron'd weekly (Mondays). 
Skip the reindex + image-push if the +# scrape produced no diff against the committed corpus. +# +# IMPORTANT: actions/checkout@v4 fetch-depth: 0 is required because +# the digest-history step walks git log up to --history-days back. +# With a shallow checkout the history file ships empty. + +on: + schedule: + - cron: "0 6 * * 1" # Mondays 06:00 UTC + workflow_dispatch: + inputs: + force_build: + description: "Rebuild indexes + push image even if corpus is unchanged" + type: boolean + default: false + +env: + # If your registry sits behind Cloudflare with its 100 MB body cap, + # use a LAN endpoint for pushes (bypasses CF) and the public hostname + # for pulls (response bodies aren't capped). + REGISTRY_PUSH: : + REGISTRY_PULL: + IMAGE: /-docs-mcp + + # Embedder. One URL per GPU; the indexer round-robins. + OLLAMA_URL: http://:11434 + EMBED_MODEL: nomic-embed-text + + PRODUCT_NAME: + +jobs: + refresh: + runs-on: docker + container: + image: catthehacker/ubuntu:act-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + # Full history — required for the digest-history step to + # walk git log. Default fetch-depth: 1 silently produces a + # 0-byte history file. + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install -q --upgrade pip + python -m pip install -q -r requirements.txt + + # ---- Phase 1: scrape --------------------------------------- + - name: Refresh bundle catalog + run: python -m scrape.bundles + + - name: Re-scrape all bundles + # --force re-fetches every page so we actually see upstream + # edits. Without it the runner skips pages already on disk. + run: python -m scrape.runner --all --force --concurrency 6 + + # ---- Build the digest history BEFORE committing ------------ + # See PLAN.md Phase 13. Walks recent corpus-touching commits + # and writes corpus/.digest/history.jsonl. 
The current refresh + # gets added on the NEXT run's history (one-week lag is fine). + - name: Build digest history + run: | + mkdir -p corpus/.digest + python -m scrape.changelog \ + --history-out corpus/.digest/history.jsonl \ + --history-days 120 + + # ---- Commit + retry-on-race -------------------------------- + - name: Commit corpus changes (if any) + id: commit + run: | + git config user.name "-docs-refresh" + git config user.email "actions@" + git add bundles.json corpus + if git diff --cached --quiet; then + echo "no corpus changes — skipping reindex and image build" + echo "changed=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + echo "changed=true" >> "$GITHUB_OUTPUT" + python -m scrape.changelog --cached > /tmp/changelog.txt + summary=$(head -1 /tmp/changelog.txt) + ts=$(date -u +"%Y-%m-%dT%H:%MZ") + { + echo "weekly refresh: ${ts} — ${summary}" + echo "" + cat /tmp/changelog.txt + } > /tmp/commitmsg.txt + git commit -F /tmp/commitmsg.txt + # Retry on race: if main moved while we were scraping (a + # human merged a PR during the run), `git push` rejects + # with "fetch first". Rebase our corpus commit onto new + # main and retry. Corpus + code paths are disjoint, so + # the rebase is trivially clean. 
+ attempt=1 + while [ $attempt -le 3 ]; do + if git push; then + echo "pushed corpus changes (attempt $attempt)" + break + fi + if [ $attempt -eq 3 ]; then + echo "push still failing after 3 attempts — bailing" + exit 1 + fi + git fetch origin main + git rebase origin/main || { echo "rebase conflict — bailing"; exit 1; } + attempt=$((attempt + 1)) + done + + # ---- Reindex Chroma + BM25 --------------------------------- + - name: Rebuild indexes + if: steps.commit.outputs.changed == 'true' || inputs.force_build == true + run: python -m rag.index --rebuild + + # ---- Build & push image ------------------------------------ + - name: Log in to registry (LAN endpoint) + if: steps.commit.outputs.changed == 'true' || inputs.force_build == true + run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u --password-stdin + + - name: Build & push image + if: steps.commit.outputs.changed == 'true' || inputs.force_build == true + # Runner shell is /bin/sh — use cut instead of ${VAR::N}. + # Three tags: :latest (Watchtower target), :${SHA_TAG} + # (rollback pin), :${DATE_TAG} (human-readable). + run: | + SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12) + DATE_TAG=$(date -u +%Y.%m.%d) + docker build \ + -t "${REGISTRY_PUSH}/${IMAGE}:latest" \ + -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \ + -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \ + . 
+ docker push "${REGISTRY_PUSH}/${IMAGE}:latest" + docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" + docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" + + # ---- Registry GC ------------------------------------------- + - name: Prune old container versions + if: steps.commit.outputs.changed == 'true' || inputs.force_build == true + env: + GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }} + run: | + python scripts/registry_gc.py \ + --owner \ + --package -docs-mcp \ + --keep-days 90 \ + --keep-latest 5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fbc0883 --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +# Virtualenv +venv/ +.venv/ + +# Regenerable from corpus + CI +corpus/ +chroma/ +bm25/ + +# Python detritus +__pycache__/ +*.py[cod] +*.egg-info/ +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ + +# Eval results (regenerable; commit only the headline baseline if you want) +# eval/results/ + +# Usage logs (host-mounted volume in prod; don't commit dev logs) +var/ + +# Local-only env +.env +.env.local + +# IDE +.vscode/ +.idea/ +*.swp diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..4d4da98 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,232 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when +working with code in this repository. + +## Purpose + +This is a **template** for building an MCP server over a product's +public documentation. When you (Claude) are working in a clone of this +repo, you are helping the user implement one specific product's docs +MCP — not editing the template itself. + +**Read `PLAN.md` first.** It's the canonical build guide and lays out +13 phases. Most user requests will be "implement Phase N" or "we hit +a bug in Phase N." Identify the phase before doing anything else. 
+ +## Working with this template + +### Identifying the current phase + +When the user clones this template and starts working, figure out +which phase they're on by inspecting: + +| Signal | Likely phase | +|---|---| +| `corpus/` doesn't exist | Phase 1 (scraper) — they need to build it before anything else works | +| `corpus/` exists, `chroma/` doesn't | Phase 2 (indexing) | +| Indexes exist, only `search_docs` / `get_page` / `list_versions` implemented | Phase 3 (server skeleton done; next: Dockerfile + CI) | +| No `Dockerfile` or `.gitea/workflows/` updated | Phase 4–5 | +| `RERANK_URL` env unset in compose | Phase 6 not done | +| `HYBRID_SEARCH` env unset, no `rag/bm25.py` content | Phase 8 not done | +| No `eval/results/` directory | Phase 7 not done | +| `find_doc_inconsistencies` / `submit_doc_bug` are commented-out stubs in `docs_mcp/server.py` | Phase 12 | +| No `corpus/.digest/` produced by CI | Phase 13 | + +When in doubt, ask the user: *"Which phase from PLAN.md are we +working on?"* + +### The scaffolded server has stubs + +`docs_mcp/server.py` ships with three working tools (`search_docs`, +`get_page`, `list_versions`) and signature-only stubs for the +phase-specific tools. The stubs `raise NotImplementedError` with a +phase hint in the docstring. When implementing a phase, you'll be +filling these bodies in — DO NOT change the signatures unless the +user has a specific reason. Signatures are the public contract +between the MCP and its clients (Claude Desktop, Claude Code, +Cursor, etc.). + +## Layout + +``` +. +├── PLAN.md # Read first. Phase-by-phase build guide. +├── README.md # Quick-start summary. +├── CLAUDE.md # This file. 
+├── requirements.txt +├── Dockerfile +├── deploy/docker-compose.yml +├── .gitea/workflows/ +│ ├── refresh.yml # Weekly cron: scrape + index + image +│ └── image-only.yml # On-demand: code-only ship cycle +├── scrape/ # Phase 1 — product-specific scraper here +│ └── changelog.py # Reusable: --json, --history-out +├── rag/ # Phase 2/8 — indexing +│ ├── embeddings.py # Ollama embedder (swappable) +│ ├── chunk.py # Page → chunks (adjust per page format) +│ ├── index.py # Builds Chroma + BM25 +│ └── bm25.py # SQLite FTS5 lexical index +├── docs_mcp/ # Phase 3+ — MCP server +│ ├── server.py # FastMCP + tool definitions +│ └── usage.py # TimedCall telemetry +├── eval/ # Phase 7 — golden-query harness +│ ├── queries.jsonl.example +│ ├── retrievers.py +│ └── run_eval.py +├── scripts/ # Standalone ops scripts +│ ├── usage_report.py +│ └── registry_gc.py +└── deploy/ + └── docker-compose.yml +``` + +## Conventions + +### Tool docstrings are user interface + +The text in `@mcp.tool()` docstrings is what the LLM sees and uses to +decide whether to call the tool. Treat it like a button label. +*"Use when..."*, *"Call proactively whenever..."* phrasings work +well. Don't bury the headline in implementation notes. + +### Side-effecting tools must be env-gated AND operator-confirmed + +Any tool that POSTs to an external service (submit_doc_bug being the +canonical example): + +1. Must check an env flag at call time and return a "disabled, + manual fallback at " message if unset. +2. Must have a loud docstring requiring per-call operator + confirmation in the LLM conversation flow (the LLM drafts, shows + the operator the exact payload, asks yes/no, only then calls). +3. Must do upfront validation (URL allowlist, content length, etc.) + so the LLM gets a clean error instead of a wire-level failure. + +Match the `submit_doc_bug` patterns documented in PLAN.md Phase 12. 
+ +### Defensive fallback for retrieval components + +The reranker, BM25 index, and any external dependency must fail +gracefully: + +- Catch the specific exception type +- Log a warning with enough info to debug +- Fall back to a working baseline (dense-only, no reranker, etc.) +- Never block a search_docs call on a single failure + +The user's MCP is in front of real people; partial degradation +beats a 500. + +### Verify retrieval changes with the eval harness + +Any change that touches retrieval (new embedder, chunker tweak, +reranker model, filter shape) ships with eval numbers in the commit +message. Don't ship retrieval changes on vibes. If `eval/queries.jsonl` +isn't populated yet, populate it before changing retrieval — it's +the most important file in the repo. + +### Standard infrastructure choices + +These are reasoned defaults — only deviate if you have a specific +need: + +- **Embedding model**: `nomic-embed-text` via Ollama (768-dim, + free, on-prem) +- **Reranker**: `jina-reranker-v2-base` GGUF via llama.cpp + `/v1/rerank` endpoint +- **Vector store**: Chroma `PersistentClient` +- **Lexical store**: SQLite FTS5 (stdlib) +- **Fusion**: Reciprocal Rank Fusion with k=60 +- **Transport**: streamable-HTTP in prod, stdio for local dev +- **MCP framework**: FastMCP with `stateless_http=True` +- **Container deploy**: Watchtower auto-pull on `:latest`, rollback + via `:SHA12` (12-char commit SHA tag) pin + +### Naming the product + +The template uses `PRODUCT_NAME` env var (defaults to `"myproduct"`) +throughout. Set it on first build. References show up in: + +- `docs_mcp/server.py` — `FastMCP(f"{PRODUCT_NAME}-docs", ...)` +- Collection name (`{PRODUCT_NAME}_docs`) +- BM25 db filename +- Tool names that include the product name (e.g., the `{PRODUCT_NAME}_api_lessons` + tool — convention is to name it `{PRODUCT_NAME}_api_lessons`) + +Use lowercase, underscores not hyphens, since it ends up in tool +identifiers that the LLM reads. 
+ +## Common commands + +```bash +# Set up dev environment +python -m venv venv && source venv/bin/activate +pip install -r requirements.txt + +# Run the MCP server locally for Claude Desktop dev +python -m docs_mcp.server --transport stdio + +# Run as HTTP for integration testing +python -m docs_mcp.server --transport streamable-http --port 8000 + +# Rebuild Chroma + BM25 indexes from corpus +python -m rag.index --rebuild + +# Rebuild only BM25 (fast iteration) +python -m rag.index --bm25-only + +# Run the eval harness +python -m eval.run_eval --queries eval/queries.jsonl --output eval/results/baseline.md + +# Generate changelog summary (called by CI, useful locally too) +python -m scrape.changelog --cached +python -m scrape.changelog --history-out corpus/.digest/history.jsonl --history-days 120 +``` + +## Gotchas (carried forward from the reference build) + +- **`fetch-depth: 0` on `actions/checkout@v4`** in both workflows. + Default is shallow; history-walking steps (changelog, digest) + silently produce empty output otherwise. This is the #1 thing + people miss. +- **Reranker per-pair token limit**: jina-reranker GGUF rejects the + ENTIRE batch if any doc exceeds `n_ctx_train=1024`. Truncate docs + to ~2000 chars before sending to rerank. Full chunk text still + goes back to the user; truncation is reranking-only. +- **FastMCP `stateless_http=True`**: critical for production + hosting behind Watchtower auto-updates. Without it, every + container recreate produces a 404 storm from clients with + stale session IDs. +- **Runner shell is `/bin/sh` (dash)**: no `${VAR::N}` substring + expansion in workflow scripts. Use `cut`/`awk`/`printf`. +- **Cloudflare 100 MB body cap**: if pushing through a Cloudflare- + fronted registry, push via LAN endpoint, pull via public + hostname. Same registry, different URLs. + +## When the user says... 
+ +| User says | You do | +|---|---| +| "Let's start building" / "set up the project" | Read PLAN.md Phase 0; create dirs, requirements.txt, etc. Confirm Python version and existing tooling. | +| "Build the scraper" / "scrape the docs" | Read PLAN.md Phase 1. Find the upstream portal's underlying API by sniffing; AVOID headless-browser solutions unless the API path is truly closed. | +| "Get retrieval working" / "make search work" | Read PLAN.md Phase 2-3. Implement chunking, embedder, Chroma indexer, then the three baseline tools. | +| "Add a reranker" | Read PLAN.md Phase 6. Stand up the llama.cpp sidecar, implement `_rerank()`. Verify with the eval harness. | +| "Search is missing X queries" | Run the eval harness first to confirm the failure. Then consider: rich chunk-0 rewrites, hybrid retrieval, curated knowledge layer. Don't just tune cosine. | +| "Let's add hybrid search" | Read PLAN.md Phase 8. Only after you've established the failure mode with eval queries — hybrid is not free. | +| "Make a tool that submits doc bugs" | Read PLAN.md Phase 12. Find the docs portal's feedback endpoint by sniffing. Build with operator confirmation as a hard requirement in the tool docstring. | +| "I want a 'what changed' tool" | Read PLAN.md Phase 13. Don't try to do this at runtime — pre-bake the history JSONL at CI time. | + +## Out-of-scope concerns (don't try to solve here) + +- **Reverse proxy / TLS termination** — outside the repo. User + picks Caddy / Cloudflare Tunnel / nginx / Traefik based on their + infra. +- **MetaMCP or other gateway** — outside the repo. Optional, only + matters when running multiple MCPs. +- **GPU container orchestration** — outside the repo. Pattern is + one Ollama container per GPU; the indexer load-balances. Document + it in deploy/ but don't build it in this template. +- **Email/blog delivery for weekly_digest** — out of scope per + PLAN.md ("Out of scope" section). Add a separate script in + scripts/ if/when the user asks. 
diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f4e4d14 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,43 @@ +# Docs MCP server — production image. +# +# Structure: deps and code first, then the regenerable indexes last so a +# corpus/index refresh doesn't invalidate the pip-install and code layers. +# +# The container runs the MCP server via streamable-http on PORT 8000. +# Override via MCP_HOST / MCP_PORT env if you front it with a different +# reverse-proxy setup. + +FROM python:3.12-slim + +WORKDIR /app + +# Install Python deps first for cacheability. +COPY requirements.txt /app/ +RUN pip install --no-cache-dir -r requirements.txt + +# Code. +COPY scrape /app/scrape +COPY rag /app/rag +COPY docs_mcp /app/docs_mcp + +# Catalog. Written by the scraper at CI time. +COPY bundles.json /app/ + +# Regenerable indexes. CI builds these from corpus/ in the same job +# that builds the image. Listed last so an index refresh reuses the +# cached dependency/code layers; a code change still rebuilds these. +# +# bm25/ is only consulted when HYBRID_SEARCH=true (the server falls +# back to dense-only if it's missing). +COPY corpus /app/corpus +COPY chroma /app/chroma +COPY bm25 /app/bm25 + +ENV PYTHONUNBUFFERED=1 \ + MCP_TRANSPORT=streamable-http \ + MCP_HOST=0.0.0.0 \ + MCP_PORT=8000 + +EXPOSE 8000 + +ENTRYPOINT ["python", "-m", "docs_mcp.server"] diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 0000000..ca0f327 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,647 @@ +# Docs MCP Server — Build Guide + +A reusable recipe for building a hosted MCP server over a product's +public documentation. Distilled from one production build; everything +product-specific has been factored out. + +The end product is a streamable-HTTP MCP server with ~15 tools that +any LLM client (Claude Desktop, Claude Code, Cursor, Copilot) can +call to answer questions against the docs, surface what changed +recently, find inconsistencies, and (optionally) submit doc bugs +back upstream. 
+ +--- + +## What you're building + +A pipeline with these stages: + +``` +upstream docs portal + │ + ▼ + scrape ──► corpus//.md + .json sidecar + │ + ▼ + chunk + embed ──► chroma/ (dense vectors) + │ ──► bm25/ (FTS5 lexical index) + ▼ + MCP server ──► search_docs / get_page / diff_versions / weekly_digest / + find_doc_inconsistencies / submit_doc_bug / ... + │ + ▼ + reverse proxy / Cloudflare Tunnel ──► public endpoint +``` + +Two CI cadences: + +- **Weekly cron** (~40 min): full re-scrape, re-chunk, re-embed, + image build & push. +- **On-demand image-only** (~18 min): code-only rebuild from + committed corpus, image build & push. + +A container registry (self-hosted Gitea works well), a host running +Docker Compose, Watchtower auto-updating from `:latest`, and a +reverse proxy in front. + +--- + +## Build phases + +Each phase is a discrete, shippable unit. Build them in order; each +one is useful on its own and unlocks the next. Realistic effort per +phase is given as a rough order of magnitude. Total: roughly 2–3 +weeks of focused work for the full stack. + +### Phase 0 — Project skeleton *(half a day)* + +Goals: directory layout, dependency manifest, virtualenv. + +- Top-level dirs: `scrape/`, `corpus/` (gitignored), `rag/`, + `docs_mcp/`, `eval/`, `scripts/`, `deploy/`, `.gitea/workflows/`. +- `requirements.txt` with the dependencies you'll need across all + phases (FastMCP, chromadb, httpx, beautifulsoup4 or whatever HTML + parser, ollama or sentence-transformers client, etc.). +- `python -m venv venv` and pin Python version (3.11 or 3.12 — be + conservative; some embedding libraries have version-specific + wheels). +- `.gitignore`: `venv/`, `corpus/` (regenerable), `chroma/` + (regenerable), `bm25/` (regenerable), `*.pyc`, `__pycache__/`, + `.pytest_cache/`. + +### Phase 1 — Scraper *(2–4 days, product-specific)* + +This is the most product-dependent phase. 
The goal is to write a +scraper that produces a normalized corpus layout regardless of +upstream portal shape. + +Output shape (mandatory): + +``` +corpus/ + / # one dir per "doc bundle" — see Glossary + .md # markdown body + .json # sidecar with structured metadata + ... +bundles.json # catalog of bundles with metadata +``` + +**Bundle metadata** (`bundles.json` is a list of these): + +```json +{ + "slug": "", + "title": "User-facing title", + "version": "10.9", + "platform": "VMware vSphere", // may be null + "product": "Admin Guide", // optional but useful + "language": "en-US", + "page_count": 127, + "dates": { + "Added on": "2024-01-15", + "Updated on": "2026-05-20" + }, + "landing_page": "" +} +``` + +**Per-page sidecar** (`.json`) carries page-level metadata. +The one field that matters cross-cutting is `topic_cluster` (see +Phase 9): + +```json +{ + "bundle_id": "", + "page_id": "", + "title": "How to ...", + "ordinal": 42, + "topic_cluster": { + "clustering_title": "How to ...", + "clustered_topics": [ + {"bundle_id": "...10.8", "page_id": "How_to_X.htm", "clustering_title": "..."}, + {"bundle_id": "...10.9", "page_id": "How_to_X.htm", "clustering_title": "..."} + ] + } +} +``` + +If the portal exposes a cross-version "this page corresponds to that +page" mapping, capture it here. If it doesn't, you can synthesize a +filename-based fallback (same filename across bundle versions = same +topic) and live without the editor-curated mapping. The features that +read `topic_cluster` (`list_cluster`, `diff_versions`, +`find_doc_inconsistencies`, parts of `weekly_digest`) will work +either way; they're more accurate with real clusters. + +**Patterns that recur across doc portals:** + +- Most modern doc portals are SPAs. Plain `requests.get` won't see + rendered content. Either find the underlying API the SPA calls (the + cheapest, most reliable path), or fall back to a headless browser + (Playwright). 
The API path is almost always available; sniff the + network tab. +- Portals usually expose a "bundle/topic" hierarchy under the hood + (Zoomin, Madcap Flare, Paligo, GitBook, Docusaurus all do). Map + it to `bundles.json` + `corpus//`. +- Many portals expose `?save_local=` or `.pdf` rendered versions; the + HTML they serve is structurally cleaner than what the page shows + through the SPA shell. + +**`scrape/changelog.py`** (~250 LOC; see Phase 13) — provides +`summarize_diff()`, `render_human()`, `walk_history()` and the +`--json` / `--history-out` modes. Mostly reusable as-is; the only +product-specific bit is the path layout assumption. + +### Phase 2 — Chunking + embeddings + Chroma *(2 days)* + +Goal: build a queryable dense index from the scraped corpus. + +- `rag/chunk.py` — split each page's markdown into ~400-600 token + chunks. Strategy that works: paragraph-aware splitter with a + rich "chunk 0" containing the page title + 1-sentence summary + + bag-of-words from key terms. Chunk 0 is what dense retrieval lands + on first; getting it right dominates retrieval quality. +- `rag/embeddings.py` — pluggable embedder. Recommended start: + Ollama-hosted `nomic-embed-text` (768-dim, free, good baseline). + Other defensible choices: `text-embedding-3-small` (OpenAI), + `bge-m3` (also via Ollama). The embedder is a Chroma + `EmbeddingFunction` that returns `list[list[float]]` for a list + of texts. +- `rag/index.py` — orchestrates: read corpus → emit chunks (with + metadata: bundle_id, page_id, version, platform, ordinal) → + upsert into Chroma collection. `--rebuild` flag for a clean + reindex. Run via `python -m rag.index --rebuild`. + +Chroma settings: `PersistentClient(path="chroma/")` and +`Settings(anonymized_telemetry=False)`. Single collection +(`_docs`). + +**GPU note**: embedding 70K chunks on CPU takes hours; on a GPU +(via Ollama with `NVIDIA_VISIBLE_DEVICES`) takes ~10 minutes. Two +GPUs in parallel: ~5 minutes. 
The orchestrator just needs to +load-balance HTTP requests across multiple Ollama endpoints. + +### Phase 3 — MCP server skeleton *(1 day)* + +Goal: working FastMCP server with three tools — `search_docs`, +`get_page`, `list_versions`. + +- `docs_mcp/server.py` — `FastMCP(f"{PRODUCT_NAME}-docs", stateless_http=True)`. + `stateless_http=True` is critical for production hosting: every + request creates an ephemeral session, so container recreates don't + produce a 404 storm from stale `mcp-session-id` headers on + clients. +- Lazy initialization for everything expensive (Chroma client, + embedder, bundles catalog) so the server starts cleanly even when + Ollama is briefly unreachable. +- Tool: `search_docs(query, version=None, platform=None, + bundle_id=None, k=10)`. Returns markdown of top-k chunks with full + source URLs. +- Tool: `get_page(bundle_id, page_id)`. Returns full page markdown + + metadata. +- Tool: `list_versions()`. Returns the version/platform facets + available, drawn from `bundles.json`. Helps the LLM pick filter + values. + +Transports: stdio (for local Claude Desktop dev), streamable-HTTP +(for hosted production). One argparse switch. + +```python +@mcp.tool() +def search_docs( + query: Annotated[str, Field(description="Natural-language query about .")], + version: Annotated[str | None, Field(description="Restrict to one version")] = None, + ... +) -> str: + ... +``` + +The tool descriptions are first-class context — the LLM reads them +and decides whether to call the tool. Treat them as button labels; +use "Call when..." / "Use proactively whenever..." phrasings. + +### Phase 4 — Containerization *(1 day)* + +Goal: image you can run anywhere. + +- `Dockerfile`: Python 3.12-slim base, install requirements, COPY + `scrape rag docs_mcp` + `bundles.json` + `corpus/ chroma/` + + (later) `bm25/`. Don't COPY `scripts/` — those stay external + for ops use only. +- `ENTRYPOINT ["python", "-m", "docs_mcp.server", + "--transport", "streamable-http"]`. 
Configurable host/port via env. +- `deploy/docker-compose.yml`: one service, named volumes for usage + logs and any state, Watchtower label, depends_on for the reranker + sidecar (Phase 6). + +Smoke-test locally: `docker compose up` should expose +`http://localhost:8000/mcp` and respond to an MCP `initialize` JSON-RPC. + +### Phase 5 — CI on self-hosted Gitea Actions *(1–2 days)* + +Goal: weekly cron rebuild + on-demand code-only ship cycle. + +**Two workflows, two cadences:** + +| Workflow | Trigger | Steps | Runtime | +|---|---|---|---| +| `refresh.yml` | Monday cron + manual dispatch | scrape → commit corpus → rebuild indexes → build & push image | ~40 min | +| `image-only.yml` | manual dispatch only | rebuild indexes from committed corpus → build & push image | ~18 min | + +**Critical settings (learned the hard way):** + +- `fetch-depth: 0` on `actions/checkout@v4`. The default depth is 1 + (shallow), which breaks any step that walks git history (changelog, + digest history walker). Pay the ~10 second cost; never debug a + "0-byte history file" mystery. +- `runs-on: docker` (Gitea convention, not `ubuntu-latest`). +- Runner shell is `/bin/sh` (dash), not bash. `${VAR::N}` substring + expansion doesn't exist; use `cut` / `printf` / `awk`. + +**Retry-on-race pattern for long-running scrapes:** + +```bash +attempt=1 +while [ $attempt -le 3 ]; do + if git push; then + echo "pushed (attempt $attempt)" + break + fi + [ $attempt -eq 3 ] && { echo "still failing"; exit 1; } + git fetch origin main + git rebase origin/main || { echo "conflict — bail"; exit 1; } + attempt=$((attempt + 1)) +done +``` + +Works because scrape commits only touch `corpus/` + `bundles.json`, +and code merges only touch `.py` / `.yml` — disjoint paths, trivially +clean rebases. 
+ +**Image tagging — three tags per build:** + +| Tag | Purpose | +|---|---| +| `:latest` | Watchtower watches this for auto-deploy | +| `:<sha>` | Immutable; rollback target | +| `:<date>` | Human-readable in incident notes | + +Same tag set on every build; rollback is a one-line compose edit +to pin `:<sha>` instead of `:latest`. + +**Container registry behind Cloudflare:** + +Cloudflare's free tier has a 100 MB request body limit. Big image +layers (Chroma index can easily be 800+ MB) exceed it on push. The +fix is a LAN registry endpoint for push, public hostname for pull: + +```yaml +env: + REGISTRY_PUSH: <lan-host>:<port> # bypasses Cloudflare + REGISTRY_PULL: <public-registry-hostname> # response bodies aren't capped +``` + +Runner needs the LAN endpoint in `/etc/docker/daemon.json` +`insecure-registries`. Costs nothing operationally; saves hours +of debugging. + +**Registry GC:** weekly cron in the workflow that walks the package +versions, keeps `:latest` + N most-recent date tags + anything +pushed in the last 90 days, deletes the rest. Worth ~50 LOC; the +package GC on the Gitea side reclaims disk after. + +### Phase 6 — Reranker *(half a day)* + +Goal: lift retrieval quality 3× by cross-encoder reranking the top-N +dense candidates. + +- A `/v1/rerank` HTTP endpoint backed by `llama.cpp` serving + `jina-reranker-v2-base` (GGUF). Runs as a sidecar in compose. + GPU strongly recommended (CPU latency is unworkable for live + queries). +- `_rerank(query, docs)` helper in the server: POST to the endpoint, + apply the scores, re-sort the top-N candidates. Defensive: on any + failure log a warning and fall through to dense-only. +- Env: `RERANK_URL` (off by default), `RERANK_POOL` (how deep to + pull candidates for reranking; 200 is a good default — the scaffolded + `docs_mcp/server.py` falls back to 50 when the variable is unset, so + set it explicitly), + `RERANK_TIMEOUT` (30s for cold-start tolerance). +- **Watch the per-pair token limit.** Jina's GGUF reports + `n_ctx_train=1024` and llama.cpp will reject the ENTIRE batch if + any pair exceeds it. Truncate doc text to ~2000 chars before + reranking.
The full untruncated chunk still goes back to the user; + truncation is only for the reranker scoring path. + +### Phase 7 — Eval harness *(1 day)* + +Goal: hand-curated golden queries + standard metrics so you can +measure the impact of any retrieval change. + +- `eval/queries.jsonl`: 20–25 hand-curated queries with expected + pages. Spread across versions, platforms, and difficulty levels. + Include the queries that "obviously" should work and DON'T — + those are the ones to track. +- `eval/retrievers.py`: a `Retriever` protocol with concrete + implementations: `DenseRetriever`, `RerankedRetriever`, + `BM25Retriever` (Phase 8), `HybridRetriever` (Phase 8). One + matrix dimension per knob. +- `eval/run_eval.py`: computes MRR / Recall@5 / nDCG@5 across all + retrievers; emits a markdown comparison table at + `eval/results/.md`. Commit the result so PRs land with + the A/B evidence in the diff. + +Three numbers are enough — don't overengineer. The hand-curated +queries are the value; the metrics are just a stable way to score +them. + +### Phase 8 — BM25 + Hybrid retrieval *(half a day, conditional)* + +**Skip unless your eval shows specific failure modes.** Dense +embeddings + cross-encoder reranker handle most queries. The case +where they don't: queries with rare technical tokens (filenames, +language names, error codes) get buried at dense rank 1000+ by a +much larger prose corpus that's semantically nearby. The reranker +only sees top-200, so it never gets a shot. + +- `rag/bm25.py`: SQLite FTS5 index, in the stdlib, on-disk + (`bm25/.db`). Two tables — metadata table keyed by + rowid, FTS5 virtual table for full-text. Sanitize the query + (strip FTS5 reserved keywords, OR-join tokens for recall). ~210 + LOC. +- `_rrf_fuse()` in the server — Reciprocal Rank Fusion with `k=60`. + Per-id score = `sum_over_retrievers(1 / (k + rank))`. Returns + ordered ids plus per-retriever contribution dict for telemetry. 
+- `search_docs` hybrid path: run dense + BM25 in parallel, + RRF-fuse, hand the merged top-200 to the reranker. Env-gated: + `HYBRID_SEARCH=true`. +- Log `top1_source` per call (`dense_only` / `bm25_only` / `both`) + to usage logs so you can measure whether BM25 is actually earning + its keep on production traffic. + +If after 4–6 weeks of production data you see `bm25_only >= 80%`, +you can simplify to BM25-only (much less infrastructure). If +`both >= 50%`, hybrid is acting as tie-breaker not rescue — keep it +or simplify depending on how much you care about the long tail. + +### Phase 9 — Multi-version diff tooling *(1 day, if applicable)* + +**Only relevant if the product has multiple maintained versions.** + +- `diff_versions(bundle_id, page_id, against_bundle_id)`: unified + diff between two versions of the same page. Two matching + strategies: editor-curated `topic_cluster` peer (if the portal + exposes it), or same-filename fallback. +- `list_cluster(bundle_id, page_id)`: list cross-version peers + for one page. +- `bundle_changelog(bundle_id_new, bundle_id_old)`: added / + removed / changed pages between two bundles, sorted by churn. +- `_diff_churn(a, b)`: small helper, ~15 LOC of `difflib.unified_diff + --unified=0` line counting. Used by `bundle_changelog`, + `find_doc_inconsistencies`, and `weekly_digest`. + +### Phase 10 — Usage logging *(half a day)* + +Goal: per-call JSONL telemetry so you can answer "what are people +actually asking for" and "is the new feature getting used." + +- `docs_mcp/usage.py`: `TimedCall` context manager that captures + tool name, args, elapsed time, hits returned, any extra fields + set by the tool via `_call.set(key=value)`. Writes JSONL to + `var/logs/usage.jsonl`, rotated daily, kept 90 days. +- Mount the log dir as a named compose volume so logs survive + container recreates. 
+- `scripts/usage_report.py` (standalone, no docs_mcp deps): reads + the JSONL files, prints per-tool counts, top queries, 0-hit + queries, filter usage histogram, reranker activity. Markdown + output flag for piping into weekly digest emails. + +What to log: query text, filters, hits returned, elapsed_ms, +reranker_fired flag, hybrid top1_source, retrieval_mode. What NOT +to log: anything PII-shaped. The corpus is public, queries are +usually about the product, not personal — but be deliberate. + +### Phase 11 — Curated knowledge layer *(2 days)* + +The "RAG can't tell you what isn't in the docs" gap. Surfaces: + +- **API quickstart repos** if the product has them. Ingest the + example scripts (Python, PowerShell, curl) into the corpus. + Rewrite chunk-0 for each script to embed naturally — explicit + natural-language H1, task description sentence, keyword bag. + Dense embeddings need an anchor. +- **A curated `_api_lessons` markdown doc** for things + the swagger / OpenAPI doesn't say: auth flow gotchas, async-task + patterns, schema bugs you've hit, platform-detection quirks. + Surface as a dedicated MCP tool whose description tells the LLM: + *"Call proactively whenever the user asks you to write a script + / integrate with the API / debug a 4xx response."* +- **An auto-hint banner** in `search_docs` results — when the + query matches a script/API trigger word, render a one-line nudge + at the top of results pointing at the dedicated tool. Belt-and- + suspenders for queries where the LLM doesn't think to call it + proactively. + +### Phase 12 — Doc-bug workflow tools *(1 day, optional)* + +Two tools that pair up to enable a *"check the docs for +inconsistencies, draft bugs, confirm, submit"* workflow. + +- `find_doc_inconsistencies(scope_query, version=None, platform=None, + max_pages=30, checks=None)`: deterministic, read-only. 
Two checks: + cross-version drift (pages whose content shifted between immediate- + previous versions in the actionable 10–60% churn band) and + redirect-chain detection (short pages whose body is just a "see + [other page] for details" pointer). Heavy lifting is line-level + diff (`difflib`) against editor-curated cluster peers; the model + judges which findings are real bugs. + +- `submit_doc_bug(page_url, content, email=None, rating=None, + like=None)`: POSTs to the docs portal's feedback endpoint. + Env-gated by `DOC_BUG_SUBMIT_ENABLED=true` so dev/staging + deployments can't accidentally hit the upstream. The tool's + docstring is loud about a mandatory operator-confirmation + workflow per submission — LLM must draft, show, ask, then + submit. Explicit *"do not loop"* instruction. Defensive + validation upfront (URL host matches expected portal, content + non-empty, etc.) so the LLM gets a clean error instead of a + rejected POST. + +**You'll need to find the docs portal's feedback endpoint.** Most +portals route the "Was this helpful?" widget through a backend +API; sniff the browser network tab on the live site. The payload +shape varies; common fields: content/body, page url/href, optional +email, optional rating, optional thumbs. Most accept anonymous +POSTs with no captcha at the JSON-API layer (even if the widget +shows a captcha). Validate before you ship — and if the endpoint +has rate limits or captcha enforcement, the tool returns a clean +"submission rejected — paste manually at " fallback. + +The whole point is the per-bug operator confirmation in the +LLM-side conversation flow; the tool description enforces it. Do +not bypass. + +### Phase 13 — Weekly digest tool *(half a day)* + +Goal: a tool that answers *"what changed in the docs in the last N +days?"* with no runtime git dependency (the prod container has no +git). 
+ +- Extend `scrape/changelog.py` with `--json` (one-shot structured + output) and `--history-out PATH` (walks `git log --first-parent + --since=" days ago"` for corpus-touching commits, writes one + JSON line per commit to a JSONL file). +- CI workflows write the JSONL file into the image at build time: + `corpus/.digest/history.jsonl`. Both `refresh.yml` and + `image-only.yml`. **`fetch-depth: 0` is required** — see Phase 5. +- New MCP tool `weekly_digest(days=7, version=None, platform=None, + max_bundles=25, max_pages_per_bundle=10)`: reads the JSONL, + filters to the window, applies version/platform via + `bundles.json` metadata, aggregates per-bundle change counts and + page lists, renders markdown. +- Post-filter totals are critical: the headline "X page changes + across Y bundles" must compute X from the filtered set, not the + raw record count. Otherwise filtered calls look wrong to the + reader. + +Out of scope but trivial bolt-ons: scheduled HTML email of the +digest, auto-publish to a blog, per-page diff excerpts as a +follow-up tool. + +--- + +## Standard tool set + +By the end you'll have ~15 tools registered. 
Production-tested +shape: + +| Tool | What it does | +|---|---| +| `search_docs` | Semantic search with version/platform/bundle filters | +| `get_page` | Full markdown + metadata for one page | +| `list_versions` | Discover available facet values | +| `list_cluster` | Cross-version peers for one page (if applicable) | +| `diff_versions` | Unified diff of a page across two versions | +| `bundle_changelog` | Added / removed / changed pages between two bundles | +| `weekly_digest` | What changed in the last N days, with filters | +| `corpus_status` | Freshness + size of the knowledge base | +| `find_doc_inconsistencies` | Scoped scan for doc bugs | +| `submit_doc_bug` | Submit a drafted bug (env-gated, operator-confirmed) | +| `_api_lessons` | Curated API gotchas, proactively-called | +| product-specific tools | Interop matrix, lifecycle queries, etc. | + +--- + +## Per-product customization checklist + +When applying this template to a new product, here's what you have +to figure out yourself — everything else is shared infrastructure: + +- **Doc portal mechanics** + - URL pattern for pages + - Bundle/version concept (Zoomin "bundle", Madcap "project", + GitBook "space", Docusaurus "docs version" — same idea, different + name) + - SPA backing API (sniff the network tab) or fallback to + headless browser + - How `topic_cluster` -equivalent cross-version peers are exposed + (or whether you synthesize them from filenames) +- **Bundle metadata schema** + - What does `version` look like? Semver, calendar, named? + - What does `platform` mean for this product? Is there a useful + facet at all? + - Other useful facets (language, product line, edition)? 
+- **Filterable facets** for `search_docs` + - One filter per high-cardinality facet + - Skip filters that have <5 distinct values — they're not worth + the surface area +- **Feedback endpoint** (for `submit_doc_bug`, if you want it) + - URL of the POST endpoint + - Required + optional payload fields + - Captcha / rate-limit behavior + - Whether anonymous submissions are accepted +- **Curated knowledge** for the `_api_lessons` tool + - What does the product's API documentation NOT say that you've + learned from real integration work? +- **Quickstart / example repos** + - Does the vendor publish working code? Ingest it; rewrite + chunk-0 for natural-language retrieval. + +--- + +## Decisions worth carrying forward + +Things you'll save time on by deciding the same way again: + +- **Tool descriptions are user interface.** The LLM reads them + verbatim and decides whether to call the tool. *"Use when..."* + and *"Call proactively whenever..."* are real surfaces; treat + them like button labels. Most retrieval improvements turn out + to be tool-description rewrites in disguise. +- **`stateless_http=True`** on the FastMCP server. Eliminates + whole categories of session-ID-related 404 storms after + container recreates. +- **Pre-bake everything at CI time.** No runtime calls to git, + external services, or anything you wouldn't trust on a + Cloudflare outage. If the digest needs git history, write a + JSONL file at CI time. If the lessons doc needs to load fast, + bake it into the image. +- **Env-gate every side-effecting tool.** Off by default in dev; + on only in production compose. Belt and suspenders against + accidental writes from staging environments. +- **Operator-confirmation pattern for side-effecting tools.** + The tool docstring is the only place to enforce + human-in-the-loop. Make it loud. "MANDATORY", "Do not loop", + "show-confirm-then-submit" — those phrasings work. 
+- **Verify with hand-curated golden queries before shipping any + retrieval change.** Numbers in the diff, in the commit message. + Don't ship retrieval changes on vibes. +- **Two-cadence CI** (weekly scrape vs on-demand code-only) + saves hours per code iteration once you're past the + one-iteration-a-week stage. +- **Rolling tag + sha-pinned tag** deploy pattern. `:latest` is + what Watchtower watches; `:<sha>` is your safety net. Rollback + is a one-line compose edit, not a redeploy. +- **Usage logging is non-negotiable.** You will be wrong about + what people use. Capture the truth from day one; let it tell + you which features to keep building and which to delete. + +--- + +## Glossary + +- **Bundle** — one logical doc set in the portal. Zoomin calls + them bundles; Madcap calls them projects; the concept is the + same: a versioned, titled collection of pages. One dir under + `corpus/`. +- **Page** — one HTML page in a bundle. One `.md` + one `.json` + sidecar under the bundle dir. +- **Topic cluster** — Zoomin's name for "this page in version + 10.9 corresponds to that page in version 10.8." Stored in the + per-page sidecar. The portal-agnostic concept is "cross-version + peer mapping." +- **Chunk** — a unit of text that gets independently embedded and + stored in Chroma. Target ~400-600 tokens; preserve paragraph + boundaries. +- **RRF** — Reciprocal Rank Fusion. The way to merge two ranked + lists from independent retrievers without score calibration. + +--- + +## What's deliberately NOT in this template + +Decisions you should make per-product (not copy from the original +build): + +- The reverse proxy and TLS termination layer. Could be Caddy, + nginx, Traefik, Cloudflare Tunnel — pick what your infra uses. +- The Gateway / aggregator in front of multiple MCPs (MetaMCP is one + option; you may not need any aggregator if you're running a + single product MCP).
+- The specific embedding model — `nomic-embed-text` is a strong + default but newer / domain-specific models may be better for + some products. +- The Ollama containers / GPU setup — depends on what hardware you + have. The pattern is one container per GPU with explicit + `NVIDIA_VISIBLE_DEVICES` pinning; the indexer load-balances + across them. +- Whether to publish a blog series alongside the build. Strongly + recommended (forces clarity, builds an audience), but optional. diff --git a/README.md b/README.md new file mode 100644 index 0000000..52cd328 --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +# docs-mcp-template + +A reusable template for building hosted MCP servers over a product's +public documentation. Distilled from one production build; everything +product-specific has been factored out. + +The end product is a streamable-HTTP MCP server with ~15 tools that +any LLM client (Claude Desktop, Claude Code, Cursor, Copilot) can +call to answer questions against the docs, surface what changed +recently, find inconsistencies, and (optionally) submit doc bugs +back upstream. + +## What's here + +- **[PLAN.md](PLAN.md)** — comprehensive build guide. Phased + approach (13 phases, ~2–3 weeks of focused work for the full + stack). Includes the design decisions, the gotchas, and a + per-product customization checklist. +- **Scaffolded skeleton** — working FastMCP server with stub tools, + Dockerfile, docker-compose, CI workflows, eval harness layout, + usage logging. Everything you need to `git clone` and start + filling in the product-specific bits. + +## Quick start + +```bash +git clone https://git.jpaul.io/justin/docs-mcp-template.git my-product-docs +cd my-product-docs +git remote remove origin # detach from template +python -m venv venv && source venv/bin/activate +pip install -r requirements.txt + +# Read PLAN.md before doing anything else. Pay particular attention to +# Phase 1 (scraper) — that's the most product-specific phase. 
+ +# Run the stub server (no corpus yet — just verifies the wiring): +python -m docs_mcp.server --transport stdio +``` + +## Repo layout + +``` +. +├── PLAN.md # The build guide. Read first. +├── README.md +├── requirements.txt +├── Dockerfile +├── .gitignore +├── .gitea/workflows/ +│ ├── refresh.yml # Weekly scrape + index + image push +│ └── image-only.yml # On-demand code-only ship +├── scrape/ +│ ├── README.md # Product-specific scraper goes here +│ └── changelog.py # Reusable: --json, --history-out +├── rag/ +│ ├── embeddings.py # Ollama embedder, swappable +│ ├── chunk.py # Chunker — adjust per page format +│ ├── index.py # Builds Chroma + (optionally) BM25 +│ └── bm25.py # SQLite FTS5 lexical index +├── docs_mcp/ +│ ├── server.py # FastMCP server with stub tools +│ └── usage.py # TimedCall + JSONL telemetry +├── eval/ +│ ├── queries.jsonl.example # Curate ~25 hand-labeled queries +│ ├── retrievers.py # Retriever protocol + implementations +│ └── run_eval.py # MRR / Recall@k / nDCG@k harness +├── scripts/ +│ ├── usage_report.py # Standalone log analyzer +│ └── registry_gc.py # Container registry cleanup +└── deploy/ + └── docker-compose.yml # Hosting stack: MCP + reranker + Watchtower +``` + +## What's product-specific (must implement) + +- `scrape/` — the scraper itself. The template gives you the corpus + layout contract and a working `changelog.py`; the actual extraction + logic is yours. +- The corpus on disk (gitignored; rebuilt by CI). +- The reranker GGUF model and llama.cpp container (commented in + `deploy/docker-compose.yml`). +- The reverse proxy / TLS layer in front of the public endpoint. +- The hand-curated knowledge surface (your product's API gotchas, + example scripts, anything the LLM should know that the docs + don't say). 
+ +## What's NOT product-specific (works as-is) + +- FastMCP server skeleton + tool decoration pattern +- Chroma + Ollama embedding pipeline +- BM25 / SQLite FTS5 lexical index +- Hybrid retrieval (RRF) + reranker integration +- Eval harness (Retriever protocol, MRR/Recall/nDCG) +- Usage logging (TimedCall, JSONL, daily rotation) +- CI workflow shape (weekly + on-demand, retry-on-race, three-tag + image scheme) +- Registry GC script +- Standard tools: `search_docs`, `get_page`, `list_versions`, + `diff_versions`, `bundle_changelog`, `weekly_digest`, + `find_doc_inconsistencies`, `submit_doc_bug`, etc. + +## License + +Internal template. Adjust before publishing. diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml new file mode 100644 index 0000000..0aa05a8 --- /dev/null +++ b/deploy/docker-compose.yml @@ -0,0 +1,111 @@ +# Hosting stack for a docs MCP server. +# +# Replace below with your product name on first deploy. +# Volumes: usage logs are mounted to a host path so they survive +# Watchtower-driven container recreates. +# +# This template assumes a reverse proxy / Cloudflare Tunnel terminates +# TLS in front of port 8000. Adjust if your infra differs. + +services: + + # The MCP server. Watchtower auto-pulls on :latest changes. + -docs-mcp: + image: //-docs-mcp:latest + container_name: -docs-mcp + restart: unless-stopped + ports: + - "8000:8000" + environment: + PRODUCT_NAME: "" + PRODUCT_DOCS_URL: "https://docs.example.com" + + # Streamable-HTTP transport. Stateless mode is required for + # production: clients don't lose sessions when Watchtower + # recreates the container. + MCP_TRANSPORT: streamable-http + MCP_HOST: 0.0.0.0 + MCP_PORT: "8000" + + # If you run MetaMCP or another gateway in front and reach + # this container via its compose DNS name (e.g. -docs-mcp:8000), + # add that hostname here. "*" disables the rebind check entirely. 
+ MCP_ALLOWED_HOSTS: "-docs-mcp,localhost,127.0.0.1" + + # Phase 6 — reranker sidecar (jina-reranker-v2-base via llama.cpp). + RERANK_URL: http://-rerank:8080 + RERANK_POOL: "200" + RERANK_TIMEOUT: "30" + + # Phase 8 — hybrid retrieval (BM25 + dense + RRF). Set true + # only after the eval harness shows the dense-only path + # missing technical-term queries that BM25 catches. + HYBRID_SEARCH: "true" + + # Phase 10 — usage telemetry. + USAGE_LOG_DIR: /app/var/logs + USAGE_LOG_KEEP_DAYS: "90" + + # Phase 12 — doc-bug submission gate. Off by default; on only + # in production after you've verified the endpoint contract. + DOC_BUG_SUBMIT_ENABLED: "false" + # DOC_BUG_API_URL: "https://docs-be.example.com/api/feedback" + volumes: + # Usage logs persist across container recreates. + - ./-docs-mcp-logs:/app/var/logs + depends_on: + - -rerank + labels: + # Watchtower polls *only* containers with this label set true. + com.centurylinklabs.watchtower.enable: "true" + networks: + - mcp + + # Reranker sidecar — llama.cpp serving jina-reranker-v2-base. + # Requires GPU access; adjust runtime/devices for your hardware. + -rerank: + image: ghcr.io/ggml-org/llama.cpp:server-cuda + container_name: -rerank + restart: unless-stopped + # Mount the GGUF model from the host. Download from huggingface + # (gguf-org/jina-reranker-v2-base-multilingual-GGUF) first. + volumes: + - /path/to/models:/models:ro + command: > + --model /models/jina-reranker-v2-base.Q8_0.gguf + --reranking + --host 0.0.0.0 + --port 8080 + --n-gpu-layers 99 + --ctx-size 4096 + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + networks: + - mcp + + # Watchtower — auto-pulls :latest on push. + # Only watches containers labeled `com.centurylinklabs.watchtower.enable=true`. 
+ watchtower: + image: containrrr/watchtower:latest + container_name: watchtower + restart: unless-stopped + volumes: + - /var/run/docker.sock:/var/run/docker.sock + environment: + WATCHTOWER_POLL_INTERVAL: "300" # 5 min + WATCHTOWER_LABEL_ENABLE: "true" + WATCHTOWER_CLEANUP: "true" # remove old images after pull + # If your registry requires auth, mount a docker config: + # volumes: + # - ./registry-auth.json:/config.json:ro + networks: + - mcp + +networks: + mcp: + driver: bridge diff --git a/docs_mcp/__init__.py b/docs_mcp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docs_mcp/server.py b/docs_mcp/server.py new file mode 100644 index 0000000..28b1345 --- /dev/null +++ b/docs_mcp/server.py @@ -0,0 +1,263 @@ +"""MCP server skeleton — fill in PRODUCT_NAME and the tool bodies. + +This file is the template's structural anchor. The phases described in +PLAN.md add or extend pieces of this file: + + Phase 3 — search_docs, get_page, list_versions stubs (you are here) + Phase 6 — reranker integration in search_docs + Phase 8 — BM25 + hybrid retrieval (HYBRID_SEARCH env gate, _rrf_fuse) + Phase 9 — diff_versions, list_cluster, bundle_changelog + Phase 10 — TimedCall wiring (already imported below) + Phase 11 — _api_lessons tool + Phase 12 — find_doc_inconsistencies, submit_doc_bug + Phase 13 — weekly_digest + _digest_history reader + +Every stub below has a docstring + `raise NotImplementedError`. Replace +the body when you reach the corresponding phase. Keep the signatures +stable across products — clients depend on them. +""" +from __future__ import annotations + +import json +import logging +import os +import re +from pathlib import Path +from typing import Annotated + +from mcp.server.fastmcp import FastMCP +from pydantic import Field + +from .usage import TimedCall + +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Product-specific configuration. Set these for each new build. 
+# --------------------------------------------------------------------------- +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct") +PRODUCT_DOCS_URL = os.environ.get("PRODUCT_DOCS_URL", "https://docs.example.com") +COLLECTION = f"{PRODUCT_NAME}_docs" + +# Paths inside the deployed container (and matching layout locally for dev). +ROOT = Path(__file__).resolve().parent.parent +CORPUS = ROOT / "corpus" +CHROMA_DIR = ROOT / "chroma" +BM25_DB = Path(os.environ.get("BM25_DB", str(ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db"))) +BUNDLES_JSON = ROOT / "bundles.json" + +# --------------------------------------------------------------------------- +# Feature flags (Phase 6 / 8 / 12 enable these as you ship each phase). +# --------------------------------------------------------------------------- +RERANK_URL = os.environ.get("RERANK_URL", "").rstrip("/") or None +RERANK_POOL = int(os.environ.get("RERANK_POOL", "50")) +RERANK_TIMEOUT = float(os.environ.get("RERANK_TIMEOUT", "30")) + +HYBRID_SEARCH = os.environ.get("HYBRID_SEARCH", "").lower() in ("true", "1", "yes", "on") +RRF_K = int(os.environ.get("RRF_K", "60")) + +DOC_BUG_SUBMIT_ENABLED = os.environ.get("DOC_BUG_SUBMIT_ENABLED", "").lower() in ("true", "1", "yes", "on") +DOC_BUG_API_URL = os.environ.get("DOC_BUG_API_URL", "") # product-specific endpoint +DOC_BUG_TIMEOUT = float(os.environ.get("DOC_BUG_TIMEOUT", "15")) + + +# --------------------------------------------------------------------------- +# FastMCP setup. +# +# stateless_http=True — every request creates an ephemeral session and +# discards it on return. Critical for production: clients don't get +# 404 storms when the container is recreated by Watchtower. 
# ---------------------------------------------------------------------------
mcp = FastMCP(f"{PRODUCT_NAME}-docs", stateless_http=True)


# ---------------------------------------------------------------------------
# Lazy helpers — instantiate expensive things only when actually needed,
# so the server still starts when (e.g.) Ollama is briefly unreachable.
# ---------------------------------------------------------------------------

# (mtime, {slug: bundle_dict}) from the last successful parse of
# bundles.json; None until the first successful load.
_bundles_cache: tuple[float, dict[str, dict]] | None = None


def _bundles() -> dict[str, dict]:
    """Cached load of bundles.json into a {slug: bundle_dict} mapping.

    bundles.json is the product-specific catalog written by the Phase 1
    scraper. See PLAN.md Phase 1 for the schema.

    The parsed catalog is memoized and keyed on the file's mtime, so the
    JSON is re-read only when the file changes on disk. A missing file
    yields an empty mapping and is never cached, so a catalog that
    appears later is picked up on the next call.

    Returns:
        The shared cached mapping. Callers must treat it as read-only.
    """
    global _bundles_cache
    try:
        mtime = BUNDLES_JSON.stat().st_mtime
    except (FileNotFoundError, NotADirectoryError):
        return {}
    if _bundles_cache is None or _bundles_cache[0] != mtime:
        # Explicit encoding: the locale default is not UTF-8 everywhere.
        cat = json.loads(BUNDLES_JSON.read_text(encoding="utf-8"))
        _bundles_cache = (mtime, {b["slug"]: b for b in cat})
    return _bundles_cache[1]


def _build_where(version: str | None, platform: str | None, bundle_id: str | None) -> dict | None:
    """Translate filter args into a Chroma `where` clause.

    Falsy filters (None or "") are ignored. Returns None when no filter
    is set, the bare condition when exactly one is set, and an `$and`
    of all conditions otherwise.
    """
    candidates = (("version", version), ("platform", platform), ("bundle_id", bundle_id))
    conds: list[dict] = [{key: value} for key, value in candidates if value]
    if not conds:
        return None
    if len(conds) == 1:
        return conds[0]
    return {"$and": conds}


def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None:
    """Read a corpus page off disk.

    Args:
        bundle_id: Bundle directory name under the corpus root.
        page_id: Page filename stem; ".md" and ".json" are appended.

    Returns:
        (markdown_body, metadata_dict), or None when either file is
        missing or the requested path falls outside the corpus root.
    """
    try:
        corpus_root = CORPUS.resolve()
        md_path = (CORPUS / bundle_id / (page_id + ".md")).resolve()
        json_path = (CORPUS / bundle_id / (page_id + ".json")).resolve()
    except (OSError, ValueError):
        # Unresolvable input (e.g. an embedded NUL byte) is "not found".
        return None
    # Both ids arrive straight from tool arguments. Without this check a
    # ".." segment or an absolute bundle_id would read files outside
    # corpus/.
    if not (md_path.is_relative_to(corpus_root) and json_path.is_relative_to(corpus_root)):
        log.warning("rejected page path outside corpus: %r/%r", bundle_id, page_id)
        return None
    if not md_path.is_file() or not json_path.is_file():
        return None
    body = md_path.read_text(encoding="utf-8")
    meta = json.loads(json_path.read_text(encoding="utf-8"))
    return body, meta


# ===========================================================================
# Tools
#
# NOTE: the docstring of each @mcp.tool() function is what the client LLM
# sees as the tool description, so it is user-facing text. Maintainer
# notes belong in `#` comments, not in those docstrings.
# ===========================================================================

@mcp.tool()
def search_docs(
    query: Annotated[str, Field(description=f"Natural-language query about {PRODUCT_NAME}.")],
    version: Annotated[
        str | None,
        Field(description="OPTIONAL version filter — restrict to one product version."),
    ] = None,
    platform: Annotated[
        str | None,
        Field(description="OPTIONAL platform filter. Set to one of the platforms listed by list_versions(); omit for all platforms."),
    ] = None,
    bundle_id: Annotated[
        str | None,
        Field(description="OPTIONAL bundle filter — pin to a specific doc bundle slug."),
    ] = None,
    k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10,
) -> str:
    """Search the official product documentation corpus.

    Returns the top-k most relevant chunks (with full source page URLs)
    given a natural-language query. Optional filters narrow the search
    to one version, one platform, or one bundle. Use list_versions()
    first if you need to discover the available facet values.

    Call this tool whenever the user asks anything that should be
    answerable from the official product documentation.
    """
    # The summary line used to contain a literal "{product}" placeholder;
    # docstrings are not f-strings, so it reached the client verbatim.
    with TimedCall("search_docs", {
        "query": query, "version": version, "platform": platform,
        "bundle_id": bundle_id, "k": k,
    }) as _call:
        # TODO Phase 2-3: query Chroma collection (see rag/index.py for
        # how it was built). Render the top-k chunks as markdown with
        # source URLs.
        # TODO Phase 6: optional reranker via _rerank() if RERANK_URL set.
        # TODO Phase 8: hybrid retrieval if HYBRID_SEARCH=true — run
        # dense + BM25 in parallel, RRF-fuse, hand merged pool to rerank.
        _call.set(hits_returned=0)
        raise NotImplementedError("Phase 2/3: implement Chroma query + rendering")


@mcp.tool()
def get_page(
    bundle_id: Annotated[str, Field(description="Bundle slug.")],
    page_id: Annotated[str, Field(description="Page filename within the bundle.")],
) -> str:
    """Return the full markdown for one page, plus a metadata header.

    Use after search_docs surfaces a relevant page and the user (or you)
    want the complete text — not just the matched chunks.
    """
    with TimedCall("get_page", {"bundle_id": bundle_id, "page_id": page_id}) as _call:
        data = _read_page(bundle_id, page_id)
        if data is None:
            _call.set(found=False)
            return f"Page not found: {bundle_id}/{page_id}"
        # The metadata half stays unused until the header TODO below lands.
        md, _meta = data
        _call.set(found=True, page_chars=len(md))
        # TODO: add a metadata header (title, version, source URL) above
        # the body. Product-specific shape.
        return md


@mcp.tool()
def list_versions() -> str:
    """List the available version/platform facets across all bundles.

    Use this to discover valid filter values for search_docs.
    """
    with TimedCall("list_versions", {}) as _call:
        cat = _bundles()
        if not cat:
            # Record the empty result so it shows up in usage logs too.
            _call.set(versions=0, platforms=0)
            return "_(no bundles indexed yet — run the scraper + indexer)_"
        versions = sorted({b.get("version") for b in cat.values() if b.get("version")})
        platforms = sorted({b.get("platform") for b in cat.values() if b.get("platform")})
        _call.set(versions=len(versions), platforms=len(platforms))
        lines = [f"# Facets across {len(cat)} bundle(s)", ""]
        if versions:
            lines += ["## Versions", ""]
            lines += [f"- `{v}`" for v in versions]
            lines.append("")
        if platforms:
            lines += ["## Platforms", ""]
            lines += [f"- `{p}`" for p in platforms]
        return "\n".join(lines)


# ---------------------------------------------------------------------------
# Stubs for later phases — keep the signatures in this file so refactors
# don't lose the contracts. Implementations come per phase.
# ---------------------------------------------------------------------------

# @mcp.tool()  # Phase 9
# def list_cluster(bundle_id: str, page_id: str) -> str: ...

# @mcp.tool()  # Phase 9
# def diff_versions(bundle_id: str, page_id: str, against_bundle_id: str, context: int = 3) -> str: ...

# @mcp.tool()  # Phase 9
# def bundle_changelog(bundle_id_new: str, bundle_id_old: str, min_churn: int = 5, max_changed: int = 50) -> str: ...

# @mcp.tool()  # Phase 13
# def weekly_digest(days: int = 7, version: str | None = None, platform: str | None = None, ...) -> str: ...

# @mcp.tool()  # Phase 9 (or 3 — useful early)
# def corpus_status() -> str: ...

# @mcp.tool()  # Phase 11
# def myproduct_api_lessons(topic: str | None = None) -> str: ...

# @mcp.tool()  # Phase 12
# def find_doc_inconsistencies(scope_query: str, ...) -> str: ...

# @mcp.tool()  # Phase 12
# def submit_doc_bug(page_url: str, content: str, email: str | None = None, ...) -> str: ...
def main() -> None:
    """Parse CLI/env options and run the MCP server on the chosen transport.

    Options (each falls back to an environment variable, then a default):

      --transport  MCP_TRANSPORT  stdio | streamable-http | sse  (stdio)
      --host       MCP_HOST       bind address for HTTP transports (0.0.0.0)
      --port       MCP_PORT       bind port for HTTP transports (8000)

    ``MCP_DISABLE_DNS_REBINDING_PROTECTION`` set to 1/true/yes (any case)
    turns off the DNS-rebinding check for the HTTP transports.

    Invalid values — including ones coming from the environment — end the
    process through argparse's normal usage error instead of a traceback.
    """
    import argparse

    transports = ["stdio", "streamable-http", "sse"]
    parser = argparse.ArgumentParser(description=f"{PRODUCT_NAME} docs MCP server")
    parser.add_argument("--transport", choices=transports,
                        default=os.environ.get("MCP_TRANSPORT", "stdio"))
    parser.add_argument("--host", default=os.environ.get("MCP_HOST", "0.0.0.0"))
    # Leave the env default as a string: argparse runs string defaults
    # through `type`, so a bad MCP_PORT yields "invalid int value" rather
    # than an uncaught ValueError while the parser is being built.
    parser.add_argument("--port", type=int,
                        default=os.environ.get("MCP_PORT", "8000"))
    args = parser.parse_args()

    # argparse only checks `choices` for values given on the command line,
    # so an MCP_TRANSPORT typo would otherwise reach mcp.run() unchecked.
    if args.transport not in transports:
        parser.error(
            f"invalid transport {args.transport!r} "
            f"(choose from {', '.join(transports)})"
        )

    if args.transport == "stdio":
        mcp.run()
        return

    mcp.settings.host = args.host
    mcp.settings.port = args.port
    # DNS-rebinding protection defaults to localhost-only — disable for
    # container-network DNS hostnames. See PLAN.md "Hosting" notes.
    # Normalised so "True", "YES" or " 1 " in a compose file also work.
    flag = os.environ.get("MCP_DISABLE_DNS_REBINDING_PROTECTION", "")
    if flag.strip().lower() in {"1", "true", "yes"}:
        mcp.settings.transport_security.enable_dns_rebinding_protection = False
    mcp.run(transport=args.transport)
+ +Layout of one record: + + { + "ts": "2026-05-22T13:14:15+00:00", + "tool": "search_docs", + "args": {"query": "...", "version": "10.9", "k": 10}, + "elapsed_ms": 142.5, + "hits_returned": 7, # optional, set by the tool + "reranked": true, # optional, set by the tool + // ... any other key the tool sets via _call.set(...) + } +""" +from __future__ import annotations + +import json +import os +import time +import threading +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any + + +USAGE_LOG_DIR = Path(os.environ.get("USAGE_LOG_DIR", "var/logs")) +USAGE_LOG_KEEP_DAYS = int(os.environ.get("USAGE_LOG_KEEP_DAYS", "90")) + +# Single global lock to serialize writes from multiple request handlers. +# JSONL appends are atomic at the OS level for short records on most +# filesystems, but the lock is cheap and saves you from cross-platform +# surprises. +_lock = threading.Lock() +_last_rotation_check: float = 0.0 + + +def _maybe_rotate() -> None: + """Move usage.jsonl → usage.jsonl. if the date has rolled. + + Cheap to call; we only do filesystem work when a day has actually + passed since the last check. + """ + global _last_rotation_check + now = time.time() + if now - _last_rotation_check < 300: # 5 min cap between checks + return + _last_rotation_check = now + + USAGE_LOG_DIR.mkdir(parents=True, exist_ok=True) + active = USAGE_LOG_DIR / "usage.jsonl" + if active.exists(): + try: + mtime = datetime.fromtimestamp(active.stat().st_mtime, tz=timezone.utc).date() + today = datetime.now(timezone.utc).date() + if mtime < today: + rotated = USAGE_LOG_DIR / f"usage.jsonl.{mtime.isoformat()}" + if not rotated.exists(): + active.rename(rotated) + except OSError: + pass + + # Retention: delete usage.jsonl.YYYY-MM-DD files older than the + # retention window. The active file is never deleted by this. 
class TimedCall:
    """Context manager that captures one tool call's telemetry record.

    Usage:

        with TimedCall("search_docs", {"query": q, ...}) as call:
            ... do the work ...
            call.set(hits_returned=len(results), reranked=True)

    On exit, appends one JSONL record to usage.jsonl containing the tool
    name, its args, the elapsed wall time in milliseconds, and every field
    attached via ``set``. If the body raised, the exception is summarised
    in the ``error`` field and then re-raised, so the tool's caller still
    sees the failure.

    Telemetry is best-effort: a failure to serialise or write the record
    is logged as a warning and never replaces the tool's own result or
    exception.
    """

    def __init__(self, tool: str, args: dict[str, Any]):
        self.tool = tool
        self.args = args
        # Extra fields accumulated through set(); merged into the record
        # last, so they take precedence over the built-in keys.
        self.extra: dict[str, Any] = {}
        self._t0: float = 0.0

    def set(self, **kwargs: Any) -> None:
        """Attach extra fields to the eventual telemetry record."""
        self.extra.update(kwargs)

    def __enter__(self) -> "TimedCall":
        self._t0 = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        elapsed_ms = (time.perf_counter() - self._t0) * 1000.0
        record: dict[str, Any] = {
            "ts": datetime.now(timezone.utc).isoformat(),
            "tool": self.tool,
            "args": self.args,
            "elapsed_ms": round(elapsed_ms, 2),
        }
        if exc_type is not None:
            record["error"] = f"{exc_type.__name__}: {exc_val}"
        record.update(self.extra)

        try:
            # default=repr: a non-JSON value in args/extra (a Path, a set…)
            # is logged in readable form instead of raising TypeError.
            line = json.dumps(record, separators=(",", ":"), default=repr)
            with _lock:
                # Rotate while holding the lock so the rename cannot race
                # with another handler's append. _maybe_rotate does not
                # take _lock itself, so this cannot deadlock.
                _maybe_rotate()
                USAGE_LOG_DIR.mkdir(parents=True, exist_ok=True)
                with open(USAGE_LOG_DIR / "usage.jsonl", "a", encoding="utf-8") as fh:
                    fh.write(line + "\n")
        except (OSError, TypeError, ValueError) as log_err:
            # An unwritable log directory must not turn a successful tool
            # call into a failure, nor hide the tool's real exception.
            import logging
            logging.getLogger(__name__).warning(
                "usage telemetry not written for %s: %s", self.tool, log_err
            )
        # Returning None (falsy) lets any exception from the body propagate.
diff --git a/eval/__init__.py b/eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/eval/queries.jsonl.example b/eval/queries.jsonl.example new file mode 100644 index 0000000..ae67214 --- /dev/null +++ b/eval/queries.jsonl.example @@ -0,0 +1,4 @@ +{"query": "how to install on Linux", "expected": [{"bundle_id": "Install.Linux.10.0", "page_id": "Installation.htm"}], "tags": ["install", "linux"]} +{"query": "configure database connection for high availability", "expected": [{"bundle_id": "Admin.10.0", "page_id": "HA_Setup.htm"}], "tags": ["ha", "config"]} +{"query": "API endpoint to list users", "expected": [{"bundle_id": "API.10.0", "page_id": "Users_API.htm"}], "tags": ["api"]} +{"query": "what changed between 10.0 and 10.1", "expected": [{"bundle_id": "Release_Notes.10.1", "page_id": "Whats_New.htm"}], "tags": ["release-notes"]} diff --git a/eval/retrievers.py b/eval/retrievers.py new file mode 100644 index 0000000..bc06a18 --- /dev/null +++ b/eval/retrievers.py @@ -0,0 +1,62 @@ +"""Retriever protocol + concrete implementations. + +A single matrix dimension per knob (dense / reranked / bm25 / hybrid) +so the eval harness can compare them apples-to-apples. Implement these +once at Phase 7 and reuse them across every retrieval change. + +Each retriever returns a ranked list of (bundle_id, page_id) tuples +deduplicated to the page level (chunks within the same page collapse +to one entry; the highest-ranked chunk's position wins). +""" +from __future__ import annotations + +from typing import Protocol, Iterable + + +class Retriever(Protocol): + name: str + + def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]: + """Return up to k (bundle_id, page_id) tuples in rank order.""" + ... 
+ + +def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> list[tuple[str, str]]: + """Take a stream of (bundle_id, page_id, chunk_ordinal) and return + the first k unique pages in their first-seen order.""" + seen: set[tuple[str, str]] = set() + out: list[tuple[str, str]] = [] + for bid, pid, _ord in chunk_ids: + key = (bid, pid) + if key in seen: + continue + seen.add(key) + out.append(key) + if len(out) >= k: + break + return out + + +# TODO Phase 2/3 — implement these once Chroma + the bm25 module are +# in place. Each one is small (15-30 LOC). The eval harness imports +# from this module by class name. +# +# class DenseRetriever: +# name = "dense" +# def __init__(self, collection): self.col = collection +# def retrieve(self, query, k=10): ... +# +# class RerankedRetriever: +# name = "dense+rerank" +# def __init__(self, collection, rerank_url, pool=200): ... +# def retrieve(self, query, k=10): ... +# +# class BM25Retriever: +# name = "bm25" +# def __init__(self, bm25_index): ... +# def retrieve(self, query, k=10): ... +# +# class HybridRetriever: +# name = "bm25+dense+rrf" +# def __init__(self, dense, bm25, k_rrf=60): ... +# def retrieve(self, query, k=10): ... diff --git a/eval/run_eval.py b/eval/run_eval.py new file mode 100644 index 0000000..9ba3aa6 --- /dev/null +++ b/eval/run_eval.py @@ -0,0 +1,91 @@ +"""Run all retrievers against eval/queries.jsonl, emit a markdown report. + +Metrics computed per retriever: + + MRR — mean reciprocal rank of the FIRST expected page in the + ranked result list (0 if not in top-k). + Recall@K — fraction of expected pages that appear in top-K. + nDCG@K — discounted gain weighted by rank position. + +The "right" number depends on what you're measuring. MRR tracks "the +first-line answer is correct"; Recall@K tracks "everything relevant +is there to draw from"; nDCG@K is a smoother combination of both. +For docs-RAG, MRR is usually the headline metric. 
def ndcg_at_k(retrieved: list[tuple[str, str]], expected: list[tuple[str, str]], k: int) -> float:
    """Return binary-relevance nDCG@k for one query, in the range [0.0, 1.0].

    ``retrieved`` is the ranked list of (bundle_id, page_id) results and
    ``expected`` the relevant pages. Each relevant page found at 1-based
    rank ``i`` within the top ``k`` contributes ``1 / log2(i + 1)``; the
    sum is divided by the ideal DCG, i.e. all relevant pages packed into
    the top positions.

    Each expected page is credited once. Duplicates in ``retrieved`` would
    otherwise push the score above 1.0, and duplicates in ``expected``
    would inflate the ideal DCG and under-report a perfect ranking.

    Returns 0.0 when ``expected`` is empty or ``k <= 0``.
    """
    expected_set = set(expected)
    if k <= 0 or not expected_set:
        return 0.0

    credited: set[tuple[str, str]] = set()
    dcg = 0.0
    for rank, page in enumerate(retrieved[:k], start=1):
        if page in expected_set and page not in credited:
            credited.add(page)
            dcg += 1.0 / math.log2(rank + 1)

    # Ideal DCG: every distinct expected page in the top positions.
    ideal_hits = min(len(expected_set), k)
    idcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    return dcg / idcg if idcg else 0.0
+ # Aggregate MRR / Recall@K / nDCG@K per retriever. Emit a + # markdown table to args.output. Commit the file alongside the + # PR that changes retrieval. + raise NotImplementedError( + "Wire up the retrievers in eval/retrievers.py first, then " + "fill in this evaluation loop. See PLAN.md Phase 7." + ) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/rag/__init__.py b/rag/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rag/bm25.py b/rag/bm25.py new file mode 100644 index 0000000..06982e0 --- /dev/null +++ b/rag/bm25.py @@ -0,0 +1,277 @@ +"""SQLite FTS5-backed BM25 retrieval over the same chunks Chroma indexes. + +Hybrid retrieval (BM25 + dense + Reciprocal Rank Fusion) addresses a +limit of single-tower dense embeddings: when a query has specific +technical terms (filenames, language names, error codes, API paths), +the dense embedding doesn't bridge from the query into a short +code-focused chunk. The chunk loses to the much larger crowd of +prose chunks that semantically match the query topic. + +BM25 handles this directly. Lexical overlap on rare terms ("python", +"create_vpg.py", "PROTECTED_SITE_ID", "applyUpgrade") scores those +chunks high. Fused with the dense ranking via RRF, the hybrid result +is strictly better than either alone for the queries we've seen +fail. + +Why SQLite FTS5: + - In the stdlib. Zero new deps. + - On-disk. Same persistence model as Chroma — Docker COPY the dir, + `rag.index --rebuild` regenerates from corpus. + - Built-in `bm25()` ranking function. No knobs to tune that matter + for our use case (k1=1.2, b=0.75 defaults are fine). + - Builds 70k+ chunks in seconds. Faster than the Chroma rebuild's + embedding step by 100×, so it adds basically nothing to the + full-rebuild cycle. + +Schema is two tables to keep filtering clean. 
FTS5 doesn't filter +nicely on its own columns; the content_rowid pattern keeps an +external metadata table joinable by rowid: + + CREATE TABLE chunks_meta ( + rowid INTEGER PRIMARY KEY AUTOINCREMENT, + id TEXT UNIQUE, + bundle_id TEXT, page_id TEXT, version TEXT, + platform TEXT, product TEXT, ordinal INTEGER + ); + CREATE VIRTUAL TABLE chunks_fts USING fts5( + text, + tokenize = 'porter unicode61 remove_diacritics 2', + content = 'chunks_meta', + content_rowid = 'rowid' + ); + +Queries: + + SELECT m.id, bm25(chunks_fts) AS score + FROM chunks_meta m + JOIN chunks_fts f ON m.rowid = f.rowid + WHERE f MATCH ? + AND m.version = ? -- optional metadata filter + ORDER BY bm25(chunks_fts) -- lower = better in FTS5 + LIMIT ?; +""" +from __future__ import annotations + +import logging +import re +import sqlite3 +from pathlib import Path +from typing import Any + +log = logging.getLogger(__name__) + +# Default location: bm25/_docs.db at the repo root, next to chroma/. +ROOT = Path(__file__).resolve().parent.parent +DEFAULT_DB_DIR = ROOT / "bm25" +DEFAULT_DB_NAME = "_docs.db" + +# Columns we expose as filterable metadata. Mirrors what _build_where in +# docs_mcp/server.py accepts so the same filter dicts work for both +# Chroma and BM25 without per-retriever translation in the caller. +FILTER_COLUMNS = ("bundle_id", "page_id", "version", "platform", "product", "ordinal") + + +# Allowlist tokenizer for free-text queries. FTS5's parser chokes on lots +# of punctuation we routinely see in user queries (".10.9", "?", "VPG's", +# em-dash, etc.). Rather than blocklist every operator, just keep +# alphanumerics + a few separators and replace everything else with a +# space. This loses the ability to phrase-search ("exact match") but we +# don't expose that to users anyway — they ask natural-language questions +# and want the answer, not a Boolean DSL. 
+_KEEP_RE = re.compile(r"[^A-Za-z0-9_\s]") +# FTS5 reserves these Boolean operator KEYWORDS at the token level — +# stripping them avoids accidental phrase-query behavior when a user +# query happens to contain bare "AND", "OR", "NOT", "NEAR". +_BOOLEAN_KW_RE = re.compile(r"(? str: + """Reduce a natural-language query to an FTS5 OR-of-tokens query. + + Two transformations: + + 1. Non-alphanumeric → space (drops punctuation; "10.9?" becomes + "10 9"). Lets us handle versions, parens, question marks, etc. + without inviting FTS5 parse errors. + 2. Boolean keywords stripped (FTS5 reserves AND/OR/NOT/NEAR). + 3. Tokens explicitly OR'd. FTS5's default is AND-of-tokens — for + any non-trivial natural-language query that means zero hits + (no chunk contains every word). OR semantics is what we want: + BM25 already weights documents containing more query terms + higher, so we don't lose precision, but we DO gain recall. + """ + cleaned = _KEEP_RE.sub(" ", text) + cleaned = _BOOLEAN_KW_RE.sub(" ", cleaned) + tokens = cleaned.split() + if not tokens: + return "" + return " OR ".join(tokens) + + +def _where_to_sql(where: dict | None) -> tuple[str, list[Any]]: + """Translate a Chroma-shaped filter dict into a SQL fragment + params. + + Accepts the same shapes ``docs_mcp.server._build_where`` produces: + + None → ("", []) + {"version": "10.9"} → ("AND m.version = ?", ["10.9"]) + {"$and": [{...}, {...}]} → ("AND m.X = ? AND m.Y = ?", [...]) + + Unknown keys are silently dropped (defensive — better to over-match + than to crash on a filter we don't know). 
class BM25Index:
    """Thin wrapper around an FTS5-backed sqlite db.

    Two tables share a rowid: ``chunks_meta`` holds the chunk id and the
    filterable metadata, ``chunks_fts`` holds the indexed text.

    Single-writer model. Reads are connection-per-call (sqlite handles
    concurrency through file locks; for our read-heavy workload that's
    fine and avoids cross-thread connection sharing issues with the MCP
    server's request handlers). Every connection is closed explicitly:
    ``with sqlite3.connect(...)`` only scopes a transaction, it does not
    close the handle.
    """

    def __init__(self, db_path: Path | None = None):
        self.db_path = Path(db_path) if db_path else (DEFAULT_DB_DIR / DEFAULT_DB_NAME)

    # -- build ----------------------------------------------------------

    def build(self, records: list[dict]) -> int:
        """Rebuild the index from scratch from `records`.

        `records` is a sequence of dicts shaped
        ``{"id": ..., "text": ..., "metadata": {...}}``. Missing metadata
        fields are stored as ``""`` (``0`` for ``ordinal``).

        The new index is written to ``<db>.tmp`` and renamed over the old
        file only after a successful commit, so a failed build (e.g. a
        duplicate chunk id) leaves the previous index in place.

        Returns the number of chunks indexed.
        """
        records = list(records)  # tolerate generators; we need len() too
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = self.db_path.with_name(self.db_path.name + ".tmp")
        if tmp_path.exists():
            tmp_path.unlink()  # leftover from an earlier failed build

        # Explicit rowids keep the meta row and FTS row of a chunk aligned
        # by construction instead of relying on AUTOINCREMENT order.
        meta_rows = []
        fts_rows = []
        for rowid, rec in enumerate(records, start=1):
            meta = rec["metadata"]
            meta_rows.append((
                rowid,
                rec["id"],
                meta.get("bundle_id") or "",
                meta.get("page_id") or "",
                meta.get("version") or "",
                meta.get("platform") or "",
                meta.get("product") or "",
                int(meta.get("ordinal") or 0),
            ))
            fts_rows.append((rowid, rec["text"]))

        con = sqlite3.connect(tmp_path)
        try:
            con.executescript(self._schema_sql())
            with con:  # one transaction for both bulk inserts
                con.executemany(
                    "INSERT INTO chunks_meta (rowid, id, bundle_id, page_id, "
                    "version, platform, product, ordinal) "
                    "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                    meta_rows,
                )
                con.executemany(
                    "INSERT INTO chunks_fts (rowid, text) VALUES (?, ?)",
                    fts_rows,
                )
        finally:
            con.close()
        tmp_path.replace(self.db_path)
        log.info("bm25: indexed %d chunks → %s", len(records), self.db_path)
        return len(records)

    # -- query ----------------------------------------------------------

    def query(
        self,
        text: str,
        n: int = 200,
        where: dict | None = None,
    ) -> list[tuple[str, float]]:
        """Return up to `n` (chunk_id, bm25_score) pairs, lowest score first.

        FTS5's bm25() returns NEGATIVE numbers — more relevant docs have
        smaller (more negative) scores. We order ASC so the first row is
        the most relevant. Callers that need a "rank" should enumerate
        the returned list.

        Returns ``[]`` when the index file is missing, when the query has
        no usable tokens after sanitisation, or when sqlite reports an
        OperationalError (logged as a warning). The caller decides whether
        to fall back to dense-only.
        """
        # Check first: sqlite3.connect() creates a missing file, which
        # would leave an empty db behind and make exists() report True.
        if not self.exists():
            return []
        sanitized = _sanitize_query(text)
        if not sanitized:
            return []
        where_sql, params = _where_to_sql(where)
        # FTS5 MATCH wants the unaliased table name on its left, so we use
        # chunks_fts (no alias) and JOIN by rowid against chunks_meta.
        sql = (
            "SELECT m.id, bm25(chunks_fts) AS score "
            "FROM chunks_fts "
            "JOIN chunks_meta m ON m.rowid = chunks_fts.rowid "
            f"WHERE chunks_fts MATCH ? {where_sql} "
            "ORDER BY bm25(chunks_fts) "
            "LIMIT ?"
        )
        try:
            con = sqlite3.connect(self.db_path)
            try:
                rows = con.execute(sql, [sanitized, *params, n]).fetchall()
            finally:
                con.close()
        except sqlite3.OperationalError as e:
            # FTS5 syntax error (rare after sanitization) or unreadable db.
            log.warning("bm25 query failed (%s); query=%r", e, sanitized[:80])
            return []
        return [(row[0], float(row[1])) for row in rows]

    def exists(self) -> bool:
        """Cheap probe — does the index file exist on disk?"""
        return self.db_path.exists()

    def count(self) -> int:
        """Number of chunks indexed. 0 if the db is missing or empty."""
        if not self.exists():
            return 0
        try:
            con = sqlite3.connect(self.db_path)
            try:
                return con.execute("SELECT COUNT(*) FROM chunks_meta").fetchone()[0]
            finally:
                con.close()
        except sqlite3.OperationalError:
            # File exists but has no chunks_meta table (or can't be opened).
            return 0

    # -- schema ---------------------------------------------------------

    @staticmethod
    def _schema_sql() -> str:
        """DDL for a fresh index: metadata table, its indexes, FTS5 table."""
        return """
        CREATE TABLE chunks_meta (
            rowid     INTEGER PRIMARY KEY AUTOINCREMENT,
            id        TEXT UNIQUE NOT NULL,
            bundle_id TEXT,
            page_id   TEXT,
            version   TEXT,
            platform  TEXT,
            product   TEXT,
            ordinal   INTEGER
        );
        CREATE INDEX idx_meta_version ON chunks_meta(version);
        CREATE INDEX idx_meta_platform ON chunks_meta(platform);
        CREATE INDEX idx_meta_bundle ON chunks_meta(bundle_id);

        CREATE VIRTUAL TABLE chunks_fts USING fts5(
            text,
            tokenize = 'porter unicode61 remove_diacritics 2'
        );
        """
def split_paragraphs(md: str) -> list[str]:
    """Split markdown into paragraph-ish blocks.

    Rules, applied line by line:

      - A fenced code block (delimited by lines starting with ```) is kept
        whole, blank lines and ``#`` lines inside it included.
      - A line starting with ``#`` outside a fence closes the current block
        and starts a new one; following text lines stay attached to that
        heading until the next blank line.
      - A blank line outside a fence closes the current block.

    Returns the blocks in document order, each stripped of surrounding
    whitespace; empty blocks are never emitted. An unterminated fence
    swallows the rest of the document into one block.
    """
    blocks: list[str] = []
    current: list[str] = []
    in_fence = False

    def _flush() -> None:
        # Close the block being accumulated, skipping whitespace-only ones.
        text = "".join(current).strip()
        if text:
            blocks.append(text)
        current.clear()

    for line in md.splitlines(keepends=True):
        stripped = line.strip()
        if stripped.startswith("```"):
            in_fence = not in_fence
            current.append(line)
        elif in_fence:
            current.append(line)
        elif stripped.startswith("#"):
            _flush()
            current.append(line)
        elif not stripped:
            # A blank line is purely a separator; it never needs to enter
            # the buffer because every block is stripped on flush.
            _flush()
        else:
            current.append(line)
    _flush()
    return blocks
+ """ + paragraphs = split_paragraphs(text) + if not paragraphs: + return + + # ----- Chunk 0: synthetic anchor for dense retrieval --------- + title = metadata.get("title") or page_id + first_para = next((p for p in paragraphs if not p.startswith("#")), "") + chunk0_body = ( + f"# {title}\n\n" + f"{first_para[:300]}" + # TODO per product: append a keyword bag here (filenames, + # API names, error codes) for BM25 + dense joint coverage. + ) + yield { + "id": f"{metadata['bundle_id']}::{page_id}::0", + "text": chunk0_body, + "metadata": {**metadata, "ordinal": 0}, + } + + # ----- Body chunks: pack paragraphs up to TARGET_CHARS ------- + ordinal = 1 + buf: list[str] = [] + buf_chars = 0 + for p in paragraphs: + if buf_chars + len(p) > TARGET_CHARS and buf: + yield { + "id": f"{metadata['bundle_id']}::{page_id}::{ordinal}", + "text": "\n\n".join(buf), + "metadata": {**metadata, "ordinal": ordinal}, + } + ordinal += 1 + buf = [] + buf_chars = 0 + buf.append(p) + buf_chars += len(p) + if buf: + yield { + "id": f"{metadata['bundle_id']}::{page_id}::{ordinal}", + "text": "\n\n".join(buf), + "metadata": {**metadata, "ordinal": ordinal}, + } diff --git a/rag/embeddings.py b/rag/embeddings.py new file mode 100644 index 0000000..84d3bbd --- /dev/null +++ b/rag/embeddings.py @@ -0,0 +1,72 @@ +"""Embedding function for Chroma — Ollama-hosted nomic-embed-text by default. + +Swappable: implement the same `embedding_function()` interface returning +a Chroma `EmbeddingFunction` and the rest of the pipeline doesn't care. 
class OllamaEmbeddings(EmbeddingFunction):
    """Calls /api/embed across N Ollama endpoints, naive round-robin.

    For indexing throughput on multiple GPUs, run one Ollama container
    per GPU (pinned via NVIDIA_VISIBLE_DEVICES) and pass all their URLs
    in OLLAMA_URL — the embedder picks the next endpoint per batch.

    Raises ValueError at construction when no URL is configured, and at
    call time when the response does not contain exactly one embedding
    per input document. HTTP errors propagate from httpx.
    """

    def __init__(self, urls: list[str] = OLLAMA_URLS, model: str = EMBED_MODEL):
        if not urls:
            # Fail here with a readable message instead of a
            # ZeroDivisionError on the first embed call.
            raise ValueError("OllamaEmbeddings needs at least one URL (check OLLAMA_URL)")
        # Copy so the instance never aliases the module-level default list;
        # drop trailing slashes so the request path is not "//api/embed".
        self.urls = [u.rstrip("/") for u in urls]
        self.model = model
        self._next = 0  # round-robin cursor, advanced once per batch

    def __call__(self, input: Documents) -> Embeddings:
        docs = list(input)
        url = self.urls[self._next % len(self.urls)]
        self._next += 1
        with httpx.Client(timeout=300) as c:
            r = c.post(f"{url}/api/embed",
                       json={"model": self.model, "input": docs})
            r.raise_for_status()
            data = r.json()
        vectors = data.get("embeddings") or []
        # A missing or short "embeddings" list used to be passed on
        # silently; report it here, where the faulty endpoint is known.
        if len(vectors) != len(docs):
            raise ValueError(
                f"Ollama at {url} returned {len(vectors)} embeddings "
                f"for {len(docs)} inputs (model={self.model!r})"
            )
        return vectors

    def name(self) -> str:  # newer chromadb requires this
        return f"ollama:{self.model}"

    @staticmethod
    def build_from_config(config: dict) -> "OllamaEmbeddings":  # newer chromadb
        return OllamaEmbeddings(
            urls=config.get("urls", OLLAMA_URLS),
            model=config.get("model", EMBED_MODEL),
        )

    def get_config(self) -> dict:  # newer chromadb
        return {"urls": self.urls, "model": self.model}

    def default_space(self) -> str:
        return "cosine"

    def supported_spaces(self) -> list[str]:
        return ["cosine", "l2", "ip"]
embedding_function() -> EmbeddingFunction: + return OllamaEmbeddings() diff --git a/rag/index.py b/rag/index.py new file mode 100644 index 0000000..8d1c74f --- /dev/null +++ b/rag/index.py @@ -0,0 +1,134 @@ +"""Build Chroma (and optionally BM25) indexes from corpus on disk. + +Reads `corpus//.{md,json}`, chunks each page, upserts +into Chroma. With --rebuild, drops + recreates the collection (clean +state). With --bm25-only, skips Chroma and rebuilds only the FTS5 +index — useful for fast iteration when chunking didn't change. +""" +from __future__ import annotations + +import argparse +import json +import logging +import time +from pathlib import Path +from typing import Iterator + +import chromadb +from chromadb.config import Settings + +from .chunk import chunks_from_page +from .embeddings import embedding_function + +log = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") + +ROOT = Path(__file__).resolve().parent.parent +CORPUS = ROOT / "corpus" +CHROMA_DIR = ROOT / "chroma" + +# Collection name — convention: _docs. Override via env if needed. +import os +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct") +COLLECTION = f"{PRODUCT_NAME}_docs" + + +def page_records() -> Iterator[dict]: + """Walk corpus/, yield chunks for every page.""" + if not CORPUS.exists(): + log.error("corpus/ doesn't exist; run the scraper first") + return + for bundle_dir in sorted(CORPUS.iterdir()): + if not bundle_dir.is_dir() or bundle_dir.name.startswith("."): + continue + for md_path in sorted(bundle_dir.glob("*.md")): + page_id = md_path.stem + sidecar = md_path.with_suffix(".json") + if not sidecar.exists(): + log.warning("skipping %s — no JSON sidecar", md_path) + continue + md = md_path.read_text() + meta = json.loads(sidecar.read_text()) + # Surface common filter fields at the chunk-metadata level + # so Chroma's `where` filter can use them. 
def upsert_to_chroma(records: list[dict], batch_size: int = 64) -> int:
    """Recreate the Chroma collection and load every chunk into it.

    The collection named by ``COLLECTION`` is dropped (if present) and
    created again with the configured embedding function, then ``records``
    — dicts with ``id``, ``text`` and ``metadata`` — are upserted in
    batches of ``batch_size``. Progress is logged after each batch.

    Returns the number of chunks upserted.
    Raises ValueError if ``batch_size`` is smaller than 1; this is checked
    before the existing collection is touched.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    client = chromadb.PersistentClient(
        path=str(CHROMA_DIR),
        settings=Settings(anonymized_telemetry=False),
    )
    # Drop + recreate for --rebuild semantics.
    try:
        client.delete_collection(COLLECTION)
    except Exception as exc:
        # Usually just "collection does not exist yet" on a first build.
        # NOTE(review): the exception type chromadb raises for a missing
        # collection appears to vary by release — confirm before narrowing.
        # Any real problem resurfaces in create_collection below.
        log.debug("delete_collection(%s) skipped: %s", COLLECTION, exc)
    col = client.create_collection(COLLECTION, embedding_function=embedding_function())

    total = 0
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        col.upsert(
            ids=[r["id"] for r in batch],
            documents=[r["text"] for r in batch],
            metadatas=[r["metadata"] for r in batch],
        )
        total += len(batch)
        log.info("upserted %d / %d chunks", total, len(records))
    return total
Safe to remove this block + # for products that don't need hybrid retrieval. + try: + from .bm25 import BM25Index + t_b = time.time() + BM25Index(args.bm25_db).build(records) + log.info("bm25 done in %.1fs", time.time() - t_b) + except ImportError: + log.info("rag.bm25 not available — skipping BM25 build") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b9982a9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +# MCP server +mcp[fastmcp]>=1.0.0 +pydantic>=2.0 +httpx>=0.27 + +# Vector store + embeddings +chromadb>=0.5.0 +ollama>=0.4.0 # if using Ollama-hosted embedder; swap if not + +# Scraping (Phase 1; adjust per product) +beautifulsoup4>=4.12 +requests>=2.31 +# playwright>=1.40 # uncomment if you need headless browser fallback + +# Evaluation +numpy>=1.26 + +# Dev / utility +python-dateutil>=2.8 diff --git a/scrape/README.md b/scrape/README.md new file mode 100644 index 0000000..44d6df3 --- /dev/null +++ b/scrape/README.md @@ -0,0 +1,59 @@ +# scrape/ + +Product-specific. **You implement this for each product.** The +template gives you the contract; the extraction logic depends on +the upstream doc portal. + +See `PLAN.md` Phase 1 for the corpus layout the rest of the pipeline +expects. + +## What you write + +At minimum, two scripts: + +### `scrape/bundles.py` + +Discovers the upstream portal's bundle catalog and writes +`bundles.json` at the repo root. One entry per bundle (versioned doc +set) with the schema in PLAN.md. + +```bash +python -m scrape.bundles +``` + +### `scrape/runner.py` + +Scrapes the pages of each bundle (or a single bundle with `--bundle +`). 
Writes: + +- `corpus/<bundle_id>/<page_id>.md` — extracted markdown body +- `corpus/<bundle_id>/<page_id>.json` — per-page metadata sidecar + +```bash +python -m scrape.runner --all --force --concurrency 6 +python -m scrape.runner --bundle Admin.VC.HTML.10.9 +``` + +## Tips + +- **Sniff before you scrape.** Almost every modern doc portal is an + SPA that calls a backend API. Open the browser's Network tab, + click around, find the underlying JSON. Scraping the API is 10× + cheaper and 100× more reliable than scraping the rendered HTML. +- **Idempotent re-scrapes.** Without `--force`, the runner should + skip pages already on disk so a resume doesn't have to re-fetch + everything. With `--force`, re-fetch every page — that's the + weekly cron mode that catches edits. +- **Respect the portal.** Back off on 429s. Set a recognizable + user-agent so the portal owner can identify you if they want to. +- **Whitespace normalize.** Markdown that round-trips through HTML + often has extra blank lines. Normalize to a single blank line between + paragraphs so diffs are clean (the changelog summary and digest + tools care about line counts). + +## What's already reusable + +`scrape/changelog.py` is fully product-agnostic and ready to use +as-is. It walks `git diff --name-status` output to produce a +structured summary, and walks `git log` for the digest history +(Phase 13). diff --git a/scrape/__init__.py b/scrape/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scrape/changelog.py b/scrape/changelog.py new file mode 100644 index 0000000..9ff2789 --- /dev/null +++ b/scrape/changelog.py @@ -0,0 +1,272 @@ +"""Generate a summary of corpus changes. + +Two output shapes for two consumers: + + 1. Human-readable text (default) — written into the weekly-refresh + commit message so the commit log is greppable for *"what changed + this week"* instead of *"806 files changed"*. + + 2. Structured JSON (``--json``) and rolling JSONL history + (``--history-out``) — consumed by the ``weekly_digest`` MCP tool.
+ Computed in CI and committed at ``corpus/.digest/history.jsonl``; + the tool reads it at runtime because the prod container is a + static filesystem COPY with no git available. + +Usage: + + # Commit-message helper (existing behavior — unchanged) + python -m scrape.changelog [--cached] [--ref REF] + + # One-shot JSON for the current diff range + python -m scrape.changelog --cached --json + + # Build / refresh the digest history file (CI use) + python -m scrape.changelog --history-out corpus/.digest/history.jsonl \\ + --history-days 120 + +The history walker only includes commits that touch ``corpus/`` (or +``bundles.json``); it skips pure code/CI commits. Each emitted record +carries the commit's short sha, ISO timestamp, subject, and the same +structured summary the ``--json`` path produces, so the consumer can +treat history records and one-shot summaries interchangeably. +""" +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from collections import defaultdict +from typing import Any + + +def git(*args: str) -> str: + return subprocess.check_output(["git", *args], text=True) + + +def summarize_diff(diff_output: str) -> dict[str, Any]: + """Parse ``git diff --name-status`` output into a structured summary. + + Pure function (no IO, no git calls) so the same logic is exercised + by the human-readable, JSON-one-shot, and history-walking paths. + + Returns a dict with: + + md_count int — total .md files changed + json_count int — total .json sidecars changed + content_bundles dict — {bundle_id: [page_id_without_.md, ...]} + Only bundles where at least one .md + file moved. Lists are in the order + git emitted them. + json_only_bundles list[str] — bundles whose ONLY change was sidecar + drift (no .md changes). Sorted. + new_bundles list[str] — bundles whose first .md was Added + in this diff. Sorted. + other_files list[str] — any non-corpus path mentioned in the + diff, as ``"STATUS path"`` strings. 
+ """ + md_changes: dict[str, list[str]] = defaultdict(list) + json_only_bundles: set[str] = set() + new_bundles: set[str] = set() + md_count = json_count = 0 + other_files: list[str] = [] + + for line in diff_output.splitlines(): + if not line.strip(): + continue + # statuspath (or statusoldnew for renames; we take + # the post-rename path as the canonical location). + parts = line.split("\t") + status, path = parts[0], parts[-1] + if not path.startswith("corpus/"): + other_files.append(f"{status} {path}") + continue + segs = path.split("/", 2) + if len(segs) < 3: + # corpus/ with no bundle dir — skip. + continue + _, bundle, page = segs + if page.endswith(".md"): + md_changes[bundle].append(page[:-3]) + md_count += 1 + if status == "A": + new_bundles.add(bundle) + elif page.endswith(".json"): + json_count += 1 + json_only_bundles.add(bundle) + + # A bundle counts as "content-changing" if it had any .md edit. Sidecar- + # only drift goes in the separate bucket so the commit message doesn't + # report timestamp churn as if it were real edits. + content_bundles_set = set(md_changes) + drift_only = sorted(json_only_bundles - content_bundles_set) + + return { + "md_count": md_count, + "json_count": json_count, + "content_bundles": dict(md_changes), # cast back to plain dict for JSON + "json_only_bundles": drift_only, + "new_bundles": sorted(new_bundles), + "other_files": other_files, + } + + +def render_human(summary: dict[str, Any]) -> str: + """Format a summary dict as the multi-line commit-message text. + + Matches the historical output exactly so existing commit-message + tooling and downstream readers don't have to change. 
+ """ + lines: list[str] = [] + content_bundles = sorted(summary["content_bundles"]) + md_count = summary["md_count"] + json_count = summary["json_count"] + new_bundles = set(summary["new_bundles"]) + drift_only = summary["json_only_bundles"] + other_files = summary["other_files"] + + lines.append(f"{md_count} content change(s) across {len(content_bundles)} bundle(s)") + lines.append(f"{json_count} sidecar metadata update(s)") + if new_bundles: + lines.append(f"{len(new_bundles)} new bundle(s) added") + if other_files: + lines.append(f"{len(other_files)} other file change(s)") + + if content_bundles: + lines.append("") + lines.append("Bundles with content changes:") + for b in content_bundles: + pages = summary["content_bundles"][b] + tag = " (NEW)" if b in new_bundles else "" + lines.append(f" {b}{tag}: {len(pages)} page(s)") + for p in pages[:5]: + lines.append(f" - {p}") + if len(pages) > 5: + lines.append(f" ... and {len(pages) - 5} more") + if drift_only: + lines.append("") + head = ", ".join(drift_only[:10]) + suffix = " …" if len(drift_only) > 10 else "" + lines.append(f"Bundles with sidecar-only drift ({len(drift_only)}): {head}{suffix}") + return "\n".join(lines) + + +def walk_history(history_days: int) -> list[dict[str, Any]]: + """Walk recent corpus-touching commits, emit one summary per commit. + + Uses ``git log --first-parent main`` to keep the rolling weekly- + refresh line clean of branch-merge noise. Only commits whose diff + touches ``corpus/`` or ``bundles.json`` are emitted; pure code + commits are skipped (they have nothing to digest). + + Each record: + + { + "sha": "", + "timestamp": "", + "subject": "", + ... + every field from summarize_diff() + } + """ + # Find candidate commits. --first-parent keeps the linear refresh history + # on main and ignores branch-side merges. We still need to filter by what + # the commit actually touched, because non-corpus commits can land on + # main (PR merges for code, CI tweaks, etc.). 
+ raw = git( + "log", + f"--since={history_days} days ago", + "--first-parent", + "main", + "--pretty=format:%H%x09%cI%x09%s", + ) + + records: list[dict[str, Any]] = [] + for line in raw.splitlines(): + if not line.strip(): + continue + parts = line.split("\t", 2) + if len(parts) < 3: + continue + sha, ts, subject = parts + + # What did this commit actually touch? Cheap: just the name-status diff + # against its first parent. Empty stdout = commit didn't change any + # files we care about. Root commits (no parent) error out — suppress + # the stderr noise and skip them. + try: + diff = subprocess.check_output( + ["git", "diff", "--name-status", f"{sha}^..{sha}"], + text=True, + stderr=subprocess.DEVNULL, + ) + except subprocess.CalledProcessError: + continue + if not diff.strip(): + continue + + summary = summarize_diff(diff) + # Skip pure code commits — only emit records that have actual corpus + # content motion. This is what makes the history "interesting" for + # the weekly digest. + if summary["md_count"] == 0 and summary["json_count"] == 0 and not summary["new_bundles"]: + continue + + records.append({ + "sha": sha[:12], + "timestamp": ts, + "subject": subject, + **summary, + }) + + return records + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--cached", action="store_true", + help="Summarize staged changes instead of a ref range.") + p.add_argument("--ref", default="HEAD^..HEAD", + help="Diff range to summarize (default: HEAD^..HEAD).") + p.add_argument("--json", dest="as_json", action="store_true", + help="Emit one JSON object instead of the human-readable form.") + p.add_argument("--history-out", metavar="PATH", + help="Walk recent corpus-touching commits and write a " + "JSONL history file at PATH. Overwrites if it exists. 
" + "Implies the history walker; --cached/--ref are ignored.") + p.add_argument("--history-days", type=int, default=120, + help="How far back the history walker looks (default 120).") + args = p.parse_args() + + # History-walker path: build the JSONL file consumed by the + # weekly_digest MCP tool, then exit. CI uses this. + if args.history_out: + records = walk_history(args.history_days) + # Sort by timestamp ascending so the file is roughly stable + # across rebuilds (commits within a single run could otherwise + # depend on git log default ordering). + records.sort(key=lambda r: r["timestamp"]) + with open(args.history_out, "w") as fh: + for rec in records: + fh.write(json.dumps(rec, separators=(",", ":")) + "\n") + # Brief stdout signal for CI logs — easy to spot in the workflow run. + print(f"wrote {len(records)} commit record(s) to {args.history_out} " + f"covering up to {args.history_days} days") + return 0 + + # One-shot summary path. Unchanged behavior for --cached / --ref. + if args.cached: + diff_args = ["diff", "--name-status", "--cached"] + else: + diff_args = ["diff", "--name-status", args.ref] + diff = git(*diff_args) + summary = summarize_diff(diff) + + if args.as_json: + print(json.dumps(summary, separators=(",", ":"))) + else: + print(render_human(summary)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/registry_gc.py b/scripts/registry_gc.py new file mode 100644 index 0000000..41bbc52 --- /dev/null +++ b/scripts/registry_gc.py @@ -0,0 +1,108 @@ +"""Gitea container-registry garbage collection. + +Lists package versions for one container package and deletes versions +older than --keep-days. Always preserves: + + - the :latest tag + - the --keep-latest most-recent date-tagged versions + - anything pushed in the last --keep-days days + +The actual disk reclaim happens on Gitea's next package GC cron (admin +site settings). This script just marks the versions for deletion. 
+ +Usage: + + python scripts/registry_gc.py \\ + --owner \\ + --package -docs-mcp \\ + --keep-days 90 \\ + --keep-latest 5 + +Auth: reads GITEA_TOKEN from env (set in the workflow as a secret). +""" +from __future__ import annotations + +import argparse +import os +import sys +from datetime import datetime, timedelta, timezone +from urllib.request import Request, urlopen +from urllib.error import HTTPError +import json + + +GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io") + + +def api(token: str, method: str, path: str) -> object: + req = Request(f"{GITEA_HOST}{path}", + headers={"Authorization": f"token {token}"}, + method=method) + try: + with urlopen(req, timeout=30) as r: + body = r.read() + return json.loads(body) if body else None + except HTTPError as e: + if e.code == 404: + return None + raise + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--owner", required=True) + p.add_argument("--package", required=True) + p.add_argument("--keep-days", type=int, default=90) + p.add_argument("--keep-latest", type=int, default=5) + p.add_argument("--dry-run", action="store_true") + args = p.parse_args() + + token = os.environ.get("GITEA_TOKEN") + if not token: + print("GITEA_TOKEN not set", file=sys.stderr) + return 1 + + versions = api(token, "GET", + f"/api/v1/packages/{args.owner}/container/{args.package}/versions") or [] + if not versions: + print(f"no versions found for {args.owner}/{args.package}") + return 0 + + cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days) + + # Date-tagged versions (YYYY.MM.DD), newest first + date_tagged = [] + for v in versions: + tags = v.get("tags") or [] + for t in tags: + if len(t) == 10 and t[4] == "." 
and t[7] == ".": + date_tagged.append((t, v)) + break + date_tagged.sort(key=lambda kv: kv[0], reverse=True) + keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]} + + deleted = 0 + for v in versions: + tags = v.get("tags") or [] + if "latest" in tags: + continue + if any(t in keep_date_tags for t in tags): + continue + try: + created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) + except (KeyError, ValueError): + continue + if created >= cutoff: + continue + version_id = v.get("id") + print(f" deleting v{version_id} tags={tags} created={v['created_at']}") + if not args.dry_run: + api(token, "DELETE", + f"/api/v1/packages/{args.owner}/container/{args.package}/versions/{version_id}") + deleted += 1 + print(f"done: {deleted} version(s) deleted") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/usage_report.py b/scripts/usage_report.py new file mode 100644 index 0000000..6bfaa25 --- /dev/null +++ b/scripts/usage_report.py @@ -0,0 +1,251 @@ +"""Summarize usage logs from docs_mcp.usage into a quick scan. 
+ +Reads one or more usage.jsonl* files and prints sections for: + + - per-tool call counts + - top search_docs queries by frequency + - 0-hit queries (where we returned nothing — high-signal for tuning) + - filter usage histogram (which version / platform / bundle filters get hit) + - reranker effectiveness (calls where the reranker fired vs not) + - hybrid retrieval top-1 attribution (dense vs bm25 vs both) + +Usage: + + # Default: read /app/var/logs in the production container + python scripts/usage_report.py --logs-dir /path/to/usage/logs + + # Last N days only: + python scripts/usage_report.py --logs-dir --since 7d + + # Markdown output (for piping into a weekly digest email, etc): + python scripts/usage_report.py --logs-dir --format markdown + +The script doesn't depend on anything in the docs_mcp package — it's a +standalone tool that can run anywhere with the log files available +(scp them off the host, point it at the directory). + +---------------------------------------------------------------------- +FOLLOW-UP CHECKS +---------------------------------------------------------------------- + +Pattern: when you ship a retrieval change with a hypothesis attached +(e.g. "hybrid will rescue queries dense misses"), add a note HERE +describing what the usage report should show and at what threshold +the change earns its keep. Future-you running the report a month +later will be glad. Example: + + Q: Does the dense leg of hybrid retrieval earn its keep on + real traffic, or could we simplify to BM25-only? + + - bm25_only >= 80%% --> dense not doing much; consider + simplifying to BM25 mode + - both >= 50%% --> hybrid is tie-breaking; keep it + - dense_only > bm25_only --> dense is the workhorse; keep + +Also worth a glance every month: + + - 0-hit queries list (tuning candidates) + - reranker p95 latency drift (slow reranker = bad UX) + - filter usage (does anyone actually use version/platform + filters? 
if not, simplify the tool surface) +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from collections import Counter, defaultdict +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Iterable + + +def parse_since(s: str | None) -> datetime | None: + """Accept '7d', '24h', '30m', or an ISO timestamp. None → no cutoff.""" + if not s: + return None + m = re.fullmatch(r"(\d+)([dhm])", s) + if m: + n, unit = int(m.group(1)), m.group(2) + delta = {"d": timedelta(days=n), "h": timedelta(hours=n), "m": timedelta(minutes=n)}[unit] + return datetime.now(timezone.utc) - delta + return datetime.fromisoformat(s.replace("Z", "+00:00")) + + +def load_events(logs_dir: Path, since: datetime | None) -> Iterable[dict[str, Any]]: + """Yield every JSONL record across all files in logs_dir.""" + if not logs_dir.exists(): + print(f"warning: logs dir {logs_dir} does not exist", file=sys.stderr) + return + # usage.jsonl is the active file; usage.jsonl.YYYY-MM-DD are rotated. + files = sorted(logs_dir.glob("usage.jsonl*")) + for f in files: + with open(f) as fh: + for ln, line in enumerate(fh, start=1): + line = line.strip() + if not line: + continue + try: + rec = json.loads(line) + except json.JSONDecodeError as e: + print(f" ! 
skipping {f}:{ln}: {e}", file=sys.stderr) + continue + if since: + ts = rec.get("ts", "") + try: + rec_ts = datetime.fromisoformat(ts.replace("Z", "+00:00")) + except ValueError: + continue + if rec_ts < since: + continue + yield rec + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--logs-dir", type=Path, default=Path("/app/var/logs"), + help="directory with usage.jsonl* files") + p.add_argument("--since", default=None, + help="time window: '7d', '24h', '30m', or ISO timestamp") + p.add_argument("--top", type=int, default=25, + help="how many top queries / filters to show") + p.add_argument("--format", choices=("text", "markdown"), default="text") + args = p.parse_args() + + since = parse_since(args.since) + events = list(load_events(args.logs_dir, since)) + if not events: + print("(no events in window)") + return 0 + + print(f"# Usage report — {len(events)} events" + + (f" since {since.isoformat()}" if since else "") + + f" from {args.logs_dir}") + print() + + # 1. Per-tool counts + by_tool = Counter(e["tool"] for e in events) + print("## Per-tool call counts") + print() + if args.format == "markdown": + print("| tool | calls |") + print("|---|---|") + for tool, n in by_tool.most_common(): + print(f"| `{tool}` | {n} |") + else: + for tool, n in by_tool.most_common(): + print(f" {tool:<25s} {n:>6d}") + print() + + # 2. Top search_docs queries + search_events = [e for e in events if e["tool"] == "search_docs"] + queries = Counter(e["args"].get("query", "") for e in search_events) + print(f"## Top {args.top} search_docs queries (of {len(search_events)} searches)") + print() + if args.format == "markdown": + print("| count | query |") + print("|---|---|") + for q, n in queries.most_common(args.top): + print(f"| {n} | `{q}` |") + else: + for q, n in queries.most_common(args.top): + print(f" {n:>5d} {q!r}") + print() + + # 3. 
0-hit queries — the highest-signal data for tuning + zero_hit = [e for e in search_events if e.get("hits_returned") == 0] + zero_q = Counter(e["args"].get("query", "") for e in zero_hit) + print(f"## 0-hit queries ({len(zero_hit)} of {len(search_events)} searches returned nothing)") + print() + if zero_q: + if args.format == "markdown": + print("| count | query | filters |") + print("|---|---|---|") + # Group by query, show filter examples for each + examples_by_query: dict[str, list[dict]] = defaultdict(list) + for e in zero_hit: + examples_by_query[e["args"].get("query", "")].append(e["args"]) + for q, n in zero_q.most_common(args.top): + ex = examples_by_query[q][0] + f = {k: v for k, v in ex.items() + if k in ("version", "platform", "bundle_id") and v} + print(f"| {n} | `{q}` | `{f}` |") + else: + for q, n in zero_q.most_common(args.top): + print(f" {n:>5d} {q!r}") + else: + print(" _(no 0-hit queries in window)_") + print() + + # 4. Filter usage + filter_use = Counter() + for e in search_events: + a = e["args"] + v = a.get("version") + p_ = a.get("platform") + b = a.get("bundle_id") + if v: + filter_use[f"version={v}"] += 1 + if p_: + filter_use[f"platform={p_}"] += 1 + if b: + filter_use[f"bundle_id={b}"] += 1 + if not (v or p_ or b): + filter_use["(no filter)"] += 1 + print(f"## search_docs filter usage") + print() + if args.format == "markdown": + print("| filter | count |") + print("|---|---|") + for f, n in filter_use.most_common(args.top): + print(f"| `{f}` | {n} |") + else: + for f, n in filter_use.most_common(args.top): + print(f" {n:>5d} {f}") + print() + + # 5. 
Reranker effectiveness + reranked = [e for e in search_events if e.get("reranked") is True] + dense_only = [e for e in search_events if e.get("reranked") is False] + print(f"## Reranker activity") + print() + print(f" reranked: {len(reranked):>5d}") + print(f" dense only: {len(dense_only):>5d} (filter too narrow or 0 results)") + if reranked: + elapsed = [e["elapsed_ms"] for e in reranked if e.get("elapsed_ms") is not None] + if elapsed: + elapsed.sort() + p50 = elapsed[len(elapsed) // 2] + p95 = elapsed[int(len(elapsed) * 0.95)] + print(f" reranked latency p50: {p50:.0f} ms, p95: {p95:.0f} ms") + print() + + # 6. Hybrid retrieval activity — which retriever contributed the top-1? + # Empty unless HYBRID_SEARCH=true is set on the MCP container. + hybrid_events = [e for e in search_events if e.get("retrieval_mode") == "hybrid"] + if hybrid_events: + by_source = Counter(e.get("top1_source") for e in hybrid_events + if e.get("top1_source")) + print("## Hybrid retrieval — top-1 attribution") + print() + print(f" hybrid mode events: {len(hybrid_events)}") + total = sum(by_source.values()) or 1 + for src in ("both", "dense_only", "bm25_only"): + n = by_source.get(src, 0) + pct = 100.0 * n / total + label = { + "both": "in BOTH retrievers' top-N", + "dense_only": "dense found it, BM25 didn't", + "bm25_only": "BM25 found it, dense didn't", + }[src] + print(f" {src:<11s} {n:>5d} ({pct:5.1f}%) — {label}") + rescued = by_source.get("bm25_only", 0) + if rescued and total: + print(f"\n → {rescued} ({100.0 * rescued / total:.1f}%) of hybrid queries had the top-1 " + "result that ONLY BM25 surfaced. Without hybrid those would have been dense-misses.") + return 0 + + +if __name__ == "__main__": + sys.exit(main())