commit 9ba615c8ee4f0c598716d6804403e0b85f9a9b71
Author: Justin Paul <justin@jpaul.me>
Date:   Fri May 22 09:18:17 2026 -0400

    initial: docs-mcp-template — build guide + scaffolded server
    
    Template for building hosted MCP servers over a product's public
    documentation. Distilled from one production build; everything
    product-specific has been factored out.
    
    Contents:
    
    - PLAN.md — comprehensive build guide. 13 phases from project
      skeleton through weekly_digest. Includes the gotchas
      ("fetch-depth: 0 always", reranker per-pair token limit,
      Cloudflare body cap, dash-not-bash on Gitea runners), the
      decisions worth carrying forward, and a per-product
      customization checklist.
    
    - CLAUDE.md — guidance for Claude Code working in a clone of this
      template. Phase identification table, conventions (env-gating +
      operator confirmation for side-effecting tools, defensive
      fallback for retrieval components), common commands.
    
    - README.md — quick-start summary.
    
    Scaffolded code (all signature-stable, with NotImplementedError
    stubs where phase-specific work is required):
    
      docs_mcp/server.py    FastMCP server, stateless_http=True, with
                            search_docs / get_page / list_versions
                            baseline tools and commented stubs for the
                            rest of the phase set.
      docs_mcp/usage.py     TimedCall telemetry, JSONL, daily rotation,
                            90-day retention. Reusable as-is.
      rag/embeddings.py     Ollama embedder (nomic-embed-text default),
                            load-balanced across N URLs. Reusable.
      rag/chunk.py          Paragraph-aware chunker with synthetic
                            chunk 0. Per-product tunable.
      rag/index.py          Chroma + BM25 builder. --rebuild and
                            --bm25-only flags.
      rag/bm25.py           SQLite FTS5 lexical index. Reusable.
      scrape/changelog.py   --cached / --ref / --json / --history-out.
                            Reusable.
      scrape/README.md      What you write per-product.
      eval/queries.jsonl.example
                            Curate ~25 hand-labeled queries here.
      eval/retrievers.py    Retriever protocol + stub classes.
      eval/run_eval.py      MRR / Recall@K / nDCG@K harness skeleton.
      scripts/usage_report.py
                            Standalone log analyzer; the
                            FOLLOW-UP CHECKS pattern noted in the
                            module docstring.
      scripts/registry_gc.py
                            Gitea container registry cleanup. Reusable.
    
    Deployment + CI:
    
      Dockerfile               Python 3.12-slim; COPY corpus + chroma
                               + bm25 last for cache efficiency.
      deploy/docker-compose.yml MCP + reranker sidecar + Watchtower.
                               Templated with <placeholders>.
      .gitea/workflows/refresh.yml    Weekly cron + manual dispatch.
                                      fetch-depth: 0, retry-on-race,
                                      three-tag image scheme.
      .gitea/workflows/image-only.yml Code-only ship cycle, ~18min.
    
    Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git a/.gitea/workflows/image-only.yml b/.gitea/workflows/image-only.yml
new file mode 100644
index 0000000..83b9b5f
--- /dev/null
+++ b/.gitea/workflows/image-only.yml
@@ -0,0 +1,89 @@
+name: Image rebuild (skip scrape)
+
+# Fast path for code-only changes. Skips the scrape and goes straight to:
+# rebuild indexes (from corpus already committed on main) + image build
+# + push. Runtime is ~18 min vs ~40 min for the full refresh.
+#
+# Use when a PR only changes code/config — anything where the upstream
+# corpus hasn't moved but we want the new Python in the running image.
+#
+# IMPORTANT: fetch-depth: 0 is required for the digest-history step
+# to find commits to walk. Don't change to 1.
+
+on:
+  workflow_dispatch:
+
+env:
+  REGISTRY_PUSH: <lan-host>:<port>
+  REGISTRY_PULL: <public-registry-hostname>
+  IMAGE: <owner>/<product>-docs-mcp
+  OLLAMA_URL: http://<gpu-host>:11434
+  EMBED_MODEL: nomic-embed-text
+  PRODUCT_NAME: <product>
+
+jobs:
+  build:
+    runs-on: docker
+    container:
+      image: catthehacker/ubuntu:act-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # Full history (not shallow) so the digest-history step can
+          # walk git log up to --history-days back.
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -q --upgrade pip
+          python -m pip install -q -r requirements.txt
+
+      - name: Verify committed corpus is present
+        # Must precede the digest step: its `mkdir -p` would create corpus/.
+        run: |
+          test -d corpus || { echo "ERROR: corpus/ missing on this ref"; exit 1; }
+          echo "corpus: $(du -sh corpus | cut -f1), $(find corpus -name '*.md' | wc -l) markdown files"
+
+      - name: Refresh digest history
+        # Cheap (a few seconds); doesn't touch corpus content. Without it, a
+        # code-only deploy ships an increasingly stale digest history.
+        run: |
+          mkdir -p corpus/.digest
+          python -m scrape.changelog \
+              --history-out corpus/.digest/history.jsonl \
+              --history-days 120
+
+      - name: Rebuild indexes from existing corpus
+        run: python -m rag.index --rebuild
+
+      - name: Log in to registry (LAN endpoint)
+        run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u <user> --password-stdin
+
+      - name: Build & push image
+        run: |
+          SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12)
+          DATE_TAG=$(date -u +%Y.%m.%d)
+          docker build \
+            -t "${REGISTRY_PUSH}/${IMAGE}:latest" \
+            -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
+            -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \
+            .
+          docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
+          docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
+          docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}"
+
+      - name: Prune old container versions
+        env:
+          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
+        run: |
+          python scripts/registry_gc.py \
+            --owner <user> \
+            --package <product>-docs-mcp \
+            --keep-days 90 \
+            --keep-latest 5
diff --git a/.gitea/workflows/refresh.yml b/.gitea/workflows/refresh.yml
new file mode 100644
index 0000000..ad10efe
--- /dev/null
+++ b/.gitea/workflows/refresh.yml
@@ -0,0 +1,158 @@
+name: Weekly docs refresh
+
+# Runs the full pipeline: scrape upstream → rebuild indexes → push
+# image. Cron'd weekly (Mondays). Skips the reindex + image-push if the
+# scrape produced no diff against the committed corpus.
+#
+# IMPORTANT: actions/checkout@v4 fetch-depth: 0 is required because
+# the digest-history step walks git log up to --history-days back.
+# With a shallow checkout the history file ships empty.
+
+on:
+  schedule:
+    - cron: "0 6 * * 1"     # Mondays 06:00 UTC
+  workflow_dispatch:
+    inputs:
+      force_build:
+        description: "Rebuild indexes + push image even if corpus is unchanged"
+        type: boolean
+        default: false
+
+env:
+  # If your registry sits behind Cloudflare with its 100 MB body cap,
+  # use a LAN endpoint for pushes (bypasses CF) and the public hostname
+  # for pulls (response bodies aren't capped).
+  REGISTRY_PUSH: <lan-host>:<port>
+  REGISTRY_PULL: <public-registry-hostname>
+  IMAGE: <owner>/<product>-docs-mcp
+
+  # Embedder. One URL per GPU; the indexer round-robins.
+  OLLAMA_URL: http://<gpu-host>:11434
+  EMBED_MODEL: nomic-embed-text
+
+  PRODUCT_NAME: <product>
+
+jobs:
+  refresh:
+    runs-on: docker
+    container:
+      image: catthehacker/ubuntu:act-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # Full history — required for the digest-history step to
+          # walk git log. Default fetch-depth: 1 silently produces a
+          # 0-byte history file.
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install -q --upgrade pip
+          python -m pip install -q -r requirements.txt
+
+      # ---- Phase 1: scrape ---------------------------------------
+      - name: Refresh bundle catalog
+        run: python -m scrape.bundles
+
+      - name: Re-scrape all bundles
+        # --force re-fetches every page so we actually see upstream
+        # edits. Without it the runner skips pages already on disk.
+        run: python -m scrape.runner --all --force --concurrency 6
+
+      # ---- Build the digest history BEFORE committing ------------
+      # See PLAN.md Phase 13. Walks recent corpus-touching commits
+      # and writes corpus/.digest/history.jsonl. The current refresh
+      # gets added on the NEXT run's history (one-week lag is fine).
+      - name: Build digest history
+        run: |
+          mkdir -p corpus/.digest
+          python -m scrape.changelog \
+              --history-out corpus/.digest/history.jsonl \
+              --history-days 120
+
+      # ---- Commit + retry-on-race --------------------------------
+      - name: Commit corpus changes (if any)
+        id: commit
+        run: |
+          git config user.name "<product>-docs-refresh"
+          git config user.email "actions@<your-domain>"
+          git add -f bundles.json corpus  # -f: corpus/ is listed in .gitignore
+          if git diff --cached --quiet; then
+            echo "no corpus changes — skipping reindex and image build"
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          echo "changed=true" >> "$GITHUB_OUTPUT"
+          python -m scrape.changelog --cached > /tmp/changelog.txt
+          summary=$(head -1 /tmp/changelog.txt)
+          ts=$(date -u +"%Y-%m-%dT%H:%MZ")
+          {
+            echo "weekly refresh: ${ts} — ${summary}"
+            echo ""
+            cat /tmp/changelog.txt
+          } > /tmp/commitmsg.txt
+          git commit -F /tmp/commitmsg.txt
+          # Retry on race: if main moved while we were scraping (a
+          # human merged a PR during the run), `git push` rejects
+          # with "fetch first". Rebase our corpus commit onto new
+          # main and retry. Corpus + code paths are disjoint, so
+          # the rebase is trivially clean.
+          attempt=1
+          while [ $attempt -le 3 ]; do
+            if git push; then
+              echo "pushed corpus changes (attempt $attempt)"
+              break
+            fi
+            if [ $attempt -eq 3 ]; then
+              echo "push still failing after 3 attempts — bailing"
+              exit 1
+            fi
+            git fetch origin main
+            git rebase origin/main || { echo "rebase conflict — bailing"; exit 1; }
+            attempt=$((attempt + 1))
+          done
+
+      # ---- Reindex Chroma + BM25 ---------------------------------
+      - name: Rebuild indexes
+        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
+        run: python -m rag.index --rebuild
+
+      # ---- Build & push image ------------------------------------
+      - name: Log in to registry (LAN endpoint)
+        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
+        run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u <user> --password-stdin
+
+      - name: Build & push image
+        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
+        # Runner shell is /bin/sh — use cut instead of ${VAR::N}. Tags:
+        # :latest (Watchtower), :<YYYY.MM.DD>, and :<sha12> (rollback pin)
+        # from HEAD, not GITHUB_SHA, so it names the corpus commit shipped.
+        run: |
+          SHA_TAG=$(git rev-parse HEAD | cut -c1-12)
+          DATE_TAG=$(date -u +%Y.%m.%d)
+          docker build \
+            -t "${REGISTRY_PUSH}/${IMAGE}:latest" \
+            -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
+            -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \
+            .
+          docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
+          docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
+          docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}"
+
+      # ---- Registry GC -------------------------------------------
+      - name: Prune old container versions
+        if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
+        env:
+          GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
+        run: |
+          python scripts/registry_gc.py \
+            --owner <user> \
+            --package <product>-docs-mcp \
+            --keep-days 90 \
+            --keep-latest 5
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fbc0883
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,31 @@
+# Virtualenv
+venv/
+.venv/
+
+# Regenerable: corpus/ by re-scraping; chroma/ and bm25/ from corpus in CI
+corpus/
+chroma/
+bm25/
+
+# Python detritus
+__pycache__/
+*.py[cod]
+*.egg-info/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+
+# Eval results (regenerable; commit only the headline baseline if you want)
+# eval/results/
+
+# Usage logs (host-mounted volume in prod; don't commit dev logs)
+var/
+
+# Local-only env
+.env
+.env.local
+
+# IDE
+.vscode/
+.idea/
+*.swp
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..4d4da98
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,232 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when
+working with code in this repository.
+
+## Purpose
+
+This is a **template** for building an MCP server over a product's
+public documentation. When you (Claude) are working in a clone of this
+repo, you are helping the user implement one specific product's docs
+MCP — not editing the template itself.
+
+**Read `PLAN.md` first.** It's the canonical build guide and lays out
+13 phases. Most user requests will be "implement Phase N" or "we hit
+a bug in Phase N." Identify the phase before doing anything else.
+
+## Working with this template
+
+### Identifying the current phase
+
+When the user clones this template and starts working, figure out
+which phase they're on by inspecting:
+
+| Signal | Likely phase |
+|---|---|
+| `corpus/` doesn't exist | Phase 1 (scraper) — they need to build it before anything else works |
+| `corpus/` exists, `chroma/` doesn't | Phase 2 (indexing) |
+| Indexes exist, only `search_docs` / `get_page` / `list_versions` implemented | Phase 3 (server skeleton done; next: Dockerfile + CI) |
+| `Dockerfile` or `.gitea/workflows/` not yet customized for the product | Phase 4–5 |
+| `RERANK_URL` env unset in compose | Phase 6 not done |
+| `HYBRID_SEARCH` env unset, no `rag/bm25.py` content | Phase 8 not done |
+| No `eval/results/` directory | Phase 7 not done |
+| `find_doc_inconsistencies` / `submit_doc_bug` are commented-out stubs in `docs_mcp/server.py` | Phase 12 |
+| No `corpus/.digest/` produced by CI | Phase 13 |
+
+When in doubt, ask the user: *"Which phase from PLAN.md are we
+working on?"*
+
+### The scaffolded server has stubs
+
+`docs_mcp/server.py` ships with three working tools (`search_docs`,
+`get_page`, `list_versions`) and signature-only stubs for the
+phase-specific tools. The stubs `raise NotImplementedError` with a
+phase hint in the docstring. When implementing a phase, you'll be
+filling these bodies in — DO NOT change the signatures unless the
+user has a specific reason. Signatures are the public contract
+between the MCP and its clients (Claude Desktop, Claude Code,
+Cursor, etc.).
+
+## Layout
+
+```
+.
+├── PLAN.md                       # Read first. Phase-by-phase build guide.
+├── README.md                     # Quick-start summary.
+├── CLAUDE.md                     # This file.
+├── requirements.txt
+├── Dockerfile
+├── deploy/docker-compose.yml
+├── .gitea/workflows/
+│   ├── refresh.yml               # Weekly cron: scrape + index + image
+│   └── image-only.yml            # On-demand: code-only ship cycle
+├── scrape/                       # Phase 1 — product-specific scraper here
+│   └── changelog.py              # Reusable: --json, --history-out
+├── rag/                          # Phase 2/8 — indexing
+│   ├── embeddings.py             # Ollama embedder (swappable)
+│   ├── chunk.py                  # Page → chunks (adjust per page format)
+│   ├── index.py                  # Builds Chroma + BM25
+│   └── bm25.py                   # SQLite FTS5 lexical index
+├── docs_mcp/                     # Phase 3+ — MCP server
+│   ├── server.py                 # FastMCP + tool definitions
+│   └── usage.py                  # TimedCall telemetry
+├── eval/                         # Phase 7 — golden-query harness
+│   ├── queries.jsonl.example
+│   ├── retrievers.py
+│   └── run_eval.py
+├── scripts/                      # Standalone ops scripts
+│   ├── usage_report.py
+│   └── registry_gc.py
+└── deploy/
+    └── docker-compose.yml
+```
+
+## Conventions
+
+### Tool docstrings are user interface
+
+The text in `@mcp.tool()` docstrings is what the LLM sees and uses to
+decide whether to call the tool. Treat it like a button label.
+*"Use when..."*, *"Call proactively whenever..."* phrasings work
+well. Don't bury the headline in implementation notes.
+
+### Side-effecting tools must be env-gated AND operator-confirmed
+
+Any tool that POSTs to an external service (submit_doc_bug being the
+canonical example):
+
+1. Must check an env flag at call time and return a "disabled,
+   manual fallback at <URL>" message if unset.
+2. Must have a loud docstring requiring per-call operator
+   confirmation in the LLM conversation flow (the LLM drafts, shows
+   the operator the exact payload, asks yes/no, only then calls).
+3. Must do upfront validation (URL allowlist, content length, etc.)
+   so the LLM gets a clean error instead of a wire-level failure.
+
+Match the `submit_doc_bug` patterns documented in PLAN.md Phase 12.
+
+### Defensive fallback for retrieval components
+
+The reranker, BM25 index, and any external dependency must fail
+gracefully:
+
+- Catch the specific exception type
+- Log a warning with enough info to debug
+- Fall back to a working baseline (dense-only, no reranker, etc.)
+- Never block a search_docs call on a single failure
+
+The user's MCP is in front of real people; partial degradation
+beats a 500.
+
+### Verify retrieval changes with the eval harness
+
+Any change that touches retrieval (new embedder, chunker tweak,
+reranker model, filter shape) ships with eval numbers in the commit
+message. Don't ship retrieval changes on vibes. If `eval/queries.jsonl`
+isn't populated yet, populate it before changing retrieval — it's
+the most important file in the repo.
+
+### Standard infrastructure choices
+
+These are reasoned defaults — only deviate if you have a specific
+need:
+
+- **Embedding model**: `nomic-embed-text` via Ollama (768-dim,
+  free, on-prem)
+- **Reranker**: `jina-reranker-v2-base` GGUF via llama.cpp
+  `/v1/rerank` endpoint
+- **Vector store**: Chroma `PersistentClient`
+- **Lexical store**: SQLite FTS5 (stdlib)
+- **Fusion**: Reciprocal Rank Fusion with k=60
+- **Transport**: streamable-HTTP in prod, stdio for local dev
+- **MCP framework**: FastMCP with `stateless_http=True`
+- **Container deploy**: Watchtower auto-pull on `:latest`, rollback
+  via `:<sha12>` pin
+
+### Naming the product
+
+The template uses `PRODUCT_NAME` env var (defaults to `"myproduct"`)
+throughout. Set it on first build. References show up in:
+
+- `docs_mcp/server.py` — `FastMCP(f"{PRODUCT_NAME}-docs", ...)`
+- Collection name (`<product>_docs`)
+- BM25 db filename
+- Tool names that include the product name (e.g., the `_api_lessons`
+  tool — convention is to name it `<product>_api_lessons`)
+
+Use lowercase, underscores not hyphens, since it ends up in tool
+identifiers that the LLM reads.
+
+## Common commands
+
+```bash
+# Set up dev environment
+python -m venv venv && source venv/bin/activate
+pip install -r requirements.txt
+
+# Run the MCP server locally for Claude Desktop dev
+python -m docs_mcp.server --transport stdio
+
+# Run as HTTP for integration testing
+python -m docs_mcp.server --transport streamable-http --port 8000
+
+# Rebuild Chroma + BM25 indexes from corpus
+python -m rag.index --rebuild
+
+# Rebuild only BM25 (fast iteration)
+python -m rag.index --bm25-only
+
+# Run the eval harness
+python -m eval.run_eval --queries eval/queries.jsonl --output eval/results/baseline.md
+
+# Generate changelog summary (called by CI, useful locally too)
+python -m scrape.changelog --cached
+python -m scrape.changelog --history-out corpus/.digest/history.jsonl --history-days 120
+```
+
+## Gotchas (carried forward from the reference build)
+
+- **`fetch-depth: 0` on `actions/checkout@v4`** in both workflows.
+  Default is shallow; history-walking steps (changelog, digest)
+  silently produce empty output otherwise. This is the #1 thing
+  people miss.
+- **Reranker per-pair token limit**: jina-reranker GGUF rejects the
+  ENTIRE batch if any doc exceeds `n_ctx_train=1024`. Truncate docs
+  to ~2000 chars before sending to rerank. Full chunk text still
+  goes back to the user; truncation is reranking-only.
+- **FastMCP `stateless_http=True`**: critical for production
+  hosting behind Watchtower auto-updates. Without it, every
+  container recreate produces a 404 storm from clients with
+  stale session IDs.
+- **Runner shell is `/bin/sh` (dash)**: no `${VAR::N}` substring
+  expansion in workflow scripts. Use `cut`/`awk`/`printf`.
+- **Cloudflare 100 MB body cap**: if pushing through a Cloudflare-
+  fronted registry, push via LAN endpoint, pull via public
+  hostname. Same registry, different URLs.
+
+## When the user says...
+
+| User says | You do |
+|---|---|
+| "Let's start building" / "set up the project" | Read PLAN.md Phase 0; create dirs, requirements.txt, etc. Confirm Python version and existing tooling. |
+| "Build the scraper" / "scrape the docs" | Read PLAN.md Phase 1. Find the upstream portal's underlying API by sniffing; AVOID headless-browser solutions unless the API path is truly closed. |
+| "Get retrieval working" / "make search work" | Read PLAN.md Phase 2-3. Implement chunking, embedder, Chroma indexer, then the three baseline tools. |
+| "Add a reranker" | Read PLAN.md Phase 6. Stand up the llama.cpp sidecar, implement `_rerank()`. Verify with the eval harness. |
+| "Search is missing X queries" | Run the eval harness first to confirm the failure. Then consider: rich chunk-0 rewrites, hybrid retrieval, curated knowledge layer. Don't just tune cosine. |
+| "Let's add hybrid search" | Read PLAN.md Phase 8. Only after you've established the failure mode with eval queries — hybrid is not free. |
+| "Make a tool that submits doc bugs" | Read PLAN.md Phase 12. Find the docs portal's feedback endpoint by sniffing. Build with operator confirmation as a hard requirement in the tool docstring. |
+| "I want a 'what changed' tool" | Read PLAN.md Phase 13. Don't try to do this at runtime — pre-bake the history JSONL at CI time. |
+
+## Out-of-scope concerns (don't try to solve here)
+
+- **Reverse proxy / TLS termination** — outside the repo. User
+  picks Caddy / Cloudflare Tunnel / nginx / Traefik based on their
+  infra.
+- **MetaMCP or other gateway** — outside the repo. Optional, only
+  matters when running multiple MCPs.
+- **GPU container orchestration** — outside the repo. Pattern is
+  one Ollama container per GPU; the indexer load-balances. Document
+  it in deploy/ but don't build it in this template.
+- **Email/blog delivery for weekly_digest** — out of scope per
+  PLAN.md ("Out of scope" section). Add a separate script in
+  scripts/ if/when the user asks.
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..f4e4d14
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+# Docs MCP server — production image.
+#
+# Structure: dependencies and code first, the regenerable corpus and
+# indexes last, so a corpus-only refresh reuses the cached code layers.
+#
+# The container runs the MCP server via streamable-http on PORT 8000.
+# Override via MCP_HOST / MCP_PORT env if you front it with a different
+# reverse-proxy setup.
+
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# Install Python deps first for cacheability.
+COPY requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Code.
+COPY scrape /app/scrape
+COPY rag /app/rag
+COPY docs_mcp /app/docs_mcp
+
+# Catalog. Written by the scraper at CI time.
+COPY bundles.json /app/
+
+# Regenerable indexes. CI builds these from corpus/ in the same job
+# that builds the image. Listed last because they change on every
+# refresh; a change to any earlier layer rebuilds these COPY layers.
+#
+# bm25/ is only consulted when HYBRID_SEARCH=true (the server falls
+# back to dense-only if it's missing).
+COPY corpus /app/corpus
+COPY chroma /app/chroma
+COPY bm25 /app/bm25
+
+ENV PYTHONUNBUFFERED=1 \
+    MCP_TRANSPORT=streamable-http \
+    MCP_HOST=0.0.0.0 \
+    MCP_PORT=8000
+
+EXPOSE 8000
+
+ENTRYPOINT ["python", "-m", "docs_mcp.server"]
diff --git a/PLAN.md b/PLAN.md
new file mode 100644
index 0000000..ca0f327
--- /dev/null
+++ b/PLAN.md
@@ -0,0 +1,647 @@
+# Docs MCP Server — Build Guide
+
+A reusable recipe for building a hosted MCP server over a product's
+public documentation. Distilled from one production build; everything
+product-specific has been factored out.
+
+The end product is a streamable-HTTP MCP server with ~15 tools that
+any LLM client (Claude Desktop, Claude Code, Cursor, Copilot) can
+call to answer questions against the docs, surface what changed
+recently, find inconsistencies, and (optionally) submit doc bugs
+back upstream.
+
+---
+
+## What you're building
+
+A pipeline with these stages:
+
+```
+upstream docs portal
+        │
+        ▼
+   scrape  ──► corpus/<bundle>/<page>.md + .json sidecar
+        │
+        ▼
+    chunk + embed  ──► chroma/  (dense vectors)
+        │           ──► bm25/   (FTS5 lexical index)
+        ▼
+   MCP server  ──► search_docs / get_page / diff_versions / weekly_digest /
+                   find_doc_inconsistencies / submit_doc_bug / ...
+        │
+        ▼
+   reverse proxy / Cloudflare Tunnel ──► public endpoint
+```
+
+Two CI cadences:
+
+- **Weekly cron** (~40 min): full re-scrape, re-chunk, re-embed,
+  image build & push.
+- **On-demand image-only** (~18 min): code-only rebuild from
+  committed corpus, image build & push.
+
+A container registry (self-hosted Gitea works well), a host running
+Docker Compose, Watchtower auto-updating from `:latest`, and a
+reverse proxy in front.
+
+---
+
+## Build phases
+
+Each phase is a discrete, shippable unit. Build them in order; each
+one is useful on its own and unlocks the next. Realistic effort per
+phase is given as a rough order of magnitude. Total: roughly 2–3
+weeks of focused work for the full stack.
+
+### Phase 0 — Project skeleton  *(half a day)*
+
+Goals: directory layout, dependency manifest, virtualenv.
+
+- Top-level dirs: `scrape/`, `corpus/` (gitignored), `rag/`,
+  `docs_mcp/`, `eval/`, `scripts/`, `deploy/`, `.gitea/workflows/`.
+- `requirements.txt` with the dependencies you'll need across all
+  phases (FastMCP, chromadb, httpx, beautifulsoup4 or whatever HTML
+  parser, ollama or sentence-transformers client, etc.).
+- `python -m venv venv` and pin Python version (3.11 or 3.12 — be
+  conservative; some embedding libraries have version-specific
+  wheels).
+- `.gitignore`: `venv/`, `corpus/` (regenerable), `chroma/`
+  (regenerable), `bm25/` (regenerable), `*.pyc`, `__pycache__/`,
+  `.pytest_cache/`.
+
+### Phase 1 — Scraper  *(2–4 days, product-specific)*
+
+This is the most product-dependent phase. The goal is to write a
+scraper that produces a normalized corpus layout regardless of
+upstream portal shape.
+
+Output shape (mandatory):
+
+```
+corpus/
+  <bundle_id>/             # one dir per "doc bundle" — see Glossary
+    <page_id>.md           # markdown body
+    <page_id>.json         # sidecar with structured metadata
+  ...
+bundles.json               # catalog of bundles with metadata
+```
+
+**Bundle metadata** (`bundles.json` is a list of these):
+
+```jsonc
+{
+  "slug":          "<bundle_id>",
+  "title":         "User-facing title",
+  "version":       "10.9",
+  "platform":      "VMware vSphere",   // may be null
+  "product":       "Admin Guide",       // optional but useful
+  "language":      "en-US",
+  "page_count":    127,
+  "dates": {
+    "Added on":    "2024-01-15",
+    "Updated on":  "2026-05-20"
+  },
+  "landing_page":  "<page_id>"
+}
+```
+
+**Per-page sidecar** (`<page_id>.json`) carries page-level metadata.
+The one field that matters cross-cutting is `topic_cluster` (see
+Phase 9):
+
+```json
+{
+  "bundle_id":     "<bundle_id>",
+  "page_id":       "<page_id>",
+  "title":         "How to ...",
+  "ordinal":       42,
+  "topic_cluster": {
+    "clustering_title": "How to ...",
+    "clustered_topics": [
+      {"bundle_id": "...10.8", "page_id": "How_to_X.htm", "clustering_title": "..."},
+      {"bundle_id": "...10.9", "page_id": "How_to_X.htm", "clustering_title": "..."}
+    ]
+  }
+}
+```
+
+If the portal exposes a cross-version "this page corresponds to that
+page" mapping, capture it here. If it doesn't, you can synthesize a
+filename-based fallback (same filename across bundle versions = same
+topic) and live without the editor-curated mapping. The features that
+read `topic_cluster` (`list_cluster`, `diff_versions`,
+`find_doc_inconsistencies`, parts of `weekly_digest`) will work
+either way; they're more accurate with real clusters.
+
+**Patterns that recur across doc portals:**
+
+- Most modern doc portals are SPAs. Plain `requests.get` won't see
+  rendered content. Either find the underlying API the SPA calls (the
+  cheapest, most reliable path), or fall back to a headless browser
+  (Playwright). The API path is almost always available; sniff the
+  network tab.
+- Portals usually expose a "bundle/topic" hierarchy under the hood
+  (Zoomin, Madcap Flare, Paligo, GitBook, Docusaurus all do). Map
+  it to `bundles.json` + `corpus/<bundle>/<page>`.
+- Many portals expose `?save_local=` or `.pdf` rendered versions; the
+  HTML they serve is structurally cleaner than what the page shows
+  through the SPA shell.
+
+**`scrape/changelog.py`** (~250 LOC; see Phase 13) — provides
+`summarize_diff()`, `render_human()`, `walk_history()` and the
+`--json` / `--history-out` modes. Mostly reusable as-is; the only
+product-specific bit is the path layout assumption.
+
+### Phase 2 — Chunking + embeddings + Chroma  *(2 days)*
+
+Goal: build a queryable dense index from the scraped corpus.
+
+- `rag/chunk.py` — split each page's markdown into ~400-600 token
+  chunks. Strategy that works: paragraph-aware splitter with a
+  rich "chunk 0" containing the page title + 1-sentence summary +
+  bag-of-words from key terms. Chunk 0 is what dense retrieval lands
+  on first; getting it right dominates retrieval quality.
+- `rag/embeddings.py` — pluggable embedder. Recommended start:
+  Ollama-hosted `nomic-embed-text` (768-dim, free, good baseline).
+  Other defensible choices: `text-embedding-3-small` (OpenAI),
+  `bge-m3` (also via Ollama). The embedder is a Chroma
+  `EmbeddingFunction` that returns `list[list[float]]` for a list
+  of texts.
+- `rag/index.py` — orchestrates: read corpus → emit chunks (with
+  metadata: bundle_id, page_id, version, platform, ordinal) →
+  upsert into Chroma collection. `--rebuild` flag for a clean
+  reindex. Run via `python -m rag.index --rebuild`.
+
+Chroma settings: `PersistentClient(path="chroma/")` and
+`Settings(anonymized_telemetry=False)`. Single collection
+(`<product>_docs`).
+
+**GPU note**: embedding 70K chunks on CPU takes hours; on a GPU
+(via Ollama with `NVIDIA_VISIBLE_DEVICES`) takes ~10 minutes. Two
+GPUs in parallel: ~5 minutes. The orchestrator just needs to load-
+balance HTTP requests across multiple Ollama endpoints.
+
+### Phase 3 — MCP server skeleton  *(1 day)*
+
+Goal: working FastMCP server with three tools — `search_docs`,
+`get_page`, `list_versions`.
+
+- `docs_mcp/server.py` — `FastMCP("<product>-docs", stateless_http=True)`.
+  `stateless_http=True` is critical for production hosting: every
+  request creates an ephemeral session, so container recreates don't
+  produce a 404 storm from stale `mcp-session-id` headers on
+  clients.
+- Lazy initialization for everything expensive (Chroma client,
+  embedder, bundles catalog) so the server starts cleanly even when
+  Ollama is briefly unreachable.
+- Tool: `search_docs(query, version=None, platform=None,
+  bundle_id=None, k=10)`. Returns markdown of top-k chunks with full
+  source URLs.
+- Tool: `get_page(bundle_id, page_id)`. Returns full page markdown +
+  metadata.
+- Tool: `list_versions()`. Returns the version/platform facets
+  available, drawn from `bundles.json`. Helps the LLM pick filter
+  values.
+
+Transports: stdio (for local Claude Desktop dev), streamable-HTTP
+(for hosted production). One argparse switch.
+
+```python
+@mcp.tool()
+def search_docs(
+    query: Annotated[str, Field(description="Natural-language query about <product>.")],
+    version: Annotated[str | None, Field(description="Restrict to one version")] = None,
+    ...
+) -> str:
+    ...
+```
+
+The tool descriptions are first-class context — the LLM reads them
+and decides whether to call the tool. Treat them as button labels;
+use "Call when..." / "Use proactively whenever..." phrasings.
+
+### Phase 4 — Containerization  *(1 day)*
+
+Goal: image you can run anywhere.
+
+- `Dockerfile`: Python 3.12-slim base, install requirements, COPY
+  `scrape rag diff docs_mcp` + `bundles.json` + `corpus/ chroma/`
+  + (later) `bm25/`. Don't COPY `scripts/` — those stay external
+  for ops use only.
+- `ENTRYPOINT ["python", "-m", "docs_mcp.server",
+  "--transport", "streamable-http"]`. Configurable host/port via env.
+- `deploy/docker-compose.yml`: one service, persistent volumes (named
+  or host bind mounts) for usage logs and any state, Watchtower label,
+  depends_on for the reranker sidecar (Phase 6).
+
+Smoke-test locally: `docker compose up` should expose
+`http://localhost:8000/mcp` and respond to an MCP `initialize` JSON-RPC.
+
+### Phase 5 — CI on self-hosted Gitea Actions  *(1–2 days)*
+
+Goal: weekly cron rebuild + on-demand code-only ship cycle.
+
+**Two workflows, two cadences:**
+
+| Workflow | Trigger | Steps | Runtime |
+|---|---|---|---|
+| `refresh.yml` | Monday cron + manual dispatch | scrape → commit corpus → rebuild indexes → build & push image | ~40 min |
+| `image-only.yml` | manual dispatch only | rebuild indexes from committed corpus → build & push image | ~18 min |
+
+**Critical settings (learned the hard way):**
+
+- `fetch-depth: 0` on `actions/checkout@v4`. The default depth is 1
+  (shallow), which breaks any step that walks git history (changelog,
+  digest history walker). Pay the ~10 second cost; never debug a
+  "0-byte history file" mystery.
+- `runs-on: docker` (Gitea convention, not `ubuntu-latest`).
+- Runner shell is `/bin/sh` (dash), not bash. `${VAR::N}` substring
+  expansion doesn't exist; use `cut` / `printf` / `awk`.
+
+**Retry-on-race pattern for long-running scrapes:**
+
+```bash
+attempt=1  # POSIX sh only: the Gitea runner shell is dash, not bash
+while [ $attempt -le 3 ]; do  # at most three push attempts
+  if git push; then
+    echo "pushed (attempt $attempt)"
+    break
+  fi
+  [ $attempt -eq 3 ] && { echo "still failing"; exit 1; }  # out of retries
+  git fetch origin main  # push was rejected: pick up whatever landed on main meanwhile
+  git rebase origin/main || { echo "conflict — bail"; exit 1; }  # never auto-resolve
+  attempt=$((attempt + 1))
+done
+```
+
+Works because scrape commits only touch `corpus/` + `bundles.json`,
+and code merges only touch `.py` / `.yml` — disjoint paths, trivially
+clean rebases.
+
+**Image tagging — three tags per build:**
+
+| Tag | Purpose |
+|---|---|
+| `:latest` | Watchtower watches this for auto-deploy |
+| `:<sha12>` | Immutable; rollback target |
+| `:<YYYY.MM.DD>` | Human-readable in incident notes |
+
+Same tag set on every build; rollback is a one-line compose edit
+to pin `:<sha>` instead of `:latest`.
+
+**Container registry behind Cloudflare:**
+
+Cloudflare's free tier has a 100 MB request body limit. Big image
+layers (Chroma index can easily be 800+ MB) exceed it on push. The
+fix is a LAN registry endpoint for push, public hostname for pull:
+
+```yaml
+env:
+  REGISTRY_PUSH: <lan-ip>:<port>     # bypasses Cloudflare
+  REGISTRY_PULL: <public-hostname>   # response bodies aren't capped
+```
+
+Runner needs the LAN endpoint in `/etc/docker/daemon.json`
+`insecure-registries`. Costs nothing operationally; saves hours
+of debugging.
+
+**Registry GC:** weekly cron in the workflow that walks the package
+versions, keeps `:latest` + N most-recent date tags + anything
+pushed in the last 90 days, deletes the rest. Worth ~50 LOC; the
+package GC on the Gitea side reclaims disk after.
+
+### Phase 6 — Reranker  *(half a day)*
+
+Goal: lift retrieval quality 3× by cross-encoder reranking the top-N
+dense candidates.
+
+- A `/v1/rerank` HTTP endpoint backed by `llama.cpp` serving
+  `jina-reranker-v2-base` (GGUF). Runs as a sidecar in compose.
+  GPU strongly recommended (CPU latency is unworkable for live
+  queries).
+- `_rerank(query, docs)` helper in the server: POST to the endpoint,
+  apply the scores, re-sort the top-N candidates. Defensive: on any
+  failure log a warning and fall through to dense-only.
+- Env: `RERANK_URL` (off by default), `RERANK_POOL` (how deep to
+  pull candidates for reranking; 200 is a good default),
+  `RERANK_TIMEOUT` (30s for cold-start tolerance).
+- **Watch the per-pair token limit.** Jina's GGUF reports
+  `n_ctx_train=1024` and llama.cpp will reject the ENTIRE batch if
+  any pair exceeds it. Truncate doc text to ~2000 chars before
+  reranking. The full untruncated chunk still goes back to the user;
+  truncation is only for the reranker scoring path.
+
+### Phase 7 — Eval harness  *(1 day)*
+
+Goal: hand-curated golden queries + standard metrics so you can
+measure the impact of any retrieval change.
+
+- `eval/queries.jsonl`: 20–25 hand-curated queries with expected
+  pages. Spread across versions, platforms, and difficulty levels.
+  Include the queries that "obviously" should work and DON'T —
+  those are the ones to track.
+- `eval/retrievers.py`: a `Retriever` protocol with concrete
+  implementations: `DenseRetriever`, `RerankedRetriever`,
+  `BM25Retriever` (Phase 8), `HybridRetriever` (Phase 8). One
+  matrix dimension per knob.
+- `eval/run_eval.py`: computes MRR / Recall@5 / nDCG@5 across all
+  retrievers; emits a markdown comparison table at
+  `eval/results/<baseline>.md`. Commit the result so PRs land with
+  the A/B evidence in the diff.
+
+Three numbers are enough — don't overengineer. The hand-curated
+queries are the value; the metrics are just a stable way to score
+them.
+
+### Phase 8 — BM25 + Hybrid retrieval  *(half a day, conditional)*
+
+**Skip unless your eval shows specific failure modes.** Dense
+embeddings + cross-encoder reranker handle most queries. The case
+where they don't: queries with rare technical tokens (filenames,
+language names, error codes) get buried at dense rank 1000+ by a
+much larger prose corpus that's semantically nearby. The reranker
+only sees top-200, so it never gets a shot.
+
+- `rag/bm25.py`: SQLite FTS5 index, in the stdlib, on-disk
+  (`bm25/<product>_docs.db`). Two tables — metadata table keyed by
+  rowid, FTS5 virtual table for full-text. Sanitize the query
+  (strip FTS5 reserved keywords, OR-join tokens for recall). ~210
+  LOC.
+- `_rrf_fuse()` in the server — Reciprocal Rank Fusion with `k=60`.
+  Per-id score = `sum_over_retrievers(1 / (k + rank))`. Returns
+  ordered ids plus per-retriever contribution dict for telemetry.
+- `search_docs` hybrid path: run dense + BM25 in parallel,
+  RRF-fuse, hand the merged top-200 to the reranker. Env-gated:
+  `HYBRID_SEARCH=true`.
+- Log `top1_source` per call (`dense_only` / `bm25_only` / `both`)
+  to usage logs so you can measure whether BM25 is actually earning
+  its keep on production traffic.
+
+If after 4–6 weeks of production data you see `bm25_only >= 80%`,
+you can simplify to BM25-only (much less infrastructure). If
+`both >= 50%`, hybrid is acting as tie-breaker not rescue — keep it
+or simplify depending on how much you care about the long tail.
+
+### Phase 9 — Multi-version diff tooling  *(1 day, if applicable)*
+
+**Only relevant if the product has multiple maintained versions.**
+
+- `diff_versions(bundle_id, page_id, against_bundle_id)`: unified
+  diff between two versions of the same page. Two matching
+  strategies: editor-curated `topic_cluster` peer (if the portal
+  exposes it), or same-filename fallback.
+- `list_cluster(bundle_id, page_id)`: list cross-version peers
+  for one page.
+- `bundle_changelog(bundle_id_new, bundle_id_old)`: added /
+  removed / changed pages between two bundles, sorted by churn.
+- `_diff_churn(a, b)`: small helper, ~15 LOC of
+  `difflib.unified_diff(..., n=0)` line counting. Used by `bundle_changelog`,
+  `find_doc_inconsistencies`, and `weekly_digest`.
+
+### Phase 10 — Usage logging  *(half a day)*
+
+Goal: per-call JSONL telemetry so you can answer "what are people
+actually asking for" and "is the new feature getting used."
+
+- `docs_mcp/usage.py`: `TimedCall` context manager that captures
+  tool name, args, elapsed time, hits returned, any extra fields
+  set by the tool via `_call.set(key=value)`. Writes JSONL to
+  `var/logs/usage.jsonl`, rotated daily, kept 90 days.
+- Mount the log dir on a persistent volume (the template compose file
+  bind-mounts a host directory) so logs survive container recreates.
+- `scripts/usage_report.py` (standalone, no docs_mcp deps): reads
+  the JSONL files, prints per-tool counts, top queries, 0-hit
+  queries, filter usage histogram, reranker activity. Markdown
+  output flag for piping into weekly digest emails.
+
+What to log: query text, filters, hits returned, elapsed_ms,
+reranker_fired flag, hybrid top1_source, retrieval_mode. What NOT
+to log: anything PII-shaped. The corpus is public, queries are
+usually about the product, not personal — but be deliberate.
+
+### Phase 11 — Curated knowledge layer  *(2 days)*
+
+The "RAG can't tell you what isn't in the docs" gap. Surfaces:
+
+- **API quickstart repos** if the product has them. Ingest the
+  example scripts (Python, PowerShell, curl) into the corpus.
+  Rewrite chunk-0 for each script to embed naturally — explicit
+  natural-language H1, task description sentence, keyword bag.
+  Dense embeddings need an anchor.
+- **A curated `<product>_api_lessons` markdown doc** for things
+  the swagger / OpenAPI doesn't say: auth flow gotchas, async-task
+  patterns, schema bugs you've hit, platform-detection quirks.
+  Surface as a dedicated MCP tool whose description tells the LLM:
+  *"Call proactively whenever the user asks you to write a script
+  / integrate with the API / debug a 4xx response."*
+- **An auto-hint banner** in `search_docs` results — when the
+  query matches a script/API trigger word, render a one-line nudge
+  at the top of results pointing at the dedicated tool. Belt-and-
+  suspenders for queries where the LLM doesn't think to call it
+  proactively.
+
+### Phase 12 — Doc-bug workflow tools  *(1 day, optional)*
+
+Two tools that pair up to enable a *"check the docs for
+inconsistencies, draft bugs, confirm, submit"* workflow.
+
+- `find_doc_inconsistencies(scope_query, version=None, platform=None,
+  max_pages=30, checks=None)`: deterministic, read-only. Two checks:
+  cross-version drift (pages whose content shifted between a version
+  and its immediate predecessor, in the actionable 10–60% churn band) and
+  redirect-chain detection (short pages whose body is just a "see
+  [other page] for details" pointer). Heavy lifting is line-level
+  diff (`difflib`) against editor-curated cluster peers; the model
+  judges which findings are real bugs.
+
+- `submit_doc_bug(page_url, content, email=None, rating=None,
+  like=None)`: POSTs to the docs portal's feedback endpoint.
+  Env-gated by `DOC_BUG_SUBMIT_ENABLED=true` so dev/staging
+  deployments can't accidentally hit the upstream. The tool's
+  docstring is loud about a mandatory operator-confirmation
+  workflow per submission — LLM must draft, show, ask, then
+  submit. Explicit *"do not loop"* instruction. Defensive
+  validation upfront (URL host matches expected portal, content
+  non-empty, etc.) so the LLM gets a clean error instead of a
+  rejected POST.
+
+**You'll need to find the docs portal's feedback endpoint.** Most
+portals route the "Was this helpful?" widget through a backend
+API; sniff the browser network tab on the live site. The payload
+shape varies; common fields: content/body, page url/href, optional
+email, optional rating, optional thumbs. Most accept anonymous
+POSTs with no captcha at the JSON-API layer (even if the widget
+shows a captcha). Validate before you ship — and if the endpoint
+has rate limits or captcha enforcement, the tool returns a clean
+"submission rejected — paste manually at <url>" fallback.
+
+The whole point is the per-bug operator confirmation in the
+LLM-side conversation flow; the tool description enforces it. Do
+not bypass.
+
+### Phase 13 — Weekly digest tool  *(half a day)*
+
+Goal: a tool that answers *"what changed in the docs in the last N
+days?"* with no runtime git dependency (the prod container has no
+git).
+
+- Extend `scrape/changelog.py` with `--json` (one-shot structured
+  output) and `--history-out PATH` (walks `git log --first-parent
+  --since="<N> days ago"` for corpus-touching commits, writes one
+  JSON line per commit to a JSONL file).
+- CI workflows write the JSONL file into the image at build time:
+  `corpus/.digest/history.jsonl`. Both `refresh.yml` and
+  `image-only.yml`. **`fetch-depth: 0` is required** — see Phase 5.
+- New MCP tool `weekly_digest(days=7, version=None, platform=None,
+  max_bundles=25, max_pages_per_bundle=10)`: reads the JSONL,
+  filters to the window, applies version/platform via
+  `bundles.json` metadata, aggregates per-bundle change counts and
+  page lists, renders markdown.
+- Post-filter totals are critical: the headline "X page changes
+  across Y bundles" must compute X from the filtered set, not the
+  raw record count. Otherwise filtered calls look wrong to the
+  reader.
+
+Out of scope but trivial bolt-ons: scheduled HTML email of the
+digest, auto-publish to a blog, per-page diff excerpts as a
+follow-up tool.
+
+---
+
+## Standard tool set
+
+By the end you'll have ~15 tools registered. Production-tested
+shape:
+
+| Tool | What it does |
+|---|---|
+| `search_docs` | Semantic search with version/platform/bundle filters |
+| `get_page` | Full markdown + metadata for one page |
+| `list_versions` | Discover available facet values |
+| `list_cluster` | Cross-version peers for one page (if applicable) |
+| `diff_versions` | Unified diff of a page across two versions |
+| `bundle_changelog` | Added / removed / changed pages between two bundles |
+| `weekly_digest` | What changed in the last N days, with filters |
+| `corpus_status` | Freshness + size of the knowledge base |
+| `find_doc_inconsistencies` | Scoped scan for doc bugs |
+| `submit_doc_bug` | Submit a drafted bug (env-gated, operator-confirmed) |
+| `<product>_api_lessons` | Curated API gotchas, proactively-called |
+| product-specific tools | Interop matrix, lifecycle queries, etc. |
+
+---
+
+## Per-product customization checklist
+
+When applying this template to a new product, here's what you have
+to figure out yourself — everything else is shared infrastructure:
+
+- **Doc portal mechanics**
+  - URL pattern for pages
+  - Bundle/version concept (Zoomin "bundle", Madcap "project",
+    GitBook "space", Docusaurus "docs version" — same idea, different
+    name)
+  - SPA backing API (sniff the network tab) or fallback to
+    headless browser
+  - How `topic_cluster`-equivalent cross-version peers are exposed
+    (or whether you synthesize them from filenames)
+- **Bundle metadata schema**
+  - What does `version` look like? Semver, calendar, named?
+  - What does `platform` mean for this product? Is there a useful
+    facet at all?
+  - Other useful facets (language, product line, edition)?
+- **Filterable facets** for `search_docs`
+  - One filter per high-cardinality facet
+  - Skip filters that have <5 distinct values — they're not worth
+    the surface area
+- **Feedback endpoint** (for `submit_doc_bug`, if you want it)
+  - URL of the POST endpoint
+  - Required + optional payload fields
+  - Captcha / rate-limit behavior
+  - Whether anonymous submissions are accepted
+- **Curated knowledge** for the `_api_lessons` tool
+  - What does the product's API documentation NOT say that you've
+    learned from real integration work?
+- **Quickstart / example repos**
+  - Does the vendor publish working code? Ingest it; rewrite
+    chunk-0 for natural-language retrieval.
+
+---
+
+## Decisions worth carrying forward
+
+Things you'll save time on by deciding the same way again:
+
+- **Tool descriptions are user interface.** The LLM reads them
+  verbatim and decides whether to call the tool. *"Use when..."*
+  and *"Call proactively whenever..."* are real surfaces; treat
+  them like button labels. Most retrieval improvements turn out
+  to be tool-description rewrites in disguise.
+- **`stateless_http=True`** on the FastMCP server. Eliminates
+  whole categories of session-ID-related 404 storms after
+  container recreates.
+- **Pre-bake everything at CI time.** No runtime calls to git,
+  external services, or anything you wouldn't trust on a
+  Cloudflare outage. If the digest needs git history, write a
+  JSONL file at CI time. If the lessons doc needs to load fast,
+  bake it into the image.
+- **Env-gate every side-effecting tool.** Off by default in dev;
+  on only in production compose. Belt and suspenders against
+  accidental writes from staging environments.
+- **Operator-confirmation pattern for side-effecting tools.**
+  The tool docstring is the only place to enforce
+  human-in-the-loop. Make it loud. "MANDATORY", "Do not loop",
+  "show-confirm-then-submit" — those phrasings work.
+- **Verify with hand-curated golden queries before shipping any
+  retrieval change.** Numbers in the diff, in the commit message.
+  Don't ship retrieval changes on vibes.
+- **Two-cadence CI** (weekly scrape vs on-demand code-only)
+  saves hours per code iteration once you're past the
+  one-iteration-a-week stage.
+- **Rolling tag + sha-pinned tag** deploy pattern. `:latest` is
+  what Watchtower watches; `:<sha>` is your safety net. Rollback
+  is a one-line compose edit, not a redeploy.
+- **Usage logging is non-negotiable.** You will be wrong about
+  what people use. Capture the truth from day one; let it tell
+  you which features to keep building and which to delete.
+
+---
+
+## Glossary
+
+- **Bundle** — one logical doc set in the portal. Zoomin calls
+  them bundles; Madcap calls them projects; the concept is the
+  same: a versioned, titled collection of pages. One dir under
+  `corpus/`.
+- **Page** — one HTML page in a bundle. One `.md` + one `.json`
+  sidecar under the bundle dir.
+- **Topic cluster** — Zoomin's name for "this page in version
+  10.9 corresponds to that page in version 10.8." Stored in the
+  per-page sidecar. The portal-agnostic concept is "cross-version
+  peer mapping."
+- **Chunk** — a unit of text that gets independently embedded and
+  stored in Chroma. Target ~400-600 tokens; preserve paragraph
+  boundaries.
+- **RRF** — Reciprocal Rank Fusion. The way to merge two ranked
+  lists from independent retrievers without score calibration.
+
+---
+
+## What's deliberately NOT in this template
+
+Decisions you should make per-product (not copy from the original
+build):
+
+- The reverse proxy and TLS termination layer. Could be Caddy,
+  nginx, Traefik, Cloudflare Tunnel — pick what your infra uses.
+- The Gateway / aggregator in front of multiple MCPs (MetaMCP is one
+  option; you may not need any aggregator if you're running a
+  single product MCP).
+- The specific embedding model — `nomic-embed-text` is a strong
+  default but newer / domain-specific models may be better for
+  some products.
+- The Ollama containers / GPU setup — depends on what hardware you
+  have. The pattern is one container per GPU with explicit
+  `NVIDIA_VISIBLE_DEVICES` pinning; the indexer load-balances
+  across them.
+- Whether to publish a blog series alongside the build. Strongly
+  recommended (forces clarity, builds an audience), but optional.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..52cd328
--- /dev/null
+++ b/README.md
@@ -0,0 +1,104 @@
+# docs-mcp-template
+
+A reusable template for building hosted MCP servers over a product's
+public documentation. Distilled from one production build; everything
+product-specific has been factored out.
+
+The end product is a streamable-HTTP MCP server with ~15 tools that
+any LLM client (Claude Desktop, Claude Code, Cursor, Copilot) can
+call to answer questions against the docs, surface what changed
+recently, find inconsistencies, and (optionally) submit doc bugs
+back upstream.
+
+## What's here
+
+- **[PLAN.md](PLAN.md)** — comprehensive build guide. Phased
+  approach (13 phases, ~2–3 weeks of focused work for the full
+  stack). Includes the design decisions, the gotchas, and a
+  per-product customization checklist.
+- **Scaffolded skeleton** — working FastMCP server with stub tools,
+  Dockerfile, docker-compose, CI workflows, eval harness layout,
+  usage logging. Everything you need to `git clone` and start
+  filling in the product-specific bits.
+
+## Quick start
+
+```bash
+git clone https://git.jpaul.io/justin/docs-mcp-template.git my-product-docs
+cd my-product-docs
+git remote remove origin  # detach from template
+python -m venv venv && source venv/bin/activate
+pip install -r requirements.txt
+
+# Read PLAN.md before doing anything else. Pay particular attention to
+# Phase 1 (scraper) — that's the most product-specific phase.
+
+# Run the stub server (no corpus yet — just verifies the wiring):
+python -m docs_mcp.server --transport stdio
+```
+
+## Repo layout
+
+```
+.
+├── PLAN.md                        # The build guide. Read first.
+├── README.md
+├── requirements.txt
+├── Dockerfile
+├── .gitignore
+├── .gitea/workflows/
+│   ├── refresh.yml                # Weekly scrape + index + image push
+│   └── image-only.yml             # On-demand code-only ship
+├── scrape/
+│   ├── README.md                  # Product-specific scraper goes here
+│   └── changelog.py               # Reusable: --json, --history-out
+├── rag/
+│   ├── embeddings.py              # Ollama embedder, swappable
+│   ├── chunk.py                   # Chunker — adjust per page format
+│   ├── index.py                   # Builds Chroma + (optionally) BM25
+│   └── bm25.py                    # SQLite FTS5 lexical index
+├── docs_mcp/
+│   ├── server.py                  # FastMCP server with stub tools
+│   └── usage.py                   # TimedCall + JSONL telemetry
+├── eval/
+│   ├── queries.jsonl.example      # Curate ~25 hand-labeled queries
+│   ├── retrievers.py              # Retriever protocol + implementations
+│   └── run_eval.py                # MRR / Recall@k / nDCG@k harness
+├── scripts/
+│   ├── usage_report.py            # Standalone log analyzer
+│   └── registry_gc.py             # Container registry cleanup
+└── deploy/
+    └── docker-compose.yml         # Hosting stack: MCP + reranker + Watchtower
+```
+
+## What's product-specific (must implement)
+
+- `scrape/` — the scraper itself. The template gives you the corpus
+  layout contract and a working `changelog.py`; the actual extraction
+  logic is yours.
+- The corpus on disk (gitignored; rebuilt by CI).
+- The reranker GGUF model and llama.cpp container (defined in
+  `deploy/docker-compose.yml`).
+- The reverse proxy / TLS layer in front of the public endpoint.
+- The hand-curated knowledge surface (your product's API gotchas,
+  example scripts, anything the LLM should know that the docs
+  don't say).
+
+## What's NOT product-specific (works as-is)
+
+- FastMCP server skeleton + tool decoration pattern
+- Chroma + Ollama embedding pipeline
+- BM25 / SQLite FTS5 lexical index
+- Hybrid retrieval (RRF) + reranker integration
+- Eval harness (Retriever protocol, MRR/Recall/nDCG)
+- Usage logging (TimedCall, JSONL, daily rotation)
+- CI workflow shape (weekly + on-demand, retry-on-race, three-tag
+  image scheme)
+- Registry GC script
+- Standard tools: `search_docs`, `get_page`, `list_versions`,
+  `diff_versions`, `bundle_changelog`, `weekly_digest`,
+  `find_doc_inconsistencies`, `submit_doc_bug`, etc.
+
+## License
+
+Internal template. Adjust before publishing.
diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml
new file mode 100644
index 0000000..0aa05a8
--- /dev/null
+++ b/deploy/docker-compose.yml
@@ -0,0 +1,111 @@
+# Hosting stack for a docs MCP server.
+#
+# Replace <product> below with your product name on first deploy.
+# Volumes: usage logs are mounted to a host path so they survive
+# Watchtower-driven container recreates.
+#
+# This template assumes a reverse proxy / Cloudflare Tunnel terminates
+# TLS in front of port 8000. Adjust if your infra differs.
+
+services:
+
+  # The MCP server. Watchtower redeploys it whenever :latest changes.
+  <product>-docs-mcp:
+    image: <registry>/<owner>/<product>-docs-mcp:latest
+    container_name: <product>-docs-mcp
+    restart: unless-stopped
+    ports:
+      - "8000:8000"  # plain HTTP; TLS is terminated by the proxy/tunnel in front
+    environment:
+      PRODUCT_NAME: "<product>"
+      PRODUCT_DOCS_URL: "https://docs.example.com"
+
+      # Streamable-HTTP transport. The server runs with
+      # stateless_http=True, so clients don't lose sessions when
+      # Watchtower recreates the container.
+      MCP_TRANSPORT: streamable-http
+      MCP_HOST: 0.0.0.0
+      MCP_PORT: "8000"
+
+      # Hostnames allowed by the rebind check. The compose DNS name is
+      # listed so a gateway (e.g. MetaMCP) can reach <product>-docs-mcp:8000;
+      # add any other hostname here. "*" disables the check entirely.
+      MCP_ALLOWED_HOSTS: "<product>-docs-mcp,localhost,127.0.0.1"
+
+      # Phase 6 — reranker sidecar (jina-reranker-v2-base via llama.cpp).
+      RERANK_URL: http://<product>-rerank:8080
+      RERANK_POOL: "200"
+      RERANK_TIMEOUT: "30"
+
+      # Phase 8 — hybrid retrieval (BM25 + dense + RRF). Shipped on here;
+      # set "false" until your eval harness shows the dense-only path
+      # missing technical-term queries that BM25 catches.
+      HYBRID_SEARCH: "true"
+
+      # Phase 10 — usage telemetry.
+      USAGE_LOG_DIR: /app/var/logs
+      USAGE_LOG_KEEP_DAYS: "90"
+
+      # Phase 12 — doc-bug submission gate. Off by default; on only
+      # in production after you've verified the endpoint contract.
+      DOC_BUG_SUBMIT_ENABLED: "false"
+      # DOC_BUG_API_URL: "https://docs-be.example.com/api/feedback"
+    volumes:
+      # Host bind mount: usage logs persist across container recreates.
+      - ./<product>-docs-mcp-logs:/app/var/logs
+    depends_on:  # start order only: the reranker container is started first
+      - <product>-rerank
+    labels:
+      # Watchtower polls *only* containers with this label set true.
+      com.centurylinklabs.watchtower.enable: "true"
+    networks:
+      - mcp
+
+  # Reranker sidecar — llama.cpp serving jina-reranker-v2-base on :8080
+  # (the RERANK_URL target). Requires GPU access; adjust devices for your hardware.
+  <product>-rerank:
+    image: ghcr.io/ggml-org/llama.cpp:server-cuda
+    container_name: <product>-rerank
+    restart: unless-stopped
+    # Mount the GGUF model dir from the host, read-only. Download from
+    # huggingface (gguf-org/jina-reranker-v2-base-multilingual-GGUF) first.
+    volumes:
+      - /path/to/models:/models:ro  # must contain the file named by --model
+    command: >
+      --model /models/jina-reranker-v2-base.Q8_0.gguf
+      --reranking
+      --host 0.0.0.0
+      --port 8080
+      --n-gpu-layers 99
+      --ctx-size 4096
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1  # one GPU reserved for this container
+              capabilities: [gpu]
+    networks:
+      - mcp
+
+  # Watchtower — polls the registry and redeploys when :latest changes.
+  # Only watches containers labeled `com.centurylinklabs.watchtower.enable=true`.
+  watchtower:
+    image: containrrr/watchtower:latest
+    container_name: watchtower
+    restart: unless-stopped
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    environment:
+      WATCHTOWER_POLL_INTERVAL: "300"   # seconds (5 min)
+      WATCHTOWER_LABEL_ENABLE: "true"
+      WATCHTOWER_CLEANUP: "true"        # remove old images after pull
+    # If your registry requires auth, add a docker config to the
+    # `volumes:` list above (a second `volumes:` key would be a duplicate):
+    #      - ./registry-auth.json:/config.json:ro
+    networks:
+      - mcp
+
+networks:
+  mcp:  # joined by all three services above
+    driver: bridge
diff --git a/docs_mcp/__init__.py b/docs_mcp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docs_mcp/server.py b/docs_mcp/server.py
new file mode 100644
index 0000000..28b1345
--- /dev/null
+++ b/docs_mcp/server.py
@@ -0,0 +1,263 @@
+"""MCP server skeleton — fill in PRODUCT_NAME and the tool bodies.
+
+This file is the template's structural anchor. The phases described in
+PLAN.md add or extend pieces of this file:
+
+  Phase 3  — search_docs, get_page, list_versions stubs (you are here)
+  Phase 6  — reranker integration in search_docs
+  Phase 8  — BM25 + hybrid retrieval (HYBRID_SEARCH env gate, _rrf_fuse)
+  Phase 9  — diff_versions, list_cluster, bundle_changelog
+  Phase 10 — TimedCall wiring (already imported below)
+  Phase 11 — <product>_api_lessons tool
+  Phase 12 — find_doc_inconsistencies, submit_doc_bug
+  Phase 13 — weekly_digest + _digest_history reader
+
+Every stub below has a docstring + `raise NotImplementedError`. Replace
+the body when you reach the corresponding phase. Keep the signatures
+stable across products — clients depend on them.
+"""
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+from pathlib import Path
+from typing import Annotated
+
+from mcp.server.fastmcp import FastMCP
+from pydantic import Field
+
+from .usage import TimedCall
+
+log = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Product-specific configuration. Set these for each new build.
+# ---------------------------------------------------------------------------
+PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct")
+PRODUCT_DOCS_URL = os.environ.get("PRODUCT_DOCS_URL", "https://docs.example.com")
+COLLECTION = f"{PRODUCT_NAME}_docs"
+
+# Paths inside the deployed container (and matching layout locally for dev).
+ROOT = Path(__file__).resolve().parent.parent
+CORPUS = ROOT / "corpus"
+CHROMA_DIR = ROOT / "chroma"
+BM25_DB = Path(os.environ.get("BM25_DB", str(ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db")))
+BUNDLES_JSON = ROOT / "bundles.json"
+
+# ---------------------------------------------------------------------------
+# Feature flags (Phase 6 / 8 / 12 enable these as you ship each phase).
+# ---------------------------------------------------------------------------
+RERANK_URL = os.environ.get("RERANK_URL", "").rstrip("/") or None
+RERANK_POOL = int(os.environ.get("RERANK_POOL", "50"))
+RERANK_TIMEOUT = float(os.environ.get("RERANK_TIMEOUT", "30"))
+
+HYBRID_SEARCH = os.environ.get("HYBRID_SEARCH", "").lower() in ("true", "1", "yes", "on")
+RRF_K = int(os.environ.get("RRF_K", "60"))
+
+DOC_BUG_SUBMIT_ENABLED = os.environ.get("DOC_BUG_SUBMIT_ENABLED", "").lower() in ("true", "1", "yes", "on")
+DOC_BUG_API_URL = os.environ.get("DOC_BUG_API_URL", "")  # product-specific endpoint
+DOC_BUG_TIMEOUT = float(os.environ.get("DOC_BUG_TIMEOUT", "15"))
+
+
+# ---------------------------------------------------------------------------
+# FastMCP setup.
+#
+# stateless_http=True — every request creates an ephemeral session and
+# discards it on return. Critical for production: clients don't get
+# 404 storms when the container is recreated by Watchtower.
+# ---------------------------------------------------------------------------
+mcp = FastMCP(f"{PRODUCT_NAME}-docs", stateless_http=True)
+
+
+# ---------------------------------------------------------------------------
+# Lazy helpers — instantiate expensive things only when actually needed,
+# so the server still starts when (e.g.) Ollama is briefly unreachable.
+# ---------------------------------------------------------------------------
+
+def _bundles() -> dict[str, dict]:
+    """Load bundles.json into a {slug: bundle_dict} mapping.
+
+    Re-read from disk on every call (no caching), so a re-scrape is picked
+    up without a restart. Returns {} if the catalog is missing. The file
+    is written by the Phase 1 scraper; see PLAN.md Phase 1 for the schema.
+    """
+    if not BUNDLES_JSON.exists():
+        return {}
+    return {b["slug"]: b for b in json.loads(BUNDLES_JSON.read_text(encoding="utf-8"))}
+
+
+def _build_where(version: str | None, platform: str | None, bundle_id: str | None) -> dict | None:
+    """Build a Chroma `where` filter from the optional facet arguments.
+
+    Falsy arguments are skipped. No filters yields None, one filter
+    yields a bare `{field: value}` dict, and several are wrapped in
+    `{"$and": [...]}` in version, platform, bundle_id order.
+    """
+    facets = (("version", version), ("platform", platform), ("bundle_id", bundle_id))
+    clauses: list[dict] = [{field: value} for field, value in facets if value]
+    if len(clauses) > 1:
+        return {"$and": clauses}
+    # Zero clauses means "no filter" (None); one clause is returned bare
+    # rather than wrapped in a single-element $and.
+    return clauses[0] if clauses else None
+
+
+def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None:
+    """Read a corpus page as (markdown_body, metadata_dict); None if missing or outside CORPUS."""
+    md_path, json_path = ((CORPUS / bundle_id / (page_id + ext)).resolve() for ext in (".md", ".json"))
+    # Ids are client-supplied: reject "..", absolute parts or symlinks that escape the corpus.
+    if not all(CORPUS.resolve() in p.parents and p.exists() for p in (md_path, json_path)):
+        return None
+    return md_path.read_text(), json.loads(json_path.read_text())
+
+
+# ===========================================================================
+# Tools
+# ===========================================================================
+
+@mcp.tool()
+def search_docs(
+    query: Annotated[str, Field(description=f"Natural-language query about {PRODUCT_NAME}.")],
+    version: Annotated[
+        str | None,
+        Field(description="OPTIONAL version filter — restrict to one product version."),
+    ] = None,
+    platform: Annotated[
+        str | None,
+        Field(description="OPTIONAL platform filter. Set to one of the platforms listed by list_versions(); omit for all platforms."),
+    ] = None,
+    bundle_id: Annotated[
+        str | None,
+        Field(description="OPTIONAL bundle filter — pin to a specific doc bundle slug."),
+    ] = None,
+    k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10,
+) -> str:
+    """Search the product documentation corpus.
+
+    Returns the top-k most relevant chunks (with full source page URLs)
+    given a natural-language query. Optional filters narrow the search
+    to one version, one platform, or one bundle. Use list_versions()
+    first if you need to discover the available facet values.
+
+    Call this tool whenever the user asks anything that should be
+    answerable from the official product documentation.
+    """
+    with TimedCall("search_docs", {
+        "query": query, "version": version, "platform": platform,
+        "bundle_id": bundle_id, "k": k,
+    }) as _call:
+        # TODO Phase 2-3: query Chroma collection (see rag/index.py for
+        # how it was built). Render the top-k chunks as markdown with
+        # source URLs.
+        # TODO Phase 6: optional reranker via _rerank() if RERANK_URL set.
+        # TODO Phase 8: hybrid retrieval if HYBRID_SEARCH=true — run
+        # dense + BM25 in parallel, RRF-fuse, hand merged pool to rerank.
+        _call.set(hits_returned=0)
+        raise NotImplementedError("Phase 2/3: implement Chroma query + rendering")
+
+
+@mcp.tool()
+def get_page(
+    bundle_id: Annotated[str, Field(description="Bundle slug.")],
+    page_id: Annotated[str, Field(description="Page filename within the bundle.")],
+) -> str:
+    """Return the complete markdown body of a single corpus page.
+
+    Follow-up to search_docs for when the whole page is needed, not just
+    the matched chunks. Unknown pages yield a "Page not found" message.
+    """
+    with TimedCall("get_page", {"bundle_id": bundle_id, "page_id": page_id}) as _call:
+        page = _read_page(bundle_id, page_id)
+        if page is not None:
+            body, _metadata = page
+            _call.set(found=True, page_chars=len(body))
+            # TODO: prepend a metadata header (title, version, source
+            # URL) built from _metadata. Product-specific shape.
+            return body
+        _call.set(found=False)
+        return f"Page not found: {bundle_id}/{page_id}"
+
+
+@mcp.tool()
+def list_versions() -> str:
+    """Summarise the version and platform facets found in bundles.json.
+
+    Call this to learn which `version` / `platform` values search_docs
+    accepts as filters.
+    """
+    with TimedCall("list_versions", {}) as _call:
+        catalog = _bundles()
+        if not catalog:
+            return "_(no bundles indexed yet — run the scraper + indexer)_"
+        bundles = list(catalog.values())
+        versions = sorted({b["version"] for b in bundles if b.get("version")})
+        platforms = sorted({b["platform"] for b in bundles if b.get("platform")})
+        _call.set(versions=len(versions), platforms=len(platforms))
+        out = [f"# Facets across {len(catalog)} bundle(s)", ""]
+        if versions:
+            # Trailing "" leaves a blank line before the next section.
+            out += ["## Versions", "", *(f"- `{v}`" for v in versions), ""]
+        if platforms:
+            out += ["## Platforms", "", *(f"- `{p}`" for p in platforms)]
+        return "\n".join(out)
+
+
+# ---------------------------------------------------------------------------
+# Stubs for later phases — keep the signatures in this file so refactors
+# don't lose the contracts. Implementations come per phase.
+# ---------------------------------------------------------------------------
+
+# @mcp.tool()  # Phase 9
+# def list_cluster(bundle_id: str, page_id: str) -> str: ...
+
+# @mcp.tool()  # Phase 9
+# def diff_versions(bundle_id: str, page_id: str, against_bundle_id: str, context: int = 3) -> str: ...
+
+# @mcp.tool()  # Phase 9
+# def bundle_changelog(bundle_id_new: str, bundle_id_old: str, min_churn: int = 5, max_changed: int = 50) -> str: ...
+
+# @mcp.tool()  # Phase 13
+# def weekly_digest(days: int = 7, version: str | None = None, platform: str | None = None, ...) -> str: ...
+
+# @mcp.tool()  # Phase 9 (or 3 — useful early)
+# def corpus_status() -> str: ...
+
+# @mcp.tool()  # Phase 11
+# def myproduct_api_lessons(topic: str | None = None) -> str: ...
+
+# @mcp.tool()  # Phase 12
+# def find_doc_inconsistencies(scope_query: str, ...) -> str: ...
+
+# @mcp.tool()  # Phase 12
+# def submit_doc_bug(page_url: str, content: str, email: str | None = None, ...) -> str: ...
+
+
+# ===========================================================================
+# Entry point
+# ===========================================================================
+
+def main() -> None:
+    """CLI entry point: parse --transport/--host/--port (defaults from MCP_* env) and run the server."""
+    import argparse
+    p = argparse.ArgumentParser(description=f"{PRODUCT_NAME} docs MCP server")
+    p.add_argument("--transport", choices=["stdio", "streamable-http", "sse"],
+                   default=os.environ.get("MCP_TRANSPORT", "stdio"))
+    p.add_argument("--host", default=os.environ.get("MCP_HOST", "0.0.0.0"))
+    p.add_argument("--port", type=int, default=int(os.environ.get("MCP_PORT", "8000")))
+    args = p.parse_args()
+
+    if args.transport == "stdio":
+        mcp.run()
+    else:
+        mcp.settings.host, mcp.settings.port = args.host, args.port
+        # DNS-rebinding protection defaults to localhost-only — disable for
+        # container-network DNS hostnames. See PLAN.md "Hosting" notes.
+        if os.environ.get("MCP_DISABLE_DNS_REBINDING_PROTECTION", "").lower() in ("true", "1", "yes", "on"):
+            mcp.settings.transport_security.enable_dns_rebinding_protection = False
+        mcp.run(transport=args.transport)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs_mcp/usage.py b/docs_mcp/usage.py
new file mode 100644
index 0000000..e4ad4b2
--- /dev/null
+++ b/docs_mcp/usage.py
@@ -0,0 +1,127 @@
+"""Per-call usage telemetry — JSONL with daily rotation and retention.
+
+Reusable as-is across products. Drop the import + `with TimedCall(...)`
+into any tool body and the call gets logged with the tool name, args,
+elapsed time, and any extra fields the tool sets via `_call.set(...)`.
+
+The log file is `var/logs/usage.jsonl` by default (override with the
+`USAGE_LOG_DIR` env). Daily rotation; files older than
+`USAGE_LOG_KEEP_DAYS` (default 90) are deleted on next write.
+
+Layout of one record:
+
+    {
+      "ts":           "2026-05-22T13:14:15+00:00",
+      "tool":         "search_docs",
+      "args":         {"query": "...", "version": "10.9", "k": 10},
+      "elapsed_ms":   142.5,
+      "hits_returned": 7,           # optional, set by the tool
+      "reranked":     true,         # optional, set by the tool
+      // ... any other key the tool sets via _call.set(...)
+    }
+"""
+from __future__ import annotations
+
+import json
+import os
+import time
+import threading
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any
+
+
+USAGE_LOG_DIR = Path(os.environ.get("USAGE_LOG_DIR", "var/logs"))
+USAGE_LOG_KEEP_DAYS = int(os.environ.get("USAGE_LOG_KEEP_DAYS", "90"))
+
+# Single global lock to serialize writes from multiple request handlers.
+# JSONL appends are atomic at the OS level for short records on most
+# filesystems, but the lock is cheap and saves you from cross-platform
+# surprises.
+_lock = threading.Lock()
+_last_rotation_check: float = 0.0
+
+
+def _maybe_rotate() -> None:
+    """Rotate usage.jsonl → usage.jsonl.<mtime-date> and prune old files.
+
+    Checked at most every 5 minutes, except that the first call of a new
+    UTC day always checks: otherwise a throttled call's write just after
+    midnight bumps the mtime to "today" and yesterday's file never rotates.
+    """
+    global _last_rotation_check
+    now = time.time()
+    today = datetime.now(timezone.utc).date()
+    last_day = datetime.fromtimestamp(_last_rotation_check, tz=timezone.utc).date()
+    if now - _last_rotation_check < 300 and last_day == today:
+        return
+    _last_rotation_check = now
+
+    USAGE_LOG_DIR.mkdir(parents=True, exist_ok=True)
+    active = USAGE_LOG_DIR / "usage.jsonl"
+    if active.exists():
+        try:
+            mtime = datetime.fromtimestamp(active.stat().st_mtime, tz=timezone.utc).date()
+            if mtime < today:
+                rotated = USAGE_LOG_DIR / f"usage.jsonl.{mtime.isoformat()}"
+                if not rotated.exists():
+                    active.rename(rotated)
+        except OSError:
+            pass  # best-effort: keep appending to the active file
+
+    # Retention: delete dated files older than the window; never the active file.
+    cutoff = today - timedelta(days=USAGE_LOG_KEEP_DAYS)
+    for f in USAGE_LOG_DIR.glob("usage.jsonl.*"):
+        try:
+            if datetime.fromisoformat(f.name.split(".", 2)[-1]).date() < cutoff:
+                f.unlink()
+        except (ValueError, OSError):
+            continue
+
+
+class TimedCall:
+    """Context manager that captures one tool call's telemetry record.
+
+    Usage:
+
+        with TimedCall("search_docs", {"query": q, ...}) as call:
+            ... do the work ...
+            call.set(hits_returned=len(results), reranked=True)
+
+    On exit, appends one JSONL record to usage.jsonl. An exception in the
+    body is recorded in the `error` field and still propagates. Logging is
+    best-effort: an OSError while rotating or writing is dropped, not raised.
+    """
+
+    def __init__(self, tool: str, args: dict[str, Any]):
+        self.tool, self.args = tool, args
+        self.extra: dict[str, Any] = {}
+        self._t0: float = 0.0
+
+    def set(self, **kwargs: Any) -> None:
+        """Attach extra fields to the eventual telemetry record."""
+        self.extra.update(kwargs)
+
+    def __enter__(self) -> "TimedCall":
+        self._t0 = time.perf_counter()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        record: dict[str, Any] = {
+            "ts":         datetime.now(timezone.utc).isoformat(),
+            "tool":       self.tool,
+            "args":       self.args,
+            "elapsed_ms": round((time.perf_counter() - self._t0) * 1000.0, 2),
+        }
+        if exc_type is not None:
+            record["error"] = f"{exc_type.__name__}: {exc_val}"
+        record.update(self.extra)
+        line = json.dumps(record, separators=(",", ":"), default=str) + "\n"  # str() any non-JSON value
+        try:
+            with _lock:  # rotate + append together so a rename can't race a write
+                _maybe_rotate()
+                USAGE_LOG_DIR.mkdir(parents=True, exist_ok=True)
+                with open(USAGE_LOG_DIR / "usage.jsonl", "a", encoding="utf-8") as fh:
+                    fh.write(line)
+        except OSError:
+            pass  # telemetry must never fail the call or mask its exception
diff --git a/eval/__init__.py b/eval/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/eval/queries.jsonl.example b/eval/queries.jsonl.example
new file mode 100644
index 0000000..ae67214
--- /dev/null
+++ b/eval/queries.jsonl.example
@@ -0,0 +1,4 @@
+{"query": "how to install <product> on Linux", "expected": [{"bundle_id": "Install.Linux.10.0", "page_id": "Installation.htm"}], "tags": ["install", "linux"]}
+{"query": "configure database connection for high availability", "expected": [{"bundle_id": "Admin.10.0", "page_id": "HA_Setup.htm"}], "tags": ["ha", "config"]}
+{"query": "API endpoint to list users", "expected": [{"bundle_id": "API.10.0", "page_id": "Users_API.htm"}], "tags": ["api"]}
+{"query": "what changed between 10.0 and 10.1", "expected": [{"bundle_id": "Release_Notes.10.1", "page_id": "Whats_New.htm"}], "tags": ["release-notes"]}
diff --git a/eval/retrievers.py b/eval/retrievers.py
new file mode 100644
index 0000000..bc06a18
--- /dev/null
+++ b/eval/retrievers.py
@@ -0,0 +1,62 @@
+"""Retriever protocol + concrete implementations.
+
+A single matrix dimension per knob (dense / reranked / bm25 / hybrid)
+so the eval harness can compare them apples-to-apples. Implement these
+once at Phase 7 and reuse them across every retrieval change.
+
+Each retriever returns a ranked list of (bundle_id, page_id) tuples
+deduplicated to the page level (chunks within the same page collapse
+to one entry; the highest-ranked chunk's position wins).
+"""
+from __future__ import annotations
+
+from typing import Protocol, Iterable
+
+
+class Retriever(Protocol):
+    name: str  # short identifier, e.g. "dense" or "bm25" as in the sketches below
+
+    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
+        """Return up to k (bundle_id, page_id) tuples in rank order."""
+        ...
+
+
+def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> list[tuple[str, str]]:
+    """Collapse ranked (bundle_id, page_id, chunk_ordinal) hits to unique pages.
+
+    Returns at most k (bundle_id, page_id) pairs in first-seen order, so a
+    page keeps the rank of its first chunk; k <= 0 yields [].
+    """
+    if k <= 0:
+        return []
+    pages: dict[tuple[str, str], None] = {}  # dict used as an insertion-ordered set
+    for bid, pid, _ord in chunk_ids:
+        pages.setdefault((bid, pid), None)
+        if len(pages) >= k:
+            break
+    return list(pages)
+
+
+# TODO Phase 2/3 — implement these once Chroma + the bm25 module are
+# in place. Each one is small (15-30 LOC). The eval harness imports
+# from this module by class name.
+#
+# class DenseRetriever:
+#     name = "dense"
+#     def __init__(self, collection): self.col = collection
+#     def retrieve(self, query, k=10): ...
+#
+# class RerankedRetriever:
+#     name = "dense+rerank"
+#     def __init__(self, collection, rerank_url, pool=200): ...
+#     def retrieve(self, query, k=10): ...
+#
+# class BM25Retriever:
+#     name = "bm25"
+#     def __init__(self, bm25_index): ...
+#     def retrieve(self, query, k=10): ...
+#
+# class HybridRetriever:
+#     name = "bm25+dense+rrf"
+#     def __init__(self, dense, bm25, k_rrf=60): ...
+#     def retrieve(self, query, k=10): ...
diff --git a/eval/run_eval.py b/eval/run_eval.py
new file mode 100644
index 0000000..9ba3aa6
--- /dev/null
+++ b/eval/run_eval.py
@@ -0,0 +1,91 @@
+"""Run all retrievers against eval/queries.jsonl, emit a markdown report.
+
+Metrics computed per retriever:
+
+  MRR        — mean reciprocal rank of the FIRST expected page in the
+               ranked result list (0 if not in top-k).
+  Recall@K   — fraction of expected pages that appear in top-K.
+  nDCG@K     — discounted gain weighted by rank position.
+
+The "right" number depends on what you're measuring. MRR tracks "the
+first-line answer is correct"; Recall@K tracks "everything relevant
+is there to draw from"; nDCG@K is a smoother combination of both.
+For docs-RAG, MRR is usually the headline metric.
+
+Usage:
+
+    python -m eval.run_eval \\
+        --queries eval/queries.jsonl \\
+        --k 5 \\
+        --output eval/results/baseline.md
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import time
+from pathlib import Path
+from typing import Iterable
+
+
+def load_queries(path: Path) -> list[dict]:
+    """Parse a UTF-8 JSONL file into one dict per non-blank line."""
+    return [json.loads(row) for row in path.read_text(encoding="utf-8").split("\n") if row.strip()]
+
+
+def reciprocal_rank(retrieved: list[tuple[str, str]], expected: list[tuple[str, str]]) -> float:
+    """Return 1/rank (1-based) of the first retrieved page that is in `expected`, else 0.0."""
+    wanted = frozenset(expected)
+    ranks = (pos for pos, page in enumerate(retrieved, start=1) if page in wanted)
+    first = next(ranks, None)
+    return 1.0 / first if first is not None else 0.0
+
+
+def recall_at_k(retrieved: list[tuple[str, str]], expected: list[tuple[str, str]], k: int) -> float:
+    """Fraction of `expected` entries found in the first k results; 0.0 when `expected` is empty."""
+    if len(expected) == 0:
+        return 0.0
+    top_k = frozenset(retrieved[:k])
+    return len([page for page in expected if page in top_k]) / len(expected)
+
+
+def ndcg_at_k(retrieved: list[tuple[str, str]], expected: list[tuple[str, str]], k: int) -> float:
+    """Binary-relevance nDCG@k: DCG of the first k results over the ideal DCG (0.0 if that is zero)."""
+    relevant = frozenset(expected)
+    # Ideal ranking: min(len(expected), k) relevant pages occupying the top ranks.
+    ideal = sum(1.0 / math.log2(rank + 1) for rank in range(1, min(len(expected), k) + 1))
+    actual = 0.0
+    for position, page in enumerate(retrieved[:k]):
+        actual += 1.0 / math.log2(position + 2) if page in relevant else 0.0
+    return actual / ideal if ideal else 0.0
+
+
+def main() -> int:
+    """CLI entry point; returns 1 if the queries file is missing, else raises until Phase 7 is wired up."""
+    p = argparse.ArgumentParser()
+    p.add_argument("--queries", type=Path, default=Path("eval/queries.jsonl"))
+    p.add_argument("--k", type=int, default=5)
+    p.add_argument("--output", type=Path, default=Path("eval/results/baseline.md"))
+    args = p.parse_args()
+
+    if not args.queries.exists():
+        print(f"queries file not found: {args.queries}")
+        print("hint: copy eval/queries.jsonl.example and edit")
+        return 1
+
+    queries = load_queries(args.queries)
+    print(f"loaded {len(queries)} queries")
+
+    # TODO Phase 7: instantiate the retrievers you implemented in
+    # eval/retrievers.py and run each one against each query. Aggregate
+    # MRR / Recall@K / nDCG@K per retriever and emit a markdown table to
+    # args.output. Commit the file alongside the PR that changes retrieval.
+    raise NotImplementedError(
+        "Wire up the retrievers in eval/retrievers.py first, then "
+        "fill in this evaluation loop. See PLAN.md Phase 7."
+    )
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/rag/__init__.py b/rag/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/rag/bm25.py b/rag/bm25.py
new file mode 100644
index 0000000..06982e0
--- /dev/null
+++ b/rag/bm25.py
@@ -0,0 +1,277 @@
+"""SQLite FTS5-backed BM25 retrieval over the same chunks Chroma indexes.
+
+Hybrid retrieval (BM25 + dense + Reciprocal Rank Fusion) addresses a
+limit of single-tower dense embeddings: when a query has specific
+technical terms (filenames, language names, error codes, API paths),
+the dense embedding doesn't bridge from the query into a short
+code-focused chunk. The chunk loses to the much larger crowd of
+prose chunks that semantically match the query topic.
+
+BM25 handles this directly. Lexical overlap on rare terms ("python",
+"create_vpg.py", "PROTECTED_SITE_ID", "applyUpgrade") scores those
+chunks high. Fused with the dense ranking via RRF, the hybrid result
+is strictly better than either alone for the queries we've seen
+fail.
+
+Why SQLite FTS5:
+  - In the stdlib. Zero new deps.
+  - On-disk. Same persistence model as Chroma — Docker COPY the dir,
+    `rag.index --rebuild` regenerates from corpus.
+  - Built-in `bm25()` ranking function. No knobs to tune that matter
+    for our use case (k1=1.2, b=0.75 defaults are fine).
+  - Builds 70k+ chunks in seconds. Faster than the Chroma rebuild's
+    embedding step by 100×, so it adds basically nothing to the
+    full-rebuild cycle.
+
+Schema is two tables to keep filtering clean. FTS5 doesn't filter
+nicely on its own columns, so metadata lives in a regular table and
+build() inserts each chunk's text into chunks_fts under the same rowid
+(a plain FTS5 table that stores its own text, not external-content).
+Indexes on version / platform / bundle_id are omitted here for brevity:
+
+    CREATE TABLE chunks_meta (
+        rowid INTEGER PRIMARY KEY AUTOINCREMENT,
+        id TEXT UNIQUE NOT NULL,
+        bundle_id TEXT, page_id TEXT, version TEXT,
+        platform TEXT, product TEXT, ordinal INTEGER
+    );
+    CREATE VIRTUAL TABLE chunks_fts USING fts5(
+        text,
+        tokenize = 'porter unicode61 remove_diacritics 2'
+    );
+
+Queries:
+
+    SELECT m.id, bm25(chunks_fts) AS score
+    FROM chunks_fts
+    JOIN chunks_meta m ON m.rowid = chunks_fts.rowid
+    WHERE chunks_fts MATCH ?
+      AND m.version = ?            -- optional metadata filter
+    ORDER BY bm25(chunks_fts)      -- lower = better in FTS5
+    LIMIT ?;
+"""
+from __future__ import annotations
+
+import logging
+import re
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+log = logging.getLogger(__name__)
+
+# Default location: bm25/<product>_docs.db at the repo root, next to chroma/.
+ROOT = Path(__file__).resolve().parent.parent
+DEFAULT_DB_DIR = ROOT / "bm25"
+DEFAULT_DB_NAME = "<product>_docs.db"
+
+# Columns we expose as filterable metadata. Mirrors what _build_where in
+# docs_mcp/server.py accepts so the same filter dicts work for both
+# Chroma and BM25 without per-retriever translation in the caller.
+FILTER_COLUMNS = ("bundle_id", "page_id", "version", "platform", "product", "ordinal")
+
+
+# Allowlist tokenizer for free-text queries. FTS5's parser chokes on lots
+# of punctuation we routinely see in user queries (".10.9", "?", "VPG's",
+# em-dash, etc.). Rather than blocklist every operator, just keep
+# alphanumerics + a few separators and replace everything else with a
+# space. This loses the ability to phrase-search ("exact match") but we
+# don't expose that to users anyway — they ask natural-language questions
+# and want the answer, not a Boolean DSL.
+_KEEP_RE = re.compile(r"[^A-Za-z0-9_\s]")
+# FTS5 reserves these Boolean operator KEYWORDS at the token level —
+# stripping them avoids accidental phrase-query behavior when a user
+# query happens to contain bare "AND", "OR", "NOT", "NEAR".
+_BOOLEAN_KW_RE = re.compile(r"(?<!\w)(AND|OR|NOT|NEAR)(?!\w)")
+
+
+def _sanitize_query(text: str) -> str:
+    """Turn free text into an FTS5 query that ORs together its bare tokens.
+
+    Three steps:
+
+    1. Everything except ASCII letters, digits, underscore and
+       whitespace becomes a space ("10.9?" → "10 9"), so punctuation
+       can never reach the FTS5 parser.
+    2. The upper-case keywords AND / OR / NOT / NEAR are removed when
+       they stand alone as words.
+    3. The remaining tokens are joined with " OR ". FTS5 would
+       otherwise AND them, which for a natural-language question
+       usually matches nothing; bm25 ranking still favours chunks that
+       contain more of the terms.
+
+    Returns "" when no tokens survive.
+    """
+    without_punct = _KEEP_RE.sub(" ", text)
+    without_keywords = _BOOLEAN_KW_RE.sub(" ", without_punct)
+    terms = without_keywords.split()
+    return " OR ".join(terms)
+
+
+def _where_to_sql(where: dict | None) -> tuple[str, list[Any]]:
+    """Render a Chroma-style filter dict as an SQL suffix plus bind params.
+
+    Handles the shapes ``docs_mcp.server._build_where`` emits:
+
+        None                          → ("", [])
+        {"version": "10.9"}           → ("AND m.version = ?", ["10.9"])
+        {"$and": [{...}, {...}]}      → ("AND m.X = ? AND m.Y = ?", [...])
+
+    Keys outside FILTER_COLUMNS are ignored rather than rejected, so an
+    unrecognised filter widens the match instead of raising. Column
+    names come only from that allowlist; values are always bound as
+    parameters.
+    """
+    if not where:
+        return "", []
+
+    # A top-level "$and" holds a list of equality dicts; anything else
+    # is treated as a single equality dict.
+    conditions = where["$and"] if "$and" in where else [where]
+    clauses: list[str] = []
+    values: list[Any] = []
+    for cond in conditions:
+        for column, value in cond.items():
+            if column not in FILTER_COLUMNS:
+                continue
+            clauses.append(f"m.{column} = ?")
+            values.append(value)
+    if not clauses:
+        return "", []
+    return "AND " + " AND ".join(clauses), values
+
+
+class BM25Index:
+    """Thin wrapper around an FTS5-backed sqlite db.
+
+    Single-writer model. Reads are connection-per-call (sqlite handles
+    concurrency through file locks; for our read-heavy workload that's
+    fine and avoids cross-thread connection sharing issues with the MCP
+    server's request handlers). Connections are closed explicitly because
+    sqlite3's `with con:` block only commits or rolls back.
+    """
+
+    def __init__(self, db_path: Path | None = None):
+        self.db_path = Path(db_path) if db_path else (DEFAULT_DB_DIR / DEFAULT_DB_NAME)
+
+    def _fetchall(self, sql: str, params: list[Any]) -> list[tuple]:
+        """Run one read query on a fresh connection and always close it."""
+        con = sqlite3.connect(self.db_path)
+        try:
+            return con.execute(sql, params).fetchall()
+        finally:
+            con.close()
+
+    # -- build ----------------------------------------------------------
+
+    def build(self, records: list[dict]) -> int:
+        """Rebuild the index from scratch from `records`.
+
+        `records` is the same list ``rag.index.page_records`` produces:
+        ``[{"id": ..., "text": ..., "metadata": {...}}, ...]``. Missing
+        metadata is stored as "" (ordinal as 0). Any existing db file is
+        deleted first. Returns the number of records indexed.
+        """
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        # Drop and recreate. Idempotent rebuild.
+        if self.db_path.exists():
+            self.db_path.unlink()
+        text_cols = ("bundle_id", "page_id", "version", "platform", "product")
+        con = sqlite3.connect(self.db_path)
+        try:
+            con.executescript(self._schema_sql())
+            with con:  # one transaction: commit on success, roll back on error
+                con.executemany(
+                    "INSERT INTO chunks_meta (id, bundle_id, page_id, version, "
+                    "platform, product, ordinal) VALUES (?, ?, ?, ?, ?, ?, ?)",
+                    [
+                        (r["id"], *(r["metadata"].get(c) or "" for c in text_cols),
+                         int(r["metadata"].get("ordinal") or 0))
+                        for r in records
+                    ],
+                )
+                # chunks_meta started empty, so its AUTOINCREMENT rowids are 1..N in order.
+                con.executemany(
+                    "INSERT INTO chunks_fts (rowid, text) VALUES (?, ?)",
+                    [(i, r["text"]) for i, r in enumerate(records, start=1)],
+                )
+        finally:
+            con.close()
+        log.info("bm25: indexed %d chunks → %s", len(records), self.db_path)
+        return len(records)
+
+    # -- query ----------------------------------------------------------
+
+    def query(self, text: str, n: int = 200, where: dict | None = None) -> list[tuple[str, float]]:
+        """Return up to `n` (chunk_id, bm25_score) pairs, lowest score first.
+
+        FTS5's bm25() returns NEGATIVE numbers — more relevant docs have
+        smaller (more negative) scores. We order ASC so the first row is
+        the most relevant. Callers that need a "rank" should enumerate
+        the returned list. Returns [] for an empty query, a missing db
+        file (never created as a side effect) or a sqlite error.
+        """
+        sanitized = _sanitize_query(text)
+        if not sanitized:
+            return []
+        if not self.exists():
+            # connect() would create an empty db file and make exists() lie.
+            log.warning("bm25 index missing: %s", self.db_path)
+            return []
+        where_sql, params = _where_to_sql(where)
+        # FTS5 MATCH wants the unaliased table name on its left, so we use
+        # chunks_fts (no alias) and JOIN by rowid against chunks_meta.
+        sql = (
+            "SELECT m.id, bm25(chunks_fts) AS score "
+            "FROM chunks_fts "
+            "JOIN chunks_meta m ON m.rowid = chunks_fts.rowid "
+            f"WHERE chunks_fts MATCH ? {where_sql} "
+            "ORDER BY bm25(chunks_fts) "
+            "LIMIT ?"
+        )
+        try:
+            rows = self._fetchall(sql, [sanitized, *params, n])
+        except sqlite3.OperationalError as e:
+            # FTS5 syntax error (rare after sanitization) or unreadable db.
+            # Caller decides whether to fall back to dense-only.
+            log.warning("bm25 query failed (%s); query=%r", e, sanitized[:80])
+            return []
+        return [(row[0], float(row[1])) for row in rows]
+
+    def exists(self) -> bool:
+        """Cheap probe — does the index file exist on disk?"""
+        return self.db_path.exists()
+
+    def count(self) -> int:
+        """Number of chunks indexed. 0 if the db is missing or empty."""
+        if not self.exists():
+            return 0
+        try:
+            return self._fetchall("SELECT COUNT(*) FROM chunks_meta", [])[0][0]
+        except sqlite3.OperationalError:
+            return 0
+
+    # -- schema ---------------------------------------------------------
+
+    @staticmethod
+    def _schema_sql() -> str:
+        return """
+        CREATE TABLE chunks_meta (
+            rowid     INTEGER PRIMARY KEY AUTOINCREMENT,
+            id        TEXT UNIQUE NOT NULL,
+            bundle_id TEXT,
+            page_id   TEXT,
+            version   TEXT,
+            platform  TEXT,
+            product   TEXT,
+            ordinal   INTEGER
+        );
+        CREATE INDEX idx_meta_version  ON chunks_meta(version);
+        CREATE INDEX idx_meta_platform ON chunks_meta(platform);
+        CREATE INDEX idx_meta_bundle   ON chunks_meta(bundle_id);
+
+        CREATE VIRTUAL TABLE chunks_fts USING fts5(
+            text,
+            tokenize = 'porter unicode61 remove_diacritics 2'
+        );
+        """
diff --git a/rag/chunk.py b/rag/chunk.py
new file mode 100644
index 0000000..b8d7317
--- /dev/null
+++ b/rag/chunk.py
@@ -0,0 +1,126 @@
+"""Markdown chunker — paragraph-aware, ~400-600 token target.
+
+Adjust the chunking strategy per product if your page format differs
+significantly from prose. The output shape (id, text, metadata) is
+fixed by the downstream Chroma + BM25 indexing in rag/index.py — don't
+change that.
+
+The key knob you'll tune per product is chunk-0. Dense retrieval lands
+on chunk 0 first for most queries. Make it a synthetic chunk built
+from:
+
+  - the page title (as natural-language H1)
+  - a 1-sentence task description (you'll have to generate this — for
+    pages that already have a "## Overview" or "## Introduction" the
+    first sentence usually works)
+  - a keyword bag of important terms (filenames, API names, error
+    codes — the rare technical tokens that BM25 lights up on)
+
+Without a rich chunk 0, dense retrieval gets dominated by the much
+larger prose body, and short pages (script examples, reference cards)
+get buried.
+"""
+from __future__ import annotations
+
+import re
+from typing import Iterator
+
+
+# Character-count heuristic for token budgeting. Tunable — adjust per
+# embedder when the default 4 chars/token is a poor fit.
+CHARS_PER_TOKEN = 4
+TARGET_TOKENS = 500
+TARGET_CHARS = CHARS_PER_TOKEN * TARGET_TOKENS
+
+
+def estimate_tokens(text: str) -> int:
+    """Approximate the token count of ``text`` from its length.
+
+    Integer-divides the character count by ``CHARS_PER_TOKEN`` and never
+    returns less than 1, so even an empty string counts as one token.
+    """
+    approx = len(text) // CHARS_PER_TOKEN
+    return approx if approx > 1 else 1
+
+
+# Opening code fence: three or more backticks or tildes at line start.
+_FENCE_OPEN = re.compile(r"(`{3,}|~{3,})")
+
+
+def split_paragraphs(md: str) -> list[str]:
+    """Split markdown into paragraph-ish blocks.
+
+    Keeps fenced code blocks together (don't slice through ``` or ~~~).
+    Headings start new paragraphs; blank lines outside a fence end them.
+
+    A fence is closed only by a line made up solely of the opener's
+    character and at least as long as the opener, so a ``` line inside a
+    ```` block, or a line such as ```python inside an open fence, does
+    not flip the fence state. A line that merely starts with inline code
+    (```x``` ...) is not treated as a fence at all.
+
+    Args:
+        md: Markdown source of one page.
+
+    Returns:
+        The non-empty blocks, whitespace-stripped, in document order.
+    """
+    blocks: list[str] = []
+    current: list[str] = []
+    fence = ""  # opener of the fence we are inside, "" when outside
+
+    def flush() -> None:
+        block = "".join(current).strip()
+        if block:
+            blocks.append(block)
+        current.clear()
+
+    for line in md.splitlines(keepends=True):
+        stripped = line.strip()
+        if fence:
+            current.append(line)
+            if stripped and set(stripped) == {fence[0]} and len(stripped) >= len(fence):
+                fence = ""
+            continue
+        opener = _FENCE_OPEN.match(stripped)
+        if opener:
+            marker = opener.group(1)
+            # A backtick fence's info string may not contain backticks;
+            # if it does, this is inline code and an ordinary text line.
+            if marker[0] != "`" or "`" not in stripped[opener.end():]:
+                fence = marker
+                current.append(line)
+                continue
+        if stripped.startswith("#"):
+            flush()
+            current.append(line)
+            continue
+        if not stripped:
+            flush()
+            continue
+        current.append(line)
+    flush()
+    return blocks
+
+
+def _is_bare_heading(block: str) -> bool:
+    """True for a block that is a single heading line with no body text."""
+    return block.startswith("#") and "\n" not in block
+
+
+def chunks_from_page(
+    text: str,
+    page_id: str,
+    metadata: dict,
+    target_chars: int | None = None,
+) -> Iterator[dict]:
+    """Yield chunk dicts ready for index.py to upsert.
+
+    The synthetic chunk 0 is the per-product customization point. The
+    default below is a simple title + body-first-paragraph; rewrite
+    for richer retrieval signal (see module docstring).
+
+    Body chunks pack whole paragraphs until adding the next one would
+    exceed the character budget. A chunk never ends on a bare heading:
+    trailing headings move to the start of the next chunk so they stay
+    attached to the text they introduce. A single paragraph larger than
+    the budget is emitted unsplit.
+
+    Args:
+        text: Markdown body of the page.
+        page_id: Page identifier; becomes part of every chunk id.
+        metadata: Base metadata copied into each chunk. Must contain
+            ``bundle_id``; ``title`` is used for chunk 0 when present.
+        target_chars: Soft per-chunk character budget. Defaults to the
+            module-level ``TARGET_CHARS``.
+
+    Yields:
+        ``{"id", "text", "metadata"}`` dicts with ids of the form
+        ``<bundle_id>::<page_id>::<ordinal>``; ordinal 0 is the synthetic
+        chunk. Nothing is yielded for a page with no content.
+
+    Raises:
+        KeyError: If ``metadata`` has no ``bundle_id`` key.
+    """
+    paragraphs = split_paragraphs(text)
+    if not paragraphs:
+        return
+    limit = TARGET_CHARS if target_chars is None else target_chars
+    id_prefix = f"{metadata['bundle_id']}::{page_id}"
+
+    def record(ordinal: int, body: str) -> dict:
+        return {
+            "id":       f"{id_prefix}::{ordinal}",
+            "text":     body,
+            "metadata": {**metadata, "ordinal": ordinal},
+        }
+
+    # ----- Chunk 0: synthetic anchor for dense retrieval ---------
+    title = metadata.get("title") or page_id
+    first_para = next((p for p in paragraphs if not p.startswith("#")), "")
+    # TODO per product: append a keyword bag here (filenames,
+    # API names, error codes) for BM25 + dense joint coverage.
+    yield record(0, f"# {title}\n\n{first_para[:300]}")
+
+    # ----- Body chunks: pack paragraphs up to the budget ---------
+    ordinal = 1
+    buf: list[str] = []
+    buf_chars = 0
+    for p in paragraphs:
+        if buf and buf_chars + len(p) > limit:
+            # Peel headings off the tail so they open the next chunk
+            # instead of dangling at the end of this one.
+            carried: list[str] = []
+            while buf and _is_bare_heading(buf[-1]):
+                carried.insert(0, buf.pop())
+            if buf:
+                yield record(ordinal, "\n\n".join(buf))
+                ordinal += 1
+            buf = carried
+            buf_chars = sum(len(c) for c in carried)
+        buf.append(p)
+        buf_chars += len(p)
+    if buf:
+        yield record(ordinal, "\n\n".join(buf))
diff --git a/rag/embeddings.py b/rag/embeddings.py
new file mode 100644
index 0000000..84d3bbd
--- /dev/null
+++ b/rag/embeddings.py
@@ -0,0 +1,72 @@
+"""Embedding function for Chroma — Ollama-hosted nomic-embed-text by default.
+
+Swappable: implement the same `embedding_function()` interface returning
+a Chroma `EmbeddingFunction` and the rest of the pipeline doesn't care.
+
+Defaults (override via env):
+  OLLAMA_URL    one or more comma-separated URLs (load-balanced)
+  EMBED_MODEL   model name; default 'nomic-embed-text'
+  EMBED_DIM     expected embedding dim; default 768 (nomic-embed-text)
+"""
+from __future__ import annotations
+
+import os
+import logging
+from typing import Any
+
+import httpx
+from chromadb import EmbeddingFunction, Documents, Embeddings
+
+log = logging.getLogger(__name__)
+
+_DEFAULT_OLLAMA_URL = "http://localhost:11434"
+# Drop empty entries and trailing slashes ("http://host:11434/" would
+# otherwise produce "//api/embed"), and fall back to the default when the
+# variable is set but holds no usable URL, e.g. OLLAMA_URL="".
+OLLAMA_URLS = [
+    u.strip().rstrip("/")
+    for u in os.environ.get("OLLAMA_URL", _DEFAULT_OLLAMA_URL).split(",")
+    if u.strip().rstrip("/")
+] or [_DEFAULT_OLLAMA_URL]
+EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")
+EMBED_DIM = int(os.environ.get("EMBED_DIM", "768"))
+
+
+class OllamaEmbeddings(EmbeddingFunction):
+    """Calls /api/embed across N Ollama endpoints, naive round-robin.
+
+    For indexing throughput on multiple GPUs, run one Ollama container
+    per GPU (pinned via NVIDIA_VISIBLE_DEVICES) and pass all their URLs
+    in OLLAMA_URL — the embedder picks the next endpoint per batch.
+
+    If the chosen endpoint cannot be reached, the batch is retried on
+    the remaining endpoints before the error is raised.
+    """
+
+    def __init__(self, urls: list[str] = OLLAMA_URLS, model: str = EMBED_MODEL):
+        """Store the endpoint list and model name.
+
+        Args:
+            urls: Base URLs of the Ollama servers. Blank entries are
+                dropped and trailing slashes removed; the list is copied.
+            model: Embedding model name sent with every request.
+
+        Raises:
+            ValueError: If no usable URL remains.
+        """
+        cleaned = [u.strip().rstrip("/") for u in urls if u and u.strip().rstrip("/")]
+        if not cleaned:
+            raise ValueError("OllamaEmbeddings needs at least one Ollama URL")
+        self.urls = cleaned
+        self.model = model
+        self._next = 0
+
+    def __call__(self, input: Documents) -> Embeddings:
+        """Embed one batch of documents.
+
+        Raises:
+            httpx.HTTPStatusError: The server answered with an error status.
+            httpx.TransportError: Every endpoint was unreachable.
+            RuntimeError: The reply did not hold one vector per document.
+        """
+        docs = list(input)
+        start = self._next
+        self._next += 1
+        last_error: Exception | None = None
+        for attempt in range(len(self.urls)):
+            url = self.urls[(start + attempt) % len(self.urls)]
+            try:
+                with httpx.Client(timeout=300) as c:
+                    r = c.post(f"{url}/api/embed",
+                               json={"model": self.model, "input": docs})
+                    r.raise_for_status()
+                    data = r.json()
+            except httpx.TransportError as e:
+                # Connection-level failure (refused, DNS, timeout): this
+                # endpoint is down, so give the others a chance.
+                last_error = e
+                log.warning("embed endpoint %s unreachable (%s)", url, e)
+                continue
+            vectors = data.get("embeddings") or []
+            if len(vectors) != len(docs):
+                # Fail here with context rather than handing Chroma a
+                # list that does not line up with the documents.
+                raise RuntimeError(
+                    f"{url} returned {len(vectors)} embedding(s) "
+                    f"for {len(docs)} document(s)"
+                )
+            return vectors
+        assert last_error is not None  # self.urls is never empty
+        raise last_error
+
+    def name(self) -> str:                  # newer chromadb requires this
+        return f"ollama:{self.model}"
+
+    @staticmethod
+    def build_from_config(config: dict) -> "OllamaEmbeddings":  # newer chromadb
+        return OllamaEmbeddings(
+            urls=config.get("urls", OLLAMA_URLS),
+            model=config.get("model", EMBED_MODEL),
+        )
+
+    def get_config(self) -> dict:           # newer chromadb
+        return {"urls": self.urls, "model": self.model}
+
+    def default_space(self) -> str:
+        return "cosine"
+
+    def supported_spaces(self) -> list[str]:
+        return ["cosine", "l2", "ip"]
+
+
+def embedding_function() -> EmbeddingFunction:
+    """Factory for the default embedder.
+
+    Returns an ``OllamaEmbeddings`` built with its default arguments,
+    i.e. the module-level ``OLLAMA_URLS`` and ``EMBED_MODEL``.
+    """
+    embedder: EmbeddingFunction = OllamaEmbeddings()
+    return embedder
diff --git a/rag/index.py b/rag/index.py
new file mode 100644
index 0000000..8d1c74f
--- /dev/null
+++ b/rag/index.py
@@ -0,0 +1,134 @@
+"""Build Chroma (and optionally BM25) indexes from corpus on disk.
+
+Reads `corpus/<bundle>/<page>.{md,json}`, chunks each page, upserts
+into Chroma. With --rebuild, drops + recreates the collection (clean
+state). With --bm25-only, skips Chroma and rebuilds only the FTS5
+index — useful for fast iteration when chunking didn't change.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Iterator
+
+import chromadb
+from chromadb.config import Settings
+
+from .chunk import chunks_from_page
+from .embeddings import embedding_function
+
+log = logging.getLogger(__name__)
+# NOTE: this configures the root logger as an import side effect.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(message)s")
+
+# Repo root is the parent of the rag/ package directory.
+ROOT = Path(__file__).resolve().parent.parent
+CORPUS = ROOT / "corpus"
+CHROMA_DIR = ROOT / "chroma"
+
+# Collection name — convention: <product>_docs. Override via env if needed.
+PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct")
+COLLECTION = f"{PRODUCT_NAME}_docs"
+
+
+def page_records() -> Iterator[dict]:
+    """Walk corpus/, yield chunks for every page.
+
+    A page is a ``<page>.md`` file with a ``<page>.json`` sidecar inside
+    a bundle directory. Hidden directories are ignored. Pages with a
+    missing, undecodable or non-object sidecar are skipped with a
+    warning so one bad file does not abort the whole index build.
+
+    Yields:
+        Chunk dicts as produced by ``chunks_from_page``.
+    """
+    if not CORPUS.exists():
+        log.error("corpus/ doesn't exist; run the scraper first")
+        return
+    for bundle_dir in sorted(CORPUS.iterdir()):
+        if not bundle_dir.is_dir() or bundle_dir.name.startswith("."):
+            continue
+        for md_path in sorted(bundle_dir.glob("*.md")):
+            page_id = md_path.stem
+            sidecar = md_path.with_suffix(".json")
+            if not sidecar.exists():
+                log.warning("skipping %s — no JSON sidecar", md_path)
+                continue
+            try:
+                # Read as UTF-8 explicitly; the locale default (e.g.
+                # cp1252 on Windows) would mangle or reject the corpus.
+                md = md_path.read_text(encoding="utf-8")
+                meta = json.loads(sidecar.read_text(encoding="utf-8"))
+            except (UnicodeDecodeError, json.JSONDecodeError) as e:
+                log.warning("skipping %s — unreadable page or sidecar (%s)", md_path, e)
+                continue
+            if not isinstance(meta, dict):
+                log.warning("skipping %s — sidecar is not a JSON object", md_path)
+                continue
+            # Surface common filter fields at the chunk-metadata level
+            # so Chroma's `where` filter can use them.
+            base_meta = {
+                "bundle_id": bundle_dir.name,
+                "page_id":   page_id,
+                "title":     meta.get("title") or "",
+                "version":   meta.get("version") or "",
+                "platform":  meta.get("platform") or "",
+                "product":   meta.get("product") or "",
+            }
+            yield from chunks_from_page(md, page_id, base_meta)
+
+
+def upsert_to_chroma(records: list[dict]) -> int:
+    """Recreate the Chroma collection and load ``records`` into it.
+
+    Any existing collection named ``COLLECTION`` is dropped first, so
+    the result contains exactly the given records.
+
+    Args:
+        records: Chunk dicts with ``id``, ``text`` and ``metadata`` keys.
+
+    Returns:
+        Number of chunks written.
+    """
+    client = chromadb.PersistentClient(
+        path=str(CHROMA_DIR),
+        settings=Settings(anonymized_telemetry=False),
+    )
+    # Drop + recreate for --rebuild semantics
+    try:
+        client.delete_collection(COLLECTION)
+    except Exception as e:
+        # Expected on the first build (nothing to drop). The exception
+        # type for "not found" differs between chromadb releases, so keep
+        # the broad catch but record the reason instead of hiding it.
+        log.info("no existing collection %r dropped (%s)", COLLECTION, e)
+    col = client.create_collection(COLLECTION, embedding_function=embedding_function())
+
+    BATCH = 64
+    total = 0
+    for i in range(0, len(records), BATCH):
+        batch = records[i:i + BATCH]
+        col.upsert(
+            ids=[r["id"] for r in batch],
+            documents=[r["text"] for r in batch],
+            metadatas=[r["metadata"] for r in batch],
+        )
+        total += len(batch)
+        log.info("upserted %d / %d chunks", total, len(records))
+    return total
+
+
+def main() -> int:
+    """CLI entry point: chunk the corpus and (re)build the indexes.
+
+    Returns:
+        Process exit status: 0 on success or when there is nothing to
+        do, 1 when a rebuild was requested but no chunks could be loaded.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--rebuild", action="store_true",
+                   help="Drop and recreate the Chroma collection.")
+    p.add_argument("--bm25-only", action="store_true",
+                   help="Rebuild only the BM25 index, skip Chroma.")
+    p.add_argument("--bm25-db", type=Path,
+                   default=ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db",
+                   help="Path to the BM25 sqlite db.")
+    args = p.parse_args()
+
+    log.info("reading corpus from %s", CORPUS)
+    t0 = time.time()
+    records = list(page_records())
+    log.info("loaded %d chunks in %.1fs", len(records), time.time() - t0)
+
+    if not records and (args.bm25_only or args.rebuild):
+        # Both build paths replace what is on disk; with nothing loaded
+        # (missing or empty corpus) that would wipe a working index.
+        log.error("no chunks loaded; refusing to replace the existing index")
+        return 1
+
+    if args.bm25_only:
+        from .bm25 import BM25Index
+        log.info("--bm25-only: building FTS5 only")
+        BM25Index(args.bm25_db).build(records)
+        return 0
+
+    if not args.rebuild:
+        log.info("no --rebuild; nothing to do. (Use --rebuild to upsert.)")
+        return 0
+
+    t_c = time.time()
+    n = upsert_to_chroma(records)
+    log.info("chroma: %d chunks in %.1fs", n, time.time() - t_c)
+
+    # Build BM25 too — see PLAN.md Phase 8. Safe to remove this block
+    # for products that don't need hybrid retrieval.
+    try:
+        from .bm25 import BM25Index
+    except ImportError:
+        # Only a missing module is optional; errors raised while
+        # building must not be mistaken for "BM25 not installed".
+        log.info("rag.bm25 not available — skipping BM25 build")
+        return 0
+    t_b = time.time()
+    BM25Index(args.bm25_db).build(records)
+    log.info("bm25 done in %.1fs", time.time() - t_b)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b9982a9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,19 @@
+# MCP server
+mcp[fastmcp]>=1.0.0
+pydantic>=2.0
+httpx>=0.27
+
+# Vector store + embeddings
+chromadb>=0.5.0
+ollama>=0.4.0      # if using Ollama-hosted embedder; swap if not
+
+# Scraping (Phase 1; adjust per product)
+beautifulsoup4>=4.12
+requests>=2.31
+# playwright>=1.40  # uncomment if you need headless browser fallback
+
+# Evaluation
+numpy>=1.26
+
+# Dev / utility
+python-dateutil>=2.8
diff --git a/scrape/README.md b/scrape/README.md
new file mode 100644
index 0000000..44d6df3
--- /dev/null
+++ b/scrape/README.md
@@ -0,0 +1,59 @@
+# scrape/
+
+Product-specific. **You implement this for each product.** The
+template gives you the contract; the extraction logic depends on
+the upstream doc portal.
+
+See `PLAN.md` Phase 1 for the corpus layout the rest of the pipeline
+expects.
+
+## What you write
+
+At minimum, two scripts:
+
+### `scrape/bundles.py`
+
+Discovers the upstream portal's bundle catalog and writes
+`bundles.json` at the repo root. One entry per bundle (versioned doc
+set) with the schema in PLAN.md.
+
+```bash
+python -m scrape.bundles
+```
+
+### `scrape/runner.py`
+
+Scrapes the pages of each bundle (or a single bundle with `--bundle
+<slug>`). Writes:
+
+- `corpus/<bundle_id>/<page_id>.md` — extracted markdown body
+- `corpus/<bundle_id>/<page_id>.json` — per-page metadata sidecar
+
+```bash
+python -m scrape.runner --all --force --concurrency 6
+python -m scrape.runner --bundle Admin.VC.HTML.10.9
+```
+
+## Tips
+
+- **Sniff before you scrape.** Almost every modern doc portal is an
+  SPA that calls a backend API. Open the browser's Network tab,
+  click around, find the underlying JSON. Scraping the API is 10×
+  cheaper and 100× more reliable than scraping the rendered HTML.
+- **Idempotent re-scrapes.** Without `--force`, the runner should
+  skip pages already on disk so a resume doesn't have to re-fetch
+  everything. With `--force`, re-fetch every page — that's the
+  weekly cron mode that catches edits.
+- **Respect the portal.** Backoff on 429s. Set a recognizable
+  user-agent so the portal owner can identify you if they want to.
+- **Whitespace normalize.** Markdown that round-trips through HTML
+  often has extra blank lines. Normalize to a single blank between
+  paragraphs so diffs are clean (the changelog summary and digest
+  tools care about line counts).
+
+## What's already reusable
+
+`scrape/changelog.py` is fully product-agnostic and ready to use
+as-is. It walks `git diff --name-status` output to produce a
+structured summary, and walks `git log` for the digest history
+(Phase 13).
diff --git a/scrape/__init__.py b/scrape/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/scrape/changelog.py b/scrape/changelog.py
new file mode 100644
index 0000000..9ff2789
--- /dev/null
+++ b/scrape/changelog.py
@@ -0,0 +1,272 @@
+"""Generate a summary of corpus changes.
+
+Two output shapes for two consumers:
+
+  1. Human-readable text (default) — written into the weekly-refresh
+     commit message so the commit log is greppable for *"what changed
+     this week"* instead of *"806 files changed"*.
+
+  2. Structured JSON (``--json``) and rolling JSONL history
+     (``--history-out``) — consumed by the ``weekly_digest`` MCP tool.
+     Computed in CI and committed at ``corpus/.digest/history.jsonl``;
+     the tool reads it at runtime because the prod container is a
+     static filesystem COPY with no git available.
+
+Usage:
+
+    # Commit-message helper (existing behavior — unchanged)
+    python -m scrape.changelog [--cached] [--ref REF]
+
+    # One-shot JSON for the current diff range
+    python -m scrape.changelog --cached --json
+
+    # Build / refresh the digest history file (CI use)
+    python -m scrape.changelog --history-out corpus/.digest/history.jsonl \\
+        --history-days 120
+
+The history walker only includes commits that touch ``corpus/`` (or
+``bundles.json``); it skips pure code/CI commits. Each emitted record
+carries the commit's short sha, ISO timestamp, subject, and the same
+structured summary the ``--json`` path produces, so the consumer can
+treat history records and one-shot summaries interchangeably.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+from collections import defaultdict
+from typing import Any
+
+
+def git(*args: str) -> str:
+    """Run ``git`` with ``args`` and return its stdout as text.
+
+    Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
+    """
+    completed = subprocess.run(
+        ["git", *args],
+        stdout=subprocess.PIPE,
+        text=True,
+        check=True,
+    )
+    return completed.stdout
+
+
+_GIT_SIMPLE_ESCAPES = {
+    "a": "\a", "b": "\b", "f": "\f", "n": "\n", "r": "\r", "t": "\t", "v": "\v",
+}
+
+
+def _unquote_git_path(path: str) -> str:
+    r"""Undo git's C-style path quoting; unquoted paths pass through.
+
+    With the default ``core.quotePath`` git prints a path containing
+    non-ASCII bytes or control characters as ``"corpus/b/caf\303\251.md"``.
+    """
+    if len(path) < 2 or not (path.startswith('"') and path.endswith('"')):
+        return path
+    inner = path[1:-1]
+    raw = bytearray()
+    i = 0
+    while i < len(inner):
+        ch = inner[i]
+        if ch != "\\" or i + 1 >= len(inner):
+            raw += ch.encode("utf-8")
+            i += 1
+            continue
+        nxt = inner[i + 1]
+        if nxt in "01234567":
+            # Octal escape for one byte: up to three digits.
+            j = i + 1
+            while j < len(inner) and j < i + 4 and inner[j] in "01234567":
+                j += 1
+            raw.append(int(inner[i + 1:j], 8) & 0xFF)
+            i = j
+        else:
+            raw += _GIT_SIMPLE_ESCAPES.get(nxt, nxt).encode("utf-8")
+            i += 2
+    return raw.decode("utf-8", errors="replace")
+
+
+def summarize_diff(diff_output: str) -> dict[str, Any]:
+    """Parse ``git diff --name-status`` output into a structured summary.
+
+    Pure function (no IO, no git calls) so the same logic is exercised
+    by the human-readable, JSON-one-shot, and history-walking paths.
+
+    Paths that git printed in C-quoted form are unquoted before they are
+    classified. Hidden directories under ``corpus/`` (``corpus/.digest``)
+    are not bundles and are ignored, matching the indexer.
+
+    Returns a dict with:
+
+        md_count           int       — total .md files changed
+        json_count         int       — total .json sidecars changed
+        content_bundles    dict      — {bundle_id: [page_id_without_.md, ...]}
+                                       Only bundles where at least one .md
+                                       file moved. Lists are in the order
+                                       git emitted them.
+        json_only_bundles  list[str] — bundles whose ONLY change was sidecar
+                                       drift (no .md changes). Sorted.
+        new_bundles        list[str] — bundles with at least one .md Added
+                                       in this diff. Sorted.
+        other_files        list[str] — any non-corpus path mentioned in the
+                                       diff, as ``"STATUS path"`` strings.
+    """
+    md_changes: dict[str, list[str]] = defaultdict(list)
+    json_only_bundles: set[str] = set()
+    new_bundles: set[str] = set()
+    md_count = json_count = 0
+    other_files: list[str] = []
+
+    for line in diff_output.splitlines():
+        if not line.strip():
+            continue
+        # status<TAB>path (or status<TAB>old<TAB>new for renames; we take
+        # the post-rename path as the canonical location).
+        parts = line.split("\t")
+        status, path = parts[0], _unquote_git_path(parts[-1])
+        if not path.startswith("corpus/"):
+            other_files.append(f"{status} {path}")
+            continue
+        segs = path.split("/", 2)
+        if len(segs) < 3:
+            # corpus/<filename> with no bundle dir — skip.
+            continue
+        _, bundle, page = segs
+        if bundle.startswith("."):
+            # Housekeeping dirs such as corpus/.digest are not bundles.
+            continue
+        if page.endswith(".md"):
+            md_changes[bundle].append(page[:-3])
+            md_count += 1
+            if status == "A":
+                new_bundles.add(bundle)
+        elif page.endswith(".json"):
+            json_count += 1
+            json_only_bundles.add(bundle)
+
+    # A bundle counts as "content-changing" if it had any .md edit. Sidecar-
+    # only drift goes in the separate bucket so the commit message doesn't
+    # report timestamp churn as if it were real edits.
+    content_bundles_set = set(md_changes)
+    drift_only = sorted(json_only_bundles - content_bundles_set)
+
+    return {
+        "md_count":          md_count,
+        "json_count":        json_count,
+        "content_bundles":   dict(md_changes),   # cast back to plain dict for JSON
+        "json_only_bundles": drift_only,
+        "new_bundles":       sorted(new_bundles),
+        "other_files":       other_files,
+    }
+
+
+def render_human(summary: dict[str, Any]) -> str:
+    """Render a ``summarize_diff`` result as commit-message text.
+
+    The layout is a short block of totals, then (when present) a
+    per-bundle listing capped at five pages each, then a one-line list
+    of bundles whose only change was sidecar drift. The wording is kept
+    identical to the historical output so downstream readers of the
+    commit messages are unaffected.
+    """
+    pages_by_bundle = summary["content_bundles"]
+    ordered = sorted(pages_by_bundle)
+    fresh = set(summary["new_bundles"])
+    drift = summary["json_only_bundles"]
+    others = summary["other_files"]
+
+    out: list[str] = [
+        f"{summary['md_count']} content change(s) across {len(ordered)} bundle(s)",
+        f"{summary['json_count']} sidecar metadata update(s)",
+    ]
+    if fresh:
+        out.append(f"{len(fresh)} new bundle(s) added")
+    if others:
+        out.append(f"{len(others)} other file change(s)")
+
+    if ordered:
+        out += ["", "Bundles with content changes:"]
+        for name in ordered:
+            pages = pages_by_bundle[name]
+            marker = " (NEW)" if name in fresh else ""
+            out.append(f"  {name}{marker}: {len(pages)} page(s)")
+            out.extend(f"    - {page}" for page in pages[:5])
+            overflow = len(pages) - 5
+            if overflow > 0:
+                out.append(f"    ... and {overflow} more")
+    if drift:
+        shown = ", ".join(drift[:10])
+        tail = " …" if len(drift) > 10 else ""
+        out += ["", f"Bundles with sidecar-only drift ({len(drift)}): {shown}{tail}"]
+    return "\n".join(out)
+
+
+def walk_history(history_days: int, branch: str = "main") -> list[dict[str, Any]]:
+    """Walk recent corpus-touching commits, emit one summary per commit.
+
+    Uses ``git log --first-parent <branch>`` to keep the rolling weekly-
+    refresh line clean of branch-merge noise. Only commits whose diff
+    changes a ``.md`` or ``.json`` file under ``corpus/`` are emitted;
+    pure code commits are skipped (they have nothing to digest).
+
+    NOTE(review): a commit that touches only ``bundles.json`` is treated
+    as a code commit and skipped — confirm that is the intended contract.
+
+    Args:
+        history_days: How many days back to look.
+        branch: Branch whose first-parent history is walked.
+
+    Each record:
+
+        {
+          "sha":       "<short sha>",
+          "timestamp": "<ISO 8601, UTC>",
+          "subject":   "<commit subject line>",
+          ... + every field from summarize_diff()
+        }
+    """
+    from datetime import datetime, timezone
+
+    # Find candidate commits. --first-parent keeps the linear refresh history
+    # on the branch and ignores branch-side merges. We still need to filter by
+    # what the commit actually touched, because non-corpus commits can land on
+    # it (PR merges for code, CI tweaks, etc.).
+    #
+    # %ct is the committer time as a Unix epoch. %cI would carry whatever UTC
+    # offset the committer's machine had, which is neither the documented UTC
+    # nor safely sortable as a string.
+    raw = git(
+        "log",
+        f"--since={history_days} days ago",
+        "--first-parent",
+        branch,
+        "--pretty=format:%H%x09%ct%x09%s",
+    )
+
+    records: list[dict[str, Any]] = []
+    for line in raw.splitlines():
+        if not line.strip():
+            continue
+        parts = line.split("\t", 2)
+        if len(parts) < 3:
+            continue
+        sha, epoch, subject = parts
+        try:
+            ts = datetime.fromtimestamp(int(epoch), tz=timezone.utc).isoformat()
+        except (ValueError, OverflowError, OSError):
+            continue
+
+        # What did this commit actually touch? Cheap: just the name-status diff
+        # against its first parent. Empty stdout = commit didn't change any
+        # files we care about. Root commits (no parent) error out — suppress
+        # the stderr noise and skip them.
+        try:
+            diff = subprocess.check_output(
+                ["git", "diff", "--name-status", f"{sha}^..{sha}"],
+                text=True,
+                stderr=subprocess.DEVNULL,
+            )
+        except subprocess.CalledProcessError:
+            continue
+        if not diff.strip():
+            continue
+
+        summary = summarize_diff(diff)
+        # Skip pure code commits — only emit records that have actual corpus
+        # content motion. This is what makes the history "interesting" for
+        # the weekly digest.
+        if summary["md_count"] == 0 and summary["json_count"] == 0 and not summary["new_bundles"]:
+            continue
+
+        records.append({
+            "sha":       sha[:12],
+            "timestamp": ts,
+            "subject":   subject,
+            **summary,
+        })
+
+    return records
+
+
+def main() -> int:
+    """CLI entry point for the changelog summary and digest history.
+
+    With ``--history-out`` the recent history is written as JSONL,
+    oldest commit first; otherwise one summary of the staged changes
+    (``--cached``) or of ``--ref`` is printed as text or JSON.
+
+    Returns:
+        Process exit status (always 0; git failures raise).
+    """
+    from datetime import datetime, timezone
+    from pathlib import Path
+
+    # Raw formatter: the module docstring holds usage examples that the
+    # default formatter would re-wrap into one paragraph.
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--cached", action="store_true",
+                   help="Summarize staged changes instead of a ref range.")
+    p.add_argument("--ref", default="HEAD^..HEAD",
+                   help="Diff range to summarize (default: HEAD^..HEAD).")
+    p.add_argument("--json", dest="as_json", action="store_true",
+                   help="Emit one JSON object instead of the human-readable form.")
+    p.add_argument("--history-out", metavar="PATH",
+                   help="Walk recent corpus-touching commits and write a "
+                        "JSONL history file at PATH. Overwrites if it exists. "
+                        "Implies the history walker; --cached/--ref are ignored.")
+    p.add_argument("--history-days", type=int, default=120,
+                   help="How far back the history walker looks (default 120).")
+    args = p.parse_args()
+
+    # History-walker path: build the JSONL file consumed by the
+    # weekly_digest MCP tool, then exit. CI uses this.
+    if args.history_out:
+        records = walk_history(args.history_days)
+
+        def _instant(rec: dict) -> datetime:
+            # Compare real instants: ISO strings with different UTC
+            # offsets do not sort chronologically as plain text.
+            try:
+                when = datetime.fromisoformat(rec["timestamp"].replace("Z", "+00:00"))
+            except ValueError:
+                return datetime.min.replace(tzinfo=timezone.utc)
+            return when if when.tzinfo else when.replace(tzinfo=timezone.utc)
+
+        # Sort by timestamp ascending so the file is roughly stable
+        # across rebuilds (commits within a single run could otherwise
+        # depend on git log default ordering).
+        records.sort(key=_instant)
+        # First run: corpus/.digest/ may not exist yet.
+        Path(args.history_out).parent.mkdir(parents=True, exist_ok=True)
+        with open(args.history_out, "w", encoding="utf-8") as fh:
+            for rec in records:
+                fh.write(json.dumps(rec, separators=(",", ":")) + "\n")
+        # Brief stdout signal for CI logs — easy to spot in the workflow run.
+        print(f"wrote {len(records)} commit record(s) to {args.history_out} "
+              f"covering up to {args.history_days} days")
+        return 0
+
+    # One-shot summary path. Unchanged behavior for --cached / --ref.
+    if args.cached:
+        diff_args = ["diff", "--name-status", "--cached"]
+    else:
+        diff_args = ["diff", "--name-status", args.ref]
+    diff = git(*diff_args)
+    summary = summarize_diff(diff)
+
+    if args.as_json:
+        print(json.dumps(summary, separators=(",", ":")))
+    else:
+        print(render_human(summary))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/registry_gc.py b/scripts/registry_gc.py
new file mode 100644
index 0000000..41bbc52
--- /dev/null
+++ b/scripts/registry_gc.py
@@ -0,0 +1,108 @@
+"""Gitea container-registry garbage collection.
+
+Lists package versions for one container package and deletes versions
+older than --keep-days. Always preserves:
+
+  - the :latest tag
+  - the --keep-latest most-recent date-tagged versions
+  - anything pushed in the last --keep-days days
+
+The actual disk reclaim happens on Gitea's next package GC cron (admin
+site settings). This script just marks the versions for deletion.
+
+Usage:
+
+    python scripts/registry_gc.py \\
+        --owner <user> \\
+        --package <product>-docs-mcp \\
+        --keep-days 90 \\
+        --keep-latest 5
+
+Auth: reads GITEA_TOKEN from env (set in the workflow as a secret).
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from datetime import datetime, timedelta, timezone
+from urllib.request import Request, urlopen
+from urllib.error import HTTPError
+import json
+
+
+# Base URL of the Gitea instance. Trailing slashes are dropped because
+# API paths are appended with their own leading "/".
+GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io").rstrip("/")
+
+
+def api(token: str, method: str, path: str) -> object:
+    """Send one authenticated request to the Gitea API and decode the reply.
+
+    Args:
+        token: Access token, sent as ``Authorization: token <token>``.
+        method: HTTP verb, e.g. ``"GET"`` or ``"DELETE"``.
+        path: API path appended to ``GITEA_HOST``.
+
+    Returns:
+        The parsed JSON body, or ``None`` for an empty body or a 404.
+
+    Raises:
+        HTTPError: For any HTTP error status other than 404.
+    """
+    request = Request(
+        f"{GITEA_HOST}{path}",
+        method=method,
+        headers={"Authorization": f"token {token}"},
+    )
+    try:
+        response = urlopen(request, timeout=30)
+    except HTTPError as err:
+        if err.code != 404:
+            raise
+        return None
+    with response:
+        payload = response.read()
+    return json.loads(payload) if payload else None
+
+
+def _list_versions(token: str, base: str, page_size: int = 50) -> list:
+    """Collect every page of the versions listing at ``base``.
+
+    Stops at the first page that brings no unseen id, which also ends
+    the loop if the server ignores the paging parameters.
+    """
+    versions: list = []
+    seen: set = set()
+    page = 1
+    while True:
+        batch = api(token, "GET", f"{base}?page={page}&limit={page_size}")
+        if not isinstance(batch, list):
+            break
+        fresh = [v for v in batch if v.get("id") not in seen]
+        if not fresh:
+            break
+        versions.extend(fresh)
+        seen.update(v.get("id") for v in fresh)
+        page += 1
+    return versions
+
+
+def main() -> int:
+    """CLI entry point: delete container versions older than the cutoff.
+
+    Versions tagged ``latest``, the ``--keep-latest`` newest date-tagged
+    versions, and anything created within ``--keep-days`` are kept. With
+    ``--dry-run`` nothing is deleted; candidates are only listed.
+
+    NOTE(review): the ``.../versions`` routes and the ``tags`` field are
+    taken as-is from the original script — confirm them against the
+    packages API of the Gitea release in use.
+
+    Returns:
+        Process exit status: 1 when ``GITEA_TOKEN`` is unset, else 0.
+    """
+    p = argparse.ArgumentParser()
+    p.add_argument("--owner", required=True)
+    p.add_argument("--package", required=True)
+    p.add_argument("--keep-days", type=int, default=90)
+    p.add_argument("--keep-latest", type=int, default=5)
+    p.add_argument("--dry-run", action="store_true")
+    args = p.parse_args()
+
+    token = os.environ.get("GITEA_TOKEN")
+    if not token:
+        print("GITEA_TOKEN not set", file=sys.stderr)
+        return 1
+
+    base = f"/api/v1/packages/{args.owner}/container/{args.package}/versions"
+    # The listing is paginated; a single GET would only ever see (and
+    # garbage-collect) the first page.
+    versions = _list_versions(token, base)
+    if not versions:
+        print(f"no versions found for {args.owner}/{args.package}")
+        return 0
+
+    cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days)
+
+    # Date-tagged versions (YYYY.MM.DD), newest first
+    date_tagged = []
+    for v in versions:
+        tags = v.get("tags") or []
+        for t in tags:
+            if len(t) == 10 and t[4] == "." and t[7] == ".":
+                date_tagged.append((t, v))
+                break
+    date_tagged.sort(key=lambda kv: kv[0], reverse=True)
+    keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]}
+
+    selected = 0
+    for v in versions:
+        tags = v.get("tags") or []
+        if "latest" in tags:
+            continue
+        if any(t in keep_date_tags for t in tags):
+            continue
+        try:
+            created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00"))
+        except (KeyError, ValueError, AttributeError):
+            # No usable creation time: keep the version rather than guess.
+            continue
+        if created.tzinfo is None:
+            # Naive timestamps cannot be compared with the aware cutoff.
+            created = created.replace(tzinfo=timezone.utc)
+        if created >= cutoff:
+            continue
+        version_id = v.get("id")
+        if version_id is None:
+            # Without an id the DELETE would target ".../versions/None".
+            continue
+        verb = "would delete" if args.dry_run else "deleting"
+        print(f"  {verb} v{version_id}  tags={tags}  created={v['created_at']}")
+        if not args.dry_run:
+            api(token, "DELETE", f"{base}/{version_id}")
+        selected += 1
+    if args.dry_run:
+        print(f"done: {selected} version(s) would be deleted (dry run)")
+    else:
+        print(f"done: {selected} version(s) deleted")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/usage_report.py b/scripts/usage_report.py
new file mode 100644
index 0000000..6bfaa25
--- /dev/null
+++ b/scripts/usage_report.py
@@ -0,0 +1,251 @@
+"""Summarize usage logs from docs_mcp.usage into a quick scan.
+
+Reads one or more usage.jsonl* files and prints sections for:
+
+  - per-tool call counts
+  - top search_docs queries by frequency
+  - 0-hit queries (where we returned nothing — high-signal for tuning)
+  - filter usage histogram (which version / platform / bundle filters get hit)
+  - reranker effectiveness (calls where the reranker fired vs not)
+  - hybrid retrieval top-1 attribution (dense vs bm25 vs both)
+
+Usage:
+
+    # Explicit logs dir (omit --logs-dir to read the default, /app/var/logs):
+    python scripts/usage_report.py --logs-dir /path/to/usage/logs
+
+    # Last N days only:
+    python scripts/usage_report.py --logs-dir <dir> --since 7d
+
+    # Markdown output (for piping into a weekly digest email, etc):
+    python scripts/usage_report.py --logs-dir <dir> --format markdown
+
+The script doesn't depend on anything in the docs_mcp package — it's a
+standalone tool that can run anywhere with the log files available
+(scp them off the host, point it at the directory).
+
+----------------------------------------------------------------------
+FOLLOW-UP CHECKS
+----------------------------------------------------------------------
+
+Pattern: when you ship a retrieval change with a hypothesis attached
+(e.g. "hybrid will rescue queries dense misses"), add a note HERE
+describing what the usage report should show and at what threshold
+the change earns its keep. Future-you running the report a month
+later will be glad. Example:
+
+  Q: Does the dense leg of hybrid retrieval earn its keep on
+     real traffic, or could we simplify to BM25-only?
+
+  - bm25_only >= 80%   --> dense not doing much; consider
+                          simplifying to BM25 mode
+  - both      >= 50%   --> hybrid is tie-breaking; keep it
+  - dense_only > bm25_only --> dense is the workhorse; keep
+
+Also worth a glance every month:
+
+  - 0-hit queries list (tuning candidates)
+  - reranker p95 latency drift (slow reranker = bad UX)
+  - filter usage (does anyone actually use version/platform
+    filters? if not, simplify the tool surface)
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from collections import Counter, defaultdict
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any, Iterable
+
+
+def parse_since(s: str | None) -> datetime | None:
+    """Accept '7d', '24h', '30m', or an ISO timestamp. None → no cutoff."""
+    if not s:
+        return None
+    m = re.fullmatch(r"(\d+)([dhm])", s)
+    if m:
+        n, unit = int(m.group(1)), m.group(2)
+        delta = {"d": timedelta(days=n), "h": timedelta(hours=n), "m": timedelta(minutes=n)}[unit]
+        return datetime.now(timezone.utc) - delta
+    return datetime.fromisoformat(s.replace("Z", "+00:00"))
+
+
+def load_events(logs_dir: Path, since: datetime | None) -> Iterable[dict[str, Any]]:
+    """Yield every JSONL record across all files in logs_dir."""
+    if not logs_dir.exists():
+        print(f"warning: logs dir {logs_dir} does not exist", file=sys.stderr)
+        return
+    # usage.jsonl is the active file; usage.jsonl.YYYY-MM-DD are rotated.
+    files = sorted(logs_dir.glob("usage.jsonl*"))
+    for f in files:
+        with open(f) as fh:
+            for ln, line in enumerate(fh, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    rec = json.loads(line)
+                except json.JSONDecodeError as e:
+                    print(f"  ! skipping {f}:{ln}: {e}", file=sys.stderr)
+                    continue
+                if since:
+                    ts = rec.get("ts", "")
+                    try:
+                        rec_ts = datetime.fromisoformat(ts.replace("Z", "+00:00"))
+                    except ValueError:
+                        continue
+                    if rec_ts < since:
+                        continue
+                yield rec
+
+
+def main() -> int:
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument("--logs-dir", type=Path, default=Path("/app/var/logs"),
+                   help="directory with usage.jsonl* files")
+    p.add_argument("--since", default=None,
+                   help="time window: '7d', '24h', '30m', or ISO timestamp")
+    p.add_argument("--top", type=int, default=25,
+                   help="how many top queries / filters to show")
+    p.add_argument("--format", choices=("text", "markdown"), default="text")
+    args = p.parse_args()
+
+    since = parse_since(args.since)
+    events = list(load_events(args.logs_dir, since))
+    if not events:
+        print("(no events in window)")
+        return 0
+
+    print(f"# Usage report — {len(events)} events"
+          + (f" since {since.isoformat()}" if since else "")
+          + f" from {args.logs_dir}")
+    print()
+
+    # 1. Per-tool counts
+    by_tool = Counter(e["tool"] for e in events)
+    print("## Per-tool call counts")
+    print()
+    if args.format == "markdown":
+        print("| tool | calls |")
+        print("|---|---|")
+        for tool, n in by_tool.most_common():
+            print(f"| `{tool}` | {n} |")
+    else:
+        for tool, n in by_tool.most_common():
+            print(f"  {tool:<25s} {n:>6d}")
+    print()
+
+    # 2. Top search_docs queries
+    search_events = [e for e in events if e["tool"] == "search_docs"]
+    queries = Counter(e["args"].get("query", "") for e in search_events)
+    print(f"## Top {args.top} search_docs queries  (of {len(search_events)} searches)")
+    print()
+    if args.format == "markdown":
+        print("| count | query |")
+        print("|---|---|")
+        for q, n in queries.most_common(args.top):
+            print(f"| {n} | `{q}` |")
+    else:
+        for q, n in queries.most_common(args.top):
+            print(f"  {n:>5d}  {q!r}")
+    print()
+
+    # 3. 0-hit queries — the highest-signal data for tuning
+    zero_hit = [e for e in search_events if e.get("hits_returned") == 0]
+    zero_q = Counter(e["args"].get("query", "") for e in zero_hit)
+    print(f"## 0-hit queries  ({len(zero_hit)} of {len(search_events)} searches returned nothing)")
+    print()
+    if zero_q:
+        if args.format == "markdown":
+            print("| count | query | filters |")
+            print("|---|---|---|")
+            # Group by query, show filter examples for each
+            examples_by_query: dict[str, list[dict]] = defaultdict(list)
+            for e in zero_hit:
+                examples_by_query[e["args"].get("query", "")].append(e["args"])
+            for q, n in zero_q.most_common(args.top):
+                ex = examples_by_query[q][0]
+                f = {k: v for k, v in ex.items()
+                     if k in ("version", "platform", "bundle_id") and v}
+                print(f"| {n} | `{q}` | `{f}` |")
+        else:
+            for q, n in zero_q.most_common(args.top):
+                print(f"  {n:>5d}  {q!r}")
+    else:
+        print("  _(no 0-hit queries in window)_")
+    print()
+
+    # 4. Filter usage
+    filter_use = Counter()
+    for e in search_events:
+        a = e["args"]
+        v = a.get("version")
+        p_ = a.get("platform")
+        b = a.get("bundle_id")
+        if v:
+            filter_use[f"version={v}"] += 1
+        if p_:
+            filter_use[f"platform={p_}"] += 1
+        if b:
+            filter_use[f"bundle_id={b}"] += 1
+        if not (v or p_ or b):
+            filter_use["(no filter)"] += 1
+    print(f"## search_docs filter usage")
+    print()
+    if args.format == "markdown":
+        print("| filter | count |")
+        print("|---|---|")
+        for f, n in filter_use.most_common(args.top):
+            print(f"| `{f}` | {n} |")
+    else:
+        for f, n in filter_use.most_common(args.top):
+            print(f"  {n:>5d}  {f}")
+    print()
+
+    # 5. Reranker effectiveness
+    reranked = [e for e in search_events if e.get("reranked") is True]
+    dense_only = [e for e in search_events if e.get("reranked") is False]
+    print(f"## Reranker activity")
+    print()
+    print(f"  reranked:    {len(reranked):>5d}")
+    print(f"  dense only:  {len(dense_only):>5d}  (filter too narrow or 0 results)")
+    if reranked:
+        elapsed = [e["elapsed_ms"] for e in reranked if e.get("elapsed_ms") is not None]
+        if elapsed:
+            elapsed.sort()
+            p50 = elapsed[len(elapsed) // 2]
+            p95 = elapsed[int(len(elapsed) * 0.95)]
+            print(f"  reranked latency p50: {p50:.0f} ms,  p95: {p95:.0f} ms")
+    print()
+
+    # 6. Hybrid retrieval activity — which retriever contributed the top-1?
+    # Empty unless HYBRID_SEARCH=true is set on the MCP container.
+    hybrid_events = [e for e in search_events if e.get("retrieval_mode") == "hybrid"]
+    if hybrid_events:
+        by_source = Counter(e.get("top1_source") for e in hybrid_events
+                            if e.get("top1_source"))
+        print("## Hybrid retrieval — top-1 attribution")
+        print()
+        print(f"  hybrid mode events: {len(hybrid_events)}")
+        total = sum(by_source.values()) or 1
+        for src in ("both", "dense_only", "bm25_only"):
+            n = by_source.get(src, 0)
+            pct = 100.0 * n / total
+            label = {
+                "both":       "in BOTH retrievers' top-N",
+                "dense_only": "dense found it, BM25 didn't",
+                "bm25_only":  "BM25 found it, dense didn't",
+            }[src]
+            print(f"  {src:<11s} {n:>5d}  ({pct:5.1f}%)  — {label}")
+        rescued = by_source.get("bm25_only", 0)
+        if rescued and total:
+            print(f"\n  → {rescued} ({100.0 * rescued / total:.1f}%) of hybrid queries had the top-1 "
+                  "result that ONLY BM25 surfaced. Without hybrid those would have been dense-misses.")
+    return 0
+
+
+# Script entry point: run main() and use its integer return value as the
+# process exit status.
+if __name__ == "__main__":
+    sys.exit(main())