From 97a2a05b248c2520c87c40a68543cd1cd0555c7f Mon Sep 17 00:00:00 2001
From: Justin Paul <justin@jpaul.me>
Date: Sun, 24 May 2026 10:02:01 -0400
Subject: [PATCH] Phase 3: MCP server tools for the labels corpus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adapt docs_mcp/server.py from versioned-software-docs domain to
pesticide-labels domain. Standard MCP tool names preserved
(search_docs / get_page / list_versions) so existing MCP clients
(Claude Desktop, Cursor) still pick them up; docstrings + argument
shape are labels-domain.

Tools shipped:
- search_docs(query, source, product_class, registrant_contains,
  signal_word, epa_reg_no, k) — dense Chroma query with optional
  filters, post-filtered for registrant substring. Returns top-k
  chunks rendered as markdown with product / reg / registrant /
  actives / signal / section / label-PDF URL.
- get_page(source, source_key) — full label markdown + metadata
  header. source_key is slug for MFR sources, EPA Reg No for EPA PPLS.
- list_versions() — discovers facet values: sources, product
  classes, signal words, registrants (samples up to 50K chunks
  from Chroma to enumerate distinct metadata values).
- corpus_status() — fast no-embedder counts: labels on disk per
  source, chunks in Chroma, BM25 db size, active feature flags.

Wiring:
- Reads PPLS_CORPUS_ROOT + PPLS_CHROMA_DIR (matches the scrapers
  and indexer).
- Uses sources.json (not the template's bundles.json).
- Lazy Chroma init so the server starts cleanly even when Ollama
  is briefly down (e.g. during HVM corpus rebuilds).
- Phase 6 reranker + Phase 8 hybrid hooks left as feature flags
  (RERANK_URL, HYBRID_SEARCH) — fail open to dense-only when unset.

Smoke test against the live 216K-chunk corpus:
  - corpus_status: 4,157 labels / 216,467 chunks / 416 MB BM25 ✓
  - search_docs("waterhemp control on soybeans", k=2) returns
    Tackle Herbicide (FMC, 279-3564, glyph+imazethapyr) and
    R14640 Herbicide (Bayer, 524-724, glyph) with section context
    (ROUNDUP READY SOYBEANS / SOYBEAN) and dist-derived scores
    of 0.76 each — highly relevant.

Run as stdio for Claude Desktop:
  PPLS_CORPUS_ROOT=/run/media/justin/USB/ppls-corpus \
    OLLAMA_URL=http://gpu1:11434,http://gpu2:11434  \
    PRODUCT_NAME=ppls \
    python -m docs_mcp.server --transport stdio

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs_mcp/server.py | 524 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 401 insertions(+), 123 deletions(-)
diff --git a/docs_mcp/server.py b/docs_mcp/server.py
index 9a9387c..365b16d 100644
--- a/docs_mcp/server.py
+++ b/docs_mcp/server.py
@@ -1,20 +1,19 @@
-"""MCP server skeleton — fill in PRODUCT_NAME and the tool bodies.
+"""MCP server for the ppls-docs pesticide label corpus.
 
-This file is the template's structural anchor. The phases described in
-PLAN.md add or extend pieces of this file:
+Adapted from the docs-mcp-template (which targeted versioned software
+docs) for the EPA pesticide-labels domain: ``bundle_id`` → ``source``,
+``page_id`` → ``source_key`` (slug for MFRs, EPA Reg No for EPA PPLS),
+and ``version``/``platform`` filters → product-class / registrant /
+signal-word filters. See ``scrape/README.md`` for the corpus schema.
 
-  Phase 3  — search_docs, get_page, list_versions stubs (you are here)
+Phase progression in this file:
+  Phase 3  — search_docs, get_page, list_versions, corpus_status  (you are here)
   Phase 6  — reranker integration in search_docs
-  Phase 8  — BM25 + hybrid retrieval (HYBRID_SEARCH env gate, _rrf_fuse)
-  Phase 9  — diff_versions, list_cluster, bundle_changelog
-  Phase 10 — TimedCall wiring (already imported below)
-  Phase 11 — <product>_api_lessons tool
-  Phase 12 — find_doc_inconsistencies
-  Phase 13 — weekly_digest + _digest_history reader
+  Phase 8  — BM25 + hybrid retrieval (HYBRID_SEARCH env gate)
 
-Every stub below has a docstring + `raise NotImplementedError`. Replace
-the body when you reach the corresponding phase. Keep the signatures
-stable across products — clients depend on them.
+Standard MCP tool names (search_docs / get_page / list_versions) are
+preserved so clients that expect a docs MCP shape still work; the
+docstrings make the labels-domain semantics explicit.
 """
 from __future__ import annotations
 
@@ -33,21 +32,25 @@ from .usage import TimedCall
 log = logging.getLogger(__name__)
 
 # ---------------------------------------------------------------------------
-# Product-specific configuration. Set these for each new build.
+# Product configuration.
 # ---------------------------------------------------------------------------
-PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct")
-PRODUCT_DOCS_URL = os.environ.get("PRODUCT_DOCS_URL", "https://docs.example.com")
+PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "ppls")
+PRODUCT_DOCS_URL = os.environ.get(
+    "PRODUCT_DOCS_URL",
+    "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1",
+)
 COLLECTION = f"{PRODUCT_NAME}_docs"
 
-# Paths inside the deployed container (and matching layout locally for dev).
-ROOT = Path(__file__).resolve().parent.parent
-CORPUS = ROOT / "corpus"
-CHROMA_DIR = ROOT / "chroma"
-BM25_DB = Path(os.environ.get("BM25_DB", str(ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db")))
-BUNDLES_JSON = ROOT / "bundles.json"
+# Paths — corpus may live on external storage; indexes default to the repo root (env-overridable).
+REPO_ROOT = Path(__file__).resolve().parent.parent
+CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
+CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma")
+BM25_DB = Path(os.environ.get("BM25_DB",
+               str(REPO_ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db")))
+SOURCES_JSON = REPO_ROOT / "sources.json"
 
 # ---------------------------------------------------------------------------
-# Feature flags (Phase 6 / 8 enable these as you ship each phase).
+# Feature flags (enabled in later phases).
 # ---------------------------------------------------------------------------
 RERANK_URL = os.environ.get("RERANK_URL", "").rstrip("/") or None
 RERANK_POOL = int(os.environ.get("RERANK_POOL", "50"))
@@ -59,40 +62,76 @@ RRF_K = int(os.environ.get("RRF_K", "60"))
 
 # ---------------------------------------------------------------------------
 # FastMCP setup.
-#
-# stateless_http=True — every request creates an ephemeral session and
-# discards it on return. Critical for production: clients don't get
-# 404 storms when the container is recreated by Watchtower.
 # ---------------------------------------------------------------------------
 mcp = FastMCP(f"{PRODUCT_NAME}-docs", stateless_http=True)
 
 
 # ---------------------------------------------------------------------------
-# Lazy helpers — instantiate expensive things only when actually needed,
-# so the server still starts when (e.g.) Ollama is briefly unreachable.
+# Lazy helpers.
 # ---------------------------------------------------------------------------
 
-def _bundles() -> dict[str, dict]:
-    """Cached load of bundles.json into a {slug: bundle_dict} mapping.
+_chroma_collection = None
+_sources_cache: dict[str, dict] | None = None
 
-    bundles.json is the product-specific catalog written by the Phase 1
-    scraper. See PLAN.md Phase 1 for the schema.
+
+def _sources() -> dict[str, dict]:
+    """Load sources.json as {source_id: source_dict} (cached; {} when unusable)."""
+    global _sources_cache
+    if _sources_cache is not None:
+        return _sources_cache
+    items: object = []
+    if SOURCES_JSON.exists():
+        try:
+            items = json.loads(SOURCES_JSON.read_text(encoding="utf-8"))
+        except (OSError, ValueError) as exc:  # ValueError: malformed JSON or bad UTF-8
+            log.warning("sources.json unreadable: %s", exc)
+    # A non-list document or non-dict entries are skipped rather than raising.
+    entries = items if isinstance(items, list) else []
+    _sources_cache = {s["id"]: s for s in entries if isinstance(s, dict) and "id" in s}
+    return _sources_cache
+
+
+def _collection():
+    """Return the shared Chroma collection, opening it lazily on first use so
+    that importing this module never touches chromadb or the embedder."""
+    global _chroma_collection
+    if _chroma_collection is None:
+        # Deferred imports: these load only once a tool actually needs the index.
+        import chromadb
+        from chromadb.config import Settings
+        from rag.embeddings import embedding_function
+        store = chromadb.PersistentClient(
+            path=str(CHROMA_DIR),
+            settings=Settings(anonymized_telemetry=False),
+        )
+        _chroma_collection = store.get_collection(
+            COLLECTION, embedding_function=embedding_function()
+        )
+    return _chroma_collection
+
+
+def _build_where(
+    source: str | None,
+    product_class: str | None,
+    registrant_contains: str | None,
+    signal_word: str | None,
+    epa_reg_no: str | None,
+) -> dict | None:
+    """Translate filter args into a Chroma `where` clause.
+
+    Chroma's where supports exact-match per field (and $and/$or). For
+    `registrant_contains` we can only do exact equality at the where level,
+    so substring matching is applied post-query in Python.
     """
-    if not BUNDLES_JSON.exists():
-        return {}
-    cat = json.loads(BUNDLES_JSON.read_text())
-    return {b["slug"]: b for b in cat}
-
-
-def _build_where(version: str | None, platform: str | None, bundle_id: str | None) -> dict | None:
-    """Translate filter args into a Chroma `where` clause."""
     conds: list[dict] = []
-    if version:
-        conds.append({"version": version})
-    if platform:
-        conds.append({"platform": platform})
-    if bundle_id:
-        conds.append({"bundle_id": bundle_id})
+    if source:
+        conds.append({"source": source})
+    if product_class:
+        conds.append({"product_class": product_class})
+    if signal_word:
+        conds.append({"signal_word": signal_word})
+    if epa_reg_no:
+        conds.append({"epa_reg_no": epa_reg_no})
     if not conds:
         return None
     if len(conds) == 1:
@@ -100,13 +139,45 @@ def _build_where(version: str | None, platform: str | None, bundle_id: str | Non
     return {"$and": conds}
 
 
-def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None:
-    """Read a corpus page off disk. Returns (markdown_body, metadata_dict)."""
-    md_path = CORPUS / bundle_id / (page_id + ".md")
-    json_path = CORPUS / bundle_id / (page_id + ".json")
+def _read_label(source: str, source_key: str) -> tuple[str, dict] | None:
+    """Read a label off disk. Returns (markdown_body, metadata_dict) or None."""
+    if any(p in {"", ".", ".."} or set(p) & {"/", "\\", "\0"} for p in (source, source_key)):
+        return None  # ids come from MCP clients; never let them escape CORPUS_ROOT
+    md_path = CORPUS_ROOT / source / f"{source_key}.md"
+    json_path = CORPUS_ROOT / source / f"{source_key}.json"
     if not md_path.exists() or not json_path.exists():
         return None
-    return md_path.read_text(), json.loads(json_path.read_text())
+    try:
+        return md_path.read_text(encoding="utf-8"), json.loads(json_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):  # ValueError: malformed JSON or bad UTF-8
+        return None
+
+
+def _format_hit(doc: str, meta: dict, score: float) -> str:
+    """Render one search hit as a markdown block.
+
+    Layout: a heading (product, EPA Reg No, score), a source bullet with any
+    class / signal / section tags, optional registrant / active-ingredient /
+    label-PDF bullets, a blank line, then the stripped chunk text.
+    """
+    def field(key: str, default: str = "") -> str:
+        # Missing and falsy metadata values both collapse to the default.
+        return meta.get(key) or default
+
+    title = meta.get("product_name") or meta.get("source_key") or "(unknown)"
+    # Inline tags ride on the source bullet; empty ones are omitted.
+    source_bullet = f"- **Source:** `{field('source', '?')}/{field('source_key', '?')}`"
+    for tag, key in (("class", "product_class"), ("signal", "signal_word"), ("section", "section")):
+        if field(key):
+            source_bullet += f"  · {tag}: {field(key)}"
+
+    parts = [f"### {title}  (EPA Reg {field('epa_reg_no', '—')})  · score={score:.3f}", source_bullet]
+    # Remaining metadata gets one bullet each, again only when present.
+    for caption, key in (("Registrant", "registrant"), ("Active ingredients", "active_ingredients"),
+                         ("Label PDF", "label_url")):
+        if field(key):
+            parts.append(f"- **{caption}:** {field(key)}")
+    return "\n".join(parts) + "\n\n" + doc.strip() + "\n"
 
 
 # ===========================================================================
@@ -115,88 +186,309 @@ def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None:
 
 @mcp.tool()
 def search_docs(
-    query: Annotated[str, Field(description=f"Natural-language query about {PRODUCT_NAME}.")],
-    version: Annotated[
+    query: Annotated[
+        str,
+        Field(description="Natural-language query about pesticide labels — "
+                          "products, crops, pests, application rates, REI/PHI, "
+                          "tank-mix restrictions, signal words, active ingredients."),
+    ],
+    source: Annotated[
         str | None,
-        Field(description="OPTIONAL version filter — restrict to one product version."),
+        Field(description="OPTIONAL source id to restrict the search (e.g. "
+                          "'bayer', 'epa_ppls'). Use list_versions() to discover "
+                          "available sources."),
     ] = None,
-    platform: Annotated[
+    product_class: Annotated[
         str | None,
-        Field(description="OPTIONAL platform filter. Set to one of the platforms listed by list_versions(); omit for all platforms."),
+        Field(description="OPTIONAL product class filter: 'herbicide', "
+                          "'fungicide', 'insecticide', 'seed-treatment'. "
+                          "Often null for EPA PPLS records."),
     ] = None,
-    bundle_id: Annotated[
+    registrant_contains: Annotated[
         str | None,
-        Field(description="OPTIONAL bundle filter — pin to a specific doc bundle slug."),
+        Field(description="OPTIONAL substring of the registrant company name "
+                          "(case-insensitive). Use to scope to a manufacturer "
+                          "(e.g., 'SYNGENTA', 'BAYER', 'CORTEVA')."),
+    ] = None,
+    signal_word: Annotated[
+        str | None,
+        Field(description="OPTIONAL EPA signal word filter: 'Danger', 'Warning', "
+                          "'Caution', or 'No Signal Word'."),
+    ] = None,
+    epa_reg_no: Annotated[
+        str | None,
+        Field(description="OPTIONAL exact EPA Registration Number (e.g. "
+                          "'524-591', '524-475-12345'). Narrows to chunks from "
+                          "just that registration."),
     ] = None,
     k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10,
 ) -> str:
-    """Search the {product} docs corpus.
+    """Search the EPA / manufacturer pesticide-label corpus.
 
-    Returns the top-k most relevant chunks (with full source page URLs)
-    given a natural-language query. Optional filters narrow the search
-    to one version, one platform, or one bundle. Use list_versions()
-    first if you need to discover the available facet values.
+    Returns the top-k most relevant label chunks for a natural-language
+    query. Each hit shows product name, EPA Reg No, registrant, signal
+    word, active ingredients, and a link to the source PDF.
 
-    Call this tool whenever the user asks anything that should be
-    answerable from the official product documentation.
+    Call this proactively whenever the user asks anything that should
+    be answerable from a pesticide product label — application rates,
+    target pests, target crops, re-entry intervals (REI), pre-harvest
+    intervals (PHI), tank-mix restrictions, signal words, environmental
+    hazards, storage requirements, etc.
+
+    The corpus is scoped to US row crops (corn / soybeans / wheat).
+    For products outside that scope, results will be empty or marginal.
     """
     with TimedCall("search_docs", {
-        "query": query, "version": version, "platform": platform,
-        "bundle_id": bundle_id, "k": k,
+        "query": query, "source": source, "product_class": product_class,
+        "registrant_contains": registrant_contains, "signal_word": signal_word,
+        "epa_reg_no": epa_reg_no, "k": k,
     }) as _call:
-        # TODO Phase 2-3: query Chroma collection (see rag/index.py for
-        # how it was built). Render the top-k chunks as markdown with
-        # source URLs.
-        # TODO Phase 6: optional reranker via _rerank() if RERANK_URL set.
-        # TODO Phase 8: hybrid retrieval if HYBRID_SEARCH=true — run
-        # dense + BM25 in parallel, RRF-fuse, hand merged pool to rerank.
-        _call.set(hits_returned=0)
-        raise NotImplementedError("Phase 2/3: implement Chroma query + rendering")
+        try:
+            col = _collection()
+        except Exception as exc:  # noqa: BLE001
+            _call.set(hits_returned=0, error=str(exc))
+            return f"_(search backend unavailable: {exc})_"
+
+        where = _build_where(source, product_class, registrant_contains,
+                             signal_word, epa_reg_no)
+        # Over-fetch when we'll post-filter on registrant substring, so we
+        # still have ~k matches after the filter trims.
+        n_fetch = k * 4 if registrant_contains else k
+        try:
+            res = col.query(query_texts=[query], n_results=n_fetch, where=where)
+        except Exception as exc:  # noqa: BLE001
+            _call.set(hits_returned=0, error=str(exc))
+            return f"_(search failed: {exc})_"
+
+        docs = res.get("documents", [[]])[0]  # one query text → take the first (only) result row
+        metas = res.get("metadatas", [[]])[0]
+        dists = res.get("distances", [[]])[0]
+
+        # Distance → similarity score (1 - d), floored at 0 for display; assumes a cosine-space collection.
+        scored: list[tuple[str, dict, float]] = []
+        for doc, meta, dist in zip(docs, metas, dists):
+            if registrant_contains:
+                reg = (meta.get("registrant") or "").upper()
+                if registrant_contains.upper() not in reg:
+                    continue
+            score = max(0.0, 1.0 - float(dist))
+            scored.append((doc, meta, score))
+            if len(scored) >= k:  # enough survivors; ignore the rest of the over-fetch
+                break
+
+        _call.set(hits_returned=len(scored))
+        if not scored:
+            return "_(no results — try broadening the query, dropping filters, or check list_versions() for valid sources/classes)_"
+
+        out: list[str] = [
+            f"# Search results for {query!r}  ({len(scored)} of top-{n_fetch} dense hits)",
+            "",
+        ]
+        for doc, meta, score in scored:
+            out.append(_format_hit(doc, meta, score))
+        return "\n".join(out)
 
 
 @mcp.tool()
 def get_page(
-    bundle_id: Annotated[str, Field(description="Bundle slug.")],
-    page_id: Annotated[str, Field(description="Page filename within the bundle.")],
+    source: Annotated[
+        str,
+        Field(description="Source id (e.g. 'bayer', 'epa_ppls'). See "
+                          "list_versions()."),
+    ],
+    source_key: Annotated[
+        str,
+        Field(description="Per-source primary key — a product slug for "
+                          "manufacturer sources ('warrant', 'huskie') or an "
+                          "EPA Reg No for EPA PPLS ('524-475')."),
+    ],
 ) -> str:
-    """Return the full markdown for one page, plus a metadata header.
+    """Return the full markdown of one pesticide label, with metadata header.
 
-    Use after search_docs surfaces a relevant page and the user (or you)
-    want the complete text — not just the matched chunks.
+    Use this after search_docs surfaces a relevant label and you (or the
+    user) want the complete text — not just the matched chunks. Useful
+    when answering nuanced questions about a specific product's
+    directions, restrictions, or tank-mix table.
     """
-    with TimedCall("get_page", {"bundle_id": bundle_id, "page_id": page_id}) as _call:
-        data = _read_page(bundle_id, page_id)
+    with TimedCall("get_page", {"source": source, "source_key": source_key}) as _call:
+        data = _read_label(source, source_key)
         if data is None:
             _call.set(found=False)
-            return f"Page not found: {bundle_id}/{page_id}"
+            return f"Label not found: {source}/{source_key}"
         md, meta = data
-        _call.set(found=True, page_chars=len(md))
-        # TODO: add a metadata header (title, version, source URL) above
-        # the body. Product-specific shape.
-        return md
+        _call.set(found=True, label_chars=len(md))
+        label = meta.get("label") or {}  # nested label record; when absent the date/PDF bullets are skipped
+        actives_list = [
+            a["name"] for a in (meta.get("active_ingredients") or [])
+            if isinstance(a, dict) and a.get("name")  # ignore entries that are not dicts with a name
+        ]
+        header_lines = [
+            f"# {meta.get('product_name') or source_key}",
+            "",
+            f"- **EPA Reg No:** {meta.get('epa_reg_no') or '(unknown)'}",
+            f"- **Source:** {source}/{source_key}",
+        ]
+        if meta.get("registrant"):
+            header_lines.append(f"- **Registrant:** {meta['registrant']}")
+        if meta.get("product_class"):
+            header_lines.append(f"- **Product class:** {meta['product_class']}")
+        if meta.get("signal_word"):
+            header_lines.append(f"- **Signal word:** {meta['signal_word']}")
+        if actives_list:
+            header_lines.append(f"- **Active ingredients:** {', '.join(actives_list)}")
+        if label.get("accepted_date"):
+            header_lines.append(f"- **Label accepted:** {label['accepted_date']}")
+        if label.get("url"):
+            header_lines.append(f"- **Label PDF:** {label['url']}")
+        header_lines.extend(["", "---", ""])  # blank line, rule, then a newline so the body starts on its own line
+        return "\n".join(header_lines) + md
 
 
 @mcp.tool()
 def list_versions() -> str:
-    """List the available version/platform facets across all bundles.
+    """List the available sources, product classes, and registrants in the corpus.
 
-    Use this to discover valid filter values for search_docs.
+    Use this to discover valid filter values for search_docs. The corpus
+    is scoped to US row-crop pesticide labels (corn / soybeans / wheat).
+
+    Despite the name (preserved for MCP-client compatibility), this
+    returns labels-domain facets — not software-version facets.
     """
     with TimedCall("list_versions", {}) as _call:
-        cat = _bundles()
-        if not cat:
-            return "_(no bundles indexed yet — run the scraper + indexer)_"
-        versions = sorted({b.get("version") for b in cat.values() if b.get("version")})
-        platforms = sorted({b.get("platform") for b in cat.values() if b.get("platform")})
-        _call.set(versions=len(versions), platforms=len(platforms))
-        lines = [f"# Facets across {len(cat)} bundle(s)", ""]
-        if versions:
-            lines.append("## Versions"); lines.append("")
-            for v in versions: lines.append(f"- `{v}`")
+        cat = _sources()
+
+        # Source-level summary from sources.json
+        lines: list[str] = ["# PPLS docs corpus"]
+
+        # Live counts from Chroma (best-effort; the server should still
+        # render a useful response if Chroma is unreachable)
+        chunk_count = label_count = None
+        try:
+            col = _collection()
+            chunk_count = col.count()
+        except Exception as exc:  # noqa: BLE001
+            log.debug("Chroma chunk count unavailable: %s", exc)
+        if CORPUS_ROOT.exists():
+            label_count = sum(
+                1 for p in CORPUS_ROOT.glob("*/*.json")
+                if not p.name.startswith(".")
+            )
+
+        if chunk_count is not None or label_count is not None:
             lines.append("")
-        if platforms:
-            lines.append("## Platforms"); lines.append("")
-            for p in platforms: lines.append(f"- `{p}`")
+            if label_count is not None:
+                lines.append(f"- **Labels indexed:** {label_count:,}")
+            if chunk_count is not None:
+                lines.append(f"- **Chunks indexed:** {chunk_count:,}")
+
+        if cat:
+            lines.append("\n## Sources\n")
+            for sid, s in sorted(cat.items()):
+                title = s.get("title") or sid
+                stype = s.get("type") or ""
+                lines.append(f"- `{sid}`  *({stype})*  — {title}")
+                if s.get("scope_filter"):
+                    lines.append(f"  - scope: {s['scope_filter']}")
+        else:
+            lines.append("\n_(sources.json missing — corpus may not be initialized)_")
+
+        # Per-source facets if Chroma is reachable
+        try:
+            col = _collection()
+            # We can't enumerate distinct metadata values from Chroma directly;
+            # walk a sample to discover them. Entries without metadata (None)
+            # are skipped so one bare chunk cannot abort facet discovery.
+            sample = col.get(limit=50000, include=["metadatas"])
+            metas = sample.get("metadatas") or []
+            classes = sorted({m.get("product_class") for m in metas if m and m.get("product_class")})
+            signals = sorted({m.get("signal_word") for m in metas if m and m.get("signal_word")})
+            registrants = sorted({m.get("registrant") for m in metas if m and m.get("registrant")})
+            _call.set(sources=len(cat), classes=len(classes),
+                      signals=len(signals), registrants=len(registrants))
+            if classes:
+                lines.append("\n## Product classes\n")
+                for c in classes:
+                    lines.append(f"- `{c}`")
+            if signals:
+                lines.append("\n## Signal words\n")
+                for s in signals:
+                    lines.append(f"- `{s}`")
+            if registrants:
+                lines.append(f"\n## Registrants  ({len(registrants)})\n")
+                for r in registrants[:50]:
+                    lines.append(f"- {r}")
+                if len(registrants) > 50:
+                    lines.append(f"- _(…{len(registrants)-50} more)_")
+        except Exception as exc:  # noqa: BLE001
+            log.debug("could not sample Chroma metadata: %s", exc)
+            _call.set(sources=len(cat), classes=0, signals=0, registrants=0)
+
+        return "\n".join(lines)
+
+
+@mcp.tool()
+def corpus_status() -> str:
+    """Report label / chunk counts and index locations for the corpus.
+
+    Use to confirm the search backend is healthy, see how many labels are
+    indexed, and check which sources are currently feeding the corpus.
+    Cheap — no embedder call.
+    """
+    with TimedCall("corpus_status", {}) as _call:
+        lines: list[str] = ["# PPLS corpus status\n"]
+
+        # On-disk corpus (hidden dotfiles are skipped, matching list_versions())
+        labels_by_source: dict[str, int] = {}
+        if CORPUS_ROOT.exists():
+            for source_dir in sorted(CORPUS_ROOT.iterdir()):
+                if not source_dir.is_dir() or source_dir.name.startswith("."):
+                    continue
+                n = sum(1 for p in source_dir.glob("*.json") if not p.name.startswith("."))
+                if n:
+                    labels_by_source[source_dir.name] = n
+        else:
+            lines.append(f"_(corpus root {CORPUS_ROOT} doesn't exist)_")
+            _call.set(labels=0, chunks=0, sources=0)
+            return "\n".join(lines)
+
+        total_labels = sum(labels_by_source.values())
+        lines.append(f"- **Corpus root:** `{CORPUS_ROOT}`")
+        lines.append(f"- **Total labels on disk:** {total_labels:,}")
+
+        # Chroma
+        try:
+            col = _collection()
+            chunks = col.count()
+            lines.append(f"- **Chunks in Chroma:** {chunks:,}")
+            lines.append(f"- **Chroma dir:** `{CHROMA_DIR}`")
+            lines.append(f"- **Collection:** `{COLLECTION}`")
+        except Exception as exc:  # noqa: BLE001
+            chunks = 0
+            lines.append(f"- **Chroma:** _unavailable_ ({exc})")
+
+        # BM25
+        if BM25_DB.exists():
+            lines.append(f"- **BM25 db:** `{BM25_DB}`  ({BM25_DB.stat().st_size / 1024 / 1024:.0f} MB)")
+        else:
+            lines.append("- **BM25 db:** _not built_")
+
+        if labels_by_source:
+            lines.append("\n## Labels per source\n")
+            for src, n in sorted(labels_by_source.items(), key=lambda kv: -kv[1]):
+                lines.append(f"- `{src}`: {n:,} labels")
+
+        # Active feature flags
+        flags = []
+        if RERANK_URL:
+            flags.append(f"RERANK_URL=`{RERANK_URL}`")
+        if HYBRID_SEARCH:
+            flags.append("HYBRID_SEARCH=on")
+        if flags:
+            lines.append("\n## Active feature flags\n")
+            for f in flags:
+                lines.append(f"- {f}")
+
+        _call.set(labels=total_labels, chunks=chunks, sources=len(labels_by_source))
         return "\n".join(lines)
 
 
@@ -205,27 +497,12 @@ def list_versions() -> str:
 # don't lose the contracts. Implementations come per phase.
 # ---------------------------------------------------------------------------
 
-# @mcp.tool()  # Phase 9
-# def list_cluster(bundle_id: str, page_id: str) -> str: ...
-
-# @mcp.tool()  # Phase 9
-# def diff_versions(bundle_id: str, page_id: str, against_bundle_id: str, context: int = 3) -> str: ...
-
-# @mcp.tool()  # Phase 9
-# def bundle_changelog(bundle_id_new: str, bundle_id_old: str, min_churn: int = 5, max_changed: int = 50) -> str: ...
-
-# @mcp.tool()  # Phase 13
-# def weekly_digest(days: int = 7, version: str | None = None, platform: str | None = None, ...) -> str: ...
-
-# @mcp.tool()  # Phase 9 (or 3 — useful early)
-# def corpus_status() -> str: ...
-
-# @mcp.tool()  # Phase 11
-# def myproduct_api_lessons(topic: str | None = None) -> str: ...
-
 # @mcp.tool()  # Phase 12
 # def find_doc_inconsistencies(scope_query: str, ...) -> str: ...
 
+# @mcp.tool()  # Phase 11
+# def ppls_label_lessons(topic: str | None = None) -> str: ...
+
 
 # ===========================================================================
 # Entry point
@@ -240,13 +517,14 @@ def main() -> None:
     p.add_argument("--port", type=int, default=int(os.environ.get("MCP_PORT", "8000")))
     args = p.parse_args()
 
+    logging.basicConfig(level=logging.INFO,
+                        format="%(asctime)s  %(levelname)s  %(name)s  %(message)s")
+
     if args.transport == "stdio":
         mcp.run()
     else:
         mcp.settings.host = args.host
         mcp.settings.port = args.port
-        # DNS-rebinding protection defaults to localhost-only — disable for
-        # container-network DNS hostnames. See PLAN.md "Hosting" notes.
         if os.environ.get("MCP_DISABLE_DNS_REBINDING_PROTECTION") in {"1", "true", "yes"}:
             mcp.settings.transport_security.enable_dns_rebinding_protection = False
         mcp.run(transport=args.transport)