# hvm-docs/docs_mcp/server.py

"""MCP server skeleton — fill in PRODUCT_NAME and the tool bodies.

This file is the template's structural anchor. The phases described in
PLAN.md add or extend pieces of this file:

  Phase 3  — search_docs, get_page, list_versions stubs (you are here)
  Phase 6  — reranker integration in search_docs
  Phase 8  — BM25 + hybrid retrieval (HYBRID_SEARCH env gate, _rrf_fuse)
  Phase 9  — diff_versions, list_cluster, bundle_changelog
  Phase 10 — TimedCall wiring (already imported below)
  Phase 11 — <product>_api_lessons tool
  Phase 12 — find_doc_inconsistencies, submit_doc_bug
  Phase 13 — weekly_digest + _digest_history reader

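Tools that are not implemented yet either `raise NotImplementedError`
(search_docs) or are kept as commented-out signatures near the end of
this file. Replace the body, or uncomment and implement the stub, when
you reach the corresponding phase. Keep the signatures stable across
products — clients depend on them.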
"""
from __future__ import annotations

import json
import logging
import os
import re
from pathlib import Path
from typing import Annotated

from mcp.server.fastmcp import FastMCP
from pydantic import Field

from .usage import TimedCall

log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Per-product settings. Each value can be overridden through the environment
# variable of the same name; adjust the fallbacks for every new build.
# ---------------------------------------------------------------------------
PRODUCT_NAME = os.getenv("PRODUCT_NAME", "myproduct")
PRODUCT_DOCS_URL = os.getenv("PRODUCT_DOCS_URL", "https://docs.example.com")
COLLECTION = PRODUCT_NAME + "_docs"

# Filesystem layout. ROOT is the parent of the directory that contains this
# file; the deployed container and a local dev checkout share this layout.
ROOT = Path(__file__).resolve().parent.parent
CORPUS = ROOT.joinpath("corpus")
CHROMA_DIR = ROOT.joinpath("chroma")
# BM25_DB may point elsewhere via the environment; the default file is named
# after the collection.
BM25_DB = Path(os.getenv("BM25_DB", str(ROOT.joinpath("bm25", COLLECTION + ".db"))))
BUNDLES_JSON = ROOT.joinpath("bundles.json")

# ---------------------------------------------------------------------------
# Feature flags (Phase 6 / 8 / 12 enable these as you ship each phase).
# ---------------------------------------------------------------------------
# Reranker settings (Phase 6). An unset or empty RERANK_URL becomes None;
# trailing slashes are stripped from a configured URL.
RERANK_URL = os.getenv("RERANK_URL", "").rstrip("/") or None
RERANK_POOL = int(os.getenv("RERANK_POOL", "50"))
RERANK_TIMEOUT = float(os.getenv("RERANK_TIMEOUT", "30"))

# Spellings (compared case-insensitively) that switch a boolean flag on.
_TRUTHY = ("true", "1", "yes", "on")

# Hybrid retrieval gate (Phase 8).
HYBRID_SEARCH = os.getenv("HYBRID_SEARCH", "").lower() in _TRUTHY
RRF_K = int(os.getenv("RRF_K", "60"))

# Doc-bug submission settings (Phase 12).
DOC_BUG_SUBMIT_ENABLED = os.getenv("DOC_BUG_SUBMIT_ENABLED", "").lower() in _TRUTHY
DOC_BUG_API_URL = os.getenv("DOC_BUG_API_URL", "")  # product-specific endpoint
DOC_BUG_TIMEOUT = float(os.getenv("DOC_BUG_TIMEOUT", "15"))


# ---------------------------------------------------------------------------
# FastMCP setup.
#
# The server instance is named "<PRODUCT_NAME>-docs"; every @mcp.tool()
# function in this module registers itself on it.
#
# stateless_http=True — every request creates an ephemeral session and
# discards it on return. Critical for production: clients don't get
# 404 storms when the container is recreated by Watchtower.
# ---------------------------------------------------------------------------
mcp = FastMCP(f"{PRODUCT_NAME}-docs", stateless_http=True)


# ---------------------------------------------------------------------------
# Lazy helpers — instantiate expensive things only when actually needed,
# so the server still starts when (e.g.) Ollama is briefly unreachable.
# ---------------------------------------------------------------------------

# Most recent successful load, stored as ((mtime_ns, size), {slug: bundle}).
# None until bundles.json has been read once.
_bundles_cache = None


def _bundles() -> dict[str, dict]:
    """Return the bundles.json catalog as a cached {slug: bundle_dict} mapping.

    bundles.json is the product-specific catalog written by the Phase 1
    scraper. See PLAN.md Phase 1 for the schema.

    The parsed catalog is cached and re-used until the file's modification
    time or size changes, so a re-run of the scraper is picked up without a
    restart. A missing file is never cached: it yields an empty mapping and
    is checked again on the next call.

    Returns:
        Mapping of bundle slug to bundle dict; empty when bundles.json does
        not exist. The mapping is the cached object itself, so callers must
        treat it as read-only.

    Raises:
        json.JSONDecodeError: bundles.json is not valid JSON.
        KeyError: a catalog entry has no "slug" key.
    """
    global _bundles_cache
    try:
        # One stat() both detects a missing file and yields the cache key,
        # avoiding a separate exists() check that can race with the read.
        info = BUNDLES_JSON.stat()
    except FileNotFoundError:
        return {}
    signature = (info.st_mtime_ns, info.st_size)
    cached = _bundles_cache
    if cached is not None and cached[0] == signature:
        return cached[1]
    catalog = json.loads(BUNDLES_JSON.read_text(encoding="utf-8"))
    by_slug = {bundle["slug"]: bundle for bundle in catalog}
    _bundles_cache = (signature, by_slug)
    return by_slug


def _build_where(version: str | None, platform: str | None, bundle_id: str | None) -> dict | None:
    """Translate filter args into a Chroma `where` clause."""
    conds: list[dict] = []
    if version:
        conds.append({"version": version})
    if platform:
        conds.append({"platform": platform})
    if bundle_id:
        conds.append({"bundle_id": bundle_id})
    if not conds:
        return None
    if len(conds) == 1:
        return conds[0]
    return {"$and": conds}


def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None:
    """Read a corpus page off disk. Returns (markdown_body, metadata_dict)."""
    md_path = CORPUS / bundle_id / (page_id + ".md")
    json_path = CORPUS / bundle_id / (page_id + ".json")
    if not md_path.exists() or not json_path.exists():
        return None
    return md_path.read_text(), json.loads(json_path.read_text())


# ===========================================================================
# Tools
# ===========================================================================

# Client-facing description for search_docs. Docstrings are never
# interpolated, so the product name is injected here and the text is handed
# to FastMCP explicitly instead of being taken from the docstring.
_SEARCH_DOCS_DESCRIPTION = f"""Search the {PRODUCT_NAME} docs corpus.

Returns the top-k most relevant chunks (with full source page URLs)
given a natural-language query. Optional filters narrow the search
to one version, one platform, or one bundle. Use list_versions()
first if you need to discover the available facet values.

Call this tool whenever the user asks anything that should be
answerable from the official product documentation.
"""


@mcp.tool(description=_SEARCH_DOCS_DESCRIPTION)
def search_docs(
    query: Annotated[str, Field(description=f"Natural-language query about {PRODUCT_NAME}.")],
    version: Annotated[
        str | None,
        Field(description="OPTIONAL version filter — restrict to one product version."),
    ] = None,
    platform: Annotated[
        str | None,
        Field(description="OPTIONAL platform filter. Set to one of the platforms listed by list_versions(); omit for all platforms."),
    ] = None,
    bundle_id: Annotated[
        str | None,
        Field(description="OPTIONAL bundle filter — pin to a specific doc bundle slug."),
    ] = None,
    k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10,
) -> str:
    """Search the product docs corpus (stub until Phase 2/3).

    Intended contract: return the top-k most relevant chunks, with source
    page URLs, for a natural-language query, optionally narrowed to one
    version, platform, or bundle. The text shown to MCP clients is
    _SEARCH_DOCS_DESCRIPTION, not this docstring.

    Args:
        query: Natural-language question about the product.
        version: Optional version filter.
        platform: Optional platform filter (see list_versions()).
        bundle_id: Optional bundle slug filter.
        k: Number of results to return, 1-50.

    Raises:
        NotImplementedError: always, until the Chroma query and rendering
            are implemented.
    """
    with TimedCall("search_docs", {
        "query": query, "version": version, "platform": platform,
        "bundle_id": bundle_id, "k": k,
    }) as _call:
        # TODO Phase 2-3: query Chroma collection (see rag/index.py for
        # how it was built). Render the top-k chunks as markdown with
        # source URLs.
        # TODO Phase 6: optional reranker via _rerank() if RERANK_URL set.
        # TODO Phase 8: hybrid retrieval if HYBRID_SEARCH=true — run
        # dense + BM25 in parallel, RRF-fuse, hand merged pool to rerank.
        # Record zero hits so the usage entry is complete even for the stub.
        _call.set(hits_returned=0)
        raise NotImplementedError("Phase 2/3: implement Chroma query + rendering")


@mcp.tool()
def get_page(
    bundle_id: Annotated[str, Field(description="Bundle slug.")],
    page_id: Annotated[str, Field(description="Page filename within the bundle.")],
) -> str:
    """Return the full markdown for one page, plus a metadata header.

    Use after search_docs surfaces a relevant page and the user (or you)
    want the complete text — not just the matched chunks.
    """
    # The docstring above is kept verbatim: FastMCP publishes it to clients as
    # the tool description.
    #
    # Returns the page's markdown body, or a "Page not found: ..." message when
    # _read_page() finds nothing. The outcome (found flag, body length) is
    # recorded on the TimedCall.
    with TimedCall("get_page", {"bundle_id": bundle_id, "page_id": page_id}) as _call:
        page = _read_page(bundle_id, page_id)
        if page is not None:
            body, _metadata = page
            _call.set(found=True, page_chars=len(body))
            # TODO: add a metadata header (title, version, source URL) above
            # the body. Product-specific shape.
            return body
        _call.set(found=False)
        return f"Page not found: {bundle_id}/{page_id}"


@mcp.tool()
def list_versions() -> str:
    """List the available version/platform facets across all bundles.

    Use this to discover valid filter values for search_docs.
    """
    # The docstring above is kept verbatim: FastMCP publishes it to clients as
    # the tool description.
    #
    # Output is markdown: a heading with the bundle count, then a "Versions"
    # and a "Platforms" bullet list for whichever facets have values. An empty
    # catalog yields a short placeholder message instead.
    with TimedCall("list_versions", {}) as _call:
        catalog = _bundles()
        if not catalog:
            return "_(no bundles indexed yet — run the scraper + indexer)_"

        # Distinct, non-empty facet values in sorted order.
        versions = sorted({v for v in (b.get("version") for b in catalog.values()) if v})
        platforms = sorted({p for p in (b.get("platform") for b in catalog.values()) if p})
        _call.set(versions=len(versions), platforms=len(platforms))

        out = [f"# Facets across {len(catalog)} bundle(s)", ""]
        if versions:
            out += ["## Versions", "", *(f"- `{v}`" for v in versions), ""]
        if platforms:
            out += ["## Platforms", "", *(f"- `{p}`" for p in platforms)]
        return "\n".join(out)


# ---------------------------------------------------------------------------
# Stubs for later phases — keep the signatures in this file so refactors
# don't lose the contracts. Implementations come per phase.
# ---------------------------------------------------------------------------

# @mcp.tool()  # Phase 9
# def list_cluster(bundle_id: str, page_id: str) -> str: ...

# @mcp.tool()  # Phase 9
# def diff_versions(bundle_id: str, page_id: str, against_bundle_id: str, context: int = 3) -> str: ...

# @mcp.tool()  # Phase 9
# def bundle_changelog(bundle_id_new: str, bundle_id_old: str, min_churn: int = 5, max_changed: int = 50) -> str: ...

# @mcp.tool()  # Phase 13
# def weekly_digest(days: int = 7, version: str | None = None, platform: str | None = None, ...) -> str: ...

# @mcp.tool()  # Phase 9 (or 3 — useful early)
# def corpus_status() -> str: ...

# @mcp.tool()  # Phase 11
# def myproduct_api_lessons(topic: str | None = None) -> str: ...

# @mcp.tool()  # Phase 12
# def find_doc_inconsistencies(scope_query: str, ...) -> str: ...

# @mcp.tool()  # Phase 12
# def submit_doc_bug(page_url: str, content: str, email: str | None = None, ...) -> str: ...


# ===========================================================================
# Entry point
# ===========================================================================

def main() -> None:
    """Parse CLI arguments and run the MCP server on the chosen transport.

    Defaults come from the environment: MCP_TRANSPORT (stdio), MCP_HOST
    (0.0.0.0) and MCP_PORT (8000). With the stdio transport the server is
    run as-is; for the HTTP transports the host and port are applied to
    the FastMCP settings first.

    Setting MCP_DISABLE_DNS_REBINDING_PROTECTION to true/1/yes/on (any
    letter case, the same spellings as the other boolean flags in this
    module) turns DNS-rebinding protection off for the HTTP transports.
    """
    import argparse

    parser = argparse.ArgumentParser(description=f"{PRODUCT_NAME} docs MCP server")
    parser.add_argument("--transport", choices=["stdio", "streamable-http", "sse"],
                        default=os.environ.get("MCP_TRANSPORT", "stdio"))
    parser.add_argument("--host", default=os.environ.get("MCP_HOST", "0.0.0.0"))
    parser.add_argument("--port", type=int, default=int(os.environ.get("MCP_PORT", "8000")))
    args = parser.parse_args()

    if args.transport == "stdio":
        mcp.run()
        return

    mcp.settings.host = args.host
    mcp.settings.port = args.port
    # DNS-rebinding protection defaults to localhost-only — disable for
    # container-network DNS hostnames. See PLAN.md "Hosting" notes.
    disable_protection = os.environ.get(
        "MCP_DISABLE_DNS_REBINDING_PROTECTION", ""
    ).lower() in ("true", "1", "yes", "on")
    if disable_protection:
        # Depending on the SDK version the settings object may carry no
        # transport-security settings at all (attribute missing or None);
        # in that case there is nothing to switch off.
        security = getattr(mcp.settings, "transport_security", None)
        if security is not None:
            security.enable_dns_rebinding_protection = False
    mcp.run(transport=args.transport)


if __name__ == "__main__":
    main()