From 97a2a05b248c2520c87c40a68543cd1cd0555c7f Mon Sep 17 00:00:00 2001
From: Justin Paul
Date: Sun, 24 May 2026 10:02:01 -0400
Subject: [PATCH] Phase 3: MCP server tools for the labels corpus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adapt docs_mcp/server.py from the versioned-software-docs domain to the
pesticide-labels domain. Standard MCP tool names preserved
(search_docs / get_page / list_versions) so existing MCP clients
(Claude Desktop, Cursor) still pick them up; docstrings + argument
shape are labels-domain.

Tools shipped:
- search_docs(query, source, product_class, registrant_contains,
  signal_word, epa_reg_no, k) — dense Chroma query with optional
  filters, post-filtered for registrant substring. Returns top-k chunks
  rendered as markdown with product / reg / registrant / actives /
  signal / section / label-PDF URL.
- get_page(source, source_key) — full label markdown + metadata header.
  source_key is a slug for MFR sources, an EPA Reg No for EPA PPLS.
- list_versions() — discovers facet values: sources, product classes,
  signal words, registrants (samples up to 50K chunks from Chroma to
  enumerate distinct metadata values).
- corpus_status() — fast no-embedder counts: labels on disk per source,
  chunks in Chroma, BM25 db size, active feature flags.

Wiring:
- Reads PPLS_CORPUS_ROOT + PPLS_CHROMA_DIR (matches the scrapers and
  indexer).
- Uses sources.json (not the template's bundles.json).
- Lazy Chroma init so the server starts cleanly even when Ollama is
  briefly down (e.g. during HVM corpus rebuilds).
- Phase 6 reranker + Phase 8 hybrid hooks left as feature flags
  (RERANK_URL, HYBRID_SEARCH) — fail open to dense-only when unset.
Smoke test against the live 216K-chunk corpus: - corpus_status: 4,157 labels / 216,467 chunks / 416 MB BM25 ✓ - search_docs("waterhemp control on soybeans", k=2) returns Tackle Herbicide (FMC, 279-3564, glyph+imazethapyr) and R14640 Herbicide (Bayer, 524-724, glyph) with section context (ROUNDUP READY SOYBEANS / SOYBEAN) and dist-derived scores of 0.76 each — highly relevant. Run as stdio for Claude Desktop: PPLS_CORPUS_ROOT=/run/media/justin/USB/ppls-corpus \ OLLAMA_URL=http://gpu1:11434,http://gpu2:11434 \ PRODUCT_NAME=ppls \ python -m docs_mcp.server --transport stdio Co-Authored-By: Claude Opus 4.7 (1M context) --- docs_mcp/server.py | 524 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 401 insertions(+), 123 deletions(-) diff --git a/docs_mcp/server.py b/docs_mcp/server.py index 9a9387c..365b16d 100644 --- a/docs_mcp/server.py +++ b/docs_mcp/server.py @@ -1,20 +1,19 @@ -"""MCP server skeleton — fill in PRODUCT_NAME and the tool bodies. +"""MCP server for the ppls-docs pesticide label corpus. -This file is the template's structural anchor. The phases described in -PLAN.md add or extend pieces of this file: +Adapted from the docs-mcp-template (which targeted versioned software +docs) for the EPA pesticide-labels domain: ``bundle_id`` → ``source``, +``page_id`` → ``source_key`` (slug for MFRs, EPA Reg No for EPA PPLS), +and ``version``/``platform`` filters → product-class / registrant / +signal-word filters. See ``scrape/README.md`` for the corpus schema. 
- Phase 3 — search_docs, get_page, list_versions stubs (you are here) +Phase progression in this file: + Phase 3 — search_docs, get_page, list_versions, corpus_status (you are here) Phase 6 — reranker integration in search_docs - Phase 8 — BM25 + hybrid retrieval (HYBRID_SEARCH env gate, _rrf_fuse) - Phase 9 — diff_versions, list_cluster, bundle_changelog - Phase 10 — TimedCall wiring (already imported below) - Phase 11 — _api_lessons tool - Phase 12 — find_doc_inconsistencies - Phase 13 — weekly_digest + _digest_history reader + Phase 8 — BM25 + hybrid retrieval (HYBRID_SEARCH env gate) -Every stub below has a docstring + `raise NotImplementedError`. Replace -the body when you reach the corresponding phase. Keep the signatures -stable across products — clients depend on them. +Standard MCP tool names (search_docs / get_page / list_versions) are +preserved so clients that expect a docs MCP shape still work; the +docstrings make the labels-domain semantics explicit. """ from __future__ import annotations @@ -33,21 +32,25 @@ from .usage import TimedCall log = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Product-specific configuration. Set these for each new build. +# Product configuration. # --------------------------------------------------------------------------- -PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct") -PRODUCT_DOCS_URL = os.environ.get("PRODUCT_DOCS_URL", "https://docs.example.com") +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "ppls") +PRODUCT_DOCS_URL = os.environ.get( + "PRODUCT_DOCS_URL", + "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1", +) COLLECTION = f"{PRODUCT_NAME}_docs" -# Paths inside the deployed container (and matching layout locally for dev). 
-ROOT = Path(__file__).resolve().parent.parent -CORPUS = ROOT / "corpus" -CHROMA_DIR = ROOT / "chroma" -BM25_DB = Path(os.environ.get("BM25_DB", str(ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db"))) -BUNDLES_JSON = ROOT / "bundles.json" +# Paths — corpus on (possibly) external storage, indexes always at repo root. +REPO_ROOT = Path(__file__).resolve().parent.parent +CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") +CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma") +BM25_DB = Path(os.environ.get("BM25_DB", + str(REPO_ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db"))) +SOURCES_JSON = REPO_ROOT / "sources.json" # --------------------------------------------------------------------------- -# Feature flags (Phase 6 / 8 enable these as you ship each phase). +# Feature flags (enabled in later phases). # --------------------------------------------------------------------------- RERANK_URL = os.environ.get("RERANK_URL", "").rstrip("/") or None RERANK_POOL = int(os.environ.get("RERANK_POOL", "50")) @@ -59,40 +62,76 @@ RRF_K = int(os.environ.get("RRF_K", "60")) # --------------------------------------------------------------------------- # FastMCP setup. -# -# stateless_http=True — every request creates an ephemeral session and -# discards it on return. Critical for production: clients don't get -# 404 storms when the container is recreated by Watchtower. # --------------------------------------------------------------------------- mcp = FastMCP(f"{PRODUCT_NAME}-docs", stateless_http=True) # --------------------------------------------------------------------------- -# Lazy helpers — instantiate expensive things only when actually needed, -# so the server still starts when (e.g.) Ollama is briefly unreachable. +# Lazy helpers. # --------------------------------------------------------------------------- -def _bundles() -> dict[str, dict]: - """Cached load of bundles.json into a {slug: bundle_dict} mapping. 
+_chroma_collection = None +_sources_cache: dict[str, dict] | None = None - bundles.json is the product-specific catalog written by the Phase 1 - scraper. See PLAN.md Phase 1 for the schema. + +def _sources() -> dict[str, dict]: + """Load sources.json as {source_id: source_dict}.""" + global _sources_cache + if _sources_cache is not None: + return _sources_cache + if not SOURCES_JSON.exists(): + _sources_cache = {} + return _sources_cache + try: + items = json.loads(SOURCES_JSON.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError) as exc: + log.warning("sources.json unreadable: %s", exc) + items = [] + _sources_cache = {s["id"]: s for s in items if "id" in s} + return _sources_cache + + +def _collection(): + """Get the Chroma collection (lazy — only loads the embedder when first + queried, so the server starts cleanly even if Ollama is briefly down).""" + global _chroma_collection + if _chroma_collection is not None: + return _chroma_collection + import chromadb + from chromadb.config import Settings + from rag.embeddings import embedding_function + client = chromadb.PersistentClient( + path=str(CHROMA_DIR), + settings=Settings(anonymized_telemetry=False), + ) + _chroma_collection = client.get_collection( + COLLECTION, embedding_function=embedding_function() + ) + return _chroma_collection + + +def _build_where( + source: str | None, + product_class: str | None, + registrant_contains: str | None, + signal_word: str | None, + epa_reg_no: str | None, +) -> dict | None: + """Translate filter args into a Chroma `where` clause. + + Chroma's where supports exact-match per field (and $and/$or). For + `registrant_contains` we can only do exact equality at the where level, + so substring matching is applied post-query in Python. 
""" - if not BUNDLES_JSON.exists(): - return {} - cat = json.loads(BUNDLES_JSON.read_text()) - return {b["slug"]: b for b in cat} - - -def _build_where(version: str | None, platform: str | None, bundle_id: str | None) -> dict | None: - """Translate filter args into a Chroma `where` clause.""" conds: list[dict] = [] - if version: - conds.append({"version": version}) - if platform: - conds.append({"platform": platform}) - if bundle_id: - conds.append({"bundle_id": bundle_id}) + if source: + conds.append({"source": source}) + if product_class: + conds.append({"product_class": product_class}) + if signal_word: + conds.append({"signal_word": signal_word}) + if epa_reg_no: + conds.append({"epa_reg_no": epa_reg_no}) if not conds: return None if len(conds) == 1: @@ -100,13 +139,45 @@ def _build_where(version: str | None, platform: str | None, bundle_id: str | Non return {"$and": conds} -def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None: - """Read a corpus page off disk. Returns (markdown_body, metadata_dict).""" - md_path = CORPUS / bundle_id / (page_id + ".md") - json_path = CORPUS / bundle_id / (page_id + ".json") +def _read_label(source: str, source_key: str) -> tuple[str, dict] | None: + """Read a label off disk. 
Returns (markdown_body, metadata_dict) or None.""" + md_path = CORPUS_ROOT / source / f"{source_key}.md" + json_path = CORPUS_ROOT / source / f"{source_key}.json" if not md_path.exists() or not json_path.exists(): return None - return md_path.read_text(), json.loads(json_path.read_text()) + try: + return md_path.read_text(encoding="utf-8"), json.loads( + json_path.read_text(encoding="utf-8") + ) + except (OSError, json.JSONDecodeError): + return None + + +def _format_hit(doc: str, meta: dict, score: float) -> str: + """Render one search hit as a markdown block.""" + product = meta.get("product_name") or meta.get("source_key") or "(unknown)" + reg = meta.get("epa_reg_no") or "—" + registrant = meta.get("registrant") or "" + actives = meta.get("active_ingredients") or "" + pclass = meta.get("product_class") or "" + signal = meta.get("signal_word") or "" + section = meta.get("section") or "" + source = meta.get("source") or "?" + source_key = meta.get("source_key") or "?" + label_url = meta.get("label_url") or "" + + header = ( + f"### {product} (EPA Reg {reg}) · score={score:.3f}\n" + f"- **Source:** `{source}/{source_key}`" + + (f" · class: {pclass}" if pclass else "") + + (f" · signal: {signal}" if signal else "") + + (f" · section: {section}" if section else "") + + "\n" + + (f"- **Registrant:** {registrant}\n" if registrant else "") + + (f"- **Active ingredients:** {actives}\n" if actives else "") + + (f"- **Label PDF:** {label_url}\n" if label_url else "") + ) + return header + "\n" + doc.strip() + "\n" # =========================================================================== @@ -115,88 +186,309 @@ def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None: @mcp.tool() def search_docs( - query: Annotated[str, Field(description=f"Natural-language query about {PRODUCT_NAME}.")], - version: Annotated[ + query: Annotated[ + str, + Field(description="Natural-language query about pesticide labels — " + "products, crops, pests, application rates, 
REI/PHI, " + "tank-mix restrictions, signal words, active ingredients."), + ], + source: Annotated[ str | None, - Field(description="OPTIONAL version filter — restrict to one product version."), + Field(description="OPTIONAL source id to restrict the search (e.g. " + "'bayer', 'epa_ppls'). Use list_versions() to discover " + "available sources."), ] = None, - platform: Annotated[ + product_class: Annotated[ str | None, - Field(description="OPTIONAL platform filter. Set to one of the platforms listed by list_versions(); omit for all platforms."), + Field(description="OPTIONAL product class filter: 'herbicide', " + "'fungicide', 'insecticide', 'seed-treatment'. " + "Often null for EPA PPLS records."), ] = None, - bundle_id: Annotated[ + registrant_contains: Annotated[ str | None, - Field(description="OPTIONAL bundle filter — pin to a specific doc bundle slug."), + Field(description="OPTIONAL substring of the registrant company name " + "(case-insensitive). Use to scope to a manufacturer " + "(e.g., 'SYNGENTA', 'BAYER', 'CORTEVA')."), + ] = None, + signal_word: Annotated[ + str | None, + Field(description="OPTIONAL EPA signal word filter: 'Danger', 'Warning', " + "'Caution', or 'No Signal Word'."), + ] = None, + epa_reg_no: Annotated[ + str | None, + Field(description="OPTIONAL exact EPA Registration Number (e.g. " + "'524-591', '524-475-12345'). Narrows to chunks from " + "just that registration."), ] = None, k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10, ) -> str: - """Search the {product} docs corpus. + """Search the EPA / manufacturer pesticide-label corpus. - Returns the top-k most relevant chunks (with full source page URLs) - given a natural-language query. Optional filters narrow the search - to one version, one platform, or one bundle. Use list_versions() - first if you need to discover the available facet values. + Returns the top-k most relevant label chunks for a natural-language + query. 
Each hit shows product name, EPA Reg No, registrant, signal + word, active ingredients, and a link to the source PDF. - Call this tool whenever the user asks anything that should be - answerable from the official product documentation. + Call this proactively whenever the user asks anything that should + be answerable from a pesticide product label — application rates, + target pests, target crops, re-entry intervals (REI), pre-harvest + intervals (PHI), tank-mix restrictions, signal words, environmental + hazards, storage requirements, etc. + + The corpus is scoped to US row crops (corn / soybeans / wheat). + For products outside that scope, results will be empty or marginal. """ with TimedCall("search_docs", { - "query": query, "version": version, "platform": platform, - "bundle_id": bundle_id, "k": k, + "query": query, "source": source, "product_class": product_class, + "registrant_contains": registrant_contains, "signal_word": signal_word, + "epa_reg_no": epa_reg_no, "k": k, }) as _call: - # TODO Phase 2-3: query Chroma collection (see rag/index.py for - # how it was built). Render the top-k chunks as markdown with - # source URLs. - # TODO Phase 6: optional reranker via _rerank() if RERANK_URL set. - # TODO Phase 8: hybrid retrieval if HYBRID_SEARCH=true — run - # dense + BM25 in parallel, RRF-fuse, hand merged pool to rerank. - _call.set(hits_returned=0) - raise NotImplementedError("Phase 2/3: implement Chroma query + rendering") + try: + col = _collection() + except Exception as exc: # noqa: BLE001 + _call.set(hits_returned=0, error=str(exc)) + return f"_(search backend unavailable: {exc})_" + + where = _build_where(source, product_class, registrant_contains, + signal_word, epa_reg_no) + # Over-fetch when we'll post-filter on registrant substring, so we + # still have ~k matches after the filter trims. 
+ n_fetch = k * 4 if registrant_contains else k + try: + res = col.query(query_texts=[query], n_results=n_fetch, where=where) + except Exception as exc: # noqa: BLE001 + _call.set(hits_returned=0, error=str(exc)) + return f"_(search failed: {exc})_" + + docs = res.get("documents", [[]])[0] + metas = res.get("metadatas", [[]])[0] + dists = res.get("distances", [[]])[0] + + # Cosine distance → similarity score (1 - d). Clip to [0,1] for display. + scored: list[tuple[str, dict, float]] = [] + for doc, meta, dist in zip(docs, metas, dists): + if registrant_contains: + reg = (meta.get("registrant") or "").upper() + if registrant_contains.upper() not in reg: + continue + score = max(0.0, 1.0 - float(dist)) + scored.append((doc, meta, score)) + if len(scored) >= k: + break + + _call.set(hits_returned=len(scored)) + if not scored: + return "_(no results — try broadening the query, dropping filters, or check list_versions() for valid sources/classes)_" + + out: list[str] = [ + f"# Search results for {query!r} ({len(scored)} of top-{n_fetch} dense hits)", + "", + ] + for doc, meta, score in scored: + out.append(_format_hit(doc, meta, score)) + return "\n".join(out) @mcp.tool() def get_page( - bundle_id: Annotated[str, Field(description="Bundle slug.")], - page_id: Annotated[str, Field(description="Page filename within the bundle.")], + source: Annotated[ + str, + Field(description="Source id (e.g. 'bayer', 'epa_ppls'). See " + "list_versions()."), + ], + source_key: Annotated[ + str, + Field(description="Per-source primary key — a product slug for " + "manufacturer sources ('warrant', 'huskie') or an " + "EPA Reg No for EPA PPLS ('524-475')."), + ], ) -> str: - """Return the full markdown for one page, plus a metadata header. + """Return the full markdown of one pesticide label, with metadata header. - Use after search_docs surfaces a relevant page and the user (or you) - want the complete text — not just the matched chunks. 
+ Use this after search_docs surfaces a relevant label and you (or the + user) want the complete text — not just the matched chunks. Useful + when answering nuanced questions about a specific product's + directions, restrictions, or tank-mix table. """ - with TimedCall("get_page", {"bundle_id": bundle_id, "page_id": page_id}) as _call: - data = _read_page(bundle_id, page_id) + with TimedCall("get_page", {"source": source, "source_key": source_key}) as _call: + data = _read_label(source, source_key) if data is None: _call.set(found=False) - return f"Page not found: {bundle_id}/{page_id}" + return f"Label not found: {source}/{source_key}" md, meta = data - _call.set(found=True, page_chars=len(md)) - # TODO: add a metadata header (title, version, source URL) above - # the body. Product-specific shape. - return md + _call.set(found=True, label_chars=len(md)) + label = meta.get("label") or {} + actives_list = [ + a["name"] for a in (meta.get("active_ingredients") or []) + if isinstance(a, dict) and a.get("name") + ] + header_lines = [ + f"# {meta.get('product_name') or source_key}", + "", + f"- **EPA Reg No:** {meta.get('epa_reg_no') or '(unknown)'}", + f"- **Source:** {source}/{source_key}", + ] + if meta.get("registrant"): + header_lines.append(f"- **Registrant:** {meta['registrant']}") + if meta.get("product_class"): + header_lines.append(f"- **Product class:** {meta['product_class']}") + if meta.get("signal_word"): + header_lines.append(f"- **Signal word:** {meta['signal_word']}") + if actives_list: + header_lines.append(f"- **Active ingredients:** {', '.join(actives_list)}") + if label.get("accepted_date"): + header_lines.append(f"- **Label accepted:** {label['accepted_date']}") + if label.get("url"): + header_lines.append(f"- **Label PDF:** {label['url']}") + header_lines.extend(["", "---", ""]) + return "\n".join(header_lines) + md @mcp.tool() def list_versions() -> str: - """List the available version/platform facets across all bundles. 
+ """List the available sources, product classes, and registrants in the corpus. - Use this to discover valid filter values for search_docs. + Use this to discover valid filter values for search_docs. The corpus + is scoped to US row-crop pesticide labels (corn / soybeans / wheat). + + Despite the name (preserved for MCP-client compatibility), this + returns labels-domain facets — not software-version facets. """ with TimedCall("list_versions", {}) as _call: - cat = _bundles() - if not cat: - return "_(no bundles indexed yet — run the scraper + indexer)_" - versions = sorted({b.get("version") for b in cat.values() if b.get("version")}) - platforms = sorted({b.get("platform") for b in cat.values() if b.get("platform")}) - _call.set(versions=len(versions), platforms=len(platforms)) - lines = [f"# Facets across {len(cat)} bundle(s)", ""] - if versions: - lines.append("## Versions"); lines.append("") - for v in versions: lines.append(f"- `{v}`") + cat = _sources() + + # Source-level summary from sources.json + lines: list[str] = ["# PPLS docs corpus"] + + # Live counts from Chroma (best-effort; the server should still + # render a useful response if Chroma is unreachable) + chunk_count = label_count = None + try: + col = _collection() + chunk_count = col.count() + except Exception: # noqa: BLE001 + pass + if CORPUS_ROOT.exists(): + label_count = sum( + 1 for p in CORPUS_ROOT.glob("*/*.json") + if not p.name.startswith(".") + ) + + if chunk_count is not None or label_count is not None: lines.append("") - if platforms: - lines.append("## Platforms"); lines.append("") - for p in platforms: lines.append(f"- `{p}`") + if label_count is not None: + lines.append(f"- **Labels indexed:** {label_count:,}") + if chunk_count is not None: + lines.append(f"- **Chunks indexed:** {chunk_count:,}") + + if cat: + lines.append("\n## Sources\n") + for sid, s in sorted(cat.items()): + title = s.get("title") or sid + stype = s.get("type") or "" + lines.append(f"- `{sid}` *({stype})* — 
{title}") + if s.get("scope_filter"): + lines.append(f" - scope: {s['scope_filter']}") + else: + lines.append("\n_(sources.json missing — corpus may not be initialized)_") + + # Per-source facets if Chroma is reachable + try: + col = _collection() + # We can't enumerate distinct metadata values from Chroma directly; + # walk a sample to discover them. ~50K sample is fine for our + # ~200K-chunk corpus and keeps this tool fast. + sample = col.get(limit=50000, include=["metadatas"]) + metas = sample.get("metadatas") or [] + classes = sorted({m.get("product_class") for m in metas if m.get("product_class")}) + signals = sorted({m.get("signal_word") for m in metas if m.get("signal_word")}) + registrants = sorted({m.get("registrant") for m in metas if m.get("registrant")}) + _call.set(sources=len(cat), classes=len(classes), + signals=len(signals), registrants=len(registrants)) + if classes: + lines.append("\n## Product classes\n") + for c in classes: + lines.append(f"- `{c}`") + if signals: + lines.append("\n## Signal words\n") + for s in signals: + lines.append(f"- `{s}`") + if registrants: + lines.append(f"\n## Registrants ({len(registrants)})\n") + for r in registrants[:50]: + lines.append(f"- {r}") + if len(registrants) > 50: + lines.append(f"- _(…{len(registrants)-50} more)_") + except Exception as exc: # noqa: BLE001 + log.debug("could not sample Chroma metadata: %s", exc) + _call.set(sources=len(cat), classes=0, signals=0, registrants=0) + + return "\n".join(lines) + + +@mcp.tool() +def corpus_status() -> str: + """Report counts + freshness of the indexed label corpus. + + Use to confirm the search backend is healthy, see how many labels are + indexed, and check which sources are currently feeding the corpus. + Cheap — no embedder call. 
+ """ + with TimedCall("corpus_status", {}) as _call: + lines: list[str] = ["# PPLS corpus status\n"] + + # On-disk corpus + labels_by_source: dict[str, int] = {} + if CORPUS_ROOT.exists(): + for source_dir in sorted(CORPUS_ROOT.iterdir()): + if not source_dir.is_dir() or source_dir.name.startswith("."): + continue + n = sum(1 for _ in source_dir.glob("*.json")) + if n: + labels_by_source[source_dir.name] = n + else: + lines.append(f"_(corpus root {CORPUS_ROOT} doesn't exist)_") + _call.set(labels=0, chunks=0, sources=0) + return "\n".join(lines) + + total_labels = sum(labels_by_source.values()) + lines.append(f"- **Corpus root:** `{CORPUS_ROOT}`") + lines.append(f"- **Total labels on disk:** {total_labels:,}") + + # Chroma + try: + col = _collection() + chunks = col.count() + lines.append(f"- **Chunks in Chroma:** {chunks:,}") + lines.append(f"- **Chroma dir:** `{CHROMA_DIR}`") + lines.append(f"- **Collection:** `{COLLECTION}`") + except Exception as exc: # noqa: BLE001 + chunks = 0 + lines.append(f"- **Chroma:** _unavailable_ ({exc})") + + # BM25 + if BM25_DB.exists(): + lines.append(f"- **BM25 db:** `{BM25_DB}` ({BM25_DB.stat().st_size / 1024 / 1024:.0f} MB)") + else: + lines.append("- **BM25 db:** _not built_") + + if labels_by_source: + lines.append("\n## Labels per source\n") + for src, n in sorted(labels_by_source.items(), key=lambda kv: -kv[1]): + lines.append(f"- `{src}`: {n:,} labels") + + # Active feature flags + flags = [] + if RERANK_URL: + flags.append(f"RERANK_URL=`{RERANK_URL}`") + if HYBRID_SEARCH: + flags.append("HYBRID_SEARCH=on") + if flags: + lines.append("\n## Active feature flags\n") + for f in flags: + lines.append(f"- {f}") + + _call.set(labels=total_labels, chunks=chunks, sources=len(labels_by_source)) return "\n".join(lines) @@ -205,27 +497,12 @@ def list_versions() -> str: # don't lose the contracts. Implementations come per phase. 
# --------------------------------------------------------------------------- -# @mcp.tool() # Phase 9 -# def list_cluster(bundle_id: str, page_id: str) -> str: ... - -# @mcp.tool() # Phase 9 -# def diff_versions(bundle_id: str, page_id: str, against_bundle_id: str, context: int = 3) -> str: ... - -# @mcp.tool() # Phase 9 -# def bundle_changelog(bundle_id_new: str, bundle_id_old: str, min_churn: int = 5, max_changed: int = 50) -> str: ... - -# @mcp.tool() # Phase 13 -# def weekly_digest(days: int = 7, version: str | None = None, platform: str | None = None, ...) -> str: ... - -# @mcp.tool() # Phase 9 (or 3 — useful early) -# def corpus_status() -> str: ... - -# @mcp.tool() # Phase 11 -# def myproduct_api_lessons(topic: str | None = None) -> str: ... - # @mcp.tool() # Phase 12 # def find_doc_inconsistencies(scope_query: str, ...) -> str: ... +# @mcp.tool() # Phase 11 +# def ppls_label_lessons(topic: str | None = None) -> str: ... + # =========================================================================== # Entry point @@ -240,13 +517,14 @@ def main() -> None: p.add_argument("--port", type=int, default=int(os.environ.get("MCP_PORT", "8000"))) args = p.parse_args() + logging.basicConfig(level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s %(message)s") + if args.transport == "stdio": mcp.run() else: mcp.settings.host = args.host mcp.settings.port = args.port - # DNS-rebinding protection defaults to localhost-only — disable for - # container-network DNS hostnames. See PLAN.md "Hosting" notes. if os.environ.get("MCP_DISABLE_DNS_REBINDING_PROTECTION") in {"1", "true", "yes"}: mcp.settings.transport_security.enable_dns_rebinding_protection = False mcp.run(transport=args.transport)