From fa448f94e1815b018046a221985b087417c6a6a6 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Fri, 22 May 2026 15:26:24 -0400 Subject: [PATCH] build out morpheus-docs MCP stack, mirroring hvm-docs through Phases 1-13 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initial scaffold: the docs-mcp-template clone with all the HVM-validated stack ported across, customized for Morpheus Enterprise (PRODUCT_NAME=morpheus, server name morpheus-docs). Bundles (live-discovered 2026-05-22; 1710 cataloged pages total): * morpheus_user_manual_8_1_0 sd00007510en_us 568 pages (Feb 2026) * morpheus_user_manual_8_1_1 sd00007621en_us 569 pages (Mar 2026) * morpheus_user_manual_8_1_2 sd00007732en_us 569 pages (Apr 2026) * morpheus_release_notes_8_1_0 sd00007496en_us single-doc * morpheus_release_notes_8_1_1 sd00007610en_us single-doc * morpheus_release_notes_8_1_2 sd00007733en_us single-doc * morpheus_quickspecs a50009231enw html-file (live curl_cffi against www.hpe.com; all 12+ Enterprise SKUs captured — S6E64..S6E73AAE for new/renewal/upgrade × 1/3/5-yr terms, plus services SKUs HA124A1#V38/V39 and H46SBA1). No Deployment Guide or Qualification Matrix on HPE Support for Morpheus Enterprise specifically — the only QM (sd00006551en_us) covers HVM clusters managed by Morpheus and lives in hvm-docs. 
Stack carried forward from hvm-docs: * rag/{index,chunk,embeddings,bm25}.py — including the MAX_CHARS=4000 chunk-cap fix for table-dense content * docs_mcp/{server,usage}.py — 11 MCP tools, BM25-default search, cross-encoder rerank, hybrid behind HYBRID_SEARCH=true, morpheus_api_lessons (renamed from hvm_api_lessons), env-gated submit_doc_bug * docs_mcp/api_lessons.md — Morpheus-specific scaffold covering licensing model, HVM elevation path, REST vs Plugin API, with TODO markers for sections to flesh out from real ops experience * scrape/{runner,quickspecs,changelog,bundles}.py — TOC + single-doc + html-file modes, curl_cffi Chrome120 for www.hpe.com edge bypass * eval/{retrievers,run_eval}.py + queries.jsonl scaffold (4 placeholder queries; populate after first scrape) * scripts/{rerank_server,usage_report,registry_gc}.py * .gitea/workflows/{refresh,image-only}.yml — same Gitea Actions setup zerto-docs uses (push LAN, pull public-URL, GPU Ollama pool) * deploy/docker-compose.yml — morpheus-docs-mcp service definition, shared jina-rerank sidecar, Watchtower-labeled * Dockerfile, requirements.txt, requirements-rerank.txt Verified locally: scrape produced 1599 .md pages (some TOC entries are parent-only and yield no body), 6353 chunks all under the 4 KB cap, MCP server boots and lists 11 tools cleanly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitea/workflows/image-only.yml | 109 +-- .gitea/workflows/refresh.yml | 144 ++-- bundles.json | 119 ++++ deploy/docker-compose.yml | 40 +- docs_mcp/api_lessons.md | 148 ++++ docs_mcp/server.py | 1117 +++++++++++++++++++++++++++++-- eval/queries.jsonl | 4 + eval/retrievers.py | 146 +++- eval/run_eval.py | 90 ++- rag/chunk.py | 50 +- rag/embeddings.py | 25 +- rag/index.py | 2 +- requirements-rerank.txt | 10 + requirements.txt | 8 + scrape/README.md | 66 ++ scrape/bundles.py | 200 ++++++ scrape/quickspecs.py | 194 ++++++ scrape/quickspecs/README.md | 27 + scrape/runner.py | 339 ++++++++++ scripts/__init__.py | 0 scripts/registry_gc.py | 111 +-- scripts/rerank_server.py | 120 ++++ 22 files changed, 2822 insertions(+), 247 deletions(-) create mode 100644 bundles.json create mode 100644 docs_mcp/api_lessons.md create mode 100644 eval/queries.jsonl create mode 100644 requirements-rerank.txt create mode 100644 scrape/bundles.py create mode 100644 scrape/quickspecs.py create mode 100644 scrape/quickspecs/README.md create mode 100644 scrape/runner.py create mode 100644 scripts/__init__.py create mode 100644 scripts/rerank_server.py diff --git a/.gitea/workflows/image-only.yml b/.gitea/workflows/image-only.yml index abe60b2..4e09574 100644 --- a/.gitea/workflows/image-only.yml +++ b/.gitea/workflows/image-only.yml @@ -14,21 +14,17 @@ on: workflow_dispatch: env: - REGISTRY_PUSH: : - REGISTRY_PULL: - # Image name derives from the actual repo at runtime, so a clone - # doesn't need to find/replace anything. e.g. justin/my-product-docs. - # github.* context is Gitea Actions' inherited GitHub-Actions namespace - # — values come from the Gitea server, not github.com. + # PUSH goes to the LAN endpoint (HTTP) to bypass Cloudflare's 100 MB + # body cap. PULL uses the public hostname (HTTPS). Same Gitea registry. 
+ REGISTRY_PUSH: 192.168.0.2:1234 + REGISTRY_PULL: git.jpaul.io IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }} - OLLAMA_URL: http://:11434 + # Two GPU-pinned Ollama containers on the Gitea host — same infra + # zerto-docs uses. :11435 = Titan X, :11436 = 1080 Ti. Indexer + # round-robins per batch. + OLLAMA_URLS: http://192.168.0.2:11435,http://192.168.0.2:11436 EMBED_MODEL: nomic-embed-text - # PRODUCT_NAME defaults to the repo name so a clone works without - # editing. Override here if you want a different identifier (e.g. - # repo "my-product-docs" → PRODUCT_NAME "myproduct"). Used as the - # Chroma collection name, BM25 db filename, and MCP server name — - # see docs_mcp/server.py. - PRODUCT_NAME: ${{ github.event.repository.name }} + PRODUCT_NAME: morpheus jobs: build: @@ -39,8 +35,7 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - # Full history (not shallow) so the digest-history step can - # walk git log up to --history-days back. + # Full history so digest-history can walk git log. fetch-depth: 0 - name: Set up Python @@ -54,9 +49,8 @@ jobs: python -m pip install -q -r requirements.txt - name: Refresh digest history - # Cheap (a few seconds); doesn't touch corpus content. - # Without this step, a code-only deploy would ship an - # increasingly-stale digest history relative to git. + # Cheap (few seconds). Without this step, a code-only deploy + # would ship an increasingly-stale digest history. run: | mkdir -p corpus/.digest python -m scrape.changelog \ @@ -71,42 +65,69 @@ jobs: - name: Rebuild indexes from existing corpus run: python -m rag.index --rebuild - - name: Log in to registry (LAN endpoint) - run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u "${{ github.repository_owner }}" --password-stdin + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + # LAN registry is HTTP only. 
+ config-inline: | + [registry."192.168.0.2:1234"] + http = true + insecure = true - - name: Build & push image + - name: Configure registry credentials for buildx + env: + REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }} + REGISTRY_USER: ${{ github.actor }} run: | - SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12) - DATE_TAG=$(date -u +%Y.%m.%d) - docker build \ - -t "${REGISTRY_PUSH}/${IMAGE}:latest" \ - -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \ - -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \ - . - docker push "${REGISTRY_PUSH}/${IMAGE}:latest" - docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" - docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" + mkdir -p ~/.docker + AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0) + cat > ~/.docker/config.json < ${PKG}: HTTP ${code}" + body=$(cat /tmp/link.out) + case "$code" in + 201) echo "OK — newly linked" ;; + 400|409) echo "OK — already linked: ${body}" ;; + *) echo "unexpected: ${body}"; exit 1 ;; esac - name: Prune old container versions diff --git a/.gitea/workflows/refresh.yml b/.gitea/workflows/refresh.yml index ef7f504..caef74a 100644 --- a/.gitea/workflows/refresh.yml +++ b/.gitea/workflows/refresh.yml @@ -19,27 +19,25 @@ on: default: false env: - # If your registry sits behind Cloudflare with its 100 MB body cap, - # use a LAN endpoint for pushes (bypasses CF) and the public hostname - # for pulls (response bodies aren't capped). - REGISTRY_PUSH: : - REGISTRY_PULL: - # Image name derives from the actual repo at runtime, so a clone - # doesn't need to find/replace anything. e.g. justin/my-product-docs. - # github.* context is Gitea Actions' inherited GitHub-Actions namespace - # — values come from the Gitea server, not github.com. + # PUSH goes to the LAN endpoint (HTTP) to bypass Cloudflare Tunnel's + # 100 MB body cap. PULL uses the public hostname (HTTPS). Same Gitea + # registry either way — package lands under the same owner/repo. 
+ REGISTRY_PUSH: 192.168.0.2:1234 + REGISTRY_PULL: git.jpaul.io + + # Image name derives from the repo at runtime — clones don't need to + # edit this. github.* is the Gitea-Actions inherited namespace. IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }} - # Embedder. One URL per GPU; the indexer round-robins. - OLLAMA_URL: http://:11434 + # Two GPU-pinned Ollama containers on the Gitea host — same infra + # zerto-docs uses (deploy/ollama-rag.docker-compose.yml over there). + # :11435 owns the Titan X, :11436 owns the 1080 Ti; the indexer + # round-robins per batch so both cards run in parallel. The host's + # primary Ollama on :11434 is left alone for OpenWebUI etc. + OLLAMA_URLS: http://192.168.0.2:11435,http://192.168.0.2:11436 EMBED_MODEL: nomic-embed-text - # PRODUCT_NAME defaults to the repo name so a clone works without - # editing. Override here if you want a different identifier (e.g. - # repo "my-product-docs" → PRODUCT_NAME "myproduct"). Used as the - # Chroma collection name, BM25 db filename, and MCP server name — - # see docs_mcp/server.py. - PRODUCT_NAME: ${{ github.event.repository.name }} + PRODUCT_NAME: morpheus jobs: refresh: @@ -50,10 +48,12 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - # Full history — required for the digest-history step to - # walk git log. Default fetch-depth: 1 silently produces a - # 0-byte history file. + # Full history — required for digest-history. Default depth 1 + # silently produces a 0-byte history file. fetch-depth: 0 + # Set the credentials Gitea injects so we can push corpus + # commits back. Persist them across the run. 
+ token: ${{ secrets.GITEA_TOKEN }} - name: Set up Python uses: actions/setup-python@v5 @@ -89,8 +89,8 @@ jobs: - name: Commit corpus changes (if any) id: commit run: | - git config user.name "-docs-refresh" - git config user.email "actions@" + git config user.name "morpheus-docs-refresh" + git config user.email "actions@jpaul.io" git add bundles.json corpus if git diff --cached --quiet; then echo "no corpus changes — skipping reindex and image build" @@ -132,49 +132,89 @@ jobs: if: steps.commit.outputs.changed == 'true' || inputs.force_build == true run: python -m rag.index --rebuild - # ---- Build & push image ------------------------------------ - - name: Log in to registry (LAN endpoint) + # ---- Build & push image (LAN endpoint, buildx) ------------- + - name: Set up Docker Buildx if: steps.commit.outputs.changed == 'true' || inputs.force_build == true - run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u "${{ github.repository_owner }}" --password-stdin + uses: docker/setup-buildx-action@v3 + with: + # LAN registry is HTTP only. Buildkit needs an explicit + # insecure-registry config or it tries to upgrade to HTTPS. + config-inline: | + [registry."192.168.0.2:1234"] + http = true + insecure = true - - name: Build & push image + - name: Configure registry credentials for buildx + # Can't use docker/login-action against the LAN endpoint — + # the host docker daemon errors on HTTP-vs-HTTPS. Buildx reads + # ~/.docker/config.json directly, so write the auth ourselves. if: steps.commit.outputs.changed == 'true' || inputs.force_build == true - # Runner shell is /bin/sh — use cut instead of ${VAR::N}. - # Three tags: :latest (Watchtower target), : - # (rollback pin), : (human-readable). 
+ env: + REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }} + REGISTRY_USER: ${{ github.actor }} run: | - SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12) - DATE_TAG=$(date -u +%Y.%m.%d) - docker build \ - -t "${REGISTRY_PUSH}/${IMAGE}:latest" \ - -t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \ - -t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \ - . - docker push "${REGISTRY_PUSH}/${IMAGE}:latest" - docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" - docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" + mkdir -p ~/.docker + AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0) + cat > ~/.docker/config.json < ${PKG}: HTTP ${code}" + body=$(cat /tmp/link.out) + case "$code" in + 201) echo "OK — newly linked" ;; + 400|409) echo "OK — already linked: ${body}" ;; + *) echo "unexpected: ${body}"; exit 1 ;; esac # ---- Registry GC ------------------------------------------- diff --git a/bundles.json b/bundles.json new file mode 100644 index 0000000..9aef51a --- /dev/null +++ b/bundles.json @@ -0,0 +1,119 @@ +[ + { + "slug": "morpheus_user_manual_8_1_0", + "doc_id": "sd00007510en_us", + "title": "HPE Morpheus Enterprise Software Documentation v8.1.0", + "version": "8.1.0", + "platform": null, + "product": "User Manual", + "language": "en-US", + "page_count": 568, + "mode": "toc", + "abstract": "", + "dates": { + "Published": "February 2026" + }, + "landing_page": "GUID-709AAADB-A9C1-40B6-AD22-958EE7E6F312", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007510en_us" + }, + { + "slug": "morpheus_user_manual_8_1_1", + "doc_id": "sd00007621en_us", + "title": "HPE Morpheus Enterprise Software Documentation v8.1.1", + "version": "8.1.1", + "platform": null, + "product": "User Manual", + "language": "en-US", + "page_count": 569, + "mode": "toc", + "abstract": "", + "dates": { + "Published": "March 2026" + }, + "landing_page": "GUID-709AAADB-A9C1-40B6-AD22-958EE7E6F312", + "source_url": 
"https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007621en_us" + }, + { + "slug": "morpheus_user_manual_8_1_2", + "doc_id": "sd00007732en_us", + "title": "HPE Morpheus Enterprise Software Documentation v8.1.2", + "version": "8.1.2", + "platform": null, + "product": "User Manual", + "language": "en-US", + "page_count": 569, + "mode": "toc", + "abstract": "", + "dates": { + "Published": "April 2026" + }, + "landing_page": "GUID-709AAADB-A9C1-40B6-AD22-958EE7E6F312", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007732en_us" + }, + { + "slug": "morpheus_release_notes_8_1_0", + "doc_id": "sd00007496en_us", + "title": "v8.1.0 Release Notes", + "version": "8.1.0", + "platform": null, + "product": "Release Notes", + "language": "en-US", + "page_count": 1, + "mode": "single", + "abstract": "Release notes for HPE Morpheus Enterprise Software version v8.1.0", + "dates": { + "Published": "February 2026" + }, + "landing_page": "sd00007496en_us", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007496en_us" + }, + { + "slug": "morpheus_release_notes_8_1_1", + "doc_id": "sd00007610en_us", + "title": "v8.1.1 Release Notes", + "version": "8.1.1", + "platform": null, + "product": "Release Notes", + "language": "en-US", + "page_count": 1, + "mode": "single", + "abstract": "Release notes for HPE Morpheus Enterprise Software version v8.1.1", + "dates": { + "Published": "March 2026" + }, + "landing_page": "sd00007610en_us", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007610en_us" + }, + { + "slug": "morpheus_release_notes_8_1_2", + "doc_id": "sd00007733en_us", + "title": "v8.1.2 Release Notes", + "version": "8.1.2", + "platform": null, + "product": "Release Notes", + "language": "en-US", + "page_count": 1, + "mode": "single", + "abstract": "Release notes for HPE Morpheus Enterprise Software version v8.1.2", + "dates": { + "Published": "April 2026" + }, + "landing_page": "sd00007733en_us", + 
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007733en_us" + }, + { + "slug": "morpheus_quickspecs", + "doc_id": "a50009231enw", + "title": "HPE Morpheus Enterprise Software QuickSpecs", + "version": "v1", + "platform": null, + "product": "QuickSpecs", + "language": "en-US", + "page_count": 1, + "mode": "html-file", + "abstract": "", + "dates": {}, + "landing_page": "a50009231enw", + "source_url": "https://www.hpe.com/psnow/doc/a50009231enw" + } +] diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml index 0aa05a8..39691d6 100644 --- a/deploy/docker-compose.yml +++ b/deploy/docker-compose.yml @@ -1,6 +1,6 @@ # Hosting stack for a docs MCP server. # -# Replace below with your product name on first deploy. +# Replace hvm below with your product name on first deploy. # Volumes: usage logs are mounted to a host path so they survive # Watchtower-driven container recreates. # @@ -10,15 +10,15 @@ services: # The MCP server. Watchtower auto-pulls on :latest changes. - -docs-mcp: - image: //-docs-mcp:latest - container_name: -docs-mcp + morpheus-docs-mcp: + image: git.jpaul.io/justin/morpheus-docs:latest + container_name: morpheus-docs-mcp restart: unless-stopped ports: - "8000:8000" environment: - PRODUCT_NAME: "" - PRODUCT_DOCS_URL: "https://docs.example.com" + PRODUCT_NAME: "morpheus" + PRODUCT_DOCS_URL: "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007732en_us" # Streamable-HTTP transport. Stateless mode is required for # production: clients don't lose sessions when Watchtower @@ -28,19 +28,21 @@ services: MCP_PORT: "8000" # If you run MetaMCP or another gateway in front and reach - # this container via its compose DNS name (e.g. -docs-mcp:8000), + # this container via its compose DNS name (e.g. morpheus-docs-mcp:8000), # add that hostname here. "*" disables the rebind check entirely. 
- MCP_ALLOWED_HOSTS: "-docs-mcp,localhost,127.0.0.1" + MCP_ALLOWED_HOSTS: "morpheus-docs-mcp,localhost,127.0.0.1" # Phase 6 — reranker sidecar (jina-reranker-v2-base via llama.cpp). - RERANK_URL: http://-rerank:8080 + RERANK_URL: http://hvm-rerank:8080 RERANK_POOL: "200" RERANK_TIMEOUT: "30" - # Phase 8 — hybrid retrieval (BM25 + dense + RRF). Set true - # only after the eval harness shows the dense-only path - # missing technical-term queries that BM25 catches. - HYBRID_SEARCH: "true" + # Phase 8 — hybrid retrieval (BM25 + dense + RRF). + # Eval on the HVM corpus (eval/results/baseline.md, 2026-05-22) shows + # BM25-default + reranker beats hybrid on every metric (MRR 0.920 vs + # 0.875). Leaving HYBRID_SEARCH off so search_docs runs BM25-first + + # reranker; dense is the fallback when BM25 finds nothing. + HYBRID_SEARCH: "false" # Phase 10 — usage telemetry. USAGE_LOG_DIR: /app/var/logs @@ -52,9 +54,9 @@ services: # DOC_BUG_API_URL: "https://docs-be.example.com/api/feedback" volumes: # Usage logs persist across container recreates. - - ./-docs-mcp-logs:/app/var/logs + - ./morpheus-docs-mcp-logs:/app/var/logs depends_on: - - -rerank + - hvm-rerank labels: # Watchtower polls *only* containers with this label set true. com.centurylinklabs.watchtower.enable: "true" @@ -63,9 +65,13 @@ services: # Reranker sidecar — llama.cpp serving jina-reranker-v2-base. # Requires GPU access; adjust runtime/devices for your hardware. - -rerank: + # + # For dev / CPU-only hosts, swap this service for scripts/rerank_server.py + # (sentence-transformers ms-marco-MiniLM-L-6-v2). Same /v1/rerank shape, + # ~500ms/batch on CPU vs ~50ms on GPU with the jina GGUF. + hvm-rerank: image: ghcr.io/ggml-org/llama.cpp:server-cuda - container_name: -rerank + container_name: hvm-rerank restart: unless-stopped # Mount the GGUF model from the host. Download from huggingface # (gguf-org/jina-reranker-v2-base-multilingual-GGUF) first. 
diff --git a/docs_mcp/api_lessons.md b/docs_mcp/api_lessons.md new file mode 100644 index 0000000..df1b789 --- /dev/null +++ b/docs_mcp/api_lessons.md @@ -0,0 +1,148 @@ +# HPE Morpheus Enterprise — Lessons + +Notes and gotchas about running, integrating with, and licensing +**HPE Morpheus Enterprise Software** that aren't obvious from the +official docs alone. The official User Manual + Release Notes + +QuickSpecs describe the product as designed; this file is what +experienced operators actually learn. + +> Treat this as living context. Update it when you (or the LLM +> driving this MCP) discover something non-obvious that the docs +> don't say or don't make findable. Each section is an H2 so the +> `morpheus_api_lessons(topic=...)` tool can return just the +> relevant piece. + +## TL;DR + +- **Morpheus Enterprise is the full cloud-management platform.** HPE + Morpheus VM Essentials (HVM) is the VM-only subset; Morpheus + Enterprise is what you "elevate to" when you need multi-cloud, + containers, automation, policy, FinOps, ITSM integration, and + self-service catalogs. The relationship is one-way upgrade. +- **Licensing is per physical CPU socket** on connected on-prem + clouds (bare metal, hypervisor hosts, Kubernetes worker nodes). + Public-cloud workloads (AWS / Azure / GCP / OCI) are factored at + **15 workloads per socket** equivalent. +- **All license SKUs include Tech Care Essentials 24×7** as part + of the license cost. There is no separate purchase for support + on the license tier. +- **`morpheus_quickspecs` is the source of truth for SKUs.** Don't + guess part numbers; query the QuickSpecs bundle. + +## Licensing and SKUs + +**Source of truth: the `morpheus_quickspecs` bundle.** Query it for +the current SKU list — the catalog updates more often than this +file does. 
+ +Pricing model summary (from QuickSpecs v1, 2026): + +- **Per physical CPU socket** for connected on-prem clouds — + KVM/HVM hosts, VMware ESXi hosts, bare metal servers, Kubernetes + worker nodes. Count the **sockets**, not the cores; not the VMs. +- **Public cloud workloads factor at 15:1** — one socket of license + covers up to 15 public-cloud workloads (instances) across AWS, + Azure, GCP, OCI. +- **Term-based** licensing (not perpetual). 1, 3, and 5-year terms + on E-LTU SKUs. +- **All include HPE Tech Care Essentials** (24×7 support, 15-minute + response for severity-1) bundled into the license cost. + +> The exact ratios and SKU names can change between QuickSpecs +> revisions. Use the `morpheus_quickspecs` tool / bundle for current +> values rather than memorizing. + +## Elevation from HVM + +The "elevate to Morpheus Enterprise" path is the canonical journey +for customers who started on HVM and outgrew it: + +- **HVM clusters keep working unchanged after elevation.** You + don't redeploy the manager; you upgrade-in-place using a + Morpheus Enterprise license. +- **What changes:** the manager UI unlocks the full Enterprise + feature set — public-cloud integrations, container/Kubernetes + management, blueprints/catalogs, automation workflows, policy + engine, FinOps cost dashboards, ITSM connectors (ServiceNow etc.), + and the full REST API surface. +- **Existing HVM-tier work products survive the elevation:** + Instance backups, network pools, storage providers, user + accounts, integrations, scheduled jobs, etc. + +The HVM User Manual page `Elevating to HPE Morpheus Enterprise` +(GUID-ECCA4FDD-37C8-45CE-A71F-C6E73B3BA713) walks through the procedure. +See also the sibling `hvm-docs` MCP's +`hvm_user_manual_8_1_*` bundles. + +## API surface — Plugin vs REST + +Morpheus exposes two completely separate extensibility surfaces: + +- **REST API** at `https://{appliance-host}/api/` — external automation + and integration. 
Bearer-token authentication; tokens issued from + the user profile → API tokens UI. Full Enterprise API surface + available (vs HVM-only managers which 404 on Enterprise-only + endpoints). +- **Plugin API** — server-side extensions that load INTO the + manager process. Versioned independently of the platform + (Plugin API version listed in the Release Notes for each + Morpheus version). A plugin built for Plugin API 1.3.x may not + load on 1.4.x without changes. + +**TODO — fill in real operational lessons as we accumulate them.** + +## Multi-cloud onboarding + +**TODO.** Each cloud (AWS, Azure, GCP, OCI, VMware vSphere, KVM/HVM, +OpenStack, Nutanix, etc.) has its own onboarding ritual: credentials, +networking, IAM roles, regions, storage providers, image catalogs. +Search the User Manual: `search_docs(query="Add AWS cloud +integration")`, `search_docs(query="Azure subscription cost")`, etc. + +## Tenancy, RBAC, and groups + +**TODO.** Morpheus Enterprise tenancy is one of the more complex areas +— tenants, roles, groups, account groups, persona-based access. +Lessons specific to "what surprised me" go here. + +## Backups + +**TODO.** Morpheus Enterprise inherits the backup framework HVM +introduced (Storage Buckets, Execution Schedules, Backup Jobs) +and adds: cloud-native backup integrations (AWS Backup, Azure +Backup), per-instance backup policies via the policy engine, +ServiceNow-driven backup orchestration. Document the gotchas you +hit. + +## Common operational gotchas + +**TODO.** This is where the "experienced operator hallway +conversation" notes go. Examples to seed (delete or replace as you +learn): + +- **Service plan vs Instance type** — same concept, different + contexts. A service plan is the sizing template ("small / medium + / large with these CPU/RAM"); an instance type is what you + provision FROM the plan. Operators conflate them. 
+- **Cloud integration credentials are tenant-scoped, not + global.** Adding a credential at the master tenant doesn't + cascade — sub-tenants need their own (or the policy engine + granting access). +- **Policy engine vs Logic library** — both live under + Library/Automation, both can gate provisioning. Policies are + preventive (block bad config), logic is generative (run scripts + on lifecycle events). Pick the right tool. + +## Adding to this doc + +Two ways: + +1. Manually edit `docs_mcp/api_lessons.md` in this repo and commit. + The next image build picks it up. +2. Use `submit_doc_bug` for upstream issues, and append the + takeaway here once the docs team responds. + +The point of this doc is to surface the kind of context an +experienced operator would mention in a hallway conversation but +that doesn't quite fit anywhere in the formal product docs. Keep +sections tight — one H2 = one topic the LLM can return on demand. diff --git a/docs_mcp/server.py b/docs_mcp/server.py index 28b1345..d86a0a3 100644 --- a/docs_mcp/server.py +++ b/docs_mcp/server.py @@ -18,6 +18,8 @@ stable across products — clients depend on them. """ from __future__ import annotations +import datetime as _dt +import difflib import json import logging import os @@ -35,8 +37,11 @@ log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Product-specific configuration. Set these for each new build. # --------------------------------------------------------------------------- -PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct") -PRODUCT_DOCS_URL = os.environ.get("PRODUCT_DOCS_URL", "https://docs.example.com") +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "morpheus") +PRODUCT_DOCS_URL = os.environ.get( + "PRODUCT_DOCS_URL", + "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007732en_us", +) COLLECTION = f"{PRODUCT_NAME}_docs" # Paths inside the deployed container (and matching layout locally for dev). 
@@ -45,6 +50,8 @@ CORPUS = ROOT / "corpus" CHROMA_DIR = ROOT / "chroma" BM25_DB = Path(os.environ.get("BM25_DB", str(ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db"))) BUNDLES_JSON = ROOT / "bundles.json" +DIGEST_HISTORY_PATH = CORPUS / ".digest" / "history.jsonl" +API_LESSONS_MD = Path(__file__).resolve().parent / "api_lessons.md" # --------------------------------------------------------------------------- # Feature flags (Phase 6 / 8 / 12 enable these as you ship each phase). @@ -104,6 +111,15 @@ def _build_where(version: str | None, platform: str | None, bundle_id: str | Non return {"$and": conds} +def _where_for_bm25(version: str | None, platform: str | None, bundle_id: str | None) -> dict | None: + """BM25Index.query takes a flat dict of equality filters.""" + w: dict[str, str] = {} + if version: w["version"] = version + if platform: w["platform"] = platform + if bundle_id: w["bundle_id"] = bundle_id + return w or None + + def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None: """Read a corpus page off disk. Returns (markdown_body, metadata_dict).""" md_path = CORPUS / bundle_id / (page_id + ".md") @@ -113,6 +129,115 @@ def _read_page(bundle_id: str, page_id: str) -> tuple[str, dict] | None: return md_path.read_text(), json.loads(json_path.read_text()) +_CHROMA = None +_BM25 = None + + +def _collection(): + """Lazy Chroma collection handle. Cached after first call.""" + global _CHROMA + if _CHROMA is None: + import chromadb + from chromadb.config import Settings + from rag.embeddings import embedding_function + + client = chromadb.PersistentClient( + path=str(CHROMA_DIR), + settings=Settings(anonymized_telemetry=False), + ) + _CHROMA = client.get_collection(COLLECTION, embedding_function=embedding_function()) + return _CHROMA + + +def _bm25(): + """Lazy BM25Index handle. 
None if the FTS5 db isn't built.""" + global _BM25 + if _BM25 is None: + if not BM25_DB.exists(): + return None + try: + from rag.bm25 import BM25Index + _BM25 = BM25Index(str(BM25_DB)) + except Exception as e: # defensive: hybrid must never block dense + log.warning("BM25 unavailable, falling back to dense-only: %s", e) + return None + return _BM25 + + +def _enrich_from_chroma(col, chunk_ids: list[str], fused: list | None) -> tuple[list[str], list[dict], list[float]]: + """Fetch document text + metadata for a list of chunk ids from Chroma, in order.""" + if not chunk_ids: + return [], [], [] + g = col.get(ids=chunk_ids, include=["documents", "metadatas"]) + by_id = {i: (d, m) for i, d, m in zip(g["ids"], g["documents"], g["metadatas"])} + docs = [by_id[i][0] for i in chunk_ids if i in by_id] + metas = [by_id[i][1] for i in chunk_ids if i in by_id] + if fused is not None: + dists = [1.0 - score for _id, score, _src in fused[:len(docs)]] + else: + dists = [0.0] * len(docs) + return docs, metas, dists + + +def _rerank(query: str, candidates: list[tuple[str, str]]) -> list[tuple[str, str]] | None: + """POST to RERANK_URL /v1/rerank, return candidates re-ordered by relevance. + + `candidates` is `[(chunk_id, text), ...]`. Texts are truncated to ~2000 chars + before sending so we never blow past jina-reranker's 1024-token per-pair + cap (which 400s the entire batch). The full untruncated text still goes + back to the user from Chroma; truncation is reranking-only. + + Returns None on any failure — caller treats that as "skip reranking, + keep retrieval-order candidates." 
+ """ + if not RERANK_URL or not candidates: + return None + try: + import httpx + payload = { + "query": query, + "documents": [(text or "")[:2000] for _cid, text in candidates], + "top_n": len(candidates), + } + with httpx.Client(timeout=RERANK_TIMEOUT) as c: + r = c.post(f"{RERANK_URL}/v1/rerank", json=payload) + r.raise_for_status() + results = r.json().get("results") or [] + order = [candidates[item["index"]] for item in results + if isinstance(item.get("index"), int) and 0 <= item["index"] < len(candidates)] + return order or None + except Exception as e: + log.warning("rerank failed, keeping retrieval order: %s", e) + return None + + +def _rrf_fuse(*ranked_lists: list[str], k: int = RRF_K) -> list[tuple[str, float, dict]]: + """Reciprocal Rank Fusion. Each ranked list is a sequence of ids in + descending relevance. Returns [(id, fused_score, per_retriever_contrib), ...] + sorted by score desc.""" + scores: dict[str, float] = {} + sources: dict[str, dict] = {} + names = ("dense", "bm25", "extra") + for idx, lst in enumerate(ranked_lists): + src = names[idx] if idx < len(names) else f"r{idx}" + for rank, ident in enumerate(lst, start=1): + scores[ident] = scores.get(ident, 0.0) + 1.0 / (k + rank) + sources.setdefault(ident, {})[src] = rank + ranked = sorted(scores.items(), key=lambda kv: -kv[1]) + return [(ident, score, sources[ident]) for ident, score in ranked] + + +def _source_url(bundle_id: str, page_id: str) -> str: + """Build the canonical docs portal URL for a (bundle, page) pair.""" + b = _bundles().get(bundle_id) + if not b: + return "" + doc_id = b.get("doc_id", "") + if page_id.startswith("GUID-"): + return f"https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html" + return f"https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}" + + # =========================================================================== # Tools # =========================================================================== @@ -134,7 +259,7 @@ 
def search_docs( ] = None, k: Annotated[int, Field(description="Number of results to return.", ge=1, le=50)] = 10, ) -> str: - """Search the {product} docs corpus. + """Search the HPE Morpheus Enterprise (Morpheus) docs corpus. Returns the top-k most relevant chunks (with full source page URLs) given a natural-language query. Optional filters narrow the search @@ -142,20 +267,130 @@ def search_docs( first if you need to discover the available facet values. Call this tool whenever the user asks anything that should be - answerable from the official product documentation. + answerable from the official product documentation — install, + upgrade, configuration, backups, networking, HVM clusters, the + Morpheus UI, or any 8.1.x release-notes question. """ with TimedCall("search_docs", { "query": query, "version": version, "platform": platform, "bundle_id": bundle_id, "k": k, }) as _call: - # TODO Phase 2-3: query Chroma collection (see rag/index.py for - # how it was built). Render the top-k chunks as markdown with - # source URLs. - # TODO Phase 6: optional reranker via _rerank() if RERANK_URL set. - # TODO Phase 8: hybrid retrieval if HYBRID_SEARCH=true — run - # dense + BM25 in parallel, RRF-fuse, hand merged pool to rerank. - _call.set(hits_returned=0) - raise NotImplementedError("Phase 2/3: implement Chroma query + rendering") + try: + col = _collection() + except Exception as e: + log.exception("chroma collection unavailable") + _call.set(hits_returned=0, error=str(e)) + return f"_(search backend unavailable: {e})_" + + where = _build_where(version, platform, bundle_id) + bm25_where = _where_for_bm25(version, platform, bundle_id) + pool = max(k * 5, 50) + + # Retrieval mode selection. Eval on this corpus (2026-05-22, 22 golden + # queries) showed BM25 MRR=0.88 vs dense MRR=0.54 vs hybrid MRR=0.69 — + # HPE structured docs use controlled vocabulary, so lexical match wins. + # Dense is kept as fallback when BM25 has no tokens to chew on (e.g. 
+ # purely stopword queries). HYBRID_SEARCH=true forces RRF fusion. + bm = _bm25() + docs: list[str] = [] + metas: list[dict] = [] + dists: list[float] = [] + retrieval_mode = "dense" + top1_source = "dense_only" + + if HYBRID_SEARCH and bm is not None: + try: + dense_res = col.query(query_texts=[query], n_results=pool, where=where) + dense_ids = (dense_res.get("ids") or [[]])[0] + bm_hits = bm.query(query, n=pool, where=bm25_where) + bm_ids = [cid for cid, _s in bm_hits] + fused = _rrf_fuse(dense_ids, bm_ids) + docs, metas, dists = _enrich_from_chroma(col, [c for c, _, _ in fused[:k]], fused) + if fused: + src0 = fused[0][2] + top1_source = ("both" if {"dense", "bm25"} <= set(src0) + else "bm25_only" if "bm25" in src0 + else "dense_only") + retrieval_mode = "hybrid" + except Exception as e: + log.warning("hybrid failed, falling back to BM25→dense: %s", e) + + if not docs and bm is not None: + try: + bm_hits = bm.query(query, n=k, where=bm25_where) + if bm_hits: + ids = [cid for cid, _s in bm_hits[:k]] + docs, metas, _ = _enrich_from_chroma(col, ids, None) + # FTS5 returns negative scores (lower=better). Map onto a + # similarity-ish [0..1] just for display. + dists = [max(0.0, min(1.0, 1.0 - abs(s) / 20.0)) for _id, s in bm_hits[:k]] + retrieval_mode = "bm25" + top1_source = "bm25_only" + except Exception as e: + log.warning("BM25 retrieval failed, falling back to dense: %s", e) + + if not docs: + res = col.query(query_texts=[query], n_results=k, where=where) + docs = (res.get("documents") or [[]])[0] + metas = (res.get("metadatas") or [[]])[0] + dists = (res.get("distances") or [[]])[0] + + reranker_fired = False + if RERANK_URL and docs: + # Pull a deeper pool to give the reranker something to chew on. + # We over-fetch up to RERANK_POOL chunks from whichever retriever + # already won, then ask the reranker to pick the final top-k. 
+ pool_size = max(k, RERANK_POOL) + if len(docs) < pool_size: + if retrieval_mode == "bm25": + extra = bm.query(query, n=pool_size, where=bm25_where) if bm else [] + extra_ids = [cid for cid, _s in extra] + else: + extra_res = col.query(query_texts=[query], n_results=pool_size, where=where) + extra_ids = (extra_res.get("ids") or [[]])[0] + if extra_ids: + d2, m2, _ = _enrich_from_chroma(col, extra_ids, None) + docs, metas = d2, m2 + dists = [0.0] * len(docs) + # Reranker scores chunk_ids — collapse to (id, text) tuples + pairs = list(zip( + [f"{m.get('bundle_id','')}::{m.get('page_id','')}::{m.get('ordinal',0)}" for m in metas], + docs, + )) + reranked = _rerank(query, pairs) + if reranked is not None: + # Re-sort docs/metas to match. Recompute distances as descending + # ordinal ranks so display still shows a useful score. + by_cid = {p[0]: i for i, p in enumerate(pairs)} + order = [by_cid[cid] for cid, _t in reranked if cid in by_cid] + docs = [docs[i] for i in order][:k] + metas = [metas[i] for i in order][:k] + dists = [1.0 - (rank / len(reranked)) for rank, _ in enumerate(reranked)][:len(docs)] + reranker_fired = True + else: + docs, metas, dists = docs[:k], metas[:k], dists[:k] + + _call.set(hits_returned=len(docs), retrieval_mode=retrieval_mode, + top1_source=top1_source, reranker_fired=reranker_fired) + if not docs: + return f"_No matches for `{query}`._" + + out = [f"# {len(docs)} result(s) for `{query}`", ""] + for doc, meta, dist in zip(docs, metas, dists): + bid = meta.get("bundle_id", "") + pid = meta.get("page_id", "") + title = meta.get("title") or pid + ver = meta.get("version") or "" + url = _source_url(bid, pid) + header = f"## {title}" + if ver: + header += f" _(v{ver})_" + out.append(header) + out.append(f"[{bid}/{pid}]({url}) · score={1 - dist:.3f}") + out.append("") + out.append(doc.strip()) + out.append("") + return "\n".join(out) @mcp.tool() @@ -175,9 +410,21 @@ def get_page( return f"Page not found: {bundle_id}/{page_id}" md, meta = data 
_call.set(found=True, page_chars=len(md)) - # TODO: add a metadata header (title, version, source URL) above - # the body. Product-specific shape. - return md + title = meta.get("title") or page_id + ver = meta.get("version") + parent = meta.get("parent_title") + url = _source_url(bundle_id, page_id) + header = [f"# {title}"] + ctx = [] + if ver: + ctx.append(f"version **{ver}**") + if parent: + ctx.append(f"in **{parent}**") + if ctx: + header.append("_" + " · ".join(ctx) + "_") + header.append(f"[source]({url})") + header.append("") + return "\n".join(header) + "\n" + md @mcp.tool() @@ -193,45 +440,835 @@ def list_versions() -> str: versions = sorted({b.get("version") for b in cat.values() if b.get("version")}) platforms = sorted({b.get("platform") for b in cat.values() if b.get("platform")}) _call.set(versions=len(versions), platforms=len(platforms)) + products = sorted({b.get("product") for b in cat.values() if b.get("product")}) lines = [f"# Facets across {len(cat)} bundle(s)", ""] if versions: - lines.append("## Versions"); lines.append("") - for v in versions: lines.append(f"- `{v}`") - lines.append("") + lines += ["## Versions", ""] + [f"- `{v}`" for v in versions] + [""] if platforms: - lines.append("## Platforms"); lines.append("") - for p in platforms: lines.append(f"- `{p}`") + lines += ["## Platforms", ""] + [f"- `{p}`" for p in platforms] + [""] + if products: + lines += ["## Product / doc types", ""] + [f"- {p}" for p in products] + [""] + lines += ["## Bundles", ""] + for slug in sorted(cat): + b = cat[slug] + kind = b.get("product") or "" + ver = b.get("version") + pages = b.get("page_count", "?") + label = f"{kind} {ver}".strip() if ver else kind + lines.append(f"- `{slug}` — {label} ({pages} pages)") return "\n".join(lines) -# --------------------------------------------------------------------------- -# Stubs for later phases — keep the signatures in this file so refactors -# don't lose the contracts. Implementations come per phase. 
-# --------------------------------------------------------------------------- +# =========================================================================== +# Phase 9 — cross-version tools +# =========================================================================== -# @mcp.tool() # Phase 9 -# def list_cluster(bundle_id: str, page_id: str) -> str: ... +def _bundle_pages(bundle_id: str) -> set[str]: + """Page IDs (= GUID-XXXX) on disk in a bundle. Mirrors rag.index's md_path.stem.""" + bd = CORPUS / bundle_id + if not bd.is_dir(): + return set() + return {p.stem for p in bd.glob("*.md")} -# @mcp.tool() # Phase 9 -# def diff_versions(bundle_id: str, page_id: str, against_bundle_id: str, context: int = 3) -> str: ... -# @mcp.tool() # Phase 9 -# def bundle_changelog(bundle_id_new: str, bundle_id_old: str, min_churn: int = 5, max_changed: int = 50) -> str: ... +def _diff_churn(a: str, b: str) -> tuple[int, int]: + """Cheap (added, removed) line counts for a pair of markdown bodies.""" + diff = difflib.unified_diff(a.splitlines(keepends=False), + b.splitlines(keepends=False), n=0) + added = removed = 0 + for line in diff: + if line.startswith(("+++", "---", "@@")): + continue + if line.startswith("+"): + added += 1 + elif line.startswith("-"): + removed += 1 + return added, removed -# @mcp.tool() # Phase 13 -# def weekly_digest(days: int = 7, version: str | None = None, platform: str | None = None, ...) -> str: ... -# @mcp.tool() # Phase 9 (or 3 — useful early) -# def corpus_status() -> str: ... +@mcp.tool() +def list_cluster( + bundle_id: Annotated[str, Field(description="Bundle slug of the source topic.")], + page_id: Annotated[str, Field(description="Page id (GUID-XXXX) of the source topic.")], +) -> str: + """List cross-version peers of a topic in the HVM docs. -# @mcp.tool() # Phase 11 -# def myproduct_api_lessons(topic: str | None = None) -> str: ... 
+ HPE re-mints the docId per product version but keeps page GUIDs stable, + so the scrape pipeline synthesizes `topic_cluster.clustered_topics` + from same-GUID overlap (374/376/376 pages overlap across 8.1.0/.1/.2). + """ + with TimedCall("list_cluster", {"bundle_id": bundle_id, "page_id": page_id}) as _call: + out = _read_page(bundle_id, page_id) + if out is None: + _call.set(found=False) + return f"Not found: {bundle_id}/{page_id}" + _, side = out + cluster = side.get("topic_cluster") or {} + peers = cluster.get("clustered_topics") or [] + _call.set(hits_returned=len(peers)) + src_label = cluster.get("clustering_title") or side.get("title") or page_id + lines = [f"# Cluster for {bundle_id}/{page_id} ({src_label})", ""] + if not peers: + lines.append("_No peer topics in cluster._") + return "\n".join(lines) + for p in peers: + lines.append(f"- `{p['bundle_id']}/{p['page_id']}` — {p.get('clustering_title') or ''}") + return "\n".join(lines) -# @mcp.tool() # Phase 12 -# def find_doc_inconsistencies(scope_query: str, ...) -> str: ... -# @mcp.tool() # Phase 12 -# def submit_doc_bug(page_url: str, content: str, email: str | None = None, ...) -> str: ... +@mcp.tool() +def diff_versions( + bundle_id: Annotated[str, Field(description="Bundle slug of the source topic (the 'new' side).")], + page_id: Annotated[str, Field(description="Page id of the source topic.")], + against_bundle_id: Annotated[str, Field(description="Bundle slug to diff against. Must be in the source's cluster, or share the same page_id.")], + context: Annotated[int, Field(description="Lines of context around each hunk.", ge=0, le=10)] = 3, +) -> str: + """Unified diff of one topic between two bundles (typically two HVM versions). + + Two matching strategies, tried in order: + + 1. `topic_cluster` peer (synthesized from same-GUID overlap by the scraper). + 2. Same `page_id` fallback (works because GUIDs are stable across HVM versions). 
+ """ + with TimedCall("diff_versions", { + "bundle_id": bundle_id, "page_id": page_id, + "against_bundle_id": against_bundle_id, "context": context, + }) as _call: + src = _read_page(bundle_id, page_id) + if src is None: + _call.set(matched_via=None, reason="source_not_found") + return f"Source not found: {bundle_id}/{page_id}" + src_md, side = src + cluster = side.get("topic_cluster") or {} + peers = {p["bundle_id"]: p for p in (cluster.get("clustered_topics") or [])} + + peer = peers.get(against_bundle_id) + if peer is not None: + peer_page_id = peer["page_id"] + matched_via = "topic_cluster" + elif _read_page(against_bundle_id, page_id) is not None: + peer_page_id = page_id + matched_via = "filename" + else: + _call.set(matched_via=None, reason="no_peer") + valid = list(peers) or ["(no peers)"] + return (f"No match for {bundle_id}/{page_id} in {against_bundle_id}.\n" + f"- No cluster peer. Available peers: {valid}\n" + f"- No page {page_id!r} in {against_bundle_id} either.") + + _call.set(matched_via=matched_via) + peer_data = _read_page(against_bundle_id, peer_page_id) + if peer_data is None: + return f"Peer not found in corpus: {against_bundle_id}/{peer_page_id}" + peer_md, _ = peer_data + diff = difflib.unified_diff(peer_md.splitlines(keepends=True), + src_md.splitlines(keepends=True), + fromfile=f"{against_bundle_id}/{peer_page_id}", + tofile=f"{bundle_id}/{page_id}", + n=context) + body = "".join(diff) + header = f"_matched via {matched_via}_\n\n" + if not body.strip(): + return header + f"No differences between {bundle_id}/{page_id} and {against_bundle_id}/{peer_page_id}." + return header + f"```diff\n{body}```" + + +@mcp.tool() +def bundle_changelog( + bundle_id_new: Annotated[str, Field(description="New-side bundle slug, e.g. 'hvm_user_manual_8_1_2'.")], + bundle_id_old: Annotated[str, Field(description="Old-side bundle slug, e.g. 
'hvm_user_manual_8_1_1'.")], + min_churn: Annotated[int, Field(description="Min (added + removed) lines to flag a page as changed.", ge=1, le=1000)] = 5, + max_changed: Annotated[int, Field(description="Max changed pages to list (sorted by churn desc).", ge=1, le=500)] = 50, +) -> str: + """High-level diff between two HVM bundles. + + Lists pages added, removed, and changed between an old bundle and a + new one. Match is by page_id (which is the stable GUID — same GUID + across versions = same topic). Use after `list_versions` to discover + valid bundle slugs. + """ + with TimedCall("bundle_changelog", { + "bundle_id_new": bundle_id_new, "bundle_id_old": bundle_id_old, + "min_churn": min_churn, "max_changed": max_changed, + }) as _call: + new_pages = _bundle_pages(bundle_id_new) + old_pages = _bundle_pages(bundle_id_old) + if not new_pages and not old_pages: + _call.set(reason="both_empty") + return f"Neither bundle has pages on disk: {bundle_id_new}, {bundle_id_old}" + if not new_pages: + return f"Bundle not found or empty: {bundle_id_new}" + if not old_pages: + return f"Bundle not found or empty: {bundle_id_old}" + + added = sorted(new_pages - old_pages) + removed = sorted(old_pages - new_pages) + common = sorted(new_pages & old_pages) + + changed: list[tuple[str, int, int]] = [] + for pid in common: + n = _read_page(bundle_id_new, pid) + o = _read_page(bundle_id_old, pid) + if n is None or o is None: + continue + a_lines, r_lines = _diff_churn(o[0], n[0]) + if a_lines + r_lines >= min_churn: + changed.append((pid, a_lines, r_lines)) + changed.sort(key=lambda t: -(t[1] + t[2])) + _call.set(added=len(added), removed=len(removed), + changed=len(changed), unchanged=len(common) - len(changed)) + + lines = [ + f"# Bundle changelog: {bundle_id_new} vs {bundle_id_old}", "", + f"- pages in new: **{len(new_pages)}**", + f"- pages in old: **{len(old_pages)}**", + f"- common: **{len(common)}**", + f"- **added** (in new only): {len(added)}", + f"- **removed** (in old only): 
{len(removed)}", + f"- **changed** (≥{min_churn} lines): {len(changed)} of {len(common)} common", + f"- unchanged: {len(common) - len(changed)}", "", + ] + if added: + lines += [f"## Added pages ({len(added)})", *(f"- `{p}`" for p in added), ""] + if removed: + lines += [f"## Removed pages ({len(removed)})", *(f"- `{p}`" for p in removed), ""] + if changed: + shown = changed[:max_changed] + lines += [ + f"## Changed pages — top {len(shown)} of {len(changed)} by churn", "", + "| page | +lines | -lines | total |", "|---|---|---|---|", + ] + for p, a, r in shown: + lines.append(f"| `{p}` | +{a} | -{r} | {a + r} |") + if len(changed) > max_changed: + lines.append(f"\n_({len(changed) - max_changed} more changed pages omitted; raise `max_changed` to see them.)_") + lines.append("\nInspect a specific page: `diff_versions(bundle_id_new, page_id, bundle_id_old)`.") + return "\n".join(lines) + + +# =========================================================================== +# Phase 13 — weekly digest from corpus/.digest/history.jsonl (built in CI) +# =========================================================================== + +_digest_cache: list[dict] | None = None + + +def _digest_history() -> list[dict]: + """Lazy load of the digest history JSONL written by scrape.changelog at CI time.""" + global _digest_cache + if _digest_cache is not None: + return _digest_cache + if not DIGEST_HISTORY_PATH.exists(): + log.warning("digest history not found at %s — weekly_digest will return empty.", + DIGEST_HISTORY_PATH) + _digest_cache = [] + return _digest_cache + records: list[dict] = [] + try: + with open(DIGEST_HISTORY_PATH) as fh: + for ln, line in enumerate(fh, start=1): + line = line.strip() + if not line: + continue + try: + records.append(json.loads(line)) + except json.JSONDecodeError as e: + log.warning("digest history: skipping malformed line %d: %s", ln, e) + except OSError as e: + log.warning("digest history read failed: %s", e) + _digest_cache = records + return 
_digest_cache + + +@mcp.tool() +def weekly_digest( + days: Annotated[int, Field(description="How far back to summarize. 7=last week, 30=last month. Horizon ~120 days.", ge=1, le=120)] = 7, + version: Annotated[str | None, Field(description="OPTIONAL version filter, e.g. '8.1.2'.")] = None, + platform: Annotated[str | None, Field(description="OPTIONAL platform filter (HVM bundles don't set platform — leave None).")] = None, + max_bundles: Annotated[int, Field(description="Cap on per-bundle detail blocks.", ge=1, le=100)] = 25, + max_pages_per_bundle: Annotated[int, Field(description="Pages to list per bundle.", ge=1, le=50)] = 10, +) -> str: + """Summarize what changed in the HVM docs over the past N days. + + Call when the user asks *"what's new in HVM docs this week?"*, + *"what changed in 8.1.2?"*, or *"is there anything new since the + last release?"*. Reads the pre-baked digest history JSONL written + by CI from git log over corpus-touching commits. + """ + with TimedCall("weekly_digest", { + "days": days, "version": version, "platform": platform, + "max_bundles": max_bundles, "max_pages_per_bundle": max_pages_per_bundle, + }) as _call: + records = _digest_history() + if not records: + _call.set(returned="empty_no_history", record_count=0) + return ("# Weekly digest\n\n" + f"_No digest history on this image. 
`{DIGEST_HISTORY_PATH}` is " + "missing — it's populated by the weekly refresh workflow._") + + now = _dt.datetime.now(_dt.timezone.utc) + cutoff = now - _dt.timedelta(days=days) + filtered: list[dict] = [] + for r in records: + try: + ts = _dt.datetime.fromisoformat(r["timestamp"]) + except (KeyError, ValueError): + continue + if ts.tzinfo is None: + ts = ts.replace(tzinfo=_dt.timezone.utc) + if ts >= cutoff: + filtered.append({**r, "_ts": ts}) + + if not filtered: + _call.set(returned="empty_window", record_count=0) + covers = "" + if records: + oldest = min(records, key=lambda r: r.get("timestamp", "")) + newest = max(records, key=lambda r: r.get("timestamp", "")) + covers = (f"\n\n_(History on this image covers " + f"{oldest.get('timestamp','?')[:10]} through " + f"{newest.get('timestamp','?')[:10]}.)_") + return (f"# Weekly digest — last {days} day{'s' if days != 1 else ''}\n\n" + f"_No corpus changes recorded in this window._" + covers) + + cat = _bundles() + def _passes(bid: str) -> bool: + if not (version or platform): + return True + b = cat.get(bid) + if b is None: + return False + if version and b.get("version") != version: + return False + if platform and b.get("platform") != platform: + return False + return True + + filtered.sort(key=lambda r: r["_ts"], reverse=True) + per_bundle_pages: dict[str, list[str]] = {} + new_bundles_set: set[str] = set() + drift_bundles_set: set[str] = set() + commits_in_window = 0 + for r in filtered: + commits_in_window += 1 + for bid in r.get("new_bundles", []): + if _passes(bid): + new_bundles_set.add(bid) + for bid in r.get("json_only_bundles", []): + if _passes(bid): + drift_bundles_set.add(bid) + for bid, pages in (r.get("content_bundles") or {}).items(): + if not _passes(bid): + continue + seen = set(per_bundle_pages.get(bid, [])) + fresh = [p for p in pages if p not in seen] + if fresh: + per_bundle_pages.setdefault(bid, []).extend(fresh) + + total_md = sum(len(p) for p in per_bundle_pages.values()) + bundles_ranked 
= sorted(per_bundle_pages.items(), key=lambda kv: (-len(kv[1]), kv[0])) + _call.set(returned="ok", record_count=commits_in_window, + bundles_changed=len(per_bundle_pages), + new_bundles=len(new_bundles_set)) + + ts_oldest = filtered[-1]["_ts"].date().isoformat() + ts_newest = filtered[0]["_ts"].date().isoformat() + lines = [ + f"# HVM docs digest — last {days} day{'s' if days != 1 else ''}", "", + f"_Window: {ts_oldest} → {ts_newest}_ • _Filters: version={version}, platform={platform}_", "", + "## Headline", "", + f"- **{total_md}** page change(s) across **{len(per_bundle_pages)}** bundle(s)", + f"- **{commits_in_window}** corpus-touching commit(s) in this window", + f"- **{len(new_bundles_set)}** bundle(s) newly added", + f"- **{len(drift_bundles_set)}** bundle(s) with sidecar-only drift", "", + ] + if not per_bundle_pages and not new_bundles_set: + lines.append(f"_No bundle changes matched the filter in this window._") + return "\n".join(lines) + if new_bundles_set: + lines += ["## New bundles added", ""] + for bid in sorted(new_bundles_set): + b = cat.get(bid, {}) + t = b.get("title") or "" + tag = f" *({b.get('version') or '?'})*" if b.get("version") else "" + lines.append(f"- `{bid}`{tag} {t}") + lines.append("") + if bundles_ranked: + top = bundles_ranked[:max_bundles] + remainder = len(bundles_ranked) - len(top) + lines += [f"## Bundles with content changes — top {len(top)}" + + (f" of {len(bundles_ranked)}" if remainder else ""), ""] + for bid, pages in top: + b = cat.get(bid, {}) + tag = f" *({b.get('version') or ''})*" if b.get("version") else "" + lines.append(f"### `{bid}`{tag}") + if b.get("title"): + lines.append(f"_{b['title']}_") + lines.append(f"{len(pages)} page change(s).") + for p in pages[:max_pages_per_bundle]: + lines.append(f"- `{p}`") + if len(pages) > max_pages_per_bundle: + lines.append(f" _(+{len(pages) - max_pages_per_bundle} more)_") + lines.append("") + lines.append("\nInspect a specific page: `get_page(bundle_id, page_id)` or 
`diff_versions(...)`.") + return "\n".join(lines) + + +@mcp.tool() +def corpus_status() -> str: + """Freshness + size of the knowledge base. + + Combines: (1) image build time (bundles.json mtime in container), + (2) most-recent upstream Published date across bundles, (3) total + bundles / pages / Chroma chunks. + """ + lines: list[str] = ["# Corpus status", ""] + try: + ts = _dt.datetime.fromtimestamp(BUNDLES_JSON.stat().st_mtime, tz=_dt.timezone.utc).isoformat(timespec="seconds") + lines.append(f"- This image built at: **{ts}**") + except OSError: + lines.append("- This image build time: _unknown_") + + cat = _bundles() + latest_pub: str | None = None + per_bundle: list[tuple[str, str]] = [] + for slug, b in cat.items(): + pub = (b.get("dates") or {}).get("Published") + if pub: + if latest_pub is None or pub > latest_pub: + latest_pub = pub + per_bundle.append((slug, pub)) + if latest_pub: + lines.append(f"- Most-recent upstream Published date (any bundle): **{latest_pub}**") + lines.append("") + try: + chunk_count = _collection().count() + except Exception: + chunk_count = -1 + pages_count = sum(1 for d in (CORPUS.iterdir() if CORPUS.exists() else []) + if d.is_dir() for _ in d.glob("*.md")) + lines += [ + f"- Bundles indexed: **{len(cat)}**", + f"- Pages in corpus: **{pages_count}**", + f"- Chunks in Chroma: **{chunk_count}**" if chunk_count >= 0 else "- Chunks in Chroma: _(query failed)_", + "", + ] + if per_bundle: + per_bundle.sort(key=lambda kv: kv[1], reverse=True) + lines.append("## Most-recently-edited bundles (by HPE)") + for slug, when in per_bundle[:5]: + b = cat.get(slug, {}) + lines.append(f"- `{slug}` — {b.get('title') or slug} (published {when})") + return "\n".join(lines) + + +# =========================================================================== +# Phase 11 — curated knowledge: morpheus_api_lessons +# =========================================================================== + +def _split_lessons_sections(md: str) -> list[tuple[str, 
str]]: + sections: list[tuple[str, str]] = [] + current_title: str | None = None + current_lines: list[str] = [] + for line in md.splitlines(keepends=True): + m = re.match(r"^##\s+(.+?)\s*$", line) + if m: + if current_lines: + sections.append((current_title or "(prelude)", "".join(current_lines))) + current_title = m.group(1).strip() + current_lines = [line] + else: + current_lines.append(line) + if current_lines: + sections.append((current_title or "(prelude)", "".join(current_lines))) + return sections + + +@mcp.tool() +def morpheus_api_lessons( + topic: Annotated[str | None, Field(description="Optional keyword filter — returns only H2 sections whose heading or body contains this substring. Examples: 'manager', 'agent upgrade', 'plugin api', 'worker', 'console keyboard'. Omit for the full doc.")] = None, +) -> str: + """Curated lessons about HPE Morpheus Enterprise — non-obvious bits + that aren't in the official docs and gotchas learned from real + integration / operation work. + + **Call this proactively whenever the user asks you to:** + - install, upgrade, or troubleshoot an HVM cluster or manager + - integrate with HVM (REST API, automation, scripting) + - upgrade across versions (8.1.0 → 8.1.1 → 8.1.2) + - work with HVM Host agents + - configure backups, networking, or storage + - elevate to HPE Morpheus Enterprise + + With ``topic=...`` you'll get just the relevant H2 section(s). With + no argument you'll get the full doc — usually the right call when + starting on a new task since the TL;DR at the top primes the rest. 
+ """ + with TimedCall("morpheus_api_lessons", {"topic": topic}) as _call: + try: + md = API_LESSONS_MD.read_text() + except OSError as e: + _call.set(error=str(e)) + return f"Lessons doc not present at {API_LESSONS_MD}: {e}" + if not topic: + _call.set(returned="full") + return md + needle = topic.lower() + sections = _split_lessons_sections(md) + kept: list[str] = [] + for title, body in sections: + if needle in title.lower() or needle in body.lower(): + kept.append(body) + if not kept: + _call.set(returned="empty", topic_matched=False) + return (f"_No sections matched topic={topic!r}. Returning the full document._\n\n" + md) + _call.set(returned="filtered", sections_matched=len(kept)) + return f"_Filtered to {len(kept)} section(s) matching topic={topic!r}._\n\n" + "".join(kept) + + +# =========================================================================== +# Phase 12 — find_doc_inconsistencies + submit_doc_bug +# =========================================================================== + +_REDIRECT_PHRASE_RE = re.compile( + r"\bsee\s+(?:the\s+)?[A-Z`\[][^.!?\n]{2,80}(?:for|topic|section|chapter|guide)\b", + re.IGNORECASE, +) +_VERSION_SUFFIX_RE = re.compile(r"_(\d+_\d+_\d+)$") + + +def _bundle_family(bundle_id: str) -> str: + """Strip a trailing `_X_Y_Z` version suffix from an HVM bundle slug. + + `hvm_user_manual_8_1_0` → `hvm_user_manual` + `hvm_deployment_guide` → `hvm_deployment_guide` (no version) + + Same-family bundles are version peers; cross-family pairs (User Manual + vs Release Notes) are intentionally different content. 
+ """ + return _VERSION_SUFFIX_RE.sub("", bundle_id) + + +def _check_cross_version_drift(bundle_id: str, page_id: str, md: str, meta: dict) -> dict | None: + cluster = (meta.get("topic_cluster") or {}).get("clustered_topics") or [] + if not cluster: + return None + src_family = _bundle_family(bundle_id) + src_lines = max(1, len(md.splitlines())) + in_band: list[tuple[int, str, str, int]] = [] + out_band: list[tuple[int, str, str, int]] = [] + for peer in cluster: + peer_bid = peer.get("bundle_id") + peer_pid = peer.get("page_id") + if not (peer_bid and peer_pid) or peer_bid == bundle_id: + continue + if _bundle_family(peer_bid) != src_family: + continue + peer_data = _read_page(peer_bid, peer_pid) + if peer_data is None: + continue + peer_md, _ = peer_data + added, removed = _diff_churn(md, peer_md) + churn = added + removed + peer_lines = max(1, len(peer_md.splitlines())) + denom = max(src_lines, peer_lines) + pct = (churn * 100) // denom if denom else 0 + tup = (churn, peer_bid, peer_pid, peer_lines) + if 10 <= pct <= 60: + in_band.append(tup) + elif churn >= 5: + out_band.append(tup) + if in_band: + chosen = min(in_band, key=lambda t: t[0]) + confidence = "high" + elif out_band: + chosen = min(out_band, key=lambda t: t[0]) + confidence = "low" + else: + return None + churn, peer_bid, peer_pid, peer_lines = chosen + denom = max(src_lines, peer_lines) + churn_pct = min(100, (churn * 100) // denom) if denom else 0 + return { + "check": "cross_version_drift", + "bundle_id": bundle_id, "page_id": page_id, + "page_url": _source_url(bundle_id, page_id), + "peer_bundle_id": peer_bid, "peer_page_id": peer_pid, + "churn_lines": churn, "churn_pct_of_file": churn_pct, + "confidence": confidence, + "summary": (f"Drifts {churn} lines (~{churn_pct}% of file) vs peer " + f"{peer_bid}/{peer_pid}. 
Inspect with " + f"diff_versions({bundle_id!r}, {page_id!r}, {peer_bid!r})."), + } + + +def _check_redirect_chain(bundle_id: str, page_id: str, md: str, meta: dict) -> dict | None: + body = re.sub(r"^#[^\n]*\n", "", md, count=1).strip() + if "```" in body: + return None + text_only = re.sub(r"[`\[\]()*_>#-]", "", body) + text_only = re.sub(r"\s+", " ", text_only).strip() + if len(text_only) > 600: + return None + redirect_matches = list(_REDIRECT_PHRASE_RE.finditer(body)) + if not redirect_matches: + return None + evidence = redirect_matches[0].group(0).strip() + return { + "check": "redirect_chain", + "bundle_id": bundle_id, "page_id": page_id, + "page_url": _source_url(bundle_id, page_id), + "body_chars": len(text_only), + "redirect_phrase": evidence[:200], + "confidence": "medium", + "summary": (f"Page is {len(text_only)} chars of body text with a " + f'"see ... for ..." redirect: "{evidence[:120]}". ' + "Inspect with get_page to confirm."), + } + + +@mcp.tool() +def find_doc_inconsistencies( + scope_query: Annotated[str, Field(description="Natural-language scope describing what slice to scan. Used as a search to pick candidate pages. Examples: 'backup configuration', 'HVM cluster setup', 'VME manager installation'.")], + version: Annotated[str | None, Field(description="OPTIONAL version filter — e.g. '8.1.2'.")] = None, + platform: Annotated[str | None, Field(description="OPTIONAL platform filter (HVM bundles don't set platform — usually leave None).")] = None, + bundle_id: Annotated[str | None, Field(description="OPTIONAL specific bundle slug to restrict scanning to.")] = None, + max_pages: Annotated[int, Field(description="How many candidate pages to inspect.", ge=5, le=200)] = 30, + checks: Annotated[list[str] | None, Field(description="Which checks to run. Available: 'cross_version_drift', 'redirect_chain'. Defaults to all.")] = None, +) -> str: + """Scan a scoped set of HVM docs pages for likely documentation bugs. 
@mcp.tool()
def find_doc_inconsistencies(
    scope_query: Annotated[str, Field(description="Natural-language scope describing what slice to scan. Used as a search to pick candidate pages. Examples: 'backup configuration', 'HVM cluster setup', 'VME manager installation'.")],
    version: Annotated[str | None, Field(description="OPTIONAL version filter — e.g. '8.1.2'.")] = None,
    platform: Annotated[str | None, Field(description="OPTIONAL platform filter (HVM bundles don't set platform — usually leave None).")] = None,
    bundle_id: Annotated[str | None, Field(description="OPTIONAL specific bundle slug to restrict scanning to.")] = None,
    max_pages: Annotated[int, Field(description="How many candidate pages to inspect.", ge=5, le=200)] = 30,
    checks: Annotated[list[str] | None, Field(description="Which checks to run. Available: 'cross_version_drift', 'redirect_chain'. Defaults to all.")] = None,
) -> str:
    """Scan a scoped set of Morpheus Enterprise docs pages for likely documentation bugs.

    Surfaces concrete candidates for human review — NOT a stream of
    bugs to auto-submit. Workflow:

    1. Run this against a focused scope.
    2. Review each finding; many will be false positives.
    3. For real bugs, drill in with `get_page` / `diff_versions`.
    4. Draft a bug report; show the operator; ask explicitly.
    5. Only then call `submit_doc_bug`. One bug = one confirmation.

    **Do NOT loop submissions.** Even on "submit them all", confirm each
    one individually. HPE's docs queue is a shared resource.
    """
    with TimedCall("find_doc_inconsistencies", {
        "scope_query": scope_query, "version": version, "platform": platform,
        "bundle_id": bundle_id, "max_pages": max_pages, "checks": checks,
    }) as _call:
        # One table per concern keeps "which checks exist", "how to run one"
        # and "how to order its findings" in a single place.
        checkers = {
            "cross_version_drift": _check_cross_version_drift,
            "redirect_chain": _check_redirect_chain,
        }
        sort_keys = {
            # High-confidence drift first, then by descending churn.
            "cross_version_drift": lambda f: (-(1 if f["confidence"] == "high" else 0),
                                              -f["churn_lines"]),
            # Shortest bodies first.
            "redirect_chain": lambda f: f["body_chars"],
        }
        all_checks = set(checkers)
        requested = all_checks if checks is None else {c for c in checks if c in all_checks}
        if not requested:
            _call.set(error="no_valid_checks")
            return f"No valid checks requested. Available: {sorted(all_checks)}."
        try:
            col = _collection()
        except Exception as e:
            _call.set(error=f"collection: {e}")
            return f"Couldn't open Chroma collection: {e}"
        where = _build_where(version, platform, bundle_id)
        try:
            # Over-fetch chunks: several chunks usually collapse to one page.
            res = col.query(query_texts=[scope_query], n_results=max_pages * 3,
                            where=where, include=["metadatas"])
        except Exception as e:
            _call.set(error=f"query: {e}")
            return f"Scope query failed: {e}"

        # Collapse chunk hits to unique (bundle_id, page_id) pairs, first-seen order.
        seen: set[tuple[str, str]] = set()
        candidates: list[tuple[str, str]] = []
        batches = res.get("metadatas") or [[]]
        for meta in (batches[0] or []):
            meta = meta or {}  # tolerate chunks stored without metadata
            key = (meta.get("bundle_id") or "", meta.get("page_id") or "")
            if not key[0] or not key[1] or key in seen:
                continue
            seen.add(key)
            candidates.append(key)
            if len(candidates) >= max_pages:
                break
        _call.set(pages_inspected=len(candidates), checks=sorted(requested))
        if not candidates:
            return f"No pages matched scope `{scope_query}`."

        # Only requested checks get a bucket, so the telemetry below never
        # reports a check the caller did not ask for.
        findings: dict[str, list[dict]] = {
            name: [] for name in checkers if name in requested
        }
        for bid, pid in candidates:
            data = _read_page(bid, pid)
            if data is None:
                continue
            md, meta = data
            for name, bucket in findings.items():
                f = checkers[name](bid, pid, md, meta)
                if f:
                    bucket.append(f)
        for name, bucket in findings.items():
            bucket.sort(key=sort_keys[name])

        total = sum(len(v) for v in findings.values())
        _call.set(findings_total=total,
                  findings_by_check={k: len(v) for k, v in findings.items()})
        lines = [
            f"# Doc inconsistency scan — {len(candidates)} pages inspected", "",
            f"_Scope_: `{scope_query}` • _Filters_: version={version}, platform={platform}, bundle_id={bundle_id} • _Checks_: {sorted(requested)}", "",
            f"**{total} candidate finding{'' if total == 1 else 's'}.** Review each individually. "
            "For real bugs, follow up with `get_page` / `diff_versions`, draft the report, "
            "show the operator, and only call `submit_doc_bug` after explicit confirmation.", "",
        ]
        if not total:
            lines.append("_No findings in this scope._")
            return "\n".join(lines)
        for check in sorted(requested):
            items = findings.get(check, [])
            lines += [f"## {check} ({len(items)})", ""]
            if not items:
                lines.append("_No findings for this check._\n")
                continue
            for i, f in enumerate(items, 1):
                lines.append(f"### {i}. `{f['bundle_id']}/{f['page_id']}` *({f['confidence']} confidence)*")
                lines.append(f"- URL: {f['page_url']}")
                lines.append(f"- {f['summary']}")
                if check == "cross_version_drift":
                    lines.append(f"- Peer: `{f['peer_bundle_id']}/{f['peer_page_id']}` • churn: {f['churn_lines']} lines ({f['churn_pct_of_file']}% of file)")
                elif check == "redirect_chain":
                    lines.append(f"- Body length: {f['body_chars']} chars • Phrase: *\"{f['redirect_phrase']}\"*")
                lines.append("")
        lines += ["---",
                  "_Reminder: `submit_doc_bug` has a real side effect. Draft → show → confirm → submit, one at a time. Do not loop._"]
        return "\n".join(lines)
# --- submit_doc_bug ----------------------------------------------------------
# HPE Support DocPortal's "Was this helpful?" widget POSTs to an endpoint
# we haven't sniffed yet. Until DOC_BUG_API_URL is set AND
# DOC_BUG_SUBMIT_ENABLED=true, this tool refuses submission and tells the
# operator to paste manually. When you sniff the endpoint, set both env
# vars and verify the payload shape against the schema below.

_DOC_BUG_ALLOWED_HOSTS = {"support.hpe.com"}
_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")


@mcp.tool()
def submit_doc_bug(
    page_url: Annotated[str, Field(description="Full URL of the support.hpe.com page the bug is about. Must be a support.hpe.com URL.")],
    content: Annotated[str, Field(description="Body of the bug report. Be specific: what the page says, what's wrong, what it should say. Cite exact passages. The docs team reads it verbatim.")],
    email: Annotated[str | None, Field(description="OPTIONAL submitter email for follow-up. Omit if anonymous.")] = None,
    rating: Annotated[int | None, Field(description="OPTIONAL star rating 1-5 (1-2 for serious bugs, 3 unclear, 4-5 only on explicit request).")] = None,
    like: Annotated[bool | None, Field(description="OPTIONAL thumbs-up/down. False for bugs, True for positive feedback.")] = None,
) -> str:
    """Submit a documentation bug to HPE's docs feedback channel.

    **⚠️ THIS TOOL HAS A REAL SIDE EFFECT (when enabled). It POSTs to
    HPE's docs feedback endpoint and the submission lands in their queue.**

    **MANDATORY operator-confirmation workflow:**

    1. Draft the bug content yourself. Show the operator the exact text
       you intend to submit + the page URL + any rating/email fields.
    2. Ask explicitly: *"Submit this bug? (yes/no)"*
    3. Only call submit_doc_bug AFTER they answer yes.
    4. If they say *"submit them all"*, STILL confirm each one. This
       tool MUST NOT be called in a loop without per-bug consent.

    **Do not call this autonomously.** Don't preemptively submit while
    exploring inconsistencies. Don't call inside an agent loop without
    a human in the loop. Misuse will get this MCP blocked at HPE's WAF.

    **What makes a good bug report:**
    - Specific page URL. One bug = one page.
    - Concrete quote of the problem text + version/platform context.
    - Suggested correction when you have one.
    - Avoid editorializing — factual bugs and broken links best.
    """
    with TimedCall("submit_doc_bug", {
        "page_url": page_url, "content_len": len(content or ""),
        "email_present": bool(email), "rating": rating, "like": like,
    }) as _call:
        # --- deployment gates: refuse before touching the network ----------
        if not DOC_BUG_SUBMIT_ENABLED:
            _call.set(error="disabled", outcome="refused_disabled")
            return (
                "submit_doc_bug is disabled on this MCP deployment "
                "(DOC_BUG_SUBMIT_ENABLED is not set). The operator's draft is good — "
                f"they can paste it into the feedback widget on {page_url} themselves.\n\n"
                "_(For maintainers: sniff HPE's feedback endpoint, set DOC_BUG_API_URL "
                "to the POST target, and DOC_BUG_SUBMIT_ENABLED=true to activate.)_"
            )
        if not DOC_BUG_API_URL:
            _call.set(error="no_endpoint", outcome="refused_disabled")
            return ("submit_doc_bug is enabled but DOC_BUG_API_URL is empty. "
                    f"Operator should paste manually at {page_url}.")

        # --- input validation -----------------------------------------------
        if not content or not content.strip():
            _call.set(error="empty_content", outcome="refused_invalid")
            return "Refused: empty `content`."
        if len(content) > 10000:
            _call.set(error="content_too_long", outcome="refused_invalid")
            return f"Refused: `content` is {len(content)} chars (cap 10000)."
        try:
            from urllib.parse import urlparse
            parsed = urlparse(page_url)
        except Exception as e:
            _call.set(error=f"url_parse: {e}", outcome="refused_invalid")
            return f"Refused: couldn't parse page_url ({e})."
        if parsed.scheme not in ("http", "https"):
            _call.set(error="bad_scheme", outcome="refused_invalid")
            return f"Refused: scheme must be http(s), got {parsed.scheme!r}."
        if parsed.hostname not in _DOC_BUG_ALLOWED_HOSTS:
            _call.set(error=f"bad_host: {parsed.hostname}", outcome="refused_invalid")
            return (f"Refused: page_url host {parsed.hostname!r} isn't a "
                    f"support.hpe.com URL. submit_doc_bug only accepts bugs against HPE Support pages.")
        if email is not None and not _EMAIL_RE.match(email):
            _call.set(error="bad_email", outcome="refused_invalid")
            return f"Refused: email {email!r} doesn't look valid. Omit if anonymous."
        if rating is not None and not (1 <= rating <= 5):
            _call.set(error="bad_rating", outcome="refused_invalid")
            return f"Refused: rating must be 1-5, got {rating}."

        # Rebuild the URL from validated parts: drops userinfo, port and fragment.
        href = f"{parsed.scheme}://{parsed.hostname}{parsed.path}{('?' + parsed.query) if parsed.query else ''}"
        payload: dict = {"content": content, "href": href}
        if email:
            payload["email"] = email
        if rating is not None:
            payload["rating"] = rating
        if like is not None:
            payload["like"] = like

        try:
            import httpx
        except ImportError:
            _call.set(error="httpx_missing", outcome="refused_runtime")
            return "Refused: httpx not available."

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            # This server is morpheus-docs; the hvm-docs name was a leftover
            # from the repo this file was ported from.
            "User-Agent": "morpheus-docs-mcp submit_doc_bug",
            "Origin": "https://support.hpe.com",
            "Referer": href,
        }
        try:
            with httpx.Client(timeout=DOC_BUG_TIMEOUT) as c:
                r = c.post(DOC_BUG_API_URL, json=payload, headers=headers)
        except httpx.RequestError as e:
            _call.set(error=f"transport: {e}", outcome="failed_transport")
            return f"Submission failed (transport): {e}"

        # The POST has already happened here, so response parsing must never
        # raise: a crash would hide a submission that may have landed.
        comment_id: object = None
        try:
            resp_json = r.json()
        except ValueError:  # includes json.JSONDecodeError
            body_summary = (r.text or "")[:300]
        else:
            # Valid JSON is not necessarily an object (could be a list/string/null).
            if isinstance(resp_json, dict):
                comment_id = resp_json.get("commentId") or resp_json.get("id")
            body_summary = json.dumps(resp_json)[:300]
        _call.set(http_status=r.status_code, comment_id=comment_id,
                  outcome=("submitted" if r.is_success else "rejected_upstream"))
        if r.is_success:
            id_note = f" (commentId={comment_id})" if comment_id else ""
            return f"Submitted. HTTP {r.status_code}{id_note}. HPE docs team will see this for {href}."
        if r.status_code in (401, 403, 429):
            return (f"Submission rejected upstream (HTTP {r.status_code}). "
                    "Likely captcha/auth/rate-limit on anonymous POSTs. "
                    f"Operator can paste manually at {href}.\n\nResponse (truncated): {body_summary}")
        return f"Submission rejected upstream (HTTP {r.status_code}). Response (truncated): {body_summary}"
Response (truncated): {body_summary}" # =========================================================================== diff --git a/eval/queries.jsonl b/eval/queries.jsonl new file mode 100644 index 0000000..98c50ce --- /dev/null +++ b/eval/queries.jsonl @@ -0,0 +1,4 @@ +{"query": "what's the per-socket licensing model for Morpheus Enterprise", "expected": [{"bundle_id": "morpheus_quickspecs", "page_id": "a50009231enw"}], "tags": ["licensing", "skus"]} +{"query": "add an AWS cloud integration", "expected": [], "tags": ["cloud", "TODO-populate-after-first-scrape"]} +{"query": "Plugin API version compatibility", "expected": [], "tags": ["api", "TODO"]} +{"query": "Morpheus Enterprise 8.1.2 what's new", "expected": [{"bundle_id": "morpheus_release_notes_8_1_2", "page_id": "sd00007733en_us"}], "tags": ["release-notes"]} diff --git a/eval/retrievers.py b/eval/retrievers.py index bc06a18..872cf31 100644 --- a/eval/retrievers.py +++ b/eval/retrievers.py @@ -10,7 +10,7 @@ to one entry; the highest-ranked chunk's position wins). """ from __future__ import annotations -from typing import Protocol, Iterable +from typing import Iterable, Protocol class Retriever(Protocol): @@ -21,12 +21,17 @@ class Retriever(Protocol): ... 
-def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> list[tuple[str, str]]: - """Take a stream of (bundle_id, page_id, chunk_ordinal) and return - the first k unique pages in their first-seen order.""" +def _split_chunk_id(chunk_id: str) -> tuple[str, str, int]: + """`bundle::page::ordinal` -> (bundle, page, int(ordinal)).""" + bid, pid, ordinal = chunk_id.split("::") + return bid, pid, int(ordinal) + + +def _collapse_to_pages(chunk_ids: Iterable[str], k: int) -> list[tuple[str, str]]: seen: set[tuple[str, str]] = set() out: list[tuple[str, str]] = [] - for bid, pid, _ord in chunk_ids: + for cid in chunk_ids: + bid, pid, _ord = _split_chunk_id(cid) key = (bid, pid) if key in seen: continue @@ -37,26 +42,111 @@ def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> lis return out -# TODO Phase 2/3 — implement these once Chroma + the bm25 module are -# in place. Each one is small (15-30 LOC). The eval harness imports -# from this module by class name. -# -# class DenseRetriever: -# name = "dense" -# def __init__(self, collection): self.col = collection -# def retrieve(self, query, k=10): ... -# -# class RerankedRetriever: -# name = "dense+rerank" -# def __init__(self, collection, rerank_url, pool=200): ... -# def retrieve(self, query, k=10): ... -# -# class BM25Retriever: -# name = "bm25" -# def __init__(self, bm25_index): ... -# def retrieve(self, query, k=10): ... -# -# class HybridRetriever: -# name = "bm25+dense+rrf" -# def __init__(self, dense, bm25, k_rrf=60): ... -# def retrieve(self, query, k=10): ... 
class DenseRetriever:
    """Retriever backed by the collection's own `query` call.

    Asks the collection for `pool` chunk hits and collapses them to the
    first `k` unique pages.
    """
    name = "dense"

    def __init__(self, collection, pool: int = 50):
        self.col = collection
        self.pool = pool

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        response = self.col.query(query_texts=[query], n_results=self.pool)
        id_batches = response.get("ids") or [[]]
        # One query text was sent, so only the first batch is relevant.
        return _collapse_to_pages(id_batches[0], k)


class BM25Retriever:
    """Retriever backed by a BM25 index exposing `query(text, n=...)`.

    The index is expected to yield `(chunk_id, score)` pairs; scores are
    ignored and hits are collapsed to the first `k` unique pages.
    """
    name = "bm25"

    def __init__(self, bm25_index, pool: int = 200):
        self.bm = bm25_index
        self.pool = pool

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        scored_hits = self.bm.query(query, n=self.pool)
        chunk_ids = (chunk_id for chunk_id, _score in scored_hits)
        return _collapse_to_pages(chunk_ids, k)


class HybridRetriever:
    """Fuse dense and BM25 page rankings with Reciprocal Rank Fusion.

    Each page scores `1 / (k_rrf + rank)` per ranking it appears in
    (ranks start at 1); pages are returned by descending fused score.
    """
    name = "hybrid_rrf"

    def __init__(self, dense: DenseRetriever, bm25: BM25Retriever, k_rrf: int = 60, pool: int = 100):
        self.dense = dense
        self.bm25 = bm25
        self.k_rrf = k_rrf
        self.pool = pool

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        rankings = (
            self.dense.retrieve(query, k=self.pool),
            self.bm25.retrieve(query, k=self.pool),
        )
        fused: dict[tuple[str, str], float] = {}
        for ranking in rankings:
            for position, page in enumerate(ranking, start=1):
                fused[page] = fused.get(page, 0.0) + 1.0 / (self.k_rrf + position)
        # Stable sort: tied pages keep first-seen order (dense before BM25).
        by_score = sorted(fused, key=fused.__getitem__, reverse=True)
        return by_score[:k]
None on failure.""" + if not ids_and_texts: + return [] + import httpx + try: + with httpx.Client(timeout=timeout) as c: + r = c.post(f"{rerank_url}/v1/rerank", json={ + "query": query, + "documents": [(t or "")[:2000] for _i, t in ids_and_texts], + "top_n": len(ids_and_texts), + }) + r.raise_for_status() + results = r.json().get("results") or [] + return [ids_and_texts[item["index"]][0] for item in results + if isinstance(item.get("index"), int) + and 0 <= item["index"] < len(ids_and_texts)] + except Exception: + return None + + +class RerankedRetriever: + """Pull a candidate pool via a base retriever, then cross-encoder re-rank.""" + + def __init__(self, base: Retriever, collection, rerank_url: str, name_suffix: str = "rerank", + pool: int = 50, timeout: float = 30.0): + self.base = base + self.col = collection + self.url = rerank_url + self.name = f"{base.name}+{name_suffix}" + self.pool = pool + self.timeout = timeout + + def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]: + # Base returns deduplicated page-level tuples; rerank needs CHUNK-level + # texts to be informative. Pull each page's chunk 0 text from Chroma. 
class RerankedRetriever:
    """Pull a candidate pool via a base retriever, then cross-encoder re-rank.

    The base retriever supplies up to `pool` pages. Each page is represented
    to the reranker by the text of its chunk 0, fetched from the collection.
    If the rerank call fails, the base ordering is returned unchanged.
    """

    def __init__(self, base: Retriever, collection, rerank_url: str, name_suffix: str = "rerank",
                 pool: int = 50, timeout: float = 30.0):
        self.base = base
        self.col = collection
        self.url = rerank_url
        self.name = f"{base.name}+{name_suffix}"
        self.pool = pool
        self.timeout = timeout

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        """Return up to `k` (bundle_id, page_id) pairs in reranked order."""
        pages = self.base.retrieve(query, k=self.pool)
        if not pages:
            return []
        # Base returns deduplicated page-level tuples; rerank needs CHUNK-level
        # texts to be informative. Pull each page's chunk 0 text from Chroma.
        chunk_ids = [f"{bid}::{pid}::0" for bid, pid in pages]
        # Remember which page each id was built from, so reranked ids map
        # straight back without re-parsing the id string.
        page_by_chunk = dict(zip(chunk_ids, pages))
        g = self.col.get(ids=chunk_ids, include=["documents"])
        by_id = dict(zip(g["ids"], g["documents"] or []))
        # A page whose chunk 0 is missing (or stored as None) is sent as "".
        ids_and_texts = [(cid, by_id.get(cid) or "") for cid in chunk_ids]
        order = _rerank_pool(self.url, query, ids_and_texts, timeout=self.timeout)
        if order is None:
            return pages[:k]
        # dict preserves insertion order and drops duplicate pages.
        ranked = dict.fromkeys(
            page_by_chunk[cid] for cid in order if cid in page_by_chunk
        )
        # Keep base hits the reranker did not return, after the reranked ones,
        # instead of silently shrinking the result list.
        for page in pages:
            ranked.setdefault(page, None)
        return list(ranked)[:k]
- ) + import os + import chromadb + from chromadb.config import Settings + from rag.embeddings import embedding_function + from rag.bm25 import BM25Index + from eval.retrievers import DenseRetriever, BM25Retriever, HybridRetriever + + product = os.environ.get("PRODUCT_NAME", "hvm") + repo_root = Path(__file__).resolve().parent.parent + client = chromadb.PersistentClient(path=str(repo_root / "chroma"), + settings=Settings(anonymized_telemetry=False)) + col = client.get_collection(f"{product}_docs", embedding_function=embedding_function()) + bm = BM25Index(str(repo_root / "bm25" / f"{product}_docs.db")) + + from eval.retrievers import RerankedRetriever + + dense = DenseRetriever(col) + bm25 = BM25Retriever(bm) + hybrid = HybridRetriever(DenseRetriever(col, pool=100), BM25Retriever(bm, pool=100)) + + retrievers = [dense, bm25, hybrid] + + rerank_url = os.environ.get("RERANK_URL", "").rstrip("/") + if rerank_url: + retrievers += [ + RerankedRetriever(bm25, col, rerank_url, name_suffix="rerank", pool=50), + RerankedRetriever(hybrid, col, rerank_url, name_suffix="rerank", pool=50), + ] + print(f"reranker enabled: {rerank_url}") + + rows: dict[str, dict[str, float]] = {} + per_query: list[dict] = [] + for r in retrievers: + mrr_sum = recall_sum = ndcg_sum = 0.0 + elapsed_sum = 0.0 + for q in queries: + expected = [(e["bundle_id"], e["page_id"]) for e in q["expected"]] + t0 = time.time() + retrieved = r.retrieve(q["query"], k=max(args.k, 10)) + elapsed = time.time() - t0 + mrr = reciprocal_rank(retrieved, expected) + recall = recall_at_k(retrieved, expected, args.k) + ndcg = ndcg_at_k(retrieved, expected, args.k) + mrr_sum += mrr + recall_sum += recall + ndcg_sum += ndcg + elapsed_sum += elapsed + per_query.append({ + "retriever": r.name, "query": q["query"], + "mrr": mrr, "recall@k": recall, "ndcg@k": ndcg, + "top1": list(retrieved[0]) if retrieved else None, + "elapsed_s": round(elapsed, 3), + }) + n = len(queries) + rows[r.name] = { + "MRR": mrr_sum / n, + 
f"Recall@{args.k}": recall_sum / n, + f"nDCG@{args.k}": ndcg_sum / n, + "avg_latency_s": elapsed_sum / n, + } + print(f" {r.name}: MRR={rows[r.name]['MRR']:.3f} " + f"Recall@{args.k}={rows[r.name][f'Recall@{args.k}']:.3f} " + f"nDCG@{args.k}={rows[r.name][f'nDCG@{args.k}']:.3f} " + f"avg={rows[r.name]['avg_latency_s']*1000:.0f}ms") + + args.output.parent.mkdir(parents=True, exist_ok=True) + md = [f"# Retrieval eval — k={args.k}", "", + f"_{len(queries)} hand-curated queries, generated {time.strftime('%Y-%m-%d %H:%M:%S')}_", "", + "| Retriever | MRR | Recall@{k} | nDCG@{k} | avg latency |".replace("{k}", str(args.k)), + "| --- | ---: | ---: | ---: | ---: |"] + for name, m in rows.items(): + md.append(f"| `{name}` | {m['MRR']:.3f} | {m[f'Recall@{args.k}']:.3f} " + f"| {m[f'nDCG@{args.k}']:.3f} | {m['avg_latency_s']*1000:.0f}ms |") + md += ["", "## Per-query results", "", + "| Retriever | Query | MRR | top-1 |", "| --- | --- | ---: | --- |"] + for r in per_query: + top1 = f"`{r['top1'][0]}/{r['top1'][1][:24]}...`" if r["top1"] else "—" + md.append(f"| `{r['retriever']}` | {r['query'][:60]} | {r['mrr']:.3f} | {top1} |") + args.output.write_text("\n".join(md) + "\n") + print(f"wrote {args.output}") + return 0 if __name__ == "__main__": diff --git a/rag/chunk.py b/rag/chunk.py index b8d7317..c937c1f 100644 --- a/rag/chunk.py +++ b/rag/chunk.py @@ -31,6 +31,31 @@ from typing import Iterator CHARS_PER_TOKEN = 4 TARGET_TOKENS = 500 TARGET_CHARS = TARGET_TOKENS * CHARS_PER_TOKEN +# Hard cap: nomic-embed-text's context is 2048 tokens. Anything larger +# 400s the entire embed batch. 6000 chars works for prose but markdown +# tables with lots of `|` separators tokenize ~1.4× denser; a 5839-char +# table chunk from the HVM qualification matrix tokenized past 2048 and +# crashed the rebuild. 4000 chars stays under 2048 tokens even for +# dense table content while leaving headroom for the query side. 
+MAX_CHARS = 4000 + + +def _hard_split(text: str) -> list[str]: + """Split an oversized block on line boundaries into MAX_CHARS pieces.""" + if len(text) <= MAX_CHARS: + return [text] + out: list[str] = [] + buf: list[str] = [] + buf_chars = 0 + for line in text.splitlines(keepends=True): + if buf_chars + len(line) > MAX_CHARS and buf: + out.append("".join(buf).rstrip()) + buf, buf_chars = [], 0 + buf.append(line) + buf_chars += len(line) + if buf: + out.append("".join(buf).rstrip()) + return out def estimate_tokens(text: str) -> int: @@ -104,23 +129,26 @@ def chunks_from_page( # ----- Body chunks: pack paragraphs up to TARGET_CHARS ------- ordinal = 1 + + def emit(buf: list[str]) -> Iterator[dict]: + nonlocal ordinal + merged = "\n\n".join(buf) + for piece in _hard_split(merged): + yield { + "id": f"{metadata['bundle_id']}::{page_id}::{ordinal}", + "text": piece, + "metadata": {**metadata, "ordinal": ordinal}, + } + ordinal += 1 + buf: list[str] = [] buf_chars = 0 for p in paragraphs: if buf_chars + len(p) > TARGET_CHARS and buf: - yield { - "id": f"{metadata['bundle_id']}::{page_id}::{ordinal}", - "text": "\n\n".join(buf), - "metadata": {**metadata, "ordinal": ordinal}, - } - ordinal += 1 + yield from emit(buf) buf = [] buf_chars = 0 buf.append(p) buf_chars += len(p) if buf: - yield { - "id": f"{metadata['bundle_id']}::{page_id}::{ordinal}", - "text": "\n\n".join(buf), - "metadata": {**metadata, "ordinal": ordinal}, - } + yield from emit(buf) diff --git a/rag/embeddings.py b/rag/embeddings.py index 84d3bbd..a072f7a 100644 --- a/rag/embeddings.py +++ b/rag/embeddings.py @@ -3,8 +3,15 @@ Swappable: implement the same `embedding_function()` interface returning a Chroma `EmbeddingFunction` and the rest of the pipeline doesn't care. 
-Defaults (override via env): - OLLAMA_URL one or more comma-separated URLs (load-balanced) +Env-configurable (matches the zerto-docs-rag pattern so the same Gitea +runner + GPU-pinned Ollama containers can serve every docs MCP build): + + OLLAMA_URLS comma-separated list, load-balanced round-robin per batch. + Preferred — set in the CI workflow to fan out across two + GPU-pinned Ollama containers on the Gitea host. + OLLAMA_URL single endpoint, fallback when OLLAMA_URLS is unset. + Default http://192.168.0.2:11434 (the host where the GPUs + live in Justin's lab). EMBED_MODEL model name; default 'nomic-embed-text' EMBED_DIM expected embedding dim; default 768 (nomic-embed-text) """ @@ -19,8 +26,18 @@ from chromadb import EmbeddingFunction, Documents, Embeddings log = logging.getLogger(__name__) -OLLAMA_URLS = [u.strip() for u in os.environ.get("OLLAMA_URL", - "http://localhost:11434").split(",") if u.strip()] +DEFAULT_OLLAMA_URL = "http://192.168.0.2:11434" + + +def _resolve_urls() -> list[str]: + raw = os.environ.get("OLLAMA_URLS", "").strip() + if raw: + return [u.strip().rstrip("/") for u in raw.split(",") if u.strip()] + single = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL).strip().rstrip("/") + return [single] + + +OLLAMA_URLS = _resolve_urls() EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text") EMBED_DIM = int(os.environ.get("EMBED_DIM", "768")) diff --git a/rag/index.py b/rag/index.py index 8d1c74f..f9b5ce2 100644 --- a/rag/index.py +++ b/rag/index.py @@ -29,7 +29,7 @@ CHROMA_DIR = ROOT / "chroma" # Collection name — convention: _docs. Override via env if needed. 
import os -PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct") +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "morpheus") COLLECTION = f"{PRODUCT_NAME}_docs" diff --git a/requirements-rerank.txt b/requirements-rerank.txt new file mode 100644 index 0000000..2c5fc27 --- /dev/null +++ b/requirements-rerank.txt @@ -0,0 +1,10 @@ +# Dev/CPU reranker — only for running scripts/rerank_server.py locally. +# Production uses the llama.cpp + jina-reranker GGUF sidecar (see +# deploy/docker-compose.yml). Install with: +# +# pip install -r requirements-rerank.txt +# +# This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder +# (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main +# requirements.txt so the production image stays slim. +sentence-transformers>=3.0 diff --git a/requirements.txt b/requirements.txt index b9982a9..431949b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,10 +10,18 @@ ollama>=0.4.0 # if using Ollama-hosted embedder; swap if not # Scraping (Phase 1; adjust per product) beautifulsoup4>=4.12 requests>=2.31 +curl_cffi>=0.7 # for HPE QuickSpecs scrape (Chrome TLS impersonation) +markdownify>=0.11 # playwright>=1.40 # uncomment if you need headless browser fallback # Evaluation numpy>=1.26 +# Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server +# only needs httpx (declared above) to call it. For the dev / CPU +# fallback reranker (scripts/rerank_server.py), install +# requirements-rerank.txt separately — it pulls in PyTorch which would +# triple the production image size. + # Dev / utility python-dateutil>=2.8 diff --git a/scrape/README.md b/scrape/README.md index 44d6df3..e9f6c95 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -7,6 +7,72 @@ the upstream doc portal. See `PLAN.md` Phase 1 for the corpus layout the rest of the pipeline expects. +--- + +## Product context — HPE Morpheus Enterprise Software + +**This repo is for HPE Morpheus Enterprise**, the full cloud-management +platform. 
It is a **different SKU** from HPE Morpheus VM Essentials +(HVM), which has its own MCP at `../hvm-docs/`. Don't ingest HVM +docs here; they're a separate, smaller product (the "VM-only" subset +of Morpheus). The Morpheus VM Essentials Deployment Guide refers to +Morpheus Enterprise as the "elevate to" target — that's the +relationship. + +`PRODUCT_NAME=morpheus`. Tool will be named `morpheus_api_lessons`, +collection `morpheus_docs`, etc. + +### Upstream portal + +HPE Support DocPortal (Tridion/SDL-derived, same surface as HVM and +the Zerto docs). Anonymous JSON API, no auth required. + +| Endpoint | Returns | +|---|---| +| `GET https://support.hpe.com/hpesc/public/api/document/{docId}` | DITA-source HTML — title page / abstract OR (for short docs like Release Notes) the entire body | +| `GET https://support.hpe.com/hpesc/public/api/document/{docId}/toc` | Nested JSON tree of `{topicName, topicLink, description, children}`. Empty/404 for single-doc Release Notes. | +| `GET https://support.hpe.com/hpesc/public/api/document/{docId}/render?page=GUID-XXXX.html` | `{docId, page_html, doc_meta, page_meta}` — single page body | + +User-facing URL format: +`https://support.hpe.com/hpesc/public/docDisplay?docId={docId}&page=GUID-XXXX.html` + +### Bundle IDs (confirmed 2026-05-22) + +**Morpheus Enterprise User Manual** — ~569 pages each, full nested TOC: + +| Version | docId | +|---|---| +| 8.1.0 | `sd00007510en_us` | +| 8.1.1 | `sd00007621en_us` | +| 8.1.2 | `sd00007732en_us` | + +**Morpheus Enterprise Release Notes** — short, single-doc-blob shape +(no TOC; full body returned by the `/document/{docId}` endpoint +itself; scraper needs a `--single-doc` mode for these): + +| Version | docId | +|---|---| +| 8.1.0 | `sd00007496en_us` | +| 8.1.1 | `sd00007610en_us` | +| 8.1.2 | `sd00007733en_us` | + +### Cross-version peers are free + +GUIDs are stable across versions (confirmed on HVM where 374/376/376 +pages had 100% GUID overlap between adjacent versions). 
Same-GUID = +same-topic. Synthesize `topic_cluster.clustered_topics` by looking +up the same GUID in the other bundle slugs — no fuzzy matching +needed. + +### Reusable from hvm-docs + +`../hvm-docs/scrape/bundles.py` and `../hvm-docs/scrape/runner.py` +solve the identical portal shape. Copy and adapt the BUNDLES list + +PRODUCT_NAME; the fetch logic should drop in unchanged. Both the +TOC-paginated path and the single-doc path are needed (the HVM +build covers both because HVM Release Notes follow the same shape). + + ## What you write At minimum, two scripts: diff --git a/scrape/bundles.py b/scrape/bundles.py new file mode 100644 index 0000000..3c3ab87 --- /dev/null +++ b/scrape/bundles.py @@ -0,0 +1,200 @@ +"""Discover Morpheus Enterprise doc bundles on HPE Support DocPortal and write bundles.json. + +Mirrors hvm-docs/scrape/bundles.py — same portal, same API shape, same single-doc-blob +treatment for Release Notes, but pointing at the Morpheus Enterprise docId range. + +For each bundle this script: + 1. GETs /hpesc/public/api/document/{docId} → abstract HTML + 2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc) + 3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents. + +QuickSpecs is a special case: lives at www.hpe.com (not support.hpe.com), gets the +html-file mode and is scraped via curl_cffi (see scrape/quickspecs.py). 
@dataclass
class BundleSpec:
    """Declaration of one documentation bundle to discover.

    `discover_bundle` turns each spec into one entry of bundles.json,
    copying most fields through unchanged.
    """
    # Bundle identifier written to bundles.json as "slug".
    slug: str
    # Portal document id; interpolated into the API and docDisplay URLs.
    doc_id: str
    # Fallback title, used when the fetched abstract has no title.
    title: str
    version: str | None
    product: str # e.g. "User Manual", "Release Notes", "QuickSpecs"
    mode: str # "toc", "single", or "html-file"
    platform: str | None = None
    language: str = "en-US"
    source_url: str | None = None # overrides the default support.hpe.com URL
+BUNDLES: list[BundleSpec] = [ + BundleSpec("morpheus_user_manual_8_1_0", "sd00007510en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.0", "User Manual", "toc"), + BundleSpec("morpheus_user_manual_8_1_1", "sd00007621en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.1", "User Manual", "toc"), + BundleSpec("morpheus_user_manual_8_1_2", "sd00007732en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.2", "User Manual", "toc"), + BundleSpec("morpheus_release_notes_8_1_0", "sd00007496en_us", "HPE Morpheus Enterprise Software Release Notes", "8.1.0", "Release Notes", "single"), + BundleSpec("morpheus_release_notes_8_1_1", "sd00007610en_us", "HPE Morpheus Enterprise Software Release Notes", "8.1.1", "Release Notes", "single"), + BundleSpec("morpheus_release_notes_8_1_2", "sd00007733en_us", "HPE Morpheus Enterprise Software Release Notes", "8.1.2", "Release Notes", "single"), + BundleSpec("morpheus_quickspecs", "a50009231enw", "HPE Morpheus Enterprise Software QuickSpecs", + "v1", "QuickSpecs", "html-file", + source_url="https://www.hpe.com/psnow/doc/a50009231enw"), +] + + +def _session() -> requests.Session: + s = requests.Session() + s.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"}) + return s + + +def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any: + delay = 1.0 + for attempt in range(retries): + r = s.get(url, timeout=30) + if r.status_code == 200: + return r.json() if expect_json else r.text + if r.status_code == 404: + return None + if r.status_code in (429, 500, 502, 503, 504): + time.sleep(delay) + delay *= 2 + continue + r.raise_for_status() + raise RuntimeError(f"GET failed after {retries} retries: {url}") + + +def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]: + if not toc: + return 0, None + landing = None + n = 0 + + def walk(nodes: list[dict] | None, depth: int) -> None: + nonlocal n, landing + for node in nodes or []: + link = 
node.get("topicLink") + if link: + n += 1 + m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link) + if m and landing is None: + landing = m.group(1) + walk(node.get("children"), depth + 1) + + walk(toc, 0) + return n, landing + + +def _parse_abstract(html: str) -> dict[str, str]: + soup = BeautifulSoup(html, "html.parser") + out: dict[str, str] = {} + h1 = soup.select_one("h1.title.topictitle1") + if h1: + out["title"] = h1.get_text(" ", strip=True) + desc = soup.select_one("div.desc") + if desc: + out["abstract"] = desc.get_text(" ", strip=True) + pub = soup.select_one("div.publishedDate") + if pub: + out["published"] = pub.get_text(" ", strip=True).replace("Published:", "").strip() + return out + + +def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]: + # html-file bundles are static fixtures or live-fetched outside support.hpe.com. + if spec.mode == "html-file": + return { + "slug": spec.slug, + "doc_id": spec.doc_id, + "title": spec.title, + "version": spec.version, + "platform": spec.platform, + "product": spec.product, + "language": spec.language, + "page_count": 1, + "mode": "html-file", + "abstract": "", + "dates": {}, + "landing_page": spec.doc_id, + "source_url": spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}", + } + + abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False) + meta = _parse_abstract(abstract_html or "") + + page_count: int + landing: str | None + if spec.mode == "toc": + toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True) + page_count, landing = _count_toc(toc) + if page_count == 0: + print(f" ! 
{spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr) + spec.mode = "single" + page_count, landing = 1, spec.doc_id + else: + page_count, landing = 1, spec.doc_id + + return { + "slug": spec.slug, + "doc_id": spec.doc_id, + "title": meta.get("title") or spec.title, + "version": spec.version, + "platform": spec.platform, + "product": spec.product, + "language": spec.language, + "page_count": page_count, + "mode": spec.mode, + "abstract": meta.get("abstract", ""), + "dates": {"Published": meta.get("published", "")}, + "landing_page": landing, + "source_url": spec.source_url or DOC_URL.format(doc_id=spec.doc_id), + } + + +def main() -> int: + p = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.") + p.add_argument("--out", default=str(BUNDLES_JSON)) + args = p.parse_args() + + s = _session() + out: list[dict[str, Any]] = [] + for spec in BUNDLES: + print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr) + out.append(discover_bundle(s, spec)) + + Path(args.out).write_text(json.dumps(out, indent=2) + "\n") + print(f"wrote {args.out}: {len(out)} bundles, {sum(b['page_count'] for b in out)} pages total", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scrape/quickspecs.py b/scrape/quickspecs.py new file mode 100644 index 0000000..12b72d6 --- /dev/null +++ b/scrape/quickspecs.py @@ -0,0 +1,194 @@ +"""Scrape HPE QuickSpecs collateral pages into corpus markdown. + +HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral..html` +with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the +captured DOM). The blocker for automated scraping is `www.hpe.com`'s +edge bot defense, which drops connections from non-browser TLS +fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here +by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint. 
+ +Content extraction uses these stable CSS selectors found in the page: + + .lr-right-rail hpe-highlights-container .collateral-content + — one per section ("Overview", "Standard Features", etc.) + h3.txto-title — section title + div.txto-description — section body + uc-table.uc-table-polaris — SKU / version-history tables + +A committed HTML fixture at `scrape/quickspecs/.html` is used +as a fallback when the live fetch fails (HPE edge churn, network +issues). Keeping a current fixture in the repo also makes diffing +QuickSpecs revisions easy. + +Usage (called by scrape.runner for bundles with mode="quickspecs"): + + python -m scrape.quickspecs a50004260enw + +Or programmatically: + + from scrape.quickspecs import scrape_quickspecs + scrape_quickspecs("a50004260enw", bundle_id="hvm_quickspecs", title="...") +""" +from __future__ import annotations + +import argparse +import json +import logging +import sys +from pathlib import Path + +from bs4 import BeautifulSoup, NavigableString +from markdownify import markdownify as md + +log = logging.getLogger(__name__) + +ROOT = Path(__file__).resolve().parent.parent +SOURCE_DIR = ROOT / "scrape" / "quickspecs" +CORPUS_DIR = ROOT / "corpus" + +COLLATERAL_URL = "https://www.hpe.com/us/en/collaterals/collateral.{doc_id}.html" + + +def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None: + """GET the collateral page via curl_cffi (Chrome 120 TLS fingerprint). 
+ Returns the HTML body on success, None on any failure.""" + try: + from curl_cffi import requests as cc + except ImportError: + log.warning("curl_cffi not installed; can't fetch QuickSpecs live") + return None + try: + r = cc.get(COLLATERAL_URL.format(doc_id=doc_id), + impersonate="chrome120", timeout=timeout) + if r.status_code != 200 or not r.text: + log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, r.status_code, len(r.text or "")) + return None + return r.text + except Exception as e: + log.warning("QuickSpecs %s live fetch failed: %s", doc_id, e) + return None + + +def fetch_fixture(doc_id: str) -> str | None: + """Read the committed HTML fixture as fallback.""" + p = SOURCE_DIR / f"{doc_id}.html" + if not p.exists(): + return None + return p.read_text() + + +def _extract_content_blocks(html: str) -> list[str]: + """Pull each section block (.collateral-content under .lr-right-rail). + + The fixture format (just .quickspecs-content wrapper) and the live + format (.lr-right-rail with nested hpe-highlights-container) are + both supported. Returns a list of section HTML strings, in document + order. + """ + soup = BeautifulSoup(html, "html.parser") + # Live format: each under .lr-right-rail has + # one or more .collateral-content blocks; concat them. + rail = soup.select_one(".lr-right-rail") + if rail is not None: + blocks = rail.select(".collateral-content") + return [str(b) for b in blocks] + # Fixture format: a single wrapper holding all the H2/H3 sections. + wrapper = soup.select_one(".quickspecs-content") + if wrapper is not None: + return [str(wrapper)] + # Last-resort: whole body. + body = soup.body or soup + return [str(body)] + + +def parse_html(html: str) -> str: + """Convert QuickSpecs HTML to clean markdown. 
+ + Filters out the page chrome (nav, footer, recommendations carousel, + cookie banner, analytics blobs) by extracting only the content + blocks, then runs markdownify.""" + blocks = _extract_content_blocks(html) + chunks: list[str] = [] + for block in blocks: + soup = BeautifulSoup(block, "html.parser") + # Drop anchor placeholders that markdownify turns into noisy links + for a in soup.select('[hpe-left-rail-anchor]'): + a.decompose() + # Drop carousel / share / recommendation widgets if any leaked in. + for sel in ("esl-share", "hpe-recommendations", "hpe-sticky-bar", + "esl-scrollbar", "esl-trigger", "video-overlay", + "generic-modal-loader", "style", "script"): + for el in soup.select(sel): + el.decompose() + chunks.append(md(str(soup), heading_style="ATX", bullets="-", + strip=["span", "div"])) + text = "\n\n".join(chunks) + # Collapse runs of blank lines markdownify likes to emit. + text = "\n".join(line.rstrip() for line in text.splitlines()) + while "\n\n\n" in text: + text = text.replace("\n\n\n", "\n\n") + return text.strip() + "\n" + + +def scrape_quickspecs(doc_id: str, bundle_id: str, title: str, + version: str | None = None, + product: str = "QuickSpecs", + source_url: str | None = None, + force: bool = False) -> bool: + """Live-fetch (or fall back to fixture), parse, write corpus files. 
+ + Returns True if files were written, False if skipped (already exists + and --force not set).""" + bundle_dir = CORPUS_DIR / bundle_id + md_path = bundle_dir / f"{doc_id}.md" + json_path = bundle_dir / f"{doc_id}.json" + if not force and md_path.exists() and json_path.exists(): + log.info(" %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id) + return False + + html = fetch_live(doc_id) + fetched_from = "live" + if html is None: + html = fetch_fixture(doc_id) + fetched_from = "fixture" + if html is None: + log.error("QuickSpecs %s: no live response and no fixture at %s", + doc_id, SOURCE_DIR / f"{doc_id}.html") + return False + + body_md = parse_html(html) + bundle_dir.mkdir(parents=True, exist_ok=True) + md_path.write_text(body_md) + sidecar = { + "bundle_id": bundle_id, + "page_id": doc_id, + "title": title, + "ordinal": 1, + "parent_title": None, + "doc_id": doc_id, + "version": version, + "product": product, + "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}", + "fetched_from": fetched_from, + } + json_path.write_text(json.dumps(sidecar, indent=2) + "\n") + log.info(" %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from) + return True + + +def main() -> int: + logging.basicConfig(level=logging.INFO, format="%(message)s") + p = argparse.ArgumentParser() + p.add_argument("doc_id", help="QuickSpecs document id, e.g. 
a50004260enw") + p.add_argument("--bundle-id", default="hvm_quickspecs") + p.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs") + p.add_argument("--version", default=None) + p.add_argument("--force", action="store_true") + args = p.parse_args() + ok = scrape_quickspecs(args.doc_id, args.bundle_id, args.title, + args.version, force=args.force) + return 0 if ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scrape/quickspecs/README.md b/scrape/quickspecs/README.md new file mode 100644 index 0000000..e301411 --- /dev/null +++ b/scrape/quickspecs/README.md @@ -0,0 +1,27 @@ +# scrape/quickspecs/ + +Static HTML fixtures for HPE QuickSpecs documents that aren't reachable +from the runner (www.hpe.com edge drops connections from datacenter IPs +with non-browser User-Agents — verified 2026-05-22 with curl, wget, and +Anthropic's WebFetch). + +## Workflow + +1. Operator visits `https://www.hpe.com/psnow/doc/<doc_id>` in a real + browser, opens DevTools → Elements → Copy the `` HTML. +2. Save it at `scrape/quickspecs/<doc_id>.html`. +3. Add a bundle entry in `scrape/bundles.py` with `mode="html-file"`. +4. `python -m scrape.runner --bundle hvm_quickspecs --force` reads the + committed HTML and writes `corpus/hvm_quickspecs/<doc_id>.{md,json}`. +5. Re-index and ship. + +QuickSpecs only update every few months (HPE rebrand, new SKU added, +feature change). When a new version drops, refresh the local HTML +file and re-run the scrape. + +## Current fixtures + +- `a50004260enw.html` — HPE Morpheus VM Essentials Software QuickSpecs + (Version 4, 02-February-2026). SKUs: S5Q81AAE (1-yr), S5Q82AAE + (3-yr), S5Q83AAE (5-yr) — all "per Socket E-LTU" with Tech Care + Essentials included. diff --git a/scrape/runner.py b/scrape/runner.py new file mode 100644 index 0000000..c83f4b2 --- /dev/null +++ b/scrape/runner.py @@ -0,0 +1,339 @@ +"""Scrape HVM doc bundles into corpus/<slug>/<page_id>.{md,json}. 
+ +Reads bundles.json (produced by scrape.bundles), then for each bundle: + - mode="toc": walks the TOC tree, fetches each page via the render + endpoint, converts page_html to markdown, writes + .md + .json sidecar. + - mode="single": fetches /document/{docId} directly, treats the whole + body as one page with page_id = doc_id. + +After all bundles are on disk, runs a finalize pass that synthesizes +topic_cluster.clustered_topics for each page by looking up the same +GUID in sibling bundles (HPE GUIDs are stable across versions — see +reference_hpe_docs_portal_api.md). + +Usage: + python -m scrape.runner --all + python -m scrape.runner --bundle hvm_user_manual_8_1_2 + python -m scrape.runner --all --force # re-download already-on-disk pages + python -m scrape.runner --finalize-only # only redo the topic_cluster pass +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup +from markdownify import markdownify as md + +API = "https://support.hpe.com/hpesc/public/api/document" +DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html" +DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}" +UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)" +ROOT = Path(__file__).resolve().parent.parent +CORPUS = ROOT / "corpus" +BUNDLES_JSON = ROOT / "bundles.json" + +GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html") + + +@dataclass +class TocEntry: + page_id: str + title: str + ordinal: int + parent_title: str | None + + +def _session() -> requests.Session: + s = requests.Session() + s.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"}) + return s + + +def _get(s: requests.Session, url: str, expect_json: bool = 
False, retries: int = 4) -> Any: + delay = 1.0 + for attempt in range(retries): + r = s.get(url, timeout=30) + if r.status_code == 200: + return r.json() if expect_json else r.text + if r.status_code == 404: + return None + if r.status_code in (429, 500, 502, 503, 504): + time.sleep(delay) + delay *= 2 + continue + r.raise_for_status() + raise RuntimeError(f"GET failed after {retries} retries: {url}") + + +def _flatten_toc(toc: list[dict]) -> list[TocEntry]: + out: list[TocEntry] = [] + ordinal = 0 + + def walk(nodes: list[dict] | None, parent_title: str | None) -> None: + nonlocal ordinal + for node in nodes or []: + title = node.get("topicName") or "" + link = node.get("topicLink") or "" + m = GUID_RE.search(link) + if m: + ordinal += 1 + out.append(TocEntry(page_id=m.group(1), title=title, ordinal=ordinal, parent_title=parent_title)) + walk(node.get("children"), title or parent_title) + + walk(toc, None) + return out + + +def _strip_dita_wrappers(html: str) -> str: + """Remove the outer
<main>, drop the trademark Notices section, + and unwrap aria-only span markup so markdownify produces clean text. + + DITA's notices boilerplate repeats across every doc; if we leave it in, + every page chunk inherits the same trademark text and pollutes retrieval.""" + soup = BeautifulSoup(html, "html.parser") + # Drop the Notices/Acknowledgments/Abstract boilerplate by section heading. + # Every doc on the portal carries the same legal Notices and trademark + # Acknowledgments; if we leave them in, every chunk inherits the same + # text and pollutes retrieval. Abstract is one-line marketing. + boilerplate = {"Notices", "Acknowledgments", "Abstract"} + # Wrapped form:
<article>/<section>/<div>
whose first heading child is boilerplate. + for sec in soup.select("article, section, div"): + h = sec.find(["h1", "h2"], recursive=False) + if h and h.get_text(strip=True) in boilerplate: + sec.decompose() + # Unwrapped form: bare

<h1>/<h2>Boilerplate</h2>

followed by its .desc/.body sibling. + for h in soup.find_all(["h1", "h2"]): + if h.get_text(strip=True) in boilerplate: + sib = h.find_next_sibling() + if sib and (sib.name in {"div", "section"}): + cls = " ".join(sib.get("class", []) or []) + if "desc" in cls or "body" in cls or "notices" in cls: + sib.decompose() + h.decompose() + main = soup.find("main") + return str(main) if main else str(soup) + + +def html_to_md(page_html: str) -> str: + cleaned = _strip_dita_wrappers(page_html) + text = md(cleaned, heading_style="ATX", bullets="-") + # collapse runs of blank lines + text = re.sub(r"\n{3,}", "\n\n", text).strip() + return text + "\n" + + +def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str: + payload = _get(s, f"{API}/{doc_id}/render?page={page_id}.html", expect_json=True) + if not payload: + return "" + return payload.get("page_html") or "" + + +def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]: + """Returns (page_html, title) for a single-doc-shape bundle.""" + html = _get(s, f"{API}/{doc_id}") + if not html: + return "", "" + soup = BeautifulSoup(html, "html.parser") + h1 = soup.select_one("h1.title.topictitle1") + title = h1.get_text(" ", strip=True) if h1 else doc_id + return html, title + + +def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool: + bundle_dir.mkdir(parents=True, exist_ok=True) + md_path = bundle_dir / f"{page_id}.md" + json_path = bundle_dir / f"{page_id}.json" + if not force and md_path.exists() and json_path.exists(): + return False + md_path.write_text(body_md) + json_path.write_text(json.dumps(sidecar, indent=2) + "\n") + return True + + +def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int: + doc_id = bundle["doc_id"] + slug = bundle["slug"] + bundle_dir = CORPUS / slug + + toc = _get(s, f"{API}/{doc_id}/toc", expect_json=True) or [] + entries = _flatten_toc(toc) + print(f" {slug}: 
{len(entries)} pages", file=sys.stderr) + + written = 0 + def do_one(entry: TocEntry) -> bool: + page_html = fetch_toc_page(s, doc_id, entry.page_id) + if not page_html: + return False + body_md = html_to_md(page_html) + sidecar = { + "bundle_id": slug, + "page_id": entry.page_id, + "title": entry.title, + "ordinal": entry.ordinal, + "parent_title": entry.parent_title, + "doc_id": doc_id, + "version": bundle.get("version"), + "product": bundle.get("product"), + "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id), + # topic_cluster filled in by finalize() + } + return write_page(bundle_dir, entry.page_id, body_md, sidecar, force) + + with ThreadPoolExecutor(max_workers=concurrency) as pool: + for fut in as_completed(pool.submit(do_one, e) for e in entries): + if fut.result(): + written += 1 + return written + + +def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int: + doc_id = bundle["doc_id"] + slug = bundle["slug"] + bundle_dir = CORPUS / slug + + html, title = fetch_single_doc(s, doc_id) + if not html: + print(f" ! {slug}: empty body", file=sys.stderr) + return 0 + body_md = html_to_md(html) + sidecar = { + "bundle_id": slug, + "page_id": doc_id, + "title": title or bundle["title"], + "ordinal": 1, + "parent_title": None, + "doc_id": doc_id, + "version": bundle.get("version"), + "product": bundle.get("product"), + "source_url": DOC_URL_SINGLE.format(doc_id=doc_id), + } + print(f" {slug}: 1 page (single-doc)", file=sys.stderr) + return 1 if write_page(bundle_dir, doc_id, body_md, sidecar, force) else 0 + + +def finalize_clusters(bundles: list[dict]) -> int: + """Cross-link sibling pages with the same GUID across version bundles. + + For TOC bundles, page_id == GUID; same GUID across two bundles = same + underlying topic. 
For single-doc bundles (page_id == doc_id), peer them + by matching product+version-sibling on the `product` field.""" + # GUID → list[(slug, sidecar_path, sidecar_dict)] + guid_to_pages: dict[str, list[tuple[str, Path, dict]]] = {} + # product → list[(slug, sidecar_path, sidecar_dict)] for single-doc peering + product_to_pages: dict[str, list[tuple[str, Path, dict]]] = {} + + for b in bundles: + slug = b["slug"] + bundle_dir = CORPUS / slug + if not bundle_dir.exists(): + continue + for jp in bundle_dir.glob("*.json"): + data = json.loads(jp.read_text()) + pid = data["page_id"] + if pid.startswith("GUID-"): + guid_to_pages.setdefault(pid, []).append((slug, jp, data)) + else: + product_to_pages.setdefault(b["product"], []).append((slug, jp, data)) + + updated = 0 + # TOC pages — cluster by GUID + for guid, peers in guid_to_pages.items(): + if len(peers) < 2: + continue + for slug, jp, data in peers: + others = [ + {"bundle_id": s2, "page_id": guid, "clustering_title": d2.get("title", "")} + for s2, _, d2 in peers if s2 != slug + ] + data["topic_cluster"] = {"clustering_title": data.get("title", ""), "clustered_topics": others} + jp.write_text(json.dumps(data, indent=2) + "\n") + updated += 1 + # Single-doc pages — cluster by product (e.g. 
Release Notes 8.1.0/.1/.2) + for product, peers in product_to_pages.items(): + if len(peers) < 2: + continue + for slug, jp, data in peers: + others = [ + {"bundle_id": s2, "page_id": d2["page_id"], "clustering_title": d2.get("title", "")} + for s2, _, d2 in peers if s2 != slug + ] + data["topic_cluster"] = {"clustering_title": data.get("title", ""), "clustered_topics": others} + jp.write_text(json.dumps(data, indent=2) + "\n") + updated += 1 + + return updated + + +def main() -> int: + p = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.") + p.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json") + p.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)") + p.add_argument("--force", action="store_true", help="re-fetch pages already on disk") + p.add_argument("--concurrency", type=int, default=6) + p.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields") + args = p.parse_args() + + if not BUNDLES_JSON.exists(): + print(f"bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr) + return 2 + + bundles = json.loads(BUNDLES_JSON.read_text()) + + if args.finalize_only: + n = finalize_clusters(bundles) + print(f"finalize: updated topic_cluster on {n} sidecars", file=sys.stderr) + return 0 + + if args.bundle: + bundles = [b for b in bundles if b["slug"] in args.bundle] + if not bundles: + print(f"no bundles matched: {args.bundle}", file=sys.stderr) + return 2 + elif not args.all: + print("specify --all or --bundle ", file=sys.stderr) + return 2 + + s = _session() + total = 0 + for b in bundles: + mode = b.get("mode") + if mode == "single": + total += scrape_single_bundle(s, b, args.force) + elif mode == "html-file": + # Live-scrape HPE collateral (QuickSpecs) via curl_cffi; falls back + # to scrape/quickspecs/.html fixture if the edge blocks us. 
+ from scrape.quickspecs import scrape_quickspecs + ok = scrape_quickspecs( + doc_id=b["doc_id"], bundle_id=b["slug"], + title=b.get("title", b["doc_id"]), + version=b.get("version"), + product=b.get("product", "QuickSpecs"), + source_url=b.get("source_url"), + force=args.force, + ) + total += 1 if ok else 0 + else: + total += scrape_toc_bundle(s, b, args.force, args.concurrency) + print(f"scraped {total} new/updated pages", file=sys.stderr) + + # Always finalize after a scrape so sidecars are consistent. + all_bundles = json.loads(BUNDLES_JSON.read_text()) + n = finalize_clusters(all_bundles) + print(f"finalize: updated topic_cluster on {n} sidecars", file=sys.stderr) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/registry_gc.py b/scripts/registry_gc.py index 41bbc52..0e900ca 100644 --- a/scripts/registry_gc.py +++ b/scripts/registry_gc.py @@ -1,42 +1,58 @@ """Gitea container-registry garbage collection. -Lists package versions for one container package and deletes versions -older than --keep-days. Always preserves: +Lists tagged versions of one container package and deletes old ones. +Always preserves: - - the :latest tag - - the --keep-latest most-recent date-tagged versions - - anything pushed in the last --keep-days days + - the `latest` tag (Watchtower's auto-deploy target) + - the `--keep-latest` most-recent date-tagged versions (YYYY.MM.DD) + - the `--keep-latest` most-recent short-SHA tags (rollback pins) + - anything pushed within `--keep-days` days -The actual disk reclaim happens on Gitea's next package GC cron (admin -site settings). This script just marks the versions for deletion. +OCI blob-level versions (`sha256:...`) are never touched directly — those +are managed by Gitea's internal package GC cron when their last tag +goes away. 
Usage: - python scripts/registry_gc.py \\ - --owner \\ - --package -docs-mcp \\ + GITEA_TOKEN=... python scripts/registry_gc.py \\ + --owner justin \\ + --package hvm-docs \\ --keep-days 90 \\ --keep-latest 5 -Auth: reads GITEA_TOKEN from env (set in the workflow as a secret). +The Gitea endpoint shape (confirmed 2026-05-22 against git.jpaul.io): + + GET /api/v1/packages/{owner}/container/{package} + -> [{id, version, created_at, ...}, ...] + DELETE /api/v1/packages/{owner}/container/{package}/{version} """ from __future__ import annotations import argparse +import json import os +import re import sys from datetime import datetime, timedelta, timezone -from urllib.request import Request, urlopen from urllib.error import HTTPError -import json - +from urllib.parse import quote +from urllib.request import Request, urlopen GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io") +DATE_TAG = re.compile(r"^\d{4}\.\d{2}\.\d{2}$") +SHA_TAG = re.compile(r"^[0-9a-f]{7,40}$") # short or full git SHA +BLOB_VER = re.compile(r"^sha256:") # OCI blob versions — skip def api(token: str, method: str, path: str) -> object: + # Explicit User-Agent: git.jpaul.io is behind Cloudflare, whose default + # Bot Fight Mode 403s `Python-urllib/X.Y` with error 1010. Any + # recognizable browser/curl-style UA passes. 
req = Request(f"{GITEA_HOST}{path}", - headers={"Authorization": f"token {token}"}, + headers={ + "Authorization": f"token {token}", + "User-Agent": "hvm-docs-registry-gc/1.0", + }, method=method) try: with urlopen(req, timeout=30) as r: @@ -63,44 +79,57 @@ def main() -> int: return 1 versions = api(token, "GET", - f"/api/v1/packages/{args.owner}/container/{args.package}/versions") or [] + f"/api/v1/packages/{args.owner}/container/{args.package}") or [] if not versions: - print(f"no versions found for {args.owner}/{args.package}") + print(f"no versions found for {args.owner}/container/{args.package}") return 0 cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days) + print(f" {len(versions)} version(s); cutoff={cutoff.isoformat()} " + f"keep_days={args.keep_days} keep_latest={args.keep_latest}") - # Date-tagged versions (YYYY.MM.DD), newest first - date_tagged = [] - for v in versions: - tags = v.get("tags") or [] - for t in tags: - if len(t) == 10 and t[4] == "." and t[7] == ".": - date_tagged.append((t, v)) - break - date_tagged.sort(key=lambda kv: kv[0], reverse=True) - keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]} - - deleted = 0 - for v in versions: - tags = v.get("tags") or [] - if "latest" in tags: - continue - if any(t in keep_date_tags for t in tags): - continue + # Sort newest first by created_at. + def parsed_ts(v: dict) -> datetime: try: - created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) + return datetime.fromisoformat(v["created_at"].replace("Z", "+00:00")) except (KeyError, ValueError): + return datetime.min.replace(tzinfo=timezone.utc) + + versions.sort(key=parsed_ts, reverse=True) + + # Compute the keep-set: top-N date tags + top-N sha tags + always latest. 
+ keep_dates: list[str] = [] + keep_shas: list[str] = [] + for v in versions: + ver = v.get("version") or "" + if DATE_TAG.match(ver) and len(keep_dates) < args.keep_latest: + keep_dates.append(ver) + elif SHA_TAG.match(ver) and len(keep_shas) < args.keep_latest: + keep_shas.append(ver) + keep = {"latest", *keep_dates, *keep_shas} + print(f" keep tags: {sorted(keep)}") + + deleted = skipped_blob = skipped_age = skipped_keep = 0 + for v in versions: + ver = v.get("version") or "" + ts = parsed_ts(v) + if BLOB_VER.match(ver): + skipped_blob += 1 continue - if created >= cutoff: + if ver in keep: + skipped_keep += 1 continue - version_id = v.get("id") - print(f" deleting v{version_id} tags={tags} created={v['created_at']}") + if ts >= cutoff: + skipped_age += 1 + continue + print(f" deleting {ver!r} id={v.get('id')} created={v.get('created_at')}") if not args.dry_run: api(token, "DELETE", - f"/api/v1/packages/{args.owner}/container/{args.package}/versions/{version_id}") + f"/api/v1/packages/{args.owner}/container/{args.package}/{quote(ver, safe='')}") deleted += 1 - print(f"done: {deleted} version(s) deleted") + + print(f"done: deleted={deleted} kept_named={skipped_keep} " + f"kept_recent={skipped_age} skipped_blobs={skipped_blob}") return 0 diff --git a/scripts/rerank_server.py b/scripts/rerank_server.py new file mode 100644 index 0000000..f7e08b5 --- /dev/null +++ b/scripts/rerank_server.py @@ -0,0 +1,120 @@ +"""Minimal HTTP reranker — `/v1/rerank` endpoint over a sentence-transformers CrossEncoder. + +Matches the Cohere `/v1/rerank` request/response shape, which is what the +server's `_rerank()` helper expects. This is the dev-friendly fallback; +production replaces this with the llama.cpp + jina-reranker-v2-base GGUF +sidecar (see deploy/docker-compose.yml) without changing the client. 
+ +Request: + POST /v1/rerank + {"model": "...", "query": "...", "documents": ["text", ...], "top_n": 10} + +Response: + {"model": "...", "results": [{"index": 0, "relevance_score": 0.93}, ...]} + +Usage: + python -m scripts.rerank_server # localhost:8001 + RERANK_MODEL=cross-encoder/ms-marco-MiniLM-L-12-v2 \\ + RERANK_PORT=8001 python -m scripts.rerank_server +""" +from __future__ import annotations + +import json +import logging +import os +import sys +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +log = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") + +MODEL_NAME = os.environ.get("RERANK_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2") +PORT = int(os.environ.get("RERANK_PORT", "8001")) +HOST = os.environ.get("RERANK_HOST", "127.0.0.1") +# Truncate docs to this many chars before scoring. jina-reranker GGUF has a +# 1024-token per-pair cap that 400s the entire batch; ms-marco is more +# forgiving but we still cap to keep latency predictable. 
+MAX_DOC_CHARS = int(os.environ.get("RERANK_MAX_DOC_CHARS", "2000")) + +_model = None + + +def _get_model(): + global _model + if _model is None: + from sentence_transformers import CrossEncoder + log.info("loading %s", MODEL_NAME) + _model = CrossEncoder(MODEL_NAME) + log.info("loaded") + return _model + + +def _rerank(query: str, documents: list[str], top_n: int | None) -> list[dict]: + model = _get_model() + pairs = [[query, (d or "")[:MAX_DOC_CHARS]] for d in documents] + scores = model.predict(pairs) + ranked = sorted( + ({"index": i, "relevance_score": float(s)} for i, s in enumerate(scores)), + key=lambda r: -r["relevance_score"], + ) + if top_n is not None: + ranked = ranked[:top_n] + return ranked + + +class Handler(BaseHTTPRequestHandler): + def log_message(self, fmt, *args): + log.info("%s - %s", self.address_string(), fmt % args) + + def _send_json(self, status: int, payload: dict) -> None: + body = json.dumps(payload).encode() + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + + def do_GET(self): # noqa: N802 + if self.path in ("/", "/health"): + self._send_json(200, {"status": "ok", "model": MODEL_NAME}) + return + self._send_json(404, {"error": "not found"}) + + def do_POST(self): # noqa: N802 + if self.path not in ("/v1/rerank", "/rerank"): + self._send_json(404, {"error": "not found"}) + return + length = int(self.headers.get("Content-Length", "0")) + try: + req = json.loads(self.rfile.read(length).decode()) + except Exception as e: + self._send_json(400, {"error": f"bad json: {e}"}) + return + query = req.get("query") + documents = req.get("documents") + if not isinstance(query, str) or not isinstance(documents, list): + self._send_json(400, {"error": "expected {query: str, documents: list[str]}"}) + return + top_n = req.get("top_n") + try: + results = _rerank(query, documents, top_n if isinstance(top_n, int) else 
None) + except Exception as e: + log.exception("rerank failed") + self._send_json(500, {"error": str(e)}) + return + self._send_json(200, {"model": MODEL_NAME, "results": results}) + + +def main() -> int: + _get_model() # warm-load before accepting traffic + server = ThreadingHTTPServer((HOST, PORT), Handler) + log.info("listening on http://%s:%d", HOST, PORT) + try: + server.serve_forever() + except KeyboardInterrupt: + log.info("shutting down") + return 0 + + +if __name__ == "__main__": + sys.exit(main())