build out morpheus-docs MCP stack, mirroring hvm-docs through Phases 1-13
Initial scaffold: the docs-mcp-template clone with all the
HVM-validated stack ported across, customized for Morpheus
Enterprise (PRODUCT_NAME=morpheus, server name morpheus-docs).
Bundles (live-discovered 2026-05-22; 1710 cataloged pages total):
* morpheus_user_manual_8_1_0 sd00007510en_us 568 pages (Feb 2026)
* morpheus_user_manual_8_1_1 sd00007621en_us 569 pages (Mar 2026)
* morpheus_user_manual_8_1_2 sd00007732en_us 569 pages (Apr 2026)
* morpheus_release_notes_8_1_0 sd00007496en_us single-doc
* morpheus_release_notes_8_1_1 sd00007610en_us single-doc
* morpheus_release_notes_8_1_2 sd00007733en_us single-doc
* morpheus_quickspecs a50009231enw html-file (live
curl_cffi against www.hpe.com; all 12+ Enterprise SKUs captured —
S6E64..S6E73AAE for new/renewal/upgrade × 1/3/5-yr terms, plus
services SKUs HA124A1#V38/V39 and H46SBA1).
No Deployment Guide or Qualification Matrix on HPE Support for
Morpheus Enterprise specifically — the only QM (sd00006551en_us)
covers HVM clusters managed by Morpheus and lives in hvm-docs.
Stack carried forward from hvm-docs:
* rag/{index,chunk,embeddings,bm25}.py — including the
MAX_CHARS=4000 chunk-cap fix for table-dense content
* docs_mcp/{server,usage}.py — 11 MCP tools, BM25-default search,
cross-encoder rerank, hybrid behind HYBRID_SEARCH=true,
morpheus_api_lessons (renamed from hvm_api_lessons), env-gated
submit_doc_bug
* docs_mcp/api_lessons.md — Morpheus-specific scaffold covering
licensing model, HVM elevation path, REST vs Plugin API, with
TODO markers for sections to flesh out from real ops experience
* scrape/{runner,quickspecs,changelog,bundles}.py — TOC + single-doc
+ html-file modes, curl_cffi Chrome120 for www.hpe.com edge bypass
* eval/{retrievers,run_eval}.py + queries.jsonl scaffold (4 placeholder
queries; populate after first scrape)
* scripts/{rerank_server,usage_report,registry_gc}.py
* .gitea/workflows/{refresh,image-only}.yml — same Gitea Actions
setup zerto-docs uses (push LAN, pull public-URL, GPU Ollama pool)
* deploy/docker-compose.yml — morpheus-docs-mcp service definition,
shared jina-rerank sidecar, Watchtower-labeled
* Dockerfile, requirements.txt, requirements-rerank.txt
Verified locally: scrape produced 1599 .md pages (some TOC entries
are parent-only and yield no body), 6353 chunks all under the 4 KB
cap, MCP server boots and lists 11 tools cleanly.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -14,21 +14,17 @@ on:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
REGISTRY_PUSH: <lan-host>:<port>
|
||||
REGISTRY_PULL: <public-registry-hostname>
|
||||
# Image name derives from the actual repo at runtime, so a clone
|
||||
# doesn't need to find/replace anything. e.g. justin/my-product-docs.
|
||||
# github.* context is Gitea Actions' inherited GitHub-Actions namespace
|
||||
# — values come from the Gitea server, not github.com.
|
||||
# PUSH goes to the LAN endpoint (HTTP) to bypass Cloudflare's 100 MB
|
||||
# body cap. PULL uses the public hostname (HTTPS). Same Gitea registry.
|
||||
REGISTRY_PUSH: 192.168.0.2:1234
|
||||
REGISTRY_PULL: git.jpaul.io
|
||||
IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
OLLAMA_URL: http://<gpu-host>:11434
|
||||
# Two GPU-pinned Ollama containers on the Gitea host — same infra
|
||||
# zerto-docs uses. :11435 = Titan X, :11436 = 1080 Ti. Indexer
|
||||
# round-robins per batch.
|
||||
OLLAMA_URLS: http://192.168.0.2:11435,http://192.168.0.2:11436
|
||||
EMBED_MODEL: nomic-embed-text
|
||||
# PRODUCT_NAME defaults to the repo name so a clone works without
|
||||
# editing. Override here if you want a different identifier (e.g.
|
||||
# repo "my-product-docs" → PRODUCT_NAME "myproduct"). Used as the
|
||||
# Chroma collection name, BM25 db filename, and MCP server name —
|
||||
# see docs_mcp/server.py.
|
||||
PRODUCT_NAME: ${{ github.event.repository.name }}
|
||||
PRODUCT_NAME: morpheus
|
||||
|
||||
jobs:
|
||||
build:
|
||||
@@ -39,8 +35,7 @@ jobs:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
# Full history (not shallow) so the digest-history step can
|
||||
# walk git log up to --history-days back.
|
||||
# Full history so digest-history can walk git log.
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python
|
||||
@@ -54,9 +49,8 @@ jobs:
|
||||
python -m pip install -q -r requirements.txt
|
||||
|
||||
- name: Refresh digest history
|
||||
# Cheap (a few seconds); doesn't touch corpus content.
|
||||
# Without this step, a code-only deploy would ship an
|
||||
# increasingly-stale digest history relative to git.
|
||||
# Cheap (few seconds). Without this step, a code-only deploy
|
||||
# would ship an increasingly-stale digest history.
|
||||
run: |
|
||||
mkdir -p corpus/.digest
|
||||
python -m scrape.changelog \
|
||||
@@ -71,42 +65,69 @@ jobs:
|
||||
- name: Rebuild indexes from existing corpus
|
||||
run: python -m rag.index --rebuild
|
||||
|
||||
- name: Log in to registry (LAN endpoint)
|
||||
run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u "${{ github.repository_owner }}" --password-stdin
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
# LAN registry is HTTP only.
|
||||
config-inline: |
|
||||
[registry."192.168.0.2:1234"]
|
||||
http = true
|
||||
insecure = true
|
||||
|
||||
- name: Build & push image
|
||||
- name: Configure registry credentials for buildx
|
||||
env:
|
||||
REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
|
||||
REGISTRY_USER: ${{ github.actor }}
|
||||
run: |
|
||||
SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12)
|
||||
DATE_TAG=$(date -u +%Y.%m.%d)
|
||||
docker build \
|
||||
-t "${REGISTRY_PUSH}/${IMAGE}:latest" \
|
||||
-t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
|
||||
-t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \
|
||||
.
|
||||
docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
|
||||
docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
|
||||
docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}"
|
||||
mkdir -p ~/.docker
|
||||
AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0)
|
||||
cat > ~/.docker/config.json <<EOF
|
||||
{
|
||||
"auths": {
|
||||
"192.168.0.2:1234": {
|
||||
"auth": "$AUTH"
|
||||
}
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
- name: Compute tags
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: 192.168.0.2:1234/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
tags: |
|
||||
type=raw,value=latest
|
||||
type=sha,prefix=,format=short
|
||||
type=raw,value={{date 'YYYY.MM.DD'}}
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://git.jpaul.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
org.opencontainers.image.url=https://git.jpaul.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
|
||||
- name: Build & push (amd64)
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
|
||||
- name: Link container package to this repo
|
||||
# Gitea container packages are owned by a USER, not a repo —
|
||||
# they don't auto-appear under the repo's Packages tab.
|
||||
# This API call creates the association. One-time-effective:
|
||||
# re-running returns 400 (or 409) once linked, which we swallow.
|
||||
# Endpoint requires Gitea 1.21+.
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
|
||||
run: |
|
||||
OWNER="${{ github.repository_owner }}"
|
||||
PKG="${{ github.event.repository.name }}"
|
||||
BODY=$(mktemp)
|
||||
CODE=$(curl -sS -o "$BODY" -w "%{http_code}" -X POST \
|
||||
code=$(curl -s -o /tmp/link.out -w "%{http_code}" -X POST \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
"https://${REGISTRY_PULL}/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
|
||||
echo "link http=$CODE body=$(cat "$BODY")"
|
||||
case "$CODE" in
|
||||
201) echo "linked package to ${OWNER}/${PKG}" ;;
|
||||
400) echo "already linked (re-link returns 400) — ok" ;;
|
||||
*) echo "unexpected status $CODE"; exit 1 ;;
|
||||
"https://git.jpaul.io/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
|
||||
echo "link ${OWNER}/container/${PKG} -> ${PKG}: HTTP ${code}"
|
||||
body=$(cat /tmp/link.out)
|
||||
case "$code" in
|
||||
201) echo "OK — newly linked" ;;
|
||||
400|409) echo "OK — already linked: ${body}" ;;
|
||||
*) echo "unexpected: ${body}"; exit 1 ;;
|
||||
esac
|
||||
|
||||
- name: Prune old container versions
|
||||
|
||||
@@ -19,27 +19,25 @@ on:
|
||||
default: false
|
||||
|
||||
env:
|
||||
# If your registry sits behind Cloudflare with its 100 MB body cap,
|
||||
# use a LAN endpoint for pushes (bypasses CF) and the public hostname
|
||||
# for pulls (response bodies aren't capped).
|
||||
REGISTRY_PUSH: <lan-host>:<port>
|
||||
REGISTRY_PULL: <public-registry-hostname>
|
||||
# Image name derives from the actual repo at runtime, so a clone
|
||||
# doesn't need to find/replace anything. e.g. justin/my-product-docs.
|
||||
# github.* context is Gitea Actions' inherited GitHub-Actions namespace
|
||||
# — values come from the Gitea server, not github.com.
|
||||
# PUSH goes to the LAN endpoint (HTTP) to bypass Cloudflare Tunnel's
|
||||
# 100 MB body cap. PULL uses the public hostname (HTTPS). Same Gitea
|
||||
# registry either way — package lands under the same owner/repo.
|
||||
REGISTRY_PUSH: 192.168.0.2:1234
|
||||
REGISTRY_PULL: git.jpaul.io
|
||||
|
||||
# Image name derives from the repo at runtime — clones don't need to
|
||||
# edit this. github.* is the Gitea-Actions inherited namespace.
|
||||
IMAGE: ${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
|
||||
# Embedder. One URL per GPU; the indexer round-robins.
|
||||
OLLAMA_URL: http://<gpu-host>:11434
|
||||
# Two GPU-pinned Ollama containers on the Gitea host — same infra
|
||||
# zerto-docs uses (deploy/ollama-rag.docker-compose.yml over there).
|
||||
# :11435 owns the Titan X, :11436 owns the 1080 Ti; the indexer
|
||||
# round-robins per batch so both cards run in parallel. The host's
|
||||
# primary Ollama on :11434 is left alone for OpenWebUI etc.
|
||||
OLLAMA_URLS: http://192.168.0.2:11435,http://192.168.0.2:11436
|
||||
EMBED_MODEL: nomic-embed-text
|
||||
|
||||
# PRODUCT_NAME defaults to the repo name so a clone works without
|
||||
# editing. Override here if you want a different identifier (e.g.
|
||||
# repo "my-product-docs" → PRODUCT_NAME "myproduct"). Used as the
|
||||
# Chroma collection name, BM25 db filename, and MCP server name —
|
||||
# see docs_mcp/server.py.
|
||||
PRODUCT_NAME: ${{ github.event.repository.name }}
|
||||
PRODUCT_NAME: morpheus
|
||||
|
||||
jobs:
|
||||
refresh:
|
||||
@@ -50,10 +48,12 @@ jobs:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
# Full history — required for the digest-history step to
|
||||
# walk git log. Default fetch-depth: 1 silently produces a
|
||||
# 0-byte history file.
|
||||
# Full history — required for digest-history. Default depth 1
|
||||
# silently produces a 0-byte history file.
|
||||
fetch-depth: 0
|
||||
# Set the credentials Gitea injects so we can push corpus
|
||||
# commits back. Persist them across the run.
|
||||
token: ${{ secrets.GITEA_TOKEN }}
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -89,8 +89,8 @@ jobs:
|
||||
- name: Commit corpus changes (if any)
|
||||
id: commit
|
||||
run: |
|
||||
git config user.name "<product>-docs-refresh"
|
||||
git config user.email "actions@<your-domain>"
|
||||
git config user.name "hvm-docs-refresh"
|
||||
git config user.email "actions@jpaul.io"
|
||||
git add bundles.json corpus
|
||||
if git diff --cached --quiet; then
|
||||
echo "no corpus changes — skipping reindex and image build"
|
||||
@@ -132,49 +132,89 @@ jobs:
|
||||
if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
|
||||
run: python -m rag.index --rebuild
|
||||
|
||||
# ---- Build & push image ------------------------------------
|
||||
- name: Log in to registry (LAN endpoint)
|
||||
# ---- Build & push image (LAN endpoint, buildx) -------------
|
||||
- name: Set up Docker Buildx
|
||||
if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
|
||||
run: echo "${{ secrets.REGISTRY_TOKEN }}" | docker login "${REGISTRY_PUSH}" -u "${{ github.repository_owner }}" --password-stdin
|
||||
uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
# LAN registry is HTTP only. Buildkit needs an explicit
|
||||
# insecure-registry config or it tries to upgrade to HTTPS.
|
||||
config-inline: |
|
||||
[registry."192.168.0.2:1234"]
|
||||
http = true
|
||||
insecure = true
|
||||
|
||||
- name: Build & push image
|
||||
- name: Configure registry credentials for buildx
|
||||
# Can't use docker/login-action against the LAN endpoint —
|
||||
# the host docker daemon errors on HTTP-vs-HTTPS. Buildx reads
|
||||
# ~/.docker/config.json directly, so write the auth ourselves.
|
||||
if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
|
||||
# Runner shell is /bin/sh — use cut instead of ${VAR::N}.
|
||||
# Three tags: :latest (Watchtower target), :<sha12>
|
||||
# (rollback pin), :<YYYY.MM.DD> (human-readable).
|
||||
env:
|
||||
REGISTRY_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
|
||||
REGISTRY_USER: ${{ github.actor }}
|
||||
run: |
|
||||
SHA_TAG=$(echo "$GITHUB_SHA" | cut -c1-12)
|
||||
DATE_TAG=$(date -u +%Y.%m.%d)
|
||||
docker build \
|
||||
-t "${REGISTRY_PUSH}/${IMAGE}:latest" \
|
||||
-t "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}" \
|
||||
-t "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}" \
|
||||
.
|
||||
docker push "${REGISTRY_PUSH}/${IMAGE}:latest"
|
||||
docker push "${REGISTRY_PUSH}/${IMAGE}:${SHA_TAG}"
|
||||
docker push "${REGISTRY_PUSH}/${IMAGE}:${DATE_TAG}"
|
||||
mkdir -p ~/.docker
|
||||
AUTH=$(printf '%s:%s' "$REGISTRY_USER" "$REGISTRY_TOKEN" | base64 -w0)
|
||||
cat > ~/.docker/config.json <<EOF
|
||||
{
|
||||
"auths": {
|
||||
"192.168.0.2:1234": {
|
||||
"auth": "$AUTH"
|
||||
}
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
- name: Compute tags
|
||||
id: meta
|
||||
if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
# Tag with the LAN hostname so the push goes over LAN.
|
||||
# docker-compose on the deploy host pulls via git.jpaul.io.
|
||||
images: 192.168.0.2:1234/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
tags: |
|
||||
type=raw,value=latest
|
||||
type=sha,prefix=,format=short
|
||||
type=schedule,pattern={{date 'YYYY.MM.DD'}}
|
||||
type=raw,value={{date 'YYYY.MM.DD'}}
|
||||
# Override auto-derived labels with the PUBLIC URL so Gitea
|
||||
# can auto-link the package back to this repo.
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://git.jpaul.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
org.opencontainers.image.url=https://git.jpaul.io/${{ github.repository_owner }}/${{ github.event.repository.name }}
|
||||
|
||||
- name: Build & push (amd64)
|
||||
if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
|
||||
- name: Link container package to this repo
|
||||
# Gitea container packages are owned by a USER, not a repo —
|
||||
# they don't auto-appear under the repo's Packages tab.
|
||||
# This API call creates the association. One-time-effective:
|
||||
# re-running returns 400 once linked, which we swallow.
|
||||
# Endpoint requires Gitea 1.21+.
|
||||
# Idempotent linkage so the package shows under the repo's
|
||||
# Packages tab. Gitea's auto-link from the source label is
|
||||
# unreliable in this setup (the runner reports an internal
|
||||
# server URL), so we link explicitly. 201 = newly linked,
|
||||
# 400 or 409 = already linked (both treated as success).
|
||||
if: steps.commit.outputs.changed == 'true' || inputs.force_build == true
|
||||
env:
|
||||
GITEA_TOKEN: ${{ secrets.REGISTRY_TOKEN }}
|
||||
run: |
|
||||
OWNER="${{ github.repository_owner }}"
|
||||
PKG="${{ github.event.repository.name }}"
|
||||
BODY=$(mktemp)
|
||||
CODE=$(curl -sS -o "$BODY" -w "%{http_code}" -X POST \
|
||||
code=$(curl -s -o /tmp/link.out -w "%{http_code}" -X POST \
|
||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
||||
"https://${REGISTRY_PULL}/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
|
||||
echo "link http=$CODE body=$(cat "$BODY")"
|
||||
case "$CODE" in
|
||||
201) echo "linked package to ${OWNER}/${PKG}" ;;
|
||||
400) echo "already linked (re-link returns 400) — ok" ;;
|
||||
*) echo "unexpected status $CODE"; exit 1 ;;
|
||||
"https://git.jpaul.io/api/v1/packages/${OWNER}/container/${PKG}/-/link/${PKG}")
|
||||
echo "link ${OWNER}/container/${PKG} -> ${PKG}: HTTP ${code}"
|
||||
body=$(cat /tmp/link.out)
|
||||
case "$code" in
|
||||
201) echo "OK — newly linked" ;;
|
||||
400|409) echo "OK — already linked: ${body}" ;;
|
||||
*) echo "unexpected: ${body}"; exit 1 ;;
|
||||
esac
|
||||
|
||||
# ---- Registry GC -------------------------------------------
|
||||
|
||||
+119
@@ -0,0 +1,119 @@
|
||||
[
|
||||
{
|
||||
"slug": "morpheus_user_manual_8_1_0",
|
||||
"doc_id": "sd00007510en_us",
|
||||
"title": "HPE Morpheus Enterprise Software Documentation v8.1.0",
|
||||
"version": "8.1.0",
|
||||
"platform": null,
|
||||
"product": "User Manual",
|
||||
"language": "en-US",
|
||||
"page_count": 568,
|
||||
"mode": "toc",
|
||||
"abstract": "",
|
||||
"dates": {
|
||||
"Published": "February 2026"
|
||||
},
|
||||
"landing_page": "GUID-709AAADB-A9C1-40B6-AD22-958EE7E6F312",
|
||||
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007510en_us"
|
||||
},
|
||||
{
|
||||
"slug": "morpheus_user_manual_8_1_1",
|
||||
"doc_id": "sd00007621en_us",
|
||||
"title": "HPE Morpheus Enterprise Software Documentation v8.1.1",
|
||||
"version": "8.1.1",
|
||||
"platform": null,
|
||||
"product": "User Manual",
|
||||
"language": "en-US",
|
||||
"page_count": 569,
|
||||
"mode": "toc",
|
||||
"abstract": "",
|
||||
"dates": {
|
||||
"Published": "March 2026"
|
||||
},
|
||||
"landing_page": "GUID-709AAADB-A9C1-40B6-AD22-958EE7E6F312",
|
||||
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007621en_us"
|
||||
},
|
||||
{
|
||||
"slug": "morpheus_user_manual_8_1_2",
|
||||
"doc_id": "sd00007732en_us",
|
||||
"title": "HPE Morpheus Enterprise Software Documentation v8.1.2",
|
||||
"version": "8.1.2",
|
||||
"platform": null,
|
||||
"product": "User Manual",
|
||||
"language": "en-US",
|
||||
"page_count": 569,
|
||||
"mode": "toc",
|
||||
"abstract": "",
|
||||
"dates": {
|
||||
"Published": "April 2026"
|
||||
},
|
||||
"landing_page": "GUID-709AAADB-A9C1-40B6-AD22-958EE7E6F312",
|
||||
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007732en_us"
|
||||
},
|
||||
{
|
||||
"slug": "morpheus_release_notes_8_1_0",
|
||||
"doc_id": "sd00007496en_us",
|
||||
"title": "v8.1.0 Release Notes",
|
||||
"version": "8.1.0",
|
||||
"platform": null,
|
||||
"product": "Release Notes",
|
||||
"language": "en-US",
|
||||
"page_count": 1,
|
||||
"mode": "single",
|
||||
"abstract": "Release notes for HPE Morpheus Enterprise Software version v8.1.0",
|
||||
"dates": {
|
||||
"Published": "February 2026"
|
||||
},
|
||||
"landing_page": "sd00007496en_us",
|
||||
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007496en_us"
|
||||
},
|
||||
{
|
||||
"slug": "morpheus_release_notes_8_1_1",
|
||||
"doc_id": "sd00007610en_us",
|
||||
"title": "v8.1.1 Release Notes",
|
||||
"version": "8.1.1",
|
||||
"platform": null,
|
||||
"product": "Release Notes",
|
||||
"language": "en-US",
|
||||
"page_count": 1,
|
||||
"mode": "single",
|
||||
"abstract": "Release notes for HPE Morpheus Enterprise Software version v8.1.1",
|
||||
"dates": {
|
||||
"Published": "March 2026"
|
||||
},
|
||||
"landing_page": "sd00007610en_us",
|
||||
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007610en_us"
|
||||
},
|
||||
{
|
||||
"slug": "morpheus_release_notes_8_1_2",
|
||||
"doc_id": "sd00007733en_us",
|
||||
"title": "v8.1.2 Release Notes",
|
||||
"version": "8.1.2",
|
||||
"platform": null,
|
||||
"product": "Release Notes",
|
||||
"language": "en-US",
|
||||
"page_count": 1,
|
||||
"mode": "single",
|
||||
"abstract": "Release notes for HPE Morpheus Enterprise Software version v8.1.2",
|
||||
"dates": {
|
||||
"Published": "April 2026"
|
||||
},
|
||||
"landing_page": "sd00007733en_us",
|
||||
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007733en_us"
|
||||
},
|
||||
{
|
||||
"slug": "morpheus_quickspecs",
|
||||
"doc_id": "a50009231enw",
|
||||
"title": "HPE Morpheus Enterprise Software QuickSpecs",
|
||||
"version": "v1",
|
||||
"platform": null,
|
||||
"product": "QuickSpecs",
|
||||
"language": "en-US",
|
||||
"page_count": 1,
|
||||
"mode": "html-file",
|
||||
"abstract": "",
|
||||
"dates": {},
|
||||
"landing_page": "a50009231enw",
|
||||
"source_url": "https://www.hpe.com/psnow/doc/a50009231enw"
|
||||
}
|
||||
]
|
||||
+23
-17
@@ -1,6 +1,6 @@
|
||||
# Hosting stack for a docs MCP server.
|
||||
#
|
||||
# Replace <product> below with your product name on first deploy.
|
||||
# Configured for morpheus; replace it with your product name if you reuse this stack.
|
||||
# Volumes: usage logs are mounted to a host path so they survive
|
||||
# Watchtower-driven container recreates.
|
||||
#
|
||||
@@ -10,15 +10,15 @@
|
||||
services:
|
||||
|
||||
# The MCP server. Watchtower auto-pulls on :latest changes.
|
||||
<product>-docs-mcp:
|
||||
image: <registry>/<owner>/<product>-docs-mcp:latest
|
||||
container_name: <product>-docs-mcp
|
||||
morpheus-docs-mcp:
|
||||
image: git.jpaul.io/justin/morpheus-docs:latest
|
||||
container_name: morpheus-docs-mcp
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "8000:8000"
|
||||
environment:
|
||||
PRODUCT_NAME: "<product>"
|
||||
PRODUCT_DOCS_URL: "https://docs.example.com"
|
||||
PRODUCT_NAME: "morpheus"
|
||||
PRODUCT_DOCS_URL: "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007732en_us"
|
||||
|
||||
# Streamable-HTTP transport. Stateless mode is required for
|
||||
# production: clients don't lose sessions when Watchtower
|
||||
@@ -28,19 +28,21 @@ services:
|
||||
MCP_PORT: "8000"
|
||||
|
||||
# If you run MetaMCP or another gateway in front and reach
|
||||
# this container via its compose DNS name (e.g. <product>-docs-mcp:8000),
|
||||
# this container via its compose DNS name (e.g. morpheus-docs-mcp:8000),
|
||||
# add that hostname here. "*" disables the rebind check entirely.
|
||||
MCP_ALLOWED_HOSTS: "<product>-docs-mcp,localhost,127.0.0.1"
|
||||
MCP_ALLOWED_HOSTS: "morpheus-docs-mcp,localhost,127.0.0.1"
|
||||
|
||||
# Phase 6 — reranker sidecar (jina-reranker-v2-base via llama.cpp).
|
||||
RERANK_URL: http://<product>-rerank:8080
|
||||
RERANK_URL: http://hvm-rerank:8080
|
||||
RERANK_POOL: "200"
|
||||
RERANK_TIMEOUT: "30"
|
||||
|
||||
# Phase 8 — hybrid retrieval (BM25 + dense + RRF). Set true
|
||||
# only after the eval harness shows the dense-only path
|
||||
# missing technical-term queries that BM25 catches.
|
||||
HYBRID_SEARCH: "true"
|
||||
# Phase 8 — hybrid retrieval (BM25 + dense + RRF).
|
||||
# Eval on the HVM corpus (eval/results/baseline.md, 2026-05-22) shows
|
||||
# BM25-default + reranker beats hybrid on every metric (MRR 0.920 vs
|
||||
# 0.875). Leaving HYBRID_SEARCH off so search_docs runs BM25-first +
|
||||
# reranker; dense is the fallback when BM25 finds nothing.
|
||||
HYBRID_SEARCH: "false"
|
||||
|
||||
# Phase 10 — usage telemetry.
|
||||
USAGE_LOG_DIR: /app/var/logs
|
||||
@@ -52,9 +54,9 @@ services:
|
||||
# DOC_BUG_API_URL: "https://docs-be.example.com/api/feedback"
|
||||
volumes:
|
||||
# Usage logs persist across container recreates.
|
||||
- ./<product>-docs-mcp-logs:/app/var/logs
|
||||
- ./morpheus-docs-mcp-logs:/app/var/logs
|
||||
depends_on:
|
||||
- <product>-rerank
|
||||
- hvm-rerank
|
||||
labels:
|
||||
# Watchtower polls *only* containers with this label set true.
|
||||
com.centurylinklabs.watchtower.enable: "true"
|
||||
@@ -63,9 +65,13 @@ services:
|
||||
|
||||
# Reranker sidecar — llama.cpp serving jina-reranker-v2-base.
|
||||
# Requires GPU access; adjust runtime/devices for your hardware.
|
||||
<product>-rerank:
|
||||
#
|
||||
# For dev / CPU-only hosts, swap this service for scripts/rerank_server.py
|
||||
# (sentence-transformers ms-marco-MiniLM-L-6-v2). Same /v1/rerank shape,
|
||||
# ~500ms/batch on CPU vs ~50ms on GPU with the jina GGUF.
|
||||
hvm-rerank:
|
||||
image: ghcr.io/ggml-org/llama.cpp:server-cuda
|
||||
container_name: <product>-rerank
|
||||
container_name: hvm-rerank
|
||||
restart: unless-stopped
|
||||
# Mount the GGUF model from the host. Download from huggingface
|
||||
# (gguf-org/jina-reranker-v2-base-multilingual-GGUF) first.
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
# HPE Morpheus Enterprise — Lessons
|
||||
|
||||
Notes and gotchas about running, integrating with, and licensing
|
||||
**HPE Morpheus Enterprise Software** that aren't obvious from the
|
||||
official docs alone. The official User Manual + Release Notes +
|
||||
QuickSpecs describe the product as designed; this file is what
|
||||
experienced operators actually learn.
|
||||
|
||||
> Treat this as living context. Update it when you (or the LLM
|
||||
> driving this MCP) discover something non-obvious that the docs
|
||||
> don't say or don't make findable. Each section is an H2 so the
|
||||
> `morpheus_api_lessons(topic=...)` tool can return just the
|
||||
> relevant piece.
|
||||
|
||||
## TL;DR
|
||||
|
||||
- **Morpheus Enterprise is the full cloud-management platform.** HPE
|
||||
Morpheus VM Essentials (HVM) is the VM-only subset; Morpheus
|
||||
Enterprise is what you "elevate to" when you need multi-cloud,
|
||||
containers, automation, policy, FinOps, ITSM integration, and
|
||||
self-service catalogs. The relationship is one-way upgrade.
|
||||
- **Licensing is per physical CPU socket** on connected on-prem
|
||||
clouds (bare metal, hypervisor hosts, Kubernetes worker nodes).
|
||||
Public-cloud workloads (AWS / Azure / GCP / OCI) are factored at
|
||||
**15 workloads per socket** equivalent.
|
||||
- **All license SKUs include Tech Care Essentials 24×7** as part
|
||||
of the license cost. There is no separate purchase for support
|
||||
on the license tier.
|
||||
- **`morpheus_quickspecs` is the source of truth for SKUs.** Don't
|
||||
guess part numbers; query the QuickSpecs bundle.
|
||||
|
||||
## Licensing and SKUs
|
||||
|
||||
**Source of truth: the `morpheus_quickspecs` bundle.** Query it for
|
||||
the current SKU list — the catalog updates more often than this
|
||||
file does.
|
||||
|
||||
Pricing model summary (from QuickSpecs v1, 2026):
|
||||
|
||||
- **Per physical CPU socket** for connected on-prem clouds —
|
||||
KVM/HVM hosts, VMware ESXi hosts, bare metal servers, Kubernetes
|
||||
worker nodes. Count the **sockets**, not the cores; not the VMs.
|
||||
- **Public cloud workloads factor at 15:1** — one socket of license
|
||||
covers up to 15 public-cloud workloads (instances) across AWS,
|
||||
Azure, GCP, OCI.
|
||||
- **Term-based** licensing (not perpetual). 1, 3, and 5-year terms
|
||||
on E-LTU SKUs.
|
||||
- **All include HPE Tech Care Essentials** (24×7 support, 15-minute
|
||||
response for severity-1) bundled into the license cost.
|
||||
|
||||
> The exact ratios and SKU names can change between QuickSpecs
|
||||
> revisions. Use the `morpheus_quickspecs` tool / bundle for current
|
||||
> values rather than memorizing.
|
||||
|
||||
## Elevation from HVM
|
||||
|
||||
The "elevate to Morpheus Enterprise" path is the canonical journey
|
||||
for customers who started on HVM and outgrew it:
|
||||
|
||||
- **HVM clusters keep working unchanged after elevation.** You
|
||||
don't redeploy the manager; you upgrade-in-place using a
|
||||
Morpheus Enterprise license.
|
||||
- **What changes:** the manager UI unlocks the full Enterprise
|
||||
feature set — public-cloud integrations, container/Kubernetes
|
||||
management, blueprints/catalogs, automation workflows, policy
|
||||
engine, FinOps cost dashboards, ITSM connectors (ServiceNow etc.),
|
||||
and the full REST API surface.
|
||||
- **Existing HVM-tier work products survive the elevation:**
|
||||
Instance backups, network pools, storage providers, user
|
||||
accounts, integrations, scheduled jobs, etc.
|
||||
|
||||
The HVM User Manual page `Elevating to HPE Morpheus Enterprise`
|
||||
(GUID-ECCA4FDD-37C8-45CE-A71F-C6E73B3BA713) walks the procedure.
|
||||
See also the sibling `hvm-docs` MCP's
|
||||
`hvm_user_manual_8_1_*` bundles.
|
||||
|
||||
## API surface — Plugin vs REST
|
||||
|
||||
Morpheus exposes two completely separate extensibility surfaces:
|
||||
|
||||
- **REST API** at `https://<manager>/api/` — external automation
|
||||
and integration. Bearer-token authentication; tokens issued from
|
||||
the user profile → API tokens UI. Full Enterprise API surface
|
||||
available (vs HVM-only managers which 404 on Enterprise-only
|
||||
endpoints).
|
||||
- **Plugin API** — server-side extensions that load INTO the
|
||||
manager process. Versioned independently of the platform
|
||||
(Plugin API version listed in the Release Notes for each
|
||||
Morpheus version). A plugin built for Plugin API 1.3.x may not
|
||||
load on 1.4.x without changes.
|
||||
|
||||
**TODO — fill in real operational lessons as we accumulate them.**
|
||||
|
||||
## Multi-cloud onboarding
|
||||
|
||||
**TODO.** Each cloud (AWS, Azure, GCP, OCI, VMware vSphere, KVM/HVM,
|
||||
OpenStack, Nutanix, etc.) has its own onboarding ritual: credentials,
|
||||
networking, IAM roles, regions, storage providers, image catalogs.
|
||||
Search the User Manual: `search_docs(query="Add AWS cloud
|
||||
integration")`, `search_docs(query="Azure subscription cost")`, etc.
|
||||
|
||||
## Tenancy, RBAC, and groups
|
||||
|
||||
**TODO.** Morpheus Enterprise tenancy is one of the more complex areas
|
||||
— tenants, roles, groups, account groups, persona-based access.
|
||||
Lessons specific to "what surprised me" go here.
|
||||
|
||||
## Backups
|
||||
|
||||
**TODO.** Morpheus Enterprise inherits the backup framework HVM
|
||||
introduced (Storage Buckets, Execution Schedules, Backup Jobs)
|
||||
and adds: cloud-native backup integrations (AWS Backup, Azure
|
||||
Backup), per-instance backup policies via the policy engine,
|
||||
ServiceNow-driven backup orchestration. Document the gotchas you
|
||||
hit.
|
||||
|
||||
## Common operational gotchas
|
||||
|
||||
**TODO.** This is where the "experienced operator hallway
|
||||
conversation" notes go. Examples to seed (delete or replace as you
|
||||
learn):
|
||||
|
||||
- **Service plan vs Instance type** — same concept, different
|
||||
contexts. A service plan is the sizing template ("small / medium
|
||||
/ large with these CPU/RAM"); an instance type is what you
|
||||
provision FROM the plan. Operators conflate them.
|
||||
- **Cloud integration credentials are tenant-scoped, not
|
||||
global.** Adding a credential at the master tenant doesn't
|
||||
cascade — sub-tenants need their own (or the policy engine
|
||||
granting access).
|
||||
- **Policy engine vs Logic library** — both live under
|
||||
Library/Automation, both can gate provisioning. Policies are
|
||||
preventive (block bad config), logic is generative (run scripts
|
||||
on lifecycle events). Pick the right tool.
|
||||
|
||||
## Adding to this doc
|
||||
|
||||
Two ways:
|
||||
|
||||
1. Manually edit `docs_mcp/api_lessons.md` in this repo and commit.
|
||||
The next image build picks it up.
|
||||
2. Use `submit_doc_bug` for upstream issues, and append the
|
||||
takeaway here once the docs team responds.
|
||||
|
||||
The point of this doc is to surface the kind of context an
|
||||
experienced operator would mention in a hallway conversation but
|
||||
that doesn't quite fit anywhere in the formal product docs. Keep
|
||||
sections tight — one H2 = one topic the LLM can return on demand.
|
||||
+1077
-40
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,4 @@
|
||||
{"query": "what's the per-socket licensing model for Morpheus Enterprise", "expected": [{"bundle_id": "morpheus_quickspecs", "page_id": "a50009231enw"}], "tags": ["licensing", "skus"]}
|
||||
{"query": "add an AWS cloud integration", "expected": [], "tags": ["cloud", "TODO-populate-after-first-scrape"]}
|
||||
{"query": "Plugin API version compatibility", "expected": [], "tags": ["api", "TODO"]}
|
||||
{"query": "Morpheus Enterprise 8.1.2 what's new", "expected": [{"bundle_id": "morpheus_release_notes_8_1_2", "page_id": "sd00007733en_us"}], "tags": ["release-notes"]}
|
||||
+118
-28
@@ -10,7 +10,7 @@ to one entry; the highest-ranked chunk's position wins).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Protocol, Iterable
|
||||
from typing import Iterable, Protocol
|
||||
|
||||
|
||||
class Retriever(Protocol):
|
||||
@@ -21,12 +21,17 @@ class Retriever(Protocol):
|
||||
...
|
||||
|
||||
|
||||
def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> list[tuple[str, str]]:
|
||||
"""Take a stream of (bundle_id, page_id, chunk_ordinal) and return
|
||||
the first k unique pages in their first-seen order."""
|
||||
def _split_chunk_id(chunk_id: str) -> tuple[str, str, int]:
|
||||
"""`bundle::page::ordinal` -> (bundle, page, int(ordinal))."""
|
||||
bid, pid, ordinal = chunk_id.split("::")
|
||||
return bid, pid, int(ordinal)
|
||||
|
||||
|
||||
def _collapse_to_pages(chunk_ids: Iterable[str], k: int) -> list[tuple[str, str]]:
|
||||
seen: set[tuple[str, str]] = set()
|
||||
out: list[tuple[str, str]] = []
|
||||
for bid, pid, _ord in chunk_ids:
|
||||
for cid in chunk_ids:
|
||||
bid, pid, _ord = _split_chunk_id(cid)
|
||||
key = (bid, pid)
|
||||
if key in seen:
|
||||
continue
|
||||
@@ -37,26 +42,111 @@ def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> lis
|
||||
return out
|
||||
|
||||
|
||||
# TODO Phase 2/3 — implement these once Chroma + the bm25 module are
|
||||
# in place. Each one is small (15-30 LOC). The eval harness imports
|
||||
# from this module by class name.
|
||||
#
|
||||
# class DenseRetriever:
|
||||
# name = "dense"
|
||||
# def __init__(self, collection): self.col = collection
|
||||
# def retrieve(self, query, k=10): ...
|
||||
#
|
||||
# class RerankedRetriever:
|
||||
# name = "dense+rerank"
|
||||
# def __init__(self, collection, rerank_url, pool=200): ...
|
||||
# def retrieve(self, query, k=10): ...
|
||||
#
|
||||
# class BM25Retriever:
|
||||
# name = "bm25"
|
||||
# def __init__(self, bm25_index): ...
|
||||
# def retrieve(self, query, k=10): ...
|
||||
#
|
||||
# class HybridRetriever:
|
||||
# name = "bm25+dense+rrf"
|
||||
# def __init__(self, dense, bm25, k_rrf=60): ...
|
||||
# def retrieve(self, query, k=10): ...
|
||||
class DenseRetriever:
    """Embedding-based retriever over a Chroma collection.

    The query text is handed to the collection as-is, so the collection's
    configured embedding function does the encoding.
    """

    name = "dense"

    def __init__(self, collection, pool: int = 50):
        # `pool` is the number of chunks requested per query; the chunks are
        # collapsed to pages afterwards, so it should comfortably exceed k.
        self.pool = pool
        self.col = collection

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        """Return up to `k` unique (bundle_id, page_id) pairs for `query`."""
        response = self.col.query(query_texts=[query], n_results=self.pool)
        id_batches = response.get("ids") or [[]]
        # Exactly one query text was submitted, so only batch 0 is relevant.
        return _collapse_to_pages(id_batches[0], k)
|
||||
|
||||
|
||||
class BM25Retriever:
    """Lexical retriever backed by the project's BM25 index object."""

    name = "bm25"

    def __init__(self, bm25_index, pool: int = 200):
        # `pool` is the number of chunk hits requested from the index before
        # they are collapsed to pages.
        self.pool = pool
        self.bm = bm25_index

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        """Return up to `k` unique (bundle_id, page_id) pairs for `query`."""
        hits = self.bm.query(query, n=self.pool)

        def chunk_ids():
            # Each hit is a (chunk_id, score) pair; only the id is needed.
            for chunk_id, _score in hits:
                yield chunk_id

        return _collapse_to_pages(chunk_ids(), k)
|
||||
|
||||
|
||||
class HybridRetriever:
    """Fuse the dense and BM25 page rankings with Reciprocal Rank Fusion.

    Each page scores 1 / (k_rrf + rank) per ranking it appears in (rank is
    1-based) and the contributions are summed. Pages with equal fused scores
    keep first-seen order: dense results first, then BM25.
    """

    name = "hybrid_rrf"

    def __init__(self, dense: DenseRetriever, bm25: BM25Retriever, k_rrf: int = 60, pool: int = 100):
        self.dense = dense
        self.bm25 = bm25
        self.k_rrf = k_rrf
        # `pool` is the page budget requested from each underlying retriever.
        self.pool = pool

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        """Return the `k` best (bundle_id, page_id) pairs by fused score."""
        rankings = (
            self.dense.retrieve(query, k=self.pool),
            self.bm25.retrieve(query, k=self.pool),
        )
        fused: dict[tuple[str, str], float] = {}
        for ranking in rankings:
            for position, page in enumerate(ranking, start=1):
                fused[page] = fused.get(page, 0.0) + 1.0 / (self.k_rrf + position)
        # sorted() is stable, so ties stay in insertion order.
        best_first = sorted(fused, key=lambda page: -fused[page])
        return best_first[:k]
|
||||
|
||||
|
||||
def _rerank_pool(rerank_url: str, query: str, ids_and_texts: list[tuple[str, str]],
|
||||
timeout: float = 30.0) -> list[str] | None:
|
||||
"""POST to /v1/rerank, return ids in reranked order. None on failure."""
|
||||
if not ids_and_texts:
|
||||
return []
|
||||
import httpx
|
||||
try:
|
||||
with httpx.Client(timeout=timeout) as c:
|
||||
r = c.post(f"{rerank_url}/v1/rerank", json={
|
||||
"query": query,
|
||||
"documents": [(t or "")[:2000] for _i, t in ids_and_texts],
|
||||
"top_n": len(ids_and_texts),
|
||||
})
|
||||
r.raise_for_status()
|
||||
results = r.json().get("results") or []
|
||||
return [ids_and_texts[item["index"]][0] for item in results
|
||||
if isinstance(item.get("index"), int)
|
||||
and 0 <= item["index"] < len(ids_and_texts)]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
class RerankedRetriever:
    """Wrap a base retriever and reorder its pages with the rerank service.

    The instance `name` is derived from the base retriever's name plus
    `name_suffix`, e.g. `bm25+rerank`.
    """

    def __init__(self, base: Retriever, collection, rerank_url: str, name_suffix: str = "rerank",
                 pool: int = 50, timeout: float = 30.0):
        self.name = f"{base.name}+{name_suffix}"
        self.base = base
        self.col = collection
        self.url = rerank_url
        self.pool = pool
        self.timeout = timeout

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        """Return up to `k` pages from the base retriever, reranked.

        Falls back to the base ordering when the rerank call fails.
        """
        pages = self.base.retrieve(query, k=self.pool)
        if not pages:
            return []

        # The base yields page-level pairs, but the reranker scores text, so
        # each page is represented by the text of its ordinal-0 chunk.
        chunk_ids = [f"{bid}::{pid}::0" for bid, pid in pages]
        fetched = self.col.get(ids=chunk_ids, include=["documents"])
        text_by_id = dict(zip(fetched["ids"], fetched["documents"]))
        candidates = [(cid, text_by_id.get(cid, "")) for cid in chunk_ids]

        order = _rerank_pool(self.url, query, candidates, timeout=self.timeout)
        if order is None:
            return pages[:k]

        reranked: list[tuple[str, str]] = []
        emitted: set[tuple[str, str]] = set()
        for cid in order:
            bid, pid, _ordinal = cid.split("::")
            page = (bid, pid)
            if page not in emitted:
                emitted.add(page)
                reranked.append(page)
                if len(reranked) >= k:
                    break
        return reranked
|
||||
|
||||
+81
-9
@@ -76,15 +76,87 @@ def main() -> int:
|
||||
queries = load_queries(args.queries)
|
||||
print(f"loaded {len(queries)} queries")
|
||||
|
||||
# TODO Phase 7: instantiate the retrievers you implemented in
|
||||
# eval/retrievers.py and run each one against each query.
|
||||
# Aggregate MRR / Recall@K / nDCG@K per retriever. Emit a
|
||||
# markdown table to args.output. Commit the file alongside the
|
||||
# PR that changes retrieval.
|
||||
raise NotImplementedError(
|
||||
"Wire up the retrievers in eval/retrievers.py first, then "
|
||||
"fill in this evaluation loop. See PLAN.md Phase 7."
|
||||
)
|
||||
import os
|
||||
import chromadb
|
||||
from chromadb.config import Settings
|
||||
from rag.embeddings import embedding_function
|
||||
from rag.bm25 import BM25Index
|
||||
from eval.retrievers import DenseRetriever, BM25Retriever, HybridRetriever
|
||||
|
||||
product = os.environ.get("PRODUCT_NAME", "hvm")
|
||||
repo_root = Path(__file__).resolve().parent.parent
|
||||
client = chromadb.PersistentClient(path=str(repo_root / "chroma"),
|
||||
settings=Settings(anonymized_telemetry=False))
|
||||
col = client.get_collection(f"{product}_docs", embedding_function=embedding_function())
|
||||
bm = BM25Index(str(repo_root / "bm25" / f"{product}_docs.db"))
|
||||
|
||||
from eval.retrievers import RerankedRetriever
|
||||
|
||||
dense = DenseRetriever(col)
|
||||
bm25 = BM25Retriever(bm)
|
||||
hybrid = HybridRetriever(DenseRetriever(col, pool=100), BM25Retriever(bm, pool=100))
|
||||
|
||||
retrievers = [dense, bm25, hybrid]
|
||||
|
||||
rerank_url = os.environ.get("RERANK_URL", "").rstrip("/")
|
||||
if rerank_url:
|
||||
retrievers += [
|
||||
RerankedRetriever(bm25, col, rerank_url, name_suffix="rerank", pool=50),
|
||||
RerankedRetriever(hybrid, col, rerank_url, name_suffix="rerank", pool=50),
|
||||
]
|
||||
print(f"reranker enabled: {rerank_url}")
|
||||
|
||||
rows: dict[str, dict[str, float]] = {}
|
||||
per_query: list[dict] = []
|
||||
for r in retrievers:
|
||||
mrr_sum = recall_sum = ndcg_sum = 0.0
|
||||
elapsed_sum = 0.0
|
||||
for q in queries:
|
||||
expected = [(e["bundle_id"], e["page_id"]) for e in q["expected"]]
|
||||
t0 = time.time()
|
||||
retrieved = r.retrieve(q["query"], k=max(args.k, 10))
|
||||
elapsed = time.time() - t0
|
||||
mrr = reciprocal_rank(retrieved, expected)
|
||||
recall = recall_at_k(retrieved, expected, args.k)
|
||||
ndcg = ndcg_at_k(retrieved, expected, args.k)
|
||||
mrr_sum += mrr
|
||||
recall_sum += recall
|
||||
ndcg_sum += ndcg
|
||||
elapsed_sum += elapsed
|
||||
per_query.append({
|
||||
"retriever": r.name, "query": q["query"],
|
||||
"mrr": mrr, "recall@k": recall, "ndcg@k": ndcg,
|
||||
"top1": list(retrieved[0]) if retrieved else None,
|
||||
"elapsed_s": round(elapsed, 3),
|
||||
})
|
||||
n = len(queries)
|
||||
rows[r.name] = {
|
||||
"MRR": mrr_sum / n,
|
||||
f"Recall@{args.k}": recall_sum / n,
|
||||
f"nDCG@{args.k}": ndcg_sum / n,
|
||||
"avg_latency_s": elapsed_sum / n,
|
||||
}
|
||||
print(f" {r.name}: MRR={rows[r.name]['MRR']:.3f} "
|
||||
f"Recall@{args.k}={rows[r.name][f'Recall@{args.k}']:.3f} "
|
||||
f"nDCG@{args.k}={rows[r.name][f'nDCG@{args.k}']:.3f} "
|
||||
f"avg={rows[r.name]['avg_latency_s']*1000:.0f}ms")
|
||||
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
md = [f"# Retrieval eval — k={args.k}", "",
|
||||
f"_{len(queries)} hand-curated queries, generated {time.strftime('%Y-%m-%d %H:%M:%S')}_", "",
|
||||
"| Retriever | MRR | Recall@{k} | nDCG@{k} | avg latency |".replace("{k}", str(args.k)),
|
||||
"| --- | ---: | ---: | ---: | ---: |"]
|
||||
for name, m in rows.items():
|
||||
md.append(f"| `{name}` | {m['MRR']:.3f} | {m[f'Recall@{args.k}']:.3f} "
|
||||
f"| {m[f'nDCG@{args.k}']:.3f} | {m['avg_latency_s']*1000:.0f}ms |")
|
||||
md += ["", "## Per-query results", "",
|
||||
"| Retriever | Query | MRR | top-1 |", "| --- | --- | ---: | --- |"]
|
||||
for r in per_query:
|
||||
top1 = f"`{r['top1'][0]}/{r['top1'][1][:24]}...`" if r["top1"] else "—"
|
||||
md.append(f"| `{r['retriever']}` | {r['query'][:60]} | {r['mrr']:.3f} | {top1} |")
|
||||
args.output.write_text("\n".join(md) + "\n")
|
||||
print(f"wrote {args.output}")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
+39
-11
@@ -31,6 +31,31 @@ from typing import Iterator
|
||||
CHARS_PER_TOKEN = 4
|
||||
TARGET_TOKENS = 500
|
||||
TARGET_CHARS = TARGET_TOKENS * CHARS_PER_TOKEN
|
||||
# Hard cap: nomic-embed-text's context is 2048 tokens. Anything larger
|
||||
# 400s the entire embed batch. 6000 chars works for prose but markdown
|
||||
# tables with lots of `|` separators tokenize ~1.4× denser; a 5839-char
|
||||
# table chunk from the HVM qualification matrix tokenized past 2048 and
|
||||
# crashed the rebuild. 4000 chars stays under 2048 tokens even for
|
||||
# dense table content while leaving headroom for the query side.
|
||||
MAX_CHARS = 4000
|
||||
|
||||
|
||||
def _hard_split(text: str) -> list[str]:
|
||||
"""Split an oversized block on line boundaries into MAX_CHARS pieces."""
|
||||
if len(text) <= MAX_CHARS:
|
||||
return [text]
|
||||
out: list[str] = []
|
||||
buf: list[str] = []
|
||||
buf_chars = 0
|
||||
for line in text.splitlines(keepends=True):
|
||||
if buf_chars + len(line) > MAX_CHARS and buf:
|
||||
out.append("".join(buf).rstrip())
|
||||
buf, buf_chars = [], 0
|
||||
buf.append(line)
|
||||
buf_chars += len(line)
|
||||
if buf:
|
||||
out.append("".join(buf).rstrip())
|
||||
return out
|
||||
|
||||
|
||||
def estimate_tokens(text: str) -> int:
|
||||
@@ -104,23 +129,26 @@ def chunks_from_page(
|
||||
|
||||
# ----- Body chunks: pack paragraphs up to TARGET_CHARS -------
|
||||
ordinal = 1
|
||||
|
||||
def emit(buf: list[str]) -> Iterator[dict]:
|
||||
nonlocal ordinal
|
||||
merged = "\n\n".join(buf)
|
||||
for piece in _hard_split(merged):
|
||||
yield {
|
||||
"id": f"{metadata['bundle_id']}::{page_id}::{ordinal}",
|
||||
"text": piece,
|
||||
"metadata": {**metadata, "ordinal": ordinal},
|
||||
}
|
||||
ordinal += 1
|
||||
|
||||
buf: list[str] = []
|
||||
buf_chars = 0
|
||||
for p in paragraphs:
|
||||
if buf_chars + len(p) > TARGET_CHARS and buf:
|
||||
yield {
|
||||
"id": f"{metadata['bundle_id']}::{page_id}::{ordinal}",
|
||||
"text": "\n\n".join(buf),
|
||||
"metadata": {**metadata, "ordinal": ordinal},
|
||||
}
|
||||
ordinal += 1
|
||||
yield from emit(buf)
|
||||
buf = []
|
||||
buf_chars = 0
|
||||
buf.append(p)
|
||||
buf_chars += len(p)
|
||||
if buf:
|
||||
yield {
|
||||
"id": f"{metadata['bundle_id']}::{page_id}::{ordinal}",
|
||||
"text": "\n\n".join(buf),
|
||||
"metadata": {**metadata, "ordinal": ordinal},
|
||||
}
|
||||
yield from emit(buf)
|
||||
|
||||
+21
-4
@@ -3,8 +3,15 @@
|
||||
Swappable: implement the same `embedding_function()` interface returning
|
||||
a Chroma `EmbeddingFunction` and the rest of the pipeline doesn't care.
|
||||
|
||||
Defaults (override via env):
|
||||
OLLAMA_URL one or more comma-separated URLs (load-balanced)
|
||||
Env-configurable (matches the zerto-docs-rag pattern so the same Gitea
|
||||
runner + GPU-pinned Ollama containers can serve every docs MCP build):
|
||||
|
||||
OLLAMA_URLS comma-separated list, load-balanced round-robin per batch.
|
||||
Preferred — set in the CI workflow to fan out across two
|
||||
GPU-pinned Ollama containers on the Gitea host.
|
||||
OLLAMA_URL single endpoint, fallback when OLLAMA_URLS is unset.
|
||||
Default http://192.168.0.2:11434 (the host where the GPUs
|
||||
live in Justin's lab).
|
||||
EMBED_MODEL model name; default 'nomic-embed-text'
|
||||
EMBED_DIM expected embedding dim; default 768 (nomic-embed-text)
|
||||
"""
|
||||
@@ -19,8 +26,18 @@ from chromadb import EmbeddingFunction, Documents, Embeddings
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
OLLAMA_URLS = [u.strip() for u in os.environ.get("OLLAMA_URL",
|
||||
"http://localhost:11434").split(",") if u.strip()]
|
||||
DEFAULT_OLLAMA_URL = "http://192.168.0.2:11434"
|
||||
|
||||
|
||||
def _resolve_urls() -> list[str]:
|
||||
raw = os.environ.get("OLLAMA_URLS", "").strip()
|
||||
if raw:
|
||||
return [u.strip().rstrip("/") for u in raw.split(",") if u.strip()]
|
||||
single = os.environ.get("OLLAMA_URL", DEFAULT_OLLAMA_URL).strip().rstrip("/")
|
||||
return [single]
|
||||
|
||||
|
||||
OLLAMA_URLS = _resolve_urls()
|
||||
EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")
|
||||
EMBED_DIM = int(os.environ.get("EMBED_DIM", "768"))
|
||||
|
||||
|
||||
+1
-1
@@ -29,7 +29,7 @@ CHROMA_DIR = ROOT / "chroma"
|
||||
|
||||
# Collection name — convention: <product>_docs. Override via env if needed.
|
||||
import os
|
||||
PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "myproduct")
|
||||
PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "morpheus")
|
||||
COLLECTION = f"{PRODUCT_NAME}_docs"
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
# Dev/CPU reranker — only for running scripts/rerank_server.py locally.
|
||||
# Production uses the llama.cpp + jina-reranker GGUF sidecar (see
|
||||
# deploy/docker-compose.yml). Install with:
|
||||
#
|
||||
# pip install -r requirements-rerank.txt
|
||||
#
|
||||
# This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder
|
||||
# (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main
|
||||
# requirements.txt so the production image stays slim.
|
||||
sentence-transformers>=3.0
|
||||
@@ -10,10 +10,18 @@ ollama>=0.4.0 # if using Ollama-hosted embedder; swap if not
|
||||
# Scraping (Phase 1; adjust per product)
|
||||
beautifulsoup4>=4.12
|
||||
requests>=2.31
|
||||
curl_cffi>=0.7 # for HPE QuickSpecs scrape (Chrome TLS impersonation)
|
||||
markdownify>=0.11
|
||||
# playwright>=1.40 # uncomment if you need headless browser fallback
|
||||
|
||||
# Evaluation
|
||||
numpy>=1.26
|
||||
|
||||
# Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server
|
||||
# only needs httpx (declared above) to call it. For the dev / CPU
|
||||
# fallback reranker (scripts/rerank_server.py), install
|
||||
# requirements-rerank.txt separately — it pulls in PyTorch which would
|
||||
# triple the production image size.
|
||||
|
||||
# Dev / utility
|
||||
python-dateutil>=2.8
|
||||
|
||||
@@ -7,6 +7,72 @@ the upstream doc portal.
|
||||
See `PLAN.md` Phase 1 for the corpus layout the rest of the pipeline
|
||||
expects.
|
||||
|
||||
---
|
||||
|
||||
## Product context — HPE Morpheus Enterprise Software
|
||||
|
||||
**This repo is for HPE Morpheus Enterprise**, the full cloud-management
|
||||
platform. It is a **different SKU** from HPE Morpheus VM Essentials
|
||||
(HVM), which has its own MCP at `../hvm-docs/`. Don't ingest HVM
|
||||
docs here; they're a separate, smaller product (the "VM-only" subset
|
||||
of Morpheus). The Morpheus VM Essentials Deployment Guide refers to
|
||||
Morpheus Enterprise as the "elevate to" target — that's the
|
||||
relationship.
|
||||
|
||||
`PRODUCT_NAME=morpheus`. Tool will be named `morpheus_api_lessons`,
|
||||
collection `morpheus_docs`, etc.
|
||||
|
||||
### Upstream portal
|
||||
|
||||
HPE Support DocPortal (Tridion/SDL-derived, same surface as HVM and
|
||||
the Zerto docs). Anonymous JSON API, no auth required.
|
||||
|
||||
| Endpoint | Returns |
|
||||
|---|---|
|
||||
| `GET https://support.hpe.com/hpesc/public/api/document/{docId}` | DITA-source HTML — title page / abstract OR (for short docs like Release Notes) the entire body |
|
||||
| `GET https://support.hpe.com/hpesc/public/api/document/{docId}/toc` | Nested JSON tree of `{topicName, topicLink, description, children}`. Empty/404 for single-doc Release Notes. |
|
||||
| `GET https://support.hpe.com/hpesc/public/api/document/{docId}/render?page=GUID-XXXX.html` | `{docId, page_html, doc_meta, page_meta}` — single page body |
|
||||
|
||||
User-facing URL format:
|
||||
`https://support.hpe.com/hpesc/public/docDisplay?docId={docId}&page=GUID-XXXX.html`
|
||||
|
||||
### Bundle IDs (confirmed 2026-05-22)
|
||||
|
||||
**Morpheus Enterprise User Manual** — ~569 pages each, full nested TOC:
|
||||
|
||||
| Version | docId |
|
||||
|---|---|
|
||||
| 8.1.0 | `sd00007510en_us` |
|
||||
| 8.1.1 | `sd00007621en_us` |
|
||||
| 8.1.2 | `sd00007732en_us` |
|
||||
|
||||
**Morpheus Enterprise Release Notes** — short, single-doc-blob shape
|
||||
(no TOC; full body returned by the `/document/{docId}` endpoint
|
||||
itself; scraper needs a `--single-doc` mode for these):
|
||||
|
||||
| Version | docId |
|
||||
|---|---|
|
||||
| 8.1.0 | `sd00007496en_us` |
|
||||
| 8.1.1 | `sd00007610en_us` |
|
||||
| 8.1.2 | `sd00007733en_us` |
|
||||
|
||||
### Cross-version peers are free
|
||||
|
||||
GUIDs are stable across versions (confirmed on HVM where 374/376/376
|
||||
pages had 100% GUID overlap between adjacent versions). Same-GUID =
|
||||
same-topic. Synthesize `topic_cluster.clustered_topics` by looking
|
||||
up the same GUID in the other bundle slugs — no fuzzy matching
|
||||
needed.
|
||||
|
||||
### Reusable from hvm-docs
|
||||
|
||||
`../hvm-docs/scrape/bundles.py` and `../hvm-docs/scrape/runner.py`
|
||||
solve the identical portal shape. Copy and adapt the BUNDLES list +
|
||||
PRODUCT_NAME; the fetch logic should drop in unchanged. Both the
|
||||
TOC-paginated path and the single-doc path are needed (the HVM
|
||||
build covers both because HVM Release Notes follow the same shape).
|
||||
|
||||
|
||||
## What you write
|
||||
|
||||
At minimum, two scripts:
|
||||
|
||||
@@ -0,0 +1,200 @@
|
||||
"""Discover Morpheus Enterprise doc bundles on HPE Support DocPortal and write bundles.json.
|
||||
|
||||
Mirrors hvm-docs/scrape/bundles.py — same portal, same API shape, same single-doc-blob
|
||||
treatment for Release Notes, but pointing at the Morpheus Enterprise docId range.
|
||||
|
||||
For each bundle this script:
|
||||
1. GETs /hpesc/public/api/document/{docId} → abstract HTML
|
||||
2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc)
|
||||
3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
|
||||
|
||||
QuickSpecs is a special case: lives at www.hpe.com (not support.hpe.com), gets the
|
||||
html-file mode and is scraped via curl_cffi (see scrape/quickspecs.py).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
API = "https://support.hpe.com/hpesc/public/api/document"
|
||||
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
|
||||
UA = "morpheus-docs-mcp/0.1 (+https://git.jpaul.io/justin/morpheus-docs; admin@jpaul.io)"
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
BUNDLES_JSON = ROOT / "bundles.json"
|
||||
|
||||
|
||||
@dataclass
class BundleSpec:
    """Declarative description of one documentation bundle to discover.

    The first six fields are passed positionally by the BUNDLES list, so
    their order is part of the constructor interface.
    """

    # Bundle slug, e.g. "morpheus_user_manual_8_1_2".
    slug: str
    # Upstream document id, e.g. "sd00007732en_us".
    doc_id: str
    # Declared title; discovery prefers the title parsed from the abstract.
    title: str
    # Product version the bundle documents, when it has one.
    version: str | None
    # Document family, e.g. "User Manual", "Release Notes", "QuickSpecs".
    product: str
    # Scrape mode: "toc", "single", or "html-file".
    mode: str
    platform: str | None = None
    language: str = "en-US"
    # Overrides the default support.hpe.com URL.
    source_url: str | None = None
|
||||
|
||||
|
||||
# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
|
||||
# range sd00006500..7740 for `Morpheus Enterprise` matches in the abstract.
|
||||
#
|
||||
# Notes:
|
||||
# - Morpheus Enterprise has User Manuals dating back to 8.0.10
|
||||
# (sd00006774en_us, Sep 2025) but we only ship the 8.1.x line for
|
||||
# now. Add the 8.0.x bundles here if you need older versions in the
|
||||
# corpus.
|
||||
# - No dedicated Deployment Guide or Qualification Matrix for Morpheus
|
||||
# Enterprise on HPE Support — the only QM (sd00006551en_us) covers
|
||||
# HVM clusters managed by Morpheus, which lives in hvm-docs.
|
||||
# - QuickSpecs lives on www.hpe.com (not support.hpe.com), uses the
|
||||
# html-file scrape mode with curl_cffi Chrome impersonation.
|
||||
BUNDLES: list[BundleSpec] = [
|
||||
BundleSpec("morpheus_user_manual_8_1_0", "sd00007510en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.0", "User Manual", "toc"),
|
||||
BundleSpec("morpheus_user_manual_8_1_1", "sd00007621en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.1", "User Manual", "toc"),
|
||||
BundleSpec("morpheus_user_manual_8_1_2", "sd00007732en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.2", "User Manual", "toc"),
|
||||
BundleSpec("morpheus_release_notes_8_1_0", "sd00007496en_us", "HPE Morpheus Enterprise Software Release Notes", "8.1.0", "Release Notes", "single"),
|
||||
BundleSpec("morpheus_release_notes_8_1_1", "sd00007610en_us", "HPE Morpheus Enterprise Software Release Notes", "8.1.1", "Release Notes", "single"),
|
||||
BundleSpec("morpheus_release_notes_8_1_2", "sd00007733en_us", "HPE Morpheus Enterprise Software Release Notes", "8.1.2", "Release Notes", "single"),
|
||||
BundleSpec("morpheus_quickspecs", "a50009231enw", "HPE Morpheus Enterprise Software QuickSpecs",
|
||||
"v1", "QuickSpecs", "html-file",
|
||||
source_url="https://www.hpe.com/psnow/doc/a50009231enw"),
|
||||
]
|
||||
|
||||
|
||||
def _session() -> requests.Session:
    """Create a requests session carrying the scraper's identifying headers."""
    session = requests.Session()
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
|
||||
|
||||
|
||||
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET `url`, retrying transient failures with exponential backoff.

    Args:
        s: Session to issue the request on.
        url: Absolute URL to fetch.
        expect_json: Decode the body as JSON instead of returning text.
        retries: Maximum number of attempts.

    Returns:
        The decoded JSON or the response text on HTTP 200; None on HTTP 404.

    Raises:
        requests.HTTPError: For non-retryable 4xx/5xx responses.
        RuntimeError: When every attempt hit a retryable failure, or the
            server answered with a status this function does not handle.
    """
    retryable = {429, 500, 502, 503, 504}
    delay = 1.0
    last_error: Exception | None = None
    for attempt in range(retries):
        if attempt:
            # Back off only between attempts, never after the last one.
            time.sleep(delay)
            delay *= 2
        try:
            r = s.get(url, timeout=30)
        except (requests.ConnectionError, requests.Timeout) as exc:
            # Network-level hiccups are as transient as a 503; retry them.
            last_error = exc
            continue
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in retryable:
            continue
        r.raise_for_status()
        # raise_for_status() is silent for non-error codes (e.g. 204); fail
        # loudly instead of re-requesting in a tight loop.
        raise RuntimeError(f"GET returned unexpected status {r.status_code}: {url}")
    raise RuntimeError(f"GET failed after {retries} retries: {url}") from last_error
|
||||
|
||||
|
||||
def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
|
||||
if not toc:
|
||||
return 0, None
|
||||
landing = None
|
||||
n = 0
|
||||
|
||||
def walk(nodes: list[dict] | None, depth: int) -> None:
|
||||
nonlocal n, landing
|
||||
for node in nodes or []:
|
||||
link = node.get("topicLink")
|
||||
if link:
|
||||
n += 1
|
||||
m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link)
|
||||
if m and landing is None:
|
||||
landing = m.group(1)
|
||||
walk(node.get("children"), depth + 1)
|
||||
|
||||
walk(toc, 0)
|
||||
return n, landing
|
||||
|
||||
|
||||
def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract and publish date from a document's abstract HTML.

    Returns a dict holding only the keys whose element was found:
    `title` (from `h1.title.topictitle1`), `abstract` (from `div.desc`) and
    `published` (from `div.publishedDate`, with the `Published:` label removed).
    """
    soup = BeautifulSoup(html, "html.parser")

    def text_of(selector: str) -> str | None:
        node = soup.select_one(selector)
        if not node:
            return None
        return node.get_text(" ", strip=True)

    out: dict[str, str] = {}

    title = text_of("h1.title.topictitle1")
    if title is not None:
        out["title"] = title

    abstract = text_of("div.desc")
    if abstract is not None:
        out["abstract"] = abstract

    published = text_of("div.publishedDate")
    if published is not None:
        out["published"] = published.replace("Published:", "").strip()

    return out
|
||||
|
||||
|
||||
def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json record for one declared bundle.

    `html-file` bundles are described purely from `spec`, with no network
    access. For the other modes the document abstract is fetched and parsed,
    and for `toc` bundles the TOC is fetched to count pages and pick a
    landing page. A `toc` bundle whose TOC is empty is recorded as `single`.

    `spec` is never modified: the effective mode is tracked locally, so the
    module-level BUNDLES entries stay as declared across repeated calls.

    Args:
        s: Session used for the support.hpe.com API calls.
        spec: The declared bundle.

    Returns:
        The bundle record, with the same key order for every mode.
    """
    # html-file bundles are static fixtures or live-fetched outside support.hpe.com.
    if spec.mode == "html-file":
        return {
            "slug": spec.slug,
            "doc_id": spec.doc_id,
            "title": spec.title,
            "version": spec.version,
            "platform": spec.platform,
            "product": spec.product,
            "language": spec.language,
            "page_count": 1,
            "mode": "html-file",
            "abstract": "",
            "dates": {},
            "landing_page": spec.doc_id,
            "source_url": spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        }

    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    if abstract_html is None:
        # _get returns None on 404; surface it instead of silently writing
        # a record with empty metadata.
        print(f" ! {spec.slug}: abstract not found for {spec.doc_id}", file=sys.stderr)
    meta = _parse_abstract(abstract_html or "")

    # Local copy of the mode so the fallback below does not leak into `spec`.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_pages, toc_landing = _count_toc(toc)
        if toc_pages == 0:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
        else:
            page_count, landing = toc_pages, toc_landing

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    }
|
||||
|
||||
|
||||
def main() -> int:
    """Discover every bundle in BUNDLES and write the records as JSON.

    The output path comes from `--out` and defaults to BUNDLES_JSON.
    Progress and the final summary go to stderr. Returns 0.
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    args = parser.parse_args()

    session = _session()
    records: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        records.append(discover_bundle(session, spec))

    serialized = json.dumps(records, indent=2) + "\n"
    Path(args.out).write_text(serialized)

    total_pages = sum(b['page_count'] for b in records)
    print(f"wrote {args.out}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,194 @@
|
||||
"""Scrape HPE QuickSpecs collateral pages into corpus markdown.
|
||||
|
||||
HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral.<doc_id>.html`
|
||||
with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the
|
||||
captured DOM). The blocker for automated scraping is `www.hpe.com`'s
|
||||
edge bot defense, which drops connections from non-browser TLS
|
||||
fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here
|
||||
by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint.
|
||||
|
||||
Content extraction uses these stable CSS selectors found in the page:
|
||||
|
||||
.lr-right-rail hpe-highlights-container .collateral-content
|
||||
— one per section ("Overview", "Standard Features", etc.)
|
||||
h3.txto-title — section title
|
||||
div.txto-description — section body
|
||||
uc-table.uc-table-polaris — SKU / version-history tables
|
||||
|
||||
A committed HTML fixture at `scrape/quickspecs/<doc_id>.html` is used
|
||||
as a fallback when the live fetch fails (HPE edge churn, network
|
||||
issues). Keeping a current fixture in the repo also makes diffing
|
||||
QuickSpecs revisions easy.
|
||||
|
||||
Usage (called by scrape.runner for bundles with mode="html-file"):
|
||||
|
||||
python -m scrape.quickspecs a50009231enw
|
||||
|
||||
Or programmatically:
|
||||
|
||||
from scrape.quickspecs import scrape_quickspecs
|
||||
scrape_quickspecs("a50009231enw", bundle_id="morpheus_quickspecs", title="...")
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from markdownify import markdownify as md
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
SOURCE_DIR = ROOT / "scrape" / "quickspecs"
|
||||
CORPUS_DIR = ROOT / "corpus"
|
||||
|
||||
COLLATERAL_URL = "https://www.hpe.com/us/en/collaterals/collateral.{doc_id}.html"
|
||||
|
||||
|
||||
def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None:
    """Fetch the collateral page live through curl_cffi.

    The request impersonates Chrome 120 (`impersonate="chrome120"`).
    Returns the response text on HTTP 200 with a non-empty body. Returns None,
    after logging a warning, when curl_cffi is missing, the status is not 200,
    the body is empty, or the request raises.
    """
    try:
        from curl_cffi import requests as cc
    except ImportError:
        log.warning("curl_cffi not installed; can't fetch QuickSpecs live")
        return None

    try:
        response = cc.get(COLLATERAL_URL.format(doc_id=doc_id),
                          impersonate="chrome120", timeout=timeout)
        body = response.text
        if response.status_code == 200 and body:
            return body
        log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, response.status_code, len(body or ""))
        return None
    except Exception as e:
        # Best-effort: the caller falls back to the committed fixture.
        log.warning("QuickSpecs %s live fetch failed: %s", doc_id, e)
        return None
|
||||
|
||||
|
||||
def fetch_fixture(doc_id: str) -> str | None:
    """Read the committed HTML fixture for `doc_id`, if there is one.

    The fixture is looked up at `SOURCE_DIR/<doc_id>.html` and decoded as
    UTF-8 explicitly, so the result does not depend on the platform's locale
    encoding. (Assumes fixtures are saved as UTF-8; confirm when adding one.)

    Returns:
        The fixture's text, or None when the file does not exist.
    """
    fixture = SOURCE_DIR / f"{doc_id}.html"
    try:
        return fixture.read_text(encoding="utf-8")
    except FileNotFoundError:
        return None
|
||||
|
||||
|
||||
def _extract_content_blocks(html: str) -> list[str]:
    """Return the HTML of each content section, in document order.

    Three strategies are tried in turn:

    1. Live format: every ``.collateral-content`` element found under
       ``.lr-right-rail``.
    2. Fixture format: the single ``.quickspecs-content`` wrapper.
    3. Last resort: the whole ``<body>`` (or the whole document when
       there is no body).

    A ``.lr-right-rail`` that contains no ``.collateral-content`` blocks
    no longer short-circuits with an empty list; the later strategies are
    tried instead, so the caller does not end up writing an empty document.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Live format: each <hpe-highlights-container> under .lr-right-rail has
    # one or more .collateral-content blocks.
    rail = soup.select_one(".lr-right-rail")
    if rail is not None:
        blocks = rail.select(".collateral-content")
        if blocks:
            return [str(b) for b in blocks]

    # Fixture format: a single wrapper holding all the H2/H3 sections.
    wrapper = soup.select_one(".quickspecs-content")
    if wrapper is not None:
        return [str(wrapper)]

    # Last resort: whole body.
    body = soup.body or soup
    return [str(body)]
|
||||
|
||||
|
||||
def parse_html(html: str) -> str:
    """Turn a QuickSpecs HTML page into markdown.

    Only the blocks returned by ``_extract_content_blocks`` are converted,
    which leaves out the surrounding page chrome. Inside each block the
    left-rail anchor placeholders and any share / recommendation / modal
    widgets, ``<style>`` and ``<script>`` elements are removed before
    markdownify runs. The result has trailing whitespace trimmed from each
    line, at most one blank line in a row, and a single trailing newline.
    """
    noise_selectors = (
        "esl-share", "hpe-recommendations", "hpe-sticky-bar",
        "esl-scrollbar", "esl-trigger", "video-overlay",
        "generic-modal-loader", "style", "script",
    )

    rendered: list[str] = []
    for fragment in _extract_content_blocks(html):
        tree = BeautifulSoup(fragment, "html.parser")
        # Anchor placeholders would otherwise become noisy markdown links.
        for anchor in tree.select('[hpe-left-rail-anchor]'):
            anchor.decompose()
        # Widgets that may have leaked into the content block.
        for selector in noise_selectors:
            for widget in tree.select(selector):
                widget.decompose()
        rendered.append(
            md(str(tree), heading_style="ATX", bullets="-", strip=["span", "div"])
        )

    # Trim each line, then keep at most one consecutive blank line.
    kept: list[str] = []
    for raw in "\n\n".join(rendered).splitlines():
        line = raw.rstrip()
        if line == "" and kept and kept[-1] == "":
            continue
        kept.append(line)
    return "\n".join(kept).strip() + "\n"
|
||||
|
||||
|
||||
def scrape_quickspecs(doc_id: str, bundle_id: str, title: str,
                      version: str | None = None,
                      product: str = "QuickSpecs",
                      source_url: str | None = None,
                      force: bool = False) -> bool:
    """Fetch one QuickSpecs document, convert it, and write the corpus files.

    The page is fetched live first; if that yields nothing, the committed
    fixture is used. Output goes to ``CORPUS_DIR/<bundle_id>/<doc_id>.md``
    plus a ``.json`` sidecar carrying the metadata and a ``fetched_from``
    marker (``"live"`` or ``"fixture"``).

    Args:
        doc_id: QuickSpecs document id; also used as the page id.
        bundle_id: corpus sub-directory to write into.
        title: title recorded in the sidecar.
        version: optional version recorded in the sidecar.
        product: product label recorded in the sidecar.
        source_url: URL recorded in the sidecar; defaults to the
            ``https://www.hpe.com/psnow/doc/<doc_id>`` link.
        force: rewrite even when both output files already exist.

    Returns:
        True if files were written. False if the document was skipped
        because it is already on disk, or if neither a live response nor
        a fixture was available.
    """
    bundle_dir = CORPUS_DIR / bundle_id
    md_path = bundle_dir / f"{doc_id}.md"
    json_path = bundle_dir / f"{doc_id}.json"
    if not force and md_path.exists() and json_path.exists():
        log.info("  %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id)
        return False

    html = fetch_live(doc_id)
    fetched_from = "live"
    if html is None:
        html = fetch_fixture(doc_id)
        fetched_from = "fixture"
    if html is None:
        log.error("QuickSpecs %s: no live response and no fixture at %s",
                  doc_id, SOURCE_DIR / f"{doc_id}.html")
        return False

    body_md = parse_html(html)
    bundle_dir.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly: the markdown can contain non-ASCII text and
    # the locale default encoding is not the same on every host.
    md_path.write_text(body_md, encoding="utf-8")
    sidecar = {
        "bundle_id": bundle_id,
        "page_id": doc_id,
        "title": title,
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": version,
        "product": product,
        "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}",
        "fetched_from": fetched_from,
    }
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    log.info("  %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from)
    return True
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: scrape a single QuickSpecs document.

    Returns 0 when files were written and 1 otherwise (skipped or failed).
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument("doc_id", help="QuickSpecs document id, e.g. a50004260enw")
    parser.add_argument("--bundle-id", default="hvm_quickspecs")
    parser.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs")
    parser.add_argument("--version", default=None)
    parser.add_argument("--force", action="store_true")
    opts = parser.parse_args()

    written = scrape_quickspecs(
        opts.doc_id,
        opts.bundle_id,
        opts.title,
        opts.version,
        force=opts.force,
    )
    if written:
        return 0
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
@@ -0,0 +1,27 @@
|
||||
# scrape/quickspecs/
|
||||
|
||||
Static HTML fixtures for HPE QuickSpecs documents that aren't reachable
|
||||
from the runner (www.hpe.com edge drops connections from datacenter IPs
|
||||
with non-browser User-Agents — verified 2026-05-22 with curl, wget, and
|
||||
Anthropic's WebFetch).
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Operator visits `https://www.hpe.com/psnow/doc/<doc_id>` in a real
|
||||
browser, opens DevTools → Elements → Copy the `<body>` HTML.
|
||||
2. Save it at `scrape/quickspecs/<doc_id>.html`.
|
||||
3. Add a bundle entry in `scrape/bundles.py` with `mode="html-file"`.
|
||||
4. `python -m scrape.runner --bundle hvm_quickspecs --force` tries a live
   fetch first and falls back to the committed HTML if that fails; either
   way it writes `corpus/hvm_quickspecs/<doc_id>.{md,json}`.
|
||||
5. Re-index and ship.
|
||||
|
||||
QuickSpecs only update every few months (HPE rebrand, new SKU added,
|
||||
feature change). When a new version drops, refresh the local HTML
|
||||
file and re-run the scrape.
|
||||
|
||||
## Current fixtures
|
||||
|
||||
- `a50004260enw.html` — HPE Morpheus VM Essentials Software QuickSpecs
|
||||
(Version 4, 02-February-2026). SKUs: S5Q81AAE (1-yr), S5Q82AAE
|
||||
(3-yr), S5Q83AAE (5-yr) — all "per Socket E-LTU" with Tech Care
|
||||
Essentials included.
|
||||
@@ -0,0 +1,339 @@
|
||||
"""Scrape HVM doc bundles into corpus/<slug>/<page_id>.{md,json}.
|
||||
|
||||
Reads bundles.json (produced by scrape.bundles), then for each bundle:
|
||||
- mode="toc": walks the TOC tree, fetches each page via the render
|
||||
endpoint, converts page_html to markdown, writes
|
||||
<page_id>.md + <page_id>.json sidecar.
|
||||
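  - mode="single": fetches /document/{docId} directly, treats the whole
                   body as one page with page_id = doc_id.
  - mode="html-file": hands the bundle to scrape.quickspecs.scrape_quickspecs
                   (live fetch, with a committed HTML fixture as fallback).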
|
||||
|
||||
After all bundles are on disk, runs a finalize pass that synthesizes
|
||||
topic_cluster.clustered_topics for each page by looking up the same
|
||||
GUID in sibling bundles (HPE GUIDs are stable across versions — see
|
||||
reference_hpe_docs_portal_api.md).
|
||||
|
||||
Usage:
|
||||
python -m scrape.runner --all
|
||||
python -m scrape.runner --bundle hvm_user_manual_8_1_2
|
||||
python -m scrape.runner --all --force # re-download already-on-disk pages
|
||||
python -m scrape.runner --finalize-only # only redo the topic_cluster pass
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from markdownify import markdownify as md
|
||||
|
||||
API = "https://support.hpe.com/hpesc/public/api/document"
|
||||
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html"
|
||||
DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
|
||||
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
CORPUS = ROOT / "corpus"
|
||||
BUNDLES_JSON = ROOT / "bundles.json"
|
||||
|
||||
GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")
|
||||
|
||||
|
||||
@dataclass
class TocEntry:
    """One page reference extracted from a bundle's table of contents."""

    # GUID captured from the node's topicLink (the ``page=GUID-...`` part).
    page_id: str
    # The node's topicName, or "" when it has none.
    title: str
    # 1-based position of the page in document (pre-order) order.
    ordinal: int
    # Title of the nearest titled ancestor node; None at the top level.
    parent_title: str | None
|
||||
|
||||
|
||||
def _session() -> requests.Session:
    """Create a requests session carrying this scraper's default headers."""
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": UA,
            "Accept": "application/json, text/html",
        }
    )
    return session
|
||||
|
||||
|
||||
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
|
||||
delay = 1.0
|
||||
for attempt in range(retries):
|
||||
r = s.get(url, timeout=30)
|
||||
if r.status_code == 200:
|
||||
return r.json() if expect_json else r.text
|
||||
if r.status_code == 404:
|
||||
return None
|
||||
if r.status_code in (429, 500, 502, 503, 504):
|
||||
time.sleep(delay)
|
||||
delay *= 2
|
||||
continue
|
||||
r.raise_for_status()
|
||||
raise RuntimeError(f"GET failed after {retries} retries: {url}")
|
||||
|
||||
|
||||
def _flatten_toc(toc: list[dict]) -> list[TocEntry]:
    """Flatten the nested TOC tree into a pre-order list of ``TocEntry``.

    A node yields an entry only when its ``topicLink`` matches ``GUID_RE``.
    Ordinals count emitted entries starting at 1. Children inherit their
    parent's ``topicName`` as ``parent_title``; when a node has no name,
    the title it inherited itself is passed on instead.
    """
    flat: list[TocEntry] = []
    # Explicit stack instead of recursion. Siblings are pushed in reverse so
    # that popping visits them in their original document order.
    pending: list[tuple[dict, str | None]] = [(node, None) for node in reversed(toc or [])]
    while pending:
        node, inherited = pending.pop()
        name = node.get("topicName") or ""
        hit = GUID_RE.search(node.get("topicLink") or "")
        if hit:
            flat.append(
                TocEntry(
                    page_id=hit.group(1),
                    title=name,
                    ordinal=len(flat) + 1,
                    parent_title=inherited,
                )
            )
        kids = node.get("children") or []
        pending.extend((kid, name or inherited) for kid in reversed(kids))
    return flat
|
||||
|
||||
|
||||
def _strip_dita_wrappers(html: str) -> str:
    """Drop repeated boilerplate sections and return the ``<main>`` element's HTML.

    Sections headed "Notices", "Acknowledgments" or "Abstract" are removed
    so that identical text does not end up in every page and pollute
    retrieval. Two layouts are handled:

    * wrapped: an ``<article>``/``<section>``/``<div>`` whose direct-child
      ``<h1>``/``<h2>`` is a boilerplate heading; the whole container goes.
    * unwrapped: a bare boilerplate ``<h1>``/``<h2>``; the heading goes, and
      so does the next sibling when it is a ``<div>``/``<section>`` whose
      class string contains "desc", "body" or "notices".

    If the document has no ``<main>`` element the whole cleaned document is
    returned instead.
    """
    boilerplate = {"Notices", "Acknowledgments", "Abstract"}
    soup = BeautifulSoup(html, "html.parser")

    # Wrapped form: the container's first direct heading names the section.
    for container in soup.select("article, section, div"):
        heading = container.find(["h1", "h2"], recursive=False)
        if heading and heading.get_text(strip=True) in boilerplate:
            container.decompose()

    # Unwrapped form: a bare heading followed by its body sibling.
    for heading in soup.find_all(["h1", "h2"]):
        if heading.get_text(strip=True) not in boilerplate:
            continue
        follower = heading.find_next_sibling()
        if follower and (follower.name in {"div", "section"}):
            classes = " ".join(follower.get("class", []) or [])
            if "desc" in classes or "body" in classes or "notices" in classes:
                follower.decompose()
        heading.decompose()

    main = soup.find("main")
    if main:
        return str(main)
    return str(soup)
|
||||
|
||||
|
||||
def html_to_md(page_html: str) -> str:
    """Convert one page of portal HTML to markdown.

    Boilerplate is stripped first, then markdownify runs with ATX headings
    and ``-`` bullets. Runs of three or more newlines collapse to two, and
    the result ends with exactly one newline.
    """
    markdown = md(_strip_dita_wrappers(page_html), heading_style="ATX", bullets="-")
    collapsed = re.sub(r"\n{3,}", "\n\n", markdown)
    return collapsed.strip() + "\n"
|
||||
|
||||
|
||||
def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str:
    """Fetch one TOC page through the render endpoint.

    Returns the payload's ``page_html`` string, or "" when the request
    produced no payload or the field is missing or empty.
    """
    url = f"{API}/{doc_id}/render?page={page_id}.html"
    payload = _get(s, url, expect_json=True)
    if payload:
        return payload.get("page_html") or ""
    return ""
|
||||
|
||||
|
||||
def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]:
    """Fetch a single-document bundle and return ``(page_html, title)``.

    The title is the text of the ``h1.title.topictitle1`` element, falling
    back to ``doc_id`` when that heading is absent. An empty or missing
    response yields ``("", "")``.
    """
    html = _get(s, f"{API}/{doc_id}")
    if not html:
        return "", ""
    heading = BeautifulSoup(html, "html.parser").select_one("h1.title.topictitle1")
    if heading:
        return html, heading.get_text(" ", strip=True)
    return html, doc_id
|
||||
|
||||
|
||||
def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool:
    """Write ``<page_id>.md`` and its ``<page_id>.json`` sidecar into ``bundle_dir``.

    The directory is created if needed. Unless ``force`` is set, nothing is
    written when both files already exist.

    Returns:
        True if the files were written, False if the page was skipped.
    """
    bundle_dir.mkdir(parents=True, exist_ok=True)
    md_path = bundle_dir / f"{page_id}.md"
    json_path = bundle_dir / f"{page_id}.json"
    if not force and md_path.exists() and json_path.exists():
        return False
    # Write UTF-8 explicitly: page text can contain non-ASCII characters and
    # the locale default encoding is not the same on every host.
    md_path.write_text(body_md, encoding="utf-8")
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    return True
|
||||
|
||||
|
||||
def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int:
    """Scrape every page listed in a TOC-style bundle.

    Fetches the bundle's TOC, flattens it, then fetches and converts the
    pages on a thread pool of ``concurrency`` workers. Each page is written
    to ``CORPUS/<slug>/`` as markdown plus a JSON sidecar.

    Unless ``force`` is set, pages whose ``.md`` and ``.json`` already exist
    are skipped before any network request is made for them.

    Returns:
        The number of pages written.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    bundle_dir = CORPUS / slug

    toc = _get(s, f"{API}/{doc_id}/toc", expect_json=True) or []
    entries = _flatten_toc(toc)
    print(f"  {slug}: {len(entries)} pages", file=sys.stderr)

    def do_one(entry: TocEntry) -> bool:
        # Check the disk first: without --force an existing page must not be
        # downloaded again only to have the write discarded afterwards.
        if not force:
            md_path = bundle_dir / f"{entry.page_id}.md"
            json_path = bundle_dir / f"{entry.page_id}.json"
            if md_path.exists() and json_path.exists():
                return False
        page_html = fetch_toc_page(s, doc_id, entry.page_id)
        if not page_html:
            return False
        body_md = html_to_md(page_html)
        sidecar = {
            "bundle_id": slug,
            "page_id": entry.page_id,
            "title": entry.title,
            "ordinal": entry.ordinal,
            "parent_title": entry.parent_title,
            "doc_id": doc_id,
            "version": bundle.get("version"),
            "product": bundle.get("product"),
            "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id),
            # topic_cluster filled in by finalize()
        }
        return write_page(bundle_dir, entry.page_id, body_md, sidecar, force)

    written = 0
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures = [pool.submit(do_one, e) for e in entries]
        for fut in as_completed(futures):
            # result() re-raises any exception from the worker.
            if fut.result():
                written += 1
    return written
|
||||
|
||||
|
||||
def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int:
    """Scrape a single-document bundle as one page whose page_id is the doc_id.

    Unless ``force`` is set, a document whose ``.md`` and ``.json`` already
    exist is skipped before any network request is made.

    Returns:
        1 if the page was written; 0 if it was skipped or the body was empty.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    bundle_dir = CORPUS / slug

    # Check the disk first: without --force an existing document must not be
    # downloaded again only to have the write discarded afterwards.
    if (not force
            and (bundle_dir / f"{doc_id}.md").exists()
            and (bundle_dir / f"{doc_id}.json").exists()):
        print(f"  {slug}: already on disk (use --force to refresh)", file=sys.stderr)
        return 0

    html, title = fetch_single_doc(s, doc_id)
    if not html:
        print(f"  ! {slug}: empty body", file=sys.stderr)
        return 0
    body_md = html_to_md(html)
    sidecar = {
        "bundle_id": slug,
        "page_id": doc_id,
        "title": title or bundle["title"],
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": bundle.get("version"),
        "product": bundle.get("product"),
        "source_url": DOC_URL_SINGLE.format(doc_id=doc_id),
    }
    print(f"  {slug}: 1 page (single-doc)", file=sys.stderr)
    return 1 if write_page(bundle_dir, doc_id, body_md, sidecar, force) else 0
|
||||
|
||||
|
||||
def finalize_clusters(bundles: list[dict]) -> int:
    """Cross-link sibling pages across bundles by writing ``topic_cluster``.

    Every sidecar under ``CORPUS/<slug>/`` is read for each listed bundle.
    Pages whose ``page_id`` starts with ``GUID-`` are grouped by that GUID;
    all other pages (single-document bundles) are grouped by product. In
    any group with at least two pages, each page's sidecar is rewritten
    with a ``topic_cluster`` that lists the peers from the other bundles.

    The product comes from the bundle entry, falling back to the sidecar's
    own ``product`` field. Pages with no product in either place are left
    unclustered rather than raising ``KeyError``.

    Returns:
        The number of sidecars rewritten.
    """
    # GUID → list[(slug, sidecar_path, sidecar_dict)]
    guid_to_pages: dict[str, list[tuple[str, Path, dict]]] = {}
    # product → list[(slug, sidecar_path, sidecar_dict)] for single-doc peering
    product_to_pages: dict[str, list[tuple[str, Path, dict]]] = {}

    for b in bundles:
        slug = b["slug"]
        bundle_dir = CORPUS / slug
        if not bundle_dir.exists():
            continue
        for jp in bundle_dir.glob("*.json"):
            data = json.loads(jp.read_text())
            pid = data["page_id"]
            if pid.startswith("GUID-"):
                guid_to_pages.setdefault(pid, []).append((slug, jp, data))
                continue
            product = b.get("product") or data.get("product")
            if product is None:
                # Nothing to peer on; skip instead of crashing the whole pass.
                continue
            product_to_pages.setdefault(product, []).append((slug, jp, data))

    updated = 0
    # GUID groups first, then product groups (e.g. Release Notes 8.1.0/.1/.2).
    # Within a GUID group every peer's page_id is that GUID, so one loop
    # body serves both kinds of group.
    for peers in (*guid_to_pages.values(), *product_to_pages.values()):
        if len(peers) < 2:
            continue
        for slug, jp, data in peers:
            others = [
                {"bundle_id": s2, "page_id": d2["page_id"], "clustering_title": d2.get("title", "")}
                for s2, _, d2 in peers if s2 != slug
            ]
            data["topic_cluster"] = {"clustering_title": data.get("title", ""), "clustered_topics": others}
            jp.write_text(json.dumps(data, indent=2) + "\n")
            updated += 1

    return updated
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point for the bundle scraper.

    Returns 0 on success and 2 on a usage error: bundles.json missing, no
    bundle matching ``--bundle``, or neither ``--all`` nor ``--bundle`` given.
    """
    parser = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.")
    parser.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json")
    parser.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)")
    parser.add_argument("--force", action="store_true", help="re-fetch pages already on disk")
    parser.add_argument("--concurrency", type=int, default=6)
    parser.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields")
    opts = parser.parse_args()

    if not BUNDLES_JSON.exists():
        print("bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr)
        return 2

    catalog = json.loads(BUNDLES_JSON.read_text())

    if opts.finalize_only:
        count = finalize_clusters(catalog)
        print(f"finalize: updated topic_cluster on {count} sidecars", file=sys.stderr)
        return 0

    if opts.bundle:
        selected = [b for b in catalog if b["slug"] in opts.bundle]
        if not selected:
            print(f"no bundles matched: {opts.bundle}", file=sys.stderr)
            return 2
    elif opts.all:
        selected = catalog
    else:
        print("specify --all or --bundle <slug>", file=sys.stderr)
        return 2

    session = _session()

    def scrape_one(b: dict) -> int:
        """Dispatch one bundle on its mode; return the pages written."""
        mode = b.get("mode")
        if mode == "single":
            return scrape_single_bundle(session, b, opts.force)
        if mode == "html-file":
            # Live-scrape HPE collateral (QuickSpecs) via curl_cffi; falls back
            # to scrape/quickspecs/<doc_id>.html fixture if the edge blocks us.
            from scrape.quickspecs import scrape_quickspecs
            ok = scrape_quickspecs(
                doc_id=b["doc_id"],
                bundle_id=b["slug"],
                title=b.get("title", b["doc_id"]),
                version=b.get("version"),
                product=b.get("product", "QuickSpecs"),
                source_url=b.get("source_url"),
                force=opts.force,
            )
            return 1 if ok else 0
        # Any other mode is treated as a TOC bundle.
        return scrape_toc_bundle(session, b, opts.force, opts.concurrency)

    total = 0
    for b in selected:
        total += scrape_one(b)
    print(f"scraped {total} new/updated pages", file=sys.stderr)

    # Always finalize after a scrape so sidecars are consistent. The full
    # catalog is re-read because `selected` may be a filtered subset.
    everything = json.loads(BUNDLES_JSON.read_text())
    count = finalize_clusters(everything)
    print(f"finalize: updated topic_cluster on {count} sidecars", file=sys.stderr)

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
+70
-41
@@ -1,42 +1,58 @@
|
||||
"""Gitea container-registry garbage collection.
|
||||
|
||||
Lists package versions for one container package and deletes versions
|
||||
older than --keep-days. Always preserves:
|
||||
Lists tagged versions of one container package and deletes old ones.
|
||||
Always preserves:
|
||||
|
||||
- the :latest tag
|
||||
- the --keep-latest most-recent date-tagged versions
|
||||
- anything pushed in the last --keep-days days
|
||||
- the `latest` tag (Watchtower's auto-deploy target)
|
||||
- the `--keep-latest` most-recent date-tagged versions (YYYY.MM.DD)
|
||||
- the `--keep-latest` most-recent short-SHA tags (rollback pins)
|
||||
- anything pushed within `--keep-days` days
|
||||
|
||||
The actual disk reclaim happens on Gitea's next package GC cron (admin
|
||||
site settings). This script just marks the versions for deletion.
|
||||
OCI blob-level versions (`sha256:...`) are never touched directly — those
|
||||
are managed by Gitea's internal package GC cron when their last tag
|
||||
goes away.
|
||||
|
||||
Usage:
|
||||
|
||||
python scripts/registry_gc.py \\
|
||||
--owner <user> \\
|
||||
--package <product>-docs-mcp \\
|
||||
GITEA_TOKEN=... python scripts/registry_gc.py \\
|
||||
--owner justin \\
|
||||
--package hvm-docs \\
|
||||
--keep-days 90 \\
|
||||
--keep-latest 5
|
||||
|
||||
Auth: reads GITEA_TOKEN from env (set in the workflow as a secret).
|
||||
The Gitea endpoint shape (confirmed 2026-05-22 against git.jpaul.io):
|
||||
|
||||
GET /api/v1/packages/{owner}/container/{package}
|
||||
-> [{id, version, created_at, ...}, ...]
|
||||
DELETE /api/v1/packages/{owner}/container/{package}/{version}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import HTTPError
|
||||
import json
|
||||
|
||||
from urllib.parse import quote
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
GITEA_HOST = os.environ.get("GITEA_HOST", "https://git.jpaul.io")
|
||||
DATE_TAG = re.compile(r"^\d{4}\.\d{2}\.\d{2}$")
|
||||
SHA_TAG = re.compile(r"^[0-9a-f]{7,40}$") # short or full git SHA
|
||||
BLOB_VER = re.compile(r"^sha256:") # OCI blob versions — skip
|
||||
|
||||
|
||||
def api(token: str, method: str, path: str) -> object:
|
||||
# Explicit User-Agent: git.jpaul.io is behind Cloudflare, whose default
|
||||
# Bot Fight Mode 403s `Python-urllib/X.Y` with error 1010. Any
|
||||
# recognizable browser/curl-style UA passes.
|
||||
req = Request(f"{GITEA_HOST}{path}",
|
||||
headers={"Authorization": f"token {token}"},
|
||||
headers={
|
||||
"Authorization": f"token {token}",
|
||||
"User-Agent": "hvm-docs-registry-gc/1.0",
|
||||
},
|
||||
method=method)
|
||||
try:
|
||||
with urlopen(req, timeout=30) as r:
|
||||
@@ -63,44 +79,57 @@ def main() -> int:
|
||||
return 1
|
||||
|
||||
versions = api(token, "GET",
|
||||
f"/api/v1/packages/{args.owner}/container/{args.package}/versions") or []
|
||||
f"/api/v1/packages/{args.owner}/container/{args.package}") or []
|
||||
if not versions:
|
||||
print(f"no versions found for {args.owner}/{args.package}")
|
||||
print(f"no versions found for {args.owner}/container/{args.package}")
|
||||
return 0
|
||||
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(days=args.keep_days)
|
||||
print(f" {len(versions)} version(s); cutoff={cutoff.isoformat()} "
|
||||
f"keep_days={args.keep_days} keep_latest={args.keep_latest}")
|
||||
|
||||
# Date-tagged versions (YYYY.MM.DD), newest first
|
||||
date_tagged = []
|
||||
for v in versions:
|
||||
tags = v.get("tags") or []
|
||||
for t in tags:
|
||||
if len(t) == 10 and t[4] == "." and t[7] == ".":
|
||||
date_tagged.append((t, v))
|
||||
break
|
||||
date_tagged.sort(key=lambda kv: kv[0], reverse=True)
|
||||
keep_date_tags = {t for t, _ in date_tagged[:args.keep_latest]}
|
||||
|
||||
deleted = 0
|
||||
for v in versions:
|
||||
tags = v.get("tags") or []
|
||||
if "latest" in tags:
|
||||
continue
|
||||
if any(t in keep_date_tags for t in tags):
|
||||
continue
|
||||
# Sort newest first by created_at.
|
||||
def parsed_ts(v: dict) -> datetime:
|
||||
try:
|
||||
created = datetime.fromisoformat(v["created_at"].replace("Z", "+00:00"))
|
||||
return datetime.fromisoformat(v["created_at"].replace("Z", "+00:00"))
|
||||
except (KeyError, ValueError):
|
||||
return datetime.min.replace(tzinfo=timezone.utc)
|
||||
|
||||
versions.sort(key=parsed_ts, reverse=True)
|
||||
|
||||
# Compute the keep-set: top-N date tags + top-N sha tags + always latest.
|
||||
keep_dates: list[str] = []
|
||||
keep_shas: list[str] = []
|
||||
for v in versions:
|
||||
ver = v.get("version") or ""
|
||||
if DATE_TAG.match(ver) and len(keep_dates) < args.keep_latest:
|
||||
keep_dates.append(ver)
|
||||
elif SHA_TAG.match(ver) and len(keep_shas) < args.keep_latest:
|
||||
keep_shas.append(ver)
|
||||
keep = {"latest", *keep_dates, *keep_shas}
|
||||
print(f" keep tags: {sorted(keep)}")
|
||||
|
||||
deleted = skipped_blob = skipped_age = skipped_keep = 0
|
||||
for v in versions:
|
||||
ver = v.get("version") or ""
|
||||
ts = parsed_ts(v)
|
||||
if BLOB_VER.match(ver):
|
||||
skipped_blob += 1
|
||||
continue
|
||||
if created >= cutoff:
|
||||
if ver in keep:
|
||||
skipped_keep += 1
|
||||
continue
|
||||
version_id = v.get("id")
|
||||
print(f" deleting v{version_id} tags={tags} created={v['created_at']}")
|
||||
if ts >= cutoff:
|
||||
skipped_age += 1
|
||||
continue
|
||||
print(f" deleting {ver!r} id={v.get('id')} created={v.get('created_at')}")
|
||||
if not args.dry_run:
|
||||
api(token, "DELETE",
|
||||
f"/api/v1/packages/{args.owner}/container/{args.package}/versions/{version_id}")
|
||||
f"/api/v1/packages/{args.owner}/container/{args.package}/{quote(ver, safe='')}")
|
||||
deleted += 1
|
||||
print(f"done: {deleted} version(s) deleted")
|
||||
|
||||
print(f"done: deleted={deleted} kept_named={skipped_keep} "
|
||||
f"kept_recent={skipped_age} skipped_blobs={skipped_blob}")
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
"""Minimal HTTP reranker — `/v1/rerank` endpoint over a sentence-transformers CrossEncoder.
|
||||
|
||||
Matches the Cohere `/v1/rerank` request/response shape, which is what the
|
||||
server's `_rerank()` helper expects. This is the dev-friendly fallback;
|
||||
production replaces this with the llama.cpp + jina-reranker-v2-base GGUF
|
||||
sidecar (see deploy/docker-compose.yml) without changing the client.
|
||||
|
||||
Request:
|
||||
POST /v1/rerank
|
||||
{"model": "...", "query": "...", "documents": ["text", ...], "top_n": 10}
|
||||
|
||||
Response:
|
||||
{"model": "...", "results": [{"index": 0, "relevance_score": 0.93}, ...]}
|
||||
|
||||
Usage:
|
||||
python -m scripts.rerank_server # localhost:8001
|
||||
RERANK_MODEL=cross-encoder/ms-marco-MiniLM-L-12-v2 \\
|
||||
RERANK_PORT=8001 python -m scripts.rerank_server
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
|
||||
|
||||
MODEL_NAME = os.environ.get("RERANK_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
|
||||
PORT = int(os.environ.get("RERANK_PORT", "8001"))
|
||||
HOST = os.environ.get("RERANK_HOST", "127.0.0.1")
|
||||
# Truncate docs to this many chars before scoring. jina-reranker GGUF has a
|
||||
# 1024-token per-pair cap that 400s the entire batch; ms-marco is more
|
||||
# forgiving but we still cap to keep latency predictable.
|
||||
MAX_DOC_CHARS = int(os.environ.get("RERANK_MAX_DOC_CHARS", "2000"))
|
||||
|
||||
_model = None
|
||||
|
||||
|
||||
def _get_model():
    """Return the shared CrossEncoder, loading it on first use.

    The loaded model is cached in the module-level ``_model``, so later
    calls return the same object without reloading.
    """
    global _model
    if _model is not None:
        return _model
    # Imported lazily so importing this module does not pull in
    # sentence_transformers until a model is actually needed.
    from sentence_transformers import CrossEncoder
    log.info("loading %s", MODEL_NAME)
    _model = CrossEncoder(MODEL_NAME)
    log.info("loaded")
    return _model
|
||||
|
||||
|
||||
def _rerank(query: str, documents: list[str], top_n: int | None) -> list[dict]:
|
||||
model = _get_model()
|
||||
pairs = [[query, (d or "")[:MAX_DOC_CHARS]] for d in documents]
|
||||
scores = model.predict(pairs)
|
||||
ranked = sorted(
|
||||
({"index": i, "relevance_score": float(s)} for i, s in enumerate(scores)),
|
||||
key=lambda r: -r["relevance_score"],
|
||||
)
|
||||
if top_n is not None:
|
||||
ranked = ranked[:top_n]
|
||||
return ranked
|
||||
|
||||
|
||||
class Handler(BaseHTTPRequestHandler):
    """HTTP handler exposing a health check and the rerank endpoint.

    * ``GET /`` or ``GET /health`` returns ``{"status": "ok", "model": ...}``.
    * ``POST /v1/rerank`` or ``POST /rerank`` takes
      ``{"query": str, "documents": list, "top_n": int?}`` and returns
      ``{"model": ..., "results": [...]}``.

    Malformed requests get a JSON 400 response, unknown paths a JSON 404,
    and scoring failures a JSON 500.
    """

    def log_message(self, fmt, *args):
        """Route the base class's access log through the module logger."""
        log.info("%s - %s", self.address_string(), fmt % args)

    def _send_json(self, status: int, payload: dict) -> None:
        """Send ``payload`` as a JSON body with the given HTTP status."""
        body = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):  # noqa: N802
        """Answer the health check; everything else is a 404."""
        if self.path in ("/", "/health"):
            self._send_json(200, {"status": "ok", "model": MODEL_NAME})
            return
        self._send_json(404, {"error": "not found"})

    def do_POST(self):  # noqa: N802
        """Validate a rerank request and respond with the ranked results."""
        if self.path not in ("/v1/rerank", "/rerank"):
            self._send_json(404, {"error": "not found"})
            return

        # A non-numeric Content-Length would raise out of the handler, and a
        # negative one would make rfile.read() block until the client hangs
        # up; reject both with a 400.
        try:
            length = int(self.headers.get("Content-Length", "0"))
        except ValueError:
            self._send_json(400, {"error": "bad Content-Length"})
            return
        if length < 0:
            self._send_json(400, {"error": "bad Content-Length"})
            return

        try:
            req = json.loads(self.rfile.read(length).decode())
        except Exception as e:
            self._send_json(400, {"error": f"bad json: {e}"})
            return

        # Valid JSON is not necessarily an object (it may be a list, a
        # number, null), so check before calling .get() on it.
        if not isinstance(req, dict):
            self._send_json(400, {"error": "expected {query: str, documents: list[str]}"})
            return
        query = req.get("query")
        documents = req.get("documents")
        if not isinstance(query, str) or not isinstance(documents, list):
            self._send_json(400, {"error": "expected {query: str, documents: list[str]}"})
            return

        top_n = req.get("top_n")
        try:
            results = _rerank(query, documents, top_n if isinstance(top_n, int) else None)
        except Exception as e:
            log.exception("rerank failed")
            self._send_json(500, {"error": str(e)})
            return
        self._send_json(200, {"model": MODEL_NAME, "results": results})
|
||||
|
||||
|
||||
def main() -> int:
    """Load the model, then serve rerank requests until interrupted.

    The server is used as a context manager so the listening socket is
    closed on the way out, including after Ctrl-C.

    Returns:
        0 after a clean shutdown.
    """
    _get_model()  # warm-load before accepting traffic
    with ThreadingHTTPServer((HOST, PORT), Handler) as server:
        log.info("listening on http://%s:%d", HOST, PORT)
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            log.info("shutting down")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user