Files
morpheus-docs/scrape/bundles.py
T
justin fa448f94e1 build out morpheus-docs MCP stack, mirroring hvm-docs through Phases 1-13
Initial scaffold: the docs-mcp-template clone with all the
HVM-validated stack ported across, customized for Morpheus
Enterprise (PRODUCT_NAME=morpheus, server name morpheus-docs).

Bundles (live-discovered 2026-05-22; 1710 cataloged pages total):
* morpheus_user_manual_8_1_0  sd00007510en_us  568 pages (Feb 2026)
* morpheus_user_manual_8_1_1  sd00007621en_us  569 pages (Mar 2026)
* morpheus_user_manual_8_1_2  sd00007732en_us  569 pages (Apr 2026)
* morpheus_release_notes_8_1_0  sd00007496en_us  single-doc
* morpheus_release_notes_8_1_1  sd00007610en_us  single-doc
* morpheus_release_notes_8_1_2  sd00007733en_us  single-doc
* morpheus_quickspecs            a50009231enw     html-file (live
  curl_cffi against www.hpe.com; all 12+ Enterprise SKUs captured —
  S6E64..S6E73AAE for new/renewal/upgrade × 1/3/5-yr terms, plus
  services SKUs HA124A1#V38/V39 and H46SBA1).

No Deployment Guide or Qualification Matrix on HPE Support for
Morpheus Enterprise specifically — the only QM (sd00006551en_us)
covers HVM clusters managed by Morpheus and lives in hvm-docs.

Stack carried forward from hvm-docs:
* rag/{index,chunk,embeddings,bm25}.py — including the
  MAX_CHARS=4000 chunk-cap fix for table-dense content
* docs_mcp/{server,usage}.py — 11 MCP tools, BM25-default search,
  cross-encoder rerank, hybrid behind HYBRID_SEARCH=true,
  morpheus_api_lessons (renamed from hvm_api_lessons), env-gated
  submit_doc_bug
* docs_mcp/api_lessons.md — Morpheus-specific scaffold covering
  licensing model, HVM elevation path, REST vs Plugin API, with
  TODO markers for sections to flesh out from real ops experience
* scrape/{runner,quickspecs,changelog,bundles}.py — TOC + single-doc
  + html-file modes, curl_cffi Chrome120 for www.hpe.com edge bypass
* eval/{retrievers,run_eval}.py + queries.jsonl scaffold (4 placeholder
  queries; populate after first scrape)
* scripts/{rerank_server,usage_report,registry_gc}.py
* .gitea/workflows/{refresh,image-only}.yml — same Gitea Actions
  setup zerto-docs uses (push LAN, pull public-URL, GPU Ollama pool)
* deploy/docker-compose.yml — morpheus-docs-mcp service definition,
  shared jina-rerank sidecar, Watchtower-labeled
* Dockerfile, requirements.txt, requirements-rerank.txt

Verified locally: scrape produced 1599 .md pages (some TOC entries
are parent-only and yield no body), 6353 chunks all under the 4 KB
cap, MCP server boots and lists 11 tools cleanly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 15:26:24 -04:00

201 lines
7.6 KiB
Python

"""Discover Morpheus Enterprise doc bundles on HPE Support DocPortal and write bundles.json.
Mirrors hvm-docs/scrape/bundles.py — same portal, same API shape, same single-doc-blob
treatment for Release Notes, but pointing at the Morpheus Enterprise docId range.
For each bundle this script:
1. GETs /hpesc/public/api/document/{docId} → abstract HTML
2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc)
3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
QuickSpecs is a special case: lives at www.hpe.com (not support.hpe.com), gets the
html-file mode and is scraped via curl_cffi (see scrape/quickspecs.py).
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
# Base URL of the DocPortal document API; "/{docId}" and "/{docId}/toc"
# are appended to it when a bundle is discovered.
API = "https://support.hpe.com/hpesc/public/api/document"
# Template for the default human-facing URL recorded as a bundle's
# "source_url" when the spec does not supply its own.
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent header value set on the shared requests session.
UA = "morpheus-docs-mcp/0.1 (+https://git.jpaul.io/justin/morpheus-docs; admin@jpaul.io)"
# Two directory levels above this file (the module docstring calls this the
# repo root).
ROOT = Path(__file__).resolve().parent.parent
# Default output path for the generated catalog (overridable with --out).
BUNDLES_JSON = ROOT / "bundles.json"
@dataclass
class BundleSpec:
    """Hand-maintained declaration of one documentation bundle to catalog.

    Positional field order is part of the constructor interface; do not
    reorder the fields.
    """

    # Identifier copied verbatim to the output record's "slug" key.
    slug: str
    # Document id interpolated into the API and display URLs.
    doc_id: str
    # Fallback title, used when the fetched abstract yields no title.
    title: str
    # Version label copied to the output record; may be None.
    version: str | None
    product: str  # e.g. "User Manual", "Release Notes", "QuickSpecs"
    mode: str  # "toc", "single", or "html-file"
    # Copied to the output record's "platform" key as-is.
    platform: str | None = None
    # Copied to the output record's "language" key as-is.
    language: str = "en-US"
    source_url: str | None = None  # overrides the default support.hpe.com URL
# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
# range sd00006500..7740 for `Morpheus Enterprise` matches in the abstract.
#
# Notes:
# - Morpheus Enterprise has User Manuals dating back to 8.0.10
# (sd00006774en_us, Sep 2025) but we only ship the 8.1.x line for
# now. Add the 8.0.x bundles here if you need older versions in the
# corpus.
# - No dedicated Deployment Guide or Qualification Matrix for Morpheus
# Enterprise on HPE Support — the only QM (sd00006551en_us) covers
# HVM clusters managed by Morpheus, which lives in hvm-docs.
# - QuickSpecs lives on www.hpe.com (not support.hpe.com), uses the
# html-file scrape mode with curl_cffi Chrome impersonation.
BUNDLES: list[BundleSpec] = [
    # User Manuals (multi-page, walked via the TOC endpoint).
    BundleSpec(
        slug="morpheus_user_manual_8_1_0",
        doc_id="sd00007510en_us",
        title="HPE Morpheus Enterprise Software Documentation",
        version="8.1.0",
        product="User Manual",
        mode="toc",
    ),
    BundleSpec(
        slug="morpheus_user_manual_8_1_1",
        doc_id="sd00007621en_us",
        title="HPE Morpheus Enterprise Software Documentation",
        version="8.1.1",
        product="User Manual",
        mode="toc",
    ),
    BundleSpec(
        slug="morpheus_user_manual_8_1_2",
        doc_id="sd00007732en_us",
        title="HPE Morpheus Enterprise Software Documentation",
        version="8.1.2",
        product="User Manual",
        mode="toc",
    ),
    # Release Notes (each one is a single document, no TOC).
    BundleSpec(
        slug="morpheus_release_notes_8_1_0",
        doc_id="sd00007496en_us",
        title="HPE Morpheus Enterprise Software Release Notes",
        version="8.1.0",
        product="Release Notes",
        mode="single",
    ),
    BundleSpec(
        slug="morpheus_release_notes_8_1_1",
        doc_id="sd00007610en_us",
        title="HPE Morpheus Enterprise Software Release Notes",
        version="8.1.1",
        product="Release Notes",
        mode="single",
    ),
    BundleSpec(
        slug="morpheus_release_notes_8_1_2",
        doc_id="sd00007733en_us",
        title="HPE Morpheus Enterprise Software Release Notes",
        version="8.1.2",
        product="Release Notes",
        mode="single",
    ),
    # QuickSpecs (hosted on www.hpe.com, hence the explicit source_url).
    BundleSpec(
        slug="morpheus_quickspecs",
        doc_id="a50009231enw",
        title="HPE Morpheus Enterprise Software QuickSpecs",
        version="v1",
        product="QuickSpecs",
        mode="html-file",
        source_url="https://www.hpe.com/psnow/doc/a50009231enw",
    ),
]
def _session() -> requests.Session:
    """Create the HTTP session shared by every request this script makes.

    Returns:
        A ``requests.Session`` whose default headers carry this project's
        User-Agent and an Accept header covering JSON and HTML.
    """
    session = requests.Session()
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET *url*, retrying throttled/server-error responses with backoff.

    Args:
        s: Session used to issue the request.
        url: Absolute URL to fetch.
        expect_json: When true, return the decoded JSON body instead of text.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response text (or decoded JSON when ``expect_json`` is set) on
        HTTP 200, or ``None`` on HTTP 404 so callers can treat "no such
        document / no TOC" as an ordinary outcome.

    Raises:
        RuntimeError: Every attempt ended without a usable response.
        Exception: Whatever ``raise_for_status()`` raises for any other
            error status; those are not retried.
    """
    delay = 1.0
    for attempt in range(retries):
        r = s.get(url, timeout=30)
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in (429, 500, 502, 503, 504):
            # Back off only when another attempt will follow; sleeping
            # after the final attempt just delays the failure below.
            if attempt < retries - 1:
                time.sleep(delay)
                delay *= 2
            continue
        # Non-retryable status: surface it immediately. If raise_for_status()
        # does not raise (presumably a non-error status other than 200),
        # the loop simply moves on to the next attempt.
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")
def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Count linked pages in a TOC tree and find its first GUID page.

    Args:
        toc: Top-level TOC nodes; each node may carry a ``topicLink`` and a
            ``children`` list of further nodes.

    Returns:
        ``(page_count, landing)`` where ``page_count`` is the number of nodes
        with a non-empty ``topicLink`` and ``landing`` is the GUID taken from
        the first link (in pre-order) matching ``page=GUID-….html``, or
        ``None`` when no link matches. An empty or missing TOC gives
        ``(0, None)``.
    """
    if not toc:
        return 0, None

    def iter_links(nodes: list[dict] | None):
        # Pre-order walk: a node's own link comes before its children's.
        for entry in nodes or []:
            href = entry.get("topicLink")
            if href:
                yield href
            yield from iter_links(entry.get("children"))

    total = 0
    first_guid = None
    for href in iter_links(toc):
        total += 1
        if first_guid is None:
            hit = re.search(r"page=(GUID-[A-F0-9-]+)\.html", href)
            if hit:
                first_guid = hit.group(1)
    return total, first_guid
def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract and publish date from a document's abstract HTML.

    Args:
        html: Markup of the abstract page; may be an empty string.

    Returns:
        A dict holding whichever of ``"title"``, ``"abstract"`` and
        ``"published"`` were found. Keys for elements that are absent from
        the markup are omitted.
    """
    doc = BeautifulSoup(html, "html.parser")
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    found: dict[str, str] = {}
    for key, css in selectors:
        node = doc.select_one(css)
        if not node:
            continue
        text = node.get_text(" ", strip=True)
        if key == "published":
            # Drop the label so only the date itself is stored.
            text = text.replace("Published:", "").strip()
        found[key] = text
    return found
def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json record for one declared bundle.

    Args:
        s: Session used for the API requests (unused for html-file bundles).
        spec: Declaration of the bundle. It is only read, never modified.

    Returns:
        A dict with the keys ``slug``, ``doc_id``, ``title``, ``version``,
        ``platform``, ``product``, ``language``, ``page_count``, ``mode``,
        ``abstract``, ``dates``, ``landing_page`` and ``source_url``.
        ``mode`` is the effective mode: a ``"toc"`` bundle whose TOC turns
        out to be empty is reported as ``"single"``.
    """
    # html-file bundles are static fixtures or live-fetched outside support.hpe.com.
    if spec.mode == "html-file":
        return {
            "slug": spec.slug,
            "doc_id": spec.doc_id,
            "title": spec.title,
            "version": spec.version,
            "platform": spec.platform,
            "product": spec.product,
            "language": spec.language,
            "page_count": 1,
            "mode": "html-file",
            "abstract": "",
            "dates": {},
            "landing_page": spec.doc_id,
            "source_url": spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        }

    # A missing abstract (None from _get) is parsed as empty markup, so the
    # declared title is used below.
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    meta = _parse_abstract(abstract_html or "")

    # Track the effective mode in a local: the spec objects are shared
    # module-level declarations, so a fallback must not rewrite them.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_pages, toc_landing = _count_toc(toc)
        if toc_pages == 0:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
        else:
            page_count, landing = toc_pages, toc_landing

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    }
def main() -> int:
    """Discover every declared bundle and write the catalog as JSON.

    Progress lines and the final summary go to stderr. The output path
    comes from ``--out`` and defaults to ``BUNDLES_JSON``.

    Returns:
        Process exit status; always 0 when discovery completes.
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    opts = parser.parse_args()

    session = _session()
    records: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        record = discover_bundle(session, spec)
        records.append(record)

    payload = json.dumps(records, indent=2) + "\n"
    Path(opts.out).write_text(payload)

    total_pages = sum(record["page_count"] for record in records)
    print(f"wrote {opts.out}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())