"""Discover Morpheus Enterprise doc bundles on HPE Support DocPortal and write bundles.json. Mirrors hvm-docs/scrape/bundles.py — same portal, same API shape, same single-doc-blob treatment for Release Notes, but pointing at the Morpheus Enterprise docId range. For each bundle this script: 1. GETs /hpesc/public/api/document/{docId} → abstract HTML 2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc) 3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents. QuickSpecs is a special case: lives at www.hpe.com (not support.hpe.com), gets the html-file mode and is scraped via curl_cffi (see scrape/quickspecs.py). """ from __future__ import annotations import argparse import json import re import sys import time from dataclasses import dataclass, field from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup API = "https://support.hpe.com/hpesc/public/api/document" DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}" UA = "morpheus-docs-mcp/0.1 (+https://git.jpaul.io/justin/morpheus-docs; admin@jpaul.io)" ROOT = Path(__file__).resolve().parent.parent BUNDLES_JSON = ROOT / "bundles.json" @dataclass class BundleSpec: slug: str doc_id: str title: str version: str | None product: str # e.g. "User Manual", "Release Notes", "QuickSpecs" mode: str # "toc", "single", or "html-file" platform: str | None = None language: str = "en-US" source_url: str | None = None # overrides the default support.hpe.com URL # Declared bundles. Versions confirmed 2026-05-22 by probing the docId # range sd00006500..7740 for `Morpheus Enterprise` matches in the abstract. # # Notes: # - Morpheus Enterprise has User Manuals dating back to 8.0.10 # (sd00006774en_us, Sep 2025) but we only ship the 8.1.x line for # now. Add the 8.0.x bundles here if you need older versions in the # corpus. 
# - No dedicated Deployment Guide or Qualification Matrix for Morpheus
#   Enterprise on HPE Support — the only QM (sd00006551en_us) covers
#   HVM clusters managed by Morpheus, which lives in hvm-docs.
# - QuickSpecs lives on www.hpe.com (not support.hpe.com), uses the
#   html-file scrape mode with curl_cffi Chrome impersonation.
BUNDLES: list[BundleSpec] = [
    BundleSpec("morpheus_user_manual_8_1_0", "sd00007510en_us",
               "HPE Morpheus Enterprise Software Documentation",
               "8.1.0", "User Manual", "toc"),
    BundleSpec("morpheus_user_manual_8_1_1", "sd00007621en_us",
               "HPE Morpheus Enterprise Software Documentation",
               "8.1.1", "User Manual", "toc"),
    BundleSpec("morpheus_user_manual_8_1_2", "sd00007732en_us",
               "HPE Morpheus Enterprise Software Documentation",
               "8.1.2", "User Manual", "toc"),
    BundleSpec("morpheus_release_notes_8_1_0", "sd00007496en_us",
               "HPE Morpheus Enterprise Software Release Notes",
               "8.1.0", "Release Notes", "single"),
    BundleSpec("morpheus_release_notes_8_1_1", "sd00007610en_us",
               "HPE Morpheus Enterprise Software Release Notes",
               "8.1.1", "Release Notes", "single"),
    BundleSpec("morpheus_release_notes_8_1_2", "sd00007733en_us",
               "HPE Morpheus Enterprise Software Release Notes",
               "8.1.2", "Release Notes", "single"),
    BundleSpec("morpheus_quickspecs", "a50009231enw",
               "HPE Morpheus Enterprise Software QuickSpecs",
               "v1", "QuickSpecs", "html-file",
               source_url="https://www.hpe.com/psnow/doc/a50009231enw"),
]

# HTTP statuses treated as transient by _get() and retried with backoff.
_RETRYABLE_STATUSES = (429, 500, 502, 503, 504)

# Extracts the page GUID from a TOC node's topicLink.
_LANDING_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")


def _session() -> requests.Session:
    """Return a requests session carrying this scraper's User-Agent and Accept headers."""
    s = requests.Session()
    s.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"})
    return s


def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET ``url`` with retries on transient HTTP statuses.

    Args:
        s: Session used to issue the request.
        url: Absolute URL to fetch.
        expect_json: When true, return the decoded JSON body instead of text.
        retries: Maximum number of attempts.

    Returns:
        The response text (or decoded JSON) on HTTP 200, or ``None`` on 404.

    Raises:
        requests.HTTPError: For a non-retryable 4xx/5xx status.
        RuntimeError: When every attempt ended without a 200 or 404.
    """
    delay = 1.0
    for attempt in range(retries):
        r = s.get(url, timeout=30)
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            # A missing document/TOC is an expected outcome, not an error.
            return None
        if r.status_code in _RETRYABLE_STATUSES:
            # Back off only when another attempt will follow; sleeping after
            # the final attempt would just delay the failure.
            if attempt < retries - 1:
                time.sleep(delay)
                delay *= 2
            continue
        # Raises for remaining 4xx/5xx; any other status falls through to the
        # next attempt.
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")


def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Count linked pages in a TOC tree and find the first page GUID.

    Args:
        toc: List of TOC nodes; each node may carry a ``topicLink`` and a
            ``children`` list of further nodes. ``None`` or empty is allowed.

    Returns:
        ``(page_count, landing)`` where ``page_count`` is the number of nodes
        with a truthy ``topicLink`` and ``landing`` is the GUID captured from
        the first ``topicLink`` (pre-order) that matches the page pattern, or
        ``None`` when no link matches.
    """
    if not toc:
        return 0, None

    landing = None
    n = 0

    def walk(nodes: list[dict] | None) -> None:
        nonlocal n, landing
        for node in nodes or []:
            link = node.get("topicLink")
            if link:
                n += 1
                # Only the first matching link becomes the landing page.
                if landing is None:
                    m = _LANDING_RE.search(link)
                    if m:
                        landing = m.group(1)
            walk(node.get("children"))

    walk(toc)
    return n, landing


def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract and published date from a document's abstract HTML.

    Returns:
        A dict holding whichever of ``title``, ``abstract`` and ``published``
        were found; keys for missing elements are omitted.
    """
    soup = BeautifulSoup(html, "html.parser")
    out: dict[str, str] = {}
    h1 = soup.select_one("h1.title.topictitle1")
    if h1:
        out["title"] = h1.get_text(" ", strip=True)
    desc = soup.select_one("div.desc")
    if desc:
        out["abstract"] = desc.get_text(" ", strip=True)
    pub = soup.select_one("div.publishedDate")
    if pub:
        # Drop the "Published:" label so only the date text remains.
        out["published"] = pub.get_text(" ", strip=True).replace("Published:", "").strip()
    return out


def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json record for one declared bundle.

    ``html-file`` bundles are described purely from ``spec`` with no network
    access. For other modes the document abstract is fetched, and for ``toc``
    mode the TOC is fetched too; an empty TOC downgrades the record to
    ``single`` mode. ``spec`` itself is never modified.

    Args:
        s: Session used for the API requests.
        spec: Declared bundle to describe.

    Returns:
        A JSON-serialisable dict describing the bundle.
    """
    # html-file bundles are static fixtures or live-fetched outside support.hpe.com.
    if spec.mode == "html-file":
        return {
            "slug": spec.slug,
            "doc_id": spec.doc_id,
            "title": spec.title,
            "version": spec.version,
            "platform": spec.platform,
            "product": spec.product,
            "language": spec.language,
            "page_count": 1,
            "mode": "html-file",
            "abstract": "",
            "dates": {},
            "landing_page": spec.doc_id,
            "source_url": spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        }

    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    # A 404 abstract yields None; parse an empty document so the spec's own
    # title is used as the fallback below.
    meta = _parse_abstract(abstract_html or "")

    # Track the effective mode locally: writing the fallback back onto spec
    # would silently alter the shared module-level BUNDLES entry.
    mode = spec.mode
    page_count: int
    landing: str | None
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        page_count, landing = _count_toc(toc)
        if page_count == 0:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
            page_count, landing = 1, spec.doc_id
    else:
        page_count, landing = 1, spec.doc_id

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    }


def main() -> int:
    """Discover every bundle in BUNDLES and write the result as JSON.

    Returns:
        Process exit code (always 0; failures propagate as exceptions).
    """
    p = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    p.add_argument("--out", default=str(BUNDLES_JSON))
    args = p.parse_args()

    s = _session()
    out: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        out.append(discover_bundle(s, spec))

    # Explicit encoding keeps the output independent of the platform locale.
    Path(args.out).write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
    total_pages = sum(b["page_count"] for b in out)
    print(f"wrote {args.out}: {len(out)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())