"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json. Bundle IDs are declared statically here because HPE mints a new docId per product version rather than versioning a single doc (see ~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new version drops, add a new entry to BUNDLES and re-run; the runner will pick it up on the next pass. For each bundle this script: 1. GETs /hpesc/public/api/document/{docId} → abstract HTML 2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc) 3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents. """ from __future__ import annotations import argparse import json import re import sys import time from dataclasses import dataclass, field from pathlib import Path from typing import Any import requests from bs4 import BeautifulSoup API = "https://support.hpe.com/hpesc/public/api/document" DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}" UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)" ROOT = Path(__file__).resolve().parent.parent BUNDLES_JSON = ROOT / "bundles.json" @dataclass class BundleSpec: slug: str doc_id: str title: str version: str | None product: str # e.g. "User Manual", "Release Notes", "Deployment Guide" mode: str # "toc", "single", or "html-file" (committed fixture under scrape/quickspecs/) platform: str | None = None language: str = "en-US" source_url: str | None = None # overrides the default support.hpe.com URL # Declared bundles. Versions confirmed 2026-05-22 by probing the docId # range sd00007400..7740 for `v8.1.x` matches in the abstract. 
BUNDLES: list[BundleSpec] = [
    BundleSpec("hvm_user_manual_8_1_0", "sd00007520en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.0", "User Manual", "toc"),
    BundleSpec("hvm_user_manual_8_1_1", "sd00007620en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.1", "User Manual", "toc"),
    BundleSpec("hvm_user_manual_8_1_2", "sd00007735en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.2", "User Manual", "toc"),
    BundleSpec("hvm_release_notes_8_1_0", "sd00007497en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.0", "Release Notes", "single"),
    BundleSpec("hvm_release_notes_8_1_1", "sd00007609en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.1", "Release Notes", "single"),
    BundleSpec("hvm_release_notes_8_1_2", "sd00007734en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.2", "Release Notes", "single"),
    BundleSpec("hvm_deployment_guide", "sd00007332en_us", "HPE Morpheus VM Essentials Deployment Guide", None, "Deployment Guide", "toc"),
    BundleSpec("hvm_qualification_matrix", "sd00006551en_us", "Qualification Matrix for HVM Clusters Managed by HPE Morpheus Software", None, "Qualification Matrix", "toc"),
    # QuickSpecs is a static-HTML fixture (www.hpe.com edge drops automated
    # connections — see scrape/quickspecs/README.md). doc_id = the QuickSpecs
    # PSNow ref (a50004260enw). page_count is 1; source_url points at the
    # public PSNow URL.
    BundleSpec("hvm_quickspecs", "a50004260enw", "HPE Morpheus VM Essentials Software QuickSpecs", "v4-2026-02-02", "QuickSpecs", "html-file", source_url="https://www.hpe.com/psnow/doc/a50004260enw"),
]

# HTTP statuses that `_get` retries with backoff.
_RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})

# Extracts the page GUID from a TOC node's "topicLink" value.
_PAGE_GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")


def _session() -> requests.Session:
    """Return a requests session carrying the project User-Agent and Accept headers."""
    s = requests.Session()
    s.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"})
    return s


def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET `url`, retrying failures with exponential backoff.

    Args:
        s: session used to issue the request.
        url: absolute URL to fetch.
        expect_json: when true, return the decoded JSON body instead of text.
        retries: maximum number of attempts.

    Returns:
        The response text (or decoded JSON) on HTTP 200, or None on HTTP 404.

    Raises:
        requests.HTTPError: for a non-retryable error status.
        requests.ConnectionError, requests.Timeout: if the final attempt fails
            at the network level.
        RuntimeError: if every attempt ended without a usable response.
    """
    delay = 1.0
    for attempt in range(retries):
        is_last = attempt == retries - 1
        try:
            r = s.get(url, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Network-level failures get the same backoff as 5xx responses;
            # the original exception is re-raised once attempts run out.
            if is_last:
                raise
            time.sleep(delay)
            delay *= 2
            continue
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in _RETRYABLE_STATUS:
            # No point sleeping when there is no further attempt to wait for.
            if not is_last:
                time.sleep(delay)
                delay *= 2
            continue
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")


def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Returns (page_count, landing_page_guid).

    page_count is the number of nodes, at any depth, with a truthy
    "topicLink". landing_page_guid is the GUID from the first such link, in
    depth-first document order, that matches the `page=GUID-….html` pattern;
    it is None when no link matches. An empty or missing TOC yields (0, None).
    """
    if not toc:
        return 0, None

    count = 0
    landing: str | None = None

    def walk(nodes: list[dict] | None) -> None:
        nonlocal count, landing
        for node in nodes or []:
            link = node.get("topicLink")
            if link:
                count += 1
                if landing is None:
                    m = _PAGE_GUID_RE.search(link)
                    if m:
                        landing = m.group(1)
            walk(node.get("children"))

    walk(toc)
    return count, landing


def _parse_abstract(html: str) -> dict[str, str]:
    """Pull title / abstract text / published date out of the DITA abstract HTML.

    Returns a dict holding any of the keys "title", "abstract" and
    "published"; a key is present only when its element was found in `html`.
    """
    soup = BeautifulSoup(html, "html.parser")
    out: dict[str, str] = {}

    h1 = soup.select_one("h1.title.topictitle1")
    if h1:
        out["title"] = h1.get_text(" ", strip=True)

    desc = soup.select_one("div.desc")
    if desc:
        out["abstract"] = desc.get_text(" ", strip=True)

    pub = soup.select_one("div.publishedDate")
    if pub:
        # Drop the "Published:" label so only the date text remains.
        out["published"] = pub.get_text(" ", strip=True).replace("Published:", "").strip()

    return out


def _bundle_record(
    spec: BundleSpec,
    *,
    title: str,
    page_count: int,
    mode: str,
    abstract: str,
    dates: dict[str, str],
    landing_page: str | None,
    source_url: str,
) -> dict[str, Any]:
    """Assemble one bundles.json entry; key order is kept stable for clean diffs."""
    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": abstract,
        "dates": dates,
        "landing_page": landing_page,
        "source_url": source_url,
    }


def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json entry for one declared bundle.

    "html-file" bundles are described from the spec alone. For every other
    mode the abstract is fetched from the API, and for "toc" bundles the page
    tree is fetched and counted. A "toc" bundle whose tree is empty or missing
    is reported as a one-page "single" bundle.

    `spec` is never modified: the effective mode is tracked locally, so a
    fallback does not leak into the shared BUNDLES declarations.

    Args:
        s: session used for the API requests.
        spec: the bundle declaration.

    Returns:
        The bundle entry as a JSON-serializable dict.
    """
    # html-file bundles are static fixtures — no upstream fetch.
    if spec.mode == "html-file":
        return _bundle_record(
            spec,
            title=spec.title,
            page_count=1,
            mode="html-file",
            abstract="",
            dates={},
            landing_page=spec.doc_id,
            source_url=spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        )

    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    if abstract_html is None:
        # _get returns None only for HTTP 404; report it instead of silently
        # emitting an entry with no abstract or published date.
        print(f" ! {spec.slug}: abstract not found (404) — using declared title", file=sys.stderr)
    meta = _parse_abstract(abstract_html or "")

    mode = spec.mode
    page_count: int
    landing: str | None
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        page_count, landing = _count_toc(toc)
        if page_count == 0:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
            page_count, landing = 1, spec.doc_id
    else:
        page_count, landing = 1, spec.doc_id

    return _bundle_record(
        spec,
        title=meta.get("title") or spec.title,
        page_count=page_count,
        mode=mode,
        abstract=meta.get("abstract", ""),
        dates={"Published": meta.get("published", "")},
        landing_page=landing,
        source_url=spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    )


def main() -> int:
    """Discover every bundle in BUNDLES and write the result as JSON.

    The output path comes from `--out` and defaults to BUNDLES_JSON. Progress
    and the final summary go to stderr. Returns the process exit code (0).
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    args = parser.parse_args()

    out: list[dict[str, Any]] = []
    # The context manager closes the session's pooled connections on exit,
    # including when a fetch raises.
    with _session() as s:
        for spec in BUNDLES:
            print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
            out.append(discover_bundle(s, spec))

    Path(args.out).write_text(json.dumps(out, indent=2) + "\n", encoding="utf-8")
    total_pages = sum(b["page_count"] for b in out)
    print(f"wrote {args.out}: {len(out)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())