Files
hvm-docs/scrape/bundles.py
T
justin 7a491ba9e4 scrape: HVM bundles + runner for HPE Support DocPortal
Phase 1: scrape User Manual (8.1.0/.1/.2), Release Notes (8.1.0/.1/.2),
and the unversioned Deployment Guide. Total ~1,160 pages, 9.7 MB markdown.

Discovers via the anonymous JSON API at /hpesc/public/api/document/{docId}:
/toc walks the page tree (for TOC-paginated docs), /render?page=GUID
fetches per-page HTML, /document/{docId} returns the whole body for
single-doc shapes like Release Notes.

Runner converts DITA-source HTML to clean markdown (strips Notices/
Acknowledgments/Abstract boilerplate), writes corpus/<bundle>/<page>.{md,json},
then a finalize pass synthesizes topic_cluster.clustered_topics by GUID
overlap across versions (HPE GUIDs are stable cross-version — confirmed
374/376/376 with 100% overlap on shared pages).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 13:06:26 -04:00

171 lines
6.3 KiB
Python

"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.
Bundle IDs are declared statically here because HPE mints a new docId
per product version rather than versioning a single doc (see
~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
version drops, add a new entry to BUNDLES and re-run; the runner will
pick it up on the next pass.
For each bundle this script:
1. GETs /hpesc/public/api/document/{docId} → abstract HTML
2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc)
3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
# Base URL of the DocPortal JSON API; "/{docId}" and "/{docId}/toc" are appended to it.
API = "https://support.hpe.com/hpesc/public/api/document"
# Template for the human-facing viewer URL, recorded as each bundle's "source_url".
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent string sent with every request made through _session().
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Two directory levels above this file (the module docstring calls this the repo root).
ROOT = Path(__file__).resolve().parent.parent
# Default output path for the generated bundle index (overridable with --out).
BUNDLES_JSON = ROOT / "bundles.json"
@dataclass
class BundleSpec:
    """Static declaration of one DocPortal document to discover.

    Attributes are copied into the bundle record written to bundles.json;
    ``doc_id`` is also used to build the API and viewer URLs, and ``title``
    is only a fallback for when the fetched abstract carries no title.

    Raises:
        ValueError: if ``mode`` is neither "toc" nor "single". Discovery
            treats every non-"toc" value as single-doc, so without this
            check a typo such as "TOC" would silently skip the TOC walk.
    """

    slug: str
    doc_id: str
    title: str
    version: str | None  # None for unversioned documents
    product: str  # e.g. "User Manual", "Release Notes", "Deployment Guide"
    mode: str  # "toc" or "single"
    platform: str | None = None
    language: str = "en-US"

    def __post_init__(self) -> None:
        """Reject modes outside the documented "toc" / "single" pair."""
        if self.mode not in ("toc", "single"):
            raise ValueError(
                f"BundleSpec {self.slug!r}: mode must be 'toc' or 'single', got {self.mode!r}"
            )
# Statically declared bundles. The versions were confirmed on 2026-05-22 by
# probing the docId range sd00007400..7740 for `v8.1.x` matches in the abstract.
BUNDLES: list[BundleSpec] = [
    BundleSpec(
        slug="hvm_user_manual_8_1_0",
        doc_id="sd00007520en_us",
        title="HPE Morpheus VM Essentials Software Documentation",
        version="8.1.0",
        product="User Manual",
        mode="toc",
    ),
    BundleSpec(
        slug="hvm_user_manual_8_1_1",
        doc_id="sd00007620en_us",
        title="HPE Morpheus VM Essentials Software Documentation",
        version="8.1.1",
        product="User Manual",
        mode="toc",
    ),
    BundleSpec(
        slug="hvm_user_manual_8_1_2",
        doc_id="sd00007735en_us",
        title="HPE Morpheus VM Essentials Software Documentation",
        version="8.1.2",
        product="User Manual",
        mode="toc",
    ),
    BundleSpec(
        slug="hvm_release_notes_8_1_0",
        doc_id="sd00007497en_us",
        title="HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.0",
        product="Release Notes",
        mode="single",
    ),
    BundleSpec(
        slug="hvm_release_notes_8_1_1",
        doc_id="sd00007609en_us",
        title="HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.1",
        product="Release Notes",
        mode="single",
    ),
    BundleSpec(
        slug="hvm_release_notes_8_1_2",
        doc_id="sd00007734en_us",
        title="HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.2",
        product="Release Notes",
        mode="single",
    ),
    # The Deployment Guide is unversioned, hence version=None.
    BundleSpec(
        slug="hvm_deployment_guide",
        doc_id="sd00007332en_us",
        title="HPE Morpheus VM Essentials Deployment Guide",
        version=None,
        product="Deployment Guide",
        mode="toc",
    ),
]
def _session() -> requests.Session:
    """Create a requests session carrying this scraper's User-Agent and Accept headers."""
    session = requests.Session()
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
delay = 1.0
for attempt in range(retries):
r = s.get(url, timeout=30)
if r.status_code == 200:
return r.json() if expect_json else r.text
if r.status_code == 404:
return None
if r.status_code in (429, 500, 502, 503, 504):
time.sleep(delay)
delay *= 2
continue
r.raise_for_status()
raise RuntimeError(f"GET failed after {retries} retries: {url}")
def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
"""Returns (page_count, landing_page_guid)."""
if not toc:
return 0, None
landing = None
n = 0
def walk(nodes: list[dict] | None, depth: int) -> None:
nonlocal n, landing
for node in nodes or []:
link = node.get("topicLink")
if link:
n += 1
m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link)
if m and landing is None:
landing = m.group(1)
walk(node.get("children"), depth + 1)
walk(toc, 0)
return n, landing
def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract text and published date from the abstract HTML.

    Only the keys whose element is present in the markup appear in the
    result: "title", "abstract" and "published".
    """
    soup = BeautifulSoup(html, "html.parser")
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    fields: dict[str, str] = {}
    for key, css in selectors:
        element = soup.select_one(css)
        if not element:
            continue
        text = element.get_text(" ", strip=True)
        if key == "published":
            # Drop the "Published:" label so only the date remains.
            text = text.replace("Published:", "").strip()
        fields[key] = text
    return fields
def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Fetch metadata for one bundle and return its bundles.json record.

    Fetches the document abstract and, for "toc" bundles, the TOC tree to
    count pages and find the landing page GUID. A "toc" bundle whose TOC is
    missing or empty is reported on stderr and recorded as "single".

    Args:
        s: Session used for the API requests.
        spec: Declared bundle. It is not modified; the effective mode is
            reported only through the returned record's "mode" key.

    Returns:
        A JSON-serialisable dict describing the bundle.
    """
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    # A 404 yields None; parse an empty document so the spec fallbacks apply.
    meta = _parse_abstract(abstract_html or "")

    # Single-doc defaults; replaced below when a usable TOC is found.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_pages, toc_landing = _count_toc(toc)
        if toc_pages == 0:
            print(f"  ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            # Track the fallback locally: `spec` is typically a shared
            # module-level BUNDLES entry and must not be rewritten in place.
            mode = "single"
        else:
            page_count, landing = toc_pages, toc_landing

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": DOC_URL.format(doc_id=spec.doc_id),
    }
def main() -> int:
    """Discover every declared bundle and write the index file; return exit code 0."""
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    args = parser.parse_args()

    session = _session()
    records: list[dict[str, Any]] = []
    for spec in BUNDLES:
        # Progress goes to stderr, before the (slow) network round-trips.
        print(f"  • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        record = discover_bundle(session, spec)
        records.append(record)

    payload = json.dumps(records, indent=2) + "\n"
    Path(args.out).write_text(payload)

    total_pages = sum(record["page_count"] for record in records)
    print(f"wrote {args.out}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())