Files
hvm-docs/scrape/bundles.py
T
justin a0727da8da scrape: add Qualification Matrix + QuickSpecs bundles (live curl_cffi for HPE www)
Two new bundles:

* hvm_qualification_matrix (sd00006551en_us) — the "Qualification Matrix
  for HVM Clusters Managed by HPE Morpheus Software". Single TOC bundle,
  2 pages (parent + content). The content page is ~100 KB of HTML
  containing five tables: Server Hardware Support, Storage Hardware
  Support, Independent Software Vendor (ISV) Support, Hypervisor OS
  Compatibility and Interoperability Matrix, and Guest OS. Scraped via
  the same /hpesc/public/api/document/{docId}/render endpoint as every
  other bundle on support.hpe.com — the API returns server-rendered
  DITA HTML, so no JS/SPA shenanigans.

* hvm_quickspecs (a50004260enw) — HPE Morpheus VM Essentials Software
  QuickSpecs, Version 4 (02-Feb-2026). SKUs: S5Q81AAE (1-yr per Socket
  E-LTU), S5Q82AAE (3-yr), S5Q83AAE (5-yr); each includes Tech Care
  Essentials. QuickSpecs lives at www.hpe.com (not support.hpe.com),
  which drops connections at the edge for non-browser TLS fingerprints —
  verified 2026-05-22 against curl, wget, urllib, and Anthropic's
  WebFetch (all = 0 bytes / connection timeout in headers). Bypassed
  here via curl_cffi impersonating Chrome 120's JA3/JA4 fingerprint.
  HTTP 200, 255 KB on first try, all four sections + all three SKUs
  cleanly parseable from the server-rendered HTML.

New module scrape/quickspecs.py drives the live fetch + parse for any
hvm_*_quickspecs bundle. CSS selectors taken from the captured DOM:
  .lr-right-rail hpe-highlights-container .collateral-content
       — one block per H3 section
  h3.txto-title             — section title
  div.txto-description      — section body
  uc-table.uc-table-polaris — SKU and version-history tables
On any live failure the parser falls back to a committed HTML fixture
at scrape/quickspecs/<doc_id>.html so the build never breaks on a
transient edge hiccup.

scrape/runner.py learned a new mode "html-file" that dispatches to
scrape.quickspecs; bundles.py extended with an optional source_url on
BundleSpec for cases where the page lives outside support.hpe.com.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 15:05:11 -04:00

198 lines
7.7 KiB
Python

"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.
Bundle IDs are declared statically here because HPE mints a new docId
per product version rather than versioning a single doc (see
~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
version drops, add a new entry to BUNDLES and re-run; the runner will
pick it up on the next pass.
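For each declared bundle this script:
1. GETs /hpesc/public/api/document/{docId} → abstract HTML
2. For "toc" bundles, GETs /hpesc/public/api/document/{docId}/toc → page tree
(a 404 or empty tree falls back to single-doc mode)
Bundles in "html-file" mode skip both requests and are recorded from their
declared spec alone.
After all bundles are processed it writes bundles.json at repo root with the
schema PLAN.md Phase 1 documents.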
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
# Base of the DocPortal document API; requests append "/{docId}" (and "/toc").
API = "https://support.hpe.com/hpesc/public/api/document"
# Viewer URL template; used as a bundle's source_url when the spec sets none.
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent header installed on the session built by _session().
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Two directory levels above this file; default location of bundles.json.
ROOT = Path(__file__).resolve().parent.parent
BUNDLES_JSON = ROOT.joinpath("bundles.json")
@dataclass
class BundleSpec:
    """Static declaration of one documentation bundle to discover.

    The first six fields have no defaults and are normally passed
    positionally, in the order declared here.
    """

    # Unique identifier of the bundle, e.g. "hvm_user_manual_8_1_0".
    slug: str
    # Document id used in API and viewer URLs, e.g. "sd00007520en_us".
    doc_id: str
    # Declared title; discovery falls back to it when the abstract has none.
    title: str
    # Version label, or None when the document is not tied to a version.
    version: str | None
    # e.g. "User Manual", "Release Notes", "Deployment Guide"
    product: str
    # "toc", "single", or "html-file" (committed fixture under scrape/quickspecs/)
    mode: str
    # Optional platform label; copied into the bundle record unchanged.
    platform: str | None = None
    language: str = "en-US"
    # Overrides the default support.hpe.com URL.
    source_url: str | None = None
# Declared bundles. Versions were confirmed on 2026-05-22 by probing the docId
# range sd00007400..7740 for `v8.1.x` matches in the abstract.
BUNDLES: list[BundleSpec] = [
    BundleSpec(
        "hvm_user_manual_8_1_0", "sd00007520en_us",
        "HPE Morpheus VM Essentials Software Documentation",
        version="8.1.0", product="User Manual", mode="toc",
    ),
    BundleSpec(
        "hvm_user_manual_8_1_1", "sd00007620en_us",
        "HPE Morpheus VM Essentials Software Documentation",
        version="8.1.1", product="User Manual", mode="toc",
    ),
    BundleSpec(
        "hvm_user_manual_8_1_2", "sd00007735en_us",
        "HPE Morpheus VM Essentials Software Documentation",
        version="8.1.2", product="User Manual", mode="toc",
    ),
    BundleSpec(
        "hvm_release_notes_8_1_0", "sd00007497en_us",
        "HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.0", product="Release Notes", mode="single",
    ),
    BundleSpec(
        "hvm_release_notes_8_1_1", "sd00007609en_us",
        "HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.1", product="Release Notes", mode="single",
    ),
    BundleSpec(
        "hvm_release_notes_8_1_2", "sd00007734en_us",
        "HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.2", product="Release Notes", mode="single",
    ),
    BundleSpec(
        "hvm_deployment_guide", "sd00007332en_us",
        "HPE Morpheus VM Essentials Deployment Guide",
        version=None, product="Deployment Guide", mode="toc",
    ),
    BundleSpec(
        "hvm_qualification_matrix", "sd00006551en_us",
        "Qualification Matrix for HVM Clusters Managed by HPE Morpheus Software",
        version=None, product="Qualification Matrix", mode="toc",
    ),
    # QuickSpecs is a static-HTML fixture (the www.hpe.com edge drops
    # automated connections — see scrape/quickspecs/README.md). Its doc_id is
    # the QuickSpecs PSNow ref (a50004260enw), page_count is 1, and source_url
    # points at the public PSNow URL.
    BundleSpec(
        "hvm_quickspecs", "a50004260enw",
        "HPE Morpheus VM Essentials Software QuickSpecs",
        version="v4-2026-02-02", product="QuickSpecs", mode="html-file",
        source_url="https://www.hpe.com/psnow/doc/a50004260enw",
    ),
]
def _session() -> requests.Session:
    """Build the HTTP session used for every DocPortal request.

    The session identifies itself with the module-level ``UA`` string and
    accepts both JSON and HTML responses.
    """
    default_headers = {
        "User-Agent": UA,
        "Accept": "application/json, text/html",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    return session
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET *url*, retrying transient failures with exponential backoff.

    Args:
        s: Session used to issue the request.
        url: Absolute URL to fetch.
        expect_json: When true, return the decoded JSON body instead of text.
        retries: Maximum number of attempts.

    Returns:
        The response text (or decoded JSON) on HTTP 200, or ``None`` on 404.

    Raises:
        requests.HTTPError: For a non-retryable error status.
        requests.ConnectionError, requests.Timeout: When the final attempt
            fails at the network level.
        RuntimeError: When every attempt hit a retryable status, or the server
            answered with a status that is neither success nor an error.
    """
    delay = 1.0
    for attempt in range(retries):
        final_attempt = attempt == retries - 1
        try:
            r = s.get(url, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Network-level hiccups are as transient as a 503: back off and
            # retry, but let the last failure propagate unchanged.
            if final_attempt:
                raise
            time.sleep(delay)
            delay *= 2
            continue
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in (429, 500, 502, 503, 504):
            # No point sleeping when there is no further attempt to wait for.
            if not final_attempt:
                time.sleep(delay)
                delay *= 2
            continue
        r.raise_for_status()
        # raise_for_status() only raises for 4xx/5xx. Anything else that got
        # here (e.g. 204 or an unfollowed redirect) will not improve on retry,
        # so fail now instead of re-requesting in a tight loop.
        raise RuntimeError(f"GET returned unexpected status {r.status_code}: {url}")
    raise RuntimeError(f"GET failed after {retries} retries: {url}")
def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Count the linked pages in a TOC tree and find its landing page.

    The tree is visited depth-first in document order. Every node with a
    truthy ``topicLink`` counts as one page; the landing page is the GUID
    captured from the first link shaped like ``page=GUID-....html``.

    Returns:
        ``(page_count, landing_page_guid)``. An empty or missing TOC yields
        ``(0, None)``; the GUID is ``None`` when no link matches.
    """
    if not toc:
        return 0, None

    def preorder(nodes):
        # Yield each node before its descendants; a missing "children" key
        # (or None) is treated as a leaf.
        for node in nodes or []:
            yield node
            yield from preorder(node.get("children"))

    candidates = (node.get("topicLink") for node in preorder(toc))
    links = [link for link in candidates if link]
    hits = [re.search(r"page=(GUID-[A-F0-9-]+)\.html", link) for link in links]
    first_hit = next((hit for hit in hits if hit), None)
    landing = first_hit.group(1) if first_hit else None
    return len(links), landing
def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract text and published date from abstract HTML.

    Each key is present only when its element is found in *html*:
    ``title`` (``h1.title.topictitle1``), ``abstract`` (``div.desc``) and
    ``published`` (``div.publishedDate``, with the "Published:" label removed).
    """
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    soup = BeautifulSoup(html, "html.parser")
    fields: dict[str, str] = {}
    for key, css in selectors:
        element = soup.select_one(css)
        if element is None:
            continue
        text = element.get_text(" ", strip=True)
        if key == "published":
            # Keep only the date itself, not the leading label.
            text = text.replace("Published:", "").strip()
        fields[key] = text
    return fields
def _bundle_record(
    spec: BundleSpec,
    *,
    title: str,
    page_count: int,
    mode: str,
    abstract: str,
    dates: dict[str, str],
    landing_page: str | None,
    source_url: str,
) -> dict[str, Any]:
    """Assemble one bundles.json entry with a fixed key order."""
    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": abstract,
        "dates": dates,
        "landing_page": landing_page,
        "source_url": source_url,
    }


def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json entry for one declared bundle.

    Args:
        s: Session used for the DocPortal requests.
        spec: The declared bundle. It is read but never modified.

    Returns:
        A dict describing the bundle. Its ``mode`` is the effective mode:
        a "toc" bundle whose TOC is empty or missing is reported as "single".

    Raises:
        Whatever ``_get`` raises when a request fails.
    """
    # html-file bundles are static fixtures — no upstream fetch.
    if spec.mode == "html-file":
        return _bundle_record(
            spec,
            title=spec.title,
            page_count=1,
            mode="html-file",
            abstract="",
            dates={},
            landing_page=spec.doc_id,
            source_url=spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        )

    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    if abstract_html is None:
        # Keep going with the declared metadata, but say so: a 404 here
        # usually means the docId is wrong or the document was withdrawn.
        print(f" ! {spec.slug}: abstract not found (404) — using declared title", file=sys.stderr)
    meta = _parse_abstract(abstract_html or "")

    # Track the effective mode locally so the caller's spec (normally an
    # entry of the module-level BUNDLES list) is not rewritten by a fallback.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_pages, toc_landing = _count_toc(toc)
        if toc_pages:
            page_count, landing = toc_pages, toc_landing
        else:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"

    return _bundle_record(
        spec,
        title=meta.get("title") or spec.title,
        page_count=page_count,
        mode=mode,
        abstract=meta.get("abstract", ""),
        dates={"Published": meta.get("published", "")},
        landing_page=landing,
        source_url=spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    )
def main() -> int:
    """Discover every declared bundle and write the result as JSON.

    Progress and the final summary go to stderr. The output path defaults to
    ``BUNDLES_JSON`` and can be overridden with ``--out``.

    Returns:
        0, for use as the process exit status.
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    args = parser.parse_args()

    session = _session()
    out: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        record = discover_bundle(session, spec)
        out.append(record)

    payload = json.dumps(out, indent=2) + "\n"
    destination = Path(args.out)
    destination.write_text(payload)
    print(f"wrote {args.out}: {len(out)} bundles, {sum(b['page_count'] for b in out)} pages total", file=sys.stderr)
    return 0
# Run discovery only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())