# hvm-docs/scrape/bundles.py

"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.

Bundle IDs are declared statically here because HPE mints a new docId
per product version rather than versioning a single doc (see
~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
version drops, add a new entry to BUNDLES and re-run; the runner will
pick it up on the next pass.

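For each bundle this script does the following ("html-file" bundles are static
fixtures, so steps 1 and 2 are skipped for them):
  1. GETs /hpesc/public/api/document/{docId}        → abstract HTML
  2. For "toc" bundles, GETs /hpesc/public/api/document/{docId}/toc → page tree
     (an empty or 404 TOC falls back to single-doc mode)
  3. Writes bundles.json at the repo root, using the schema documented in PLAN.md Phase 1.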
"""
from __future__ import annotations

import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Base endpoint of the public document API. discover_bundle() appends
# "/{doc_id}" for the abstract HTML and "/{doc_id}/toc" for the page tree.
API = "https://support.hpe.com/hpesc/public/api/document"
# docDisplay URL template; used as a bundle's "source_url" unless the spec
# supplies its own source_url.
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# Sent as the User-Agent header on every request made through _session().
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Two directory levels above this file (the repo root, per the module docstring).
ROOT = Path(__file__).resolve().parent.parent
# Default output path; main() lets --out override it.
BUNDLES_JSON = ROOT / "bundles.json"


@dataclass
class BundleSpec:
    """Static declaration of one documentation bundle to discover.

    The first six fields are normally passed positionally (see ``BUNDLES``),
    so their order is part of the interface.
    """

    # Short identifier, emitted as "slug" in bundles.json.
    slug: str
    # Document ID interpolated into the API and source URLs.
    doc_id: str
    # Declared title; used when the fetched abstract does not provide one.
    title: str
    # Product version string, or None when the bundle is declared without one.
    version: str | None
    # e.g. "User Manual", "Release Notes", "Deployment Guide"
    product: str
    # "toc", "single", or "html-file" (committed fixture under scrape/quickspecs/)
    mode: str
    platform: str | None = None
    language: str = "en-US"
    # Overrides the default support.hpe.com URL.
    source_url: str | None = None


# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
# range sd00007400..7740 for `v8.1.x` matches in the abstract.
#
# Positional fields, in order: slug, doc_id, title, version, product, mode.
# Entries with version None are declared without a product version.
BUNDLES: list[BundleSpec] = [
    BundleSpec("hvm_user_manual_8_1_0",   "sd00007520en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.0", "User Manual",      "toc"),
    BundleSpec("hvm_user_manual_8_1_1",   "sd00007620en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.1", "User Manual",      "toc"),
    BundleSpec("hvm_user_manual_8_1_2",   "sd00007735en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.2", "User Manual",      "toc"),
    BundleSpec("hvm_release_notes_8_1_0", "sd00007497en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.0", "Release Notes",   "single"),
    BundleSpec("hvm_release_notes_8_1_1", "sd00007609en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.1", "Release Notes",   "single"),
    BundleSpec("hvm_release_notes_8_1_2", "sd00007734en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.2", "Release Notes",   "single"),
    BundleSpec("hvm_deployment_guide",    "sd00007332en_us", "HPE Morpheus VM Essentials Deployment Guide",        None,    "Deployment Guide","toc"),
    BundleSpec("hvm_qualification_matrix","sd00006551en_us", "Qualification Matrix for HVM Clusters Managed by HPE Morpheus Software", None, "Qualification Matrix", "toc"),
    # QuickSpecs is a static-HTML fixture (www.hpe.com edge drops automated
    # connections — see scrape/quickspecs/README.md). doc_id = the QuickSpecs
    # PSNow ref (a50004260enw). page_count is 1; source_url points at the
    # public PSNow URL.
    BundleSpec("hvm_quickspecs", "a50004260enw", "HPE Morpheus VM Essentials Software QuickSpecs",
               "v4-2026-02-02", "QuickSpecs", "html-file",
               source_url="https://www.hpe.com/psnow/doc/a50004260enw"),
]


def _session() -> requests.Session:
    """Build a ``requests`` session carrying the scraper's default headers.

    Returns:
        A new session whose User-Agent is ``UA`` and whose Accept header
        allows both JSON and HTML responses.
    """
    default_headers = {
        "User-Agent": UA,
        "Accept": "application/json, text/html",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    return session


def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    delay = 1.0
    for attempt in range(retries):
        r = s.get(url, timeout=30)
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in (429, 500, 502, 503, 504):
            time.sleep(delay)
            delay *= 2
            continue
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")


def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Returns (page_count, landing_page_guid)."""
    if not toc:
        return 0, None
    landing = None
    n = 0

    def walk(nodes: list[dict] | None, depth: int) -> None:
        nonlocal n, landing
        for node in nodes or []:
            link = node.get("topicLink")
            if link:
                n += 1
                m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link)
                if m and landing is None:
                    landing = m.group(1)
            walk(node.get("children"), depth + 1)

    walk(toc, 0)
    return n, landing


def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract text and published date from the abstract HTML.

    Only the elements actually present in ``html`` produce keys, so the result
    may contain any subset of ``"title"``, ``"abstract"`` and ``"published"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    found: dict[str, str] = {}
    for key, css in selectors:
        element = soup.select_one(css)
        if not element:
            continue
        text = element.get_text(" ", strip=True)
        if key == "published":
            # Drop the "Published:" label so only the date text remains.
            text = text.replace("Published:", "").strip()
        found[key] = text
    return found


def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json record for one declared bundle.

    Args:
        s: Session used for the upstream requests (unused for "html-file").
        spec: The bundle declaration. It is only read, never modified.

    Returns:
        A dict with the keys slug, doc_id, title, version, platform, product,
        language, page_count, mode, abstract, dates, landing_page and
        source_url, in that order.

    Raises:
        Propagates whatever ``_get`` raises for failed requests.
    """
    # html-file bundles are static fixtures — no upstream fetch.
    if spec.mode == "html-file":
        return {
            "slug": spec.slug,
            "doc_id": spec.doc_id,
            "title": spec.title,
            "version": spec.version,
            "platform": spec.platform,
            "product": spec.product,
            "language": spec.language,
            "page_count": 1,
            "mode": "html-file",
            "abstract": "",
            "dates": {},
            "landing_page": spec.doc_id,
            "source_url": spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        }

    # A missing abstract (404 -> None) is parsed as empty HTML, so the
    # declared title and empty abstract/date values are used instead.
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    meta = _parse_abstract(abstract_html or "")

    # Track the effective mode locally rather than writing it back to `spec`:
    # the specs live in the module-level BUNDLES list, and mutating them would
    # silently change the declaration for any later use in the same process.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_pages, toc_landing = _count_toc(toc)
        if toc_pages == 0:
            print(f"  ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
        else:
            page_count, landing = toc_pages, toc_landing

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    }


def main() -> int:
    """CLI entry point: discover every declared bundle and write the JSON file.

    Progress and the final summary go to stderr. The output path defaults to
    ``BUNDLES_JSON`` and can be changed with ``--out``.

    Returns:
        0 on success (failures surface as exceptions).
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    out_path = parser.parse_args().out

    session = _session()
    records: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f"  • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        record = discover_bundle(session, spec)
        records.append(record)

    payload = json.dumps(records, indent=2) + "\n"
    Path(out_path).write_text(payload)

    total_pages = sum(record["page_count"] for record in records)
    print(f"wrote {out_path}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


# Run as a script: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())