# morpheus-docs/scrape/bundles.py

"""Discover Morpheus Enterprise doc bundles on HPE Support DocPortal and write bundles.json.

Mirrors hvm-docs/scrape/bundles.py — same portal, same API shape, same single-doc-blob
treatment for Release Notes, but pointing at the Morpheus Enterprise docId range.

For each bundle this script:
  1. GETs /hpesc/public/api/document/{docId}        → abstract HTML
  2. GETs /hpesc/public/api/document/{docId}/toc    → page tree (or 404 for single-doc)
  3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.

QuickSpecs is a special case: lives at www.hpe.com (not support.hpe.com), gets the
html-file mode and is scraped via curl_cffi (see scrape/quickspecs.py).
"""
from __future__ import annotations

import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Base URL of the DocPortal document API; "/{docId}" and "/{docId}/toc" are
# appended to it when fetching a bundle's abstract and page tree.
API = "https://support.hpe.com/hpesc/public/api/document"
# Template for a bundle's default source URL, used when a spec does not
# provide its own source_url.
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# Sent as the User-Agent header on every request made through _session().
UA = "morpheus-docs-mcp/0.1 (+https://git.jpaul.io/justin/morpheus-docs; admin@jpaul.io)"
# Parent of the directory containing this file (the repo root, per the
# module docstring).
ROOT = Path(__file__).resolve().parent.parent
# Default output path for the generated manifest (overridable with --out).
BUNDLES_JSON = ROOT / "bundles.json"


@dataclass
class BundleSpec:
    """Hand-declared description of one documentation bundle to discover.

    The first six fields are passed positionally by the BUNDLES table, so
    their order is part of the interface. Most values are copied verbatim
    into the generated bundles.json entry.
    """

    slug: str  # identifier emitted as the entry's "slug"
    doc_id: str  # document id interpolated into the API and source URLs
    title: str  # fallback title when the fetched abstract has no heading
    version: str | None  # version label copied into the entry
    product: str  # e.g. "User Manual", "Release Notes", "QuickSpecs"
    mode: str    # "toc", "single", or "html-file"
    platform: str | None = None  # copied into the entry as "platform"
    language: str = "en-US"  # copied into the entry as "language"
    source_url: str | None = None   # overrides the default support.hpe.com URL


# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
# range sd00006500..sd00007740 for `Morpheus Enterprise` matches in the
# abstract.
#
# Notes:
#   - Morpheus Enterprise has User Manuals dating back to 8.0.10
#     (sd00006774en_us, Sep 2025), but we only ship the 8.1.x line for
#     now. Add the 8.0.x bundles here if you need older versions in the
#     corpus.
#   - There is no dedicated Deployment Guide or Qualification Matrix for
#     Morpheus Enterprise on HPE Support — the only QM (sd00006551en_us)
#     covers HVM clusters managed by Morpheus, which lives in hvm-docs.
#   - QuickSpecs lives on www.hpe.com (not support.hpe.com) and uses the
#     html-file scrape mode with curl_cffi Chrome impersonation.
#
# Positional columns, in BundleSpec field order:
#   slug, doc_id, title, version, product, mode
BUNDLES: list[BundleSpec] = [
    BundleSpec("morpheus_user_manual_8_1_0",   "sd00007510en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.0", "User Manual",   "toc"),
    BundleSpec("morpheus_user_manual_8_1_1",   "sd00007621en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.1", "User Manual",   "toc"),
    BundleSpec("morpheus_user_manual_8_1_2",   "sd00007732en_us", "HPE Morpheus Enterprise Software Documentation", "8.1.2", "User Manual",   "toc"),
    BundleSpec("morpheus_release_notes_8_1_0", "sd00007496en_us", "HPE Morpheus Enterprise Software Release Notes",  "8.1.0", "Release Notes", "single"),
    BundleSpec("morpheus_release_notes_8_1_1", "sd00007610en_us", "HPE Morpheus Enterprise Software Release Notes",  "8.1.1", "Release Notes", "single"),
    BundleSpec("morpheus_release_notes_8_1_2", "sd00007733en_us", "HPE Morpheus Enterprise Software Release Notes",  "8.1.2", "Release Notes", "single"),
    BundleSpec("morpheus_quickspecs",          "a50009231enw",    "HPE Morpheus Enterprise Software QuickSpecs",
               "v1", "QuickSpecs", "html-file",
               source_url="https://www.hpe.com/psnow/doc/a50009231enw"),
]


def _session() -> requests.Session:
    """Create a requests session preloaded with this scraper's headers.

    Returns:
        A new session whose User-Agent is ``UA`` and whose Accept header
        advertises both JSON and HTML.
    """
    session = requests.Session()
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session


def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET *url*, backing off exponentially on transient HTTP statuses.

    Args:
        s: Session used to issue the request.
        url: URL to fetch.
        expect_json: When true, return the decoded JSON body instead of text.
        retries: Maximum number of attempts in total.

    Returns:
        The response text (or decoded JSON) on HTTP 200, or ``None`` on
        HTTP 404.

    Raises:
        requests.HTTPError: From ``raise_for_status()`` on any other 4xx/5xx
            status that is not in the retryable set.
        RuntimeError: When no attempt produced a 200 or 404 response.
    """
    delay = 1.0
    for attempt in range(retries):
        r = s.get(url, timeout=30)
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in (429, 500, 502, 503, 504):
            # Back off only when another attempt will follow; sleeping after
            # the final attempt merely delays the RuntimeError below.
            if attempt + 1 < retries:
                time.sleep(delay)
                delay *= 2
            continue
        # raise_for_status() only raises for 4xx/5xx; any other status falls
        # through to the next attempt without a delay.
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")


def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Count linked pages in a TOC tree and find the first GUID page.

    The tree is visited in pre-order (a node before its children, siblings
    left to right). Every node with a truthy ``topicLink`` counts as one
    page.

    Args:
        toc: Top-level TOC nodes; nested nodes live under ``"children"``.

    Returns:
        ``(page_count, landing)`` where ``landing`` is the GUID captured from
        the first link of the form ``page=GUID-….html``, or ``None`` when no
        link matches. An empty or missing TOC yields ``(0, None)``.
    """
    if not toc:
        return 0, None

    guid_pattern = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")
    total = 0
    first_guid = None

    # Explicit stack; entries are pushed reversed so they pop in document order.
    pending = list(reversed(toc))
    while pending:
        entry = pending.pop()
        href = entry.get("topicLink")
        if href:
            total += 1
            found = guid_pattern.search(href)
            if found and first_guid is None:
                first_guid = found.group(1)
        pending.extend(reversed(entry.get("children") or []))

    return total, first_guid


def _parse_abstract(html: str) -> dict[str, str]:
    """Pull title, abstract and publish date out of a document's abstract HTML.

    Args:
        html: Markup returned by the document endpoint (may be empty).

    Returns:
        A dict holding whichever of ``"title"``, ``"abstract"`` and
        ``"published"`` had a matching element; absent elements are omitted.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Result key -> CSS selector, listed in the order keys are emitted.
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    fields: dict[str, str] = {}
    for key, css in selectors:
        element = soup.select_one(css)
        if not element:
            continue
        text = element.get_text(" ", strip=True)
        if key == "published":
            # Drop the label so only the date text remains.
            text = text.replace("Published:", "").strip()
        fields[key] = text
    return fields


def _bundle_record(
    spec: BundleSpec,
    *,
    title: str,
    page_count: int,
    mode: str,
    abstract: str,
    dates: dict[str, str],
    landing_page: str | None,
    source_url: str,
) -> dict[str, Any]:
    """Assemble one bundles.json entry; key order here is the serialized order."""
    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": abstract,
        "dates": dates,
        "landing_page": landing_page,
        "source_url": source_url,
    }


def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json entry for one declared bundle.

    ``html-file`` bundles are described purely from *spec* with no network
    access. For every other mode the document abstract is fetched, and for
    ``toc`` bundles the page tree is fetched and counted as well. A ``toc``
    bundle whose tree has no linked pages is reported as ``single``.

    *spec* is never modified: the fallback mode is reflected only in the
    returned entry, so the shared BUNDLES declarations stay as written.

    Args:
        s: Session used for the DocPortal requests.
        spec: Declaration of the bundle to discover.

    Returns:
        The entry dict for this bundle.

    Raises:
        Whatever ``_get`` raises for a failed request.
    """
    # html-file bundles are static fixtures or live-fetched outside support.hpe.com.
    if spec.mode == "html-file":
        return _bundle_record(
            spec,
            title=spec.title,
            page_count=1,
            mode="html-file",
            abstract="",
            dates={},
            landing_page=spec.doc_id,
            source_url=spec.source_url or f"https://www.hpe.com/psnow/doc/{spec.doc_id}",
        )

    # A 404 on the abstract yields None; parse an empty document instead.
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    meta = _parse_abstract(abstract_html or "")

    # Track the effective mode locally rather than writing it back to spec.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        page_count, landing = _count_toc(toc)
        if page_count == 0:
            print(f"  ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
            page_count, landing = 1, spec.doc_id

    return _bundle_record(
        spec,
        title=meta.get("title") or spec.title,
        page_count=page_count,
        mode=mode,
        abstract=meta.get("abstract", ""),
        dates={"Published": meta.get("published", "")},
        landing_page=landing,
        source_url=spec.source_url or DOC_URL.format(doc_id=spec.doc_id),
    )


def main() -> int:
    """Discover every declared bundle and write the manifest as JSON.

    Accepts ``--out`` to override the destination path (defaults to
    ``BUNDLES_JSON``). Progress and the final summary go to stderr.

    Returns:
        ``0``, used as the process exit status.
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    destination = parser.parse_args().out

    session = _session()
    records: list[dict[str, Any]] = []
    for bundle in BUNDLES:
        print(f"  • {bundle.slug} ({bundle.doc_id}) ...", file=sys.stderr)
        records.append(discover_bundle(session, bundle))

    payload = json.dumps(records, indent=2) + "\n"
    Path(destination).write_text(payload)

    total_pages = sum(record["page_count"] for record in records)
    print(f"wrote {destination}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())