# hvm-docs/scrape/quickspecs.py

"""Scrape HPE QuickSpecs collateral pages into corpus markdown.

HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral.<doc_id>.html`
with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the
captured DOM). The blocker for automated scraping is `www.hpe.com`'s
edge bot defense, which drops connections from non-browser TLS
fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here
by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint.

Content extraction uses these stable CSS selectors found in the page:

  .lr-right-rail hpe-highlights-container .collateral-content
       — one per section ("Overview", "Standard Features", etc.)
  h3.txto-title          — section title
  div.txto-description   — section body
  uc-table.uc-table-polaris   — SKU / version-history tables

A committed HTML fixture at `scrape/quickspecs/<doc_id>.html` is used
as a fallback when the live fetch fails (HPE edge churn, network
issues). Keeping a current fixture in the repo also makes diffing
QuickSpecs revisions easy.

Usage (called by scrape.runner for bundles with mode="quickspecs"):

    python -m scrape.quickspecs a50004260enw

Or programmatically:

    from scrape.quickspecs import scrape_quickspecs
    scrape_quickspecs("a50004260enw", bundle_id="hvm_quickspecs", title="...")
"""
from __future__ import annotations

import argparse
import json
import logging
import sys
from pathlib import Path

from bs4 import BeautifulSoup, NavigableString
from markdownify import markdownify as md

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parent.parent
# Where the committed `<doc_id>.html` fixtures are looked up.
SOURCE_DIR = ROOT.joinpath("scrape", "quickspecs")
# Where generated markdown and JSON sidecars are written.
CORPUS_DIR = ROOT.joinpath("corpus")

# Template for the live collateral page; fill in with `.format(doc_id=...)`.
COLLATERAL_URL = "https://www.hpe.com/us/en/collaterals/collateral.{doc_id}.html"


def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None:
    """Download the live collateral page for *doc_id*.

    The request is made through ``curl_cffi`` with
    ``impersonate="chrome120"``. Every failure is logged as a warning and
    reported as ``None``: ``curl_cffi`` not importable, a status other
    than 200, an empty body, or any exception raised during the request.
    On success the response text is returned.
    """
    # Imported lazily so the module still loads when curl_cffi is absent.
    try:
        from curl_cffi import requests as browser_http
    except ImportError:
        log.warning("curl_cffi not installed; can't fetch QuickSpecs live")
        return None

    try:
        url = COLLATERAL_URL.format(doc_id=doc_id)
        resp = browser_http.get(url, impersonate="chrome120", timeout=timeout)
        body = resp.text
        if resp.status_code == 200 and body:
            return body
        log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, resp.status_code, len(body or ""))
        return None
    except Exception as err:
        # Best-effort fetch: the caller falls back to another source on None.
        log.warning("QuickSpecs %s live fetch failed: %s", doc_id, err)
        return None


def fetch_fixture(doc_id: str) -> str | None:
    """Read the committed HTML fixture for *doc_id* as a fallback source.

    Looks for ``SOURCE_DIR/<doc_id>.html``.

    Returns:
        The file contents, or ``None`` when no such file exists.
    """
    fixture_path = SOURCE_DIR / f"{doc_id}.html"
    if not fixture_path.exists():
        return None
    # Decode explicitly as UTF-8: the default is the platform's locale
    # encoding, which would make the result depend on the host it runs on.
    return fixture_path.read_text(encoding="utf-8")


def _extract_content_blocks(html: str) -> list[str]:
    """Return the HTML of each content section, in document order.

    Three layouts are tried and the first match wins:

    1. Live page: a ``.lr-right-rail`` element exists. Every
       ``.collateral-content`` element beneath it becomes one entry
       (an empty list if there are none).
    2. Fixture: a ``.quickspecs-content`` wrapper exists and becomes the
       single entry.
    3. Otherwise: the ``<body>`` element, or the whole parsed document
       when there is no body, becomes the single entry.
    """
    doc = BeautifulSoup(html, "html.parser")

    live_rail = doc.select_one(".lr-right-rail")
    if live_rail is not None:
        return list(map(str, live_rail.select(".collateral-content")))

    fixture_wrapper = doc.select_one(".quickspecs-content")
    if fixture_wrapper is not None:
        return [str(fixture_wrapper)]

    # Neither known layout matched; hand back everything we have.
    return [str(doc.body or doc)]


def parse_html(html: str) -> str:
    """Render QuickSpecs HTML as markdown.

    Only the blocks returned by ``_extract_content_blocks`` are converted.
    Within each block, elements carrying the ``hpe-left-rail-anchor``
    attribute and a fixed set of widget/style/script tags are removed
    before the block is passed to markdownify. The result has trailing
    whitespace removed from every line, at most one blank line in a row,
    no leading or trailing whitespace, and exactly one final newline.
    """
    noise_selectors = (
        "esl-share", "hpe-recommendations", "hpe-sticky-bar",
        "esl-scrollbar", "esl-trigger", "video-overlay",
        "generic-modal-loader", "style", "script",
    )

    rendered: list[str] = []
    for section_html in _extract_content_blocks(html):
        section = BeautifulSoup(section_html, "html.parser")
        # Anchor placeholders would otherwise come out as noisy links.
        for placeholder in section.select('[hpe-left-rail-anchor]'):
            placeholder.decompose()
        # Widgets that may have leaked into a content block.
        for selector in noise_selectors:
            for widget in section.select(selector):
                widget.decompose()
        markdown = md(str(section), heading_style="ATX", bullets="-",
                      strip=["span", "div"])
        rendered.append(markdown)

    # Single pass: right-trim each line and keep a blank line only when
    # the previously kept line was not blank.
    kept: list[str] = []
    for raw_line in "\n\n".join(rendered).splitlines():
        line = raw_line.rstrip()
        if line or not kept or kept[-1]:
            kept.append(line)
    return "\n".join(kept).strip() + "\n"


def scrape_quickspecs(doc_id: str, bundle_id: str, title: str,
                     version: str | None = None,
                     product: str = "QuickSpecs",
                     source_url: str | None = None,
                     force: bool = False) -> bool:
    """Live-fetch (or fall back to fixture), parse, write corpus files.

    Writes ``CORPUS_DIR/<bundle_id>/<doc_id>.md`` (the markdown body) and
    ``CORPUS_DIR/<bundle_id>/<doc_id>.json`` (a metadata sidecar), both
    encoded as UTF-8.

    Args:
        doc_id: QuickSpecs document id; also used as the output file stem
            and as the sidecar's ``page_id`` and ``doc_id``.
        bundle_id: Name of the corpus sub-directory to write into.
        title: Stored in the sidecar as ``title``.
        version: Stored in the sidecar as ``version``.
        product: Stored in the sidecar as ``product``.
        source_url: Stored in the sidecar as ``source_url``; when falsy,
            ``https://www.hpe.com/psnow/doc/<doc_id>`` is used instead.
        force: When true, re-scrape even if both output files exist.

    Returns:
        True if the files were written. False if the scrape was skipped
        because both files already exist and ``force`` is not set, or if
        neither the live fetch nor the fixture produced any HTML.
    """
    bundle_dir = CORPUS_DIR / bundle_id
    md_path = bundle_dir / f"{doc_id}.md"
    json_path = bundle_dir / f"{doc_id}.json"
    if not force and md_path.exists() and json_path.exists():
        log.info("  %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id)
        return False

    # Prefer the live page; fall back to the committed fixture.
    html = fetch_live(doc_id)
    fetched_from = "live"
    if html is None:
        html = fetch_fixture(doc_id)
        fetched_from = "fixture"
    if html is None:
        log.error("QuickSpecs %s: no live response and no fixture at %s",
                  doc_id, SOURCE_DIR / f"{doc_id}.html")
        return False

    body_md = parse_html(html)
    bundle_dir.mkdir(parents=True, exist_ok=True)
    # Write with an explicit encoding: the default is the platform's locale
    # encoding, which can fail on (or mis-encode) non-ASCII characters in
    # the markdown and makes the corpus bytes depend on the host.
    md_path.write_text(body_md, encoding="utf-8")
    sidecar = {
        "bundle_id": bundle_id,
        "page_id": doc_id,
        "title": title,
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": version,
        "product": product,
        "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}",
        "fetched_from": fetched_from,
    }
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    log.info("  %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from)
    return True


def main() -> int:
    """Command-line entry point: parse arguments and run one scrape.

    Configures root logging at INFO with a message-only format, then
    calls ``scrape_quickspecs`` with the parsed arguments.

    Returns:
        0 when ``scrape_quickspecs`` returns True, 1 for any other outcome.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument("doc_id", help="QuickSpecs document id, e.g. a50004260enw")
    parser.add_argument("--bundle-id", default="hvm_quickspecs")
    parser.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs")
    parser.add_argument("--version", default=None)
    parser.add_argument("--force", action="store_true")
    opts = parser.parse_args()

    written = scrape_quickspecs(
        opts.doc_id,
        opts.bundle_id,
        opts.title,
        opts.version,
        force=opts.force,
    )
    if written:
        return 0
    return 1


if __name__ == "__main__":
    sys.exit(main())