"""Scrape HPE QuickSpecs collateral pages into corpus markdown. HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral..html` with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the captured DOM). The blocker for automated scraping is `www.hpe.com`'s edge bot defense, which drops connections from non-browser TLS fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint. Content extraction uses these stable CSS selectors found in the page: .lr-right-rail hpe-highlights-container .collateral-content — one per section ("Overview", "Standard Features", etc.) h3.txto-title — section title div.txto-description — section body uc-table.uc-table-polaris — SKU / version-history tables A committed HTML fixture at `scrape/quickspecs/.html` is used as a fallback when the live fetch fails (HPE edge churn, network issues). Keeping a current fixture in the repo also makes diffing QuickSpecs revisions easy. Usage (called by scrape.runner for bundles with mode="quickspecs"): python -m scrape.quickspecs a50004260enw Or programmatically: from scrape.quickspecs import scrape_quickspecs scrape_quickspecs("a50004260enw", bundle_id="hvm_quickspecs", title="...") """ from __future__ import annotations import argparse import json import logging import sys from pathlib import Path from bs4 import BeautifulSoup, NavigableString from markdownify import markdownify as md log = logging.getLogger(__name__) ROOT = Path(__file__).resolve().parent.parent SOURCE_DIR = ROOT / "scrape" / "quickspecs" CORPUS_DIR = ROOT / "corpus" COLLATERAL_URL = "https://www.hpe.com/us/en/collaterals/collateral.{doc_id}.html" def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None: """GET the collateral page via curl_cffi (Chrome 120 TLS fingerprint). Returns the HTML body on success, None on any failure.""" try: from curl_cffi import requests as cc except ImportError: log.warning("curl_cffi not installed; can't fetch QuickSpecs live") return None try: r = cc.get(COLLATERAL_URL.format(doc_id=doc_id), impersonate="chrome120", timeout=timeout) if r.status_code != 200 or not r.text: log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, r.status_code, len(r.text or "")) return None return r.text except Exception as e: log.warning("QuickSpecs %s live fetch failed: %s", doc_id, e) return None def fetch_fixture(doc_id: str) -> str | None: """Read the committed HTML fixture as fallback.""" p = SOURCE_DIR / f"{doc_id}.html" if not p.exists(): return None return p.read_text() def _extract_content_blocks(html: str) -> list[str]: """Pull each section block (.collateral-content under .lr-right-rail). The fixture format (just .quickspecs-content wrapper) and the live format (.lr-right-rail with nested hpe-highlights-container) are both supported. Returns a list of section HTML strings, in document order. """ soup = BeautifulSoup(html, "html.parser") # Live format: each under .lr-right-rail has # one or more .collateral-content blocks; concat them. rail = soup.select_one(".lr-right-rail") if rail is not None: blocks = rail.select(".collateral-content") return [str(b) for b in blocks] # Fixture format: a single wrapper holding all the H2/H3 sections. wrapper = soup.select_one(".quickspecs-content") if wrapper is not None: return [str(wrapper)] # Last-resort: whole body. body = soup.body or soup return [str(body)] def parse_html(html: str) -> str: """Convert QuickSpecs HTML to clean markdown. Filters out the page chrome (nav, footer, recommendations carousel, cookie banner, analytics blobs) by extracting only the content blocks, then runs markdownify.""" blocks = _extract_content_blocks(html) chunks: list[str] = [] for block in blocks: soup = BeautifulSoup(block, "html.parser") # Drop anchor placeholders that markdownify turns into noisy links for a in soup.select('[hpe-left-rail-anchor]'): a.decompose() # Drop carousel / share / recommendation widgets if any leaked in. for sel in ("esl-share", "hpe-recommendations", "hpe-sticky-bar", "esl-scrollbar", "esl-trigger", "video-overlay", "generic-modal-loader", "style", "script"): for el in soup.select(sel): el.decompose() chunks.append(md(str(soup), heading_style="ATX", bullets="-", strip=["span", "div"])) text = "\n\n".join(chunks) # Collapse runs of blank lines markdownify likes to emit. text = "\n".join(line.rstrip() for line in text.splitlines()) while "\n\n\n" in text: text = text.replace("\n\n\n", "\n\n") return text.strip() + "\n" def scrape_quickspecs(doc_id: str, bundle_id: str, title: str, version: str | None = None, product: str = "QuickSpecs", source_url: str | None = None, force: bool = False) -> bool: """Live-fetch (or fall back to fixture), parse, write corpus files. Returns True if files were written, False if skipped (already exists and --force not set).""" bundle_dir = CORPUS_DIR / bundle_id md_path = bundle_dir / f"{doc_id}.md" json_path = bundle_dir / f"{doc_id}.json" if not force and md_path.exists() and json_path.exists(): log.info(" %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id) return False html = fetch_live(doc_id) fetched_from = "live" if html is None: html = fetch_fixture(doc_id) fetched_from = "fixture" if html is None: log.error("QuickSpecs %s: no live response and no fixture at %s", doc_id, SOURCE_DIR / f"{doc_id}.html") return False body_md = parse_html(html) bundle_dir.mkdir(parents=True, exist_ok=True) md_path.write_text(body_md) sidecar = { "bundle_id": bundle_id, "page_id": doc_id, "title": title, "ordinal": 1, "parent_title": None, "doc_id": doc_id, "version": version, "product": product, "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}", "fetched_from": fetched_from, } json_path.write_text(json.dumps(sidecar, indent=2) + "\n") log.info(" %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from) return True def main() -> int: logging.basicConfig(level=logging.INFO, format="%(message)s") p = argparse.ArgumentParser() p.add_argument("doc_id", help="QuickSpecs document id, e.g. a50004260enw") p.add_argument("--bundle-id", default="hvm_quickspecs") p.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs") p.add_argument("--version", default=None) p.add_argument("--force", action="store_true") args = p.parse_args() ok = scrape_quickspecs(args.doc_id, args.bundle_id, args.title, args.version, force=args.force) return 0 if ok else 1 if __name__ == "__main__": sys.exit(main())