a0727da8da
Two new bundles:
* hvm_qualification_matrix (sd00006551en_us) — the "Qualification Matrix
for HVM Clusters Managed by HPE Morpheus Software". Single TOC bundle,
2 pages (parent + content). The content page is ~100 KB of HTML
containing five tables: Server Hardware Support, Storage Hardware
Support, Independent Software Vendor (ISV) Support, Hypervisor OS
Compatibility and Interoperability Matrix, and Guest OS. Scraped via
the same /hpesc/public/api/document/{docId}/render endpoint as every
other bundle on support.hpe.com — the API returns server-rendered
DITA HTML, so no JS/SPA shenanigans.
* hvm_quickspecs (a50004260enw) — HPE Morpheus VM Essentials Software
QuickSpecs, Version 4 (02-Feb-2026). SKUs: S5Q81AAE (1-yr per Socket
E-LTU), S5Q82AAE (3-yr), S5Q83AAE (5-yr); each includes Tech Care
Essentials. QuickSpecs lives at www.hpe.com (not support.hpe.com),
which drops connections at the edge for non-browser TLS fingerprints —
verified 2026-05-22 against curl, wget, urllib, and Anthropic's
WebFetch (all = 0 bytes / connection timeout in headers). Bypassed
here via curl_cffi impersonating Chrome 120's JA3/JA4 fingerprint.
HTTP 200, 255 KB on first try, all four sections + all three SKUs
cleanly parseable from the server-rendered HTML.
New module scrape/quickspecs.py drives the live fetch + parse for any
hvm_*_quickspecs bundle. CSS selectors taken from the captured DOM:
.lr-right-rail hpe-highlights-container .collateral-content
— one block per H3 section
h3.txto-title — section title
div.txto-description — section body
uc-table.uc-table-polaris — SKU and version-history tables
On any live failure the parser falls back to a committed HTML fixture
at scrape/quickspecs/<doc_id>.html so the build never breaks on a
transient edge hiccup.
scrape/runner.py learned a new mode "html-file" that dispatches to
scrape.quickspecs; bundles.py extended with an optional source_url on
BundleSpec for cases where the page lives outside support.hpe.com.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
195 lines
7.3 KiB
Python
195 lines
7.3 KiB
Python
"""Scrape HPE QuickSpecs collateral pages into corpus markdown.
|
|
|
|
HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral.<doc_id>.html`
|
|
with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the
|
|
captured DOM). The blocker for automated scraping is `www.hpe.com`'s
|
|
edge bot defense, which drops connections from non-browser TLS
|
|
fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here
|
|
by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint.
|
|
|
|
Content extraction uses these stable CSS selectors found in the page:
|
|
|
|
.lr-right-rail hpe-highlights-container .collateral-content
|
|
— one per section ("Overview", "Standard Features", etc.)
|
|
h3.txto-title — section title
|
|
div.txto-description — section body
|
|
uc-table.uc-table-polaris — SKU / version-history tables
|
|
|
|
A committed HTML fixture at `scrape/quickspecs/<doc_id>.html` is used
|
|
as a fallback when the live fetch fails (HPE edge churn, network
|
|
issues). Keeping a current fixture in the repo also makes diffing
|
|
QuickSpecs revisions easy.
|
|
|
|
Usage (called by scrape.runner for bundles with mode="html-file"):
|
|
|
|
python -m scrape.quickspecs a50004260enw
|
|
|
|
Or programmatically:
|
|
|
|
from scrape.quickspecs import scrape_quickspecs
|
|
scrape_quickspecs("a50004260enw", bundle_id="hvm_quickspecs", title="...")
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from bs4 import BeautifulSoup, NavigableString
|
|
from markdownify import markdownify as md
|
|
|
|
# Module-level logger; handlers/levels are configured by the caller (see main()).
log = logging.getLogger(__name__)

# Repository root: this file's directory's parent.
ROOT = Path(__file__).resolve().parent.parent
# Directory holding the committed HTML fixtures, one `<doc_id>.html` per document.
SOURCE_DIR = ROOT.joinpath("scrape", "quickspecs")
# Directory under which per-bundle corpus output is written.
CORPUS_DIR = ROOT.joinpath("corpus")

# URL template for a QuickSpecs collateral page; fill in with `.format(doc_id=...)`.
COLLATERAL_URL = (
    "https://www.hpe.com/us/en/collaterals/"
    "collateral.{doc_id}.html"
)
|
|
|
|
|
|
def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None:
    """Download the collateral page for *doc_id* from the live site.

    The request is issued through ``curl_cffi`` with
    ``impersonate="chrome120"`` so that it presents a Chrome 120 TLS
    fingerprint. ``curl_cffi`` is imported lazily so the module stays
    importable when the package is not installed.

    Args:
        doc_id: QuickSpecs document id, substituted into ``COLLATERAL_URL``.
        timeout: Request timeout passed through to ``curl_cffi``.

    Returns:
        The response text when the server answers HTTP 200 with a
        non-empty body; ``None`` in every other case (``curl_cffi``
        missing, non-200 status, empty body, or any exception raised
        while fetching). Failures are logged as warnings, never raised.
    """
    try:
        from curl_cffi import requests as browser_http
    except ImportError:
        log.warning("curl_cffi not installed; can't fetch QuickSpecs live")
        return None

    try:
        response = browser_http.get(
            COLLATERAL_URL.format(doc_id=doc_id),
            impersonate="chrome120",
            timeout=timeout,
        )
        if response.status_code == 200 and response.text:
            return response.text
        # Reached the server but got nothing usable: record status and size.
        log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, response.status_code, len(response.text or ""))
        return None
    except Exception as exc:
        # Deliberately broad: any live failure must degrade to "no result"
        # so the caller can fall back to the committed fixture.
        log.warning("QuickSpecs %s live fetch failed: %s", doc_id, exc)
        return None
|
|
|
|
|
|
def fetch_fixture(doc_id: str) -> str | None:
    """Read the committed HTML fixture for *doc_id*, used as a fallback.

    Args:
        doc_id: QuickSpecs document id; the fixture is looked up at
            ``SOURCE_DIR / "<doc_id>.html"``.

    Returns:
        The fixture's contents, or ``None`` when no such file exists.
    """
    fixture_path = SOURCE_DIR / f"{doc_id}.html"
    if not fixture_path.exists():
        return None
    # Decode explicitly as UTF-8: without an encoding argument read_text()
    # uses the platform locale, which can mis-decode (or fail on) the
    # non-ASCII characters in the page on hosts whose default is not UTF-8.
    return fixture_path.read_text(encoding="utf-8")
|
|
|
|
|
|
def _extract_content_blocks(html: str) -> list[str]:
    """Return the HTML of each content section, in document order.

    Three strategies are tried in turn, and the first that yields
    something wins:

    1. Live format: every ``.collateral-content`` element found under
       ``.lr-right-rail``, one list entry per element.
    2. Fixture format: the single ``.quickspecs-content`` wrapper.
    3. Last resort: the whole ``<body>`` (or the whole document when
       there is no ``<body>``).

    Args:
        html: Full page HTML.

    Returns:
        A non-empty list of HTML strings.
    """
    soup = BeautifulSoup(html, "html.parser")

    rail = soup.select_one(".lr-right-rail")
    if rail is not None:
        sections = rail.select(".collateral-content")
        # A rail with no content blocks must not short-circuit to an empty
        # result (which would produce an empty corpus page); fall through
        # to the remaining strategies instead.
        if sections:
            return [str(section) for section in sections]

    wrapper = soup.select_one(".quickspecs-content")
    if wrapper is not None:
        return [str(wrapper)]

    return [str(soup.body or soup)]
|
|
|
|
|
|
def parse_html(html: str) -> str:
    """Render a QuickSpecs HTML page as tidy markdown.

    Only the content blocks returned by ``_extract_content_blocks`` are
    converted, which keeps page chrome out of the result. Within each
    block, left-rail anchor placeholders and a fixed set of widget /
    non-content elements are removed before the block is handed to
    markdownify.

    Args:
        html: Full page HTML.

    Returns:
        Markdown with trailing whitespace removed from every line, runs
        of blank lines collapsed to a single blank line, no leading or
        trailing whitespace, and exactly one terminating newline.
    """
    # Elements dropped wholesale if they show up inside a content block.
    noise_selectors = (
        "esl-share", "hpe-recommendations", "hpe-sticky-bar",
        "esl-scrollbar", "esl-trigger", "video-overlay",
        "generic-modal-loader", "style", "script",
    )

    rendered: list[str] = []
    for fragment in _extract_content_blocks(html):
        tree = BeautifulSoup(fragment, "html.parser")
        # Anchor placeholders would otherwise become noisy links.
        for anchor in tree.select('[hpe-left-rail-anchor]'):
            anchor.decompose()
        for selector in noise_selectors:
            for node in tree.select(selector):
                node.decompose()
        rendered.append(
            md(str(tree), heading_style="ATX", bullets="-", strip=["span", "div"])
        )

    # Single pass: right-strip each line and keep at most one blank line
    # in a row.
    kept: list[str] = []
    for raw_line in "\n\n".join(rendered).splitlines():
        line = raw_line.rstrip()
        if line or not kept or kept[-1]:
            kept.append(line)
    return "\n".join(kept).strip() + "\n"
|
|
|
|
|
|
def scrape_quickspecs(doc_id: str, bundle_id: str, title: str,
                      version: str | None = None,
                      product: str = "QuickSpecs",
                      source_url: str | None = None,
                      force: bool = False) -> bool:
    """Fetch one QuickSpecs document, convert it, and write corpus files.

    The page is fetched live first; if that yields nothing, the
    committed fixture is used instead. The parsed markdown is written to
    ``CORPUS_DIR/<bundle_id>/<doc_id>.md`` alongside a JSON sidecar
    ``<doc_id>.json`` describing the page.

    Args:
        doc_id: QuickSpecs document id (also used as the page id and
            output file stem).
        bundle_id: Name of the bundle directory under ``CORPUS_DIR``.
        title: Page title recorded in the sidecar.
        version: Optional version string recorded in the sidecar.
        product: Product label recorded in the sidecar.
        source_url: URL recorded in the sidecar; when omitted,
            ``https://www.hpe.com/psnow/doc/<doc_id>`` is recorded.
        force: Re-fetch and overwrite even when both output files
            already exist.

    Returns:
        ``True`` if the files were written. ``False`` if the work was
        skipped because both files already exist and ``force`` is not
        set, or if neither a live response nor a fixture was available.
    """
    bundle_dir = CORPUS_DIR / bundle_id
    md_path = bundle_dir / f"{doc_id}.md"
    json_path = bundle_dir / f"{doc_id}.json"
    if not force and md_path.exists() and json_path.exists():
        log.info("  %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id)
        return False

    html = fetch_live(doc_id)
    fetched_from = "live"
    if html is None:
        html = fetch_fixture(doc_id)
        fetched_from = "fixture"
    if html is None:
        log.error("QuickSpecs %s: no live response and no fixture at %s",
                  doc_id, SOURCE_DIR / f"{doc_id}.html")
        return False

    body_md = parse_html(html)
    bundle_dir.mkdir(parents=True, exist_ok=True)
    # Write as UTF-8 explicitly: the default is the platform locale, which
    # can raise UnicodeEncodeError or produce non-UTF-8 corpus files when
    # the markdown contains non-ASCII characters.
    md_path.write_text(body_md, encoding="utf-8")
    sidecar = {
        "bundle_id": bundle_id,
        "page_id": doc_id,
        "title": title,
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": version,
        "product": product,
        "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}",
        "fetched_from": fetched_from,
    }
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    log.info("  %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from)
    return True
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point: scrape a single QuickSpecs document.

    Configures message-only INFO logging, parses the command line, and
    delegates to ``scrape_quickspecs``.

    Returns:
        Process exit status: 0 when ``scrape_quickspecs`` reports that
        files were written, 1 otherwise.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument("doc_id", help="QuickSpecs document id, e.g. a50004260enw")
    parser.add_argument("--bundle-id", default="hvm_quickspecs")
    parser.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs")
    parser.add_argument("--version", default=None)
    parser.add_argument("--force", action="store_true")
    opts = parser.parse_args()

    written = scrape_quickspecs(
        opts.doc_id,
        opts.bundle_id,
        opts.title,
        opts.version,
        force=opts.force,
    )
    if written:
        return 0
    return 1
|
|
|
|
|
|
# Script entry: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|