scrape: add Qualification Matrix + QuickSpecs bundles (live curl_cffi for HPE www)
Two new bundles:
* hvm_qualification_matrix (sd00006551en_us) — the "Qualification Matrix
for HVM Clusters Managed by HPE Morpheus Software". Single TOC bundle,
2 pages (parent + content). The content page is ~100 KB of HTML
containing five tables: Server Hardware Support, Storage Hardware
Support, Independent Software Vendor (ISV) Support, Hypervisor OS
Compatibility and Interoperability Matrix, and Guest OS. Scraped via
the same /hpesc/public/api/document/{docId}/render endpoint as every
other bundle on support.hpe.com — the API returns server-rendered
DITA HTML, so no JS/SPA shenanigans.
* hvm_quickspecs (a50004260enw) — HPE Morpheus VM Essentials Software
QuickSpecs, Version 4 (02-Feb-2026). SKUs: S5Q81AAE (1-yr per Socket
E-LTU), S5Q82AAE (3-yr), S5Q83AAE (5-yr); each includes Tech Care
Essentials. QuickSpecs lives at www.hpe.com (not support.hpe.com),
which drops connections at the edge for non-browser TLS fingerprints —
verified 2026-05-22 against curl, wget, urllib, and Anthropic's
WebFetch (all returned 0 bytes or timed out waiting for response headers). Bypassed
here via curl_cffi impersonating Chrome 120's JA3/JA4 fingerprint.
HTTP 200, 255 KB on first try, all four sections + all three SKUs
cleanly parseable from the server-rendered HTML.
New module scrape/quickspecs.py drives the live fetch + parse for any
hvm_*_quickspecs bundle. CSS selectors taken from the captured DOM:
.lr-right-rail hpe-highlights-container .collateral-content
— one block per H3 section
h3.txto-title — section title
div.txto-description — section body
uc-table.uc-table-polaris — SKU and version-history tables
On any live failure the parser falls back to a committed HTML fixture
at scrape/quickspecs/<doc_id>.html so the build never breaks on a
transient edge hiccup.
scrape/runner.py learned a new mode "html-file" that dispatches to
scrape.quickspecs; bundles.py extended with an optional source_url on
BundleSpec for cases where the page lives outside support.hpe.com.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,194 @@
|
||||
"""Scrape HPE QuickSpecs collateral pages into corpus markdown.
|
||||
|
||||
HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral.<doc_id>.html`
|
||||
with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the
|
||||
captured DOM). The blocker for automated scraping is `www.hpe.com`'s
|
||||
edge bot defense, which drops connections from non-browser TLS
|
||||
fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here
|
||||
by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint.
|
||||
|
||||
Content extraction uses these stable CSS selectors found in the page:
|
||||
|
||||
.lr-right-rail hpe-highlights-container .collateral-content
|
||||
— one per section ("Overview", "Standard Features", etc.)
|
||||
h3.txto-title — section title
|
||||
div.txto-description — section body
|
||||
uc-table.uc-table-polaris — SKU / version-history tables
|
||||
|
||||
A committed HTML fixture at `scrape/quickspecs/<doc_id>.html` is used
|
||||
as a fallback when the live fetch fails (HPE edge churn, network
|
||||
issues). Keeping a current fixture in the repo also makes diffing
|
||||
QuickSpecs revisions easy.
|
||||
|
||||
Usage (called by scrape.runner for bundles with mode="html-file"):
|
||||
|
||||
python -m scrape.quickspecs a50004260enw
|
||||
|
||||
Or programmatically:
|
||||
|
||||
from scrape.quickspecs import scrape_quickspecs
|
||||
scrape_quickspecs("a50004260enw", bundle_id="hvm_quickspecs", title="...")
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from markdownify import markdownify as md
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Repository root: the directory two levels above this file.
ROOT = Path(__file__).resolve().parent.parent
# Directory holding the committed HTML fixtures (one `<doc_id>.html` each).
SOURCE_DIR = ROOT.joinpath("scrape", "quickspecs")
# Directory under which per-bundle corpus output is written.
CORPUS_DIR = ROOT.joinpath("corpus")

# URL template for a QuickSpecs collateral page; fill in `doc_id`.
COLLATERAL_URL = (
    "https://www.hpe.com/us/en/collaterals/"
    "collateral.{doc_id}.html"
)
|
||||
|
||||
|
||||
def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None:
    """Download the QuickSpecs collateral page for *doc_id*.

    The request is issued through ``curl_cffi`` with
    ``impersonate="chrome120"`` so it presents a browser-like TLS
    fingerprint. ``curl_cffi`` is imported lazily so the module still
    loads when the package is absent.

    Returns the response text when the status is 200 and the body is
    non-empty. In every other case (``curl_cffi`` missing, non-200
    status, empty body, or any exception raised by the request) a warning
    is logged and ``None`` is returned, letting the caller fall back to
    the committed fixture.
    """
    try:
        from curl_cffi import requests as cc
    except ImportError:
        log.warning("curl_cffi not installed; can't fetch QuickSpecs live")
        return None

    try:
        url = COLLATERAL_URL.format(doc_id=doc_id)
        response = cc.get(url, impersonate="chrome120", timeout=timeout)
        body = response.text
        if response.status_code == 200 and body:
            return body
        log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, response.status_code, len(body or ""))
    except Exception as exc:
        # Deliberately broad: any transport/TLS/decoding problem must
        # degrade to the fixture fallback rather than abort the build.
        log.warning("QuickSpecs %s live fetch failed: %s", doc_id, exc)
    return None
|
||||
|
||||
|
||||
def fetch_fixture(doc_id: str) -> str | None:
    """Return the committed fallback HTML for *doc_id*, or ``None``.

    The fixture is looked up at ``SOURCE_DIR / "<doc_id>.html"``. When no
    such path exists, ``None`` is returned so the caller can report that
    neither a live response nor a fixture is available.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's locale encoding, so the same fixture parses identically on
    every machine.
    """
    fixture_path = SOURCE_DIR / f"{doc_id}.html"
    if not fixture_path.exists():
        return None
    return fixture_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def _extract_content_blocks(html: str) -> list[str]:
    """Return the HTML of each content section, in document order.

    Three layouts are tried, most specific first:

    1. Live format: every ``.collateral-content`` element found beneath
       ``.lr-right-rail`` becomes one entry.
    2. Fixture format: a single ``.quickspecs-content`` wrapper holding
       all sections is returned as one entry.
    3. Last resort: the whole ``<body>`` (or the entire parsed document
       when there is no ``<body>``) is returned as one entry.

    If ``.lr-right-rail`` is present but contains no
    ``.collateral-content`` elements, the function falls through to the
    later layouts instead of returning an empty list, so the caller never
    silently produces an empty document.
    """
    soup = BeautifulSoup(html, "html.parser")

    rail = soup.select_one(".lr-right-rail")
    if rail is not None:
        sections = [str(block) for block in rail.select(".collateral-content")]
        if sections:
            return sections
        log.warning("QuickSpecs: .lr-right-rail has no .collateral-content blocks; "
                    "falling back to wrapper/body extraction")

    wrapper = soup.select_one(".quickspecs-content")
    if wrapper is not None:
        return [str(wrapper)]

    return [str(soup.body or soup)]
|
||||
|
||||
|
||||
def parse_html(html: str) -> str:
    """Render QuickSpecs HTML as tidy markdown.

    Only the content blocks returned by ``_extract_content_blocks`` are
    converted, which leaves out the surrounding page chrome. Inside each
    block, elements carrying the ``hpe-left-rail-anchor`` attribute and a
    fixed list of widget/style/script tags are removed before the block
    is passed to markdownify.

    The converted blocks are joined with a blank line, trailing
    whitespace is stripped from every line, runs of blank lines are
    collapsed to a single blank line, and the result ends with exactly
    one newline.
    """
    # Widgets and non-content tags that may leak into a content block.
    unwanted_selectors = (
        "esl-share", "hpe-recommendations", "hpe-sticky-bar",
        "esl-scrollbar", "esl-trigger", "video-overlay",
        "generic-modal-loader", "style", "script",
    )

    rendered: list[str] = []
    for fragment in _extract_content_blocks(html):
        tree = BeautifulSoup(fragment, "html.parser")

        # Anchor placeholders would otherwise become noisy markdown links.
        for anchor in tree.select('[hpe-left-rail-anchor]'):
            anchor.decompose()

        for selector in unwanted_selectors:
            for node in tree.select(selector):
                node.decompose()

        rendered.append(
            md(str(tree), heading_style="ATX", bullets="-", strip=["span", "div"])
        )

    trimmed_lines = [line.rstrip() for line in "\n\n".join(rendered).splitlines()]
    text = "\n".join(trimmed_lines)

    # markdownify tends to emit long runs of blank lines; keep at most one.
    while "\n\n\n" in text:
        text = text.replace("\n\n\n", "\n\n")

    return f"{text.strip()}\n"
|
||||
|
||||
|
||||
def scrape_quickspecs(doc_id: str, bundle_id: str, title: str,
                      version: str | None = None,
                      product: str = "QuickSpecs",
                      source_url: str | None = None,
                      force: bool = False) -> bool:
    """Fetch one QuickSpecs document, convert it, and write corpus files.

    The HTML comes from ``fetch_live`` first and from ``fetch_fixture``
    when the live fetch returns ``None``. The parsed markdown is written
    to ``CORPUS_DIR/<bundle_id>/<doc_id>.md`` together with a JSON sidecar
    ``<doc_id>.json`` holding the bundle metadata and a ``fetched_from``
    field (``"live"`` or ``"fixture"``).

    Args:
        doc_id: QuickSpecs document id; also used as the output file stem.
        bundle_id: Name of the corpus sub-directory to write into.
        title: Title recorded in the sidecar.
        version: Optional version string recorded in the sidecar.
        product: Product label recorded in the sidecar.
        source_url: URL recorded in the sidecar; defaults to
            ``https://www.hpe.com/psnow/doc/<doc_id>`` when not given.
        force: Re-scrape even when both output files already exist.

    Returns:
        ``True`` when the files were written. ``False`` when the scrape
        was skipped because both files already exist and ``force`` is not
        set, and also when neither the live fetch nor the fixture
        produced any HTML.
    """
    bundle_dir = CORPUS_DIR / bundle_id
    md_path = bundle_dir / f"{doc_id}.md"
    json_path = bundle_dir / f"{doc_id}.json"

    if not force and md_path.exists() and json_path.exists():
        log.info("  %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id)
        return False

    html = fetch_live(doc_id)
    fetched_from = "live"
    if html is None:
        html = fetch_fixture(doc_id)
        fetched_from = "fixture"
    if html is None:
        log.error("QuickSpecs %s: no live response and no fixture at %s",
                  doc_id, SOURCE_DIR / f"{doc_id}.html")
        return False

    body_md = parse_html(html)
    bundle_dir.mkdir(parents=True, exist_ok=True)
    # Write UTF-8 explicitly: the locale default could fail on, or
    # mis-encode, non-ASCII characters in the converted page.
    md_path.write_text(body_md, encoding="utf-8")

    sidecar = {
        "bundle_id": bundle_id,
        "page_id": doc_id,
        "title": title,
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": version,
        "product": product,
        "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}",
        "fetched_from": fetched_from,
    }
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")

    log.info("  %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from)
    return True
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: scrape one QuickSpecs document.

    Configures message-only logging at INFO level, parses the positional
    ``doc_id`` plus the optional ``--bundle-id``, ``--title``,
    ``--version`` and ``--force`` arguments, and hands them to
    ``scrape_quickspecs``.

    Returns 0 when ``scrape_quickspecs`` reports that files were written
    and 1 whenever it returns ``False``.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument("doc_id", help="QuickSpecs document id, e.g. a50004260enw")
    parser.add_argument("--bundle-id", default="hvm_quickspecs")
    parser.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs")
    parser.add_argument("--version", default=None)
    parser.add_argument("--force", action="store_true")
    opts = parser.parse_args()

    written = scrape_quickspecs(
        opts.doc_id,
        opts.bundle_id,
        opts.title,
        opts.version,
        force=opts.force,
    )
    if written:
        return 0
    return 1
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user