Files

195 lines
7.3 KiB
Python

"""Scrape HPE QuickSpecs collateral pages into corpus markdown.
HPE QuickSpecs live at `https://www.hpe.com/us/en/collaterals/collateral.<doc_id>.html`
with a server-rendered HTML body (confirmed 2026-05-22 by inspecting the
captured DOM). The blocker for automated scraping is `www.hpe.com`'s
edge bot defense, which drops connections from non-browser TLS
fingerprints (curl, wget, Python-urllib, even WebFetch). Bypassed here
by `curl_cffi` impersonating Chrome 120's JA3/JA4 fingerprint.
Content extraction uses these stable CSS selectors found in the page:

    .lr-right-rail hpe-highlights-container .collateral-content
        — one per section ("Overview", "Standard Features", etc.)
    h3.txto-title               — section title
    div.txto-description        — section body
    uc-table.uc-table-polaris   — SKU / version-history tables

A committed HTML fixture at `scrape/quickspecs/<doc_id>.html` is used
as a fallback when the live fetch fails (HPE edge churn, network
issues). Keeping a current fixture in the repo also makes diffing
QuickSpecs revisions easy.
Usage (called by scrape.runner for bundles with mode="quickspecs"):

    python -m scrape.quickspecs a50004260enw

Or programmatically:

    from scrape.quickspecs import scrape_quickspecs
    scrape_quickspecs("a50004260enw", bundle_id="hvm_quickspecs", title="...")
"""
from __future__ import annotations
import argparse
import json
import logging
import sys
from pathlib import Path
from bs4 import BeautifulSoup, NavigableString
from markdownify import markdownify as md
# Module-level logger; main() configures the root handler when run as a script.
log = logging.getLogger(__name__)

# Repository root: two directory levels above this file.
ROOT = Path(__file__).resolve().parent.parent

# Directory searched for committed HTML fixtures (<doc_id>.html).
SOURCE_DIR = ROOT.joinpath("scrape", "quickspecs")

# Directory under which per-bundle markdown and JSON sidecars are written.
CORPUS_DIR = ROOT.joinpath("corpus")

# URL template for a live collateral page; fill in with .format(doc_id=...).
COLLATERAL_URL = (
    "https://www.hpe.com/us/en/collaterals/"
    "collateral.{doc_id}.html"
)
def fetch_live(doc_id: str, timeout: float = 30.0) -> str | None:
    """Download the live collateral page for ``doc_id``.

    The request is made through ``curl_cffi`` with ``impersonate="chrome120"``.
    ``curl_cffi`` is imported lazily so the module still loads without it.

    Args:
        doc_id: QuickSpecs document id substituted into ``COLLATERAL_URL``.
        timeout: Timeout passed through to the HTTP request.

    Returns:
        The response text for a non-empty HTTP 200 response. ``None`` when
        ``curl_cffi`` is not installed, the response is not a non-empty 200,
        or the request raises; each of those cases is logged as a warning.
    """
    try:
        from curl_cffi import requests as cc
    except ImportError:
        log.warning("curl_cffi not installed; can't fetch QuickSpecs live")
        return None

    try:
        response = cc.get(
            COLLATERAL_URL.format(doc_id=doc_id),
            impersonate="chrome120",
            timeout=timeout,
        )
        page = response.text
        if response.status_code == 200 and page:
            return page
        log.warning("QuickSpecs %s: http=%s bytes=%d", doc_id, response.status_code, len(page or ""))
    except Exception as exc:
        # Best-effort fetch: any failure just means the caller falls back.
        log.warning("QuickSpecs %s live fetch failed: %s", doc_id, exc)
    return None
def fetch_fixture(doc_id: str) -> str | None:
    """Read the committed HTML fixture for ``doc_id`` as a fallback.

    The fixture is looked up as ``<doc_id>.html`` inside ``SOURCE_DIR``.

    Args:
        doc_id: QuickSpecs document id used to build the fixture filename.

    Returns:
        The fixture contents decoded as UTF-8, or ``None`` when the file does
        not exist.
    """
    fixture_path = SOURCE_DIR / f"{doc_id}.html"
    if not fixture_path.exists():
        return None
    # Decode explicitly as UTF-8: without an encoding argument read_text()
    # uses the platform locale, which can mis-decode or fail on non-ASCII
    # characters when the locale is not UTF-8.
    return fixture_path.read_text(encoding="utf-8")
def _extract_content_blocks(html: str) -> list[str]:
    """Return the HTML of each QuickSpecs content section, in document order.

    Layouts are tried from most to least specific:

    1. Live format: every ``.collateral-content`` element found under
       ``.lr-right-rail`` (one list entry per element).
    2. Fixture format: the single ``.quickspecs-content`` wrapper that holds
       all sections.
    3. Last resort: the whole ``<body>``, or the whole parsed document when
       there is no ``<body>``.

    A layout that matches nothing falls through to the next one, so the
    returned list always contains at least one entry.

    Args:
        html: Full page HTML (live response or fixture).

    Returns:
        Section HTML strings, serialized from the parsed tree.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Live format: content blocks nested somewhere under the right rail.
    rail = soup.select_one(".lr-right-rail")
    if rail is not None:
        blocks = rail.select(".collateral-content")
        # A rail with no content blocks is not a usable live layout; keep
        # going instead of returning [] and producing an empty document.
        if blocks:
            return [str(block) for block in blocks]

    # Fixture format: a single wrapper holding all the sections.
    wrapper = soup.select_one(".quickspecs-content")
    if wrapper is not None:
        return [str(wrapper)]

    # Last resort: whole body (or whole document if there is no <body>).
    return [str(soup.body or soup)]
def parse_html(html: str) -> str:
    """Render the content sections of a QuickSpecs page as markdown.

    Only the blocks returned by ``_extract_content_blocks`` are converted,
    which keeps page chrome (nav, footer, carousels, banners, analytics) out
    of the result. Each block is cleaned of known widget elements and then
    converted with markdownify.

    Args:
        html: Full page HTML (live response or fixture).

    Returns:
        Markdown with trailing whitespace stripped from every line, runs of
        blank lines collapsed to one, and exactly one trailing newline.
    """
    # Removed in this order before conversion: first the anchor placeholders
    # (markdownify would turn them into noisy links), then any share /
    # carousel / recommendation widgets and inline style/script that leaked
    # into a content block.
    unwanted_selectors = (
        "[hpe-left-rail-anchor]",
        "esl-share",
        "hpe-recommendations",
        "hpe-sticky-bar",
        "esl-scrollbar",
        "esl-trigger",
        "video-overlay",
        "generic-modal-loader",
        "style",
        "script",
    )

    rendered: list[str] = []
    for fragment in _extract_content_blocks(html):
        tree = BeautifulSoup(fragment, "html.parser")
        for selector in unwanted_selectors:
            for node in tree.select(selector):
                node.decompose()
        rendered.append(
            md(str(tree), heading_style="ATX", bullets="-", strip=["span", "div"])
        )

    joined = "\n\n".join(rendered)
    # Strip per-line trailing whitespace first so whitespace-only lines count
    # as blank when collapsing the runs markdownify tends to emit.
    markdown = "\n".join(line.rstrip() for line in joined.splitlines())
    while "\n\n\n" in markdown:
        markdown = markdown.replace("\n\n\n", "\n\n")
    return f"{markdown.strip()}\n"
def scrape_quickspecs(doc_id: str, bundle_id: str, title: str,
                      version: str | None = None,
                      product: str = "QuickSpecs",
                      source_url: str | None = None,
                      force: bool = False) -> bool:
    """Fetch one QuickSpecs document, convert it, and write corpus files.

    The HTML comes from the live site when ``fetch_live`` succeeds and from
    the committed fixture otherwise. Two files are written under
    ``CORPUS_DIR / bundle_id``: ``<doc_id>.md`` (the markdown body) and
    ``<doc_id>.json`` (a metadata sidecar).

    Args:
        doc_id: QuickSpecs document id; also used as the sidecar ``page_id``.
        bundle_id: Corpus bundle directory name.
        title: Document title recorded in the sidecar.
        version: Optional version string recorded in the sidecar.
        product: Product label recorded in the sidecar.
        source_url: URL recorded in the sidecar; defaults to
            ``https://www.hpe.com/psnow/doc/<doc_id>`` when not given.
        force: Re-fetch and overwrite even if both output files exist.

    Returns:
        True if both corpus files were written. False if nothing was
        written: either both files already exist and ``force`` is not set,
        or neither the live fetch nor the fixture produced any HTML.
    """
    bundle_dir = CORPUS_DIR / bundle_id
    md_path = bundle_dir / f"{doc_id}.md"
    json_path = bundle_dir / f"{doc_id}.json"
    if not force and md_path.exists() and json_path.exists():
        log.info(" %s/%s: already on disk (use --force to refresh)", bundle_id, doc_id)
        return False

    # Prefer the live page; fall back to the committed fixture.
    html = fetch_live(doc_id)
    fetched_from = "live"
    if html is None:
        html = fetch_fixture(doc_id)
        fetched_from = "fixture"
    if html is None:
        log.error("QuickSpecs %s: no live response and no fixture at %s",
                  doc_id, SOURCE_DIR / f"{doc_id}.html")
        return False

    body_md = parse_html(html)
    bundle_dir.mkdir(parents=True, exist_ok=True)
    # Write as UTF-8 explicitly: the default is the platform locale encoding,
    # which can fail or mis-encode non-ASCII characters in the markdown.
    md_path.write_text(body_md, encoding="utf-8")
    sidecar = {
        "bundle_id": bundle_id,
        "page_id": doc_id,
        "title": title,
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": version,
        "product": product,
        "source_url": source_url or f"https://www.hpe.com/psnow/doc/{doc_id}",
        "fetched_from": fetched_from,
    }
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    log.info(" %s/%s: %d bytes from %s", bundle_id, doc_id, len(body_md), fetched_from)
    return True
def main() -> int:
    """Command-line entry point: scrape a single QuickSpecs document.

    Configures message-only INFO logging, parses the command line, and
    delegates to ``scrape_quickspecs``.

    Returns:
        Process exit status: 0 when ``scrape_quickspecs`` returns True,
        1 otherwise.
    """
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    parser = argparse.ArgumentParser()
    parser.add_argument("doc_id", help="QuickSpecs document id, e.g. a50004260enw")
    parser.add_argument("--bundle-id", default="hvm_quickspecs")
    parser.add_argument("--title", default="HPE Morpheus VM Essentials Software QuickSpecs")
    parser.add_argument("--version", default=None)
    parser.add_argument("--force", action="store_true")
    options = parser.parse_args()

    written = scrape_quickspecs(
        options.doc_id,
        options.bundle_id,
        options.title,
        options.version,
        force=options.force,
    )
    if written:
        return 0
    return 1
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)