scrape: HVM bundles + runner for HPE Support DocPortal
Phase 1: scrape User Manual (8.1.0/.1/.2), Release Notes (8.1.0/.1/.2),
and the unversioned Deployment Guide. Total ~1,160 pages, 9.7 MB markdown.
Discovery uses the anonymous JSON API rooted at /hpesc/public/api/document/{docId}:
{docId}/toc returns the page tree (for TOC-paginated docs), {docId}/render?page=GUID
returns per-page HTML, and the bare {docId} endpoint returns the whole body for
single-doc shapes like Release Notes.
Runner converts DITA-source HTML to clean markdown (strips Notices/
Acknowledgments/Abstract boilerplate), writes corpus/<bundle>/<page>.{md,json},
then a finalize pass synthesizes topic_cluster.clustered_topics by GUID
overlap across versions (HPE GUIDs are stable cross-version — confirmed
374/376/376 with 100% overlap on shared pages).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+121
@@ -0,0 +1,121 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"slug": "hvm_user_manual_8_1_0",
|
||||||
|
"doc_id": "sd00007520en_us",
|
||||||
|
"title": "HPE Morpheus VM Essentials Software Documentation",
|
||||||
|
"version": "8.1.0",
|
||||||
|
"platform": null,
|
||||||
|
"product": "User Manual",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 378,
|
||||||
|
"mode": "toc",
|
||||||
|
"abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.0",
|
||||||
|
"dates": {
|
||||||
|
"Published": "February 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007520en_us"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "hvm_user_manual_8_1_1",
|
||||||
|
"doc_id": "sd00007620en_us",
|
||||||
|
"title": "HPE Morpheus VM Essentials Software Documentation",
|
||||||
|
"version": "8.1.1",
|
||||||
|
"platform": null,
|
||||||
|
"product": "User Manual",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 380,
|
||||||
|
"mode": "toc",
|
||||||
|
"abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.1",
|
||||||
|
"dates": {
|
||||||
|
"Published": "March 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007620en_us"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "hvm_user_manual_8_1_2",
|
||||||
|
"doc_id": "sd00007735en_us",
|
||||||
|
"title": "HPE Morpheus VM Essentials Software Documentation",
|
||||||
|
"version": "8.1.2",
|
||||||
|
"platform": null,
|
||||||
|
"product": "User Manual",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 380,
|
||||||
|
"mode": "toc",
|
||||||
|
"abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.2",
|
||||||
|
"dates": {
|
||||||
|
"Published": "April 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007735en_us"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "hvm_release_notes_8_1_0",
|
||||||
|
"doc_id": "sd00007497en_us",
|
||||||
|
"title": "v8.1.0 Release Notes",
|
||||||
|
"version": "8.1.0",
|
||||||
|
"platform": null,
|
||||||
|
"product": "Release Notes",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 1,
|
||||||
|
"mode": "single",
|
||||||
|
"abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.0",
|
||||||
|
"dates": {
|
||||||
|
"Published": "February 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "sd00007497en_us",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007497en_us"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "hvm_release_notes_8_1_1",
|
||||||
|
"doc_id": "sd00007609en_us",
|
||||||
|
"title": "v8.1.1 Release Notes",
|
||||||
|
"version": "8.1.1",
|
||||||
|
"platform": null,
|
||||||
|
"product": "Release Notes",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 1,
|
||||||
|
"mode": "single",
|
||||||
|
"abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.1",
|
||||||
|
"dates": {
|
||||||
|
"Published": "March 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "sd00007609en_us",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007609en_us"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "hvm_release_notes_8_1_2",
|
||||||
|
"doc_id": "sd00007734en_us",
|
||||||
|
"title": "v8.1.2 Release Notes",
|
||||||
|
"version": "8.1.2",
|
||||||
|
"platform": null,
|
||||||
|
"product": "Release Notes",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 1,
|
||||||
|
"mode": "single",
|
||||||
|
"abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.2",
|
||||||
|
"dates": {
|
||||||
|
"Published": "April 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "sd00007734en_us",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007734en_us"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slug": "hvm_deployment_guide",
|
||||||
|
"doc_id": "sd00007332en_us",
|
||||||
|
"title": "HPE Morpheus VM Essentials Deployment Guide",
|
||||||
|
"version": null,
|
||||||
|
"platform": null,
|
||||||
|
"product": "Deployment Guide",
|
||||||
|
"language": "en-US",
|
||||||
|
"page_count": 42,
|
||||||
|
"mode": "toc",
|
||||||
|
"abstract": "HPE Morpheus VM Essentials Deployment Guide",
|
||||||
|
"dates": {
|
||||||
|
"Published": "January 2026"
|
||||||
|
},
|
||||||
|
"landing_page": "GUID-BF94B8DA-C4F6-4CDF-99E6-0AAA03177099",
|
||||||
|
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007332en_us"
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
# Dev/CPU reranker — only for running scripts/rerank_server.py locally.
|
||||||
|
# Production uses the llama.cpp + jina-reranker GGUF sidecar (see
|
||||||
|
# deploy/docker-compose.yml). Install with:
|
||||||
|
#
|
||||||
|
# pip install -r requirements-rerank.txt
|
||||||
|
#
|
||||||
|
# This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder
|
||||||
|
# (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main
|
||||||
|
# requirements.txt so the production image stays slim.
|
||||||
|
sentence-transformers>=3.0
|
||||||
@@ -10,10 +10,17 @@ ollama>=0.4.0 # if using Ollama-hosted embedder; swap if not
|
|||||||
# Scraping (Phase 1; adjust per product)
|
# Scraping (Phase 1; adjust per product)
|
||||||
beautifulsoup4>=4.12
|
beautifulsoup4>=4.12
|
||||||
requests>=2.31
|
requests>=2.31
|
||||||
|
markdownify>=0.11
|
||||||
# playwright>=1.40 # uncomment if you need headless browser fallback
|
# playwright>=1.40 # uncomment if you need headless browser fallback
|
||||||
|
|
||||||
# Evaluation
|
# Evaluation
|
||||||
numpy>=1.26
|
numpy>=1.26
|
||||||
|
|
||||||
|
# Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server
|
||||||
|
# only needs httpx (declared above) to call it. For the dev / CPU
|
||||||
|
# fallback reranker (scripts/rerank_server.py), install
|
||||||
|
# requirements-rerank.txt separately — it pulls in PyTorch which would
|
||||||
|
# triple the production image size.
|
||||||
|
|
||||||
# Dev / utility
|
# Dev / utility
|
||||||
python-dateutil>=2.8
|
python-dateutil>=2.8
|
||||||
|
|||||||
@@ -0,0 +1,170 @@
|
|||||||
|
"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.
|
||||||
|
|
||||||
|
Bundle IDs are declared statically here because HPE mints a new docId
|
||||||
|
per product version rather than versioning a single doc (see
|
||||||
|
~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
|
||||||
|
version drops, add a new entry to BUNDLES and re-run; the runner will
|
||||||
|
pick it up on the next pass.
|
||||||
|
|
||||||
|
For each bundle this script:
|
||||||
|
1. GETs /hpesc/public/api/document/{docId} → abstract HTML
|
||||||
|
2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc)
|
||||||
|
3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Base of the anonymous DocPortal JSON API; "/{docId}" and "/{docId}/toc" are appended to it.
API = "https://support.hpe.com/hpesc/public/api/document"
# Viewer URL template recorded as each bundle's "source_url".
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent header sent with every request.
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Directory two levels above this file; bundles.json is written there by default.
ROOT = Path(__file__).resolve().parents[1]
BUNDLES_JSON = ROOT.joinpath("bundles.json")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class BundleSpec:
|
||||||
|
slug: str
|
||||||
|
doc_id: str
|
||||||
|
title: str
|
||||||
|
version: str | None
|
||||||
|
product: str # e.g. "User Manual", "Release Notes", "Deployment Guide"
|
||||||
|
mode: str # "toc" or "single"
|
||||||
|
platform: str | None = None
|
||||||
|
language: str = "en-US"
|
||||||
|
|
||||||
|
|
||||||
|
# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
|
||||||
|
# range sd00007400..7740 for `v8.1.x` matches in the abstract.
|
||||||
|
BUNDLES: list[BundleSpec] = [
    # User Manual — one docId per product version, walked via the TOC endpoint.
    BundleSpec(
        slug="hvm_user_manual_8_1_0", doc_id="sd00007520en_us",
        title="HPE Morpheus VM Essentials Software Documentation",
        version="8.1.0", product="User Manual", mode="toc",
    ),
    BundleSpec(
        slug="hvm_user_manual_8_1_1", doc_id="sd00007620en_us",
        title="HPE Morpheus VM Essentials Software Documentation",
        version="8.1.1", product="User Manual", mode="toc",
    ),
    BundleSpec(
        slug="hvm_user_manual_8_1_2", doc_id="sd00007735en_us",
        title="HPE Morpheus VM Essentials Software Documentation",
        version="8.1.2", product="User Manual", mode="toc",
    ),
    # Release Notes — single-document shape.
    BundleSpec(
        slug="hvm_release_notes_8_1_0", doc_id="sd00007497en_us",
        title="HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.0", product="Release Notes", mode="single",
    ),
    BundleSpec(
        slug="hvm_release_notes_8_1_1", doc_id="sd00007609en_us",
        title="HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.1", product="Release Notes", mode="single",
    ),
    BundleSpec(
        slug="hvm_release_notes_8_1_2", doc_id="sd00007734en_us",
        title="HPE Morpheus VM Essentials Software Release Notes",
        version="8.1.2", product="Release Notes", mode="single",
    ),
    # Deployment Guide — unversioned, so version is None.
    BundleSpec(
        slug="hvm_deployment_guide", doc_id="sd00007332en_us",
        title="HPE Morpheus VM Essentials Deployment Guide",
        version=None, product="Deployment Guide", mode="toc",
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
def _session() -> requests.Session:
    """Create a requests session carrying the scraper's User-Agent and Accept headers."""
    session = requests.Session()
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
|
||||||
|
|
||||||
|
|
||||||
|
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
|
||||||
|
delay = 1.0
|
||||||
|
for attempt in range(retries):
|
||||||
|
r = s.get(url, timeout=30)
|
||||||
|
if r.status_code == 200:
|
||||||
|
return r.json() if expect_json else r.text
|
||||||
|
if r.status_code == 404:
|
||||||
|
return None
|
||||||
|
if r.status_code in (429, 500, 502, 503, 504):
|
||||||
|
time.sleep(delay)
|
||||||
|
delay *= 2
|
||||||
|
continue
|
||||||
|
r.raise_for_status()
|
||||||
|
raise RuntimeError(f"GET failed after {retries} retries: {url}")
|
||||||
|
|
||||||
|
|
||||||
|
def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
|
||||||
|
"""Returns (page_count, landing_page_guid)."""
|
||||||
|
if not toc:
|
||||||
|
return 0, None
|
||||||
|
landing = None
|
||||||
|
n = 0
|
||||||
|
|
||||||
|
def walk(nodes: list[dict] | None, depth: int) -> None:
|
||||||
|
nonlocal n, landing
|
||||||
|
for node in nodes or []:
|
||||||
|
link = node.get("topicLink")
|
||||||
|
if link:
|
||||||
|
n += 1
|
||||||
|
m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link)
|
||||||
|
if m and landing is None:
|
||||||
|
landing = m.group(1)
|
||||||
|
walk(node.get("children"), depth + 1)
|
||||||
|
|
||||||
|
walk(toc, 0)
|
||||||
|
return n, landing
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract text and published date from the abstract HTML.

    Keys are present only when the matching element exists: ``title`` from
    ``h1.title.topictitle1``, ``abstract`` from ``div.desc`` and ``published``
    from ``div.publishedDate`` (with the ``Published:`` label removed).
    """
    soup = BeautifulSoup(html, "html.parser")
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    found: dict[str, str] = {}
    for key, css in selectors:
        node = soup.select_one(css)
        if not node:
            continue
        text = node.get_text(" ", strip=True)
        if key == "published":
            text = text.replace("Published:", "").strip()
        found[key] = text
    return found
|
||||||
|
|
||||||
|
|
||||||
|
def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json record for one declared bundle.

    Fetches the document abstract for title/abstract/published metadata and,
    for ``toc`` bundles, the TOC to obtain the page count and landing page.
    A ``toc`` bundle whose TOC yields no pages is reported as ``single``.

    The effective mode is tracked in a local variable: ``spec`` is an entry of
    the shared module-level ``BUNDLES`` list and is not modified.

    Args:
        s: Session used for the API requests.
        spec: Static declaration of the bundle.

    Returns:
        A JSON-serialisable dict describing the bundle.
    """
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    meta = _parse_abstract(abstract_html or "")

    mode = spec.mode
    # Single-doc shape: the whole document is one page keyed by its docId.
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_count, toc_landing = _count_toc(toc)
        if toc_count == 0:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
        else:
            page_count, landing = toc_count, toc_landing

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": DOC_URL.format(doc_id=spec.doc_id),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """CLI entry point: discover every declared bundle and write the JSON index.

    Returns 0 after writing the file named by ``--out`` (default: bundles.json
    at the repo root).
    """
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    args = parser.parse_args()

    session = _session()
    records: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        records.append(discover_bundle(session, spec))

    payload = json.dumps(records, indent=2) + "\n"
    Path(args.out).write_text(payload)
    total_pages = sum(record["page_count"] for record in records)
    print(f"wrote {args.out}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
@@ -0,0 +1,325 @@
|
|||||||
|
"""Scrape HVM doc bundles into corpus/<slug>/<page_id>.{md,json}.
|
||||||
|
|
||||||
|
Reads bundles.json (produced by scrape.bundles), then for each bundle:
|
||||||
|
- mode="toc": walks the TOC tree, fetches each page via the render
|
||||||
|
endpoint, converts page_html to markdown, writes
|
||||||
|
<page_id>.md + <page_id>.json sidecar.
|
||||||
|
- mode="single": fetches /document/{docId} directly, treats the whole
|
||||||
|
body as one page with page_id = doc_id.
|
||||||
|
|
||||||
|
After all bundles are on disk, runs a finalize pass that synthesizes
|
||||||
|
topic_cluster.clustered_topics for each page by looking up the same
|
||||||
|
GUID in sibling bundles (HPE GUIDs are stable across versions — see
|
||||||
|
reference_hpe_docs_portal_api.md).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python -m scrape.runner --all
|
||||||
|
python -m scrape.runner --bundle hvm_user_manual_8_1_2
|
||||||
|
python -m scrape.runner --all --force # re-download already-on-disk pages
|
||||||
|
python -m scrape.runner --finalize-only # only redo the topic_cluster pass
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from markdownify import markdownify as md
|
||||||
|
|
||||||
|
# Base of the anonymous DocPortal JSON API.
API = "https://support.hpe.com/hpesc/public/api/document"
# Viewer URL templates stored as "source_url" in each page sidecar.
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html"
DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent header sent with every request.
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Directory two levels above this file; corpus/ and bundles.json live under it.
ROOT = Path(__file__).resolve().parents[1]
CORPUS = ROOT.joinpath("corpus")
BUNDLES_JSON = ROOT.joinpath("bundles.json")

# Captures the page GUID from a TOC topicLink of the form "...page=GUID-....html".
GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class TocEntry:
|
||||||
|
page_id: str
|
||||||
|
title: str
|
||||||
|
ordinal: int
|
||||||
|
parent_title: str | None
|
||||||
|
|
||||||
|
|
||||||
|
def _session() -> requests.Session:
    """Create a requests session carrying the scraper's User-Agent and Accept headers."""
    session = requests.Session()
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
|
||||||
|
|
||||||
|
|
||||||
|
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
|
||||||
|
delay = 1.0
|
||||||
|
for attempt in range(retries):
|
||||||
|
r = s.get(url, timeout=30)
|
||||||
|
if r.status_code == 200:
|
||||||
|
return r.json() if expect_json else r.text
|
||||||
|
if r.status_code == 404:
|
||||||
|
return None
|
||||||
|
if r.status_code in (429, 500, 502, 503, 504):
|
||||||
|
time.sleep(delay)
|
||||||
|
delay *= 2
|
||||||
|
continue
|
||||||
|
r.raise_for_status()
|
||||||
|
raise RuntimeError(f"GET failed after {retries} retries: {url}")
|
||||||
|
|
||||||
|
|
||||||
|
def _flatten_toc(toc: list[dict]) -> list[TocEntry]:
    """Flatten a TOC tree into its GUID-bearing pages, in pre-order.

    Nodes whose ``topicLink`` has no ``page=GUID-….html`` reference are not
    emitted, but their children are still visited. A child's ``parent_title``
    is the parent's ``topicName``, or the inherited title when that is empty.
    """
    entries: list[TocEntry] = []
    # Explicit stack of (node, inherited parent title); pushing reversed keeps pre-order.
    stack = [(node, None) for node in reversed(toc or [])]
    while stack:
        node, parent_title = stack.pop()
        name = node.get("topicName") or ""
        found = GUID_RE.search(node.get("topicLink") or "")
        if found:
            entries.append(
                TocEntry(
                    page_id=found.group(1),
                    title=name,
                    ordinal=len(entries) + 1,
                    parent_title=parent_title,
                )
            )
        inherited = name or parent_title
        stack.extend((child, inherited) for child in reversed(node.get("children") or []))
    return entries
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_dita_wrappers(html: str) -> str:
    """Drop Notices/Acknowledgments/Abstract boilerplate and return the page HTML.

    Those sections repeat across documents; left in, every page chunk would
    inherit the same legal/trademark text and pollute retrieval.

    Two shapes are handled:
      * wrapped: an ``<article>``/``<section>``/``<div>`` whose first direct
        ``<h1>``/``<h2>`` child is a boilerplate heading is removed whole;
      * unwrapped: a bare boilerplate ``<h1>``/``<h2>`` is removed, together
        with its next sibling when that is a ``<div>``/``<section>`` whose
        class string contains ``desc``, ``body`` or ``notices``.

    Returns:
        The serialised ``<main>`` element (tag included) when one is present,
        otherwise the whole cleaned document.
    """
    soup = BeautifulSoup(html, "html.parser")
    boilerplate = {"Notices", "Acknowledgments", "Abstract"}

    # Wrapped form: container whose first direct heading child is boilerplate.
    for sec in soup.select("article, section, div"):
        h = sec.find(["h1", "h2"], recursive=False)
        if h and h.get_text(strip=True) in boilerplate:
            sec.decompose()

    # Unwrapped form: bare heading followed by its .desc/.body sibling.
    for h in soup.find_all(["h1", "h2"]):
        if h.get_text(strip=True) in boilerplate:
            sib = h.find_next_sibling()
            if sib and (sib.name in {"div", "section"}):
                # Substring test on the joined class string, so "conbody" also matches "body".
                cls = " ".join(sib.get("class", []) or [])
                if "desc" in cls or "body" in cls or "notices" in cls:
                    sib.decompose()
            h.decompose()

    main = soup.find("main")
    return str(main) if main else str(soup)
|
||||||
|
|
||||||
|
|
||||||
|
def html_to_md(page_html: str) -> str:
    """Convert page HTML to markdown with boilerplate stripped.

    Uses ATX headings and ``-`` bullets, collapses runs of three or more
    newlines to one blank line, and ends the text with a single newline.
    """
    markdown = md(_strip_dita_wrappers(page_html), heading_style="ATX", bullets="-")
    collapsed = re.sub(r"\n{3,}", "\n\n", markdown)
    return f"{collapsed.strip()}\n"
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str:
    """Fetch one page via the render endpoint and return its ``page_html``.

    Returns ``""`` when the request yields nothing or the payload has no
    ``page_html`` value.
    """
    url = f"{API}/{doc_id}/render?page={page_id}.html"
    payload = _get(s, url, expect_json=True)
    return (payload.get("page_html") or "") if payload else ""
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]:
    """Return ``(page_html, title)`` for a single-doc-shape bundle.

    The title is the text of ``h1.title.topictitle1`` and falls back to
    ``doc_id``; an empty or missing body gives ``("", "")``.
    """
    body = _get(s, f"{API}/{doc_id}")
    if not body:
        return "", ""
    heading = BeautifulSoup(body, "html.parser").select_one("h1.title.topictitle1")
    if heading:
        return body, heading.get_text(" ", strip=True)
    return body, doc_id
|
||||||
|
|
||||||
|
|
||||||
|
def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool:
    """Write ``<page_id>.md`` and its ``<page_id>.json`` sidecar into ``bundle_dir``.

    Both files are written as UTF-8 regardless of the platform's default
    encoding, since the scraped markdown routinely contains non-ASCII text.

    Args:
        bundle_dir: Target directory; created (with parents) if missing.
        page_id: File stem for both outputs.
        body_md: Markdown body.
        sidecar: JSON-serialisable metadata for the page.
        force: Overwrite even when both files already exist.

    Returns:
        True if the files were written, False if both already existed and
        ``force`` was not set.
    """
    bundle_dir.mkdir(parents=True, exist_ok=True)
    md_path = bundle_dir / f"{page_id}.md"
    json_path = bundle_dir / f"{page_id}.json"
    if not force and md_path.exists() and json_path.exists():
        return False
    md_path.write_text(body_md, encoding="utf-8")
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int:
    """Scrape every TOC page of ``bundle`` into ``corpus/<slug>/``.

    Pages whose ``.md`` and ``.json`` are both already on disk are skipped
    *before* any network request unless ``force`` is set, so a re-run only
    downloads what is missing.

    Args:
        s: Session used for the API requests (shared by the worker threads).
        bundle: One bundles.json record; ``doc_id`` and ``slug`` are required.
        force: Re-fetch and overwrite pages already on disk.
        concurrency: Number of worker threads.

    Returns:
        The number of pages written.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    bundle_dir = CORPUS / slug

    toc = _get(s, f"{API}/{doc_id}/toc", expect_json=True) or []
    entries = _flatten_toc(toc)
    print(f" {slug}: {len(entries)} pages", file=sys.stderr)

    def on_disk(page_id: str) -> bool:
        return (bundle_dir / f"{page_id}.md").exists() and (bundle_dir / f"{page_id}.json").exists()

    def do_one(entry: TocEntry) -> bool:
        # Skip the download entirely when the page is already present.
        if not force and on_disk(entry.page_id):
            return False
        page_html = fetch_toc_page(s, doc_id, entry.page_id)
        if not page_html:
            return False
        body_md = html_to_md(page_html)
        sidecar = {
            "bundle_id": slug,
            "page_id": entry.page_id,
            "title": entry.title,
            "ordinal": entry.ordinal,
            "parent_title": entry.parent_title,
            "doc_id": doc_id,
            "version": bundle.get("version"),
            "product": bundle.get("product"),
            "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id),
            # topic_cluster filled in by finalize()
        }
        return write_page(bundle_dir, entry.page_id, body_md, sidecar, force)

    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures = [pool.submit(do_one, e) for e in entries]
        # fut.result() re-raises any worker exception, aborting the bundle.
        return sum(1 for fut in as_completed(futures) if fut.result())
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int:
    """Scrape a single-doc bundle into ``corpus/<slug>/<doc_id>.{md,json}``.

    When both output files already exist and ``force`` is not set, the
    document is not downloaded again.

    Args:
        s: Session used for the API request.
        bundle: One bundles.json record; ``doc_id``, ``slug`` and ``title``
            are read, ``version``/``product`` are optional.
        force: Re-fetch and overwrite a page already on disk.

    Returns:
        1 if the page was written, otherwise 0.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    bundle_dir = CORPUS / slug

    already_on_disk = (bundle_dir / f"{doc_id}.md").exists() and (bundle_dir / f"{doc_id}.json").exists()
    if not force and already_on_disk:
        # Nothing to do; avoid the download that would only be discarded.
        print(f" {slug}: 1 page (single-doc)", file=sys.stderr)
        return 0

    html, title = fetch_single_doc(s, doc_id)
    if not html:
        print(f" ! {slug}: empty body", file=sys.stderr)
        return 0
    body_md = html_to_md(html)
    sidecar = {
        "bundle_id": slug,
        "page_id": doc_id,
        "title": title or bundle["title"],
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": bundle.get("version"),
        "product": bundle.get("product"),
        "source_url": DOC_URL_SINGLE.format(doc_id=doc_id),
    }
    print(f" {slug}: 1 page (single-doc)", file=sys.stderr)
    return 1 if write_page(bundle_dir, doc_id, body_md, sidecar, force) else 0
|
||||||
|
|
||||||
|
|
||||||
|
def finalize_clusters(bundles: list[dict]) -> int:
    """Write ``topic_cluster`` into sidecars that have peers in other bundles.

    Sidecars whose ``page_id`` starts with ``GUID-`` are grouped by that GUID;
    all others (single-doc pages, where page_id is the doc id) are grouped by
    their bundle's ``product``. Every member of a group of two or more gets a
    ``topic_cluster`` listing the members from other bundles, and its sidecar
    is rewritten. Returns the number of sidecars rewritten.
    """
    # Each group member is (bundle slug, sidecar path, sidecar contents).
    by_guid: dict[str, list[tuple[str, Path, dict]]] = {}
    by_product: dict[str, list[tuple[str, Path, dict]]] = {}

    for b in bundles:
        slug = b["slug"]
        bundle_dir = CORPUS / slug
        if not bundle_dir.exists():
            continue
        for sidecar_path in bundle_dir.glob("*.json"):
            contents = json.loads(sidecar_path.read_text())
            page_id = contents["page_id"]
            if page_id.startswith("GUID-"):
                by_guid.setdefault(page_id, []).append((slug, sidecar_path, contents))
            else:
                by_product.setdefault(b["product"], []).append((slug, sidecar_path, contents))

    def link_group(members: list[tuple[str, Path, dict]]) -> int:
        """Cross-link one group; returns how many sidecars were rewritten."""
        if len(members) < 2:
            return 0
        for own_slug, sidecar_path, contents in members:
            siblings = [
                {"bundle_id": peer_slug, "page_id": peer["page_id"], "clustering_title": peer.get("title", "")}
                for peer_slug, _, peer in members
                if peer_slug != own_slug
            ]
            contents["topic_cluster"] = {
                "clustering_title": contents.get("title", ""),
                "clustered_topics": siblings,
            }
            sidecar_path.write_text(json.dumps(contents, indent=2) + "\n")
        return len(members)

    # TOC pages (by GUID) first, then single-doc pages (by product).
    groups = [*by_guid.values(), *by_product.values()]
    return sum(link_group(members) for members in groups)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """CLI entry point: scrape the selected bundles, then rebuild topic clusters.

    Returns 0 on success and 2 when bundles.json is missing, no bundle was
    selected, or no ``--bundle`` slug matched.
    """
    parser = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.")
    parser.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json")
    parser.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)")
    parser.add_argument("--force", action="store_true", help="re-fetch pages already on disk")
    parser.add_argument("--concurrency", type=int, default=6)
    parser.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields")
    args = parser.parse_args()

    if not BUNDLES_JSON.exists():
        print("bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr)
        return 2

    selected = json.loads(BUNDLES_JSON.read_text())

    if args.finalize_only:
        relinked = finalize_clusters(selected)
        print(f"finalize: updated topic_cluster on {relinked} sidecars", file=sys.stderr)
        return 0

    if args.bundle:
        selected = [record for record in selected if record["slug"] in args.bundle]
        if not selected:
            print(f"no bundles matched: {args.bundle}", file=sys.stderr)
            return 2
    elif not args.all:
        print("specify --all or --bundle <slug>", file=sys.stderr)
        return 2

    session = _session()
    scraped = 0
    for record in selected:
        if record.get("mode") == "single":
            scraped += scrape_single_bundle(session, record, args.force)
        else:
            scraped += scrape_toc_bundle(session, record, args.force, args.concurrency)
    print(f"scraped {scraped} new/updated pages", file=sys.stderr)

    # Finalize over the full bundle list (not just the selection) so sidecars stay consistent.
    relinked = finalize_clusters(json.loads(BUNDLES_JSON.read_text()))
    print(f"finalize: updated topic_cluster on {relinked} sidecars", file=sys.stderr)

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
Reference in New Issue
Block a user