From 7a491ba9e423ea16ce2cff935537cf0d085963bb Mon Sep 17 00:00:00 2001
From: Justin Paul <justin@jpaul.me>
Date: Fri, 22 May 2026 13:06:26 -0400
Subject: [PATCH] scrape: HVM bundles + runner for HPE Support DocPortal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1: scrape User Manual (8.1.0/.1/.2), Release Notes (8.1.0/.1/.2),
and the unversioned Deployment Guide. Total ~1,160 pages, 9.7 MB markdown.

Discovers via the anonymous JSON API at /hpesc/public/api/document/{docId}:
/toc walks the page tree (for TOC-paginated docs), /render?page=GUID
fetches per-page HTML, /document/{docId} returns the whole body for
single-doc shapes like Release Notes.

Runner converts DITA-source HTML to clean markdown (strips Notices/
Acknowledgments/Abstract boilerplate), writes corpus/<bundle>/<page>.{md,json},
then a finalize pass synthesizes topic_cluster.clustered_topics by GUID
overlap across versions (HPE GUIDs are stable cross-version — confirmed
374/376/376 with 100% overlap on shared pages).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 bundles.json            | 121 +++++++++++++++
 requirements-rerank.txt |  10 ++
 requirements.txt        |   7 +
 scrape/bundles.py       | 170 +++++++++++++++++++++
 scrape/runner.py        | 325 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 633 insertions(+)
 create mode 100644 bundles.json
 create mode 100644 requirements-rerank.txt
 create mode 100644 scrape/bundles.py
 create mode 100644 scrape/runner.py
diff --git a/bundles.json b/bundles.json
new file mode 100644
index 0000000..4855ee1
--- /dev/null
+++ b/bundles.json
@@ -0,0 +1,121 @@
+[
+  {
+    "slug": "hvm_user_manual_8_1_0",
+    "doc_id": "sd00007520en_us",
+    "title": "HPE Morpheus VM Essentials Software Documentation",
+    "version": "8.1.0",
+    "platform": null,
+    "product": "User Manual",
+    "language": "en-US",
+    "page_count": 378,
+    "mode": "toc",
+    "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.0",
+    "dates": {
+      "Published": "February 2026"
+    },
+    "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007520en_us"
+  },
+  {
+    "slug": "hvm_user_manual_8_1_1",
+    "doc_id": "sd00007620en_us",
+    "title": "HPE Morpheus VM Essentials Software Documentation",
+    "version": "8.1.1",
+    "platform": null,
+    "product": "User Manual",
+    "language": "en-US",
+    "page_count": 380,
+    "mode": "toc",
+    "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.1",
+    "dates": {
+      "Published": "March 2026"
+    },
+    "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007620en_us"
+  },
+  {
+    "slug": "hvm_user_manual_8_1_2",
+    "doc_id": "sd00007735en_us",
+    "title": "HPE Morpheus VM Essentials Software Documentation",
+    "version": "8.1.2",
+    "platform": null,
+    "product": "User Manual",
+    "language": "en-US",
+    "page_count": 380,
+    "mode": "toc",
+    "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.2",
+    "dates": {
+      "Published": "April 2026"
+    },
+    "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007735en_us"
+  },
+  {
+    "slug": "hvm_release_notes_8_1_0",
+    "doc_id": "sd00007497en_us",
+    "title": "v8.1.0 Release Notes",
+    "version": "8.1.0",
+    "platform": null,
+    "product": "Release Notes",
+    "language": "en-US",
+    "page_count": 1,
+    "mode": "single",
+    "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.0",
+    "dates": {
+      "Published": "February 2026"
+    },
+    "landing_page": "sd00007497en_us",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007497en_us"
+  },
+  {
+    "slug": "hvm_release_notes_8_1_1",
+    "doc_id": "sd00007609en_us",
+    "title": "v8.1.1 Release Notes",
+    "version": "8.1.1",
+    "platform": null,
+    "product": "Release Notes",
+    "language": "en-US",
+    "page_count": 1,
+    "mode": "single",
+    "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.1",
+    "dates": {
+      "Published": "March 2026"
+    },
+    "landing_page": "sd00007609en_us",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007609en_us"
+  },
+  {
+    "slug": "hvm_release_notes_8_1_2",
+    "doc_id": "sd00007734en_us",
+    "title": "v8.1.2 Release Notes",
+    "version": "8.1.2",
+    "platform": null,
+    "product": "Release Notes",
+    "language": "en-US",
+    "page_count": 1,
+    "mode": "single",
+    "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.2",
+    "dates": {
+      "Published": "April 2026"
+    },
+    "landing_page": "sd00007734en_us",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007734en_us"
+  },
+  {
+    "slug": "hvm_deployment_guide",
+    "doc_id": "sd00007332en_us",
+    "title": "HPE Morpheus VM Essentials Deployment Guide",
+    "version": null,
+    "platform": null,
+    "product": "Deployment Guide",
+    "language": "en-US",
+    "page_count": 42,
+    "mode": "toc",
+    "abstract": "HPE Morpheus VM Essentials Deployment Guide",
+    "dates": {
+      "Published": "January 2026"
+    },
+    "landing_page": "GUID-BF94B8DA-C4F6-4CDF-99E6-0AAA03177099",
+    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007332en_us"
+  }
+]
diff --git a/requirements-rerank.txt b/requirements-rerank.txt
new file mode 100644
index 0000000..2c5fc27
--- /dev/null
+++ b/requirements-rerank.txt
@@ -0,0 +1,10 @@
+# Dev/CPU reranker — only for running scripts/rerank_server.py locally.
+# Production uses the llama.cpp + jina-reranker GGUF sidecar (see
+# deploy/docker-compose.yml). Install with:
+#
+#   pip install -r requirements-rerank.txt
+#
+# This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder
+# (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main
+# requirements.txt so the production image stays slim.
+sentence-transformers>=3.0
diff --git a/requirements.txt b/requirements.txt
index b9982a9..a89002f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,10 +10,17 @@ ollama>=0.4.0      # if using Ollama-hosted embedder; swap if not
 # Scraping (Phase 1; adjust per product)
 beautifulsoup4>=4.12
 requests>=2.31
+markdownify>=0.11
 # playwright>=1.40  # uncomment if you need headless browser fallback
 
 # Evaluation
 numpy>=1.26
 
+# Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server
+# only needs httpx (declared above) to call it. For the dev / CPU
+# fallback reranker (scripts/rerank_server.py), install
+# requirements-rerank.txt separately — it pulls in PyTorch which would
+# triple the production image size.
+
 # Dev / utility
 python-dateutil>=2.8
diff --git a/scrape/bundles.py b/scrape/bundles.py
new file mode 100644
index 0000000..4bd0c59
--- /dev/null
+++ b/scrape/bundles.py
@@ -0,0 +1,170 @@
+"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.
+
+Bundle IDs are declared statically here because HPE mints a new docId
+per product version rather than versioning a single doc (see
+~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
+version drops, add a new entry to BUNDLES and re-run; the runner will
+pick it up on the next pass.
+
+For each bundle this script:
+  1. GETs /hpesc/public/api/document/{docId}        → abstract HTML
+  2. GETs /hpesc/public/api/document/{docId}/toc    → page tree (or 404 for single-doc)
+  3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup
+
+API = "https://support.hpe.com/hpesc/public/api/document"
+DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
+UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
+ROOT = Path(__file__).resolve().parent.parent
+BUNDLES_JSON = ROOT / "bundles.json"
+
+
+@dataclass
+class BundleSpec:
+    slug: str  # bundle identifier; written to bundles.json as "slug"
+    doc_id: str  # HPE docId, interpolated into the API and docDisplay URLs
+    title: str  # fallback title, used when the abstract HTML yields none
+    version: str | None  # None for an unversioned doc
+    product: str  # e.g. "User Manual", "Release Notes", "Deployment Guide"
+    mode: str    # "toc" or "single"
+    platform: str | None = None
+    language: str = "en-US"
+
+
+# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
+# range sd00007400..7740 for `v8.1.x` matches in the abstract.
+BUNDLES: list[BundleSpec] = [
+    BundleSpec("hvm_user_manual_8_1_0",   "sd00007520en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.0", "User Manual",      "toc"),
+    BundleSpec("hvm_user_manual_8_1_1",   "sd00007620en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.1", "User Manual",      "toc"),
+    BundleSpec("hvm_user_manual_8_1_2",   "sd00007735en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.2", "User Manual",      "toc"),
+    BundleSpec("hvm_release_notes_8_1_0", "sd00007497en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.0", "Release Notes",   "single"),
+    BundleSpec("hvm_release_notes_8_1_1", "sd00007609en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.1", "Release Notes",   "single"),
+    BundleSpec("hvm_release_notes_8_1_2", "sd00007734en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.2", "Release Notes",   "single"),
+    BundleSpec("hvm_deployment_guide",    "sd00007332en_us", "HPE Morpheus VM Essentials Deployment Guide",        None,    "Deployment Guide","toc"),
+]
+
+
+def _session() -> requests.Session:
+    session = requests.Session()  # one Session, passed to every request helper by main()
+    session.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"})
+    return session
+
+
+def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
+    """GET url: body (parsed JSON if expect_json, else text) on 200, None on 404; retries 429/500/502/503/504."""
+    for attempt in range(retries):
+        r = s.get(url, timeout=30)
+        if r.status_code == 200:
+            return r.json() if expect_json else r.text
+        if r.status_code == 404:
+            return None
+        if r.status_code in (429, 500, 502, 503, 504):
+            if attempt < retries - 1:  # backoff 1s, 2s, 4s...; no sleep once attempts are used up
+                time.sleep(2.0 ** attempt)
+            continue
+        r.raise_for_status()
+    raise RuntimeError(f"GET failed after {retries} retries: {url}")
+
+
+def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
+    """Count TOC nodes that carry a topicLink and find the first GUID-addressed page.
+
+    Returns (page_count, landing_page_guid); (0, None) when the TOC is empty or missing.
+    """
+    if not toc:
+        return 0, None
+    count = 0
+    first_guid = None
+    pending = list(reversed(toc))  # explicit stack; reversed() so pop() yields document order
+    while pending:
+        node = pending.pop()
+        href = node.get("topicLink")
+        if href:
+            count += 1
+            if first_guid is None:
+                hit = re.search(r"page=(GUID-[A-F0-9-]+)\.html", href)
+                first_guid = hit.group(1) if hit else None
+        pending.extend(reversed(node.get("children") or []))
+    return count, first_guid
+
+
+def _parse_abstract(html: str) -> dict[str, str]:
+    """Extract whichever of title / abstract / published are present in the abstract HTML."""
+    soup = BeautifulSoup(html, "html.parser")
+    selectors = (
+        ("title", "h1.title.topictitle1"),
+        ("abstract", "div.desc"),
+        ("published", "div.publishedDate"),
+    )
+    found = {key: soup.select_one(css) for key, css in selectors}
+    out: dict[str, str] = {key: tag.get_text(" ", strip=True) for key, tag in found.items() if tag}
+    if "published" in out:
+        # strip the "Published:" label, if present, leaving only the remaining text
+        out["published"] = out["published"].replace("Published:", "").strip()
+    return out
+
+
+def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
+    """Probe one bundle and return its bundles.json entry; `spec` is never modified."""
+    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
+    meta = _parse_abstract(abstract_html or "")
+
+    # Defaults describe the single-doc shape, where the document is its own only page.
+    mode, page_count, landing = spec.mode, 1, spec.doc_id
+    if mode == "toc":
+        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
+        toc_count, toc_landing = _count_toc(toc)
+        if toc_count:
+            page_count, landing = toc_count, toc_landing
+        else:
+            print(f"  ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
+            mode = "single"  # recorded in the result only; the caller's spec keeps its mode
+
+    return {
+        "slug": spec.slug,
+        "doc_id": spec.doc_id,
+        "title": meta.get("title") or spec.title,
+        "version": spec.version,
+        "platform": spec.platform,
+        "product": spec.product,
+        "language": spec.language,
+        "page_count": page_count,
+        "mode": mode,
+        "abstract": meta.get("abstract", ""),
+        "dates": {"Published": meta.get("published", "")},
+        "landing_page": landing,
+        "source_url": DOC_URL.format(doc_id=spec.doc_id),
+    }
+
+
+def main() -> int:
+    """CLI entry point: discover every bundle in BUNDLES and write them to --out as JSON."""
+    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
+    parser.add_argument("--out", default=str(BUNDLES_JSON))
+    args = parser.parse_args()
+    session = _session()
+    discovered: list[dict[str, Any]] = []
+    for spec in BUNDLES:
+        print(f"  • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
+        discovered.append(discover_bundle(session, spec))
+    Path(args.out).write_text(json.dumps(discovered, indent=2) + "\n")
+    pages = sum(entry["page_count"] for entry in discovered)
+    print(f"wrote {args.out}: {len(discovered)} bundles, {pages} pages total", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scrape/runner.py b/scrape/runner.py
new file mode 100644
index 0000000..ad5909b
--- /dev/null
+++ b/scrape/runner.py
@@ -0,0 +1,325 @@
+"""Scrape HVM doc bundles into corpus/<slug>/<page_id>.{md,json}.
+
+Reads bundles.json (produced by scrape.bundles), then for each bundle:
+  - mode="toc":    walks the TOC tree, fetches each page via the render
+                   endpoint, converts page_html to markdown, writes
+                   <page_id>.md + <page_id>.json sidecar.
+  - mode="single": fetches /document/{docId} directly, treats the whole
+                   body as one page with page_id = doc_id.
+
+After all bundles are on disk, runs a finalize pass that synthesizes
+topic_cluster.clustered_topics for each page by looking up the same
+GUID in sibling bundles (HPE GUIDs are stable across versions — see
+reference_hpe_docs_portal_api.md).
+
+Usage:
+    python -m scrape.runner --all
+    python -m scrape.runner --bundle hvm_user_manual_8_1_2
+    python -m scrape.runner --all --force        # re-download already-on-disk pages
+    python -m scrape.runner --finalize-only      # only redo the topic_cluster pass
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup
+from markdownify import markdownify as md
+
+API = "https://support.hpe.com/hpesc/public/api/document"
+DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html"
+DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
+UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
+ROOT = Path(__file__).resolve().parent.parent
+CORPUS = ROOT / "corpus"
+BUNDLES_JSON = ROOT / "bundles.json"
+
+GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")
+
+
+@dataclass
+class TocEntry:
+    page_id: str  # GUID captured from the node's topicLink by GUID_RE
+    title: str  # the node's topicName ("" when absent)
+    ordinal: int  # 1-based position among GUID-linked pages, in TOC pre-order
+    parent_title: str | None  # title of the nearest titled ancestor node; None if there is none
+
+
+def _session() -> requests.Session:
+    session = requests.Session()  # one Session, passed to every request helper by main()
+    session.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"})
+    return session
+
+
+def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
+    """GET url: body (parsed JSON if expect_json, else text) on 200, None on 404; retries 429/500/502/503/504."""
+    for attempt in range(retries):
+        r = s.get(url, timeout=30)
+        if r.status_code == 200:
+            return r.json() if expect_json else r.text
+        if r.status_code == 404:
+            return None
+        if r.status_code in (429, 500, 502, 503, 504):
+            if attempt < retries - 1:  # backoff 1s, 2s, 4s...; no sleep once attempts are used up
+                time.sleep(2.0 ** attempt)
+            continue
+        r.raise_for_status()
+    raise RuntimeError(f"GET failed after {retries} retries: {url}")
+
+
+def _flatten_toc(toc: list[dict]) -> list[TocEntry]:
+    """Flatten the TOC tree into pre-order TocEntry rows, one per GUID-linked node."""
+    flat: list[TocEntry] = []
+    # explicit stack of (node, inherited parent title); reversed() keeps document order
+    stack = [(node, None) for node in reversed(toc or [])]
+    while stack:
+        node, parent = stack.pop()
+        name = node.get("topicName") or ""
+        hit = GUID_RE.search(node.get("topicLink") or "")
+        if hit:
+            flat.append(
+                TocEntry(page_id=hit.group(1), title=name, ordinal=len(flat) + 1, parent_title=parent)
+            )
+        # children inherit this node's title, or the inherited one when it has none
+        stack.extend((kid, name or parent) for kid in reversed(node.get("children") or []))
+
+    return flat
+
+
+def _strip_dita_wrappers(html: str) -> str:
+    """Drop the Notices / Acknowledgments / Abstract boilerplate sections and return
+    the HTML of the first <main> element, or of the whole document when there is none.
+
+    DITA's notices boilerplate repeats across every doc; if we leave it in,
+    every page chunk inherits the same trademark text and pollutes retrieval."""
+    soup = BeautifulSoup(html, "html.parser")
+    # Drop the Notices/Acknowledgments/Abstract boilerplate by section heading.
+    # Every doc on the portal carries the same legal Notices and trademark
+    # Acknowledgments; if we leave them in, every chunk inherits the same
+    # text and pollutes retrieval. Abstract is one-line marketing.
+    boilerplate = {"Notices", "Acknowledgments", "Abstract"}
+    # Wrapped form: <article>/<section>/<div> whose first heading child is boilerplate.
+    for sec in soup.select("article, section, div"):
+        h = sec.find(["h1", "h2"], recursive=False)
+        if h and h.get_text(strip=True) in boilerplate:
+            sec.decompose()
+    # Unwrapped form: bare <h1>/<h2>Boilerplate</h2> followed by its .desc/.body sibling.
+    for h in soup.find_all(["h1", "h2"]):
+        if h.get_text(strip=True) in boilerplate:
+            sib = h.find_next_sibling()
+            if sib and (sib.name in {"div", "section"}):
+                cls = " ".join(sib.get("class", []) or [])
+                if "desc" in cls or "body" in cls or "notices" in cls:
+                    sib.decompose()
+            h.decompose()
+    main = soup.find("main")
+    return str(main) if main else str(soup)
+
+
+def html_to_md(page_html: str) -> str:
+    """Clean DITA HTML, convert it to ATX-heading / dash-bullet markdown, end with one newline."""
+    markdown = md(_strip_dita_wrappers(page_html), heading_style="ATX", bullets="-")
+    # squeeze any run of 3+ newlines down to a single blank line, then trim both ends
+    squeezed = re.sub(r"\n{3,}", "\n\n", markdown).strip()
+    return f"{squeezed}\n"
+
+
+def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str:
+    """Return the render endpoint's page_html for one page, or "" on 404 / empty payload."""
+    url = f"{API}/{doc_id}/render?page={page_id}.html"
+    payload = _get(s, url, expect_json=True) or {}
+    return payload.get("page_html") or ""
+
+
+def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]:
+    """Fetch a single-doc bundle body; returns (html, title), or ("", "") if nothing came back.
+
+    The title is the text of h1.title.topictitle1, falling back to doc_id."""
+    body = _get(s, f"{API}/{doc_id}")
+    if not body:
+        return "", ""
+    heading = BeautifulSoup(body, "html.parser").select_one("h1.title.topictitle1")
+    return body, (heading.get_text(" ", strip=True) if heading else doc_id)
+
+
+def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool:
+    """Write <page_id>.md and .json as UTF-8; returns False (no write) if both exist and not force."""
+    bundle_dir.mkdir(parents=True, exist_ok=True)
+    md_path, json_path = (bundle_dir / f"{page_id}{ext}" for ext in (".md", ".json"))
+    if not force and md_path.exists() and json_path.exists():
+        return False
+    md_path.write_text(body_md, encoding="utf-8")  # scraped prose is not ASCII-only; never rely on locale
+    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
+    return True
+
+
+def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int:
+    """Scrape every GUID-linked TOC page of one bundle; returns the number of pages written."""
+    doc_id, slug = bundle["doc_id"], bundle["slug"]
+    bundle_dir = CORPUS / slug
+
+    toc = _get(s, f"{API}/{doc_id}/toc", expect_json=True) or []
+    entries = _flatten_toc(toc)
+    print(f"  {slug}: {len(entries)} pages", file=sys.stderr)
+
+    def do_one(entry: TocEntry) -> bool:
+        # Check the disk BEFORE fetching: without --force an existing page must cost no request.
+        if not force and all((bundle_dir / f"{entry.page_id}{ext}").exists() for ext in (".md", ".json")):
+            return False
+        page_html = fetch_toc_page(s, doc_id, entry.page_id)
+        if not page_html:
+            return False
+        body_md = html_to_md(page_html)
+        sidecar = {
+            "bundle_id": slug,
+            "page_id": entry.page_id,
+            "title": entry.title,
+            "ordinal": entry.ordinal,
+            "parent_title": entry.parent_title,
+            "doc_id": doc_id,
+            "version": bundle.get("version"),
+            "product": bundle.get("product"),
+            "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id),
+            # topic_cluster filled in by finalize()
+        }
+        return write_page(bundle_dir, entry.page_id, body_md, sidecar, force)
+
+    with ThreadPoolExecutor(max_workers=concurrency) as pool:
+        futures = [pool.submit(do_one, e) for e in entries]
+        return sum(1 for fut in as_completed(futures) if fut.result())
+
+
+def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int:
+    """Scrape a single-doc bundle as one page (page_id == doc_id); returns 1 if written, else 0."""
+    doc_id, slug = bundle["doc_id"], bundle["slug"]
+    bundle_dir = CORPUS / slug
+    if not force and all((bundle_dir / f"{doc_id}{ext}").exists() for ext in (".md", ".json")):
+        return 0  # already on disk: skip the download entirely unless --force
+    html, title = fetch_single_doc(s, doc_id)
+    if not html:
+        print(f"  ! {slug}: empty body", file=sys.stderr)
+        return 0
+    sidecar = {
+        "bundle_id": slug,
+        "page_id": doc_id,
+        "title": title or bundle["title"],
+        "ordinal": 1,
+        "parent_title": None,
+        "doc_id": doc_id,
+        "version": bundle.get("version"),
+        "product": bundle.get("product"),
+        "source_url": DOC_URL_SINGLE.format(doc_id=doc_id),
+    }
+    print(f"  {slug}: 1 page (single-doc)", file=sys.stderr)
+    return 1 if write_page(bundle_dir, doc_id, html_to_md(html), sidecar, force) else 0
+
+
+def finalize_clusters(bundles: list[dict]) -> int:
+    """Write topic_cluster into every sidecar whose group has at least two members.
+
+    Sidecars are read from CORPUS/<slug>/*.json for each listed bundle. Those whose
+    page_id starts with "GUID-" are grouped by that GUID (same GUID in two bundles =
+    same underlying topic); all others (single-doc pages, page_id == doc_id) are
+    grouped by the owning bundle's `product`. Groups with fewer than two members
+    are left untouched. Returns the number of sidecar files rewritten."""
+    # group key -> [(slug, sidecar_path, sidecar_dict), ...]
+    by_guid: dict[str, list[tuple[str, Path, dict]]] = {}
+    by_product: dict[str, list[tuple[str, Path, dict]]] = {}
+
+    for bundle in bundles:
+        slug = bundle["slug"]
+        bundle_dir = CORPUS / slug
+        if not bundle_dir.exists():
+            continue
+        for sidecar_path in bundle_dir.glob("*.json"):
+            sidecar = json.loads(sidecar_path.read_text())
+            page_id = sidecar["page_id"]
+            if page_id.startswith("GUID-"):
+                groups, key = by_guid, page_id
+            else:
+                groups, key = by_product, bundle["product"]
+            groups.setdefault(key, []).append((slug, sidecar_path, sidecar))
+
+    updated = 0
+    # GUID groups first, then product groups. One loop serves both because, within
+    # a GUID group, each member's own page_id is that GUID.
+    for groups in (by_guid, by_product):
+        for peers in groups.values():
+            if len(peers) < 2:
+                continue
+            for slug, sidecar_path, sidecar in peers:
+                siblings = [
+                    {
+                        "bundle_id": other_slug,
+                        "page_id": other["page_id"],
+                        "clustering_title": other.get("title", ""),
+                    }
+                    for other_slug, _, other in peers
+                    if other_slug != slug
+                ]
+                sidecar["topic_cluster"] = {
+                    "clustering_title": sidecar.get("title", ""),
+                    "clustered_topics": siblings,
+                }
+                sidecar_path.write_text(json.dumps(sidecar, indent=2) + "\n")
+                updated += 1
+
+    return updated
+
+
+def main() -> int:
+    """CLI entry point; returns 0 on success, 2 on a usage error or missing bundles.json."""
+    parser = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.")
+    parser.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json")
+    parser.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)")
+    parser.add_argument("--force", action="store_true", help="re-fetch pages already on disk")
+    parser.add_argument("--concurrency", type=int, default=6)
+    parser.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields")
+    args = parser.parse_args()
+
+    if not BUNDLES_JSON.exists():
+        print(f"bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr)
+        return 2
+    catalog = json.loads(BUNDLES_JSON.read_text())
+
+    if args.finalize_only:
+        # --finalize-only takes precedence over --all / --bundle: no scraping at all
+        relinked = finalize_clusters(catalog)
+        print(f"finalize: updated topic_cluster on {relinked} sidecars", file=sys.stderr)
+        return 0
+
+    if not args.bundle and not args.all:
+        print("specify --all or --bundle <slug>", file=sys.stderr)
+        return 2
+    if args.bundle:
+        # --bundle wins over --all when both are given
+        catalog = [b for b in catalog if b["slug"] in args.bundle]
+        if not catalog:
+            print(f"no bundles matched: {args.bundle}", file=sys.stderr)
+            return 2
+
+    session = _session()
+    total = sum(
+        scrape_single_bundle(session, b, args.force)
+        if b.get("mode") == "single"
+        else scrape_toc_bundle(session, b, args.force, args.concurrency)
+        for b in catalog
+    )
+    print(f"scraped {total} new/updated pages", file=sys.stderr)
+
+    # Re-read the full bundle list so clustering also sees bundles excluded by --bundle.
+    relinked = finalize_clusters(json.loads(BUNDLES_JSON.read_text()))
+    print(f"finalize: updated topic_cluster on {relinked} sidecars", file=sys.stderr)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())