scrape: HVM bundles + runner for HPE Support DocPortal

Phase 1: scrape User Manual (8.1.0/.1/.2), Release Notes (8.1.0/.1/.2), and the unversioned Deployment Guide. Total ~1,160 pages, 9.7 MB markdown. Discovers via the anonymous JSON API at /hpesc/public/api/document/{docId}: /toc walks the page tree (for TOC-paginated docs), /render?page=GUID fetches per-page HTML, /document/{docId} returns the whole body for single-doc shapes like Release Notes. Runner converts DITA-source HTML to clean markdown (strips Notices/Acknowledgments/Abstract boilerplate), writes corpus/<bundle>/<page>.{md,json}, then a finalize pass synthesizes topic_cluster.clustered_topics by GUID overlap across versions (HPE GUIDs are stable cross-version — confirmed 374/376/376 with 100% overlap on shared pages). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 13:06:26 -04:00
parent 43728320bf
commit 7a491ba9e4
5 changed files with 633 additions and 0 deletions
@@ -0,0 +1,121 @@
 [
  {
    "slug": "hvm_user_manual_8_1_0",
    "doc_id": "sd00007520en_us",
    "title": "HPE Morpheus VM Essentials Software Documentation",
    "version": "8.1.0",
    "platform": null,
    "product": "User Manual",
    "language": "en-US",
    "page_count": 378,
    "mode": "toc",
    "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.0",
    "dates": {
      "Published": "February 2026"
    },
    "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007520en_us"
  },
  {
    "slug": "hvm_user_manual_8_1_1",
    "doc_id": "sd00007620en_us",
    "title": "HPE Morpheus VM Essentials Software Documentation",
    "version": "8.1.1",
    "platform": null,
    "product": "User Manual",
    "language": "en-US",
    "page_count": 380,
    "mode": "toc",
    "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.1",
    "dates": {
      "Published": "March 2026"
    },
    "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007620en_us"
  },
  {
    "slug": "hvm_user_manual_8_1_2",
    "doc_id": "sd00007735en_us",
    "title": "HPE Morpheus VM Essentials Software Documentation",
    "version": "8.1.2",
    "platform": null,
    "product": "User Manual",
    "language": "en-US",
    "page_count": 380,
    "mode": "toc",
    "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.2",
    "dates": {
      "Published": "April 2026"
    },
    "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007735en_us"
  },
  {
    "slug": "hvm_release_notes_8_1_0",
    "doc_id": "sd00007497en_us",
    "title": "v8.1.0 Release Notes",
    "version": "8.1.0",
    "platform": null,
    "product": "Release Notes",
    "language": "en-US",
    "page_count": 1,
    "mode": "single",
    "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.0",
    "dates": {
      "Published": "February 2026"
    },
    "landing_page": "sd00007497en_us",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007497en_us"
  },
  {
    "slug": "hvm_release_notes_8_1_1",
    "doc_id": "sd00007609en_us",
    "title": "v8.1.1 Release Notes",
    "version": "8.1.1",
    "platform": null,
    "product": "Release Notes",
    "language": "en-US",
    "page_count": 1,
    "mode": "single",
    "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.1",
    "dates": {
      "Published": "March 2026"
    },
    "landing_page": "sd00007609en_us",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007609en_us"
  },
  {
    "slug": "hvm_release_notes_8_1_2",
    "doc_id": "sd00007734en_us",
    "title": "v8.1.2 Release Notes",
    "version": "8.1.2",
    "platform": null,
    "product": "Release Notes",
    "language": "en-US",
    "page_count": 1,
    "mode": "single",
    "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.2",
    "dates": {
      "Published": "April 2026"
    },
    "landing_page": "sd00007734en_us",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007734en_us"
  },
  {
    "slug": "hvm_deployment_guide",
    "doc_id": "sd00007332en_us",
    "title": "HPE Morpheus VM Essentials Deployment Guide",
    "version": null,
    "platform": null,
    "product": "Deployment Guide",
    "language": "en-US",
    "page_count": 42,
    "mode": "toc",
    "abstract": "HPE Morpheus VM Essentials Deployment Guide",
    "dates": {
      "Published": "January 2026"
    },
    "landing_page": "GUID-BF94B8DA-C4F6-4CDF-99E6-0AAA03177099",
    "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007332en_us"
  }
 ]
@@ -0,0 +1,10 @@
 # Dev/CPU reranker — only for running scripts/rerank_server.py locally.
 # Production uses the llama.cpp + jina-reranker GGUF sidecar (see
 # deploy/docker-compose.yml). Install with:
 #
 #   pip install -r requirements-rerank.txt
 #
 # This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder
 # (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main
 # requirements.txt so the production image stays slim.
 sentence-transformers>=3.0
@@ -10,10 +10,17 @@ ollama>=0.4.0      # if using Ollama-hosted embedder; swap if not
 # Scraping (Phase 1; adjust per product)
 beautifulsoup4>=4.12
 requests>=2.31
 markdownify>=0.11
 # playwright>=1.40  # uncomment if you need headless browser fallback
 # Evaluation
 numpy>=1.26
 # Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server
 # only needs httpx (declared above) to call it. For the dev / CPU
 # fallback reranker (scripts/rerank_server.py), install
 # requirements-rerank.txt separately — it pulls in PyTorch which would
 # triple the production image size.
 # Dev / utility
 python-dateutil>=2.8
@@ -0,0 +1,170 @@
 """Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.
 Bundle IDs are declared statically here because HPE mints a new docId
 per product version rather than versioning a single doc (see
 ~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
 version drops, add a new entry to BUNDLES and re-run; the runner will
 pick it up on the next pass.
 For each bundle this script:
  1. GETs /hpesc/public/api/document/{docId}        → abstract HTML
  2. GETs /hpesc/public/api/document/{docId}/toc    → page tree (or 404 for single-doc)
  3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
 """
 from __future__ import annotations
 import argparse
 import json
 import re
 import sys
 import time
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 import requests
 from bs4 import BeautifulSoup
 # Base URL of the DocPortal JSON API; "/{doc_id}" and "/{doc_id}/toc" are appended by discover_bundle().
 API = "https://support.hpe.com/hpesc/public/api/document"
 # Template for the human-facing viewer URL, stored as each bundle's "source_url".
 DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
 # User-Agent header sent with every request made through _session().
 UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
 # Directory two levels above this file (the repo root, per the module docstring).
 ROOT = Path(__file__).resolve().parent.parent
 # Default output path for the generated manifest (overridable with --out).
 BUNDLES_JSON = ROOT / "bundles.json"
@dataclass
 class BundleSpec:
    """Static declaration of one DocPortal document to discover.

    Entries are listed in BUNDLES and turned into bundles.json records by
    discover_bundle(), which copies most fields through unchanged.
    """
    slug: str  # bundle identifier, written to bundles.json as "slug"
    doc_id: str  # DocPortal document id, e.g. "sd00007520en_us"
    title: str  # fallback title, used when the abstract HTML yields none
    version: str | None  # product version string, or None for an unversioned doc
    product: str  # e.g. "User Manual", "Release Notes", "Deployment Guide"
    mode: str    # "toc" or "single"
    platform: str | None = None  # copied through to bundles.json as "platform"
    language: str = "en-US"  # copied through to bundles.json as "language"
 # Declared bundles. Versions confirmed 2026-05-22 by probing the docId
 # range sd00007400..7740 for `v8.1.x` matches in the abstract.
 # Positional fields, in BundleSpec order: slug, doc_id, title, version,
 # product, mode. The title is only a fallback: discover_bundle() prefers
 # the title parsed from the abstract HTML when one is present.
 BUNDLES: list[BundleSpec] = [
    BundleSpec("hvm_user_manual_8_1_0",   "sd00007520en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.0", "User Manual",      "toc"),
    BundleSpec("hvm_user_manual_8_1_1",   "sd00007620en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.1", "User Manual",      "toc"),
    BundleSpec("hvm_user_manual_8_1_2",   "sd00007735en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.2", "User Manual",      "toc"),
    BundleSpec("hvm_release_notes_8_1_0", "sd00007497en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.0", "Release Notes",   "single"),
    BundleSpec("hvm_release_notes_8_1_1", "sd00007609en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.1", "Release Notes",   "single"),
    BundleSpec("hvm_release_notes_8_1_2", "sd00007734en_us", "HPE Morpheus VM Essentials Software Release Notes",  "8.1.2", "Release Notes",   "single"),
    BundleSpec("hvm_deployment_guide",    "sd00007332en_us", "HPE Morpheus VM Essentials Deployment Guide",        None,    "Deployment Guide","toc"),
 ]
 def _session() -> requests.Session:
    """Create a requests session that carries the scraper's default headers."""
    session = requests.Session()
    # Identify the scraper and state which response types it can handle.
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
 def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET *url* with bounded retries and exponential backoff.

    Args:
        s: session used to issue the request.
        url: absolute URL to fetch.
        expect_json: when True, return the decoded JSON body instead of text.
        retries: maximum number of attempts.

    Returns:
        The response text (or decoded JSON) on HTTP 200, or None on HTTP 404.

    Raises:
        RuntimeError: every attempt ended in a retryable status
            (429/500/502/503/504), or ``retries`` was not positive.
        requests.HTTPError: any other error status, via raise_for_status().
        requests.ConnectionError, requests.Timeout: the transport failed on
            the final attempt; earlier transport failures are retried.
    """
    delay = 1.0
    for attempt in range(retries):
        final_attempt = attempt == retries - 1
        try:
            r = s.get(url, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network trouble is as retryable as a 503; surface the
            # original exception only once the attempts are used up.
            if final_attempt:
                raise
            time.sleep(delay)
            delay *= 2
            continue
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in (429, 500, 502, 503, 504):
            # No point backing off when there is no further attempt to make.
            if not final_attempt:
                time.sleep(delay)
                delay *= 2
            continue
        # Other statuses: raise on 4xx/5xx; if this does not raise (e.g. an
        # unexpected 2xx/3xx) the loop simply tries again.
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")
 def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
    """Count linked TOC nodes and find the first page GUID.

    Returns ``(page_count, landing_page_guid)``. Every node with a non-empty
    ``topicLink`` is counted; the landing page is the first link, in document
    order, that carries a ``page=GUID-....html`` reference (None if there is
    no such link).
    """
    if not toc:
        return 0, None

    total = 0
    first_guid = None
    # Explicit stack; children are pushed reversed so nodes pop in document order.
    pending = list(reversed(toc))
    while pending:
        item = pending.pop()
        href = item.get("topicLink")
        if href:
            total += 1
            hit = re.search(r"page=(GUID-[A-F0-9-]+)\.html", href)
            if first_guid is None and hit:
                first_guid = hit.group(1)
        pending.extend(reversed(item.get("children") or []))
    return total, first_guid
 def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract text and published date from the abstract HTML.

    Only the keys whose element is present in *html* appear in the result:
    "title", "abstract" and "published".
    """
    doc = BeautifulSoup(html, "html.parser")
    # (result key, CSS selector), in the order the keys are inserted.
    lookups = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    found: dict[str, str] = {}
    for key, selector in lookups:
        tag = doc.select_one(selector)
        if tag is None:
            continue
        text = tag.get_text(" ", strip=True)
        if key == "published":
            # Keep just the date, without its "Published:" label.
            text = text.replace("Published:", "").strip()
        found[key] = text
    return found
 def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Fetch one bundle's metadata and return its bundles.json record.

    Args:
        s: session used for the API requests.
        spec: static declaration of the bundle; it is read, never modified.

    Returns:
        A dict with the keys slug, doc_id, title, version, platform, product,
        language, page_count, mode, abstract, dates, landing_page, source_url.
        When a "toc" bundle turns out to have an empty (or missing) TOC, the
        record is emitted with mode "single", one page, and the doc id as its
        landing page.
    """
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    # A 404 yields None; parse an empty document so every field falls back.
    meta = _parse_abstract(abstract_html or "")

    # Work on a local copy of the mode: spec is a shared module-level entry,
    # and writing the fallback into it would leak into later calls.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        page_count, landing = _count_toc(toc)
        if page_count == 0:
            print(f"  ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"
            page_count, landing = 1, spec.doc_id

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": DOC_URL.format(doc_id=spec.doc_id),
    }
 def main() -> int:
    """Discover every declared bundle and write the manifest; return exit code 0."""
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    opts = parser.parse_args()

    session = _session()
    records: list[dict[str, Any]] = []
    for spec in BUNDLES:
        # Progress goes to stderr so stdout stays clean.
        print(f"  • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        records.append(discover_bundle(session, spec))

    manifest = json.dumps(records, indent=2) + "\n"
    Path(opts.out).write_text(manifest)
    total_pages = sum(record["page_count"] for record in records)
    print(f"wrote {opts.out}: {len(records)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,325 @@
 """Scrape HVM doc bundles into corpus/<slug>/<page_id>.{md,json}.
 Reads bundles.json (produced by scrape.bundles), then for each bundle:
  - mode="toc":    walks the TOC tree, fetches each page via the render
                   endpoint, converts page_html to markdown, writes
                   <page_id>.md + <page_id>.json sidecar.
  - mode="single": fetches /document/{docId} directly, treats the whole
                   body as one page with page_id = doc_id.
 After all bundles are on disk, runs a finalize pass that synthesizes
 topic_cluster.clustered_topics for each page by looking up the same
 GUID in sibling bundles (HPE GUIDs are stable across versions — see
 reference_hpe_docs_portal_api.md).
 Usage:
    python -m scrape.runner --all
    python -m scrape.runner --bundle hvm_user_manual_8_1_2
    python -m scrape.runner --all --force        # re-download already-on-disk pages
    python -m scrape.runner --finalize-only      # only redo the topic_cluster pass
 """
 from __future__ import annotations
 import argparse
 import json
 import re
 import sys
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 import requests
 from bs4 import BeautifulSoup
 from markdownify import markdownify as md
 # Base URL of the DocPortal JSON API; "/{doc_id}", "/{doc_id}/toc" and
 # "/{doc_id}/render?page=..." are appended by the fetch helpers below.
 API = "https://support.hpe.com/hpesc/public/api/document"
 # Viewer URL template for one page of a TOC bundle (stored as a sidecar's "source_url").
 DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html"
 # Viewer URL template for a single-doc bundle, which has no per-page GUID.
 DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
 # User-Agent header sent with every request made through _session().
 UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
 # Directory two levels above this file; corpus/ and bundles.json live under it.
 ROOT = Path(__file__).resolve().parent.parent
 # Output root: pages are written to corpus/<slug>/<page_id>.{md,json}.
 CORPUS = ROOT / "corpus"
 # Bundle manifest read by main().
 BUNDLES_JSON = ROOT / "bundles.json"
 # Captures the page GUID from a TOC topicLink ("page=GUID-....html");
 # only uppercase hex digits and dashes are accepted after "GUID-".
 GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")
@dataclass
 class TocEntry:
    """One scrapeable page discovered while flattening a bundle's TOC tree."""
    page_id: str  # GUID captured from the node's topicLink by GUID_RE
    title: str  # the node's topicName ("" when absent)
    ordinal: int  # 1-based position among linked pages, in document order
    parent_title: str | None  # title of the nearest ancestor that has one, else None
 def _session() -> requests.Session:
    """Create a requests session that carries the scraper's default headers."""
    session = requests.Session()
    # Identify the scraper and state which response types it can handle.
    session.headers["User-Agent"] = UA
    session.headers["Accept"] = "application/json, text/html"
    return session
 def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
    """GET *url* with bounded retries and exponential backoff.

    Args:
        s: session used to issue the request.
        url: absolute URL to fetch.
        expect_json: when True, return the decoded JSON body instead of text.
        retries: maximum number of attempts.

    Returns:
        The response text (or decoded JSON) on HTTP 200, or None on HTTP 404.

    Raises:
        RuntimeError: every attempt ended in a retryable status
            (429/500/502/503/504), or ``retries`` was not positive.
        requests.HTTPError: any other error status, via raise_for_status().
        requests.ConnectionError, requests.Timeout: the transport failed on
            the final attempt; earlier transport failures are retried.
    """
    delay = 1.0
    for attempt in range(retries):
        final_attempt = attempt == retries - 1
        try:
            r = s.get(url, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network trouble is as retryable as a 503; surface the
            # original exception only once the attempts are used up.
            if final_attempt:
                raise
            time.sleep(delay)
            delay *= 2
            continue
        if r.status_code == 200:
            return r.json() if expect_json else r.text
        if r.status_code == 404:
            return None
        if r.status_code in (429, 500, 502, 503, 504):
            # No point backing off when there is no further attempt to make.
            if not final_attempt:
                time.sleep(delay)
                delay *= 2
            continue
        # Other statuses: raise on 4xx/5xx; if this does not raise (e.g. an
        # unexpected 2xx/3xx) the loop simply tries again.
        r.raise_for_status()
    raise RuntimeError(f"GET failed after {retries} retries: {url}")
 def _flatten_toc(toc: list[dict]) -> list[TocEntry]:
    """Flatten the TOC tree into its linked pages, in document order.

    A node becomes a TocEntry only when its ``topicLink`` contains a page
    GUID. Ordinals start at 1 and follow a pre-order walk. ``parent_title``
    is the title of the nearest ancestor that has a non-empty one.
    """
    flat: list[TocEntry] = []
    # Stack of (node, inherited parent title); pushed reversed so that pops
    # come out in the same order a recursive pre-order walk would visit them.
    pending = [(item, None) for item in reversed(toc or [])]
    while pending:
        item, ancestor = pending.pop()
        name = item.get("topicName") or ""
        hit = GUID_RE.search(item.get("topicLink") or "")
        if hit:
            flat.append(
                TocEntry(page_id=hit.group(1), title=name, ordinal=len(flat) + 1, parent_title=ancestor)
            )
        # Untitled nodes pass their own parent title down to their children.
        inherited = name or ancestor
        pending.extend((kid, inherited) for kid in reversed(item.get("children") or []))
    return flat
 def _strip_dita_wrappers(html: str) -> str:
    """Drop repeated boilerplate sections and return the page's main content.

    Sections headed "Notices", "Acknowledgments" or "Abstract" are removed:
    the same legal and trademark text appears on every document, and leaving
    it in would put identical text into every chunk and pollute retrieval.

    Returns the serialized ``<main>`` element when the page has one, otherwise
    the whole cleaned document.
    """
    doc = BeautifulSoup(html, "html.parser")
    skip_titles = {"Notices", "Acknowledgments", "Abstract"}
    heading_tags = ["h1", "h2"]

    # Wrapped form: a container whose first direct h1/h2 child is boilerplate.
    for container in doc.select("article, section, div"):
        lead = container.find(heading_tags, recursive=False)
        if lead and lead.get_text(strip=True) in skip_titles:
            container.decompose()

    # Unwrapped form: a bare boilerplate heading followed by its content sibling.
    for heading in doc.find_all(heading_tags):
        if heading.get_text(strip=True) not in skip_titles:
            continue
        follower = heading.find_next_sibling()
        if follower and follower.name in {"div", "section"}:
            classes = " ".join(follower.get("class", []) or [])
            # Only take the sibling along when its class marks it as the heading's body.
            if any(marker in classes for marker in ("desc", "body", "notices")):
                follower.decompose()
        heading.decompose()

    main_el = doc.find("main")
    return str(main_el) if main_el else str(doc)
 def html_to_md(page_html: str) -> str:
    """Convert page HTML to markdown that ends with exactly one newline."""
    markdown = md(_strip_dita_wrappers(page_html), heading_style="ATX", bullets="-")
    # Squeeze any run of three or more newlines down to a single blank line.
    squeezed = re.sub(r"\n{3,}", "\n\n", markdown)
    return f"{squeezed.strip()}\n"
 def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str:
    """Return the rendered HTML of one TOC page, or "" when it is unavailable."""
    render_url = f"{API}/{doc_id}/render?page={page_id}.html"
    payload = _get(s, render_url, expect_json=True)
    # A missing page (None) or an empty payload both yield the empty string.
    return (payload.get("page_html") or "") if payload else ""
 def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]:
    """Fetch a single-doc bundle; return ``(page_html, title)``.

    Both values are "" when the document has no body. The title falls back
    to *doc_id* when the HTML has no title heading.
    """
    body = _get(s, f"{API}/{doc_id}")
    if not body:
        return "", ""
    heading = BeautifulSoup(body, "html.parser").select_one("h1.title.topictitle1")
    if heading:
        title = heading.get_text(" ", strip=True)
    else:
        title = doc_id
    return body, title
 def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool:
    """Write one page's markdown and JSON sidecar into *bundle_dir*.

    Args:
        bundle_dir: target directory; created (with parents) if missing.
        page_id: file stem for ``<page_id>.md`` and ``<page_id>.json``.
        body_md: markdown body of the page.
        sidecar: JSON-serializable metadata stored next to the markdown.
        force: overwrite even when both files already exist.

    Returns:
        True when the files were written, False when the page was skipped
        because both files are already on disk and *force* is false.
    """
    bundle_dir.mkdir(parents=True, exist_ok=True)
    md_path = bundle_dir / f"{page_id}.md"
    json_path = bundle_dir / f"{page_id}.json"
    # Both files must exist to skip, so a half-written page gets repaired.
    if not force and md_path.exists() and json_path.exists():
        return False
    # Explicit UTF-8: scraped docs contain non-ASCII characters, and the
    # platform default encoding is not UTF-8 everywhere.
    md_path.write_text(body_md, encoding="utf-8")
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    return True
 def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int:
    """Scrape every page of a TOC-mode bundle into ``corpus/<slug>/``.

    Args:
        s: session shared by all worker threads.
        bundle: one bundles.json record; "doc_id" and "slug" are required,
            "version" and "product" are copied into each sidecar if present.
        force: re-download and overwrite pages that are already on disk.
        concurrency: number of worker threads (values below 1 are treated as 1).

    Returns:
        The number of pages written during this call.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    bundle_dir = CORPUS / slug
    toc = _get(s, f"{API}/{doc_id}/toc", expect_json=True) or []
    entries = _flatten_toc(toc)
    print(f"  {slug}: {len(entries)} pages", file=sys.stderr)

    def already_on_disk(page_id: str) -> bool:
        return (bundle_dir / f"{page_id}.md").exists() and (bundle_dir / f"{page_id}.json").exists()

    def do_one(entry: TocEntry) -> bool:
        # Check before fetching: without --force an existing page would be
        # skipped by write_page anyway, so downloading it is wasted traffic.
        if not force and already_on_disk(entry.page_id):
            return False
        page_html = fetch_toc_page(s, doc_id, entry.page_id)
        if not page_html:
            return False
        body_md = html_to_md(page_html)
        sidecar = {
            "bundle_id": slug,
            "page_id": entry.page_id,
            "title": entry.title,
            "ordinal": entry.ordinal,
            "parent_title": entry.parent_title,
            "doc_id": doc_id,
            "version": bundle.get("version"),
            "product": bundle.get("product"),
            "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id),
            # topic_cluster filled in by finalize()
        }
        return write_page(bundle_dir, entry.page_id, body_md, sidecar, force)

    written = 0
    # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least one.
    with ThreadPoolExecutor(max_workers=max(1, concurrency)) as pool:
        futures = [pool.submit(do_one, e) for e in entries]
        for fut in as_completed(futures):
            # result() re-raises any exception from the worker.
            if fut.result():
                written += 1
    return written
 def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int:
    """Scrape a single-doc bundle as one page whose page_id is the doc id.

    Args:
        s: session used for the request.
        bundle: one bundles.json record; "doc_id", "slug" and "title" are
            read, "version" and "product" are copied into the sidecar.
        force: re-download and overwrite a page that is already on disk.

    Returns:
        1 when the page was written, 0 when it was skipped or had no body.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    bundle_dir = CORPUS / slug

    # Check before fetching: without --force an existing page would be
    # skipped by write_page anyway, so downloading it is wasted traffic.
    on_disk = (bundle_dir / f"{doc_id}.md").exists() and (bundle_dir / f"{doc_id}.json").exists()
    if on_disk and not force:
        print(f"  {slug}: 1 page (single-doc)", file=sys.stderr)
        return 0

    html, title = fetch_single_doc(s, doc_id)
    if not html:
        print(f"  ! {slug}: empty body", file=sys.stderr)
        return 0
    body_md = html_to_md(html)
    sidecar = {
        "bundle_id": slug,
        "page_id": doc_id,
        "title": title or bundle["title"],
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": bundle.get("version"),
        "product": bundle.get("product"),
        "source_url": DOC_URL_SINGLE.format(doc_id=doc_id),
    }
    print(f"  {slug}: 1 page (single-doc)", file=sys.stderr)
    return 1 if write_page(bundle_dir, doc_id, body_md, sidecar, force) else 0
 def _write_cluster_group(peers: list[tuple[str, Path, dict]]) -> int:
    """Store topic_cluster in every sidecar of one peer group; return how many were rewritten."""
    if len(peers) < 2:
        # A page with no sibling in another bundle gets no cluster entry.
        return 0
    for slug, jp, data in peers:
        others = [
            {"bundle_id": s2, "page_id": d2["page_id"], "clustering_title": d2.get("title", "")}
            for s2, _, d2 in peers if s2 != slug
        ]
        data["topic_cluster"] = {"clustering_title": data.get("title", ""), "clustered_topics": others}
        jp.write_text(json.dumps(data, indent=2) + "\n", encoding="utf-8")
    return len(peers)
 def finalize_clusters(bundles: list[dict]) -> int:
    """Cross-link sibling pages across bundles via each sidecar's topic_cluster.

    TOC pages (page_id starts with "GUID-") are grouped by GUID: the same
    GUID in two bundles is treated as the same underlying topic. All other
    pages (single-doc bundles, where page_id == doc_id) are grouped by the
    bundle's `product` field, e.g. Release Notes 8.1.0/.1/.2.

    Args:
        bundles: bundles.json records; "slug" and "product" are read.
            Bundles whose corpus directory does not exist are skipped.

    Returns:
        The number of sidecar files rewritten.
    """
    # GUID → list[(slug, sidecar_path, sidecar_dict)]
    guid_to_pages: dict[str, list[tuple[str, Path, dict]]] = {}
    # product → list[(slug, sidecar_path, sidecar_dict)] for single-doc peering
    product_to_pages: dict[str, list[tuple[str, Path, dict]]] = {}
    for b in bundles:
        slug = b["slug"]
        bundle_dir = CORPUS / slug
        if not bundle_dir.exists():
            continue
        for jp in bundle_dir.glob("*.json"):
            data = json.loads(jp.read_text(encoding="utf-8"))
            pid = data["page_id"]
            if pid.startswith("GUID-"):
                guid_to_pages.setdefault(pid, []).append((slug, jp, data))
            else:
                product_to_pages.setdefault(b["product"], []).append((slug, jp, data))

    updated = 0
    # Both groupings are linked the same way, so they share one helper.
    for groups in (guid_to_pages, product_to_pages):
        for peers in groups.values():
            updated += _write_cluster_group(peers)
    return updated
 def main() -> int:
    """Command-line entry point: scrape the selected bundles, then finalize.

    Returns:
        0 on success; 2 when bundles.json is missing, when no bundle was
        selected, or when none of the requested slugs exist.
    """
    p = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.")
    p.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json")
    p.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)")
    p.add_argument("--force", action="store_true", help="re-fetch pages already on disk")
    p.add_argument("--concurrency", type=int, default=6)
    p.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields")
    args = p.parse_args()
    if args.concurrency < 1:
        # Reject up front instead of failing inside the thread pool mid-run.
        p.error("--concurrency must be at least 1")

    if not BUNDLES_JSON.exists():
        print("bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr)
        return 2
    all_bundles = json.loads(BUNDLES_JSON.read_text(encoding="utf-8"))

    if args.finalize_only:
        n = finalize_clusters(all_bundles)
        print(f"finalize: updated topic_cluster on {n} sidecars", file=sys.stderr)
        return 0

    if args.bundle:
        bundles = [b for b in all_bundles if b["slug"] in args.bundle]
        if not bundles:
            print(f"no bundles matched: {args.bundle}", file=sys.stderr)
            return 2
        # A mistyped slug next to a valid one would otherwise be skipped silently.
        known = {b["slug"] for b in all_bundles}
        unknown = [slug for slug in args.bundle if slug not in known]
        if unknown:
            print(f"warning: ignoring unknown bundle slug(s): {unknown}", file=sys.stderr)
    elif args.all:
        bundles = all_bundles
    else:
        print("specify --all or --bundle <slug>", file=sys.stderr)
        return 2

    s = _session()
    total = 0
    for b in bundles:
        if b.get("mode") == "single":
            total += scrape_single_bundle(s, b, args.force)
        else:
            total += scrape_toc_bundle(s, b, args.force, args.concurrency)
    print(f"scraped {total} new/updated pages", file=sys.stderr)

    # Always finalize over every bundle, not just the scraped subset, so
    # sidecars stay consistent across versions.
    n = finalize_clusters(all_bundles)
    print(f"finalize: updated topic_cluster on {n} sidecars", file=sys.stderr)
    return 0
 if __name__ == "__main__":
    sys.exit(main())