scrape: HVM bundles + runner for HPE Support DocPortal

Phase 1: scrape User Manual (8.1.0/.1/.2), Release Notes (8.1.0/.1/.2),
and the unversioned Deployment Guide. Total ~1,160 pages, 9.7 MB markdown.

Discovers via the anonymous JSON API at /hpesc/public/api/document/{docId}:
/toc walks the page tree (for TOC-paginated docs), /render?page=GUID
fetches per-page HTML, /document/{docId} returns the whole body for
single-doc shapes like Release Notes.

Runner converts DITA-source HTML to clean markdown (strips Notices/
Acknowledgments/Abstract boilerplate), writes corpus/<bundle>/<page>.{md,json},
then a finalize pass synthesizes topic_cluster.clustered_topics by GUID
overlap across versions (HPE GUIDs are stable cross-version — confirmed
374/376/376 with 100% overlap on shared pages).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-22 13:06:26 -04:00
parent 43728320bf
commit 7a491ba9e4
5 changed files with 633 additions and 0 deletions
+121
View File
@@ -0,0 +1,121 @@
[
{
"slug": "hvm_user_manual_8_1_0",
"doc_id": "sd00007520en_us",
"title": "HPE Morpheus VM Essentials Software Documentation",
"version": "8.1.0",
"platform": null,
"product": "User Manual",
"language": "en-US",
"page_count": 378,
"mode": "toc",
"abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.0",
"dates": {
"Published": "February 2026"
},
"landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007520en_us"
},
{
"slug": "hvm_user_manual_8_1_1",
"doc_id": "sd00007620en_us",
"title": "HPE Morpheus VM Essentials Software Documentation",
"version": "8.1.1",
"platform": null,
"product": "User Manual",
"language": "en-US",
"page_count": 380,
"mode": "toc",
"abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.1",
"dates": {
"Published": "March 2026"
},
"landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007620en_us"
},
{
"slug": "hvm_user_manual_8_1_2",
"doc_id": "sd00007735en_us",
"title": "HPE Morpheus VM Essentials Software Documentation",
"version": "8.1.2",
"platform": null,
"product": "User Manual",
"language": "en-US",
"page_count": 380,
"mode": "toc",
"abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.2",
"dates": {
"Published": "April 2026"
},
"landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007735en_us"
},
{
"slug": "hvm_release_notes_8_1_0",
"doc_id": "sd00007497en_us",
"title": "v8.1.0 Release Notes",
"version": "8.1.0",
"platform": null,
"product": "Release Notes",
"language": "en-US",
"page_count": 1,
"mode": "single",
"abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.0",
"dates": {
"Published": "February 2026"
},
"landing_page": "sd00007497en_us",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007497en_us"
},
{
"slug": "hvm_release_notes_8_1_1",
"doc_id": "sd00007609en_us",
"title": "v8.1.1 Release Notes",
"version": "8.1.1",
"platform": null,
"product": "Release Notes",
"language": "en-US",
"page_count": 1,
"mode": "single",
"abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.1",
"dates": {
"Published": "March 2026"
},
"landing_page": "sd00007609en_us",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007609en_us"
},
{
"slug": "hvm_release_notes_8_1_2",
"doc_id": "sd00007734en_us",
"title": "v8.1.2 Release Notes",
"version": "8.1.2",
"platform": null,
"product": "Release Notes",
"language": "en-US",
"page_count": 1,
"mode": "single",
"abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.2",
"dates": {
"Published": "April 2026"
},
"landing_page": "sd00007734en_us",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007734en_us"
},
{
"slug": "hvm_deployment_guide",
"doc_id": "sd00007332en_us",
"title": "HPE Morpheus VM Essentials Deployment Guide",
"version": null,
"platform": null,
"product": "Deployment Guide",
"language": "en-US",
"page_count": 42,
"mode": "toc",
"abstract": "HPE Morpheus VM Essentials Deployment Guide",
"dates": {
"Published": "January 2026"
},
"landing_page": "GUID-BF94B8DA-C4F6-4CDF-99E6-0AAA03177099",
"source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007332en_us"
}
]
+10
View File
@@ -0,0 +1,10 @@
# Dev/CPU reranker — only for running scripts/rerank_server.py locally.
# Production uses the llama.cpp + jina-reranker GGUF sidecar (see
# deploy/docker-compose.yml). Install with:
#
# pip install -r requirements-rerank.txt
#
# This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder
# (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main
# requirements.txt so the production image stays slim.
sentence-transformers>=3.0
+7
View File
@@ -10,10 +10,17 @@ ollama>=0.4.0 # if using Ollama-hosted embedder; swap if not
# Scraping (Phase 1; adjust per product) # Scraping (Phase 1; adjust per product)
beautifulsoup4>=4.12 beautifulsoup4>=4.12
requests>=2.31 requests>=2.31
markdownify>=0.11
# playwright>=1.40 # uncomment if you need headless browser fallback # playwright>=1.40 # uncomment if you need headless browser fallback
# Evaluation # Evaluation
numpy>=1.26 numpy>=1.26
# Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server
# only needs httpx (declared above) to call it. For the dev / CPU
# fallback reranker (scripts/rerank_server.py), install
# requirements-rerank.txt separately — it pulls in PyTorch which would
# triple the production image size.
# Dev / utility # Dev / utility
python-dateutil>=2.8 python-dateutil>=2.8
+170
View File
@@ -0,0 +1,170 @@
"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json.
Bundle IDs are declared statically here because HPE mints a new docId
per product version rather than versioning a single doc (see
~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new
version drops, add a new entry to BUNDLES and re-run; the runner will
pick it up on the next pass.
For each bundle this script:
1. GETs /hpesc/public/api/document/{docId} → abstract HTML
2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc)
3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents.
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
# Base URL of the DocPortal document API; a docId (and an optional sub-path
# such as /toc) is appended to it by the callers below.
API = "https://support.hpe.com/hpesc/public/api/document"
# Viewer URL template recorded as each bundle's "source_url".
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent header value installed on the session by _session().
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Directory two levels above this file (the parent of this file's directory).
ROOT = Path(__file__).resolve().parent.parent
# Default output path for the generated manifest (overridable via --out).
BUNDLES_JSON = ROOT / "bundles.json"
@dataclass
class BundleSpec:
    """Static declaration of one DocPortal document to discover.

    Instances are listed in BUNDLES; discover_bundle() combines each one with
    metadata fetched from the portal to build a bundles.json entry.
    """

    slug: str  # bundle identifier, emitted as "slug" in bundles.json
    doc_id: str  # DocPortal docId, e.g. "sd00007520en_us"
    title: str  # fallback title, used when the fetched abstract has none
    version: str | None  # product version, or None for an unversioned doc
    product: str  # e.g. "User Manual", "Release Notes", "Deployment Guide"
    mode: str  # "toc" or "single"
    platform: str | None = None
    language: str = "en-US"
# Declared bundles. Versions confirmed 2026-05-22 by probing the docId
# range sd00007400..7740 for `v8.1.x` matches in the abstract.
# Each entry passes BundleSpec's fields positionally, in this order:
# slug, doc_id, title, version, product, mode. The Deployment Guide entry
# passes version=None.
BUNDLES: list[BundleSpec] = [
BundleSpec("hvm_user_manual_8_1_0", "sd00007520en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.0", "User Manual", "toc"),
BundleSpec("hvm_user_manual_8_1_1", "sd00007620en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.1", "User Manual", "toc"),
BundleSpec("hvm_user_manual_8_1_2", "sd00007735en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.2", "User Manual", "toc"),
BundleSpec("hvm_release_notes_8_1_0", "sd00007497en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.0", "Release Notes", "single"),
BundleSpec("hvm_release_notes_8_1_1", "sd00007609en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.1", "Release Notes", "single"),
BundleSpec("hvm_release_notes_8_1_2", "sd00007734en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.2", "Release Notes", "single"),
BundleSpec("hvm_deployment_guide", "sd00007332en_us", "HPE Morpheus VM Essentials Deployment Guide", None, "Deployment Guide","toc"),
]
def _session() -> requests.Session:
    """Create a requests session that carries this scraper's default headers."""
    session = requests.Session()
    default_headers = {
        "User-Agent": UA,
        "Accept": "application/json, text/html",
    }
    session.headers.update(default_headers)
    return session
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
delay = 1.0
for attempt in range(retries):
r = s.get(url, timeout=30)
if r.status_code == 200:
return r.json() if expect_json else r.text
if r.status_code == 404:
return None
if r.status_code in (429, 500, 502, 503, 504):
time.sleep(delay)
delay *= 2
continue
r.raise_for_status()
raise RuntimeError(f"GET failed after {retries} retries: {url}")
def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]:
"""Returns (page_count, landing_page_guid)."""
if not toc:
return 0, None
landing = None
n = 0
def walk(nodes: list[dict] | None, depth: int) -> None:
nonlocal n, landing
for node in nodes or []:
link = node.get("topicLink")
if link:
n += 1
m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link)
if m and landing is None:
landing = m.group(1)
walk(node.get("children"), depth + 1)
walk(toc, 0)
return n, landing
def _parse_abstract(html: str) -> dict[str, str]:
    """Extract title, abstract text and published date from the abstract HTML.

    Only keys whose element is present appear in the result: ``title``
    (``h1.title.topictitle1``), ``abstract`` (``div.desc``) and ``published``
    (``div.publishedDate`` with its "Published:" label removed).
    """
    soup = BeautifulSoup(html, "html.parser")
    selectors = (
        ("title", "h1.title.topictitle1"),
        ("abstract", "div.desc"),
        ("published", "div.publishedDate"),
    )
    found: dict[str, str] = {}
    for key, css in selectors:
        node = soup.select_one(css)
        if not node:
            continue
        text = node.get_text(" ", strip=True)
        if key == "published":
            text = text.replace("Published:", "").strip()
        found[key] = text
    return found
def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]:
    """Build the bundles.json entry for one declared bundle.

    Fetches the document's abstract HTML for title/abstract/published date
    and, for "toc" bundles, the TOC to count pages and find the landing page.
    A "toc" bundle whose TOC is missing or has no linked pages is recorded as
    "single" with the docId as its only page.

    Args:
        s: Session used for the portal requests.
        spec: The static bundle declaration. It is read, never modified.

    Returns:
        A dict with the keys written to bundles.json (slug, doc_id, title,
        version, platform, product, language, page_count, mode, abstract,
        dates, landing_page, source_url).
    """
    abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False)
    meta = _parse_abstract(abstract_html or "")

    # Resolve the effective mode in a local: *spec* is an entry of the
    # module-level BUNDLES list, and rewriting it here would silently change
    # the declaration for every later use in the same process.
    mode = spec.mode
    page_count: int = 1
    landing: str | None = spec.doc_id
    if mode == "toc":
        toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True)
        toc_pages, toc_landing = _count_toc(toc)
        if toc_pages:
            page_count, landing = toc_pages, toc_landing
        else:
            print(f" ! {spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr)
            mode = "single"

    return {
        "slug": spec.slug,
        "doc_id": spec.doc_id,
        # Prefer the title the portal reports; fall back to the declared one.
        "title": meta.get("title") or spec.title,
        "version": spec.version,
        "platform": spec.platform,
        "product": spec.product,
        "language": spec.language,
        "page_count": page_count,
        "mode": mode,
        "abstract": meta.get("abstract", ""),
        "dates": {"Published": meta.get("published", "")},
        "landing_page": landing,
        "source_url": DOC_URL.format(doc_id=spec.doc_id),
    }
def main() -> int:
    """Discover every declared bundle and write the manifest; return exit code 0."""
    parser = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.")
    parser.add_argument("--out", default=str(BUNDLES_JSON))
    options = parser.parse_args()

    session = _session()
    discovered: list[dict[str, Any]] = []
    for spec in BUNDLES:
        print(f"{spec.slug} ({spec.doc_id}) ...", file=sys.stderr)
        entry = discover_bundle(session, spec)
        discovered.append(entry)

    manifest = json.dumps(discovered, indent=2) + "\n"
    Path(options.out).write_text(manifest)

    total_pages = sum(entry["page_count"] for entry in discovered)
    print(f"wrote {options.out}: {len(discovered)} bundles, {total_pages} pages total", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
+325
View File
@@ -0,0 +1,325 @@
"""Scrape HVM doc bundles into corpus/<slug>/<page_id>.{md,json}.
Reads bundles.json (produced by scrape.bundles), then for each bundle:
- mode="toc": walks the TOC tree, fetches each page via the render
endpoint, converts page_html to markdown, writes
<page_id>.md + <page_id>.json sidecar.
- mode="single": fetches /document/{docId} directly, treats the whole
body as one page with page_id = doc_id.
After all bundles are on disk, runs a finalize pass that synthesizes
topic_cluster.clustered_topics for each page by looking up the same
GUID in sibling bundles (HPE GUIDs are stable across versions — see
reference_hpe_docs_portal_api.md).
Usage:
python -m scrape.runner --all
python -m scrape.runner --bundle hvm_user_manual_8_1_2
python -m scrape.runner --all --force # re-download already-on-disk pages
python -m scrape.runner --finalize-only # only redo the topic_cluster pass
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# Base URL of the DocPortal document API; callers append a docId plus
# /toc or /render?page=... as needed.
API = "https://support.hpe.com/hpesc/public/api/document"
# Viewer URL template for one page of a TOC bundle (sidecar "source_url").
DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html"
# Viewer URL template for a single-doc bundle, which has no page parameter.
DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}"
# User-Agent header value installed on the session by _session().
UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)"
# Directory two levels above this file (the parent of this file's directory).
ROOT = Path(__file__).resolve().parent.parent
# Output tree: one sub-directory per bundle slug.
CORPUS = ROOT / "corpus"
# Manifest read by main(); it must exist before scraping.
BUNDLES_JSON = ROOT / "bundles.json"
# Captures the page GUID from a TOC topicLink of the form "...page=GUID-....html".
GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html")
@dataclass
class TocEntry:
    """One page reference extracted from a bundle's TOC tree by _flatten_toc()."""

    page_id: str  # GUID captured from the node's topicLink
    title: str  # the node's topicName ("" when absent)
    ordinal: int  # 1-based position among GUID pages, in depth-first TOC order
    parent_title: str | None  # nearest ancestor with a non-empty topicName, else None
def _session() -> requests.Session:
    """Create a requests session that carries this scraper's default headers."""
    session = requests.Session()
    default_headers = {
        "User-Agent": UA,
        "Accept": "application/json, text/html",
    }
    session.headers.update(default_headers)
    return session
def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any:
delay = 1.0
for attempt in range(retries):
r = s.get(url, timeout=30)
if r.status_code == 200:
return r.json() if expect_json else r.text
if r.status_code == 404:
return None
if r.status_code in (429, 500, 502, 503, 504):
time.sleep(delay)
delay *= 2
continue
r.raise_for_status()
raise RuntimeError(f"GET failed after {retries} retries: {url}")
def _flatten_toc(toc: list[dict]) -> list[TocEntry]:
    """Flatten the nested TOC into GUID pages in depth-first document order.

    Nodes whose ``topicLink`` carries no GUID are not emitted, but their
    children are still visited. Each entry records the nearest ancestor that
    has a non-empty ``topicName`` as its parent title.
    """
    entries: list[TocEntry] = []

    # Explicit stack of (node, inherited parent title); children are pushed
    # reversed so they pop in their original order.
    stack = [(node, None) for node in reversed(toc)]
    while stack:
        node, inherited_title = stack.pop()
        name = node.get("topicName") or ""
        hit = GUID_RE.search(node.get("topicLink") or "")
        if hit:
            entries.append(
                TocEntry(
                    page_id=hit.group(1),
                    title=name,
                    ordinal=len(entries) + 1,
                    parent_title=inherited_title,
                )
            )
        title_for_children = name or inherited_title
        children = node.get("children") or []
        stack.extend((child, title_for_children) for child in reversed(children))

    return entries
def _strip_dita_wrappers(html: str) -> str:
    """Drop Notices/Acknowledgments/Abstract boilerplate and return the <main> markup.

    The same boilerplate sections repeat across documents; left in place, every
    page chunk would inherit identical text and pollute retrieval. When the
    document has a <main> element only that element is returned, otherwise the
    whole cleaned document is.
    """
    soup = BeautifulSoup(html, "html.parser")
    boilerplate_titles = {"Notices", "Acknowledgments", "Abstract"}
    body_class_markers = ("desc", "body", "notices")

    # Wrapped form: a container whose direct h1/h2 child is a boilerplate title.
    for container in soup.select("article, section, div"):
        heading = container.find(["h1", "h2"], recursive=False)
        if heading is None:
            continue
        if heading.get_text(strip=True) in boilerplate_titles:
            container.decompose()

    # Unwrapped form: a bare boilerplate heading followed by its body sibling.
    for heading in soup.find_all(["h1", "h2"]):
        if heading.get_text(strip=True) not in boilerplate_titles:
            continue
        follower = heading.find_next_sibling()
        if follower is not None and follower.name in {"div", "section"}:
            class_text = " ".join(follower.get("class", []) or [])
            if any(marker in class_text for marker in body_class_markers):
                follower.decompose()
        heading.decompose()

    main_el = soup.find("main")
    if main_el:
        return str(main_el)
    return str(soup)
def html_to_md(page_html: str) -> str:
    """Convert page HTML to markdown that ends with exactly one newline."""
    markdown = md(_strip_dita_wrappers(page_html), heading_style="ATX", bullets="-")
    # Squeeze runs of three or more newlines down to a single blank line.
    squeezed = re.sub(r"\n{3,}", "\n\n", markdown)
    return f"{squeezed.strip()}\n"
def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str:
    """Fetch one TOC page via the render endpoint and return its ``page_html``.

    Returns "" when the request yields nothing or the payload has no HTML.
    """
    render_url = f"{API}/{doc_id}/render?page={page_id}.html"
    payload = _get(s, render_url, expect_json=True)
    return (payload.get("page_html") or "") if payload else ""
def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]:
    """Fetch a single-doc bundle and return ``(page_html, title)``.

    The title comes from the document's ``h1.title.topictitle1`` heading and
    falls back to *doc_id*. An empty fetch yields ``("", "")``.
    """
    body = _get(s, f"{API}/{doc_id}")
    if not body:
        return "", ""
    heading = BeautifulSoup(body, "html.parser").select_one("h1.title.topictitle1")
    if heading:
        return body, heading.get_text(" ", strip=True)
    return body, doc_id
def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool:
    """Write ``<page_id>.md`` and its ``<page_id>.json`` sidecar into *bundle_dir*.

    Args:
        bundle_dir: Target directory; created (with parents) when missing.
        page_id: File stem shared by the markdown file and the sidecar.
        body_md: Markdown body to store.
        sidecar: JSON-serialisable metadata stored next to the markdown.
        force: Overwrite files that already exist.

    Returns:
        True when the files were written, False when both already existed and
        *force* was not set.
    """
    bundle_dir.mkdir(parents=True, exist_ok=True)
    md_path = bundle_dir / f"{page_id}.md"
    json_path = bundle_dir / f"{page_id}.json"
    if not force and md_path.exists() and json_path.exists():
        return False
    # Pin UTF-8: without it write_text uses the locale encoding, which can
    # fail or produce mojibake for non-ASCII characters in scraped markdown.
    md_path.write_text(body_md, encoding="utf-8")
    json_path.write_text(json.dumps(sidecar, indent=2) + "\n", encoding="utf-8")
    return True
def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int:
    """Scrape every GUID page of a TOC bundle; return how many pages were written.

    Pages are fetched and converted on a thread pool of *concurrency* workers.
    Pages with an empty body, or already on disk when *force* is false, do not
    count towards the result.
    """
    doc_id = bundle["doc_id"]
    slug = bundle["slug"]
    target_dir = CORPUS / slug

    entries = _flatten_toc(_get(s, f"{API}/{doc_id}/toc", expect_json=True) or [])
    print(f" {slug}: {len(entries)} pages", file=sys.stderr)

    def scrape_entry(entry: TocEntry) -> bool:
        page_html = fetch_toc_page(s, doc_id, entry.page_id)
        if not page_html:
            return False
        markdown = html_to_md(page_html)
        # topic_cluster is added later by finalize_clusters().
        sidecar = {
            "bundle_id": slug,
            "page_id": entry.page_id,
            "title": entry.title,
            "ordinal": entry.ordinal,
            "parent_title": entry.parent_title,
            "doc_id": doc_id,
            "version": bundle.get("version"),
            "product": bundle.get("product"),
            "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id),
        }
        return write_page(target_dir, entry.page_id, markdown, sidecar, force)

    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        pending = [pool.submit(scrape_entry, entry) for entry in entries]
        written = sum(1 for done in as_completed(pending) if done.result())
    return written
def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int:
    """Scrape a single-doc bundle as one page whose page_id is the docId.

    Returns 1 when the page was written, 0 when the body was empty or the
    page was already on disk and *force* is false.
    """
    doc_id, slug = bundle["doc_id"], bundle["slug"]
    html, title = fetch_single_doc(s, doc_id)
    if not html:
        print(f" ! {slug}: empty body", file=sys.stderr)
        return 0

    markdown = html_to_md(html)
    sidecar = {
        "bundle_id": slug,
        "page_id": doc_id,
        "title": title or bundle["title"],
        "ordinal": 1,
        "parent_title": None,
        "doc_id": doc_id,
        "version": bundle.get("version"),
        "product": bundle.get("product"),
        "source_url": DOC_URL_SINGLE.format(doc_id=doc_id),
    }
    print(f" {slug}: 1 page (single-doc)", file=sys.stderr)
    wrote = write_page(CORPUS / slug, doc_id, markdown, sidecar, force)
    return 1 if wrote else 0
def finalize_clusters(bundles: list[dict]) -> int:
    """Write ``topic_cluster`` into sidecars that have a peer in another bundle.

    Sidecars whose page_id starts with "GUID-" are grouped by that GUID;
    all others (single-doc pages, page_id == doc_id) are grouped by their
    bundle's ``product``. Every group with at least two members gets each
    member's sidecar rewritten with links to the members from other bundles.

    Returns the number of sidecar files rewritten.
    """
    # Each grouping maps key -> list of (slug, sidecar_path, sidecar_dict).
    by_guid: dict[str, list[tuple[str, Path, dict]]] = {}
    by_product: dict[str, list[tuple[str, Path, dict]]] = {}

    for bundle in bundles:
        slug = bundle["slug"]
        bundle_dir = CORPUS / slug
        if not bundle_dir.exists():
            continue
        for sidecar_path in bundle_dir.glob("*.json"):
            sidecar = json.loads(sidecar_path.read_text())
            page_id = sidecar["page_id"]
            record = (slug, sidecar_path, sidecar)
            if page_id.startswith("GUID-"):
                by_guid.setdefault(page_id, []).append(record)
            else:
                by_product.setdefault(bundle["product"], []).append(record)

    rewritten = 0
    # GUID groups first, then product groups; both are linked the same way.
    for grouping in (by_guid, by_product):
        for peers in grouping.values():
            if len(peers) < 2:
                continue
            for slug, sidecar_path, sidecar in peers:
                siblings = [
                    {
                        "bundle_id": peer_slug,
                        "page_id": peer["page_id"],
                        "clustering_title": peer.get("title", ""),
                    }
                    for peer_slug, _, peer in peers
                    if peer_slug != slug
                ]
                sidecar["topic_cluster"] = {
                    "clustering_title": sidecar.get("title", ""),
                    "clustered_topics": siblings,
                }
                sidecar_path.write_text(json.dumps(sidecar, indent=2) + "\n")
                rewritten += 1
    return rewritten
def main() -> int:
    """Command-line entry point: scrape the selected bundles, then finalize.

    Returns 0 on success and 2 when bundles.json is missing, no bundle was
    selected, or none of the requested slugs matched.
    """
    parser = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.")
    parser.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json")
    parser.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)")
    parser.add_argument("--force", action="store_true", help="re-fetch pages already on disk")
    parser.add_argument("--concurrency", type=int, default=6)
    parser.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields")
    options = parser.parse_args()

    if not BUNDLES_JSON.exists():
        print("bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr)
        return 2
    selected = json.loads(BUNDLES_JSON.read_text())

    if options.finalize_only:
        relinked = finalize_clusters(selected)
        print(f"finalize: updated topic_cluster on {relinked} sidecars", file=sys.stderr)
        return 0

    if options.bundle:
        selected = [entry for entry in selected if entry["slug"] in options.bundle]
        if not selected:
            print(f"no bundles matched: {options.bundle}", file=sys.stderr)
            return 2
    elif not options.all:
        print("specify --all or --bundle <slug>", file=sys.stderr)
        return 2

    session = _session()
    total = 0
    for entry in selected:
        if entry.get("mode") == "single":
            count = scrape_single_bundle(session, entry, options.force)
        else:
            count = scrape_toc_bundle(session, entry, options.force, options.concurrency)
        total += count
    print(f"scraped {total} new/updated pages", file=sys.stderr)

    # Always finalize after a scrape so sidecars are consistent. The manifest
    # is re-read because the selection above may have been narrowed by --bundle.
    every_bundle = json.loads(BUNDLES_JSON.read_text())
    relinked = finalize_clusters(every_bundle)
    print(f"finalize: updated topic_cluster on {relinked} sidecars", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())