From 7a491ba9e423ea16ce2cff935537cf0d085963bb Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Fri, 22 May 2026 13:06:26 -0400 Subject: [PATCH] scrape: HVM bundles + runner for HPE Support DocPortal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1: scrape User Manual (8.1.0/.1/.2), Release Notes (8.1.0/.1/.2), and the unversioned Deployment Guide. Total ~1,160 pages, 9.7 MB markdown. Discovers via the anonymous JSON API at /hpesc/public/api/document/{docId}: /toc walks the page tree (for TOC-paginated docs), /render?page=GUID fetches per-page HTML, /document/{docId} returns the whole body for single-doc shapes like Release Notes. Runner converts DITA-source HTML to clean markdown (strips Notices/Acknowledgments/Abstract boilerplate), writes corpus/<slug>/<page_id>.{md,json}, then a finalize pass synthesizes topic_cluster.clustered_topics by GUID overlap across versions (HPE GUIDs are stable cross-version — confirmed 374/376/376 with 100% overlap on shared pages). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bundles.json | 121 +++++++++++++++ requirements-rerank.txt | 10 ++ requirements.txt | 7 + scrape/bundles.py | 170 +++++++++++++++++++++ scrape/runner.py | 325 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 633 insertions(+) create mode 100644 bundles.json create mode 100644 requirements-rerank.txt create mode 100644 scrape/bundles.py create mode 100644 scrape/runner.py diff --git a/bundles.json b/bundles.json new file mode 100644 index 0000000..4855ee1 --- /dev/null +++ b/bundles.json @@ -0,0 +1,121 @@ +[ + { + "slug": "hvm_user_manual_8_1_0", + "doc_id": "sd00007520en_us", + "title": "HPE Morpheus VM Essentials Software Documentation", + "version": "8.1.0", + "platform": null, + "product": "User Manual", + "language": "en-US", + "page_count": 378, + "mode": "toc", + "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.0", + "dates": { + "Published": "February 2026" + }, + "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007520en_us" + }, + { + "slug": "hvm_user_manual_8_1_1", + "doc_id": "sd00007620en_us", + "title": "HPE Morpheus VM Essentials Software Documentation", + "version": "8.1.1", + "platform": null, + "product": "User Manual", + "language": "en-US", + "page_count": 380, + "mode": "toc", + "abstract": "User Manual for HPE Morpheus VM Essentials Software version v8.1.1", + "dates": { + "Published": "March 2026" + }, + "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007620en_us" + }, + { + "slug": "hvm_user_manual_8_1_2", + "doc_id": "sd00007735en_us", + "title": "HPE Morpheus VM Essentials Software Documentation", + "version": "8.1.2", + "platform": null, + "product": "User Manual", + "language": "en-US", + "page_count": 380, + "mode": "toc", + "abstract": "User Manual for HPE Morpheus VM Essentials 
Software version v8.1.2", + "dates": { + "Published": "April 2026" + }, + "landing_page": "GUID-498C49E5-5D26-44E1-A2CC-9AAC0813BA93", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007735en_us" + }, + { + "slug": "hvm_release_notes_8_1_0", + "doc_id": "sd00007497en_us", + "title": "v8.1.0 Release Notes", + "version": "8.1.0", + "platform": null, + "product": "Release Notes", + "language": "en-US", + "page_count": 1, + "mode": "single", + "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.0", + "dates": { + "Published": "February 2026" + }, + "landing_page": "sd00007497en_us", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007497en_us" + }, + { + "slug": "hvm_release_notes_8_1_1", + "doc_id": "sd00007609en_us", + "title": "v8.1.1 Release Notes", + "version": "8.1.1", + "platform": null, + "product": "Release Notes", + "language": "en-US", + "page_count": 1, + "mode": "single", + "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.1", + "dates": { + "Published": "March 2026" + }, + "landing_page": "sd00007609en_us", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007609en_us" + }, + { + "slug": "hvm_release_notes_8_1_2", + "doc_id": "sd00007734en_us", + "title": "v8.1.2 Release Notes", + "version": "8.1.2", + "platform": null, + "product": "Release Notes", + "language": "en-US", + "page_count": 1, + "mode": "single", + "abstract": "Release notes for HPE Morpheus VM Essentials Software version v8.1.2", + "dates": { + "Published": "April 2026" + }, + "landing_page": "sd00007734en_us", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007734en_us" + }, + { + "slug": "hvm_deployment_guide", + "doc_id": "sd00007332en_us", + "title": "HPE Morpheus VM Essentials Deployment Guide", + "version": null, + "platform": null, + "product": "Deployment Guide", + "language": "en-US", + "page_count": 42, + "mode": "toc", 
+ "abstract": "HPE Morpheus VM Essentials Deployment Guide", + "dates": { + "Published": "January 2026" + }, + "landing_page": "GUID-BF94B8DA-C4F6-4CDF-99E6-0AAA03177099", + "source_url": "https://support.hpe.com/hpesc/public/docDisplay?docId=sd00007332en_us" + } +] diff --git a/requirements-rerank.txt b/requirements-rerank.txt new file mode 100644 index 0000000..2c5fc27 --- /dev/null +++ b/requirements-rerank.txt @@ -0,0 +1,10 @@ +# Dev/CPU reranker — only for running scripts/rerank_server.py locally. +# Production uses the llama.cpp + jina-reranker GGUF sidecar (see +# deploy/docker-compose.yml). Install with: +# +# pip install -r requirements-rerank.txt +# +# This adds PyTorch (~2 GB) and the sentence-transformers cross-encoder +# (cross-encoder/ms-marco-MiniLM-L-6-v2, ~22 MB). Keep out of the main +# requirements.txt so the production image stays slim. +sentence-transformers>=3.0 diff --git a/requirements.txt b/requirements.txt index b9982a9..a89002f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,10 +10,17 @@ ollama>=0.4.0 # if using Ollama-hosted embedder; swap if not # Scraping (Phase 1; adjust per product) beautifulsoup4>=4.12 requests>=2.31 +markdownify>=0.11 # playwright>=1.40 # uncomment if you need headless browser fallback # Evaluation numpy>=1.26 +# Reranker is a sidecar (see deploy/docker-compose.yml). The MCP server +# only needs httpx (declared above) to call it. For the dev / CPU +# fallback reranker (scripts/rerank_server.py), install +# requirements-rerank.txt separately — it pulls in PyTorch which would +# triple the production image size. + # Dev / utility python-dateutil>=2.8 diff --git a/scrape/bundles.py b/scrape/bundles.py new file mode 100644 index 0000000..4bd0c59 --- /dev/null +++ b/scrape/bundles.py @@ -0,0 +1,170 @@ +"""Discover HVM doc bundles on HPE Support DocPortal and write bundles.json. 
+ +Bundle IDs are declared statically here because HPE mints a new docId +per product version rather than versioning a single doc (see +~/.claude/.../reference_hpe_docs_portal_api.md for context). When a new +version drops, add a new entry to BUNDLES and re-run; the runner will +pick it up on the next pass. + +For each bundle this script: + 1. GETs /hpesc/public/api/document/{docId} → abstract HTML + 2. GETs /hpesc/public/api/document/{docId}/toc → page tree (or 404 for single-doc) + 3. Writes bundles.json at repo root with the schema PLAN.md Phase 1 documents. +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup + +API = "https://support.hpe.com/hpesc/public/api/document" +DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}" +UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)" +ROOT = Path(__file__).resolve().parent.parent +BUNDLES_JSON = ROOT / "bundles.json" + + +@dataclass +class BundleSpec: + slug: str + doc_id: str + title: str + version: str | None + product: str # e.g. "User Manual", "Release Notes", "Deployment Guide" + mode: str # "toc" or "single" + platform: str | None = None + language: str = "en-US" + + +# Declared bundles. Versions confirmed 2026-05-22 by probing the docId +# range sd00007400..7740 for `v8.1.x` matches in the abstract. 
+BUNDLES: list[BundleSpec] = [ + BundleSpec("hvm_user_manual_8_1_0", "sd00007520en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.0", "User Manual", "toc"), + BundleSpec("hvm_user_manual_8_1_1", "sd00007620en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.1", "User Manual", "toc"), + BundleSpec("hvm_user_manual_8_1_2", "sd00007735en_us", "HPE Morpheus VM Essentials Software Documentation", "8.1.2", "User Manual", "toc"), + BundleSpec("hvm_release_notes_8_1_0", "sd00007497en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.0", "Release Notes", "single"), + BundleSpec("hvm_release_notes_8_1_1", "sd00007609en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.1", "Release Notes", "single"), + BundleSpec("hvm_release_notes_8_1_2", "sd00007734en_us", "HPE Morpheus VM Essentials Software Release Notes", "8.1.2", "Release Notes", "single"), + BundleSpec("hvm_deployment_guide", "sd00007332en_us", "HPE Morpheus VM Essentials Deployment Guide", None, "Deployment Guide","toc"), +] + + +def _session() -> requests.Session: + s = requests.Session() + s.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"}) + return s + + +def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any: + delay = 1.0 + for attempt in range(retries): + r = s.get(url, timeout=30) + if r.status_code == 200: + return r.json() if expect_json else r.text + if r.status_code == 404: + return None + if r.status_code in (429, 500, 502, 503, 504): + time.sleep(delay) + delay *= 2 + continue + r.raise_for_status() + raise RuntimeError(f"GET failed after {retries} retries: {url}") + + +def _count_toc(toc: list[dict] | None) -> tuple[int, str | None]: + """Returns (page_count, landing_page_guid).""" + if not toc: + return 0, None + landing = None + n = 0 + + def walk(nodes: list[dict] | None, depth: int) -> None: + nonlocal n, landing + for node in nodes or []: + link = node.get("topicLink") + if link: + 
n += 1 + m = re.search(r"page=(GUID-[A-F0-9-]+)\.html", link) + if m and landing is None: + landing = m.group(1) + walk(node.get("children"), depth + 1) + + walk(toc, 0) + return n, landing + + +def _parse_abstract(html: str) -> dict[str, str]: + """Pull title / abstract text / published date out of the DITA abstract HTML.""" + soup = BeautifulSoup(html, "html.parser") + out: dict[str, str] = {} + h1 = soup.select_one("h1.title.topictitle1") + if h1: + out["title"] = h1.get_text(" ", strip=True) + desc = soup.select_one("div.desc") + if desc: + out["abstract"] = desc.get_text(" ", strip=True) + pub = soup.select_one("div.publishedDate") + if pub: + out["published"] = pub.get_text(" ", strip=True).replace("Published:", "").strip() + return out + + +def discover_bundle(s: requests.Session, spec: BundleSpec) -> dict[str, Any]: + abstract_html = _get(s, f"{API}/{spec.doc_id}", expect_json=False) + meta = _parse_abstract(abstract_html or "") + + page_count: int + landing: str | None + if spec.mode == "toc": + toc = _get(s, f"{API}/{spec.doc_id}/toc", expect_json=True) + page_count, landing = _count_toc(toc) + if page_count == 0: + print(f" ! 
{spec.slug}: TOC empty — falling back to single-doc mode", file=sys.stderr) + spec.mode = "single" + page_count, landing = 1, spec.doc_id + else: + page_count, landing = 1, spec.doc_id + + return { + "slug": spec.slug, + "doc_id": spec.doc_id, + "title": meta.get("title") or spec.title, + "version": spec.version, + "platform": spec.platform, + "product": spec.product, + "language": spec.language, + "page_count": page_count, + "mode": spec.mode, + "abstract": meta.get("abstract", ""), + "dates": {"Published": meta.get("published", "")}, + "landing_page": landing, + "source_url": DOC_URL.format(doc_id=spec.doc_id), + } + + +def main() -> int: + p = argparse.ArgumentParser(description="Build bundles.json from BUNDLES list.") + p.add_argument("--out", default=str(BUNDLES_JSON)) + args = p.parse_args() + + s = _session() + out: list[dict[str, Any]] = [] + for spec in BUNDLES: + print(f" • {spec.slug} ({spec.doc_id}) ...", file=sys.stderr) + out.append(discover_bundle(s, spec)) + + Path(args.out).write_text(json.dumps(out, indent=2) + "\n") + print(f"wrote {args.out}: {len(out)} bundles, {sum(b['page_count'] for b in out)} pages total", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scrape/runner.py b/scrape/runner.py new file mode 100644 index 0000000..ad5909b --- /dev/null +++ b/scrape/runner.py @@ -0,0 +1,325 @@ +"""Scrape HVM doc bundles into corpus//.{md,json}. + +Reads bundles.json (produced by scrape.bundles), then for each bundle: + - mode="toc": walks the TOC tree, fetches each page via the render + endpoint, converts page_html to markdown, writes + .md + .json sidecar. + - mode="single": fetches /document/{docId} directly, treats the whole + body as one page with page_id = doc_id. 
+ +After all bundles are on disk, runs a finalize pass that synthesizes +topic_cluster.clustered_topics for each page by looking up the same +GUID in sibling bundles (HPE GUIDs are stable across versions — see +reference_hpe_docs_portal_api.md). + +Usage: + python -m scrape.runner --all + python -m scrape.runner --bundle hvm_user_manual_8_1_2 + python -m scrape.runner --all --force # re-download already-on-disk pages + python -m scrape.runner --finalize-only # only redo the topic_cluster pass +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import requests +from bs4 import BeautifulSoup +from markdownify import markdownify as md + +API = "https://support.hpe.com/hpesc/public/api/document" +DOC_URL = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}&page={page_id}.html" +DOC_URL_SINGLE = "https://support.hpe.com/hpesc/public/docDisplay?docId={doc_id}" +UA = "hvm-docs-mcp/0.1 (+https://git.jpaul.io/justin/hvm-docs; admin@jpaul.io)" +ROOT = Path(__file__).resolve().parent.parent +CORPUS = ROOT / "corpus" +BUNDLES_JSON = ROOT / "bundles.json" + +GUID_RE = re.compile(r"page=(GUID-[A-F0-9-]+)\.html") + + +@dataclass +class TocEntry: + page_id: str + title: str + ordinal: int + parent_title: str | None + + +def _session() -> requests.Session: + s = requests.Session() + s.headers.update({"User-Agent": UA, "Accept": "application/json, text/html"}) + return s + + +def _get(s: requests.Session, url: str, expect_json: bool = False, retries: int = 4) -> Any: + delay = 1.0 + for attempt in range(retries): + r = s.get(url, timeout=30) + if r.status_code == 200: + return r.json() if expect_json else r.text + if r.status_code == 404: + return None + if r.status_code in (429, 500, 502, 503, 504): + time.sleep(delay) + delay *= 2 + continue + 
r.raise_for_status() + raise RuntimeError(f"GET failed after {retries} retries: {url}") + + +def _flatten_toc(toc: list[dict]) -> list[TocEntry]: + out: list[TocEntry] = [] + ordinal = 0 + + def walk(nodes: list[dict] | None, parent_title: str | None) -> None: + nonlocal ordinal + for node in nodes or []: + title = node.get("topicName") or "" + link = node.get("topicLink") or "" + m = GUID_RE.search(link) + if m: + ordinal += 1 + out.append(TocEntry(page_id=m.group(1), title=title, ordinal=ordinal, parent_title=parent_title)) + walk(node.get("children"), title or parent_title) + + walk(toc, None) + return out + + +def _strip_dita_wrappers(html: str) -> str: + """Remove the outer
, drop the trademark Notices section, + and unwrap aria-only span markup so markdownify produces clean text. + + DITA's notices boilerplate repeats across every doc; if we leave it in, + every page chunk inherits the same trademark text and pollutes retrieval.""" + soup = BeautifulSoup(html, "html.parser") + # Drop the Notices/Acknowledgments/Abstract boilerplate by section heading. + # Every doc on the portal carries the same legal Notices and trademark + # Acknowledgments; if we leave them in, every chunk inherits the same + # text and pollutes retrieval. Abstract is one-line marketing. + boilerplate = {"Notices", "Acknowledgments", "Abstract"} + # Wrapped form:
<article>/<section>/<div>
whose first heading child is boilerplate. + for sec in soup.select("article, section, div"): + h = sec.find(["h1", "h2"], recursive=False) + if h and h.get_text(strip=True) in boilerplate: + sec.decompose() + # Unwrapped form: bare

<h1>/<h2>Boilerplate</h2>

followed by its .desc/.body sibling. + for h in soup.find_all(["h1", "h2"]): + if h.get_text(strip=True) in boilerplate: + sib = h.find_next_sibling() + if sib and (sib.name in {"div", "section"}): + cls = " ".join(sib.get("class", []) or []) + if "desc" in cls or "body" in cls or "notices" in cls: + sib.decompose() + h.decompose() + main = soup.find("main") + return str(main) if main else str(soup) + + +def html_to_md(page_html: str) -> str: + cleaned = _strip_dita_wrappers(page_html) + text = md(cleaned, heading_style="ATX", bullets="-") + # collapse runs of blank lines + text = re.sub(r"\n{3,}", "\n\n", text).strip() + return text + "\n" + + +def fetch_toc_page(s: requests.Session, doc_id: str, page_id: str) -> str: + payload = _get(s, f"{API}/{doc_id}/render?page={page_id}.html", expect_json=True) + if not payload: + return "" + return payload.get("page_html") or "" + + +def fetch_single_doc(s: requests.Session, doc_id: str) -> tuple[str, str]: + """Returns (page_html, title) for a single-doc-shape bundle.""" + html = _get(s, f"{API}/{doc_id}") + if not html: + return "", "" + soup = BeautifulSoup(html, "html.parser") + h1 = soup.select_one("h1.title.topictitle1") + title = h1.get_text(" ", strip=True) if h1 else doc_id + return html, title + + +def write_page(bundle_dir: Path, page_id: str, body_md: str, sidecar: dict[str, Any], force: bool) -> bool: + bundle_dir.mkdir(parents=True, exist_ok=True) + md_path = bundle_dir / f"{page_id}.md" + json_path = bundle_dir / f"{page_id}.json" + if not force and md_path.exists() and json_path.exists(): + return False + md_path.write_text(body_md) + json_path.write_text(json.dumps(sidecar, indent=2) + "\n") + return True + + +def scrape_toc_bundle(s: requests.Session, bundle: dict, force: bool, concurrency: int) -> int: + doc_id = bundle["doc_id"] + slug = bundle["slug"] + bundle_dir = CORPUS / slug + + toc = _get(s, f"{API}/{doc_id}/toc", expect_json=True) or [] + entries = _flatten_toc(toc) + print(f" {slug}: 
{len(entries)} pages", file=sys.stderr) + + written = 0 + def do_one(entry: TocEntry) -> bool: + page_html = fetch_toc_page(s, doc_id, entry.page_id) + if not page_html: + return False + body_md = html_to_md(page_html) + sidecar = { + "bundle_id": slug, + "page_id": entry.page_id, + "title": entry.title, + "ordinal": entry.ordinal, + "parent_title": entry.parent_title, + "doc_id": doc_id, + "version": bundle.get("version"), + "product": bundle.get("product"), + "source_url": DOC_URL.format(doc_id=doc_id, page_id=entry.page_id), + # topic_cluster filled in by finalize() + } + return write_page(bundle_dir, entry.page_id, body_md, sidecar, force) + + with ThreadPoolExecutor(max_workers=concurrency) as pool: + for fut in as_completed(pool.submit(do_one, e) for e in entries): + if fut.result(): + written += 1 + return written + + +def scrape_single_bundle(s: requests.Session, bundle: dict, force: bool) -> int: + doc_id = bundle["doc_id"] + slug = bundle["slug"] + bundle_dir = CORPUS / slug + + html, title = fetch_single_doc(s, doc_id) + if not html: + print(f" ! {slug}: empty body", file=sys.stderr) + return 0 + body_md = html_to_md(html) + sidecar = { + "bundle_id": slug, + "page_id": doc_id, + "title": title or bundle["title"], + "ordinal": 1, + "parent_title": None, + "doc_id": doc_id, + "version": bundle.get("version"), + "product": bundle.get("product"), + "source_url": DOC_URL_SINGLE.format(doc_id=doc_id), + } + print(f" {slug}: 1 page (single-doc)", file=sys.stderr) + return 1 if write_page(bundle_dir, doc_id, body_md, sidecar, force) else 0 + + +def finalize_clusters(bundles: list[dict]) -> int: + """Cross-link sibling pages with the same GUID across version bundles. + + For TOC bundles, page_id == GUID; same GUID across two bundles = same + underlying topic. 
For single-doc bundles (page_id == doc_id), peer them + by matching product+version-sibling on the `product` field.""" + # GUID → list[(slug, sidecar_path, sidecar_dict)] + guid_to_pages: dict[str, list[tuple[str, Path, dict]]] = {} + # product → list[(slug, sidecar_path, sidecar_dict)] for single-doc peering + product_to_pages: dict[str, list[tuple[str, Path, dict]]] = {} + + for b in bundles: + slug = b["slug"] + bundle_dir = CORPUS / slug + if not bundle_dir.exists(): + continue + for jp in bundle_dir.glob("*.json"): + data = json.loads(jp.read_text()) + pid = data["page_id"] + if pid.startswith("GUID-"): + guid_to_pages.setdefault(pid, []).append((slug, jp, data)) + else: + product_to_pages.setdefault(b["product"], []).append((slug, jp, data)) + + updated = 0 + # TOC pages — cluster by GUID + for guid, peers in guid_to_pages.items(): + if len(peers) < 2: + continue + for slug, jp, data in peers: + others = [ + {"bundle_id": s2, "page_id": guid, "clustering_title": d2.get("title", "")} + for s2, _, d2 in peers if s2 != slug + ] + data["topic_cluster"] = {"clustering_title": data.get("title", ""), "clustered_topics": others} + jp.write_text(json.dumps(data, indent=2) + "\n") + updated += 1 + # Single-doc pages — cluster by product (e.g. 
Release Notes 8.1.0/.1/.2) + for product, peers in product_to_pages.items(): + if len(peers) < 2: + continue + for slug, jp, data in peers: + others = [ + {"bundle_id": s2, "page_id": d2["page_id"], "clustering_title": d2.get("title", "")} + for s2, _, d2 in peers if s2 != slug + ] + data["topic_cluster"] = {"clustering_title": data.get("title", ""), "clustered_topics": others} + jp.write_text(json.dumps(data, indent=2) + "\n") + updated += 1 + + return updated + + +def main() -> int: + p = argparse.ArgumentParser(description="Scrape HVM bundles into corpus/.") + p.add_argument("--all", action="store_true", help="scrape every bundle in bundles.json") + p.add_argument("--bundle", action="append", help="scrape one bundle by slug (repeatable)") + p.add_argument("--force", action="store_true", help="re-fetch pages already on disk") + p.add_argument("--concurrency", type=int, default=6) + p.add_argument("--finalize-only", action="store_true", help="only rebuild topic_cluster sidecar fields") + args = p.parse_args() + + if not BUNDLES_JSON.exists(): + print(f"bundles.json missing — run `python -m scrape.bundles` first", file=sys.stderr) + return 2 + + bundles = json.loads(BUNDLES_JSON.read_text()) + + if args.finalize_only: + n = finalize_clusters(bundles) + print(f"finalize: updated topic_cluster on {n} sidecars", file=sys.stderr) + return 0 + + if args.bundle: + bundles = [b for b in bundles if b["slug"] in args.bundle] + if not bundles: + print(f"no bundles matched: {args.bundle}", file=sys.stderr) + return 2 + elif not args.all: + print("specify --all or --bundle ", file=sys.stderr) + return 2 + + s = _session() + total = 0 + for b in bundles: + if b.get("mode") == "single": + total += scrape_single_bundle(s, b, args.force) + else: + total += scrape_toc_bundle(s, b, args.force, args.concurrency) + print(f"scraped {total} new/updated pages", file=sys.stderr) + + # Always finalize after a scrape so sidecars are consistent. 
+ all_bundles = json.loads(BUNDLES_JSON.read_text()) + n = finalize_clusters(all_bundles) + print(f"finalize: updated topic_cluster on {n} sidecars", file=sys.stderr) + + return 0 + + +if __name__ == "__main__": + sys.exit(main())