"""Generate a summary of corpus changes. Two output shapes for two consumers: 1. Human-readable text (default) — written into the weekly-refresh commit message so the commit log is greppable for *"what changed this week"* instead of *"806 files changed"*. 2. Structured JSON (``--json``) and rolling JSONL history (``--history-out``) — consumed by the ``weekly_digest`` MCP tool. Computed in CI and committed at ``corpus/.digest/history.jsonl``; the tool reads it at runtime because the prod container is a static filesystem COPY with no git available. Usage: # Commit-message helper (existing behavior — unchanged) python -m scrape.changelog [--cached] [--ref REF] # One-shot JSON for the current diff range python -m scrape.changelog --cached --json # Build / refresh the digest history file (CI use) python -m scrape.changelog --history-out corpus/.digest/history.jsonl \\ --history-days 120 The history walker only includes commits that touch ``corpus/`` (or ``bundles.json``); it skips pure code/CI commits. Each emitted record carries the commit's short sha, ISO timestamp, subject, and the same structured summary the ``--json`` path produces, so the consumer can treat history records and one-shot summaries interchangeably. """ from __future__ import annotations import argparse import json import subprocess import sys from collections import defaultdict from typing import Any def git(*args: str) -> str: return subprocess.check_output(["git", *args], text=True) def summarize_diff(diff_output: str) -> dict[str, Any]: """Parse ``git diff --name-status`` output into a structured summary. Pure function (no IO, no git calls) so the same logic is exercised by the human-readable, JSON-one-shot, and history-walking paths. Returns a dict with: md_count int — total .md files changed json_count int — total .json sidecars changed content_bundles dict — {bundle_id: [page_id_without_.md, ...]} Only bundles where at least one .md file moved. Lists are in the order git emitted them. json_only_bundles list[str] — bundles whose ONLY change was sidecar drift (no .md changes). Sorted. new_bundles list[str] — bundles whose first .md was Added in this diff. Sorted. other_files list[str] — any non-corpus path mentioned in the diff, as ``"STATUS path"`` strings. """ md_changes: dict[str, list[str]] = defaultdict(list) json_only_bundles: set[str] = set() new_bundles: set[str] = set() md_count = json_count = 0 other_files: list[str] = [] for line in diff_output.splitlines(): if not line.strip(): continue # statuspath (or statusoldnew for renames; we take # the post-rename path as the canonical location). parts = line.split("\t") status, path = parts[0], parts[-1] if not path.startswith("corpus/"): other_files.append(f"{status} {path}") continue segs = path.split("/", 2) if len(segs) < 3: # corpus/ with no bundle dir — skip. continue _, bundle, page = segs if page.endswith(".md"): md_changes[bundle].append(page[:-3]) md_count += 1 if status == "A": new_bundles.add(bundle) elif page.endswith(".json"): json_count += 1 json_only_bundles.add(bundle) # A bundle counts as "content-changing" if it had any .md edit. Sidecar- # only drift goes in the separate bucket so the commit message doesn't # report timestamp churn as if it were real edits. content_bundles_set = set(md_changes) drift_only = sorted(json_only_bundles - content_bundles_set) return { "md_count": md_count, "json_count": json_count, "content_bundles": dict(md_changes), # cast back to plain dict for JSON "json_only_bundles": drift_only, "new_bundles": sorted(new_bundles), "other_files": other_files, } def render_human(summary: dict[str, Any]) -> str: """Format a summary dict as the multi-line commit-message text. Matches the historical output exactly so existing commit-message tooling and downstream readers don't have to change. """ lines: list[str] = [] content_bundles = sorted(summary["content_bundles"]) md_count = summary["md_count"] json_count = summary["json_count"] new_bundles = set(summary["new_bundles"]) drift_only = summary["json_only_bundles"] other_files = summary["other_files"] lines.append(f"{md_count} content change(s) across {len(content_bundles)} bundle(s)") lines.append(f"{json_count} sidecar metadata update(s)") if new_bundles: lines.append(f"{len(new_bundles)} new bundle(s) added") if other_files: lines.append(f"{len(other_files)} other file change(s)") if content_bundles: lines.append("") lines.append("Bundles with content changes:") for b in content_bundles: pages = summary["content_bundles"][b] tag = " (NEW)" if b in new_bundles else "" lines.append(f" {b}{tag}: {len(pages)} page(s)") for p in pages[:5]: lines.append(f" - {p}") if len(pages) > 5: lines.append(f" ... and {len(pages) - 5} more") if drift_only: lines.append("") head = ", ".join(drift_only[:10]) suffix = " …" if len(drift_only) > 10 else "" lines.append(f"Bundles with sidecar-only drift ({len(drift_only)}): {head}{suffix}") return "\n".join(lines) def walk_history(history_days: int) -> list[dict[str, Any]]: """Walk recent corpus-touching commits, emit one summary per commit. Uses ``git log --first-parent main`` to keep the rolling weekly- refresh line clean of branch-merge noise. Only commits whose diff touches ``corpus/`` or ``bundles.json`` are emitted; pure code commits are skipped (they have nothing to digest). Each record: { "sha": "", "timestamp": "", "subject": "", ... + every field from summarize_diff() } """ # Find candidate commits. --first-parent keeps the linear refresh history # on main and ignores branch-side merges. We still need to filter by what # the commit actually touched, because non-corpus commits can land on # main (PR merges for code, CI tweaks, etc.). raw = git( "log", f"--since={history_days} days ago", "--first-parent", "main", "--pretty=format:%H%x09%cI%x09%s", ) records: list[dict[str, Any]] = [] for line in raw.splitlines(): if not line.strip(): continue parts = line.split("\t", 2) if len(parts) < 3: continue sha, ts, subject = parts # What did this commit actually touch? Cheap: just the name-status diff # against its first parent. Empty stdout = commit didn't change any # files we care about. Root commits (no parent) error out — suppress # the stderr noise and skip them. try: diff = subprocess.check_output( ["git", "diff", "--name-status", f"{sha}^..{sha}"], text=True, stderr=subprocess.DEVNULL, ) except subprocess.CalledProcessError: continue if not diff.strip(): continue summary = summarize_diff(diff) # Skip pure code commits — only emit records that have actual corpus # content motion. This is what makes the history "interesting" for # the weekly digest. if summary["md_count"] == 0 and summary["json_count"] == 0 and not summary["new_bundles"]: continue records.append({ "sha": sha[:12], "timestamp": ts, "subject": subject, **summary, }) return records def main() -> int: p = argparse.ArgumentParser(description=__doc__) p.add_argument("--cached", action="store_true", help="Summarize staged changes instead of a ref range.") p.add_argument("--ref", default="HEAD^..HEAD", help="Diff range to summarize (default: HEAD^..HEAD).") p.add_argument("--json", dest="as_json", action="store_true", help="Emit one JSON object instead of the human-readable form.") p.add_argument("--history-out", metavar="PATH", help="Walk recent corpus-touching commits and write a " "JSONL history file at PATH. Overwrites if it exists. " "Implies the history walker; --cached/--ref are ignored.") p.add_argument("--history-days", type=int, default=120, help="How far back the history walker looks (default 120).") args = p.parse_args() # History-walker path: build the JSONL file consumed by the # weekly_digest MCP tool, then exit. CI uses this. if args.history_out: records = walk_history(args.history_days) # Sort by timestamp ascending so the file is roughly stable # across rebuilds (commits within a single run could otherwise # depend on git log default ordering). records.sort(key=lambda r: r["timestamp"]) with open(args.history_out, "w") as fh: for rec in records: fh.write(json.dumps(rec, separators=(",", ":")) + "\n") # Brief stdout signal for CI logs — easy to spot in the workflow run. print(f"wrote {len(records)} commit record(s) to {args.history_out} " f"covering up to {args.history_days} days") return 0 # One-shot summary path. Unchanged behavior for --cached / --ref. if args.cached: diff_args = ["diff", "--name-status", "--cached"] else: diff_args = ["diff", "--name-status", args.ref] diff = git(*diff_args) summary = summarize_diff(diff) if args.as_json: print(json.dumps(summary, separators=(",", ":"))) else: print(render_human(summary)) return 0 if __name__ == "__main__": sys.exit(main())