# crop-chem-docs/scrape/changelog.py

"""Generate a summary of corpus changes.

Two output shapes for two consumers:

  1. Human-readable text (default) — written into the weekly-refresh
     commit message so the commit log is greppable for *"what changed
     this week"* instead of *"806 files changed"*.

  2. Structured JSON (``--json``) and rolling JSONL history
     (``--history-out``) — consumed by the ``weekly_digest`` MCP tool.
     Computed in CI and committed at ``corpus/.digest/history.jsonl``;
     the tool reads it at runtime because the prod container is a
     static filesystem COPY with no git available.

Usage:

    # Commit-message helper (existing behavior — unchanged)
    python -m scrape.changelog [--cached] [--ref REF]

    # One-shot JSON for the current diff range
    python -m scrape.changelog --cached --json

    # Build / refresh the digest history file (CI use)
    python -m scrape.changelog --history-out corpus/.digest/history.jsonl \\
        --history-days 120

The history walker only includes commits that touch ``corpus/`` (or
``bundles.json``); it skips pure code/CI commits. Each emitted record
carries the commit's short sha, ISO timestamp, subject, and the same
structured summary the ``--json`` path produces, so the consumer can
treat history records and one-shot summaries interchangeably.
"""
from __future__ import annotations

import argparse
import json
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


def git(*args: str) -> str:
    """Run ``git`` with *args* and return its standard output as text.

    The output is decoded as UTF-8 instead of the locale's preferred
    encoding: git writes commit metadata as UTF-8 by default, so relying on
    the locale can fail or produce mojibake when it is not UTF-8. Bytes that
    are not valid UTF-8 are replaced rather than aborting the whole run.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    return subprocess.check_output(
        ["git", *args],
        encoding="utf-8",
        errors="replace",
    )


def _unquote_git_path(path: str) -> str:
    """Undo git's C-style quoting of a path, if the path is quoted.

    With the default ``core.quotePath`` setting git wraps any path that
    contains "unusual" bytes (non-ASCII, control characters, a double quote
    or a backslash) in double quotes and escapes those bytes, using octal
    escapes for non-ASCII bytes. Unquoted paths are returned untouched, and
    a quoted path that cannot be decoded is returned as-is instead of
    raising.
    """
    if len(path) < 2 or path[0] != '"' or path[-1] != '"':
        return path
    try:
        # unicode_escape turns each escape into one code point below 256;
        # the latin-1 round trip maps those back to raw bytes, which are
        # then the UTF-8 encoding of the real path.
        escaped = path[1:-1].encode("latin-1", "backslashreplace")
        return escaped.decode("unicode_escape").encode("latin-1").decode("utf-8")
    except (UnicodeDecodeError, UnicodeEncodeError):
        return path


def summarize_diff(diff_output: str) -> dict[str, Any]:
    """Parse ``git diff --name-status`` output into a structured summary.

    Pure function (no IO, no git calls) so the same logic is exercised
    by the human-readable, JSON-one-shot, and history-walking paths.

    Each non-blank line is split on TAB: the first field is the status and
    the last field is the path (for renames/copies that is the new path).
    Paths that git emitted in quoted form are unquoted first, so corpus
    pages with non-ASCII names are classified like any other page.

    Returns a dict with:

        md_count           int       — total .md files changed
        json_count         int       — total .json sidecars changed
        content_bundles    dict      — {bundle_id: [page_id_without_.md, ...]}
                                       Only bundles where at least one .md
                                       file moved. Bundles and pages are in
                                       the order git emitted them.
        json_only_bundles  list[str] — bundles whose ONLY change was sidecar
                                       drift (no .md changes). Sorted.
        new_bundles        list[str] — bundles with at least one .md whose
                                       status is exactly ``A``. Sorted.
                                       NOTE(review): a page added to an
                                       already-existing bundle also lands
                                       here; the diff alone cannot tell the
                                       two cases apart.
        other_files        list[str] — any path outside ``corpus/`` mentioned
                                       in the diff, as ``"STATUS path"``.

    Files directly under ``corpus/`` (no bundle directory) and corpus files
    that are neither ``.md`` nor ``.json`` are ignored entirely.
    """
    md_changes: dict[str, list[str]] = defaultdict(list)
    json_bundles: set[str] = set()
    new_bundles: set[str] = set()
    md_count = json_count = 0
    other_files: list[str] = []

    for line in diff_output.splitlines():
        if not line.strip():
            continue
        # status<TAB>path (or status<TAB>old<TAB>new for renames; we take
        # the post-rename path as the canonical location).
        parts = line.split("\t")
        status, path = parts[0], _unquote_git_path(parts[-1])
        if not path.startswith("corpus/"):
            other_files.append(f"{status} {path}")
            continue
        segs = path.split("/", 2)
        if len(segs) < 3:
            # corpus/<filename> with no bundle dir — skip.
            continue
        _, bundle, page = segs
        if page.endswith(".md"):
            md_changes[bundle].append(page[:-3])
            md_count += 1
            if status == "A":
                new_bundles.add(bundle)
        elif page.endswith(".json"):
            json_count += 1
            json_bundles.add(bundle)

    # A bundle counts as "content-changing" if it had any .md edit. Sidecar-
    # only drift goes in the separate bucket so the commit message doesn't
    # report timestamp churn as if it were real edits.
    drift_only = sorted(json_bundles - set(md_changes))

    return {
        "md_count":          md_count,
        "json_count":        json_count,
        "content_bundles":   dict(md_changes),   # plain dict for JSON
        "json_only_bundles": drift_only,
        "new_bundles":       sorted(new_bundles),
        "other_files":       other_files,
    }


def render_human(summary: dict[str, Any]) -> str:
    """Render a ``summarize_diff``-shaped dict as multi-line plain text.

    Layout, top to bottom:

      * two headline counts (content changes, sidecar updates), followed by
        optional "new bundle" and "other file" counts when non-zero;
      * one section per content-changing bundle, alphabetical, listing at
        most five page ids and then an "... and N more" line;
      * a single line naming at most ten sidecar-only bundles.

    The wording and spacing are kept exactly as they have always been so
    that anything reading the generated commit messages keeps working.
    """
    page_preview = 5     # page ids listed per bundle before eliding
    drift_preview = 10   # drift-only bundle names listed before eliding

    pages_by_bundle = summary["content_bundles"]
    changed = sorted(pages_by_bundle)
    md_total = summary["md_count"]
    sidecar_total = summary["json_count"]
    fresh = set(summary["new_bundles"])
    sidecar_only = summary["json_only_bundles"]
    extras = summary["other_files"]

    out: list[str] = [
        f"{md_total} content change(s) across {len(changed)} bundle(s)",
        f"{sidecar_total} sidecar metadata update(s)",
    ]
    if fresh:
        out.append(f"{len(fresh)} new bundle(s) added")
    if extras:
        out.append(f"{len(extras)} other file change(s)")

    if changed:
        out += ["", "Bundles with content changes:"]
        for name in changed:
            pages = pages_by_bundle[name]
            marker = " (NEW)" if name in fresh else ""
            out.append(f"  {name}{marker}: {len(pages)} page(s)")
            out.extend(f"    - {page}" for page in pages[:page_preview])
            hidden = len(pages) - page_preview
            if hidden > 0:
                out.append(f"    ... and {hidden} more")

    if sidecar_only:
        shown = ", ".join(sidecar_only[:drift_preview])
        tail = " …" if len(sidecar_only) > drift_preview else ""
        out += [
            "",
            f"Bundles with sidecar-only drift ({len(sidecar_only)}): {shown}{tail}",
        ]

    return "\n".join(out)


def _to_utc_iso(stamp: str) -> str:
    """Re-express an ISO 8601 timestamp with an offset as UTC.

    The input is returned unchanged when it cannot be parsed or carries no
    UTC offset, so an unexpected format never aborts the history walk.
    """
    text = stamp.strip()
    # A trailing "Z" is only accepted by datetime.fromisoformat from
    # Python 3.11 on; spell it as a numeric offset for older interpreters.
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return stamp
    if parsed.tzinfo is None:
        return stamp
    return parsed.astimezone(timezone.utc).isoformat()


def walk_history(history_days: int, branch: str = "main") -> list[dict[str, Any]]:
    """Walk recent corpus-touching commits, emit one summary per commit.

    Uses ``git log --first-parent <branch>`` to keep the rolling weekly-
    refresh line clean of branch-merge noise. A commit is emitted only when
    its diff against its first parent changes at least one ``.md`` or
    ``.json`` file under ``corpus/<bundle>/``; everything else is skipped.

    NOTE(review): the module docstring says commits touching
    ``bundles.json`` are included, but such a commit yields zero md/json
    counts and is skipped here — confirm which behaviour is intended.

    Args:
        history_days: look-back window, passed to git as
            ``--since=<history_days> days ago``.
        branch: ref whose first-parent history is walked.

    Returns:
        Records in the order ``git log`` listed the commits. Each record:

            {
              "sha":       "<first 12 characters of the commit hash>",
              "timestamp": "<ISO 8601 committer date, converted to UTC>",
              "subject":   "<commit subject line>",
              ... + every field from summarize_diff()
            }

    Raises:
        subprocess.CalledProcessError: if the ``git log`` call fails.
    """
    # Find candidate commits. --first-parent keeps the linear refresh history
    # on the branch and ignores branch-side merges. We still need to filter by
    # what the commit actually touched, because non-corpus commits can land
    # on the branch too (PR merges for code, CI tweaks, etc.).
    raw = git(
        "log",
        f"--since={history_days} days ago",
        "--first-parent",
        branch,
        "--pretty=format:%H%x09%cI%x09%s",
    )

    records: list[dict[str, Any]] = []
    for line in raw.splitlines():
        if not line.strip():
            continue
        parts = line.split("\t", 2)
        if len(parts) < 3:
            continue
        sha, ts, subject = parts

        # What did this commit actually touch? Cheap: just the name-status diff
        # against its first parent. Root commits (no parent) make git exit
        # non-zero — suppress the stderr noise and skip them.
        try:
            diff = subprocess.check_output(
                ["git", "diff", "--name-status", f"{sha}^..{sha}"],
                encoding="utf-8",
                errors="replace",
                stderr=subprocess.DEVNULL,
            )
        except subprocess.CalledProcessError:
            continue
        if not diff.strip():
            continue

        summary = summarize_diff(diff)
        # Skip pure code commits — only emit records that have actual corpus
        # content motion. This is what makes the history "interesting" for
        # the weekly digest.
        if summary["md_count"] == 0 and summary["json_count"] == 0 and not summary["new_bundles"]:
            continue

        records.append({
            "sha":       sha[:12],
            # %cI carries the committer's own UTC offset; normalising to UTC
            # makes records comparable and makes a plain string sort
            # chronological.
            "timestamp": _to_utc_iso(ts),
            "subject":   subject,
            **summary,
        })

    return records


def main() -> int:
    """Command-line entry point.

    Two modes:

      * ``--history-out PATH`` — walk recent history with ``walk_history``
        and write one compact JSON object per line to PATH (overwriting
        it, creating missing parent directories), then print a one-line
        confirmation.
      * otherwise — summarise a single diff (``--cached`` for the index,
        else the ``--ref`` range) and print it as JSON (``--json``) or as
        the human-readable text.

    Returns:
        0 on success. Failures of the underlying git calls propagate as
        ``subprocess.CalledProcessError``.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains indented usage examples; the default
        # formatter would reflow them into one unreadable paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--cached", action="store_true",
                   help="Summarize staged changes instead of a ref range.")
    p.add_argument("--ref", default="HEAD^..HEAD",
                   help="Diff range to summarize (default: HEAD^..HEAD).")
    p.add_argument("--json", dest="as_json", action="store_true",
                   help="Emit one JSON object instead of the human-readable form.")
    p.add_argument("--history-out", metavar="PATH",
                   help="Walk recent corpus-touching commits and write a "
                        "JSONL history file at PATH. Overwrites if it exists. "
                        "Implies the history walker; --cached/--ref are ignored.")
    p.add_argument("--history-days", type=int, default=120,
                   help="How far back the history walker looks (default 120).")
    args = p.parse_args()

    # History-walker path: build the JSONL file consumed by the
    # weekly_digest MCP tool, then exit. CI uses this.
    if args.history_out:
        records = walk_history(args.history_days)
        # Sort by timestamp ascending so the file is roughly stable
        # across rebuilds (commits within a single run could otherwise
        # depend on git log default ordering).
        records.sort(key=lambda r: r["timestamp"])
        out_path = Path(args.history_out)
        # A fresh checkout may not have the target directory yet; create it
        # instead of failing with FileNotFoundError.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding/newline keep the committed file byte-identical
        # regardless of the platform that generated it.
        with out_path.open("w", encoding="utf-8", newline="\n") as fh:
            fh.writelines(
                json.dumps(rec, separators=(",", ":")) + "\n" for rec in records
            )
        # Brief stdout signal for CI logs — easy to spot in the workflow run.
        print(f"wrote {len(records)} commit record(s) to {args.history_out} "
              f"covering up to {args.history_days} days")
        return 0

    # One-shot summary path. Unchanged behavior for --cached / --ref.
    if args.cached:
        diff_args = ["diff", "--name-status", "--cached"]
    else:
        diff_args = ["diff", "--name-status", args.ref]
    diff = git(*diff_args)
    summary = summarize_diff(diff)

    if args.as_json:
        print(json.dumps(summary, separators=(",", ":")))
    else:
        print(render_human(summary))
    return 0


# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())