From edb663ad72f02b88db0f1cf9112030ea1bcfd15c Mon Sep 17 00:00:00 2001
From: mohitagw15856 <119053560+mohitagw15856@users.noreply.github.com>
Date: Thu, 18 Jun 2026 12:58:45 +0100
Subject: [PATCH] CI workflow to run evals and update the leaderboard (#43)

Lets the leaderboard show real numbers without a local key: the new
"Update Skill Leaderboard" workflow (workflow_dispatch) runs the eval harness
with the ANTHROPIC_API_KEY secret, commits evals/results.json, and the Pages
deploy re-renders the public leaderboard with real data.

- .github/workflows/eval-leaderboard.yml: manual trigger, contents: write,
  runs run-evals.mjs + build-leaderboard.mjs, commits results.json.
- deploy-playground.yml: also trigger on evals/results.json (and the build
  scripts) so the committed results refresh the live page.
- evals/README + CHANGELOG document the CI route.


Claude-Session: https://claude.ai/code/session_016JWn5jRD5tcEFKrubjQ6Px

Co-authored-by: Claude <noreply@anthropic.com>
---
 .github/workflows/deploy-playground.yml |  4 ++
 .github/workflows/eval-leaderboard.yml  | 67 +++++++++++++++++++++++++
 CHANGELOG.md                            |  7 ++-
 evals/README.md                         |  6 +++
 4 files changed, 83 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/eval-leaderboard.yml

diff --git a/.github/workflows/deploy-playground.yml b/.github/workflows/deploy-playground.yml
index ccfc6ea..1e21e60 100644
--- a/.github/workflows/deploy-playground.yml
+++ b/.github/workflows/deploy-playground.yml
@@ -10,6 +10,10 @@ on:
     paths:
       - 'skills/**'
       - 'web/**'
+      - 'evals/results.json'
+      - 'skill-tiers.json'
+      - 'scripts/build-docs.mjs'
+      - 'scripts/build-leaderboard.mjs'
       - '.github/workflows/deploy-playground.yml'
   workflow_dispatch:
 
diff --git a/.github/workflows/eval-leaderboard.yml b/.github/workflows/eval-leaderboard.yml
new file mode 100644
index 0000000..4520082
--- /dev/null
+++ b/.github/workflows/eval-leaderboard.yml
@@ -0,0 +1,81 @@
+name: Update Skill Leaderboard
+
+# Runs the eval harness with your ANTHROPIC_API_KEY secret, commits the real
+# results (evals/results.json), and dispatches the Pages deploy so the public
+# leaderboard re-renders with real numbers. Manual trigger so it never burns
+# tokens by surprise. (Uncomment the schedule to re-run, e.g. monthly, after
+# model upgrades.)
+
+on:
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Comma-separated model ids to score'
+        required: false
+        default: 'claude-sonnet-4-6,claude-haiku-4-5-20251001'
+      judge:
+        description: 'Judge model id'
+        required: false
+        default: 'claude-opus-4-8'
+  # schedule:
+  #   - cron: '0 6 1 * *'   # 06:00 on the 1st of each month
+
+permissions:
+  contents: write   # push the refreshed evals/results.json
+  actions: write    # dispatch deploy-playground.yml after the push
+
+# One run at a time: a new run waits for the in-progress one instead of
+# cancelling it.
+concurrency:
+  group: eval-leaderboard
+  cancel-in-progress: false
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: Run evals
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          # Inputs reach the script as environment variables rather than being
+          # interpolated into the shell text, so a crafted value cannot inject
+          # commands. The fallbacks cover runs that carry no inputs (schedule).
+          EVAL_MODELS: ${{ github.event.inputs.models || 'claude-sonnet-4-6,claude-haiku-4-5-20251001' }}
+          EVAL_JUDGE: ${{ github.event.inputs.judge || 'claude-opus-4-8' }}
+        run: |
+          if [ -z "$ANTHROPIC_API_KEY" ]; then
+            echo "::error::ANTHROPIC_API_KEY secret is not set. Add it in Settings → Secrets and variables → Actions."
+            exit 1
+          fi
+          node evals/run-evals.mjs \
+            --models "$EVAL_MODELS" \
+            --judge "$EVAL_JUDGE"
+
+      - name: Build the leaderboard page (sanity check)
+        run: node scripts/build-leaderboard.mjs
+
+      - name: Commit results
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add evals/results.json
+          if git diff --cached --quiet; then
+            echo "No change in results."
+          else
+            git commit -m "chore(evals): refresh leaderboard results"
+            git push
+            # A push made with GITHUB_TOKEN does not start push-triggered
+            # workflows, so start the Pages deploy via its workflow_dispatch.
+            gh workflow run deploy-playground.yml --ref "$GITHUB_REF_NAME"
+            echo "Committed evals/results.json — the Pages deploy will render real numbers."
+          fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b987376..2267b55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,12 @@ each new wave of skills bumps the **major** version, extensions and fixes bump
 
 ## [Unreleased]
 
-_Nothing yet._
+### Added
+- **One-click leaderboard updates in CI** — `.github/workflows/eval-leaderboard.yml`
+  ("Update Skill Leaderboard") runs the evals with the `ANTHROPIC_API_KEY` secret, commits
+  `evals/results.json`, and the Pages deploy re-renders the public leaderboard with real
+  numbers — no local key needed. The deploy workflow now also triggers on
+  `evals/results.json`.
 
 ## [20.0.0] — Agentic Tooling — 2026-06-18
 
diff --git a/evals/README.md b/evals/README.md
index a830901..0bf62c2 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -28,6 +28,12 @@ node scripts/build-leaderboard.mjs       # render web/leaderboard.html
 `run-evals.mjs` writes `evals/results.json`; the leaderboard builder prefers it and falls
 back to `results.example.json` (clearly labelled) so the page renders before you run real evals.
 
+### No local key? Run it in CI
+
+Add an `ANTHROPIC_API_KEY` repo secret, then go to **Actions → "Update Skill Leaderboard"
+→ Run workflow**. It runs the evals, commits `evals/results.json`, and the Pages deploy
+re-renders the public leaderboard with real numbers — no laptop required.
+
 ## Add a case
 
 Append to [`cases.json`](cases.json): `{ "skill": "<name>", "input": "<a realistic prompt>" }`.