From edb663ad72f02b88db0f1cf9112030ea1bcfd15c Mon Sep 17 00:00:00 2001
From: mohitagw15856 <119053560+mohitagw15856@users.noreply.github.com>
Date: Thu, 18 Jun 2026 12:58:45 +0100
Subject: [PATCH] CI workflow to run evals and update the leaderboard (#43)

Lets the leaderboard show real numbers without a local key: the new
"Update Skill Leaderboard" workflow (workflow_dispatch) runs the eval harness
with the ANTHROPIC_API_KEY secret, commits evals/results.json, and the Pages
deploy re-renders the public leaderboard with real data.

- .github/workflows/eval-leaderboard.yml: manual trigger, contents: write,
  runs run-evals.mjs + build-leaderboard.mjs, commits results.json.
- deploy-playground.yml: also trigger on evals/results.json (and the build
  scripts) so the committed results refresh the live page.
- evals/README + CHANGELOG document the CI route.

Claude-Session: https://claude.ai/code/session_016JWn5jRD5tcEFKrubjQ6Px
Co-authored-by: Claude
---
Review note: eval-leaderboard.yml below was revised after the commit named
above, so the blob id on its `index` line no longer matches. Workflow inputs
are now passed to the shell through `env:`, and the Pages deploy is dispatched
explicitly because a GITHUB_TOKEN push does not start `on: push` workflows.

 .github/workflows/deploy-playground.yml |  4 ++
 .github/workflows/eval-leaderboard.yml  | 80 +++++++++++++++++++++++++
 CHANGELOG.md                            |  7 ++-
 evals/README.md                         |  6 +++
 4 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/eval-leaderboard.yml

diff --git a/.github/workflows/deploy-playground.yml b/.github/workflows/deploy-playground.yml
index ccfc6ea..1e21e60 100644
--- a/.github/workflows/deploy-playground.yml
+++ b/.github/workflows/deploy-playground.yml
@@ -10,6 +10,10 @@ on:
     paths:
       - 'skills/**'
       - 'web/**'
+      - 'evals/results.json'
+      - 'skill-tiers.json'
+      - 'scripts/build-docs.mjs'
+      - 'scripts/build-leaderboard.mjs'
       - '.github/workflows/deploy-playground.yml'
   workflow_dispatch:
 
diff --git a/.github/workflows/eval-leaderboard.yml b/.github/workflows/eval-leaderboard.yml
new file mode 100644
index 0000000..4520082
--- /dev/null
+++ b/.github/workflows/eval-leaderboard.yml
@@ -0,0 +1,80 @@
+name: Update Skill Leaderboard
+
+# Runs the eval harness with your ANTHROPIC_API_KEY secret, commits the real
+# results (evals/results.json), and lets the Pages deploy re-render the public
+# leaderboard with real numbers. Manual trigger so it never burns tokens by
+# surprise. (Uncomment the schedule to re-run, e.g. monthly, after model upgrades.)
+#
+# The results commit is pushed with the default GITHUB_TOKEN, and pushes made
+# with that token do not start `on: push` workflows. The commit step therefore
+# dispatches deploy-playground.yml explicitly, which needs `actions: write`.
+
+on:
+  workflow_dispatch:
+    inputs:
+      models:
+        description: 'Comma-separated model ids to score'
+        required: false
+        default: 'claude-sonnet-4-6,claude-haiku-4-5-20251001'
+      judge:
+        description: 'Judge model id'
+        required: false
+        default: 'claude-opus-4-8'
+  # schedule:
+  #   - cron: '0 6 1 * *'  # 06:00 on the 1st of each month
+
+permissions:
+  contents: write  # push the refreshed evals/results.json
+  actions: write   # dispatch the Pages deploy after the push
+
+concurrency:
+  group: eval-leaderboard
+  cancel-in-progress: false
+
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Node
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: Run evals
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          # Inputs reach the shell as environment variables, never as text
+          # spliced into the script, so a crafted value cannot inject commands.
+          # The fallbacks cover runs without inputs (e.g. the optional schedule).
+          MODELS: ${{ github.event.inputs.models || 'claude-sonnet-4-6,claude-haiku-4-5-20251001' }}
+          JUDGE: ${{ github.event.inputs.judge || 'claude-opus-4-8' }}
+        run: |
+          if [ -z "$ANTHROPIC_API_KEY" ]; then
+            echo "::error::ANTHROPIC_API_KEY secret is not set. Add it in Settings → Secrets and variables → Actions."
+            exit 1
+          fi
+          node evals/run-evals.mjs --models "$MODELS" --judge "$JUDGE"
+
+      - name: Build the leaderboard page (sanity check)
+        run: node scripts/build-leaderboard.mjs
+
+      - name: Commit results
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add evals/results.json
+          if git diff --cached --quiet; then
+            echo "No change in results."
+          else
+            git commit -m "chore(evals): refresh leaderboard results"
+            git push
+            # A GITHUB_TOKEN push does not trigger the deploy's `on: push`
+            # filter, so start the deploy run explicitly on the same branch.
+            gh workflow run deploy-playground.yml --ref "$GITHUB_REF_NAME"
+            echo "Committed evals/results.json — the Pages deploy will render real numbers."
+          fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b987376..2267b55 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,12 @@ each new wave of skills bumps the **major** version, extensions and fixes bump
 
 ## [Unreleased]
 
-_Nothing yet._
+### Added
+- **One-click leaderboard updates in CI** — `.github/workflows/eval-leaderboard.yml`
+  ("Update Skill Leaderboard") runs the evals with the `ANTHROPIC_API_KEY` secret, commits
+  `evals/results.json`, and the Pages deploy re-renders the public leaderboard with real
+  numbers — no local key needed. The deploy workflow now also triggers on
+  `evals/results.json`.
 
 ## [20.0.0] — Agentic Tooling — 2026-06-18
 
diff --git a/evals/README.md b/evals/README.md
index a830901..0bf62c2 100644
--- a/evals/README.md
+++ b/evals/README.md
@@ -28,6 +28,12 @@ node scripts/build-leaderboard.mjs # render web/leaderboard.html
 `run-evals.mjs` writes `evals/results.json`; the leaderboard builder prefers it and
 falls back to `results.example.json` (clearly labelled) so the page renders before you run real evals.
 
+### No local key? Run it in CI
+
+Add an `ANTHROPIC_API_KEY` repo secret, then go to **Actions → "Update Skill Leaderboard"
+→ Run workflow**. It runs the evals, commits `evals/results.json`, and the Pages deploy
+re-renders the public leaderboard with real numbers — no laptop required.
+
 ## Add a case
 
 Append to [`cases.json`](cases.json): `{ "skill": "", "input": "" }`.