name: Update Skill Leaderboard

# Runs the eval harness with your ANTHROPIC_API_KEY secret and opens a pull
# request containing the real results (evals/results.json). Merging that PR
# lets the Pages deploy re-render the public leaderboard with real numbers.
# Manual trigger so it never burns tokens by surprise.
# (Uncomment the schedule to re-run, e.g. monthly, after model upgrades.)

on:
  workflow_dispatch:
    inputs:
      models:
        description: 'Comma-separated model ids to score'
        required: false
        default: 'claude-sonnet-4-6,claude-haiku-4-5-20251001'
      judge:
        description: 'Judge model id'
        required: false
        default: 'claude-opus-4-8'
  # schedule:
  #   - cron: '0 6 1 * *'  # 06:00 on the 1st of each month

# The final step pushes a branch and opens a pull request, so the job token
# needs write access to both repository contents and pull requests.
permissions:
  contents: write
  pull-requests: write

# All runs share one concurrency group; with cancel-in-progress disabled a
# run that is already executing is never cancelled by a newer one.
concurrency:
  group: eval-leaderboard
  cancel-in-progress: false

jobs:
  evaluate:
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Node
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Run evals
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          # Inputs are handed to the script through environment variables
          # rather than being interpolated into the shell text, so a value
          # containing quotes or $(...) is passed as data and never executed.
          # The fallbacks apply when no dispatch inputs exist (e.g. if the
          # schedule trigger is enabled).
          EVAL_MODELS: ${{ github.event.inputs.models || 'claude-sonnet-4-6,claude-haiku-4-5-20251001' }}
          EVAL_JUDGE: ${{ github.event.inputs.judge || 'claude-opus-4-8' }}
        run: |
          # Fail fast with an actionable message when the secret is missing.
          if [ -z "$ANTHROPIC_API_KEY" ]; then
            echo "::error::ANTHROPIC_API_KEY secret is not set. Add it in Settings → Secrets and variables → Actions."
            exit 1
          fi
          node evals/run-evals.mjs \
            --models "$EVAL_MODELS" \
            --judge "$EVAL_JUDGE"

      - name: Build the leaderboard page (sanity check)
        run: node scripts/build-leaderboard.mjs

      # Only evals/results.json is staged into the pull request.
      # NOTE(review): this is a third-party action running with a
      # write-scoped token; consider pinning it to a full commit SHA.
      - name: Open a PR with the refreshed results
        uses: peter-evans/create-pull-request@v7
        with:
          add-paths: evals/results.json
          branch: eval-results
          delete-branch: true
          commit-message: "chore(evals): refresh leaderboard results"
          title: "chore(evals): refresh leaderboard results"
          body: |
            Auto-generated by the **Update Skill Leaderboard** workflow.

            Merging this publishes the **real** numbers on the live leaderboard — the Pages deploy is triggered by changes to `evals/results.json`.