Files
mohitagw15856 827d7f62ec Make evals fast and hang-proof (timeout, retry, concurrency) (#44)
The "Run evals" step ran 24 API calls sequentially with no request timeout, so
it was slow and could stall indefinitely if one call hung.

- bin/lib/anthropic.mjs: per-request timeout (120s) via AbortController + retry
  (2x, backoff) on 429/5xx/timeout. Fails fast on 4xx (bad key/model).
- evals/run-evals.mjs: run (case × model) tasks through a concurrency pool
  (default 4, --concurrency to tune); preserves result order.
- eval-leaderboard.yml: job timeout-minutes: 20 as a safety net.

Applies to the next run. The hardening also benefits the Action runner and
`generate`, which share the client.


Claude-Session: https://claude.ai/code/session_016JWn5jRD5tcEFKrubjQ6Px

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-18 13:30:06 +01:00

114 lines
4.7 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
// Skill eval harness. For each case × model: run the skill, then score the output
// with an LLM judge on a fixed rubric. Writes evals/results.json — feed it to
// scripts/build-leaderboard.mjs to render web/leaderboard.html.
//
// Requires an Anthropic API key (this calls the API and costs tokens).
//
// Usage:
// ANTHROPIC_API_KEY=sk-ant-... node evals/run-evals.mjs
// ... node evals/run-evals.mjs --models claude-opus-4-8,claude-sonnet-4-6,claude-haiku-4-5-20251001
// ... node evals/run-evals.mjs --judge claude-opus-4-8 --cases evals/cases.json
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { complete, parseSkill } from '../bin/lib/anthropic.mjs';
const __dirname = dirname(fileURLToPath(import.meta.url));
const root = join(__dirname, '..');
function arg(name, def) {
const i = process.argv.indexOf(`--${name}`);
return i !== -1 ? process.argv[i + 1] : def;
}
const apiKey = process.env.ANTHROPIC_API_KEY || '';
const models = arg('models', 'claude-sonnet-4-6,claude-haiku-4-5-20251001').split(',').map((s) => s.trim());
const judge = arg('judge', 'claude-opus-4-8');
const casesPath = arg('cases', join(__dirname, 'cases.json'));
const outPath = arg('out', join(__dirname, 'results.json'));
const DIMENSIONS = ['structure', 'completeness', 'usefulness', 'grounding'];
function runPrompt(skillBody) {
return skillBody + '\n\n---\nExecute this skill now on the input. Output only the finished artifact.';
}
function judgePrompt(description, output) {
return `You are a strict evaluator of a professional work artifact.
The artifact was produced by a skill whose job is:
"${description}"
Score the artifact below from 1 (poor) to 5 (excellent) on each dimension:
- structure: follows a clear, expected structure for this kind of output
- completeness: covers what the task needs, nothing important missing
- usefulness: actually useful to a professional, specific not generic
- grounding: stays grounded in the given input, no invented facts/metrics
Return ONLY a JSON object, no prose: {"structure":N,"completeness":N,"usefulness":N,"grounding":N}
--- ARTIFACT ---
${output}`;
}
function parseScores(text) {
const m = text.match(/\{[\s\S]*\}/);
if (!m) throw new Error('judge did not return JSON');
const j = JSON.parse(m[0]);
const s = {};
for (const d of DIMENSIONS) s[d] = Math.max(1, Math.min(5, Number(j[d]) || 0));
return s;
}
// Run an async worker over `items` with at most `limit` in flight.
async function pool(items, limit, worker) {
const out = [];
let i = 0;
await Promise.all(Array.from({ length: Math.min(limit, items.length) }, async () => {
while (i < items.length) {
const idx = i++;
out[idx] = await worker(items[idx]);
}
}));
return out;
}
async function scoreTask({ c, body, description, model }) {
try {
const output = await complete({ apiKey, model, system: runPrompt(body), messages: [{ role: 'user', content: c.input }], maxTokens: 3000 });
const judged = await complete({ apiKey, model: judge, messages: [{ role: 'user', content: judgePrompt(description, output) }], maxTokens: 200 });
const scores = parseScores(judged);
const overall = DIMENSIONS.reduce((a, d) => a + scores[d], 0) / DIMENSIONS.length;
process.stderr.write(`✓ ${c.skill} on ${model}${overall.toFixed(2)}/5\n`);
return { skill: c.skill, model, scores, overall: Math.round(overall * 100) / 100 };
} catch (e) {
process.stderr.write(`✗ ${c.skill} on ${model} — FAILED (${e.message})\n`);
return null;
}
}
async function main() {
if (!apiKey) { console.error('Set ANTHROPIC_API_KEY to run evals.'); process.exit(1); }
const concurrency = parseInt(arg('concurrency', '4'), 10) || 4;
const { cases } = JSON.parse(readFileSync(casesPath, 'utf8'));
// Build the full (case × model) task list.
const tasks = [];
for (const c of cases) {
const skillFile = join(root, 'skills', c.skill, 'SKILL.md');
if (!existsSync(skillFile)) { console.error(`skip ${c.skill}: no SKILL.md`); continue; }
const { meta, body } = parseSkill(readFileSync(skillFile, 'utf8'));
for (const model of models) tasks.push({ c, body, description: meta.description || c.skill, model });
}
process.stderr.write(`Scoring ${tasks.length} runs (concurrency ${concurrency})…\n`);
const results = (await pool(tasks, concurrency, scoreTask)).filter(Boolean);
const out = { generatedAt: new Date().toISOString(), judge, models, dimensions: DIMENSIONS, results };
writeFileSync(outPath, JSON.stringify(out, null, 2));
console.log(`\nWrote ${outPath}${results.length}/${tasks.length} scored runs. Build the page: node scripts/build-leaderboard.mjs`);
}
main();