diff --git a/.github/workflows/eval-leaderboard.yml b/.github/workflows/eval-leaderboard.yml index 4520082..ff24cdc 100644 --- a/.github/workflows/eval-leaderboard.yml +++ b/.github/workflows/eval-leaderboard.yml @@ -29,6 +29,7 @@ concurrency: jobs: evaluate: runs-on: ubuntu-latest + timeout-minutes: 20 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 2267b55..32be794 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,12 @@ each new wave of skills bumps the **major** version, extensions and fixes bump ## [Unreleased] +### Changed +- **Faster, hang-proof evals.** The Anthropic client now has a per-request timeout (120s) + and limited retries (429/5xx/timeout); the eval harness runs cases concurrently + (default 4). The leaderboard workflow has a 20-minute job timeout. A 24-call run that + was sequential now finishes in a few minutes and can't stall a job indefinitely. + ### Added - **One-click leaderboard updates in CI** — `.github/workflows/eval-leaderboard.yml` ("Update Skill Leaderboard") runs the evals with the `ANTHROPIC_API_KEY` secret, commits diff --git a/bin/lib/anthropic.mjs b/bin/lib/anthropic.mjs index 409d5b4..445364e 100644 --- a/bin/lib/anthropic.mjs +++ b/bin/lib/anthropic.mjs @@ -6,31 +6,57 @@ const API_URL = 'https://api.anthropic.com/v1/messages'; /** * Call the Anthropic Messages API and return the concatenated text output. + * Adds a per-request timeout and limited retries so a slow/transient failure + * can't hang a CI job forever. * @param {object} o * @param {string} o.apiKey - Anthropic API key. * @param {string} [o.model] - Model id (default claude-sonnet-4-6). * @param {string} [o.system]- System prompt. * @param {Array} o.messages- [{role, content}] messages. * @param {number} [o.maxTokens] + * @param {number} [o.timeoutMs] - Per-request timeout (default 120s). + * @param {number} [o.retries] - Retries on timeout / 429 / 5xx (default 2). 
/**
 * Call the Anthropic Messages API and return the concatenated text output.
 *
 * Every attempt is bounded by `timeoutMs`. Timeouts, network-level failures,
 * HTTP 429 and HTTP 5xx responses are retried with exponential backoff; every
 * other failure (any other status, an unparseable or malformed response body)
 * is thrown immediately so a bad key/model fails fast.
 *
 * @param {object} o
 * @param {string} o.apiKey - Anthropic API key.
 * @param {string} [o.model] - Model id (default claude-sonnet-4-6).
 * @param {string} [o.system] - System prompt; omitted from the request when falsy.
 * @param {Array} o.messages - [{role, content}] messages.
 * @param {number} [o.maxTokens] - Sent as `max_tokens` (default 4096).
 * @param {number} [o.timeoutMs] - Per-attempt timeout in ms (default 120000).
 * @param {number} [o.retries] - Extra attempts after the first on timeout /
 *   network error / 429 / 5xx (default 2). Negative or non-numeric values are
 *   treated as 0, so one request is always made.
 * @param {number} [o.backoffMs] - Base backoff delay in ms (default 1000); the
 *   wait before retry k (0-based) is `backoffMs * 2 ** k`.
 * @returns {Promise<string>} Text of all content parts, joined and trimmed.
 * @throws {Error} When the key is missing, the request times out on the final
 *   attempt, or the API answers with a non-2xx status that is not retried.
 */
export async function complete({
  apiKey,
  model = 'claude-sonnet-4-6',
  system,
  messages,
  maxTokens = 4096,
  timeoutMs = 120000,
  retries = 2,
  backoffMs = 1000,
}) {
  if (!apiKey) throw new Error('Missing Anthropic API key (set ANTHROPIC_API_KEY).');

  // Clamp so that a negative / NaN value can never skip the request entirely.
  const maxRetries = Math.max(0, Math.floor(Number(retries)) || 0);
  const baseDelay = Math.max(0, Number(backoffMs) || 0);

  // Serialize once, outside the retry loop: a serialization error is a caller
  // bug, not a transient failure, and must not be retried.
  const payload = JSON.stringify({
    model,
    max_tokens: maxTokens,
    ...(system ? { system } : {}),
    messages,
  });

  for (let attempt = 0; ; attempt += 1) {
    let data;
    let failure = null;
    let retryable = false;

    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), timeoutMs);
    try {
      const res = await fetch(API_URL, {
        method: 'POST',
        headers: {
          'content-type': 'application/json',
          'x-api-key': apiKey,
          'anthropic-version': '2023-06-01',
        },
        body: payload,
        signal: ctrl.signal,
      });
      if (res.ok) {
        data = await res.json();
      } else {
        const body = await res.text().catch(() => '');
        failure = new Error(`Anthropic API ${res.status}: ${body.slice(0, 500)}`);
        // Decide from the status code itself, never from the message text:
        // the response body is untrusted and may contain any wording.
        retryable = res.status === 429 || res.status >= 500;
      }
    } catch (err) {
      if (ctrl.signal.aborted) {
        // Only our own timer aborts this controller, so this is a timeout.
        failure = new Error(`Anthropic API request timed out after ${timeoutMs}ms`, { cause: err });
        retryable = true;
      } else {
        failure = err;
        // fetch reports network-level failures as TypeError; JSON parse
        // errors (SyntaxError) are not transient and are not retried.
        retryable = err?.name === 'TypeError';
      }
    } finally {
      clearTimeout(timer);
    }

    if (failure === null) {
      // Extracted outside the try block so a malformed payload surfaces
      // directly instead of being mistaken for a network error.
      return (data.content || []).map((c) => c.text || '').join('').trim();
    }
    if (!retryable || attempt >= maxRetries) throw failure;

    // Exponential backoff: with the defaults this waits 1s, then 2s.
    await new Promise((resolve) => setTimeout(resolve, baseDelay * 2 ** attempt));
  }
}
/**
 * Run the async `worker` over every element of `items` with at most `limit`
 * calls in flight, and return the results in input order.
 *
 * @param {Array} items - Inputs, one worker call each.
 * @param {number} limit - Maximum concurrent calls. Values below 1 (or NaN)
 *   are treated as 1 so that the work is never silently skipped.
 * @param {(item: any) => Promise<any>} worker - Called once per item.
 * @returns {Promise<Array>} `out[i]` is the resolved value of `worker(items[i])`.
 */
async function pool(items, limit, worker) {
  const out = [];
  // A zero, negative or NaN limit would otherwise create zero lanes below and
  // return an empty result without running anything.
  const lanes = Math.min(Math.max(1, Math.floor(limit) || 1), items.length);
  let next = 0;
  // Each lane claims the next unprocessed index until the list is exhausted.
  // The claim (`next++`) happens synchronously, so no index is taken twice.
  const lane = async () => {
    while (next < items.length) {
      const idx = next++;
      out[idx] = await worker(items[idx]);
    }
  };
  await Promise.all(Array.from({ length: lanes }, () => lane()));
  return out;
}

/**
 * Score one (case, model) pair: run the skill prompt on `model`, have the
 * judge model grade the output, and average the per-dimension scores.
 * Failures are logged to stderr and reported as `null` rather than thrown,
 * so one bad run does not abort the rest of the batch.
 *
 * Relies on `apiKey`, `judge`, `DIMENSIONS`, `runPrompt`, `judgePrompt`,
 * `parseScores` and `complete`, which are defined outside this block.
 *
 * @param {object} t
 * @param {object} t.c - Eval case; `c.skill` and `c.input` are read here.
 * @param {string} t.body - Skill body passed to `runPrompt`.
 * @param {string} t.description - Skill description passed to `judgePrompt`.
 * @param {string} t.model - Model id to evaluate.
 * @returns {Promise<{skill: string, model: string, scores: object, overall: number} | null>}
 */
async function scoreTask({ c, body, description, model }) {
  try {
    const output = await complete({ apiKey, model, system: runPrompt(body), messages: [{ role: 'user', content: c.input }], maxTokens: 3000 });
    const judged = await complete({ apiKey, model: judge, messages: [{ role: 'user', content: judgePrompt(description, output) }], maxTokens: 200 });
    const scores = parseScores(judged);
    const overall = DIMENSIONS.reduce((a, d) => a + scores[d], 0) / DIMENSIONS.length;
    process.stderr.write(`✓ ${c.skill} on ${model} — ${overall.toFixed(2)}/5\n`);
    // Stored value is rounded to two decimals.
    return { skill: c.skill, model, scores, overall: Math.round(overall * 100) / 100 };
  } catch (e) {
    process.stderr.write(`✗ ${c.skill} on ${model} — FAILED (${e?.message ?? e})\n`);
    return null;
  }
}

/**
 * Entry point: build the (case × model) task list, score the tasks
 * concurrently, and write the results JSON to `outPath`.
 *
 * `apiKey`, `arg`, `casesPath`, `root`, `models`, `judge`, `outPath` and the
 * fs/path helpers come from the part of the module outside this block.
 */
async function main() {
  if (!apiKey) { console.error('Set ANTHROPIC_API_KEY to run evals.'); process.exit(1); }
  // Fall back to 4 for anything that is not a positive integer; a negative
  // value used to slip through and produce an empty results file.
  const requested = Number.parseInt(arg('concurrency', '4'), 10);
  const concurrency = requested > 0 ? requested : 4;
  const { cases } = JSON.parse(readFileSync(casesPath, 'utf8'));

  // Build the full (case × model) task list.
  const tasks = [];
  for (const c of cases) {
    const skillFile = join(root, 'skills', c.skill, 'SKILL.md');
    if (!existsSync(skillFile)) { console.error(`skip ${c.skill}: no SKILL.md`); continue; }
    const { meta, body } = parseSkill(readFileSync(skillFile, 'utf8'));
    for (const model of models) tasks.push({ c, body, description: meta.description || c.skill, model });
  }

  process.stderr.write(`Scoring ${tasks.length} runs (concurrency ${concurrency})…\n`);
  // scoreTask returns null for failed runs; drop those from the output.
  const results = (await pool(tasks, concurrency, scoreTask)).filter(Boolean);

  const out = { generatedAt: new Date().toISOString(), judge, models, dimensions: DIMENSIONS, results };
  writeFileSync(outPath, JSON.stringify(out, null, 2));
  console.log(`\nWrote ${outPath} — ${results.length}/${tasks.length} scored runs. Build the page: node scripts/build-leaderboard.mjs`);
}

// Handle rejection explicitly instead of leaving the promise floating.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});