Make evals fast and hang-proof (timeout, retry, concurrency) (#44)

The "Run evals" step ran 24 API calls sequentially with no request timeout, so it was slow and could stall indefinitely if one call hung. - bin/lib/anthropic.mjs: per-request timeout (120s) via AbortController + retry (2x, backoff) on 429/5xx/timeout. Fails fast on 4xx (bad key/model). - evals/run-evals.mjs: run (case × model) tasks through a concurrency pool (default 4, --concurrency to tune); preserves result order. - eval-leaderboard.yml: job timeout-minutes: 20 as a safety net. Applies to the next run. The hardening also benefits the Action runner and `generate`, which share the client. Claude-Session: https://claude.ai/code/session_016JWn5jRD5tcEFKrubjQ6Px Co-authored-by: Claude <noreply@anthropic.com>
2026-06-18 13:30:06 +01:00
parent edb663ad72
commit 827d7f62ec
4 changed files with 83 additions and 30 deletions
@@ -6,31 +6,57 @@ const API_URL = 'https://api.anthropic.com/v1/messages';

 /**
 * Call the Anthropic Messages API and return the concatenated text output.
+ * Adds a per-request timeout and limited retries so a slow/transient failure
+ * can't hang a CI job forever.
 * @param {object} o
 * @param {string} o.apiKey  - Anthropic API key.
 * @param {string} [o.model] - Model id (default claude-sonnet-4-6).
 * @param {string} [o.system]- System prompt.
 * @param {Array}  o.messages- [{role, content}] messages.
 * @param {number} [o.maxTokens]
+ * @param {number} [o.timeoutMs] - Per-request timeout (default 120s).
+ * @param {number} [o.retries]   - Retries on timeout / 429 / 5xx (default 2).
 * @returns {Promise<string>}
 */
-export async function complete({ apiKey, model = 'claude-sonnet-4-6', system, messages, maxTokens = 4096 }) {
+export async function complete({ apiKey, model = 'claude-sonnet-4-6', system, messages, maxTokens = 4096, timeoutMs = 120000, retries = 2 }) {
  if (!apiKey) throw new Error('Missing Anthropic API key (set ANTHROPIC_API_KEY).');
-  const res = await fetch(API_URL, {
-    method: 'POST',
-    headers: {
-      'content-type': 'application/json',
-      'x-api-key': apiKey,
-      'anthropic-version': '2023-06-01',
-    },
-    body: JSON.stringify({ model, max_tokens: maxTokens, ...(system ? { system } : {}), messages }),
-  });
-  if (!res.ok) {
-    const body = await res.text().catch(() => '');
-    throw new Error(`Anthropic API ${res.status}: ${body.slice(0, 500)}`);
+  let lastErr;
+  for (let attempt = 0; attempt <= retries; attempt++) {
+    const ctrl = new AbortController();
+    const timer = setTimeout(() => ctrl.abort(), timeoutMs);
+    try {
+      const res = await fetch(API_URL, {
+        method: 'POST',
+        headers: {
+          'content-type': 'application/json',
+          'x-api-key': apiKey,
+          'anthropic-version': '2023-06-01',
+        },
+        body: JSON.stringify({ model, max_tokens: maxTokens, ...(system ? { system } : {}), messages }),
+        signal: ctrl.signal,
+      });
+      if (res.ok) {
+        const data = await res.json();
+        return (data.content || []).map((c) => c.text || '').join('').trim();
+      }
+      const body = await res.text().catch(() => '');
+      // Retry transient server / rate-limit errors; fail fast on 4xx (bad key/model).
+      if ((res.status === 429 || res.status >= 500) && attempt < retries) {
+        lastErr = new Error(`Anthropic API ${res.status}`);
+      } else {
+        throw new Error(`Anthropic API ${res.status}: ${body.slice(0, 500)}`);
+      }
+    } catch (e) {
+      if (e.name === 'AbortError') e = new Error(`Anthropic API request timed out after ${timeoutMs}ms`);
+      const retryable = /timed out/.test(e.message) || e.name === 'TypeError' || /Anthropic API (429|5\d\d)/.test(e.message);
+      if (!retryable || attempt >= retries) throw e;
+      lastErr = e;
+    } finally {
+      clearTimeout(timer);
+    }
+    await new Promise((r) => setTimeout(r, 1000 * 2 ** attempt)); // backoff: 1s, 2s, 4s
  }
-  const data = await res.json();
-  return (data.content || []).map((c) => c.text || '').join('').trim();
+  throw lastErr || new Error('Anthropic API request failed.');
 }

 /** Parse "name: value" YAML-ish frontmatter + body from a SKILL.md string. */