51bf4be52f
Three features riding 2026 trends (agentic CI, codegen, evals), sharing one dependency-free Anthropic client (bin/lib/anthropic.mjs). 1. GitHub Action (action/) — run any skill in a consumer repo's CI: uses: mohitagw15856/pm-claude-skills/action@main. Composite action + run.mjs (loads the bundled SKILL.md, calls the API, exposes result as a step output / file). Docs with auto-PR-description example. 2. generate command — `npx pm-claude-skills generate --from <url|file>` turns a team's docs into a SKILL.md following the authoring standard (bin/generate.mjs, wired into the CLI; needs ANTHROPIC_API_KEY). 3. Skill evals + Leaderboard — evals/run-evals.mjs runs each case across models and scores output with an LLM judge (structure/completeness/usefulness/ grounding); scripts/build-leaderboard.mjs renders web/leaderboard.html (built in the Pages deploy, falls back to clearly-labelled example data). Linked from README, catalog, and playground. Offline-testable parts verified (prompt building, skill loading, graceful errors, leaderboard render). SkillCheck/audit/exports all green. Claude-Session: https://claude.ai/code/session_016JWn5jRD5tcEFKrubjQ6Px Co-authored-by: Claude <noreply@anthropic.com>
94 lines
4.0 KiB
JavaScript
94 lines
4.0 KiB
JavaScript
#!/usr/bin/env node
|
||
// Skill eval harness. For each case × model: run the skill, then score the output
|
||
// with an LLM judge on a fixed rubric. Writes evals/results.json — feed it to
|
||
// scripts/build-leaderboard.mjs to render web/leaderboard.html.
|
||
//
|
||
// Requires an Anthropic API key (this calls the API and costs tokens).
|
||
//
|
||
// Usage:
|
||
// ANTHROPIC_API_KEY=sk-ant-... node evals/run-evals.mjs
|
||
// ... node evals/run-evals.mjs --models claude-opus-4-8,claude-sonnet-4-6,claude-haiku-4-5-20251001
|
||
// ... node evals/run-evals.mjs --judge claude-opus-4-8 --cases evals/cases.json
|
||
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
|
||
import { join, dirname } from 'node:path';
|
||
import { fileURLToPath } from 'node:url';
|
||
import { complete, parseSkill } from '../bin/lib/anthropic.mjs';
|
||
|
||
// ES modules do not provide __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// One level above this script's directory; the skills/ folder is resolved from here.
const root = dirname(__dirname);
|
||
|
||
/**
 * Read the value that follows a `--name` flag on the command line.
 *
 * @param {string} name - Flag name without the leading dashes.
 * @param {*} def - Fallback returned when the flag is absent, or when it is
 *   the last token and therefore has no value after it.
 * @returns {*} The token following `--name`, or `def`.
 */
function arg(name, def) {
  const flagIndex = process.argv.indexOf(`--${name}`);
  if (flagIndex === -1) return def;
  // A flag given as the final token has nothing after it; returning undefined
  // here would make callers such as `.split(',')` crash, so use the default.
  const value = process.argv[flagIndex + 1];
  return value === undefined ? def : value;
}
|
||
|
||
// Runtime configuration, read once at startup from the environment and CLI flags.
const apiKey = process.env.ANTHROPIC_API_KEY || '';
// --models is a comma-separated list. Blank entries (from a trailing or doubled
// comma) are dropped so they neither trigger a doomed API call nor end up as an
// empty model name in the results file.
const models = arg('models', 'claude-sonnet-4-6,claude-haiku-4-5-20251001')
  .split(',')
  .map((s) => s.trim())
  .filter(Boolean);
// Model used to score every output.
const judge = arg('judge', 'claude-opus-4-8');
// Input cases file and output results file; both default to this script's directory.
const casesPath = arg('cases', join(__dirname, 'cases.json'));
const outPath = arg('out', join(__dirname, 'results.json'));

// Rubric dimensions the judge scores; also the keys of every `scores` object.
const DIMENSIONS = ['structure', 'completeness', 'usefulness', 'grounding'];
|
||
|
||
/**
 * Build the system prompt for a skill run: the skill body followed by a
 * separator and an instruction to execute it on the user's input.
 *
 * @param {string} skillBody - Body text of the skill's SKILL.md.
 * @returns {string} The system prompt.
 */
function runPrompt(skillBody) {
  const directive = 'Execute this skill now on the input. Output only the finished artifact.';
  return `${skillBody}\n\n---\n${directive}`;
}
|
||
|
||
/**
 * Build the prompt sent to the judge model: the skill's job description, the
 * four-dimension 1-5 rubric, the required JSON reply shape, and the artifact.
 *
 * @param {string} description - What the skill is supposed to do.
 * @param {string} output - The artifact produced by the skill run.
 * @returns {string} The judge prompt.
 */
function judgePrompt(description, output) {
  const rubric = [
    '- structure: follows a clear, expected structure for this kind of output',
    '- completeness: covers what the task needs, nothing important missing',
    '- usefulness: actually useful to a professional, specific not generic',
    '- grounding: stays grounded in the given input, no invented facts/metrics',
  ];
  const lines = [
    'You are a strict evaluator of a professional work artifact.',
    '',
    'The artifact was produced by a skill whose job is:',
    `"${description}"`,
    '',
    'Score the artifact below from 1 (poor) to 5 (excellent) on each dimension:',
    ...rubric,
    '',
    'Return ONLY a JSON object, no prose: {"structure":N,"completeness":N,"usefulness":N,"grounding":N}',
    '',
    '--- ARTIFACT ---',
    // Interpolated rather than passed raw so it is stringified exactly as a
    // template literal would (join() would turn null/undefined into '').
    `${output}`,
  ];
  return lines.join('\n');
}
|
||
|
||
/**
 * Extract the judge's rubric scores from its reply.
 *
 * The reply is expected to contain a JSON object with one numeric entry per
 * name in DIMENSIONS. Surrounding prose or code fences are tolerated. Finite
 * scores are clamped to the 1-5 range.
 *
 * @param {string} text - Raw text returned by the judge model.
 * @returns {Object<string, number>} One score per dimension.
 * @throws {Error} If no JSON object is found, the JSON is malformed, or a
 *   dimension has no numeric score.
 */
function parseScores(text) {
  const reply = String(text);
  const widest = reply.match(/\{[\s\S]*\}/);
  if (!widest) throw new Error('judge did not return JSON');

  let parsed;
  try {
    parsed = JSON.parse(widest[0]);
  } catch (outerErr) {
    // The widest match runs from the first "{" to the last "}", so trailing
    // text that contains braces breaks it. The expected object is flat, so
    // retry with the first brace pair that has no braces inside it.
    const flat = reply.match(/\{[^{}]*\}/);
    try {
      parsed = JSON.parse(flat ? flat[0] : '');
    } catch {
      throw new Error(`judge returned malformed JSON (${outerErr.message})`, { cause: outerErr });
    }
  }

  const scores = {};
  for (const dim of DIMENSIONS) {
    const raw = parsed[dim];
    // Numeric strings such as "4" are accepted; anything else non-numeric is not.
    const value = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
    if (typeof value !== 'number' || !Number.isFinite(value)) {
      // Fail the run instead of silently recording the minimum score.
      throw new Error(`judge returned no numeric score for "${dim}"`);
    }
    scores[dim] = Math.max(1, Math.min(5, value));
  }
  return scores;
}
|
||
|
||
/**
 * Run every eval case against every configured model, have the judge model
 * score each output, and write the collected results to `outPath` as JSON.
 *
 * Exits with status 1 when ANTHROPIC_API_KEY is unset or when the cases file
 * cannot be read, parsed, or lacks a `cases` array. A case whose SKILL.md is
 * missing is skipped. A failure in a single run (API error, unusable judge
 * reply) is reported on stderr and the remaining runs continue.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (!apiKey) {
    console.error('Set ANTHROPIC_API_KEY to run evals.');
    process.exit(1);
  }

  // Report an unreadable or malformed cases file as a clear message rather
  // than an unhandled rejection with a stack trace.
  let cases;
  try {
    ({ cases } = JSON.parse(readFileSync(casesPath, 'utf8')));
  } catch (err) {
    console.error(`Could not load eval cases from ${casesPath}: ${err.message}`);
    process.exit(1);
  }
  if (!Array.isArray(cases)) {
    console.error(`${casesPath} must contain a top-level "cases" array.`);
    process.exit(1);
  }

  const results = [];

  for (const c of cases) {
    const skillFile = join(root, 'skills', c.skill, 'SKILL.md');
    if (!existsSync(skillFile)) {
      console.error(`skip ${c.skill}: no SKILL.md`);
      continue;
    }
    const { meta, body } = parseSkill(readFileSync(skillFile, 'utf8'));

    // Runs are sequential so the progress lines on stderr stay in order.
    for (const model of models) {
      process.stderr.write(`Running ${c.skill} on ${model}… `);
      try {
        // Step 1: execute the skill on the case input with the model under test.
        const output = await complete({
          apiKey,
          model,
          system: runPrompt(body),
          messages: [{ role: 'user', content: c.input }],
          maxTokens: 3000,
        });
        // Step 2: have the judge model score the output. The skill name stands
        // in for the description when the skill's metadata has none.
        const judged = await complete({
          apiKey,
          model: judge,
          messages: [{ role: 'user', content: judgePrompt(meta.description || c.skill, output) }],
          maxTokens: 200,
        });
        const scores = parseScores(judged);
        // Overall score is the unweighted mean of the dimensions, stored to 2 decimals.
        const overall = DIMENSIONS.reduce((a, d) => a + scores[d], 0) / DIMENSIONS.length;
        results.push({ skill: c.skill, model, scores, overall: Math.round(overall * 100) / 100 });
        process.stderr.write(`${overall.toFixed(2)}/5\n`);
      } catch (e) {
        // Best effort: one failed run must not abort the whole evaluation.
        process.stderr.write(`FAILED (${e.message})\n`);
      }
    }
  }

  const out = { generatedAt: new Date().toISOString(), judge, models, dimensions: DIMENSIONS, results };
  writeFileSync(outPath, JSON.stringify(out, null, 2));
  console.log(`\nWrote ${outPath} — ${results.length} scored runs. Build the page: node scripts/build-leaderboard.mjs`);
}
|
||
|
||
// Entry point. Handle a rejected main() explicitly so an unexpected failure is
// logged and the process ends with a non-zero exit status.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|