Files
pm-claude-skills/scripts/build-leaderboard.mjs
T
mohitagw15856 51bf4be52f AI-powered tooling: GitHub Action, generate command, evals + leaderboard (#41)
Three features riding 2026 trends (agentic CI, codegen, evals), sharing one
dependency-free Anthropic client (bin/lib/anthropic.mjs).

1. GitHub Action (action/) — run any skill in a consumer repo's CI:
   uses: mohitagw15856/pm-claude-skills/action@main. Composite action +
   run.mjs (loads the bundled SKILL.md, calls the API, exposes result as a
   step output / file). Docs with auto-PR-description example.

2. generate command — `npx pm-claude-skills generate --from <url|file>` turns
   a team's docs into a SKILL.md following the authoring standard
   (bin/generate.mjs, wired into the CLI; needs ANTHROPIC_API_KEY).

3. Skill evals + Leaderboard — evals/run-evals.mjs runs each case across models
   and scores output with an LLM judge (structure/completeness/usefulness/
   grounding); scripts/build-leaderboard.mjs renders web/leaderboard.html
   (built in the Pages deploy, falls back to clearly-labelled example data).
   Linked from README, catalog, and playground.

Offline-testable parts verified (prompt building, skill loading, graceful
errors, leaderboard render). SkillCheck/audit/exports all green.


Claude-Session: https://claude.ai/code/session_016JWn5jRD5tcEFKrubjQ6Px

Co-authored-by: Claude <noreply@anthropic.com>
2026-06-18 08:37:40 +01:00

77 lines
4.8 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
// Renders web/leaderboard.html from evals/results.json (or evals/results.example.json
// as a clearly-labelled placeholder). Run after evals/run-evals.mjs. No dependencies.
import { readFileSync, writeFileSync, existsSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// The repository root is the parent of the directory that holds this script.
const root = join(__dirname, '..');
const REPO = 'https://github.com/mohitagw15856/pm-claude-skills';
const real = join(root, 'evals', 'results.json');
const example = join(root, 'evals', 'results.example.json');
// Prefer real eval output; fall back to the example file when it is absent.
const src = existsSync(real) ? real : example;

/**
 * Reads and parses an eval results file.
 *
 * @param {string} file - Absolute path of the JSON file to load.
 * @returns {object} The parsed document; guaranteed to carry a `results` array.
 * @throws {Error} If the file cannot be read or parsed (original error kept as
 *   `cause`), or if it has no top-level `results` array.
 */
const loadResults = (file) => {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(file, 'utf8'));
  } catch (err) {
    // Name the offending file: a bare ENOENT/SyntaxError gives no hint which input was bad.
    throw new Error(`Could not load eval results from ${file}: ${err.message}`, { cause: err });
  }
  // Everything downstream iterates `results`; fail here with a clear message
  // instead of a later "cannot read properties of undefined".
  if (!Array.isArray(parsed?.results)) {
    throw new Error(`${file} must contain a top-level "results" array.`);
  }
  return parsed;
};

const data = loadResults(src);
// Example mode if the data flags itself as such or if we fell back to the example file.
const isExample = !!data.example || src === example;
// Entity replacements for the characters that are significant in HTML text
// and in quoted attribute values (both quote styles).
const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };

/**
 * Escapes a value for safe interpolation into HTML.
 *
 * @param {*} s - Any value; it is coerced with `String()` before escaping.
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
const esc = (s) => String(s).replace(/[&<>"']/g, (c) => HTML_ENTITIES[c]);
// Distinct skill names found in the results, in default sort order (one table row each).
const skills = Array.from(new Set(data.results.map(({ skill }) => skill))).sort();
// Table columns: `data.models` when present, otherwise the distinct models in first-seen order.
const models = data.models || Array.from(new Set(data.results.map(({ model }) => model)));
/**
 * Finds the first result recorded for a skill/model pair.
 *
 * @param {*} skill - Skill name, matched with `===` against `result.skill`.
 * @param {*} model - Model name, matched with `===` against `result.model`.
 * @returns {object|undefined} The matching result, or `undefined` when there is none.
 */
const cell = (skill, model) => data.results.find(
  (entry) => entry.skill === skill && entry.model === model,
);
/**
 * Maps a score to the hex colour used for its table cell.
 *
 * Bands are checked from the highest threshold down; anything below 3
 * (including values that fail every comparison, such as NaN) gets the last colour.
 *
 * @param {number} v - Score to classify.
 * @returns {string} A `#rrggbb` colour string.
 */
const colour = (v) => {
  if (v >= 4.5) return '#6ee7b7';
  if (v >= 4) return '#93c5fd';
  if (v >= 3) return '#fcd34d';
  return '#fca5a5';
};
/**
 * Mean `overall` score of one model across all result rows.
 *
 * Rows whose `overall` is not a finite number are skipped, so a single
 * missing score cannot turn the average into NaN (an `undefined` addend) or
 * be silently counted as 0 (a `null` addend).
 *
 * @param {*} m - Model identifier, matched with `===` against `result.model`.
 * @returns {number} The arithmetic mean, or 0 when the model has no scored rows.
 */
const modelAvg = (m) => {
  let total = 0;
  let count = 0;
  for (const r of data.results) {
    if (r.model !== m || !Number.isFinite(r.overall)) continue;
    total += r.overall;
    count += 1;
  }
  return count > 0 ? total / count : 0;
};
// Placeholder cell for a skill/model pair that has no usable score.
const NA_CELL = '<td class="na">—</td>';

// Header row: a "Skill" label followed by one column per model.
const headRow = `<tr><th>Skill</th>${models.map((m) => `<th>${esc(m)}</th>`).join('')}</tr>`;

// One row per skill with a coloured, two-decimal score per model.
// A result that is missing, or whose `overall` is not a finite number, renders
// as the placeholder instead of throwing on `.toFixed()`.
const rows = skills.map((s) => {
  const cells = models.map((m) => {
    const c = cell(s, m);
    if (!c || !Number.isFinite(c.overall)) return NA_CELL;
    return `<td><span class="score" style="color:${colour(c.overall)}">${c.overall.toFixed(2)}</span></td>`;
  });
  return `<tr><td class="skill">${esc(s)}</td>${cells.join('')}</tr>`;
}).join('\n');

// Footer row: per-model average; a non-finite average falls back to the placeholder.
const avgRow = `<tr class="avg"><td>Average</td>${models.map((m) => {
  const avg = modelAvg(m);
  return Number.isFinite(avg) ? `<td><strong>${avg.toFixed(2)}</strong></td>` : NA_CELL;
}).join('')}</tr>`;
// Complete leaderboard page as one template literal. Everything between the
// backticks is emitted verbatim, so the markup stays unindented and carries no
// JS comments. Interpolated data values pass through esc(); headRow/rows/avgRow
// are pre-built HTML fragments and are inserted as-is.
// The header states the score range as 1–5, matching "max 5" in the footer.
const html = `<!DOCTYPE html>
<html lang="en"><head>
<meta charset="UTF-8" /><meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Skill Leaderboard — how pm-claude-skills score across Claude models</title>
<meta name="description" content="LLM-judged quality scores for professional Agent Skills across Claude models, on structure, completeness, usefulness, and grounding." />
<style>
:root{--bg:#0f1115;--panel:#161a21;--border:#2a313c;--text:#e7ebf0;--muted:#95a0b0;--accent2:#e89b82}
body{margin:0;background:var(--bg);color:var(--text);font:15px/1.5 -apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,sans-serif}
a{color:var(--accent2)} header{padding:28px 22px;border-bottom:1px solid var(--border);background:var(--panel)}
header h1{margin:0 0 6px;font-size:23px} header p{margin:0;color:var(--muted);font-size:14px}
.nav{margin-top:12px;display:flex;gap:14px;font-size:13px;flex-wrap:wrap}
main{max-width:900px;margin:0 auto;padding:22px}
.banner{background:rgba(245,158,11,.12);border:1px solid rgba(245,158,11,.4);color:#fcd34d;padding:12px 14px;border-radius:10px;margin-bottom:18px;font-size:13.5px}
table{width:100%;border-collapse:collapse;font-size:14px}
th,td{padding:10px 12px;text-align:center;border-bottom:1px solid var(--border)}
th:first-child,td:first-child{text-align:left}
th{color:var(--accent2);font-size:12px;text-transform:uppercase;letter-spacing:.04em}
td.skill{font-weight:600} .score{font-weight:700} .na{color:var(--muted)}
tr.avg td{border-top:2px solid var(--border);color:var(--muted)}
.meta{color:var(--muted);font-size:12.5px;margin-top:16px}
</style></head><body>
<header>
<h1>🏆 Skill Leaderboard</h1>
<p>LLM-judged quality (1–5) for each skill across Claude models — scored on structure, completeness, usefulness &amp; grounding by <code>${esc(data.judge || 'an LLM judge')}</code>.</p>
<div class="nav"><a href="https://mohitagw15856.github.io/pm-claude-skills/">Playground</a><a href="catalog.html">Catalog</a><a href="${REPO}/tree/main/evals">How it works</a></div>
</header>
<main>
${isExample ? '<div class="banner">⚠️ <strong>Example data</strong> — illustrative scores so this page renders. Run <code>ANTHROPIC_API_KEY=… node evals/run-evals.mjs</code> then <code>node scripts/build-leaderboard.mjs</code> for real numbers.</div>' : ''}
<table>
<thead>${headRow}</thead>
<tbody>
${rows}
${avgRow}
</tbody>
</table>
<p class="meta">Higher is better (max 5). ${esc(skills.length)} skills × ${esc(models.length)} models${data.generatedAt ? ` · generated ${esc(String(data.generatedAt).slice(0, 10))}` : ''}. Methodology and cases in <a href="${REPO}/tree/main/evals">evals/</a>.</p>
</main></body></html>
`;
// Write the rendered page to web/leaderboard.html under the repository root,
// then print a one-line summary (flagging example data when it was used).
const outFile = join(root, 'web', 'leaderboard.html');
writeFileSync(outFile, html);
const exampleNote = isExample ? ' (EXAMPLE data)' : '';
console.log(`Wrote web/leaderboard.html — ${skills.length} skills × ${models.length} models${exampleNote}.`);