Merge pull request #59 from mohitagw15856/eval-results
chore(evals): refresh leaderboard results
This commit is contained in:
+46
-46
@@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"generatedAt": "2026-06-18T12:40:14.995Z",
|
"generatedAt": "2026-06-18T20:13:31.726Z",
|
||||||
"judge": "claude-opus-4-8",
|
"judge": "claude-opus-4-8",
|
||||||
"models": [
|
"models": [
|
||||||
"claude-sonnet-4-6",
|
"claude-sonnet-4-6",
|
||||||
@@ -41,20 +41,20 @@
|
|||||||
"structure": 5,
|
"structure": 5,
|
||||||
"completeness": 5,
|
"completeness": 5,
|
||||||
"usefulness": 5,
|
"usefulness": 5,
|
||||||
"grounding": 4
|
"grounding": 5
|
||||||
},
|
},
|
||||||
"overall": 4.75
|
"overall": 5
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"skill": "prd-template",
|
"skill": "prd-template",
|
||||||
"model": "claude-haiku-4-5-20251001",
|
"model": "claude-haiku-4-5-20251001",
|
||||||
"scores": {
|
"scores": {
|
||||||
"structure": 5,
|
"structure": 5,
|
||||||
"completeness": 4,
|
"completeness": 5,
|
||||||
"usefulness": 5,
|
"usefulness": 5,
|
||||||
"grounding": 3
|
"grounding": 4
|
||||||
},
|
},
|
||||||
"overall": 4.25
|
"overall": 4.75
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"skill": "cs-health-scorecard",
|
"skill": "cs-health-scorecard",
|
||||||
@@ -63,9 +63,9 @@
|
|||||||
"structure": 5,
|
"structure": 5,
|
||||||
"completeness": 5,
|
"completeness": 5,
|
||||||
"usefulness": 5,
|
"usefulness": 5,
|
||||||
"grounding": 5
|
"grounding": 4
|
||||||
},
|
},
|
||||||
"overall": 5
|
"overall": 4.75
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"skill": "cs-health-scorecard",
|
"skill": "cs-health-scorecard",
|
||||||
@@ -84,44 +84,44 @@
|
|||||||
"scores": {
|
"scores": {
|
||||||
"structure": 5,
|
"structure": 5,
|
||||||
"completeness": 5,
|
"completeness": 5,
|
||||||
"usefulness": 5,
|
"usefulness": 4,
|
||||||
"grounding": 5
|
|
||||||
},
|
|
||||||
"overall": 5
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"skill": "executive-summary",
|
|
||||||
"model": "claude-haiku-4-5-20251001",
|
|
||||||
"scores": {
|
|
||||||
"structure": 5,
|
|
||||||
"completeness": 5,
|
|
||||||
"usefulness": 5,
|
|
||||||
"grounding": 4
|
|
||||||
},
|
|
||||||
"overall": 4.75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"skill": "competitive-analysis",
|
|
||||||
"model": "claude-sonnet-4-6",
|
|
||||||
"scores": {
|
|
||||||
"structure": 5,
|
|
||||||
"completeness": 4,
|
|
||||||
"usefulness": 5,
|
|
||||||
"grounding": 5
|
|
||||||
},
|
|
||||||
"overall": 4.75
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"skill": "competitive-analysis",
|
|
||||||
"model": "claude-haiku-4-5-20251001",
|
|
||||||
"scores": {
|
|
||||||
"structure": 5,
|
|
||||||
"completeness": 4,
|
|
||||||
"usefulness": 5,
|
|
||||||
"grounding": 4
|
"grounding": 4
|
||||||
},
|
},
|
||||||
"overall": 4.5
|
"overall": 4.5
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"skill": "executive-summary",
|
||||||
|
"model": "claude-haiku-4-5-20251001",
|
||||||
|
"scores": {
|
||||||
|
"structure": 5,
|
||||||
|
"completeness": 4,
|
||||||
|
"usefulness": 4,
|
||||||
|
"grounding": 4
|
||||||
|
},
|
||||||
|
"overall": 4.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"skill": "competitive-analysis",
|
||||||
|
"model": "claude-sonnet-4-6",
|
||||||
|
"scores": {
|
||||||
|
"structure": 5,
|
||||||
|
"completeness": 4,
|
||||||
|
"usefulness": 5,
|
||||||
|
"grounding": 5
|
||||||
|
},
|
||||||
|
"overall": 4.75
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"skill": "competitive-analysis",
|
||||||
|
"model": "claude-haiku-4-5-20251001",
|
||||||
|
"scores": {
|
||||||
|
"structure": 5,
|
||||||
|
"completeness": 5,
|
||||||
|
"usefulness": 5,
|
||||||
|
"grounding": 4
|
||||||
|
},
|
||||||
|
"overall": 4.75
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"skill": "sprint-planning",
|
"skill": "sprint-planning",
|
||||||
"model": "claude-sonnet-4-6",
|
"model": "claude-sonnet-4-6",
|
||||||
@@ -138,11 +138,11 @@
|
|||||||
"model": "claude-haiku-4-5-20251001",
|
"model": "claude-haiku-4-5-20251001",
|
||||||
"scores": {
|
"scores": {
|
||||||
"structure": 5,
|
"structure": 5,
|
||||||
"completeness": 4,
|
"completeness": 5,
|
||||||
"usefulness": 4,
|
"usefulness": 5,
|
||||||
"grounding": 3
|
"grounding": 4
|
||||||
},
|
},
|
||||||
"overall": 4
|
"overall": 4.75
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user