389ac2e460
Apply the no-ai-slop standard (now binding in AGENTS.md): the em-dash character is banned outright (restructured, not blind-replaced), plus the banned word/phrase list (delve, leverage, robust, seamless, truly, unlock, etc.). 0 em-dashes remain in modules + capstone; the only "robust" left is the planted M10 ai-change.patch trap. Module H1 titles use a colon separator. All deliberate teaching devices preserved; labs compile/parse (py/sh/yaml/json); no junk. AGENTS.md updated with the hard no-slop rules. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01TfzV5QvtPDz8LJS3Pu5VLT
79 lines
2.8 KiB
Python
79 lines
2.8 KiB
Python
"""Run the eval set against one candidate and print a scorecard.
|
|
|
|
Usage:
|
|
python run_eval.py candidates/current_model
|
|
python run_eval.py candidates/swapped_model
|
|
python run_eval.py candidates/current_model --threshold 0.9
|
|
|
|
A "candidate" is a directory containing a tasks.py that an agent produced. The
|
|
runner imports that tasks.py, runs every case in eval_set.py against it, prints
|
|
a pass/fail line per case, and reports an aggregate score.
|
|
|
|
The exit code is the guardrail: 0 if the score meets the threshold, 1 if it
|
|
doesn't. That single integer is what lets an eval gate a model swap, a prompt
|
|
change, or an unattended agent in CI (Module 14) instead of a human eyeballing
|
|
output. "Below threshold" should block exactly like a failing test does.
|
|
"""
|
|
|
|
import argparse
|
|
import importlib.util
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from eval_set import CASES
|
|
|
|
|
|
def load_candidate(candidate_dir: Path):
    """Import the tasks.py living in candidate_dir as an isolated module.

    The file is loaded directly from its path under the fixed module name
    "candidate_tasks"; this function does not register it in sys.modules.

    Args:
        candidate_dir: directory expected to contain a tasks.py file.

    Returns:
        The executed module object.

    Raises:
        SystemExit: with a message when candidate_dir holds no regular file
            named tasks.py, or when no import spec/loader can be built for it.
        Exception: anything raised while executing the candidate's top-level
            code propagates unchanged.
    """
    tasks_py = candidate_dir / "tasks.py"
    # is_file() instead of exists(): a directory that happens to be called
    # tasks.py gets the same clear message rather than a loader traceback.
    if not tasks_py.is_file():
        sys.exit(f"no tasks.py in {candidate_dir}")
    spec = importlib.util.spec_from_file_location("candidate_tasks", tasks_py)
    # Defensive: the spec (or its loader) can be None when importlib finds no
    # loader for the path, so check before dereferencing.
    if spec is None or spec.loader is None:
        sys.exit(f"cannot build an import spec for {tasks_py}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
|
|
|
|
|
def run_case(candidate, items, expected):
    """Build the input state with the candidate's own classes, then grade.

    Args:
        candidate: the imported candidate module; it is expected to provide
            TaskList and Task.
        items: iterable of (title, done) pairs, one per task to add.
        expected: the value pending_count() should return for that state.

    Returns:
        A pair (ok, detail). ok is the result of comparing the candidate's
        answer with expected, or False if the candidate raised. detail is a
        short human-readable explanation for the scorecard line.
    """
    try:
        # Setup executes candidate code as well (the TaskList/Task constructors
        # and the tasks attribute), so it belongs inside the try: a candidate
        # that renamed a class or changed a signature fails this case instead
        # of aborting the whole scorecard.
        tlist = candidate.TaskList()
        for title, done in items:
            tlist.tasks.append(candidate.Task(title=title, done=done))
        actual = tlist.pending_count()
    except Exception as exc:  # a crash is a failed case, not a crashed harness
        return False, f"raised {type(exc).__name__}: {exc}"
    return actual == expected, f"expected {expected}, got {actual}"
|
|
|
|
|
|
def main(argv):
    """Score one candidate against CASES and return the process exit status.

    Args:
        argv: command-line arguments without the program name: the candidate
            directory, plus an optional --threshold fraction in [0, 1].

    Returns:
        0 when the passing fraction is at or above the threshold, 1 when it
        is below the threshold or when the eval set contains no cases.

    Raises:
        SystemExit: from argparse on bad arguments (including a threshold
            outside [0, 1]) and from load_candidate when tasks.py is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("candidate", help="path to a candidate dir with tasks.py")
    parser.add_argument("--threshold", type=float, default=1.0,
                        help="minimum passing fraction to exit 0 (default 1.0)")
    args = parser.parse_args(argv)
    # float() accepts "nan", and every comparison with NaN is False, so
    # "score < threshold" would never trip and the gate would always pass.
    # The chained comparison below rejects NaN along with out-of-range values.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error("--threshold must be a fraction between 0 and 1")

    candidate_dir = Path(args.candidate)
    candidate = load_candidate(candidate_dir)

    total = len(CASES)
    passed = 0
    print(f"\neval set: {total} cases candidate: {candidate_dir}\n")
    for name, items, expected in CASES:
        ok, detail = run_case(candidate, items, expected)
        mark = "PASS" if ok else "FAIL"
        print(f" [{mark}] {name:<40} ({detail})")
        if ok:
            passed += 1

    # Fail closed: with no cases there is nothing to divide by and nothing was
    # verified, so the gate must not report success.
    if total == 0:
        print("RESULT: eval set is empty; nothing was checked, so this change is NOT safe to ship.\n")
        return 1

    score = passed / total
    print(f"\nscore: {passed}/{total} = {score:.0%} threshold: {args.threshold:.0%}")

    if score < args.threshold:
        print("RESULT: below threshold; this change is NOT safe to ship.\n")
        return 1
    print("RESULT: at or above threshold; safe by this eval.\n")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # main() returns the guardrail status (0 or 1); hand it to the shell as
    # the process exit code so CI can gate on it.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|