# ai-workflow-course/modules/27-evals/lab/run_eval.py

"""Run the eval set against one candidate and print a scorecard.

Usage:
    python run_eval.py candidates/current_model
    python run_eval.py candidates/swapped_model
    python run_eval.py candidates/current_model --threshold 0.9

A "candidate" is a directory containing a tasks.py that an agent produced. The
runner imports that tasks.py, runs every case in eval_set.py against it, prints
a pass/fail line per case, and reports an aggregate score.

The exit code is the guardrail: 0 if the score meets the threshold, 1 if it
doesn't. That single integer is what lets an eval gate a model swap, a prompt
change, or an unattended agent in CI (Module 14) instead of a human eyeballing
output. "Below threshold" should block exactly like a failing test does.
"""

import argparse
import importlib.util
import sys
from pathlib import Path

from eval_set import CASES


def load_candidate(candidate_dir: Path):
    """Load ``tasks.py`` from ``candidate_dir`` and return it as a module.

    The file is executed under the module name ``candidate_tasks`` via an
    explicit file-location spec, so it is loaded straight from its path
    rather than through a normal ``import`` statement.

    If ``candidate_dir`` contains no ``tasks.py`` the process exits through
    ``sys.exit`` with a message naming the directory.
    """
    source_file = candidate_dir / "tasks.py"
    if source_file.exists():
        module_spec = importlib.util.spec_from_file_location(
            "candidate_tasks", source_file
        )
        loaded = importlib.util.module_from_spec(module_spec)
        # Run the candidate's top-level code so its classes become attributes.
        module_spec.loader.exec_module(loaded)
        return loaded
    sys.exit(f"no tasks.py in {candidate_dir}")


def run_case(candidate, items, expected):
    """Run one eval case against the candidate and grade the result.

    The input state is built with the candidate's own ``TaskList`` and
    ``Task`` classes: one ``Task(title=..., done=...)`` is appended to
    ``tlist.tasks`` per entry in ``items``, then ``pending_count()`` is
    compared with ``expected``.

    Args:
        candidate: module-like object exposing ``TaskList`` and ``Task``.
        items: iterable of ``(title, done)`` pairs describing the tasks.
        expected: the value ``pending_count()`` should return.

    Returns:
        ``(ok, detail)`` where ``ok`` is a bool and ``detail`` is a short
        human-readable explanation for the scorecard line.
    """
    # Unpack the case data outside the try: a malformed eval case is a
    # harness bug and should surface loudly, not be blamed on the candidate.
    pairs = [(title, done) for title, done in items]
    try:
        # Everything that touches candidate code lives inside the try. A
        # candidate with a missing class, an incompatible constructor, or no
        # ``tasks`` attribute is a failed case, not a crashed harness.
        tlist = candidate.TaskList()
        for title, done in pairs:
            tlist.tasks.append(candidate.Task(title=title, done=done))
        actual = tlist.pending_count()
        # Coerce so the caller always receives a real bool to count with.
        ok = bool(actual == expected)
    except Exception as exc:  # a crash is a failed case, not a crashed harness
        return False, f"raised {type(exc).__name__}: {exc}"
    return ok, f"expected {expected}, got {actual}"


def main(argv):
    """Score one candidate against the eval set and return an exit code.

    Args:
        argv: command-line arguments without the program name: a candidate
            directory and an optional ``--threshold`` fraction.

    Returns:
        0 when the passing fraction is at or above the threshold, 1 when it
        is below the threshold or when the eval set is empty.

    An invalid ``--threshold`` is rejected through ``parser.error``, which
    terminates the process with a usage message.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("candidate", help="path to a candidate dir with tasks.py")
    parser.add_argument("--threshold", type=float, default=1.0,
                        help="minimum passing fraction to exit 0 (default 1.0)")
    args = parser.parse_args(argv)

    # The threshold is a fraction. This chained comparison is also False for
    # NaN, which would otherwise make ``score < threshold`` always False and
    # silently turn the gate into a no-op.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error(f"--threshold must be between 0 and 1, got {args.threshold}")

    candidate_dir = Path(args.candidate)
    candidate = load_candidate(candidate_dir)

    total = len(CASES)
    print(f"\neval set: {total} cases   candidate: {candidate_dir}\n")

    # With no cases there is nothing to divide by and nothing was verified,
    # so fail closed instead of crashing with ZeroDivisionError.
    if total == 0:
        print("RESULT: eval set is empty; nothing was verified, "
              "so this change is NOT safe to ship.\n")
        return 1

    passed = 0
    for name, items, expected in CASES:
        ok, detail = run_case(candidate, items, expected)
        mark = "PASS" if ok else "FAIL"
        print(f"  [{mark}] {name:<40} ({detail})")
        if ok:
            passed += 1

    score = passed / total
    print(f"\nscore: {passed}/{total} = {score:.0%}   threshold: {args.threshold:.0%}")

    if score < args.threshold:
        print("RESULT: below threshold; this change is NOT safe to ship.\n")
        return 1
    print("RESULT: at or above threshold; safe by this eval.\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))