feat(course): build out all 27 modules, capstone, scaffold, and conventions
Scaffold the course repo and author the full curriculum in dependency-chain
order, following the settled build decisions in handoff.md.

- Scaffold: course README, vendor-neutral AGENTS.md (dogfoods Module 5),
  _TEMPLATE.md (the fixed 9-section module shape), root .gitignore, ship config.
- Modules 1-2: reference exemplars (locked for tone/depth/lab style).
- Modules 3-27: full lessons + runnable labs, each following the template,
  respecting the chain, vendor/model-agnostic, with "feel the pain" labs.
- Module 8 hosting comparison web-researched and date-stamped (as of
  2026-06-22), not written from memory; expansion-zone modules carry
  Verify-before-publish.
- Capstone: the full loop end to end on the running tasks-app example.

Lab code syntax-checked (Python/shell/YAML); every module has the 7 core
template sections.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01TfzV5QvtPDz8LJS3Pu5VLT
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
"""Candidate output: the CURRENT model/prompt.
|
||||
|
||||
This is what your agent produced for the task "implement pending_count() so it
|
||||
returns the number of tasks that are not done." It's correct. Replace this whole
|
||||
directory with your own agent's real output when you run the lab for real.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class Task:
    """One entry in a task list: a title plus a completion flag."""

    # Text shown for the task when the list is rendered.
    title: str
    # False until the task is completed; new tasks start out open.
    done: bool = False
|
||||
|
||||
|
||||
@dataclass
class TaskList:
    """An ordered list of tasks with helpers to add, complete, count and render."""

    # default_factory hands every TaskList its own fresh list.
    tasks: list[Task] = field(default_factory=list)

    def add(self, title: str) -> Task:
        """Append a new, not-yet-done task named *title* and return it."""
        created = Task(title=title)
        self.tasks.append(created)
        return created

    def complete(self, index: int) -> None:
        """Mark the task at position *index* of ``tasks`` as done."""
        target = self.tasks[index]
        target.done = True

    def pending(self) -> list[Task]:
        """Return the tasks that are not done, in list order."""
        return [item for item in self.tasks if not item.done]

    def pending_count(self) -> int:
        """Return how many tasks are not done."""
        # Correct: only open tasks are counted, finished ones are left out.
        still_open = self.pending()
        return len(still_open)

    def render(self) -> str:
        """Return one numbered, check-boxed line per task, or a placeholder if empty."""
        if not self.tasks:
            return "(no tasks yet)"

        def row(i, task):
            # "[x]" marks a finished task, "[ ]" an open one.
            box = "[ ]" if not task.done else "[x]"
            return f"{i}. {box} {task.title}"

        return "\n".join(row(i, task) for i, task in enumerate(self.tasks))
|
||||
@@ -0,0 +1,50 @@
|
||||
"""Candidate output: a SWAPPED model/prompt.
|
||||
|
||||
Same task, different model (or a tweaked prompt). This output "looks right" and
|
||||
passes a casual manual check — adding three tasks and calling count returns 3.
|
||||
But pending_count() returns the total number of tasks, not the number of
|
||||
*pending* ones, so it's wrong the moment anything is marked done.
|
||||
|
||||
Nobody would notice this by skimming. The eval set notices it instantly. That's
|
||||
the regression eval catching an unsafe swap — exactly the scenario this module
|
||||
exists for. Replace this with your own swapped-model output when you run it for
|
||||
real; you may get lucky and have it pass, or you may catch a regression like
|
||||
this one.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class Task:
    """One entry in a task list: a title plus a completion flag."""

    # Text shown for the task when the list is rendered.
    title: str
    # False until the task is completed; new tasks start out open.
    done: bool = False
|
||||
|
||||
|
||||
@dataclass
class TaskList:
    """An ordered list of tasks with helpers to add, complete, count and render.

    This copy is the regression fixture: ``pending_count`` is deliberately wrong.
    """

    # default_factory hands every TaskList its own fresh list.
    tasks: list[Task] = field(default_factory=list)

    def add(self, title: str) -> Task:
        """Append a new, not-yet-done task named *title* and return it."""
        created = Task(title=title)
        self.tasks.append(created)
        return created

    def complete(self, index: int) -> None:
        """Mark the task at position *index* of ``tasks`` as done."""
        target = self.tasks[index]
        target.done = True

    def pending(self) -> list[Task]:
        """Return the tasks that are not done, in list order."""
        return [item for item in self.tasks if not item.done]

    def pending_count(self) -> int:
        """Return the total number of tasks (NOT the number of pending ones)."""
        # WRONG on purpose, and plausibly so: every task is counted, done or
        # not. Do not "fix" this — the eval exists to catch exactly this.
        everything = self.tasks
        return len(everything)

    def render(self) -> str:
        """Return one numbered, check-boxed line per task, or a placeholder if empty."""
        if not self.tasks:
            return "(no tasks yet)"

        def row(i, task):
            # "[x]" marks a finished task, "[ ]" an open one.
            box = "[ ]" if not task.done else "[x]"
            return f"{i}. {box} {task.title}"

        return "\n".join(row(i, task) for i, task in enumerate(self.tasks))
|
||||
@@ -0,0 +1,37 @@
|
||||
"""The eval set for the tasks-app `pending_count` agent task.
|
||||
|
||||
An *eval set* is a list of CASES. Each case is three things:
|
||||
|
||||
- a name (so the scorecard is readable),
|
||||
- an input (here: the state a TaskList is in), and
|
||||
- the expected result (here: how many tasks should count as pending).
|
||||
|
||||
The grading lives in run_eval.py; this file is just data. Keeping the cases
|
||||
separate from any model, prompt, or runner is the whole point — the same eval
|
||||
set judges *any* candidate you point it at, which is what makes it useful when
|
||||
you swap the model out from under it.
|
||||
|
||||
The task we're evaluating: an agent was asked to implement
|
||||
`TaskList.pending_count()` so it returns the number of tasks that are NOT done.
|
||||
That sounds trivial. The discriminating cases below are the ones a
|
||||
"looks-right" implementation quietly fails.
|
||||
"""
|
||||
|
||||
# Every case is a 3-tuple:
#   (name, [(title, done), ...], expected_pending_count)
CASES = [
    # Baselines: nothing is done yet, so total == pending. A candidate that
    # returns len(tasks) passes all three of these.
    (
        "empty list has zero pending",
        [],
        0,
    ),
    (
        "one open task counts as one",
        [("write tests", False)],
        1,
    ),
    (
        "three open tasks count as three",
        [("a", False), ("b", False), ("c", False)],
        3,
    ),
    # The discriminating case. A candidate that returns len(tasks) passes
    # everything above and fails right here. This is the eval earning its keep.
    (
        "completed tasks are NOT pending",
        [("done thing", True), ("open thing", False), ("also done", True)],
        1,
    ),
    (
        "all done means zero pending",
        [("x", True), ("y", True)],
        0,
    ),
]
|
||||
@@ -0,0 +1,79 @@
|
||||
"""LLM-as-judge: the pattern, and its limits, in one file.
|
||||
|
||||
Some agent output can't be graded by `==`. "Is this commit message clear?" or
|
||||
"Does this PR description actually explain the change?" has no exact answer. The
|
||||
common move is to ask *another* model to grade it. This file shows the shape of
|
||||
that grader and is deliberately honest about what it can't do.
|
||||
|
||||
It is vendor-agnostic by design. Point it at whatever model endpoint you already
|
||||
use by setting three environment variables; if any is unset, it abstains rather
|
||||
than pretending. NOTHING here pins a provider.
|
||||
|
||||
EVAL_JUDGE_URL # an OpenAI-style /chat/completions-compatible endpoint, or your own
|
||||
EVAL_JUDGE_KEY # the bearer token for it
|
||||
EVAL_JUDGE_MODEL # the model name to ask for
|
||||
|
||||
Run it standalone to grade one sample:
|
||||
python llm_judge.py "Add count command"
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
|
||||
# System prompt sent to the judge model; it demands a JSON-only verdict.
RUBRIC = """You are grading one piece of agent output against a rubric.
Score 1 if the commit message clearly and specifically describes the change.
Score 0 if it is vague, generic, or could describe almost any change.
Reply with ONLY a JSON object: {"score": 0 or 1, "reason": "<one short sentence>"}
"""


def judge(candidate_text: str, *, timeout: float = 30.0) -> dict:
    """Ask the configured judge model to grade *candidate_text* against RUBRIC.

    The endpoint, bearer token and model name come from EVAL_JUDGE_URL,
    EVAL_JUDGE_KEY and EVAL_JUDGE_MODEL. The request body is an OpenAI-style
    chat-completions payload.

    Args:
        candidate_text: The agent output to grade.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The judge's verdict, ``{"score": 0 or 1, "reason": str}``. When the
        judge is not configured, cannot be reached, or replies with something
        that is not such an object, the result is an abstention:
        ``{"score": None, "reason": <why>}``. An abstention is never a grade.
    """
    url = os.environ.get("EVAL_JUDGE_URL")
    key = os.environ.get("EVAL_JUDGE_KEY")
    model = os.environ.get("EVAL_JUDGE_MODEL")
    if not (url and key and model):
        return {"score": None, "reason": "judge not configured — abstaining (set EVAL_JUDGE_* to enable)"}

    payload = json.dumps({
        "model": model,
        "temperature": 0,  # determinism matters for a grader; you want repeatable scores
        "messages": [
            {"role": "system", "content": RUBRIC},
            {"role": "user", "content": f"Output to grade:\n{candidate_text}"},
        ],
    }).encode()

    req = urllib.request.Request(
        url,
        data=payload,
        headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
    )
    try:
        # Without a timeout a stalled endpoint would hang the grader forever.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read())
        content = body["choices"][0]["message"]["content"]
        verdict = json.loads(content)
    except (OSError, ValueError, KeyError, IndexError, TypeError) as exc:
        # OSError covers URLError/HTTPError/timeouts; ValueError covers bad
        # JSON; the rest cover a reply that lacks the expected structure.
        return {"score": None, "reason": f"judge call failed — abstaining ({type(exc).__name__}: {exc})"}

    # The rubric asks for exactly 0 or 1; anything else is not a usable grade.
    if not isinstance(verdict, dict) or verdict.get("score") not in (0, 1):
        return {"score": None, "reason": "judge reply was not a {score, reason} object — abstaining"}
    return verdict
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Grade the first command-line argument; with none given, fall back to a
    # deliberately vague sample message.
    cli_args = sys.argv[1:]
    sample = cli_args[0] if cli_args else "fix stuff"
    print(judge(sample))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# READ THIS BEFORE TRUSTING A SCORE FROM HERE.
|
||||
#
|
||||
# An LLM judge is a model grading a model. Its failure modes are real:
|
||||
# - Correlated blind spots: the judge can share the candidate's confusion, so
|
||||
# a wrong answer gets a passing grade because both models are wrong the same way.
|
||||
# - Bias: judges favor longer, more confident, or first-presented answers
|
||||
# regardless of correctness. Hold position and length constant when you can.
|
||||
# - Drift: change the judge model and your scores move even though nothing
|
||||
# about the candidate changed. The ruler is itself made of rubber.
|
||||
#
|
||||
# So: use a programmatic grader (run_eval.py) wherever a deterministic check is
|
||||
# possible — that is most of the time. Reach for an LLM judge only for genuinely
|
||||
# open-ended output, and CALIBRATE it first: hand-label ~20 examples yourself,
|
||||
# run the judge on them, and confirm it agrees with you before you let it gate
|
||||
# anything. An uncalibrated judge is a vibe with a number attached.
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -0,0 +1,78 @@
|
||||
"""Run the eval set against one candidate and print a scorecard.
|
||||
|
||||
Usage:
|
||||
python run_eval.py candidates/current_model
|
||||
python run_eval.py candidates/swapped_model
|
||||
python run_eval.py candidates/current_model --threshold 0.9
|
||||
|
||||
A "candidate" is a directory containing a tasks.py that an agent produced. The
|
||||
runner imports that tasks.py, runs every case in eval_set.py against it, prints
|
||||
a pass/fail line per case, and reports an aggregate score.
|
||||
|
||||
The exit code is the guardrail: 0 if the score meets the threshold, 1 if it
|
||||
doesn't. That single integer is what lets an eval gate a model swap, a prompt
|
||||
change, or an unattended agent in CI (Module 14) instead of a human eyeballing
|
||||
output. "Below threshold" should block exactly like a failing test does.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import importlib.util
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from eval_set import CASES
|
||||
|
||||
|
||||
def load_candidate(candidate_dir: Path):
    """Import the tasks.py living in candidate_dir as an isolated module.

    The file is loaded under the module name "candidate_tasks" straight from
    its path, so it does not need to be on sys.path.

    Args:
        candidate_dir: Directory expected to contain a ``tasks.py`` file.

    Returns:
        The executed module object.

    Raises:
        SystemExit: With a readable message if ``tasks.py`` is missing, cannot
            be loaded, or raises while being imported (e.g. a syntax error in
            the candidate's code).
    """
    tasks_py = Path(candidate_dir) / "tasks.py"
    # is_file() rather than exists(): a directory named tasks.py is not a candidate.
    if not tasks_py.is_file():
        sys.exit(f"no tasks.py in {candidate_dir}")
    spec = importlib.util.spec_from_file_location("candidate_tasks", tasks_py)
    if spec is None or spec.loader is None:
        sys.exit(f"cannot build an import spec for {tasks_py}")
    module = importlib.util.module_from_spec(spec)
    try:
        spec.loader.exec_module(module)
    except Exception as exc:  # a candidate that cannot import must not crash the harness
        sys.exit(f"candidate {tasks_py} failed to import: {type(exc).__name__}: {exc}")
    return module
|
||||
|
||||
|
||||
def run_case(candidate, items, expected):
    """Build the input state with the candidate's own classes, then grade.

    Args:
        candidate: The imported candidate module; it must provide ``TaskList``
            and ``Task``.
        items: ``(title, done)`` pairs describing the tasks to put in the list.
        expected: The value ``pending_count()`` should return for that state.

    Returns:
        ``(ok, detail)``: whether the case passed, plus a short human-readable
        explanation for the scorecard.
    """
    try:
        # Setup stays inside the try: a candidate that lacks a class or changed
        # a constructor fails this case instead of taking the harness down.
        tlist = candidate.TaskList()
        for title, done in items:
            tlist.tasks.append(candidate.Task(title=title, done=done))
        actual = tlist.pending_count()
    except Exception as exc:  # a crash is a failed case, not a crashed harness
        return False, f"raised {type(exc).__name__}: {exc}"
    return actual == expected, f"expected {expected}, got {actual}"
|
||||
|
||||
|
||||
def main(argv):
    """Grade one candidate against every case and return the process exit code.

    Args:
        argv: Command-line arguments without the program name: the candidate
            directory and an optional ``--threshold`` fraction.

    Returns:
        0 if the passing fraction is at or above the threshold, 1 if it is
        below it or if the eval set is empty.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("candidate", help="path to a candidate dir with tasks.py")
    parser.add_argument("--threshold", type=float, default=1.0,
                        help="minimum passing fraction to exit 0 (default 1.0)")
    args = parser.parse_args(argv)
    # A threshold outside 0..1 would make the gate meaningless (a negative one
    # always passes), so reject it up front.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error("--threshold must be a fraction between 0 and 1")

    candidate_dir = Path(args.candidate)
    candidate = load_candidate(candidate_dir)

    passed = 0
    print(f"\neval set: {len(CASES)} cases candidate: {candidate_dir}\n")
    for name, items, expected in CASES:
        ok, detail = run_case(candidate, items, expected)
        mark = "PASS" if ok else "FAIL"
        print(f" [{mark}] {name:<40} ({detail})")
        passed += ok

    # No cases means nothing was checked; fail closed instead of dividing by zero.
    if not CASES:
        print("RESULT: eval set is empty — nothing was checked, NOT safe to ship.\n")
        return 1

    score = passed / len(CASES)
    print(f"\nscore: {passed}/{len(CASES)} = {score:.0%} threshold: {args.threshold:.0%}")

    if score < args.threshold:
        print("RESULT: below threshold — this change is NOT safe to ship.\n")
        return 1
    print("RESULT: at or above threshold — safe by this eval.\n")
    return 0


if __name__ == "__main__":
    # The return value of main() becomes the exit code that gates CI.
    raise SystemExit(main(sys.argv[1:]))
|
||||
Reference in New Issue
Block a user