feat(course): build out all 27 modules, capstone, scaffold, and conventions

Scaffold the course repo and author the full curriculum in dependency-chain
order, following the settled build decisions in handoff.md.

- Scaffold: course README, vendor-neutral AGENTS.md (dogfoods Module 5),
  _TEMPLATE.md (the fixed 9-section module shape), root .gitignore, ship config.
- Modules 1-2: reference exemplars (locked for tone/depth/lab style).
- Modules 3-27: full lessons + runnable labs, each following the template,
  respecting the chain, vendor/model-agnostic, with "feel the pain" labs.
- Module 8 hosting comparison web-researched and date-stamped (as of 2026-06-22),
  not written from memory; expansion-zone modules carry Verify-before-publish.
- Capstone: the full loop end to end on the running tasks-app example.

Lab code syntax-checked (Python/shell/YAML); every module has the 7 core
template sections.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01TfzV5QvtPDz8LJS3Pu5VLT
This commit is contained in:
2026-06-22 12:18:30 -04:00
parent 4bd586bbd0
commit fbec36cb67
117 changed files with 15131 additions and 1 deletions
@@ -0,0 +1,43 @@
"""Candidate output: the CURRENT model/prompt.
This is what your agent produced for the task "implement pending_count() so it
returns the number of tasks that are not done." It's correct. Replace this whole
directory with your own agent's real output when you run the lab for real.
"""
from dataclasses import dataclass, field
@dataclass
class Task:
    """A single to-do item: a title plus a completion flag."""

    title: str
    done: bool = False


@dataclass
class TaskList:
    """An ordered collection of tasks, addressed by zero-based position."""

    tasks: list[Task] = field(default_factory=list)

    def add(self, title: str) -> Task:
        """Append a new, not-yet-done task and hand it back to the caller."""
        self.tasks.append(Task(title=title))
        return self.tasks[-1]

    def complete(self, index: int) -> None:
        """Flag the task at ``index`` as done (IndexError if out of range)."""
        target = self.tasks[index]
        target.done = True

    def pending(self) -> list[Task]:
        """Return the tasks still open, in their original order."""
        return [item for item in self.tasks if not item.done]

    def pending_count(self) -> int:
        """Return how many tasks are not done yet."""
        # Correct: only open tasks are counted, via pending().
        open_items = self.pending()
        return len(open_items)

    def render(self) -> str:
        """Format the list as one numbered checkbox line per task."""
        if not self.tasks:
            return "(no tasks yet)"
        return "\n".join(
            f"{position}. {'[x]' if item.done else '[ ]'} {item.title}"
            for position, item in enumerate(self.tasks)
        )
@@ -0,0 +1,50 @@
"""Candidate output: a SWAPPED model/prompt.
Same task, different model (or a tweaked prompt). This output "looks right" and
passes a casual manual check — adding three tasks and calling count returns 3.
But pending_count() returns the total number of tasks, not the number of
*pending* ones, so it's wrong the moment anything is marked done.
Nobody would notice this by skimming. The eval set notices it instantly. That's
the regression eval catching an unsafe swap — exactly the scenario this module
exists for. Replace this with your own swapped-model output when you run it for
real; you may get lucky and have it pass, or you may catch a regression like
this one.
"""
from dataclasses import dataclass, field
@dataclass
class Task:
    """A single to-do item: a title plus a completion flag."""

    title: str
    done: bool = False


@dataclass
class TaskList:
    """An ordered collection of tasks, addressed by zero-based position.

    This is the deliberately regressed candidate: everything behaves like a
    correct implementation except pending_count().
    """

    tasks: list[Task] = field(default_factory=list)

    def add(self, title: str) -> Task:
        """Append a new, not-yet-done task and hand it back to the caller."""
        self.tasks.append(Task(title=title))
        return self.tasks[-1]

    def complete(self, index: int) -> None:
        """Flag the task at ``index`` as done (IndexError if out of range)."""
        target = self.tasks[index]
        target.done = True

    def pending(self) -> list[Task]:
        """Return the tasks still open, in their original order."""
        return [item for item in self.tasks if not item.done]

    def pending_count(self) -> int:
        """Return the TOTAL number of tasks (intentionally not the pending count)."""
        # WRONG on purpose, and plausibly so: done tasks are counted too. This
        # is the regression the eval set is meant to catch; do not "fix" it.
        total = len(self.tasks)
        return total

    def render(self) -> str:
        """Format the list as one numbered checkbox line per task."""
        if not self.tasks:
            return "(no tasks yet)"
        return "\n".join(
            f"{position}. {'[x]' if item.done else '[ ]'} {item.title}"
            for position, item in enumerate(self.tasks)
        )
+37
View File
@@ -0,0 +1,37 @@
"""The eval set for the tasks-app `pending_count` agent task.
An *eval set* is a list of CASES. Each case is three things:
- a name (so the scorecard is readable),
- an input (here: the state a TaskList is in), and
- the expected result (here: how many tasks should count as pending).
The grading lives in run_eval.py; this file is just data. Keeping the cases
separate from any model, prompt, or runner is the whole point — the same eval
set judges *any* candidate you point it at, which is what makes it useful when
you swap the model out from under it.
The task we're evaluating: an agent was asked to implement
`TaskList.pending_count()` so it returns the number of tasks that are NOT done.
That sounds trivial. The discriminating cases below are the ones a
"looks-right" implementation quietly fails.
"""
# Case layout: (readable name, starting state as (title, done) pairs, expected
# number of pending tasks). Order matters only for how the scorecard reads.
CASES = [
    # --- Baseline cases: no task is done, so "count everything" also passes.
    (
        "empty list has zero pending",
        [],
        0,
    ),
    (
        "one open task counts as one",
        [("write tests", False)],
        1,
    ),
    (
        "three open tasks count as three",
        [("a", False), ("b", False), ("c", False)],
        3,
    ),
    # --- Discriminating cases: at least one task is done. An implementation
    # that returns len(tasks) sails through the baseline cases and fails here.
    (
        "completed tasks are NOT pending",
        [("done thing", True), ("open thing", False), ("also done", True)],
        1,
    ),
    (
        "all done means zero pending",
        [("x", True), ("y", True)],
        0,
    ),
]
+79
View File
@@ -0,0 +1,79 @@
"""LLM-as-judge: the pattern, and its limits, in one file.
Some agent output can't be graded by `==`. "Is this commit message clear?" or
"Does this PR description actually explain the change?" has no exact answer. The
common move is to ask *another* model to grade it. This file shows the shape of
that grader and is deliberately honest about what it can't do.
It is vendor-agnostic by design. Point it at whatever model endpoint you already
use by setting three environment variables; if any of them is not set, it abstains
rather than pretending. NOTHING here pins a provider.
EVAL_JUDGE_URL # an OpenAI-style /chat/completions-compatible endpoint, or your own
EVAL_JUDGE_KEY # the bearer token for it
EVAL_JUDGE_MODEL # the model name to ask for
Run it standalone to grade one sample per invocation (only the first argument
is graded):
python llm_judge.py "Add count command"
python llm_judge.py "fix"
"""
import json
import os
import sys
import urllib.request
RUBRIC = """You are grading one piece of agent output against a rubric.
Score 1 if the commit message clearly and specifically describes the change.
Score 0 if it is vague, generic, or could describe almost any change.
Reply with ONLY a JSON object: {"score": 0 or 1, "reason": "<one short sentence>"}
"""


def _abstain(reason: str) -> dict:
    """Build the "no verdict" result: score None plus a human-readable reason."""
    return {"score": None, "reason": reason}


def _parse_verdict(content) -> dict:
    """Turn the judge model's reply text into a verdict dict, or abstain.

    Models often ignore "reply with ONLY a JSON object" and wrap the object in
    a markdown fence or a sentence, so the outermost {...} span is extracted
    before parsing. A reply with no usable 0/1 score becomes an abstention
    rather than a crash or a made-up grade.
    """
    if not isinstance(content, str):
        return _abstain("judge reply had no text content — abstaining")
    start, end = content.find("{"), content.rfind("}")
    if start == -1 or end < start:
        return _abstain(f"judge reply was not JSON — abstaining: {content[:80]!r}")
    try:
        verdict = json.loads(content[start:end + 1])
    except json.JSONDecodeError:
        return _abstain(f"judge reply was not valid JSON — abstaining: {content[:80]!r}")
    if not isinstance(verdict, dict) or verdict.get("score") not in (0, 1):
        return _abstain(f"judge reply had no 0/1 score — abstaining: {content[:80]!r}")
    return verdict


def judge(candidate_text: str, timeout: float = 30.0) -> dict:
    """Ask the configured judge model to grade ``candidate_text`` against RUBRIC.

    Configuration is read from the EVAL_JUDGE_URL, EVAL_JUDGE_KEY and
    EVAL_JUDGE_MODEL environment variables. If any of them is missing or
    empty, no request is made and the function abstains.

    Args:
        candidate_text: The agent output to grade.
        timeout: Seconds to wait on the endpoint before giving up, so a
            stalled judge cannot hang an unattended run forever.

    Returns:
        The judge's parsed JSON object (expected keys: "score" of 0 or 1, and
        "reason"), or ``{"score": None, "reason": ...}`` when the judge is not
        configured or its reply cannot be read as a 0/1 verdict.

    Raises:
        urllib.error.URLError: on transport/HTTP failures or a timeout.
        KeyError, IndexError, json.JSONDecodeError: if the endpoint's response
            envelope is not the choices[0].message.content shape.
    """
    url = os.environ.get("EVAL_JUDGE_URL")
    key = os.environ.get("EVAL_JUDGE_KEY")
    model = os.environ.get("EVAL_JUDGE_MODEL")
    if not (url and key and model):
        return _abstain("judge not configured — abstaining (set EVAL_JUDGE_* to enable)")

    payload = json.dumps({
        "model": model,
        "temperature": 0,  # determinism matters for a grader; you want repeatable scores
        "messages": [
            {"role": "system", "content": RUBRIC},
            {"role": "user", "content": f"Output to grade:\n{candidate_text}"},
        ],
    }).encode()
    req = urllib.request.Request(
        url,
        data=payload,
        headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        body = json.loads(resp.read())
    content = body["choices"][0]["message"]["content"]
    return _parse_verdict(content)


if __name__ == "__main__":
    sample = sys.argv[1] if len(sys.argv) > 1 else "fix stuff"
    print(judge(sample))
# ---------------------------------------------------------------------------
# READ THIS BEFORE TRUSTING A SCORE FROM HERE.
#
# An LLM judge is a model grading a model. Its failure modes are real:
# - Correlated blind spots: the judge can share the candidate's confusion, so
# a wrong answer gets a passing grade because both models are wrong the same way.
# - Bias: judges favor longer, more confident, or first-presented answers
# regardless of correctness. Hold position and length constant when you can.
# - Drift: change the judge model and your scores move even though nothing
# about the candidate changed. The ruler is itself made of rubber.
#
# So: use a programmatic grader (run_eval.py) wherever a deterministic check is
# possible — that is most of the time. Reach for an LLM judge only for genuinely
# open-ended output, and CALIBRATE it first: hand-label ~20 examples yourself,
# run the judge on them, and confirm it agrees with you before you let it gate
# anything. An uncalibrated judge is a vibe with a number attached.
# ---------------------------------------------------------------------------
+78
View File
@@ -0,0 +1,78 @@
"""Run the eval set against one candidate and print a scorecard.
Usage:
python run_eval.py candidates/current_model
python run_eval.py candidates/swapped_model
python run_eval.py candidates/current_model --threshold 0.9
A "candidate" is a directory containing a tasks.py that an agent produced. The
runner imports that tasks.py, runs every case in eval_set.py against it, prints
a pass/fail line per case, and reports an aggregate score.
The exit code is the guardrail: 0 if the score meets the threshold, 1 if it
doesn't. That single integer is what lets an eval gate a model swap, a prompt
change, or an unattended agent in CI (Module 14) instead of a human eyeballing
output. "Below threshold" should block exactly like a failing test does.
"""
import argparse
import importlib.util
import sys
from pathlib import Path
from eval_set import CASES
def load_candidate(candidate_dir: Path):
    """Import the tasks.py living in candidate_dir as an isolated module.

    The module is registered in ``sys.modules`` under the name
    "candidate_tasks" before its code runs. Code that looks its own module up
    by name while being defined (for example ``@dataclass`` resolving string
    annotations under ``from __future__ import annotations``) would otherwise
    fail on a candidate that is perfectly valid when imported normally.

    Args:
        candidate_dir: Directory expected to contain a ``tasks.py``.

    Returns:
        The executed module object.

    Raises:
        SystemExit: if there is no tasks.py or no loader can be found for it.
        Exception: whatever the candidate's top-level code raises on import.
    """
    tasks_py = candidate_dir / "tasks.py"
    if not tasks_py.exists():
        sys.exit(f"no tasks.py in {candidate_dir}")

    module_name = "candidate_tasks"
    spec = importlib.util.spec_from_file_location(module_name, tasks_py)
    if spec is None or spec.loader is None:
        sys.exit(f"cannot import {tasks_py}")
    module = importlib.util.module_from_spec(spec)

    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Don't leave a half-initialised module behind for later lookups.
        sys.modules.pop(module_name, None)
        raise
    return module
def run_case(candidate, items, expected):
    """Build the input state with the candidate's own classes, then grade.

    Args:
        candidate: The loaded candidate module; must expose ``Task`` and
            ``TaskList``.
        items: Iterable of ``(title, done)`` pairs describing the starting state.
        expected: The pending count a correct implementation returns.

    Returns:
        ``(ok, detail)`` where ``ok`` is the pass/fail result and ``detail`` is
        a short human-readable explanation for the scorecard.
    """
    # Setup runs inside the try as well: a candidate whose classes are missing
    # or have a different constructor is a failed case, not a crashed harness.
    try:
        tlist = candidate.TaskList()
        for title, done in items:
            tlist.tasks.append(candidate.Task(title=title, done=done))
        actual = tlist.pending_count()
    except Exception as exc:  # a crash is a failed case, not a crashed harness
        return False, f"raised {type(exc).__name__}: {exc}"
    return actual == expected, f"expected {expected}, got {actual}"
def main(argv):
    """Run every eval case against one candidate and return the exit code.

    Args:
        argv: Command-line arguments without the program name: a candidate
            directory and an optional ``--threshold`` fraction.

    Returns:
        0 if the passing fraction is at or above the threshold, 1 if it is
        below, or if the eval set is empty (nothing checked means nothing
        proven safe). An out-of-range threshold exits via ``parser.error``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("candidate", help="path to a candidate dir with tasks.py")
    parser.add_argument("--threshold", type=float, default=1.0,
                        help="minimum passing fraction to exit 0 (default 1.0)")
    args = parser.parse_args(argv)

    # The threshold is a fraction. Without this check "--threshold 90" can never
    # pass and a negative value can never fail, so the gate silently stops
    # meaning anything. The comparison is also False for NaN.
    if not 0.0 <= args.threshold <= 1.0:
        parser.error(f"--threshold must be a fraction between 0 and 1, got {args.threshold}")

    # Fail closed instead of dividing by zero on an empty eval set.
    if not CASES:
        print("RESULT: eval set is empty — no cases ran, so nothing is proven safe.\n")
        return 1

    candidate_dir = Path(args.candidate)
    candidate = load_candidate(candidate_dir)

    passed = 0
    print(f"\neval set: {len(CASES)} cases candidate: {candidate_dir}\n")
    for name, items, expected in CASES:
        ok, detail = run_case(candidate, items, expected)
        mark = "PASS" if ok else "FAIL"
        print(f" [{mark}] {name:<40} ({detail})")
        passed += ok

    score = passed / len(CASES)
    print(f"\nscore: {passed}/{len(CASES)} = {score:.0%} threshold: {args.threshold:.0%}")
    if score < args.threshold:
        print("RESULT: below threshold — this change is NOT safe to ship.\n")
        return 1
    print("RESULT: at or above threshold — safe by this eval.\n")
    return 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))