feat(course): build out all 27 modules, capstone, scaffold, and conventions

Scaffold the course repo and author the full curriculum in dependency-chain order, following the settled build decisions in handoff.md. - Scaffold: course README, vendor-neutral AGENTS.md (dogfoods Module 5), _TEMPLATE.md (the fixed 9-section module shape), root .gitignore, ship config. - Modules 1-2: reference exemplars (locked for tone/depth/lab style). - Modules 3-27: full lessons + runnable labs, each following the template, respecting the chain, vendor/model-agnostic, with "feel the pain" labs. - Module 8 hosting comparison web-researched and date-stamped (as of 2026-06-22), not written from memory; expansion-zone modules carry Verify-before-publish. - Capstone: the full loop end to end on the running tasks-app example. Lab code syntax-checked (Python/shell/YAML); every module has the 7 core template sections. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01TfzV5QvtPDz8LJS3Pu5VLT
2026-06-22 12:18:30 -04:00
parent 4bd586bbd0
commit fbec36cb67
117 changed files with 15131 additions and 1 deletions
@@ -0,0 +1,43 @@
+"""Candidate output: the CURRENT model/prompt.
+
+This is what your agent produced for the task "implement pending_count() so it
+returns the number of tasks that are not done." It's correct. Replace this whole
+directory with your own agent's real output when you run the lab for real.
+"""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Task:
+    title: str
+    done: bool = False
+
+
+@dataclass
+class TaskList:
+    tasks: list[Task] = field(default_factory=list)
+
+    def add(self, title: str) -> Task:
+        task = Task(title=title)
+        self.tasks.append(task)
+        return task
+
+    def complete(self, index: int) -> None:
+        self.tasks[index].done = True
+
+    def pending(self) -> list[Task]:
+        return [t for t in self.tasks if not t.done]
+
+    def pending_count(self) -> int:
+        # Correct: count only the tasks that aren't done.
+        return len(self.pending())
+
+    def render(self) -> str:
+        if not self.tasks:
+            return "(no tasks yet)"
+        lines = []
+        for i, task in enumerate(self.tasks):
+            box = "[x]" if task.done else "[ ]"
+            lines.append(f"{i}. {box} {task.title}")
+        return "\n".join(lines)
@@ -0,0 +1,50 @@
+"""Candidate output: a SWAPPED model/prompt.
+
+Same task, different model (or a tweaked prompt). This output "looks right" and
+passes a casual manual check — adding three tasks and calling count returns 3.
+But pending_count() returns the total number of tasks, not the number of
+*pending* ones, so it's wrong the moment anything is marked done.
+
+Nobody would notice this by skimming. The eval set notices it instantly. That's
+the regression eval catching an unsafe swap — exactly the scenario this module
+exists for. Replace this with your own swapped-model output when you run it for
+real; you may get lucky and have it pass, or you may catch a regression like
+this one.
+"""
+
+from dataclasses import dataclass, field
+
+
+@dataclass
+class Task:
+    title: str
+    done: bool = False
+
+
+@dataclass
+class TaskList:
+    tasks: list[Task] = field(default_factory=list)
+
+    def add(self, title: str) -> Task:
+        task = Task(title=title)
+        self.tasks.append(task)
+        return task
+
+    def complete(self, index: int) -> None:
+        self.tasks[index].done = True
+
+    def pending(self) -> list[Task]:
+        return [t for t in self.tasks if not t.done]
+
+    def pending_count(self) -> int:
+        # WRONG, but plausibly so: counts every task, not just pending ones.
+        return len(self.tasks)
+
+    def render(self) -> str:
+        if not self.tasks:
+            return "(no tasks yet)"
+        lines = []
+        for i, task in enumerate(self.tasks):
+            box = "[x]" if task.done else "[ ]"
+            lines.append(f"{i}. {box} {task.title}")
+        return "\n".join(lines)
@@ -0,0 +1,37 @@
+"""The eval set for the tasks-app `pending_count` agent task.
+
+An *eval set* is a list of CASES. Each case is three things:
+
+  - a name (so the scorecard is readable),
+  - an input (here: the state a TaskList is in), and
+  - the expected result (here: how many tasks should count as pending).
+
+The grading lives in run_eval.py; this file is just data. Keeping the cases
+separate from any model, prompt, or runner is the whole point — the same eval
+set judges *any* candidate you point it at, which is what makes it useful when
+you swap the model out from under it.
+
+The task we're evaluating: an agent was asked to implement
+`TaskList.pending_count()` so it returns the number of tasks that are NOT done.
+That sounds trivial. The discriminating cases below are the ones a
+"looks-right" implementation quietly fails.
+"""
+
+# Each case: (name, [(title, done), ...], expected_pending_count)
+CASES = [
+    ("empty list has zero pending", [], 0),
+    ("one open task counts as one", [("write tests", False)], 1),
+    (
+        "three open tasks count as three",
+        [("a", False), ("b", False), ("c", False)],
+        3,
+    ),
+    # The discriminating case. A candidate that returns len(tasks) passes
+    # everything above and fails right here. This is the eval earning its keep.
+    (
+        "completed tasks are NOT pending",
+        [("done thing", True), ("open thing", False), ("also done", True)],
+        1,
+    ),
+    ("all done means zero pending", [("x", True), ("y", True)], 0),
+]
@@ -0,0 +1,79 @@
+"""LLM-as-judge: the pattern, and its limits, in one file.
+
+Some agent output can't be graded by `==`. "Is this commit message clear?" or
+"Does this PR description actually explain the change?" has no exact answer. The
+common move is to ask *another* model to grade it. This file shows the shape of
+that grader and is deliberately honest about what it can't do.
+
+It is vendor-agnostic by design. Point it at whatever model endpoint you already
+use by setting three environment variables; if any is unset, it abstains rather
+than pretending. NOTHING here pins a provider.
+
+    EVAL_JUDGE_URL    # an OpenAI-style /chat/completions-compatible endpoint, or your own
+    EVAL_JUDGE_KEY    # the bearer token for it
+    EVAL_JUDGE_MODEL  # the model name to ask for
+
+Run it standalone to grade one sample per invocation:
+    python llm_judge.py "Add count command"
+    python llm_judge.py "fix"
+"""
+
+import json
+import os
+import sys
+import urllib.request
+
+RUBRIC = """You are grading one piece of agent output against a rubric.
+Score 1 if the commit message clearly and specifically describes the change.
+Score 0 if it is vague, generic, or could describe almost any change.
+Reply with ONLY a JSON object: {"score": 0 or 1, "reason": "<one short sentence>"}
+"""
+
+
+def judge(candidate_text: str) -> dict:
+    url = os.environ.get("EVAL_JUDGE_URL")
+    key = os.environ.get("EVAL_JUDGE_KEY")
+    model = os.environ.get("EVAL_JUDGE_MODEL")
+    if not (url and key and model):
+        return {"score": None, "reason": "judge not configured — abstaining (set EVAL_JUDGE_* to enable)"}
+
+    payload = json.dumps({
+        "model": model,
+        "temperature": 0,  # determinism matters for a grader; you want repeatable scores
+        "messages": [
+            {"role": "system", "content": RUBRIC},
+            {"role": "user", "content": f"Output to grade:\n{candidate_text}"},
+        ],
+    }).encode()
+
+    req = urllib.request.Request(
+        url,
+        data=payload,
+        headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req) as resp:
+        body = json.loads(resp.read())
+    content = body["choices"][0]["message"]["content"]
+    return json.loads(content)
+
+
+if __name__ == "__main__":
+    sample = sys.argv[1] if len(sys.argv) > 1 else "fix stuff"
+    print(judge(sample))
+
+# ---------------------------------------------------------------------------
+# READ THIS BEFORE TRUSTING A SCORE FROM HERE.
+#
+# An LLM judge is a model grading a model. Its failure modes are real:
+#   - Correlated blind spots: the judge can share the candidate's confusion, so
+#     a wrong answer gets a passing grade because both models are wrong the same way.
+#   - Bias: judges favor longer, more confident, or first-presented answers
+#     regardless of correctness. Hold position and length constant when you can.
+#   - Drift: change the judge model and your scores move even though nothing
+#     about the candidate changed. The ruler is itself made of rubber.
+#
+# So: use a programmatic grader (run_eval.py) wherever a deterministic check is
+# possible — that is most of the time. Reach for an LLM judge only for genuinely
+# open-ended output, and CALIBRATE it first: hand-label ~20 examples yourself,
+# run the judge on them, and confirm it agrees with you before you let it gate
+# anything. An uncalibrated judge is a vibe with a number attached.
+# ---------------------------------------------------------------------------
@@ -0,0 +1,78 @@
+"""Run the eval set against one candidate and print a scorecard.
+
+Usage:
+    python run_eval.py candidates/current_model
+    python run_eval.py candidates/swapped_model
+    python run_eval.py candidates/current_model --threshold 0.9
+
+A "candidate" is a directory containing a tasks.py that an agent produced. The
+runner imports that tasks.py, runs every case in eval_set.py against it, prints
+a pass/fail line per case, and reports an aggregate score.
+
+The exit code is the guardrail: 0 if the score meets the threshold, 1 if it
+doesn't. That single integer is what lets an eval gate a model swap, a prompt
+change, or an unattended agent in CI (Module 14) instead of a human eyeballing
+output. "Below threshold" should block exactly like a failing test does.
+"""
+
+import argparse
+import importlib.util
+import sys
+from pathlib import Path
+
+from eval_set import CASES
+
+
+def load_candidate(candidate_dir: Path):
+    """Import the tasks.py living in candidate_dir as an isolated module."""
+    tasks_py = candidate_dir / "tasks.py"
+    if not tasks_py.exists():
+        sys.exit(f"no tasks.py in {candidate_dir}")
+    spec = importlib.util.spec_from_file_location("candidate_tasks", tasks_py)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def run_case(candidate, items, expected):
+    """Build the input state with the candidate's own classes, then grade."""
+    tlist = candidate.TaskList()
+    for title, done in items:
+        tlist.tasks.append(candidate.Task(title=title, done=done))
+    try:
+        actual = tlist.pending_count()
+    except Exception as exc:  # a crash is a failed case, not a crashed harness
+        return False, f"raised {type(exc).__name__}: {exc}"
+    return actual == expected, f"expected {expected}, got {actual}"
+
+
+def main(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument("candidate", help="path to a candidate dir with tasks.py")
+    parser.add_argument("--threshold", type=float, default=1.0,
+                        help="minimum passing fraction to exit 0 (default 1.0)")
+    args = parser.parse_args(argv)
+
+    candidate_dir = Path(args.candidate)
+    candidate = load_candidate(candidate_dir)
+
+    passed = 0
+    print(f"\neval set: {len(CASES)} cases   candidate: {candidate_dir}\n")
+    for name, items, expected in CASES:
+        ok, detail = run_case(candidate, items, expected)
+        mark = "PASS" if ok else "FAIL"
+        print(f"  [{mark}] {name:<40} ({detail})")
+        passed += ok
+
+    score = passed / len(CASES)
+    print(f"\nscore: {passed}/{len(CASES)} = {score:.0%}   threshold: {args.threshold:.0%}")
+
+    if score < args.threshold:
+        print("RESULT: below threshold — this change is NOT safe to ship.\n")
+        return 1
+    print("RESULT: at or above threshold — safe by this eval.\n")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))