"""Candidate output: a SWAPPED model/prompt. Same task, different model (or a tweaked prompt). This output "looks right" and passes a casual manual check; adding three tasks and calling count returns 3. But pending_count() returns the total number of tasks, not the number of *pending* ones, so it's wrong the moment anything is marked done. Nobody would notice this by skimming. The eval set notices it instantly. That's the regression eval catching an unsafe swap, exactly the scenario this module exists for. Replace this with your own swapped-model output when you run it for real; you may get lucky and have it pass, or you may catch a regression like this one. """ from dataclasses import dataclass, field @dataclass class Task: title: str done: bool = False @dataclass class TaskList: tasks: list[Task] = field(default_factory=list) def add(self, title: str) -> Task: task = Task(title=title) self.tasks.append(task) return task def complete(self, index: int) -> None: self.tasks[index].done = True def pending(self) -> list[Task]: return [t for t in self.tasks if not t.done] def pending_count(self) -> int: # WRONG, but plausibly so: counts every task, not just pending ones. return len(self.tasks) def render(self) -> str: if not self.tasks: return "(no tasks yet)" lines = [] for i, task in enumerate(self.tasks): box = "[x]" if task.done else "[ ]" lines.append(f"{i}. {box} {task.title}") return "\n".join(lines)