"""Candidate output: a SWAPPED model/prompt. Same task, different model (or a tweaked prompt). This output "looks right" and passes a casual manual check — adding three tasks and calling count returns 3. But pending_count() returns the total number of tasks, not the number of *pending* ones, so it's wrong the moment anything is marked done. Nobody would notice this by skimming. The eval set notices it instantly. That's the regression eval catching an unsafe swap — exactly the scenario this module exists for. Replace this with your own swapped-model output when you run it for real; you may get lucky and have it pass, or you may catch a regression like this one. """ from dataclasses import dataclass, field @dataclass class Task: title: str done: bool = False @dataclass class TaskList: tasks: list[Task] = field(default_factory=list) def add(self, title: str) -> Task: task = Task(title=title) self.tasks.append(task) return task def complete(self, index: int) -> None: self.tasks[index].done = True def pending(self) -> list[Task]: return [t for t in self.tasks if not t.done] def pending_count(self) -> int: # WRONG, but plausibly so: counts every task, not just pending ones. return len(self.tasks) def render(self) -> str: if not self.tasks: return "(no tasks yet)" lines = [] for i, task in enumerate(self.tasks): box = "[x]" if task.done else "[ ]" lines.append(f"{i}. {box} {task.title}") return "\n".join(lines)