"""Retriever protocol + concrete implementations. A single matrix dimension per knob (dense / reranked / bm25 / hybrid) so the eval harness can compare them apples-to-apples. Implement these once at Phase 7 and reuse them across every retrieval change. Each retriever returns a ranked list of (bundle_id, page_id) tuples deduplicated to the page level (chunks within the same page collapse to one entry; the highest-ranked chunk's position wins). """ from __future__ import annotations from typing import Protocol, Iterable class Retriever(Protocol): name: str def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]: """Return up to k (bundle_id, page_id) tuples in rank order.""" ... def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> list[tuple[str, str]]: """Take a stream of (bundle_id, page_id, chunk_ordinal) and return the first k unique pages in their first-seen order.""" seen: set[tuple[str, str]] = set() out: list[tuple[str, str]] = [] for bid, pid, _ord in chunk_ids: key = (bid, pid) if key in seen: continue seen.add(key) out.append(key) if len(out) >= k: break return out # TODO Phase 2/3 — implement these once Chroma + the bm25 module are # in place. Each one is small (15-30 LOC). The eval harness imports # from this module by class name. # # class DenseRetriever: # name = "dense" # def __init__(self, collection): self.col = collection # def retrieve(self, query, k=10): ... # # class RerankedRetriever: # name = "dense+rerank" # def __init__(self, collection, rerank_url, pool=200): ... # def retrieve(self, query, k=10): ... # # class BM25Retriever: # name = "bm25" # def __init__(self, bm25_index): ... # def retrieve(self, query, k=10): ... # # class HybridRetriever: # name = "bm25+dense+rrf" # def __init__(self, dense, bm25, k_rrf=60): ... # def retrieve(self, query, k=10): ...