# seed-mcp/eval/retrievers.py

"""Retriever protocol + concrete implementations.

A single matrix dimension per knob (dense / reranked / bm25 / hybrid)
so the eval harness can compare them apples-to-apples. Implement these
once at Phase 7 and reuse them across every retrieval change.

Each retriever returns a ranked list of (bundle_id, page_id) tuples
deduplicated to the page level (chunks within the same page collapse
to one entry; the highest-ranked chunk's position wins).
"""
from __future__ import annotations

from typing import Protocol, Iterable


class Retriever(Protocol):
    """Structural interface for the retrievers compared by the eval harness.

    Any object that exposes a ``name`` attribute and a compatible
    ``retrieve`` method satisfies this protocol; explicit subclassing is
    not required.
    """

    # Label identifying the retriever variant (the implementations sketched
    # in this module use values such as "dense" or "bm25").
    name: str

    def retrieve(self, query: str, k: int = 10) -> list[tuple[str, str]]:
        """Rank pages for ``query`` and return at most ``k`` of them.

        Args:
            query: The search text to retrieve pages for.
            k: Maximum number of results to return (defaults to 10).

        Returns:
            ``(bundle_id, page_id)`` tuples in rank order, best first.
        """


def _collapse_to_pages(chunk_ids: Iterable[tuple[str, str, str]], k: int) -> list[tuple[str, str]]:
    """Take a stream of (bundle_id, page_id, chunk_ordinal) and return
    the first k unique pages in their first-seen order."""
    seen: set[tuple[str, str]] = set()
    out: list[tuple[str, str]] = []
    for bid, pid, _ord in chunk_ids:
        key = (bid, pid)
        if key in seen:
            continue
        seen.add(key)
        out.append(key)
        if len(out) >= k:
            break
    return out


# TODO Phase 2/3 — implement these once Chroma + the bm25 module are
# in place. Each one is small (15-30 LOC). The eval harness imports
# from this module by class name.
#
# class DenseRetriever:
#     name = "dense"
#     def __init__(self, collection): self.col = collection
#     def retrieve(self, query, k=10): ...
#
# class RerankedRetriever:
#     name = "dense+rerank"
#     def __init__(self, collection, rerank_url, pool=200): ...
#     def retrieve(self, query, k=10): ...
#
# class BM25Retriever:
#     name = "bm25"
#     def __init__(self, bm25_index): ...
#     def retrieve(self, query, k=10): ...
#
# class HybridRetriever:
#     name = "bm25+dense+rrf"
#     def __init__(self, dense, bm25, k_rrf=60): ...
#     def retrieve(self, query, k=10): ...