bd71f30ca7
Phase 6 — Reranker integration
- New _rerank(query, [(cid, doc), ...]) helper in server.py calls
llama.cpp's /v1/rerank endpoint, returns reranker-ordered ids
or None on failure (graceful fallback — search never blocks
on the sidecar).
- search_docs + search_trials both call _rerank() on the post-
hybrid pool BEFORE truncating to k. The variety-code prefilter
still pins exact matches on top.
- Per-doc truncation to 2000 chars to fit jina-reranker-v2-base's
per-pair token budget. Full chunk text still returned to the
caller — truncation is rerank-input-only.
- Telemetry adds `reranked: true|false` so usage logs distinguish
reranked calls.
Phase 7 — Eval harness
- eval/queries.jsonl: 21 golden queries spanning:
* variety-code lookups (DKC62-08RIB, AG29XF4, WB6430, E085Z5,
AP Iliad)
* semantic variety queries (drought-tolerant corn, SCN MG-3
soy, Rps3a, XtendFlex, HRS stripe rust, SWW PNW, Goss's Wilt)
* trial queries (IA/IN/MN regional, AP Iliad ID, NK1701 head-
to-head, silage Ton/Acre, product=DKC65-95)
* anti-hallucination (Pioneer P1142 fallback, DKC65-20 not-in-
corpus expected_empty)
- eval/retrievers.py: 4 named retrievers — dense, bm25, hybrid
(dense+bm25+RRF), hybrid+rerank — all sharing the same filter
shape as docs_mcp/server.py._build_where.
- eval/run_eval.py: runs each retriever against each query,
reports Recall / Precision@1 / MRR / avg latency. Markdown
output in eval/results/baseline.md.
Baseline results (k=5, 21 queries):
| Retriever | Pass | Recall | P@1 | MRR | Avg ms |
|-----------------|-------|--------|-------|-------|--------|
| hybrid+rerank | 21/21 | 100% | 90% | 0.905 | 2064 |
| bm25 | 20/21 | 95% | 81% | 0.833 | 5 |
| hybrid | 15/21 | 71% | 62% | 0.619 | 73 |
| dense | 14/21 | 67% | 38% | 0.440 | 79 |
Key findings:
1. hybrid+rerank wins on quality — 100% pass, 90% P@1.
2. BM25 alone is surprisingly competitive (95% pass) at 5 ms —
excellent fallback when rerank is down. The variety-code
prefilter in search_docs is doing a lot of work here.
3. Dense embedding alone is the WEAKEST configuration on this
corpus — variety identity tokens (DKC62-08RIB, AP Iliad,
Rps3a) have no semantic neighbors, so nomic-embed-text returns
noise. The hybrid (no rerank) layer actively hurts because
RRF dilutes the BM25 ranking with dense noise.
4. Anti-hallucination queries (Pioneer fallback, DKC65-20 not-
in-corpus) pass on ALL retrievers including dense-only —
the must_not_contain + expected_empty design holds.
Deploy decision: HYBRID_SEARCH=true + RERANK_URL set
(production env already has both — refresh.yml + image-only.yml
+ deploy/docker-compose.yml all configured).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
201 lines
7.0 KiB
Python
201 lines
7.0 KiB
Python
"""Retriever protocol + concrete implementations for seed-mcp eval.
|
|
|
|
Each retriever returns a ranked list of chunk_ids. The eval harness
|
|
in ``run_eval.py`` measures each retriever against the golden
|
|
``queries.jsonl`` set across MRR / Recall@K / nDCG@K.
|
|
|
|
Four named configurations, matching the four switches in
|
|
``docs_mcp/server.py``:
|
|
|
|
dense — Chroma dense retrieval alone
|
|
bm25 — SQLite FTS5 BM25 alone
|
|
hybrid — dense + bm25 fused via RRF
|
|
hybrid_rerank — hybrid pool → cross-encoder rerank
|
|
|
|
Each retriever takes ``filters`` (the same dict shape
|
|
``_build_where`` accepts in server.py) so trial-specific facets
|
|
(data_type, state, year, crop) work consistently across the
|
|
four configurations.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Protocol
|
|
|
|
|
|
# Add repo root so we can import docs_mcp and rag from here.
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
|
|
|
|
class Retriever(Protocol):
|
|
name: str
|
|
|
|
def retrieve(self, query: str, k: int, filters: dict | None) -> list[str]:
|
|
"""Return up to k chunk_ids in rank order."""
|
|
...
|
|
|
|
|
|
def _build_where(filters: dict | None) -> dict | None:
|
|
"""Mirror of docs_mcp.server._build_where but accepts the eval's
|
|
looser shape."""
|
|
if not filters:
|
|
return None
|
|
conds: list[dict] = []
|
|
if filters.get("data_type"):
|
|
conds.append({"data_type": filters["data_type"]})
|
|
if filters.get("crop"):
|
|
conds.append({"crop": filters["crop"].lower()})
|
|
if filters.get("brand"):
|
|
conds.append({"brand": filters["brand"].upper()})
|
|
if filters.get("state"):
|
|
s = filters["state"]
|
|
conds.append({"state": s.upper() if len(s) <= 3 else s})
|
|
if filters.get("year"):
|
|
conds.append({"year": int(filters["year"])})
|
|
if not conds:
|
|
return None
|
|
if len(conds) == 1:
|
|
return conds[0]
|
|
return {"$and": conds}
|
|
|
|
|
|
class DenseRetriever:
|
|
name = "dense"
|
|
|
|
def __init__(self, collection, pool: int = 50):
|
|
self.col = collection
|
|
self.pool = pool
|
|
|
|
def retrieve(self, query: str, k: int, filters: dict | None) -> list[str]:
|
|
where = _build_where(filters)
|
|
try:
|
|
r = self.col.query(
|
|
query_texts=[query], n_results=max(k, self.pool), where=where,
|
|
)
|
|
except Exception:
|
|
return []
|
|
return (r.get("ids") or [[]])[0][:k]
|
|
|
|
|
|
class BM25Retriever:
|
|
name = "bm25"
|
|
|
|
def __init__(self, bm25, pool: int = 50):
|
|
self.bm25 = bm25
|
|
self.pool = pool
|
|
|
|
def retrieve(self, query: str, k: int, filters: dict | None) -> list[str]:
|
|
where = _build_where(filters)
|
|
hits = self.bm25.query(query, n=max(k, self.pool), where=where)
|
|
return [cid for cid, _ in hits[:k]]
|
|
|
|
|
|
class HybridRetriever:
|
|
"""Dense + BM25 fused via RRF — same fusion the server uses."""
|
|
name = "hybrid"
|
|
|
|
def __init__(self, collection, bm25, pool: int = 50, rrf_k: int = 60):
|
|
self.col = collection
|
|
self.bm25 = bm25
|
|
self.pool = pool
|
|
self.rrf_k = rrf_k
|
|
|
|
def retrieve(self, query: str, k: int, filters: dict | None) -> list[str]:
|
|
where = _build_where(filters)
|
|
try:
|
|
d = self.col.query(query_texts=[query], n_results=self.pool, where=where)
|
|
dense_ids = (d.get("ids") or [[]])[0]
|
|
except Exception:
|
|
dense_ids = []
|
|
bm25_ids = [c for c, _ in self.bm25.query(query, n=self.pool, where=where)]
|
|
scores: dict[str, float] = {}
|
|
for ranking in (dense_ids, bm25_ids):
|
|
for rank, cid in enumerate(ranking):
|
|
scores[cid] = scores.get(cid, 0.0) + 1.0 / (self.rrf_k + rank + 1)
|
|
fused = sorted(scores, key=lambda d: scores[d], reverse=True)
|
|
return fused[:k]
|
|
|
|
|
|
class HybridRerankRetriever:
|
|
"""Hybrid pool → cross-encoder rerank via the llama.cpp endpoint."""
|
|
name = "hybrid+rerank"
|
|
|
|
def __init__(self, collection, bm25, rerank_url: str,
|
|
pool: int = 50, rerank_pool: int = 50,
|
|
rrf_k: int = 60, doc_max_chars: int = 2000,
|
|
timeout: float = 30.0):
|
|
self.col = collection
|
|
self.bm25 = bm25
|
|
self.rerank_url = rerank_url.rstrip("/")
|
|
self.pool = pool
|
|
self.rerank_pool = rerank_pool
|
|
self.rrf_k = rrf_k
|
|
self.doc_max_chars = doc_max_chars
|
|
self.timeout = timeout
|
|
|
|
def retrieve(self, query: str, k: int, filters: dict | None) -> list[str]:
|
|
where = _build_where(filters)
|
|
try:
|
|
d = self.col.query(
|
|
query_texts=[query], n_results=self.pool, where=where,
|
|
include=["documents"],
|
|
)
|
|
dense_ids = (d.get("ids") or [[]])[0]
|
|
dense_docs = (d.get("documents") or [[]])[0]
|
|
id_to_doc = dict(zip(dense_ids, dense_docs))
|
|
except Exception:
|
|
dense_ids = []
|
|
id_to_doc = {}
|
|
bm25_ids = [c for c, _ in self.bm25.query(query, n=self.pool, where=where)]
|
|
|
|
# Fuse to a hybrid pool
|
|
scores: dict[str, float] = {}
|
|
for ranking in (dense_ids, bm25_ids):
|
|
for rank, cid in enumerate(ranking):
|
|
scores[cid] = scores.get(cid, 0.0) + 1.0 / (self.rrf_k + rank + 1)
|
|
fused = sorted(scores, key=lambda d: scores[d], reverse=True)
|
|
|
|
# Fetch docs for any BM25-only ids in the rerank pool
|
|
missing = [cid for cid in fused[: self.rerank_pool] if cid not in id_to_doc]
|
|
if missing:
|
|
try:
|
|
extra = self.col.get(ids=missing, include=["documents"])
|
|
for cid, doc in zip(extra.get("ids") or [], extra.get("documents") or []):
|
|
id_to_doc[cid] = doc
|
|
except Exception:
|
|
pass
|
|
|
|
# Rerank
|
|
pool_ids = fused[: self.rerank_pool]
|
|
docs = [(id_to_doc.get(cid, "") or "")[: self.doc_max_chars] for cid in pool_ids]
|
|
try:
|
|
import httpx
|
|
with httpx.Client(timeout=self.timeout) as c:
|
|
r = c.post(
|
|
f"{self.rerank_url}/v1/rerank",
|
|
json={"model": "rerank", "query": query, "documents": docs},
|
|
)
|
|
r.raise_for_status()
|
|
results = r.json().get("results") or []
|
|
if not results:
|
|
return fused[:k]
|
|
ordered = sorted(results, key=lambda x: -x.get("relevance_score", float("-inf")))
|
|
reranked = [pool_ids[x["index"]] for x in ordered if 0 <= x.get("index", -1) < len(pool_ids)]
|
|
return reranked[:k]
|
|
except Exception:
|
|
return fused[:k]
|
|
|
|
|
|
def build_all_retrievers(collection, bm25, rerank_url: str | None = None) -> list[Retriever]:
|
|
"""Return the four named retrievers ready to evaluate."""
|
|
out: list[Retriever] = [
|
|
DenseRetriever(collection),
|
|
BM25Retriever(bm25),
|
|
HybridRetriever(collection, bm25),
|
|
]
|
|
if rerank_url:
|
|
out.append(HybridRerankRetriever(collection, bm25, rerank_url))
|
|
return out
|