Phase 7+8: eval harness + hybrid retrieval
## Phase 7 — Eval harness
eval/retrievers.py + rag/retrieval.py: Retriever protocol with
DenseRetriever, BM25Retriever, HybridRetriever (RRF k=60),
RerankedRetriever (llama.cpp /v1/rerank). retrievers.py is now a
thin shim re-exporting from rag.retrieval so the MCP server can
use the same code at request time without making eval/ a runtime
dep.
eval/run_eval.py: drives N retrievers against eval/queries.jsonl,
computes MRR / Recall@K / nDCG@K, emits a markdown report with a
summary table + per-query breakdown for the first retriever. Each
query carries expected (source, source_key) tuples, matching the
page-level keying used by the labels domain.
eval/queries.jsonl: 35 curated queries — 25 brand-name (Warrant,
Huskie, Roundup Custom, Liberty, Authority, Headline, Trivapro,
Poncho, Lorsban, Sencor, Acuron, ...) + 10 intent/semantic
("what controls horseweed before soybean", "fungicide for fusarium
head blight", "rainfast interval for glyphosate", ...).
## Phase 8 — Hybrid retrieval (BM25 + dense + RRF)
docs_mcp/server.py: search_docs now branches on the HYBRID_SEARCH
environment variable. When it is set, _search_chunks queries both
Chroma and BM25 (the existing rag/bm25.py
existing impl), fuses on chunk_id with reciprocal-rank-fusion
(RRF k=60), and returns the combined pool. Dense-only path
unchanged when HYBRID_SEARCH is unset. The rendering layer
(_format_hit) is untouched.
The RERANK_URL hook is also wired (_rerank_pool sends docs to
llama.cpp /v1/rerank, truncated to 2000 chars per the jina-reranker
n_ctx_train=1024 batch-rejection gotcha). Fails open to base order
on any exception.
## Baseline numbers (k=5, pool=50, 35 queries)
| Retriever | MRR | Recall@5 | nDCG@5 |
|------------|-------|----------|--------|
| dense | 0.027 | 0.086 | 0.041 |
| bm25 | 0.544 | 0.586 | 0.524 |
| hybrid-rrf | 0.114 | 0.114 | 0.108 |
Key takeaway: BM25 dominates because farmers search for products by
brand name, and brand names are exact-match tokens that lexical
search nails. Dense retrieval performs poorly — semantic embeddings
spread across similar products and don't preferentially weight
brand-name tokens.
Textbook RRF hurts when one retriever is much weaker than the
other: every dense-only hit scores 1/(60+rank), exactly tying the
BM25-only hit at the same rank, so dense's irrelevant top-50
interleave with BM25's correct hits in the fused pool. The Phase 6
reranker is the planned fix — it scores each (query, chunk) pair
independently and can recover the right answer regardless of base
order.
Per-query report at eval/results/baseline.md.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
+126
-20
@@ -242,6 +242,7 @@ def search_docs(
|
||||
"query": query, "source": source, "product_class": product_class,
|
||||
"registrant_contains": registrant_contains, "signal_word": signal_word,
|
||||
"epa_reg_no": epa_reg_no, "k": k,
|
||||
"hybrid": HYBRID_SEARCH, "rerank": bool(RERANK_URL),
|
||||
}) as _call:
|
||||
try:
|
||||
col = _collection()
|
||||
@@ -251,37 +252,35 @@ def search_docs(
|
||||
|
||||
where = _build_where(source, product_class, registrant_contains,
|
||||
signal_word, epa_reg_no)
|
||||
# Over-fetch when we'll post-filter on registrant substring, so we
|
||||
# still have ~k matches after the filter trims.
|
||||
n_fetch = k * 4 if registrant_contains else k
|
||||
# Over-fetch — we need a meaningful pool for fusion/reranking,
|
||||
# and registrant_contains filtering trims down post-query.
|
||||
pool = max(k * (5 if (HYBRID_SEARCH or RERANK_URL) else 2),
|
||||
k * (4 if registrant_contains else 2))
|
||||
|
||||
scored: list[tuple[str, dict, float]] = []
|
||||
try:
|
||||
res = col.query(query_texts=[query], n_results=n_fetch, where=where)
|
||||
scored = _search_chunks(query, pool, where, registrant_contains)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_call.set(hits_returned=0, error=str(exc))
|
||||
return f"_(search failed: {exc})_"
|
||||
|
||||
docs = res.get("documents", [[]])[0]
|
||||
metas = res.get("metadatas", [[]])[0]
|
||||
dists = res.get("distances", [[]])[0]
|
||||
|
||||
# Cosine distance → similarity score (1 - d). Clip to [0,1] for display.
|
||||
scored: list[tuple[str, dict, float]] = []
|
||||
for doc, meta, dist in zip(docs, metas, dists):
|
||||
if registrant_contains:
|
||||
reg = (meta.get("registrant") or "").upper()
|
||||
if registrant_contains.upper() not in reg:
|
||||
continue
|
||||
score = max(0.0, 1.0 - float(dist))
|
||||
scored.append((doc, meta, score))
|
||||
if len(scored) >= k:
|
||||
break
|
||||
# Optionally rerank the pool (Phase 6) before truncating to k.
|
||||
if RERANK_URL and len(scored) > 1:
|
||||
try:
|
||||
scored = _rerank_pool(query, scored)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning("rerank failed (%s) — falling back to base order", exc)
|
||||
|
||||
scored = scored[:k]
|
||||
_call.set(hits_returned=len(scored))
|
||||
if not scored:
|
||||
return "_(no results — try broadening the query, dropping filters, or check list_versions() for valid sources/classes)_"
|
||||
|
||||
mode = "hybrid-rrf+rerank" if (HYBRID_SEARCH and RERANK_URL) else \
|
||||
"hybrid-rrf" if HYBRID_SEARCH else \
|
||||
"dense+rerank" if RERANK_URL else "dense"
|
||||
out: list[str] = [
|
||||
f"# Search results for {query!r} ({len(scored)} of top-{n_fetch} dense hits)",
|
||||
f"# Search results for {query!r} ({len(scored)} hits, mode={mode})",
|
||||
"",
|
||||
]
|
||||
for doc, meta, score in scored:
|
||||
@@ -289,6 +288,113 @@ def search_docs(
|
||||
return "\n".join(out)
|
||||
|
||||
|
||||
def _search_chunks(
    query: str,
    pool: int,
    where: dict | None,
    registrant_contains: str | None,
) -> list[tuple[str, dict, float]]:
    """Retrieve candidate chunks for ``query``, best first.

    Always runs the dense Chroma query. When ``HYBRID_SEARCH`` is set, the
    BM25 index is queried as well and both rankings are fused on chunk id
    with reciprocal-rank fusion: each ranking a chunk appears in contributes
    ``1 / (RRF_K + rank)`` to its score.

    Args:
        query: Free-text search string.
        pool: Maximum number of hits requested from each retriever.
        where: Metadata filter handed to Chroma, or ``None``. It is applied
            to the dense query and to the hydration of BM25-only hits, so
            both retrievers respect the same filters.
        registrant_contains: Case-insensitive substring the ``registrant``
            metadata value must contain. Applied after retrieval; a falsy
            value disables the filter.

    Returns:
        ``(doc_text, metadata, score)`` tuples sorted by score descending.
        The score is the RRF sum in hybrid mode and ``max(0, 1 - distance)``
        from the dense query otherwise.

    Raises:
        Exception: Whatever the Chroma client raises. A BM25 failure is
            logged and degrades the call to the dense ranking instead.
    """
    col = _collection()

    # --- dense (Chroma) ----------------------------------------------------
    dense_res = col.query(query_texts=[query], n_results=pool, where=where)
    dense_rows = zip(
        dense_res.get("ids", [[]])[0],
        dense_res.get("documents", [[]])[0],
        dense_res.get("metadatas", [[]])[0],
        dense_res.get("distances", [[]])[0],
    )

    chunk_pool: dict[str, dict] = {}
    # Rank and payload are recorded in one pass so a short result column can
    # never leave an id without a pool entry.
    for rank, (cid, doc, meta, dist) in enumerate(dense_rows, start=1):
        chunk_pool[cid] = {
            "doc": doc,
            "meta": meta or {},
            # Presumably a cosine distance, so 1 - d is a similarity
            # (TODO confirm the collection's distance metric). Clamped at 0.
            "dense_sim": max(0.0, 1.0 - float(dist)),
            "dense_rank": rank,
            "bm25_rank": None,
        }

    # --- BM25 (Phase 8 hybrid) --------------------------------------------
    if HYBRID_SEARCH:
        try:
            from rag.bm25 import BM25Index

            bm25_hits = BM25Index(BM25_DB).query(query, n=pool)
        except Exception as exc:  # noqa: BLE001
            log.warning("bm25 query failed (%s) — dense-only this call", exc)
            bm25_hits = []

        missing_ids = [cid for cid, _ in bm25_hits if cid not in chunk_pool]
        if missing_ids:
            # Hydrate BM25-only hits from Chroma. Passing ``where`` keeps the
            # metadata filters in force: ids that fail the filter are simply
            # not returned and therefore never enter the pool.
            got = col.get(
                ids=missing_ids,
                where=where,
                include=["documents", "metadatas"],
            )
            hydrated = zip(
                got.get("ids", []),
                got.get("documents", []),
                got.get("metadatas", []),
            )
            for cid, doc, meta in hydrated:
                chunk_pool[cid] = {
                    "doc": doc,
                    "meta": meta or {},
                    "dense_sim": 0.0,
                    "dense_rank": None,
                    "bm25_rank": None,
                }

        for rank, (cid, _bm25_score) in enumerate(bm25_hits, start=1):
            if cid in chunk_pool:
                chunk_pool[cid]["bm25_rank"] = rank

    # --- RRF fusion or dense-only score -----------------------------------
    needle = registrant_contains.upper() if registrant_contains else None
    out: list[tuple[str, dict, float]] = []
    for info in chunk_pool.values():
        meta = info["meta"]
        if needle is not None:
            registrant = (meta.get("registrant") or "").upper()
            if needle not in registrant:
                continue

        if HYBRID_SEARCH:
            score = 0.0
            for rank_key in ("dense_rank", "bm25_rank"):
                rank = info[rank_key]
                if rank is not None:
                    score += 1.0 / (RRF_K + rank)
        else:
            score = info["dense_sim"]
        out.append((info["doc"], meta, score))

    # Stable sort: equal scores keep pool insertion order (dense hits first).
    out.sort(key=lambda hit: hit[2], reverse=True)
    return out
|
||||
|
||||
|
||||
def _rerank_pool(
    query: str,
    pool: list[tuple[str, dict, float]],
) -> list[tuple[str, dict, float]]:
    """Reorder ``pool`` using a llama.cpp ``/v1/rerank`` endpoint.

    Only the first ``RERANK_POOL`` entries are sent. Each document is cut to
    2000 characters for the request (the jina-reranker GGUF rejects the
    ENTIRE batch if any pair exceeds n_ctx_train=1024); the full text is
    still what gets returned to the caller.

    Args:
        query: The user's search string.
        pool: ``(doc_text, metadata, score)`` tuples in base order.

    Returns:
        Three segments, in this order:
          1. Reranked entries, sorted by reranker score descending and
             carrying that score.
          2. Entries that were sent but not scored in the response, in
             base order with their original score.
          3. Entries beyond ``RERANK_POOL``, untouched.
        No entry of ``pool`` is ever dropped or duplicated.

    Raises:
        Exception: Transport errors, non-2xx responses and malformed
            payloads propagate; the caller is expected to fall back to
            base order.
    """
    import httpx

    head = pool[:RERANK_POOL]
    if not head:
        return pool

    response = httpx.post(
        f"{RERANK_URL}/v1/rerank",
        json={
            "query": query,
            "documents": [doc[:2000] for doc, _meta, _score in head],
        },
        timeout=RERANK_TIMEOUT,
    )
    response.raise_for_status()
    results = response.json().get("results") or []

    rescored: list[tuple[str, dict, float]] = []
    scored_idx: set[int] = set()
    for item in results:
        idx = item.get("index")
        # Accept only indices that refer to a document we actually sent, and
        # only once each, so a sloppy response cannot duplicate a hit.
        if (
            not isinstance(idx, int)
            or isinstance(idx, bool)
            or not 0 <= idx < len(head)
            or idx in scored_idx
        ):
            continue
        # Explicit None checks: a relevance score of exactly 0.0 is valid
        # and must not fall through to the alternative key.
        score = item.get("relevance_score")
        if score is None:
            score = item.get("score")
        if score is None:
            score = 0.0
        doc, meta, _base_score = head[idx]
        rescored.append((doc, meta, float(score)))
        scored_idx.add(idx)

    rescored.sort(key=lambda hit: hit[2], reverse=True)

    # Sent-but-unscored entries (e.g. the server returned only a top-n or an
    # empty result list) keep their base order behind the reranked ones.
    unscored = [hit for i, hit in enumerate(head) if i not in scored_idx]

    # Anything in the original pool past RERANK_POOL stays at the tail in
    # original order.
    return rescored + unscored + pool[RERANK_POOL:]
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
def get_page(
|
||||
source: Annotated[
|
||||
|
||||
Reference in New Issue
Block a user