"""Minimal HTTP reranker — `/v1/rerank` endpoint over a sentence-transformers CrossEncoder. Matches the Cohere `/v1/rerank` request/response shape, which is what the server's `_rerank()` helper expects. This is the dev-friendly fallback; production replaces this with the llama.cpp + jina-reranker-v2-base GGUF sidecar (see deploy/docker-compose.yml) without changing the client. Request: POST /v1/rerank {"model": "...", "query": "...", "documents": ["text", ...], "top_n": 10} Response: {"model": "...", "results": [{"index": 0, "relevance_score": 0.93}, ...]} Usage: python -m scripts.rerank_server # localhost:8001 RERANK_MODEL=cross-encoder/ms-marco-MiniLM-L-12-v2 \\ RERANK_PORT=8001 python -m scripts.rerank_server """ from __future__ import annotations import json import logging import os import sys from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") MODEL_NAME = os.environ.get("RERANK_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2") PORT = int(os.environ.get("RERANK_PORT", "8001")) HOST = os.environ.get("RERANK_HOST", "127.0.0.1") # Truncate docs to this many chars before scoring. jina-reranker GGUF has a # 1024-token per-pair cap that 400s the entire batch; ms-marco is more # forgiving but we still cap to keep latency predictable. MAX_DOC_CHARS = int(os.environ.get("RERANK_MAX_DOC_CHARS", "2000")) _model = None def _get_model(): global _model if _model is None: from sentence_transformers import CrossEncoder log.info("loading %s", MODEL_NAME) _model = CrossEncoder(MODEL_NAME) log.info("loaded") return _model def _rerank(query: str, documents: list[str], top_n: int | None) -> list[dict]: model = _get_model() pairs = [[query, (d or "")[:MAX_DOC_CHARS]] for d in documents] scores = model.predict(pairs) ranked = sorted( ({"index": i, "relevance_score": float(s)} for i, s in enumerate(scores)), key=lambda r: -r["relevance_score"], ) if top_n is not None: ranked = ranked[:top_n] return ranked class Handler(BaseHTTPRequestHandler): def log_message(self, fmt, *args): log.info("%s - %s", self.address_string(), fmt % args) def _send_json(self, status: int, payload: dict) -> None: body = json.dumps(payload).encode() self.send_response(status) self.send_header("Content-Type", "application/json") self.send_header("Content-Length", str(len(body))) self.end_headers() self.wfile.write(body) def do_GET(self): # noqa: N802 if self.path in ("/", "/health"): self._send_json(200, {"status": "ok", "model": MODEL_NAME}) return self._send_json(404, {"error": "not found"}) def do_POST(self): # noqa: N802 if self.path not in ("/v1/rerank", "/rerank"): self._send_json(404, {"error": "not found"}) return length = int(self.headers.get("Content-Length", "0")) try: req = json.loads(self.rfile.read(length).decode()) except Exception as e: self._send_json(400, {"error": f"bad json: {e}"}) return query = req.get("query") documents = req.get("documents") if not isinstance(query, str) or not isinstance(documents, list): self._send_json(400, {"error": "expected {query: str, documents: list[str]}"}) return top_n = req.get("top_n") try: results = _rerank(query, documents, top_n if isinstance(top_n, int) else None) except Exception as e: log.exception("rerank failed") self._send_json(500, {"error": str(e)}) return self._send_json(200, {"model": MODEL_NAME, "results": results}) def main() -> int: _get_model() # warm-load before accepting traffic server = ThreadingHTTPServer((HOST, PORT), Handler) log.info("listening on http://%s:%d", HOST, PORT) try: server.serve_forever() except KeyboardInterrupt: log.info("shutting down") return 0 if __name__ == "__main__": sys.exit(main())