"""Build Chroma (and BM25) indexes from the seed corpus on disk. Reads ``corpus//.json`` sidecars, chunks each variety via ``rag.chunk.chunks_from_variety``, upserts into Chroma. With ``--rebuild``, drops + recreates the collection (clean state). With ``--bm25-only``, skips Chroma and rebuilds only the FTS5 index — useful for fast iteration when the chunker didn't change. Collection name is ``_docs`` (default: ``crop_seed_docs``). Override via the PRODUCT_NAME env var. """ from __future__ import annotations import argparse import logging import os import time from pathlib import Path from typing import Iterator import chromadb from chromadb.config import Settings from .chunk import chunks_from_variety from .embeddings import embedding_function log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") ROOT = Path(__file__).resolve().parent.parent CORPUS = ROOT / "corpus" CHROMA_DIR = ROOT / "chroma" PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "crop_seed") COLLECTION = f"{PRODUCT_NAME}_docs" def variety_records() -> Iterator[dict]: """Walk ``corpus//.json``, yield one chunk per variety.""" if not CORPUS.exists(): log.error("corpus/ doesn't exist; run a scraper first") return for source_dir in sorted(CORPUS.iterdir()): if not source_dir.is_dir() or source_dir.name.startswith("."): continue for sidecar_path in sorted(source_dir.glob("*.json")): yield from chunks_from_variety(sidecar_path) def upsert_to_chroma(records: list[dict]) -> int: client = chromadb.PersistentClient( path=str(CHROMA_DIR), settings=Settings(anonymized_telemetry=False), ) # Drop + recreate for --rebuild semantics. try: client.delete_collection(COLLECTION) except Exception: pass col = client.create_collection(COLLECTION, embedding_function=embedding_function()) BATCH = 64 total = 0 for i in range(0, len(records), BATCH): chunk = records[i:i + BATCH] col.upsert( ids=[r["id"] for r in chunk], documents=[r["text"] for r in chunk], metadatas=[r["metadata"] for r in chunk], ) total += len(chunk) log.info("upserted %d / %d chunks", total, len(records)) return total def main() -> int: p = argparse.ArgumentParser() p.add_argument("--rebuild", action="store_true", help="Drop and recreate the Chroma collection.") p.add_argument("--bm25-only", action="store_true", help="Rebuild only the BM25 index, skip Chroma.") p.add_argument("--bm25-db", type=Path, default=ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db", help="Path to the BM25 sqlite db.") args = p.parse_args() log.info("reading corpus from %s", CORPUS) t0 = time.time() records = list(variety_records()) log.info("loaded %d chunks in %.1fs", len(records), time.time() - t0) if not records: log.error("no chunks — is corpus/ populated?") return 1 if args.bm25_only: from .bm25 import BM25Index log.info("--bm25-only: building FTS5 only") BM25Index(args.bm25_db).build(records) return 0 if not args.rebuild: log.info("no --rebuild; nothing to do. (Use --rebuild to upsert.)") return 0 t_c = time.time() n = upsert_to_chroma(records) log.info("chroma: %d chunks in %.1fs", n, time.time() - t_c) try: from .bm25 import BM25Index t_b = time.time() BM25Index(args.bm25_db).build(records) log.info("bm25 done in %.1fs", time.time() - t_b) except ImportError: log.info("rag.bm25 not available — skipping BM25 build") return 0 if __name__ == "__main__": raise SystemExit(main())