From 1a45280e455ca9af5b1f1b715e82b144c70cde11 Mon Sep 17 00:00:00 2001
From: Justin Paul
Date: Sun, 24 May 2026 12:25:59 -0400
Subject: [PATCH] =?UTF-8?q?rename:=20ppls-docs=20=E2=86=92=20crop-chem-doc?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Repo/project rename to better reflect scope. PPLS is EPA's term for
their Pesticide Product Label System — accurate when the corpus was
EPA-only, narrow now that it also pulls from Bayer's own catalog (and
may expand to Syngenta/Corteva/BASF/FMC labels in the future).
crop-chem-docs scopes flexibly without acronyms to explain.

Renames:
- directory: ppls-docs → crop-chem-docs
- PRODUCT_NAME: ppls → crop_chem
- Chroma collection: ppls_docs → crop_chem_docs (in-place via
  .modify(), no re-embed)
- BM25 db: bm25/ppls_docs.db → bm25/crop_chem_docs.db
- MCP tool name: ppls_api_lessons → crop_chem_api_lessons
- FastMCP server name: ppls-docs → crop-chem-docs
- Env vars: PPLS_CORPUS_ROOT → CORPUS_ROOT
            PPLS_CHROMA_DIR → CHROMA_DIR_OVERRIDE
- User-Agent: ppls-docs-scraper → crop-chem-docs-scraper

Preserved (intentional, correct):
- epa_ppls (source id) — refers specifically to EPA's PPLS database
- "EPA PPLS" mentions in regulatory text (lessons.md, server docstrings)
- PPLS_API_BASE / PPLS_PDF_BASE / PPLS_INDEX_URL_TEMPLATE in
  scrape/sources/epa_ppls.py — these point at EPA's actual endpoints

Memory entries get updated in a follow-up commit so the rename is
isolated.

Co-Authored-By: Claude Opus 4.7 (1M context) --- PLAN.md | 4 ++-- docs_mcp/lessons.md | 6 +++--- docs_mcp/server.py | 18 +++++++++--------- rag/index.py | 8 ++++---- rag/retrieval.py | 6 +++--- scrape/README.md | 6 +++--- scrape/runner.py | 2 +- scrape/sources/bayer.py | 6 +++--- scrape/sources/epa_ppls.py | 6 +++--- 9 files changed, 31 insertions(+), 31 deletions(-) diff --git a/PLAN.md b/PLAN.md index 369c109..e98ca7e 100644 --- a/PLAN.md +++ b/PLAN.md @@ -9,9 +9,9 @@ any LLM client (Claude Desktop, Claude Code, Cursor, Copilot) can call to answer questions against the docs, surface what changed recently, and flag likely inconsistencies. -> **Domain note for ppls-docs.** This template was originally written +> **Domain note for crop-chem-docs.** This template was originally written > for versioned software product documentation (Zoomin bundles, Hugo -> sites, etc.). For ppls-docs the domain is pesticide product labels — +> sites, etc.). For crop-chem-docs the domain is pesticide product labels — > the "bundle" abstraction has been replaced with "source" > (manufacturer or regulator), and "page" with "product label". The > canonical on-disk schema lives in [`scrape/README.md`](scrape/README.md), diff --git a/docs_mcp/lessons.md b/docs_mcp/lessons.md index aa0f27e..bba4533 100644 --- a/docs_mcp/lessons.md +++ b/docs_mcp/lessons.md @@ -1,8 +1,8 @@ -# PPLS API Lessons +# Crop-Chem API Lessons Curated agronomy + label-handling knowledge that an LLM should know *before* giving recommendations from the labels corpus. Surfaced by -the `ppls_api_lessons` MCP tool. +the `crop_chem_api_lessons` MCP tool. Each top-level `## Topic: ` block is independently retrievable. The tool docstring tells the LLM to call this proactively before @@ -12,7 +12,7 @@ answering any pesticide recommendation question. ## Topic: how-to-use-this-corpus -The PPLS docs corpus is the source of truth for *what's on the label*. 
+The crop-chem-docs label corpus is the source of truth for *what's on the label*. You should: 1. **Run `search_docs` first** with the user's natural-language diff --git a/docs_mcp/server.py b/docs_mcp/server.py index d527e34..cb69a07 100644 --- a/docs_mcp/server.py +++ b/docs_mcp/server.py @@ -1,4 +1,4 @@ -"""MCP server for the ppls-docs pesticide label corpus. +"""MCP server for the crop-chem-docs pesticide label corpus. Adapted from the docs-mcp-template (which targeted versioned software docs) for the EPA pesticide-labels domain: ``bundle_id`` → ``source``, @@ -34,7 +34,7 @@ log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Product configuration. # --------------------------------------------------------------------------- -PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "ppls") +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "crop_chem") PRODUCT_DOCS_URL = os.environ.get( "PRODUCT_DOCS_URL", "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1", @@ -43,8 +43,8 @@ COLLECTION = f"{PRODUCT_NAME}_docs" # Paths — corpus on (possibly) external storage, indexes always at repo root. 
REPO_ROOT = Path(__file__).resolve().parent.parent -CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") -CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma") +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CHROMA_DIR = Path(os.environ.get("CHROMA_DIR_OVERRIDE") or REPO_ROOT / "chroma") BM25_DB = Path(os.environ.get("BM25_DB", str(REPO_ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db"))) SOURCES_JSON = REPO_ROOT / "sources.json" @@ -464,7 +464,7 @@ def list_versions() -> str: cat = _sources() # Source-level summary from sources.json - lines: list[str] = ["# PPLS docs corpus"] + lines: list[str] = ["# crop-chem-docs corpus"] # Live counts from Chroma (best-effort; the server should still # render a useful response if Chroma is unreachable) @@ -628,7 +628,7 @@ def _load_lessons() -> tuple[str, list[tuple[str, str]]]: @mcp.tool() -def ppls_api_lessons( +def crop_chem_api_lessons( topic: Annotated[ str | None, Field(description="OPTIONAL: topic slug or substring (e.g., " @@ -654,7 +654,7 @@ def ppls_api_lessons( warnings that make them actionable. Call this first; cite specific lessons in your response. """ - with TimedCall("ppls_api_lessons", {"topic": topic}) as _call: + with TimedCall("crop_chem_api_lessons", {"topic": topic}) as _call: full, sections = _load_lessons() if not sections: _call.set(sections=0) @@ -663,9 +663,9 @@ def ppls_api_lessons( if not topic: _call.set(sections=len(sections), returned="toc") toc_lines = [ - "# PPLS API lessons — table of contents", + "# Crop-Chem API lessons — table of contents", "", - f"Call `ppls_api_lessons(topic='')` to fetch a specific section.", + f"Call `crop_chem_api_lessons(topic='')` to fetch a specific section.", "", ] for slug, body in sections: diff --git a/rag/index.py b/rag/index.py index cbe7dbc..6eccbbd 100644 --- a/rag/index.py +++ b/rag/index.py @@ -5,7 +5,7 @@ into Chroma. With --rebuild, drops + recreates the collection (clean state). 
With --bm25-only, skips Chroma and rebuilds only the FTS5 index — useful for fast iteration when chunking didn't change. -The corpus root honors PPLS_CORPUS_ROOT (matching the scrapers). +The corpus root honors CORPUS_ROOT (matching the scrapers). The Chroma + BM25 stores stay at the repo root because both rely on filesystem locking semantics that vfat (typical USB drive) doesn't provide reliably. @@ -30,11 +30,11 @@ log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") REPO_ROOT = Path(__file__).resolve().parent.parent -CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") -CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma") +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") +CHROMA_DIR = Path(os.environ.get("CHROMA_DIR_OVERRIDE") or REPO_ROOT / "chroma") # Collection name — convention: _docs. Override via env. -PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "ppls") +PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "crop_chem") COLLECTION = f"{PRODUCT_NAME}_docs" diff --git a/rag/retrieval.py b/rag/retrieval.py index 6556af2..bb4725e 100644 --- a/rag/retrieval.py +++ b/rag/retrieval.py @@ -20,10 +20,10 @@ from typing import Iterable, Protocol log = logging.getLogger(__name__) REPO_ROOT = Path(__file__).resolve().parent.parent -CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma") +CHROMA_DIR = Path(os.environ.get("CHROMA_DIR_OVERRIDE") or REPO_ROOT / "chroma") BM25_DB = Path(os.environ.get("BM25_DB", - str(REPO_ROOT / "bm25" / "ppls_docs.db"))) -COLLECTION = f"{os.environ.get('PRODUCT_NAME', 'ppls')}_docs" + str(REPO_ROOT / "bm25" / "crop_chem_docs.db"))) +COLLECTION = f"{os.environ.get('PRODUCT_NAME', 'crop_chem')}_docs" class Retriever(Protocol): diff --git a/scrape/README.md b/scrape/README.md index 383dcfc..081e7a8 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -44,13 +44,13 @@ re-fetch. 
### Corpus location Default: `corpus/` at the repo root. Override with the -`PPLS_CORPUS_ROOT` env var to route the corpus to external storage +`CORPUS_ROOT` env var to route the corpus to external storage (USB drive, NAS mount, secondary partition): ```bash -export PPLS_CORPUS_ROOT=/mnt/big-disk/ppls-corpus +export CORPUS_ROOT=/mnt/big-disk/crop-chem-corpus python -m scrape.runner --source bayer --limit 20 -# writes to /mnt/big-disk/ppls-corpus/bayer/... +# writes to /mnt/big-disk/crop-chem-corpus/bayer/... ``` All sources honor the same env var; each creates its own diff --git a/scrape/runner.py b/scrape/runner.py index 1e20bbb..a9f7e36 100644 --- a/scrape/runner.py +++ b/scrape/runner.py @@ -1,7 +1,7 @@ """Thin dispatcher that routes ``--source `` to the right per-source scraper module. -For ppls-docs the convention is **one source per scraper module** under +For crop-chem-docs the convention is **one source per scraper module** under ``scrape.sources.``. Each module is independently runnable via ``python -m scrape.sources.`` and accepts its own flags — this runner is a convenience shim for CI / the weekly refresh workflow. diff --git a/scrape/sources/bayer.py b/scrape/sources/bayer.py index eb6f30c..5ce5975 100644 --- a/scrape/sources/bayer.py +++ b/scrape/sources/bayer.py @@ -47,7 +47,7 @@ import requests from pypdf import PdfReader SCRAPER_VERSION = "0.1.0" -USER_AGENT = "ppls-docs-scraper/0.1 (+https://drawbar.example/contact)" +USER_AGENT = "crop-chem-docs-scraper/0.1 (+https://drawbar.example/contact)" BASE = "https://www.cropscience.bayer.us" # Catalog product-type values used in the Next.js data API. @@ -63,11 +63,11 @@ PRODUCT_CLASS = { } # Repo root: scrape/sources/bayer.py -> repo root is 3 parents up. -# Corpus root is overridable via PPLS_CORPUS_ROOT for routing the +# Corpus root is overridable via CORPUS_ROOT for routing the # corpus to external storage (USB drive, NAS mount, etc.) without # editing the repo. 
REPO_ROOT = Path(__file__).resolve().parents[2] -CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "bayer" # Politeness: target ~1 req/sec to Bayer. Each HTTP method goes through diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py index 6773cef..7e1edfa 100644 --- a/scrape/sources/epa_ppls.py +++ b/scrape/sources/epa_ppls.py @@ -63,7 +63,7 @@ from pypdf import PdfReader from pypdf.errors import PdfReadError SCRAPER_VERSION = "0.1.0" -USER_AGENT = "ppls-docs-scraper/0.1 (+https://drawbar.example/contact)" +USER_AGENT = "crop-chem-docs-scraper/0.1 (+https://drawbar.example/contact)" PPIS_PRODUCT_ZIP_URL = "https://www3.epa.gov/pesticides/PPISdata/product.zip" PPLS_API_BASE = "https://ordspub.epa.gov/ords/pesticides/cswu/ppls" @@ -73,10 +73,10 @@ PPLS_INDEX_URL_TEMPLATE = ( ) REPO_ROOT = Path(__file__).resolve().parents[2] -# Corpus root is overridable via PPLS_CORPUS_ROOT for routing the +# Corpus root is overridable via CORPUS_ROOT for routing the # corpus to external storage (USB drive, NAS mount, etc.) without # editing the repo. -CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus") +CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus") CORPUS_DIR = CORPUS_ROOT / "epa_ppls" REQUEST_DELAY_SECONDS = 1.1 # polite: ~1 req/sec