From 1a45280e455ca9af5b1f1b715e82b144c70cde11 Mon Sep 17 00:00:00 2001
From: Justin Paul <justin@jpaul.me>
Date: Sun, 24 May 2026 12:25:59 -0400
Subject: [PATCH] =?UTF-8?q?rename:=20ppls-docs=20=E2=86=92=20crop-chem-doc?=
 =?UTF-8?q?s?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Repo/project rename to better reflect scope. PPLS is EPA's name for
its Pesticide Product and Label System — accurate when the corpus was
EPA-only, too narrow now that it also pulls from Bayer's own catalog
(and may expand to Syngenta/Corteva/BASF/FMC labels in the future).
The name crop-chem-docs covers all of these sources and needs no
acronym explained.

Renames:
- directory:           ppls-docs            → crop-chem-docs
- PRODUCT_NAME:        ppls                 → crop_chem
- Chroma collection:   ppls_docs            → crop_chem_docs  (in-place via .modify(), no re-embed)
- BM25 db:             bm25/ppls_docs.db    → bm25/crop_chem_docs.db
- MCP tool name:       ppls_api_lessons     → crop_chem_api_lessons
- FastMCP server name: ppls-docs            → crop-chem-docs
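- Env vars:            PPLS_CORPUS_ROOT     → CORPUS_ROOT
                       PPLS_CHROMA_DIR      → CHROMA_DIR_OVERRIDE
  (no fallback: the old env var names are no longer read, so existing
  environments must export the new names)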
- User-Agent:          ppls-docs-scraper    → crop-chem-docs-scraper

Preserved (intentional, correct):
- epa_ppls (source id) — refers specifically to EPA's PPLS database
- "EPA PPLS" mentions in regulatory text (lessons.md, server docstrings)
- PPLS_API_BASE / PPLS_PDF_BASE / PPLS_INDEX_URL_TEMPLATE in
  scrape/sources/epa_ppls.py — these point at EPA's actual endpoints

Memory entries get updated in a follow-up commit so the rename is
isolated.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 PLAN.md                    |  4 ++--
 docs_mcp/lessons.md        |  6 +++---
 docs_mcp/server.py         | 18 +++++++++---------
 rag/index.py               |  8 ++++----
 rag/retrieval.py           |  6 +++---
 scrape/README.md           |  6 +++---
 scrape/runner.py           |  2 +-
 scrape/sources/bayer.py    |  6 +++---
 scrape/sources/epa_ppls.py |  6 +++---
 9 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/PLAN.md b/PLAN.md
index 369c109..e98ca7e 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -9,9 +9,9 @@ any LLM client (Claude Desktop, Claude Code, Cursor, Copilot) can
 call to answer questions against the docs, surface what changed
 recently, and flag likely inconsistencies.
 
-> **Domain note for ppls-docs.** This template was originally written
+> **Domain note for crop-chem-docs.** This template was originally written
 > for versioned software product documentation (Zoomin bundles, Hugo
-> sites, etc.). For ppls-docs the domain is pesticide product labels —
+> sites, etc.). For crop-chem-docs the domain is pesticide product labels —
 > the "bundle" abstraction has been replaced with "source"
 > (manufacturer or regulator), and "page" with "product label". The
 > canonical on-disk schema lives in [`scrape/README.md`](scrape/README.md),
diff --git a/docs_mcp/lessons.md b/docs_mcp/lessons.md
index aa0f27e..bba4533 100644
--- a/docs_mcp/lessons.md
+++ b/docs_mcp/lessons.md
@@ -1,8 +1,8 @@
-# PPLS API Lessons
+# Crop-Chem API Lessons
 
 Curated agronomy + label-handling knowledge that an LLM should know
 *before* giving recommendations from the labels corpus. Surfaced by
-the `ppls_api_lessons` MCP tool.
+the `crop_chem_api_lessons` MCP tool.
 
 Each top-level `## Topic: <slug>` block is independently retrievable.
 The tool docstring tells the LLM to call this proactively before
@@ -12,7 +12,7 @@ answering any pesticide recommendation question.
 
 ## Topic: how-to-use-this-corpus
 
-The PPLS docs corpus is the source of truth for *what's on the label*.
+The crop-chem-docs label corpus is the source of truth for *what's on the label*.
 You should:
 
 1. **Run `search_docs` first** with the user's natural-language
diff --git a/docs_mcp/server.py b/docs_mcp/server.py
index d527e34..cb69a07 100644
--- a/docs_mcp/server.py
+++ b/docs_mcp/server.py
@@ -1,4 +1,4 @@
-"""MCP server for the ppls-docs pesticide label corpus.
+"""MCP server for the crop-chem-docs pesticide label corpus.
 
 Adapted from the docs-mcp-template (which targeted versioned software
 docs) for the EPA pesticide-labels domain: ``bundle_id`` → ``source``,
@@ -34,7 +34,7 @@ log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Product configuration.
 # ---------------------------------------------------------------------------
-PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "ppls")
+PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "crop_chem")
 PRODUCT_DOCS_URL = os.environ.get(
     "PRODUCT_DOCS_URL",
     "https://ordspub.epa.gov/ords/pesticides/f?p=PPLS:1",
@@ -43,8 +43,8 @@ COLLECTION = f"{PRODUCT_NAME}_docs"
 
 # Paths — corpus on (possibly) external storage, indexes always at repo root.
 REPO_ROOT = Path(__file__).resolve().parent.parent
-CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
-CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma")
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
+CHROMA_DIR = Path(os.environ.get("CHROMA_DIR_OVERRIDE") or REPO_ROOT / "chroma")
 BM25_DB = Path(os.environ.get("BM25_DB",
                str(REPO_ROOT / "bm25" / f"{PRODUCT_NAME}_docs.db")))
 SOURCES_JSON = REPO_ROOT / "sources.json"
@@ -464,7 +464,7 @@ def list_versions() -> str:
         cat = _sources()
 
         # Source-level summary from sources.json
-        lines: list[str] = ["# PPLS docs corpus"]
+        lines: list[str] = ["# crop-chem-docs corpus"]
 
         # Live counts from Chroma (best-effort; the server should still
         # render a useful response if Chroma is unreachable)
@@ -628,7 +628,7 @@ def _load_lessons() -> tuple[str, list[tuple[str, str]]]:
 
 
 @mcp.tool()
-def ppls_api_lessons(
+def crop_chem_api_lessons(
     topic: Annotated[
         str | None,
         Field(description="OPTIONAL: topic slug or substring (e.g., "
@@ -654,7 +654,7 @@ def ppls_api_lessons(
     warnings that make them actionable. Call this first; cite specific
     lessons in your response.
     """
-    with TimedCall("ppls_api_lessons", {"topic": topic}) as _call:
+    with TimedCall("crop_chem_api_lessons", {"topic": topic}) as _call:
         full, sections = _load_lessons()
         if not sections:
             _call.set(sections=0)
@@ -663,9 +663,9 @@ def ppls_api_lessons(
         if not topic:
             _call.set(sections=len(sections), returned="toc")
             toc_lines = [
-                "# PPLS API lessons — table of contents",
+                "# Crop-Chem API lessons — table of contents",
                 "",
-                f"Call `ppls_api_lessons(topic='<slug>')` to fetch a specific section.",
+                f"Call `crop_chem_api_lessons(topic='<slug>')` to fetch a specific section.",
                 "",
             ]
             for slug, body in sections:
diff --git a/rag/index.py b/rag/index.py
index cbe7dbc..6eccbbd 100644
--- a/rag/index.py
+++ b/rag/index.py
@@ -5,7 +5,7 @@ into Chroma. With --rebuild, drops + recreates the collection (clean
 state). With --bm25-only, skips Chroma and rebuilds only the FTS5
 index — useful for fast iteration when chunking didn't change.
 
-The corpus root honors PPLS_CORPUS_ROOT (matching the scrapers).
+The corpus root honors CORPUS_ROOT (matching the scrapers).
 The Chroma + BM25 stores stay at the repo root because both rely on
 filesystem locking semantics that vfat (typical USB drive) doesn't
 provide reliably.
@@ -30,11 +30,11 @@ log = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO, format="%(asctime)s  %(message)s")
 
 REPO_ROOT = Path(__file__).resolve().parent.parent
-CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
-CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma")
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
+CHROMA_DIR = Path(os.environ.get("CHROMA_DIR_OVERRIDE") or REPO_ROOT / "chroma")
 
 # Collection name — convention: <product>_docs. Override via env.
-PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "ppls")
+PRODUCT_NAME = os.environ.get("PRODUCT_NAME", "crop_chem")
 COLLECTION = f"{PRODUCT_NAME}_docs"
 
 
diff --git a/rag/retrieval.py b/rag/retrieval.py
index 6556af2..bb4725e 100644
--- a/rag/retrieval.py
+++ b/rag/retrieval.py
@@ -20,10 +20,10 @@ from typing import Iterable, Protocol
 log = logging.getLogger(__name__)
 
 REPO_ROOT = Path(__file__).resolve().parent.parent
-CHROMA_DIR = Path(os.environ.get("PPLS_CHROMA_DIR") or REPO_ROOT / "chroma")
+CHROMA_DIR = Path(os.environ.get("CHROMA_DIR_OVERRIDE") or REPO_ROOT / "chroma")
 BM25_DB = Path(os.environ.get("BM25_DB",
-               str(REPO_ROOT / "bm25" / "ppls_docs.db")))
-COLLECTION = f"{os.environ.get('PRODUCT_NAME', 'ppls')}_docs"
+               str(REPO_ROOT / "bm25" / "crop_chem_docs.db")))
+COLLECTION = f"{os.environ.get('PRODUCT_NAME', 'crop_chem')}_docs"
 
 
 class Retriever(Protocol):
diff --git a/scrape/README.md b/scrape/README.md
index 383dcfc..081e7a8 100644
--- a/scrape/README.md
+++ b/scrape/README.md
@@ -44,13 +44,13 @@ re-fetch.
 ### Corpus location
 
 Default: `corpus/` at the repo root. Override with the
-`PPLS_CORPUS_ROOT` env var to route the corpus to external storage
+`CORPUS_ROOT` env var to route the corpus to external storage
 (USB drive, NAS mount, secondary partition):
 
 ```bash
-export PPLS_CORPUS_ROOT=/mnt/big-disk/ppls-corpus
+export CORPUS_ROOT=/mnt/big-disk/crop-chem-corpus
 python -m scrape.runner --source bayer --limit 20
-# writes to /mnt/big-disk/ppls-corpus/bayer/...
+# writes to /mnt/big-disk/crop-chem-corpus/bayer/...
 ```
 
 All sources honor the same env var; each creates its own
diff --git a/scrape/runner.py b/scrape/runner.py
index 1e20bbb..a9f7e36 100644
--- a/scrape/runner.py
+++ b/scrape/runner.py
@@ -1,7 +1,7 @@
 """Thin dispatcher that routes ``--source <id>`` to the right per-source
 scraper module.
 
-For ppls-docs the convention is **one source per scraper module** under
+For crop-chem-docs the convention is **one source per scraper module** under
 ``scrape.sources.<id>``. Each module is independently runnable via
 ``python -m scrape.sources.<id>`` and accepts its own flags — this
 runner is a convenience shim for CI / the weekly refresh workflow.
diff --git a/scrape/sources/bayer.py b/scrape/sources/bayer.py
index eb6f30c..5ce5975 100644
--- a/scrape/sources/bayer.py
+++ b/scrape/sources/bayer.py
@@ -47,7 +47,7 @@ import requests
 from pypdf import PdfReader
 
 SCRAPER_VERSION = "0.1.0"
-USER_AGENT = "ppls-docs-scraper/0.1 (+https://drawbar.example/contact)"
+USER_AGENT = "crop-chem-docs-scraper/0.1 (+https://drawbar.example/contact)"
 BASE = "https://www.cropscience.bayer.us"
 
 # Catalog product-type values used in the Next.js data API.
@@ -63,11 +63,11 @@ PRODUCT_CLASS = {
 }
 
 # Repo root: scrape/sources/bayer.py -> repo root is 3 parents up.
-# Corpus root is overridable via PPLS_CORPUS_ROOT for routing the
+# Corpus root is overridable via CORPUS_ROOT for routing the
 # corpus to external storage (USB drive, NAS mount, etc.) without
 # editing the repo.
 REPO_ROOT = Path(__file__).resolve().parents[2]
-CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
 CORPUS_DIR = CORPUS_ROOT / "bayer"
 
 # Politeness: target ~1 req/sec to Bayer. Each HTTP method goes through
diff --git a/scrape/sources/epa_ppls.py b/scrape/sources/epa_ppls.py
index 6773cef..7e1edfa 100644
--- a/scrape/sources/epa_ppls.py
+++ b/scrape/sources/epa_ppls.py
@@ -63,7 +63,7 @@ from pypdf import PdfReader
 from pypdf.errors import PdfReadError
 
 SCRAPER_VERSION = "0.1.0"
-USER_AGENT = "ppls-docs-scraper/0.1 (+https://drawbar.example/contact)"
+USER_AGENT = "crop-chem-docs-scraper/0.1 (+https://drawbar.example/contact)"
 
 PPIS_PRODUCT_ZIP_URL = "https://www3.epa.gov/pesticides/PPISdata/product.zip"
 PPLS_API_BASE = "https://ordspub.epa.gov/ords/pesticides/cswu/ppls"
@@ -73,10 +73,10 @@ PPLS_INDEX_URL_TEMPLATE = (
 )
 
 REPO_ROOT = Path(__file__).resolve().parents[2]
-# Corpus root is overridable via PPLS_CORPUS_ROOT for routing the
+# Corpus root is overridable via CORPUS_ROOT for routing the
 # corpus to external storage (USB drive, NAS mount, etc.) without
 # editing the repo.
-CORPUS_ROOT = Path(os.environ.get("PPLS_CORPUS_ROOT") or REPO_ROOT / "corpus")
+CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
 CORPUS_DIR = CORPUS_ROOT / "epa_ppls"
 
 REQUEST_DELAY_SECONDS = 1.1  # polite: ~1 req/sec