gh_plot_reports corpus (4,299 plots) + concurrency + 4-GPU pool

CORPUS — 4,299 GH plot reports added (3,797 written in this run +
502 from the earlier slow run; a further 319 sitemap-listed URLs
404'd as discontinued and are not counted). Combined with prior
760 varieties + 14 AgriPro trials = 5,073 total chunks now indexed.

scrape/sources/gh_plot_reports.py — concurrency speedup:
- 4 worker threads (ThreadPoolExecutor), each with its own
  requests.Session for connection-pool efficiency.
- Shared class-level rate limiter (0.25 sec between ANY two
  requests across all threads). This caps throughput at ~4
  req/sec — well below any rate-limit threshold a public site
  enforces.
- Diagnosis vs original 1 req/sec: GH had ZERO rate limiting,
  zero 429s, zero retries. The 1 sec self-throttle was just too
  conservative. Bench:
    1 worker  / 1.0 sec throttle:  ~0.4 plots/sec (190 min ETA)
    4 workers / 0.25 sec throttle: ~3 plots/sec  (~25 min actual)

rag/chunk.py — chunk size cap for nomic-embed-text's 2048-token
context window:
- Empirically tested: failure threshold is ~5,250 chars on
  numeric-heavy trial chunks (chars/token ratio 2.4 vs 3.5 for
  prose). Cap at 4,500 chars so that even at a worst-case
  2.2 chars/token a chunk stays under the 2048-token window
  (4,500 / 2.2 ≈ 2,045 tokens).
- Applied to BOTH variety and trial chunks. Marked truncated
  chunks with metadata.embed_truncated = True; FULL text stays
  in the on-disk .md for get_page to return verbatim.

.gitea/workflows/{refresh,image-only}.yml — OLLAMA_URL pool
restructured for the 4 GPU-pinned endpoints. Bench (50-chunk
batches on nomic-embed-text):

    .0.125:11434  (RTX 40-series)  242 embeds/sec  ← weight ×4
    .0.2:11436    (GPU-pinned)     108 embeds/sec  ← weight ×2
    .0.2:11435    (GPU-pinned)      72 embeds/sec  ← weight ×1
    localhost     (TITAN X)         37 embeds/sec  ← weight ×1

Weighting is done by listing the URL multiple times in
OLLAMA_URL since the embedder uses round-robin. .0.2:11434 is
explicitly EXCLUDED — it isn't pinned to a specific GPU.

Combined index rebuild for 5,073 chunks now finishes in ~3 min
(was 19+ min on the single-endpoint pool).

Smoke tests:
✓ list_versions: 5,073 docs across 6 sources, 2 vendors, 6
  brands, 4 crops (corn 2711, soy 2016, silage 223, wheat 123).
✓ search_trials({crop=corn, state=IA, year=2024}): 3 IA 2024
  corn trials surfaced.
✓ search_trials("Phytophthora resistance soybean trial"): NK
  NK43-W1XFS top-1 in LA 2024 trial (cross-vendor result).
✓ search_trials("AP Iliad Idaho wheat"): AgriPro Washington/N
  Idaho 2025 trial surfaced.
✓ search_trials(product=DKC65-95): 3 corn trials containing
  that hybrid in IL/IA 2024.
✓ search_trials(product=NK1701): 3 corn trials in AR/MS 2024.
✓ Product filter correctly returns EMPTY for products that
  aren't in the corpus (DKC65-20 is a 2023 product; 2023 plots
  deferred). Anti-hallucination contract preserved.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-25 16:46:35 -04:00
parent cfa27d0bca
commit 0e625553e5
8602 changed files with 1013877 additions and 32 deletions
+93 -27
View File
@@ -61,7 +61,9 @@ import os
import random
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
@@ -79,7 +81,13 @@ REPO_ROOT = Path(__file__).resolve().parents[2]
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
CORPUS_DIR = CORPUS_ROOT / "gh_plot_reports"
REQ_INTERVAL_SEC = 1.0
# 0.25 sec between any two requests in the WHOLE process (shared
# across worker threads). The site's pages take ~1.5 sec to serve on
# their own; combined with concurrent workers this gives ~4 req/sec
# net — polite by any normal standard, and the GH plot-reports
# scrape on 4,600 docs finishes in ~20 min instead of ~3 hours.
REQ_INTERVAL_SEC = 0.25
DEFAULT_WORKERS = 4
log = logging.getLogger("scrape.gh_plot_reports")
@@ -108,17 +116,35 @@ STATE_NAMES = {
class RateLimitedSession:
"""Thread-safe rate-limited requests.Session wrapper.
The lock + last-request timestamp are class-level so multiple
sessions (one per worker thread) share the same global interval.
Each thread has its own requests.Session for connection-pool
efficiency, but they all coordinate on the request-cadence
floor.
"""
_lock = threading.Lock()
_last_global: float = 0.0
_global_interval: float = REQ_INTERVAL_SEC
def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
self.s = requests.Session()
self.s.headers["User-Agent"] = USER_AGENT
self.interval = interval
self._last = 0.0
# Only ever raise the class-level interval: the most restrictive
# (largest) interval requested by any caller wins, so a 1.0s
# caller's floor can't be lowered by a later 0.25s caller starting
# another scraper in the same process.
with RateLimitedSession._lock:
if interval > RateLimitedSession._global_interval:
RateLimitedSession._global_interval = interval
def _wait(self) -> None:
delta = time.monotonic() - self._last
if delta < self.interval:
time.sleep(self.interval - delta)
self._last = time.monotonic()
with RateLimitedSession._lock:
delta = time.monotonic() - RateLimitedSession._last_global
if delta < RateLimitedSession._global_interval:
time.sleep(RateLimitedSession._global_interval - delta)
RateLimitedSession._last_global = time.monotonic()
def request(
self,
@@ -692,9 +718,9 @@ def run(
only_state: str | None,
only_year: int | None,
include_2023: bool,
workers: int = DEFAULT_WORKERS,
) -> int:
CORPUS_DIR.mkdir(parents=True, exist_ok=True)
http = RateLimitedSession()
crops = {only_crop} if only_crop else {"corn", "soybeans", "silage"}
states = {only_state} if only_state else None
@@ -705,32 +731,68 @@ def run(
else:
years = {2024, 2025}
targets = discover_plots(http, crops=crops, states=states, years=years)
# One shared session for sitemap walk (single-threaded).
discovery_http = RateLimitedSession()
targets = discover_plots(discovery_http, crops=crops, states=states, years=years)
if limit is not None:
targets = targets[:limit]
counts = {"written": 0, "skipped": 0, "missing": 0, "failed": 0}
processed = 0
for url, crop, state, year, plot_id in targets:
if limit is not None and processed >= limit:
break
processed += 1
status, prod = process_plot(
http, url=url, crop=crop, state=state, year=year,
counts_lock = threading.Lock()
processed_counter = {"n": 0}
total = len(targets)
# One requests.Session per worker thread — they share the
# class-level rate limiter (REQ_INTERVAL_SEC between any two
# requests across all threads), but each has its own HTTP
# connection pool.
thread_local = threading.local()
def _session() -> RateLimitedSession:
s = getattr(thread_local, "session", None)
if s is None:
s = RateLimitedSession()
thread_local.session = s
return s
def _worker(target: tuple[str, str, str, int, str]) -> tuple[str, Any]:
url, crop, state, year, plot_id = target
return process_plot(
_session(), url=url, crop=crop, state=state, year=year,
plot_id=plot_id, force=force,
)
counts[status] = counts.get(status, 0) + 1
if prod is not None and processed <= 5 or processed % 100 == 0:
log.info(
"[%d/%s] %s %s | results=%d coop=%s",
processed, str(limit) if limit else len(targets),
source_key_for(crop, state, year, plot_id), status,
len(prod.results) if prod else 0,
(prod.cooperator if prod else "-") or "-",
)
log.info("dispatching %d plots across %d workers (shared rate limiter %.2f sec/req)",
total, workers, REQ_INTERVAL_SEC)
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = {pool.submit(_worker, t): t for t in targets}
for fut in as_completed(futures):
target = futures[fut]
url, crop, state, year, plot_id = target
try:
status, prod = fut.result()
except Exception as exc: # noqa: BLE001
log.error("worker failed for %s: %s", url, exc)
status, prod = "failed", None
with counts_lock:
counts[status] = counts.get(status, 0) + 1
processed_counter["n"] += 1
n = processed_counter["n"]
if (prod is not None and n <= 5) or n % 100 == 0 or status == "failed":
log.info(
"[%d/%d] %s %s | results=%d coop=%s",
n, total,
source_key_for(crop, state, year, plot_id), status,
len(prod.results) if prod else 0,
(prod.cooperator if prod else "-") or "-",
)
log.info(
"done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)",
processed, counts["written"], counts["skipped"],
counts["missing"], counts["failed"], len(targets),
processed_counter["n"], counts["written"], counts["skipped"],
counts["missing"], counts["failed"], total,
)
return 0 if counts["failed"] == 0 else 1
@@ -756,6 +818,9 @@ def _build_argparser() -> argparse.ArgumentParser:
help="Limit to one year.")
p.add_argument("--include-2023", action="store_true",
help="Include 2023 plot reports (default: 2024-2025 only).")
p.add_argument("--workers", type=int, default=DEFAULT_WORKERS,
help=f"Concurrent worker threads (default {DEFAULT_WORKERS}, "
f"all share a global {REQ_INTERVAL_SEC}-sec rate limiter).")
p.add_argument("--log-level", default=os.environ.get("LOG_LEVEL", "INFO"))
return p
@@ -774,6 +839,7 @@ def main(argv: list[str] | None = None) -> int:
only_state=args.state.lower() if args.state else None,
only_year=args.year,
include_2023=args.include_2023,
workers=args.workers,
)