# seed-mcp/scrape/sources/becks_pfr.py

"""Beck's PFR (Practical Farm Research) scraper.

Source: Public Sanity GROQ API at ``https://mc8v24rf.api.sanity.io``.
No authentication required — Beck's exposes their CMS content store
publicly. ~2,089 documents going back to 2015.

Sanity query endpoint:
  ``/v1/data/query/production?query=<groq>``

Useful GROQ for PFR docs (the projectId / dataset are public):

  *[_type == "pfrStudy"] {
    _id, title, year, crop, slug, summary, body, attachments
  }

Records are research studies, not variety identity — head-to-head
yield trials, fungicide timing, planting-date studies, hybrid-by-
population, biological seed treatments, etc.

Treat differently from variety scrapers:
- One record per study, not per variety
- chunk_0 preamble includes the study's tl;dr finding (extract from
  the ``summary`` field if present, or first paragraph of ``body``)
- Crop tag (corn/soy/wheat) for filtering
- Year tag — older PFR studies are still relevant but search should
  let the user weight recency

Polite rate limit: Sanity is generous but no auth means we should
keep concurrency ≤4 and pause ~250ms between batches.

TODO: implement.
"""
from __future__ import annotations

import sys


def main(argv: list[str] | None = None) -> int:
    """Placeholder entry point for the not-yet-implemented Beck's PFR scraper.

    Emits a single "deferred" notice on stderr and reports success.

    Args:
        argv: Command-line arguments. Accepted for interface parity but
            not read by this stub.

    Returns:
        Always ``0``.
    """
    notice = (
        "becks_pfr: deferred — public Sanity GROQ at mc8v24rf.api.sanity.io, "
        "~2089 research docs"
    )
    print(notice, file=sys.stderr)

    # Succeed on purpose: the monthly CI workflow lists this source even
    # though it is not implemented yet, and must not fail because of it.
    exit_status = 0
    return exit_status


# Script entry point: forward the CLI arguments (program name stripped) to
# main() and use its return value as the process exit status.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    raise SystemExit(main(cli_args))