"""Thin dispatcher that routes ``--source `` to the right per-source scraper module. For ppls-docs the convention is **one source per scraper module** under ``scrape.sources.``. Each module is independently runnable via ``python -m scrape.sources.`` and accepts its own flags — this runner is a convenience shim for CI / the weekly refresh workflow. Examples: python -m scrape.runner --source bayer --limit 20 python -m scrape.runner --source epa_ppls --limit 20 python -m scrape.runner --all # walk every source in sources.json Anything after the recognized flags is passed through to the source scraper, so: python -m scrape.runner --source bayer --force --product warrant just dispatches to ``scrape.sources.bayer`` with ``--force --product warrant`` as argv. """ from __future__ import annotations import argparse import importlib import json import sys from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[1] SOURCES_JSON = REPO_ROOT / "sources.json" def _load_sources() -> list[dict]: if not SOURCES_JSON.exists(): return [] try: return json.loads(SOURCES_JSON.read_text()) except json.JSONDecodeError: return [] def _run_source(source_id: str, passthrough: list[str]) -> int: mod_name = f"scrape.sources.{source_id}" try: mod = importlib.import_module(mod_name) except ImportError as exc: print(f"runner: no source module {mod_name}: {exc}", file=sys.stderr) return 2 main = getattr(mod, "main", None) if not callable(main): print(f"runner: {mod_name} has no main() entrypoint", file=sys.stderr) return 2 return int(main(passthrough) or 0) def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(prog="scrape.runner") parser.add_argument("--source", help="Source id (matches sources.json)") parser.add_argument("--all", action="store_true", help="Run every source listed in sources.json") args, passthrough = parser.parse_known_args(argv) if not args.source and not args.all: parser.error("specify --source or --all") sources = _load_sources() if args.all: ids = [s["id"] for s in sources if "id" in s] if not ids: print("runner: sources.json is empty or missing", file=sys.stderr) return 2 else: # If the source isn't registered in sources.json yet, dispatch anyway # so the scraper can be exercised during initial development. ids = [args.source] rc = 0 for sid in ids: rc |= _run_source(sid, passthrough) return rc if __name__ == "__main__": sys.exit(main())