"""Thin dispatcher that routes ``--source `` to the right per-source scraper module. Convention: one source per module under ``scrape.sources.``. Each module is independently runnable via ``python -m scrape.sources.`` and accepts its own flags — this runner is a convenience shim for CI. Examples: python -m scrape.runner --source bayer_seeds --force python -m scrape.runner --source golden_harvest --limit 20 python -m scrape.runner --all # walk every source in sources.json Anything after the recognized flags is passed through to the source scraper, so: python -m scrape.runner --source bayer_seeds --force --brand dekalb dispatches to ``scrape.sources.bayer_seeds`` with ``--force --brand dekalb`` as argv. Sources whose ``verdict`` in sources.json is anything other than ``"green"`` are skipped by ``--all`` (Beck's products is yellow until the SeedIQ XHR is captured). Pass ``--source becks_products`` to run a yellow source explicitly. """ from __future__ import annotations import argparse import importlib import json import sys from pathlib import Path REPO_ROOT = Path(__file__).resolve().parents[1] SOURCES_JSON = REPO_ROOT / "sources.json" def _load_sources() -> list[dict]: if not SOURCES_JSON.exists(): return [] try: data = json.loads(SOURCES_JSON.read_text()) return data.get("sources", []) if isinstance(data, dict) else data except json.JSONDecodeError: return [] def _run_source(source_id: str, passthrough: list[str]) -> int: mod_name = f"scrape.sources.{source_id}" try: mod = importlib.import_module(mod_name) except ImportError as exc: print(f"runner: no source module {mod_name}: {exc}", file=sys.stderr) return 2 main = getattr(mod, "main", None) if not callable(main): print(f"runner: {mod_name} has no main() entrypoint", file=sys.stderr) return 2 return int(main(passthrough) or 0) def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(prog="scrape.runner") parser.add_argument("--source", help="Source id (matches sources.json)") parser.add_argument("--all", action="store_true", help="Run every GREEN source listed in sources.json") args, passthrough = parser.parse_known_args(argv) if not args.source and not args.all: parser.error("specify --source or --all") sources = _load_sources() if args.all: ids = [s["name"] for s in sources if s.get("verdict") == "green"] if not ids: print("runner: no GREEN sources in sources.json", file=sys.stderr) return 2 else: # If the source isn't registered in sources.json yet, dispatch anyway # so the scraper can be exercised during initial development. ids = [args.source] rc = 0 for sid in ids: print(f"=== scrape.runner: dispatching to {sid} ===") rc |= _run_source(sid, passthrough) return rc if __name__ == "__main__": sys.exit(main())