# seed-mcp/scrape/runner.py

"""Thin dispatcher that routes ``--source <id>`` to the right per-source
scraper module.

Convention: one source per module under ``scrape.sources.<id>``. Each
module is independently runnable via ``python -m scrape.sources.<id>``
and accepts its own flags — this runner is a convenience shim for CI.

Examples:

    python -m scrape.runner --source bayer_seeds --force
    python -m scrape.runner --source golden_harvest --limit 20
    python -m scrape.runner --all          # run every green source in sources.json

Anything after the recognized flags is passed through to the source
scraper, so:

    python -m scrape.runner --source bayer_seeds --force --brand dekalb

dispatches to ``scrape.sources.bayer_seeds`` with ``--force --brand
dekalb`` as argv.

Sources whose ``verdict`` in sources.json is anything other than
``"green"`` are skipped by ``--all`` (Beck's products is yellow until
the SeedIQ XHR is captured). Pass ``--source becks_products`` to run
a yellow source explicitly.
"""

from __future__ import annotations

import argparse
import importlib
import json
import sys
from pathlib import Path

# Two directory levels above this file, i.e. the parent of the directory
# that contains this module (presumably the project root -- confirm if the
# package layout changes).
REPO_ROOT = Path(__file__).resolve().parents[1]

# Registry file consulted by ``--all`` to decide which sources to run.
SOURCES_JSON = REPO_ROOT.joinpath("sources.json")


def _load_sources() -> list[dict]:
    """Return the source entries recorded in ``sources.json``.

    The file may hold either a top-level JSON list of entries or an object
    whose ``"sources"`` key holds that list.

    Loading is best-effort: a missing file yields an empty list silently,
    while an unreadable, undecodable, or malformed file yields an empty
    list *and* a diagnostic on stderr, so a broken registry is not mistaken
    for an empty one.

    Returns:
        The list of entry dicts. Elements that are not JSON objects are
        dropped so callers can rely on dict access.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        data = json.loads(SOURCES_JSON.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"runner: cannot load {SOURCES_JSON}: {exc}", file=sys.stderr)
        return []

    entries = data.get("sources", []) if isinstance(data, dict) else data
    if not isinstance(entries, list):
        print(
            f"runner: {SOURCES_JSON} does not contain a list of sources",
            file=sys.stderr,
        )
        return []
    return [entry for entry in entries if isinstance(entry, dict)]


def _run_source(source_id: str, passthrough: list[str]) -> int:
    """Import ``scrape.sources.<source_id>`` and invoke its ``main()``.

    Args:
        source_id: Module name under the ``scrape.sources`` package.
        passthrough: Argument list handed to the scraper's ``main`` as-is.

    Returns:
        The scraper's return value coerced to ``int`` (a falsy result counts
        as 0). If the scraper exits via ``SystemExit`` the equivalent exit
        status is returned instead of letting the exception propagate, so a
        multi-source run continues with the next source. Returns 2 when the
        module cannot be imported or exposes no callable ``main``.
    """
    mod_name = f"scrape.sources.{source_id}"
    try:
        mod = importlib.import_module(mod_name)
    except ImportError as exc:
        # Tell "the scraper module does not exist" apart from "the scraper
        # exists but one of its own imports failed".
        missing = getattr(exc, "name", None)
        target_missing = (
            isinstance(exc, ModuleNotFoundError)
            and bool(missing)
            and (mod_name == missing or mod_name.startswith(f"{missing}."))
        )
        if target_missing:
            print(f"runner: no source module {mod_name}: {exc}", file=sys.stderr)
        else:
            print(f"runner: failed to import {mod_name}: {exc}", file=sys.stderr)
        return 2

    entry = getattr(mod, "main", None)
    if not callable(entry):
        print(f"runner: {mod_name} has no main() entrypoint", file=sys.stderr)
        return 2

    try:
        result = entry(passthrough)
    except SystemExit as exc:
        # Mirror the interpreter's own SystemExit handling: None -> 0,
        # int -> that status, anything else -> print it and use status 1.
        code = exc.code
        if code is None:
            return 0
        if isinstance(code, int):
            return code
        print(code, file=sys.stderr)
        return 1
    return int(result or 0)


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(prog="scrape.runner")
    parser.add_argument("--source", help="Source id (matches sources.json)")
    parser.add_argument("--all", action="store_true",
                        help="Run every GREEN source listed in sources.json")
    args, passthrough = parser.parse_known_args(argv)

    if not args.source and not args.all:
        parser.error("specify --source <id> or --all")

    sources = _load_sources()
    if args.all:
        ids = [s["name"] for s in sources if s.get("verdict") == "green"]
        if not ids:
            print("runner: no GREEN sources in sources.json", file=sys.stderr)
            return 2
    else:
        # If the source isn't registered in sources.json yet, dispatch anyway
        # so the scraper can be exercised during initial development.
        ids = [args.source]

    rc = 0
    for sid in ids:
        print(f"=== scrape.runner: dispatching to {sid} ===")
        rc |= _run_source(sid, passthrough)
    return rc


# Script entry point: propagate main()'s return code as the process status.
if __name__ == "__main__":
    raise SystemExit(main())