crop-chem-docs/scrape/runner.py

"""Thin dispatcher that routes ``--source <id>`` to the right per-source
scraper module.

For ppls-docs the convention is **one source per scraper module** under
``scrape.sources.<id>``. Each module is independently runnable via
``python -m scrape.sources.<id>`` and accepts its own flags — this
runner is a convenience shim for CI / the weekly refresh workflow.

Examples:

    python -m scrape.runner --source bayer --limit 20
    python -m scrape.runner --source epa_ppls --limit 20
    python -m scrape.runner --all          # walk every source in sources.json

Any argument the runner itself does not recognize is passed through to the
source scraper unchanged, wherever it appears on the command line, so:

    python -m scrape.runner --source bayer --force --product warrant

just dispatches to ``scrape.sources.bayer`` with ``--force --product
warrant`` as argv.
"""

from __future__ import annotations

import argparse
import importlib
import json
import sys
from pathlib import Path

# Parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Registry file read by ``--all`` to discover the source ids to run.
SOURCES_JSON = REPO_ROOT.joinpath("sources.json")


def _load_sources() -> list[dict]:
    """Read the source registry from ``SOURCES_JSON``.

    Returns:
        The dict entries of the file's top-level JSON array, in file order.
        An empty list is returned when the file is missing, unreadable, not
        valid JSON, or not a JSON array. Every case except a missing file is
        reported on stderr so a broken registry is not mistaken for an empty
        one.
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        raw = SOURCES_JSON.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []
    except (OSError, UnicodeDecodeError) as exc:
        print(f"runner: cannot read {SOURCES_JSON}: {exc}", file=sys.stderr)
        return []

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        print(f"runner: {SOURCES_JSON} is not valid JSON: {exc}",
              file=sys.stderr)
        return []

    if not isinstance(data, list):
        print(
            f"runner: {SOURCES_JSON} must contain a JSON array, "
            f"got {type(data).__name__}",
            file=sys.stderr,
        )
        return []
    # Keep only dict entries so callers can safely do key lookups on each one.
    return [entry for entry in data if isinstance(entry, dict)]


def _run_source(source_id: str, passthrough: list[str]) -> int:
    """Import ``scrape.sources.<source_id>`` and call its ``main``.

    Args:
        source_id: Name of the module under ``scrape.sources`` to dispatch to.
        passthrough: Argument list handed to the scraper's ``main``; a copy
            is passed so one scraper cannot alter what the next one receives.

    Returns:
        The scraper's exit status (``None`` counts as 0). Returns 2 when the
        module cannot be imported or exposes no callable ``main``. A
        ``SystemExit`` raised by the scraper is converted to its exit status
        instead of propagating, so a scraper that calls ``sys.exit()`` does
        not terminate the runner.
    """
    mod_name = f"scrape.sources.{source_id}"
    try:
        mod = importlib.import_module(mod_name)
    except ImportError as exc:
        # Tell "this scraper does not exist" apart from "this scraper exists
        # but one of its own imports failed".
        missing = getattr(exc, "name", None)
        not_found = (
            isinstance(exc, ModuleNotFoundError)
            and bool(missing)
            and (mod_name == missing or mod_name.startswith(f"{missing}."))
        )
        if not_found:
            print(f"runner: no source module {mod_name}: {exc}",
                  file=sys.stderr)
        else:
            print(f"runner: failed to import {mod_name}: {exc}",
                  file=sys.stderr)
        return 2

    entry = getattr(mod, "main", None)
    if not callable(entry):
        print(f"runner: {mod_name} has no main() entrypoint", file=sys.stderr)
        return 2

    try:
        result = entry(list(passthrough))
    except SystemExit as exc:
        # Mirror the interpreter's own mapping of SystemExit to a status:
        # None -> 0, int -> that int, anything else -> print it and use 1.
        code = exc.code
        if code is None:
            return 0
        if isinstance(code, int):
            return code
        print(code, file=sys.stderr)
        return 1
    return int(result or 0)


def main(argv: list[str] | None = None) -> int:
    """Parse runner flags and dispatch to one or all source scrapers.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read
            ``sys.argv[1:]``. Arguments the runner does not recognize are
            forwarded unchanged to each scraper.

    Returns:
        0 when every dispatched scraper returned 0, otherwise the exit status
        of the first scraper that failed. Returns 2 when ``--all`` finds no
        source ids. With neither ``--source`` nor ``--all``, argparse reports
        the error and exits.
    """
    # Abbreviations are disabled so a scraper flag that happens to be a prefix
    # of --source/--all is forwarded instead of being consumed here.
    parser = argparse.ArgumentParser(prog="scrape.runner", allow_abbrev=False)
    parser.add_argument("--source", help="Source id (matches sources.json)")
    parser.add_argument("--all", action="store_true",
                        help="Run every source listed in sources.json")
    args, passthrough = parser.parse_known_args(argv)

    if not args.source and not args.all:
        parser.error("specify --source <id> or --all")

    if args.all:
        # The registry is only needed for --all, so it is only read here.
        sources = _load_sources()
        entries = sources if isinstance(sources, list) else []
        ids = [
            entry["id"]
            for entry in entries
            if isinstance(entry, dict) and "id" in entry
        ]
        if not ids:
            print("runner: sources.json is empty or missing", file=sys.stderr)
            return 2
    else:
        # If the source isn't registered in sources.json yet, dispatch anyway
        # so the scraper can be exercised during initial development.
        ids = [args.source]

    rc = 0
    for sid in ids:
        code = _run_source(sid, passthrough)
        # Keep running the remaining sources, but report the first failure's
        # status as-is rather than a bitwise mix of unrelated codes.
        if code and not rc:
            rc = code
    return rc


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())