# seed-mcp/scrape/sources/agrigold_plot_reports.py

"""AgriGold plot-report scraper — cross-vendor yield trials.

AgriGold publishes detailed cross-vendor yield trials at
``agrigold.com/{crop}/performance/{crop}-yield-results``. Each
``plot-id`` is a single head-to-head trial site comparing AgriGold
hybrids against competitors (DEKALB, Pioneer, Dairyland, etc.) on
yield, moisture, test weight, and an "Adj Yield" check-adjusted
yield.

This is the THIRD ``data_type: "trial"`` source in the corpus
(after ``gh_plot_reports`` and ``lg_plot_reports``) — same shape
(per-site cross-vendor), different vendor (AgReliant Genetics /
AgriGold), and the **most metadata-rich** of the three. AgriGold's
detail page includes tillage, previous crop, fungicide, herbicide,
insecticide, irrigation, soil type — fields the others don't
publish.

Listing URLs (one per crop, year-filterable):
  /corn/performance/corn-yield-results?harvestYear={year}
  /soybeans/performance/soybean-yield-results?harvestYear={year}

Detail URL:
  /<crop_url>/performance/<crop_url>-yield-results/{plot_id}

(For soybeans the URL is ``/soybeans/performance/soybean-yield-results/{id}``
- note the singular "soybean" in the path.)

Plot counts by harvest year (corn):
  2025: 408, 2024: 441, 2023: 583, plus 2022 + 2026 (likely sparse)

Initial scrape: 2024+2025 (matching gh_plot_reports baseline).

Output:
  corpus/agrigold_plot_reports/<source_key>.md      LLM-visible body
  corpus/agrigold_plot_reports/<source_key>.json    sidecar metadata

source_key convention: ``agpr-<crop>-<year>-<plot_id>``
e.g. ``agpr-corn-2025-145926``.

CLI:
  python -m scrape.sources.agrigold_plot_reports --limit 5
  python -m scrape.sources.agrigold_plot_reports --crop corn --year 2025
  python -m scrape.sources.agrigold_plot_reports --include-2023 --force
"""

from __future__ import annotations

import argparse
import json
import logging
import math
import os
import random
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import requests
from bs4 import BeautifulSoup

# Version stamp recorded in every sidecar's "scraper_version" field.
SCRAPER_VERSION = "0.1.0"
# User-Agent header set on every RateLimitedSession.
USER_AGENT = "seed-mcp-scraper/0.1 (+https://drawbar.example/contact)"
# Scheme + host that all listing and detail URLs are built on.
BASE = "https://www.agrigold.com"

# Two directory levels above the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
# Corpus root: the CORPUS_ROOT env var when set and non-empty, else <REPO_ROOT>/corpus.
CORPUS_ROOT = Path(os.environ.get("CORPUS_ROOT") or REPO_ROOT / "corpus")
# Directory that receives this source's .md / .json pairs.
CORPUS_DIR = CORPUS_ROOT / "agrigold_plot_reports"

REQ_INTERVAL_SEC = 0.5  # AgriGold pages are HEAVY (~50KB detail, 1MB+ listing)
# Default thread-pool size for detail fetches (overridable via --workers).
DEFAULT_WORKERS = 4

log = logging.getLogger("scrape.agrigold_plot_reports")

# Crop → (URL path segment, yield-results slug). The same slug is used for the
# listing URL and the detail URL; the detail URL just appends "/{plot_id}".
# Corn: /corn/performance/corn-yield-results[/{id}]
# Soybeans: /soybeans/performance/soybean-yield-results[/{id}]  (singular "soybean")
CROPS: dict[str, tuple[str, str]] = {
    "corn":     ("corn",     "corn-yield-results"),
    "soybeans": ("soybeans", "soybean-yield-results"),
}


# --------------------------------------------------------------------- HTTP


class RateLimitedSession:
    """Thread-safe rate-limited requests.Session wrapper.

    Mirrors the primitive in gh_plot_reports.py.

    The pacing state (``_lock``, ``_last_global``, ``_global_interval``) is
    stored on the class, so every instance in the process draws from one
    shared budget: request starts are spaced at least ``_global_interval``
    seconds apart regardless of which instance or thread issues them.
    """

    _lock = threading.Lock()
    _last_global: float = 0.0
    _global_interval: float = REQ_INTERVAL_SEC

    def __init__(self, interval: float = REQ_INTERVAL_SEC) -> None:
        """Create the underlying session and register ``interval``.

        The shared interval only ever grows: a larger ``interval`` raises it
        for all instances, a smaller one is ignored.
        """
        self.s = requests.Session()
        self.s.headers["User-Agent"] = USER_AGENT
        with RateLimitedSession._lock:
            if interval > RateLimitedSession._global_interval:
                RateLimitedSession._global_interval = interval

    def _wait(self) -> None:
        """Block until the shared interval has elapsed since the last request.

        The sleep happens while the class lock is held, so concurrent callers
        queue up behind each other instead of all waking at once.
        """
        with RateLimitedSession._lock:
            delta = time.monotonic() - RateLimitedSession._last_global
            if delta < RateLimitedSession._global_interval:
                time.sleep(RateLimitedSession._global_interval - delta)
            RateLimitedSession._last_global = time.monotonic()

    def request(
        self,
        method: str,
        url: str,
        *,
        max_retries: int = 4,
        timeout: float = 30.0,
        **kw: Any,
    ) -> requests.Response:
        """Issue a rate-limited request, retrying transient failures.

        Network errors (``requests.RequestException``) and HTTP 429 / 5xx
        responses are retried with exponential backoff plus jitter (capped at
        30 s); an all-digit ``Retry-After`` header overrides the backoff.

        Returns:
            The first response that is not 429 / 5xx. If every attempt was
            retryable, the response of the final attempt is returned so the
            caller's ``raise_for_status()`` can report it.

        Raises:
            ValueError: ``max_retries`` is less than 1.
            requests.RequestException: the final attempt failed at the
                network level.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")

        for attempt in range(max_retries):
            is_last = attempt + 1 == max_retries
            self._wait()
            try:
                resp = self.s.request(method, url, timeout=timeout, **kw)
            except requests.RequestException as exc:
                if is_last:
                    # Out of attempts: surface the error from this attempt.
                    raise
                backoff = min(30.0, (2 ** attempt) + random.random())
                log.warning("network error on %s %s: %s — retry in %.1fs",
                            method, url, exc, backoff)
                time.sleep(backoff)
                continue

            retryable = resp.status_code == 429 or 500 <= resp.status_code < 600
            if not retryable:
                return resp
            if is_last:
                # No point sleeping when nothing will be retried afterwards.
                log.warning("HTTP %d on %s %s — giving up after %d attempts",
                            resp.status_code, method, url, max_retries)
                return resp

            ra = resp.headers.get("Retry-After")
            backoff = float(ra) if (ra and ra.isdigit()) else min(30.0, (2 ** attempt) + random.random())
            log.warning("HTTP %d on %s %s — retry in %.1fs",
                        resp.status_code, method, url, backoff)
            time.sleep(backoff)

        # Every loop iteration returns, raises, or continues to a later one,
        # and the final iteration always returns or raises.
        raise RuntimeError("unreachable: retry loop exited without a result")

    def get(self, url: str, **kw: Any) -> requests.Response:
        """Shorthand for ``request("GET", url, **kw)``."""
        return self.request("GET", url, **kw)


# --------------------------------------------------------------------- model


@dataclass
class TrialResult:
    """One product row from a plot's comparison table."""

    rank: int | None = None
    brand: str = ""
    product: str = ""
    traits: str = ""
    # Columns: Ck, H20 (moisture %), Test Wt., Yield, Adj Yield
    # Keyed by the table's header text. fetch_plot_detail stores a float when
    # the cell parses as a number, the raw cell text otherwise, and None for
    # blank or "-" cells.
    metrics: dict[str, float | str | None] = field(default_factory=dict)


@dataclass
class PlotReport:
    """Everything scraped from one plot detail page.

    The first five fields identify the plot and are required; the rest
    default to None / empty and are filled in by fetch_plot_detail when the
    corresponding element is present on the page.
    """

    source_key: str
    source_url: str
    crop: str           # "corn" / "soybeans"
    year: int
    plot_id: str

    city: str | None = None
    state_abbrev: str | None = None     # stored lower-case by fetch_plot_detail
    county: str | None = None
    cooperator: str | None = None
    plot_average: float | None = None  # whole-plot mean yield

    # Plot management details (AgriGold publishes more of these
    # than GH or LG — useful for agronomic comparison queries).
    planted_date: str | None = None     # ISO date
    harvested_date: str | None = None   # ISO date
    population: int | None = None
    fungicide: str | None = None
    soil_type: str | None = None
    tillage: str | None = None
    herbicide: str | None = None
    insecticide: str | None = None
    row_width_in: str | None = None      # kept as string ("30.0\"")
    num_rows: int | None = None
    previous_crop: str | None = None
    irrigation: str | None = None

    # Comparison-table rows, in page order.
    results: list[TrialResult] = field(default_factory=list)


# --------------------------------------------------------------------- discovery


def discover_plots(
    http: RateLimitedSession,
    *,
    crops: set[str],
    years: set[int],
) -> list[tuple[str, int, str]]:
    """Collect plot ids from the listing page of every (crop, year) pair.

    Crops and years are visited in sorted order; crops missing from ``CROPS``
    are logged and skipped. Returns ``[(crop, year, plot_id), ...]`` with the
    ids of each listing deduplicated and sorted (as strings). An HTTP error
    status on any listing page propagates via ``raise_for_status()``.
    """
    found: list[tuple[str, int, str]] = []
    for crop_name in sorted(crops):
        spec = CROPS.get(crop_name)
        if spec is None:
            log.warning("unknown crop %r, skipping", crop_name)
            continue
        segment, slug = spec
        for yr in sorted(years):
            listing_url = f"{BASE}/{segment}/performance/{slug}?harvestYear={yr}"
            log.info("GET %s", listing_url)
            resp = http.get(listing_url)
            resp.raise_for_status()
            # data-plotid="123456" appears 5x per plot, hence the set.
            plot_ids = sorted(set(re.findall(r'data-plotid="(\d+)"', resp.text)))
            log.info("  %s %d: %d unique plot ids", crop_name, yr, len(plot_ids))
            found.extend((crop_name, yr, pid) for pid in plot_ids)
    return found


# --------------------------------------------------------------------- helpers


def source_key_for(crop: str, year: int, plot_id: str) -> str:
    """Build the corpus key ``agpr-<crop>-<year>-<plot_id>`` for one plot."""
    return "agpr-{}-{}-{}".format(crop, year, plot_id)


# "City, ST" splitter — e.g. "Erie, IL", "Cottage Hill , KS".
# Group 1 is the city (whitespace around the comma is tolerated); group 2 is
# the trailing two upper-case letters. Text without that ", XX" ending does
# not match.
_CITY_STATE_RE = re.compile(r"^(.*?)\s*,\s*([A-Z]{2})\s*$")


def _parse_int(s: str | None) -> int | None:
    if not s:
        return None
    s = re.sub(r"[,$]", "", str(s).strip())
    try:
        return int(s)
    except ValueError:
        return None


def _parse_float(s: str | None) -> float | None:
    if not s:
        return None
    s = re.sub(r"[,$]", "", str(s).strip())
    try:
        return float(s)
    except ValueError:
        return None


def _parse_date_slash(s: str | None) -> str | None:
    """``05/10/25`` → ``2025-05-10``. 2-digit year → 20xx."""
    if not s:
        return None
    s = s.strip()
    m = re.match(r"^(\d{1,2})/(\d{1,2})/(\d{2,4})$", s)
    if not m:
        return None
    mo, dy, yr = m.group(1), m.group(2), m.group(3)
    if len(yr) == 2:
        yr = "20" + yr
    try:
        return f"{int(yr):04d}-{int(mo):02d}-{int(dy):02d}"
    except ValueError:
        return None


def _detail_pairs(soup: BeautifulSoup, container_class: str) -> dict[str, str]:
    """Map label text → value text for each ``.detail-item`` in a container.

    Looks up the first ``div`` carrying ``container_class``; returns an empty
    dict when it is absent. Items lacking a label or value ``div``, or whose
    label or value text is empty, are left out. A repeated label keeps the
    last value seen.
    """
    box = soup.find("div", class_=container_class)
    if box is None:
        return {}

    pairs: dict[str, str] = {}
    for entry in box.find_all("div", class_="detail-item"):
        label_node = entry.find("div", class_="label")
        value_node = entry.find("div", class_="value")
        if label_node is not None and value_node is not None:
            key = label_node.get_text(" ", strip=True)
            text = value_node.get_text(" ", strip=True)
            if key and text:
                pairs[key] = text
    return pairs


# --------------------------------------------------------------------- detail


def _leading_number(text: str) -> float | None:
    """Parse ``text`` as a float, tolerating trailing non-numeric text (e.g. units)."""
    whole = _parse_float(text)
    if whole is not None:
        return whole
    m = re.match(r"\s*([-+]?[\d,]*\.?\d+)", text)
    return _parse_float(m.group(1)) if m else None


def _cell_text(cells: list[str], index: int | None) -> str:
    """Return ``cells[index]``, or ``""`` when the column is absent or out of range."""
    if index is None or not 0 <= index < len(cells):
        return ""
    return cells[index]


def _fill_header(soup: BeautifulSoup, prod: PlotReport) -> None:
    """Copy plot average, city/state, county and cooperator from the header block."""
    header = soup.find("div", class_="details-header")
    if header is None:
        return

    avg_el = header.find("div", class_="plot-average")
    if avg_el is not None:
        # Drop the inner "Plot Average" label so only the figure remains,
        # then read the leading number (anything after it is ignored).
        label_inside = avg_el.find("div", class_="label")
        if label_inside is not None:
            label_inside.extract()
        prod.plot_average = _leading_number(avg_el.get_text(" ", strip=True))

    cs_el = header.find("div", class_="city-state")
    if cs_el is not None:
        cs = cs_el.get_text(" ", strip=True)
        m = _CITY_STATE_RE.match(cs)
        if m:
            prod.city = m.group(1).strip()
            prod.state_abbrev = m.group(2).strip().lower()
        else:
            # No ", ST" suffix: keep the whole text as the city.
            prod.city = cs.strip()

    county_el = header.find("div", class_="county")
    if county_el is not None:
        prod.county = county_el.get_text(" ", strip=True)
    coop_el = header.find("div", class_="coorperator")  # site's typo, not ours
    if coop_el is not None:
        prod.cooperator = coop_el.get_text(" ", strip=True)


def _fill_plot_details(soup: BeautifulSoup, prod: PlotReport) -> None:
    """Copy the "Plot Details" label/value pairs onto ``prod``."""
    details = _detail_pairs(soup, "plot-details")
    prod.planted_date = _parse_date_slash(details.get("Planting Date"))
    prod.harvested_date = _parse_date_slash(details.get("Harvest Date"))
    prod.population = _parse_int(details.get("Planting Population"))
    prod.fungicide = details.get("Fungicide") or None
    prod.soil_type = details.get("Soil Type") or None
    prod.tillage = details.get("Tillage") or None
    prod.herbicide = details.get("Herbicide") or None
    prod.insecticide = details.get("Insecticide") or None
    prod.row_width_in = details.get("Row Width") or None
    prod.num_rows = _parse_int(details.get("Number Of Rows"))
    prod.previous_crop = details.get("Previous Crop") or None
    prod.irrigation = details.get("Irrigation") or None


def _parse_results_table(soup: BeautifulSoup) -> list[TrialResult]:
    """Read the ``table.plot-rows`` comparison table into TrialResult rows."""
    table = soup.find("table", class_="plot-rows")
    if table is None:
        return []

    # Header cells — expected to be Rank, Brand, Product, Trait, Ck, H20,
    # Test Wt., Yield, Adj Yield, but read from the DOM in case that shifts.
    header_cells: list[str] = []
    thead = table.find("thead")
    if thead is not None:
        for th in thead.find_all("th"):
            wrap = th.find("div", class_="th-wrapper")
            source = wrap if wrap is not None else th
            header_cells.append(source.get_text(" ", strip=True).strip())
    lowered = [h.lower() for h in header_cells]

    def find_col(*names: str) -> int | None:
        # First matching name wins; within a name, the first matching column.
        for n in names:
            if n.lower() in lowered:
                return lowered.index(n.lower())
        return None

    i_rank = find_col("Rank")
    i_brand = find_col("Brand")
    i_product = find_col("Product")
    i_trait = find_col("Trait", "Traits")
    identity_cols = {i_rank, i_brand, i_product, i_trait}
    # Every other non-empty header is treated as a metric column.
    metric_cols = [
        (h, i) for i, h in enumerate(header_cells)
        if h and i not in identity_cols
    ]

    tbody = table.find("tbody")
    if tbody is None:
        return []

    results: list[TrialResult] = []
    for row in tbody.find_all("tr"):
        cls = row.get("class") or []
        # Skip CK AVERAGE and PLOT AVERAGE summary rows
        if "check-averages" in cls or "plot-averages" in cls:
            continue
        cells = [c.get_text(" ", strip=True) for c in row.find_all("td")]
        if len(cells) < 4:
            continue

        metrics: dict[str, float | str | None] = {}
        for name, idx in metric_cols:
            raw = _cell_text(cells, idx).strip()
            if not raw or raw == "-":
                metrics[name] = None
                continue
            number = _parse_float(raw)
            # Keep non-numeric cell text verbatim rather than dropping it.
            metrics[name] = number if number is not None else raw

        result = TrialResult(
            rank=_parse_int(_cell_text(cells, i_rank)),
            brand=_cell_text(cells, i_brand).strip(),
            product=_cell_text(cells, i_product).strip(),
            traits=_cell_text(cells, i_trait).strip(),
            metrics=metrics,
        )
        # Drop rows that carry no identifying text and no metric at all.
        if result.brand or result.product or any(v is not None for v in metrics.values()):
            results.append(result)
    return results


def fetch_plot_detail(
    http: RateLimitedSession,
    *,
    crop: str,
    year: int,
    plot_id: str,
) -> PlotReport | None:
    """Fetch and parse one plot detail page.

    Args:
        http: session used for the GET.
        crop: key into ``CROPS`` (a ``KeyError`` propagates for unknown crops).
        year: harvest year; recorded on the report and in the source key,
            not part of the detail URL.
        plot_id: site plot id appended to the detail URL.

    Returns:
        The populated ``PlotReport``, or ``None`` when the page answers 404.
        Missing page sections simply leave the matching fields at their
        defaults.

    Raises:
        requests.HTTPError: any other error status, via ``raise_for_status()``.
    """
    url_seg, listing_slug = CROPS[crop]
    detail_url = f"{BASE}/{url_seg}/performance/{listing_slug}/{plot_id}"
    r = http.get(detail_url)
    if r.status_code == 404:
        return None
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    prod = PlotReport(
        source_key=source_key_for(crop, year, plot_id),
        source_url=detail_url,
        crop=crop,
        year=year,
        plot_id=str(plot_id),
    )
    _fill_header(soup, prod)
    _fill_plot_details(soup, prod)
    prod.results = _parse_results_table(soup)
    return prod


# --------------------------------------------------------------------- render


def _md_cell(text: str) -> str:
    """Make ``text`` safe for one Markdown table cell (escape ``|``, flatten newlines)."""
    return text.replace("|", "\\|").replace("\r", " ").replace("\n", " ")


def render_markdown(p: PlotReport) -> str:
    """Render a plot report as the LLM-visible Markdown body.

    Layout: an H1 title, a bullet list of plot metadata (empty fields are
    omitted; fungicide / herbicide / insecticide are also omitted when they
    read "N/A"), a ``---`` rule, then — when the report has results — a
    results table followed by a one-line "Top 5" summary.

    The header and body are joined as a single list of lines so that the
    blank line after the ``---`` rule is preserved, and scraped table cells
    are escaped so a ``|`` in a brand, product or trait cannot split a row.
    """
    crop_label = {"corn": "Corn", "soybeans": "Soybean"}.get(p.crop, p.crop.title())
    state = p.state_abbrev.upper() if p.state_abbrev else None
    where = ", ".join(x for x in (p.city, state) if x) or "?"

    def unless_na(value: str | None) -> str | None:
        return value if value and value.upper() != "N/A" else None

    # (label, rendered value); a None value means "leave the bullet out".
    # Year, Plot ID and URL are always emitted.
    bullets: list[tuple[str, str | None]] = [
        ("State", state),
        ("County", p.county or None),
        ("City", p.city or None),
        ("Year", f"{p.year}"),
        ("Plot ID", f"{p.plot_id}"),
        ("Cooperator", p.cooperator or None),
        # AgriGold publishes BU/Ac for both corn and soy
        ("Plot average", f"{p.plot_average} BU/Ac" if p.plot_average is not None else None),
        ("Planted", p.planted_date or None),
        ("Harvested", p.harvested_date or None),
        ("Population", f"{p.population:,} seeds/acre" if p.population else None),
        ("Row width", p.row_width_in or None),
        ("# Rows", f"{p.num_rows}" if p.num_rows else None),
        ("Soil type", p.soil_type or None),
        ("Tillage", p.tillage or None),
        ("Previous crop", p.previous_crop or None),
        ("Irrigation", p.irrigation or None),
        ("Fungicide", unless_na(p.fungicide)),
        ("Herbicide", unless_na(p.herbicide)),
        ("Insecticide", unless_na(p.insecticide)),
        ("URL", f"{p.source_url}"),
    ]

    lines: list[str] = [
        f"# {crop_label} yield trial — {where}, {p.year}",
        "",
        "- **Source:** AgriGold plot report (cross-vendor head-to-head)",
        "- **Vendor:** AgReliant Genetics / AgriGold",
        f"- **Crop:** {crop_label}",
    ]
    lines.extend(
        f"- **{label}:** {value}" for label, value in bullets if value is not None
    )
    lines += ["", "---", ""]

    if p.results:
        # Metric columns in first-seen order across all rows.
        metric_keys = list(dict.fromkeys(k for r in p.results for k in r.metrics))

        headers = ["Rank", "Brand", "Product", "Trait", *(_md_cell(k) for k in metric_keys)]
        lines.append("## Results (by rank)")
        lines.append("")
        lines.append("| " + " | ".join(headers) + " |")
        lines.append("|" + "|".join(["---"] * len(headers)) + "|")
        for r in p.results:
            row = [
                str(r.rank) if r.rank is not None else "-",
                _md_cell(r.brand) or "-",
                _md_cell(r.product) or "-",
                _md_cell(r.traits) or "-",
            ]
            for k in metric_keys:
                v = r.metrics.get(k)
                row.append("-" if v is None else _md_cell(str(v)))
            lines.append("| " + " | ".join(row) + " |")
        lines.append("")

        # Compact summary line for embedder signal — top 5 by Yield
        # (or by the first metric column when there is no "Yield").
        primary = "Yield" if "Yield" in metric_keys else (metric_keys[0] if metric_keys else None)
        if primary:
            numeric_rows = (
                r for r in p.results if isinstance(r.metrics.get(primary), (int, float))
            )
            # reverse=True keeps ties in page order, like a negated key would.
            top = sorted(numeric_rows, key=lambda r: r.metrics[primary], reverse=True)[:5]
            if top:
                bits = [f"{r.product} ({r.brand}) {r.metrics[primary]}" for r in top]
                lines.append(f"Top 5 by {primary}: " + ", ".join(bits) + ".")
                lines.append("")

    return "\n".join(lines)


# --------------------------------------------------------------------- write


def _write_text_atomic(path: Path, text: str) -> None:
    """Write ``text`` to a sibling ``.tmp`` file, then rename it over ``path``."""
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text, encoding="utf-8")
    tmp.replace(path)


def write_plot(p: PlotReport, body_md: str) -> None:
    """Persist one plot as ``<source_key>.json`` (sidecar) + ``<source_key>.md``.

    ``process_plot`` treats an existing ``.md`` file as "already scraped", so
    the Markdown is written last and both files go through a temp-file
    rename. A failure part-way through therefore never leaves a ``.md``
    without its sidecar, nor a truncated file under the final name. The
    sidecar is serialized before anything touches the disk, so a
    serialization error writes nothing.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)
    md_path = CORPUS_DIR / f"{p.source_key}.md"
    json_path = CORPUS_DIR / f"{p.source_key}.json"

    sidecar = {
        "source": "agrigold_plot_reports",
        "source_key": p.source_key,
        "data_type": "trial",
        "vendor": "AgReliant Genetics",
        "brand": "AgriGold",
        "crop": p.crop,
        "state": p.state_abbrev.upper() if p.state_abbrev else None,
        "state_abbrev": p.state_abbrev,
        "city": p.city,
        "county": p.county,
        "year": p.year,
        "plot_id": p.plot_id,
        "cooperator": p.cooperator,
        "plot_average": p.plot_average,
        "planted_date": p.planted_date,
        "harvested_date": p.harvested_date,
        "population_seeds_per_acre": p.population,
        "row_width": p.row_width_in,
        "num_rows": p.num_rows,
        "soil_type": p.soil_type,
        "tillage": p.tillage,
        "previous_crop": p.previous_crop,
        "irrigation": p.irrigation,
        "fungicide": p.fungicide,
        "herbicide": p.herbicide,
        "insecticide": p.insecticide,
        "results": [
            {
                "rank": r.rank,
                "brand": r.brand,
                "product": r.product,
                "traits": r.traits,
                "metrics": r.metrics,
            }
            for r in p.results
        ],
        "n_results": len(p.results),
        "source_urls": [p.source_url],
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "scraper_version": SCRAPER_VERSION,
    }
    payload = json.dumps(sidecar, indent=2, ensure_ascii=False) + "\n"

    # Sidecar first, Markdown (the completion marker) last.
    _write_text_atomic(json_path, payload)
    _write_text_atomic(md_path, body_md)


# --------------------------------------------------------------------- pipeline


def process_plot(
    http: RateLimitedSession,
    *,
    crop: str,
    year: int,
    plot_id: str,
    force: bool,
) -> tuple[str, PlotReport | None]:
    """Fetch, render and write a single plot.

    Returns ``(status, report)`` where status is one of:
      - ``"skipped"``: the ``.md`` file already exists and ``force`` is false
      - ``"failed"``: the detail fetch raised (the error is logged)
      - ``"missing"``: the detail page returned 404
      - ``"written"``: both output files were written

    The report is only returned with ``"written"``. Exceptions raised while
    rendering or writing are not caught here.
    """
    key = source_key_for(crop, year, plot_id)
    already_on_disk = (CORPUS_DIR / f"{key}.md").exists()
    if already_on_disk and not force:
        return "skipped", None

    try:
        report = fetch_plot_detail(http, crop=crop, year=year, plot_id=plot_id)
    except Exception as exc:  # noqa: BLE001
        log.error("detail fetch failed for %s/%s: %s", crop, plot_id, exc)
        return "failed", None

    if report is None:
        return "missing", None

    write_plot(report, render_markdown(report))
    return "written", report


def run(
    *,
    limit: int | None,
    force: bool,
    only_crop: str | None,
    only_year: int | None,
    include_2023: bool,
    workers: int = DEFAULT_WORKERS,
) -> int:
    """Discover plots, then fetch and write them on a thread pool.

    Scope: ``only_crop`` / ``only_year`` narrow to a single crop / year.
    Without ``only_year`` the years are 2024 and 2025, plus 2023 when
    ``include_2023`` is set. ``limit`` truncates the discovered target list
    before any detail page is fetched.

    Returns 0 when no plot ended as "failed", otherwise 1.
    """
    CORPUS_DIR.mkdir(parents=True, exist_ok=True)

    crops = {only_crop} if only_crop else set(CROPS.keys())
    if only_year:
        years = {only_year}
    else:
        years = {2024, 2025}
        if include_2023:
            years.add(2023)

    targets = discover_plots(RateLimitedSession(), crops=crops, years=years)
    log.info("discovered %d total plot targets", len(targets))
    if limit is not None:
        targets = targets[:limit]
    total = len(targets)

    tally = {"written": 0, "skipped": 0, "missing": 0, "failed": 0}
    tally_lock = threading.Lock()
    done = 0

    # One RateLimitedSession per worker thread, created on first use.
    per_thread = threading.local()

    def _session() -> RateLimitedSession:
        try:
            existing = per_thread.session
        except AttributeError:
            existing = None
        if existing is None:
            existing = RateLimitedSession()
            per_thread.session = existing
        return existing

    def _worker(target: tuple[str, int, str]) -> tuple[str, Any]:
        t_crop, t_year, t_plot_id = target
        return process_plot(
            _session(), crop=t_crop, year=t_year, plot_id=t_plot_id, force=force,
        )

    log.info(
        "dispatching %d plots across %d workers (shared rate limiter %.2f sec/req)",
        total, workers, REQ_INTERVAL_SEC,
    )
    with ThreadPoolExecutor(max_workers=workers) as pool:
        pending = {pool.submit(_worker, t): t for t in targets}
        for fut in as_completed(pending):
            crop, year, plot_id = pending[fut]
            try:
                status, report = fut.result()
            except Exception as exc:  # noqa: BLE001
                log.error("worker failed for %s/%s/%s: %s", crop, year, plot_id, exc)
                status, report = "failed", None

            with tally_lock:
                tally[status] = tally.get(status, 0) + 1
                done += 1
                n = done

            # Progress line for: the first few written plots, every 100th
            # completion, and every failure.
            noteworthy = (report is not None and n <= 5) or n % 100 == 0 or status == "failed"
            if noteworthy:
                n_results = len(report.results) if report else 0
                state = report.state_abbrev.upper() if report and report.state_abbrev else "-"
                log.info(
                    "[%d/%d] %s %s | results=%d state=%s",
                    n, total,
                    source_key_for(crop, year, plot_id), status,
                    n_results,
                    state,
                )

    log.info(
        "done: processed=%d written=%d skipped=%d missing=%d failed=%d (of %d candidates)",
        done, tally["written"], tally["skipped"],
        tally["missing"], tally["failed"], total,
    )
    return 1 if tally["failed"] else 0


# --------------------------------------------------------------------- CLI


def _build_argparser() -> argparse.ArgumentParser:
    """Build the command-line parser for this scraper."""
    parser = argparse.ArgumentParser(
        prog="scrape.sources.agrigold_plot_reports",
        description="Scrape AgriGold cross-vendor plot reports (yield trials).",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Stop after processing N plots (default: all).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-fetch even if the markdown file already exists.",
    )
    parser.add_argument(
        "--crop",
        default=None,
        choices=tuple(CROPS),
        help="Limit to one crop.",
    )
    parser.add_argument(
        "--year",
        type=int,
        default=None,
        choices=(2022, 2023, 2024, 2025, 2026),
        help="Limit to one year.",
    )
    parser.add_argument(
        "--include-2023",
        action="store_true",
        help="Include 2023 plot reports (default: 2024-2025 only).",
    )
    workers_help = (
        f"Concurrent worker threads (default {DEFAULT_WORKERS}, "
        f"all share a global {REQ_INTERVAL_SEC}-sec rate limiter)."
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=DEFAULT_WORKERS,
        help=workers_help,
    )
    parser.add_argument(
        "--log-level",
        default=os.environ.get("LOG_LEVEL", "INFO"),
    )
    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse ``argv``, configure logging to stderr, and run the scraper.

    Returns the status code produced by ``run``.
    """
    ns = _build_argparser().parse_args(argv)

    logging.basicConfig(
        stream=sys.stderr,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        level=ns.log_level.upper(),
    )

    options = {
        "limit": ns.limit,
        "force": ns.force,
        "only_crop": ns.crop,
        "only_year": ns.year,
        "include_2023": ns.include_2023,
        "workers": ns.workers,
    }
    return run(**options)


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())