# zroc/zroc-planner/server.py

"""
HTTP server for the zROC Planner vCenter metrics collector.

Exposes:
  GET /metrics  — Prometheus text exposition format
  GET /health   — JSON health-check, also served at /healthz and /ready;
                  200 when healthy, 503 when degraded (used by Docker
                  HEALTHCHECK and load balancers)
"""

import json
import logging
import signal
import sys
import time
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Optional

import config
import collector as col

log = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Prometheus text format renderer
# ---------------------------------------------------------------------------

# Prometheus metadata for every exported per-VM metric, keyed by metric name.
# Each value is a (TYPE, HELP text) pair used for the "# TYPE" / "# HELP" lines.
_METRIC_HELP = {
    name: (mtype, mhelp)
    for name, mtype, mhelp in (
        (
            "vcenter_vm_disk_write_iops",
            "gauge",
            "Virtual disk write IOPS (numberWriteAveraged.average, sum across all disk instances)",
        ),
        (
            "vcenter_vm_disk_write_throughput_mbps",
            "gauge",
            "Virtual disk write throughput in MB/s (write.average, sum across all disk instances)",
        ),
        (
            "vcenter_vm_disk_write_latency_ms",
            "gauge",
            "Virtual disk write latency in milliseconds (totalWriteLatency.average, mean across disk instances)",
        ),
        (
            "vcenter_vm_disk_provisioned_gb",
            "gauge",
            "Total provisioned virtual disk capacity in GB (sum of all VirtualDisk devices)",
        ),
    )
}

# Collector metric key -> exported Prometheus metric name.
# Keys absent from this table are not exported by _build_prometheus_output().
_METRIC_NAME_MAP = {
    "disk_write_iops": "vcenter_vm_disk_write_iops",
    "disk_write_throughput": "vcenter_vm_disk_write_throughput_mbps",
    "disk_write_latency": "vcenter_vm_disk_write_latency_ms",
    "disk_provisioned_gb": "vcenter_vm_disk_provisioned_gb",
}


def _escape_label_value(v: str) -> str:
    """Escape *v* for use inside a double-quoted Prometheus label value.

    A backslash is doubled, a double quote gets a leading backslash, and a
    line feed becomes the two characters backslash + ``n``. Every other
    character is passed through unchanged.
    """
    # Single pass: each input character is mapped independently, so the
    # backslashes introduced by one replacement are never escaped again.
    return v.translate(
        {
            ord("\\"): "\\\\",
            ord('"'): '\\"',
            ord("\n"): "\\n",
        }
    )


def _render_labels(labels: dict) -> str:
    """Render *labels* as a Prometheus label set, e.g. ``{a="1",b="x"}``.

    Values are converted with ``str()`` and escaped; label names are emitted
    as given. Pairs appear in the dict's iteration order. An empty dict
    renders as ``{}``.
    """
    body = ",".join(
        '{}="{}"'.format(name, _escape_label_value(str(raw)))
        for name, raw in labels.items()
    )
    return "{" + body + "}"


def _build_prometheus_output() -> str:
    """Render the current collector state in Prometheus text exposition format.

    Takes one snapshot of ``col.store`` and emits every per-VM metric whose
    collector key appears in ``_METRIC_NAME_MAP``, followed by the collector's
    own self-metrics.

    The exposition format requires all sample lines of a metric to form one
    contiguous group directly after its ``# HELP`` / ``# TYPE`` lines, so the
    samples are bucketed by metric name first and written family by family
    instead of VM by VM.

    Returns:
        The complete response body, terminated by a newline.
    """
    snapshot = col.store.snapshot()

    # Prometheus metric name -> its sample lines, in order of first appearance.
    families: dict[str, list[str]] = {}
    for entry in snapshot.values():
        label_str = _render_labels(entry["labels"])

        for col_key, value in entry["metrics"].items():
            prom_name = _METRIC_NAME_MAP.get(col_key)
            if prom_name is None:
                # Collector key that is not exported.
                continue
            if value is None:
                # No reading for this VM; a missing sample is better than
                # failing the whole scrape on the float format below.
                continue
            families.setdefault(prom_name, []).append(
                f"{prom_name}{label_str} {value:.4f}"
            )

    lines: list[str] = []
    for prom_name, samples in families.items():
        mtype, mhelp = _METRIC_HELP[prom_name]
        lines.append(f"# HELP {prom_name} {mhelp}")
        lines.append(f"# TYPE {prom_name} {mtype}")
        lines.extend(samples)

    # Collector self-metrics
    lines += [
        "# HELP vcenter_collector_last_collection_timestamp_seconds Unix timestamp of the last completed collection cycle",
        "# TYPE vcenter_collector_last_collection_timestamp_seconds gauge",
    ]
    ts = col.store.last_collection_time
    lines.append(f"vcenter_collector_last_collection_timestamp_seconds {ts or 0:.0f}")

    lines += [
        "# HELP vcenter_collector_last_collection_duration_seconds Duration of the last collection cycle in seconds",
        "# TYPE vcenter_collector_last_collection_duration_seconds gauge",
    ]
    # Same "or 0" guard as the timestamp above, in case no cycle has finished yet.
    duration = col.store.last_collection_duration
    lines.append(f"vcenter_collector_last_collection_duration_seconds {duration or 0:.3f}")

    lines += [
        "# HELP vcenter_collector_last_vm_count Number of VMs collected in the last cycle",
        "# TYPE vcenter_collector_last_vm_count gauge",
    ]
    lines.append(f"vcenter_collector_last_vm_count {col.store.last_vm_count}")

    lines += [
        "# HELP vcenter_collector_cycles_total Total number of completed collection cycles",
        "# TYPE vcenter_collector_cycles_total counter",
    ]
    lines.append(f"vcenter_collector_cycles_total {col.store.collection_cycles}")

    return "\n".join(lines) + "\n"


# ---------------------------------------------------------------------------
# HTTP handler
# ---------------------------------------------------------------------------

class _Handler(BaseHTTPRequestHandler):
    """HTTP request handler for the metrics and health endpoints.

    Routes (the query string is ignored):
      /metrics                   — Prometheus text exposition
      /health, /healthz, /ready  — JSON health report
    Any other path gets a plain-text 404.
    """

    def log_message(self, fmt: str, *args) -> None:
        """Forward the built-in access log line to the module logger."""
        # Route HTTP access log through the standard logger at DEBUG level
        log.debug("HTTP %s", fmt % args)

    def do_GET(self) -> None:
        """Dispatch a GET request according to its path."""
        path = self.path.split("?")[0]

        if path == "/metrics":
            self._serve_metrics()
        elif path in ("/health", "/healthz", "/ready"):
            self._serve_health()
        else:
            self._send(404, "text/plain", b"Not Found\n")

    def _serve_metrics(self) -> None:
        """Send the Prometheus exposition, or a 500 if rendering fails."""
        try:
            body = _build_prometheus_output().encode("utf-8")
        except Exception:
            # Request boundary: without this the exception escapes the
            # handler and the scraper gets no HTTP response at all.
            log.exception("Failed to render /metrics")
            self._send(500, "text/plain; charset=utf-8", b"Internal Server Error\n")
            return
        self._send(200, "text/plain; version=0.0.4; charset=utf-8", body)

    def _serve_health(self) -> None:
        """Send the JSON health report: 200 when healthy, 503 otherwise."""
        now = time.time()
        last_ts = col.store.last_collection_time
        last_error = col.store.last_error

        # Unhealthy if we've never collected, or the last collection was more
        # than 3× the poll interval ago (suggests the loop is hung/dead).
        stale_threshold = config.POLL_INTERVAL * 3
        is_stale = last_ts is None or (now - last_ts) > stale_threshold
        healthy = not is_stale and last_error is None

        payload = {
            "status": "ok" if healthy else "degraded",
            "last_collection_time": last_ts,
            "last_collection_duration_seconds": col.store.last_collection_duration,
            "last_vm_count": col.store.last_vm_count,
            "last_error": last_error,
            "collection_cycles": col.store.collection_cycles,
            "stale": is_stale,
        }
        body = json.dumps(payload, indent=2).encode("utf-8")
        status = 200 if healthy else 503
        self._send(status, "application/json", body)

    def _send(self, code: int, content_type: str, body: bytes) -> None:
        """Write a complete response: status line, headers, then *body*."""
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Start the collector and serve HTTP until interrupted.

    Configures logging, starts the background ``VCenterCollector``, installs
    SIGTERM/SIGINT handlers and then blocks in ``serve_forever()``.

    All cleanup happens in one ``finally`` block, so the collector is stopped
    exactly once. That block also runs when the HTTP server cannot be
    created (e.g. the port is already in use); the original exception then
    propagates to the caller.
    """
    config.configure_logging()
    log.info("zROC Planner vCenter Collector starting")
    log.info(
        "Config: vCenter=%s port=%s poll_interval=%ss batch_size=%d",
        config.VCENTER_HOST, config.VCENTER_PORT,
        config.POLL_INTERVAL, config.BATCH_SIZE,
    )

    # Graceful shutdown on SIGTERM / SIGINT
    def _shutdown(signum, frame) -> None:
        log.info("Caught signal %d, shutting down…", signum)
        # SystemExit unwinds serve_forever(); the finally block below does
        # the actual cleanup, so the collector is not stopped twice.
        sys.exit(0)

    # Start the background collection loop
    c = col.VCenterCollector()
    c.start()

    httpd: Optional[HTTPServer] = None
    try:
        signal.signal(signal.SIGTERM, _shutdown)
        signal.signal(signal.SIGINT, _shutdown)

        # Start HTTP server (blocking)
        httpd = HTTPServer((config.HTTP_HOST, config.HTTP_PORT), _Handler)
        log.info("HTTP server listening on %s:%d", config.HTTP_HOST, config.HTTP_PORT)
        log.info("Metrics endpoint: http://%s:%d/metrics", config.HTTP_HOST, config.HTTP_PORT)
        log.info("Health endpoint:  http://%s:%d/health",  config.HTTP_HOST, config.HTTP_PORT)

        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        c.stop()
        if httpd is not None:
            httpd.server_close()
        log.info("Server stopped")


if __name__ == "__main__":
    main()