mirror of
https://github.com/recklessop/zroc.git
synced 2026-07-03 05:23:13 -04:00
0f988fa449
Extend collector to pull total provisioned virtual disk capacity per VM using VirtualDisk device enumeration, and expose it as a Prometheus gauge. Used by the zroc-ui Planner page for mirror volume storage estimates. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
220 lines
7.5 KiB
Python
220 lines
7.5 KiB
Python
"""
|
||
HTTP server for the zROC Planner vCenter metrics collector.
|
||
|
||
Exposes:
|
||
GET /metrics — Prometheus text exposition format
|
||
GET /health — JSON health-check (used by Docker HEALTHCHECK and load balancers)
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import signal
|
||
import sys
|
||
import time
|
||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||
from typing import Optional
|
||
|
||
import config
|
||
import collector as col
|
||
|
||
log = logging.getLogger(__name__)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Prometheus text format renderer
|
||
# ---------------------------------------------------------------------------
|
||
|
||
_METRIC_HELP = {
|
||
"vcenter_vm_disk_write_iops": (
|
||
"gauge",
|
||
"Virtual disk write IOPS (numberWriteAveraged.average, sum across all disk instances)",
|
||
),
|
||
"vcenter_vm_disk_write_throughput_mbps": (
|
||
"gauge",
|
||
"Virtual disk write throughput in MB/s (write.average, sum across all disk instances)",
|
||
),
|
||
"vcenter_vm_disk_write_latency_ms": (
|
||
"gauge",
|
||
"Virtual disk write latency in milliseconds (totalWriteLatency.average, mean across disk instances)",
|
||
),
|
||
"vcenter_vm_disk_provisioned_gb": (
|
||
"gauge",
|
||
"Total provisioned virtual disk capacity in GB (sum of all VirtualDisk devices)",
|
||
),
|
||
}
|
||
|
||
# Maps our collector metric keys to Prometheus metric names
|
||
_METRIC_NAME_MAP = {
|
||
"disk_write_iops": "vcenter_vm_disk_write_iops",
|
||
"disk_write_throughput": "vcenter_vm_disk_write_throughput_mbps",
|
||
"disk_write_latency": "vcenter_vm_disk_write_latency_ms",
|
||
"disk_provisioned_gb": "vcenter_vm_disk_provisioned_gb",
|
||
}
|
||
|
||
|
||
def _escape_label_value(v: str) -> str:
|
||
return v.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
||
|
||
|
||
def _render_labels(labels: dict) -> str:
    """Render *labels* as a Prometheus label set such as ``{a="1",b="2"}``.

    Every value is converted with ``str()`` and escaped; names are written
    as-is. Pairs appear in the mapping's iteration order, and an empty
    mapping renders as ``{}``.
    """
    pairs = (
        f'{name}="{_escape_label_value(str(raw))}"'
        for name, raw in labels.items()
    )
    return "{%s}" % ",".join(pairs)
|
||
|
||
|
||
def _build_prometheus_output() -> str:
    """Render the collector's current state in Prometheus text format.

    The output consists of the per-VM series found in ``col.store.snapshot()``
    (translated through ``_METRIC_NAME_MAP`` and described by
    ``_METRIC_HELP``), followed by the collector's own self-metrics.

    Returns:
        The complete exposition document, terminated by a newline.
    """
    # Samples are bucketed per metric family (in first-seen order) before
    # anything is written. The text exposition format requires all lines of
    # one metric to form a single contiguous group with HELP/TYPE first;
    # writing VM-by-VM would interleave the families as soon as more than one
    # VM is present.
    families: dict[str, list[str]] = {}

    for entry in col.store.snapshot().values():
        label_str = _render_labels(entry["labels"])

        for col_key, value in entry["metrics"].items():
            prom_name = _METRIC_NAME_MAP.get(col_key)
            # Skip collector keys that are not exported, and missing readings:
            # formatting None with ".4f" would fail the entire scrape.
            if prom_name is None or value is None:
                continue
            families.setdefault(prom_name, []).append(
                f"{prom_name}{label_str} {value:.4f}"
            )

    lines: list[str] = []
    for prom_name, samples in families.items():
        mtype, mhelp = _METRIC_HELP[prom_name]
        lines.append(f"# HELP {prom_name} {mhelp}")
        lines.append(f"# TYPE {prom_name} {mtype}")
        lines.extend(samples)

    # Collector self-metrics
    store = col.store
    # No completed collection yet is reported as timestamp 0.
    ts = store.last_collection_time
    lines += [
        "# HELP vcenter_collector_last_collection_timestamp_seconds Unix timestamp of the last completed collection cycle",
        "# TYPE vcenter_collector_last_collection_timestamp_seconds gauge",
        f"vcenter_collector_last_collection_timestamp_seconds {ts or 0:.0f}",
        "# HELP vcenter_collector_last_collection_duration_seconds Duration of the last collection cycle in seconds",
        "# TYPE vcenter_collector_last_collection_duration_seconds gauge",
        f"vcenter_collector_last_collection_duration_seconds {store.last_collection_duration:.3f}",
        "# HELP vcenter_collector_last_vm_count Number of VMs collected in the last cycle",
        "# TYPE vcenter_collector_last_vm_count gauge",
        f"vcenter_collector_last_vm_count {store.last_vm_count}",
        "# HELP vcenter_collector_cycles_total Total number of completed collection cycles",
        "# TYPE vcenter_collector_cycles_total counter",
        f"vcenter_collector_cycles_total {store.collection_cycles}",
    ]

    return "\n".join(lines) + "\n"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# HTTP handler
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class _Handler(BaseHTTPRequestHandler):
    """Request handler for the collector's metrics and health endpoints.

    Routes:
        /metrics                  -> Prometheus text exposition output
        /health, /healthz, /ready -> JSON health report (200 or 503)
        anything else             -> 404
    """

    def log_message(self, fmt: str, *args) -> None:
        """Send the HTTP access log to the module logger at DEBUG level."""
        # Avoid formatting the access-log line when it would be discarded.
        if log.isEnabledFor(logging.DEBUG):
            log.debug("HTTP %s", fmt % args)

    def do_GET(self) -> None:
        """Dispatch a GET request by path; any query string is ignored."""
        path = self.path.split("?")[0]

        if path == "/metrics":
            self._serve_metrics()
        elif path in ("/health", "/healthz", "/ready"):
            self._serve_health()
        else:
            self._send(404, "text/plain", b"Not Found\n")

    def _serve_metrics(self) -> None:
        """Reply with the Prometheus exposition document, or 500 on failure."""
        try:
            body = _build_prometheus_output().encode("utf-8")
        except Exception:
            # Request boundary: without this the exception escapes the handler,
            # the client gets a dropped connection instead of an HTTP status,
            # and the traceback bypasses the configured logger.
            log.exception("Failed to render /metrics")
            self._send(500, "text/plain; charset=utf-8", b"Internal Server Error\n")
            return
        self._send(200, "text/plain; version=0.0.4; charset=utf-8", body)

    def _serve_health(self) -> None:
        """Reply with a JSON health report.

        The status is 200 / ``"ok"`` only when a collection has completed
        within the staleness window and no error is recorded; otherwise it is
        503 / ``"degraded"``.
        """
        now = time.time()
        last_ts = col.store.last_collection_time
        last_error = col.store.last_error

        # Unhealthy if we've never collected, or the last collection was more
        # than 3× the poll interval ago (suggests the loop is hung/dead).
        stale_threshold = config.POLL_INTERVAL * 3
        is_stale = last_ts is None or (now - last_ts) > stale_threshold
        healthy = not is_stale and last_error is None

        # NOTE(review): assumes last_error is None or a JSON-serializable
        # value (e.g. a string) — confirm against the collector.
        payload = {
            "status": "ok" if healthy else "degraded",
            "last_collection_time": last_ts,
            "last_collection_duration_seconds": col.store.last_collection_duration,
            "last_vm_count": col.store.last_vm_count,
            "last_error": last_error,
            "collection_cycles": col.store.collection_cycles,
            "stale": is_stale,
        }
        body = json.dumps(payload, indent=2).encode("utf-8")
        status = 200 if healthy else 503
        self._send(status, "application/json", body)

    def _send(self, code: int, content_type: str, body: bytes) -> None:
        """Write a complete response: status line, headers, then *body*."""
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Entry point
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def main() -> None:
    """Start the background collector and serve HTTP until terminated.

    Configures logging, starts ``col.VCenterCollector``, installs SIGTERM and
    SIGINT handlers, then blocks in ``HTTPServer.serve_forever()``. On the way
    out the collector is stopped and the listening socket is closed.
    """
    config.configure_logging()
    log.info("zROC Planner vCenter Collector starting")
    log.info(
        "Config: vCenter=%s port=%s poll_interval=%ss batch_size=%d",
        config.VCENTER_HOST, config.VCENTER_PORT,
        config.POLL_INTERVAL, config.BATCH_SIZE,
    )

    # Start the background collection loop
    c = col.VCenterCollector()
    c.start()

    # Graceful shutdown on SIGTERM / SIGINT. The handler only requests exit:
    # the SystemExit it raises unwinds through the ``finally`` below, which is
    # the single place where cleanup happens (so stop() is not called twice).
    def _shutdown(signum, frame) -> None:
        log.info("Caught signal %d, shutting down…", signum)
        sys.exit(0)

    httpd: Optional[HTTPServer] = None
    try:
        signal.signal(signal.SIGTERM, _shutdown)
        signal.signal(signal.SIGINT, _shutdown)

        # Created inside the try so that a failure to bind (e.g. port already
        # in use) still stops the collector that was started above.
        httpd = HTTPServer((config.HTTP_HOST, config.HTTP_PORT), _Handler)
        log.info("HTTP server listening on %s:%d", config.HTTP_HOST, config.HTTP_PORT)
        log.info("Metrics endpoint: http://%s:%d/metrics", config.HTTP_HOST, config.HTTP_PORT)
        log.info("Health endpoint: http://%s:%d/health", config.HTTP_HOST, config.HTTP_PORT)

        # Start HTTP server (blocking)
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        c.stop()
        if httpd is not None:
            httpd.server_close()
        log.info("Server stopped")
|
||
|
||
|
||
# Run the server only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|