mirror of
https://github.com/recklessop/Zerto_Exporter.git
synced 2026-07-03 16:03:15 -04:00
2836e1ae10
- Replace file-based metrics and HTTP server with prometheus_client (Gauge + start_http_server), eliminating file I/O race conditions - Fix ThreadProbe and thread restart lambda bugs - Switch logging from RotatingFileHandler to JSON stdout for fluentd/Loki - Add PYTHONUNBUFFERED=1 to Dockerfile for immediate container log output - Upgrade base image from python:3.12.3-slim to python:3.13-slim - Upgrade pyvmomi to 9.0.0.0; pin pydantic>=2.9.0 and typing_extensions>=4.12.2 to use pre-built Python 3.13 wheels (removes Rust toolchain from build) - Remove unused packages: boto3, botocore, s3transfer, jmespath, redis, docopt, pyflakes, posthog - Remove unused imports (Posthog, CaseInsensitiveDict) and dead variables (callhomestats, local_site_info, lastStats) - Fix service_profile() NameError (siteidentifier -> serviceProfileIdentifier) - Remove bare print() in zvma.py __authhandler__, replace with self.log.info() - Remove all commented-out PostHog blocks from zvma10/zvma.py - Delete legacy zvma9_7/ module and app/logs/ directory - Remove deprecated 'version: 3.3' from docker-compose.yml Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
635 lines
38 KiB
Python
635 lines
38 KiB
Python
import requests
|
|
import os
|
|
import sys
|
|
import json
|
|
import logging
|
|
import threading
|
|
import socket
|
|
from time import sleep, time
|
|
from requests.packages.urllib3.exceptions import InsecureRequestWarning
|
|
from tinydb import TinyDB, Query
|
|
from tinydb.storages import MemoryStorage
|
|
from prometheus_client import Gauge, start_http_server
|
|
from version import VERSION
|
|
from vmware.vcenter import vcsite
|
|
from zvma10.zvma import zvmsite
|
|
# Silence urllib3's InsecureRequestWarning for the whole process.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Timestamp taken at import; ThreadProbe subtracts it from time() to report uptime.
start_time = time()

# ---------------------------------------------------------------------------
# Variables: Normally these are imported from the Docker Container, but
# alternative values can be modified if running the script manually
# ---------------------------------------------------------------------------
_env = os.environ.get

# Exporter behaviour
listen_port = int(_env('LISTEN_PORT', 9999))
verifySSL = _env("VERIFY_SSL", 'False').lower() in ('true', '1', 't')

# ZVM connection
zvm_url = _env('ZVM_HOST', '192.168.50.30')
zvm_port = _env('ZVM_PORT', '443')
zvm_username = _env('ZVM_USERNAME', 'admin')
zvm_password = _env('ZVM_PASSWORD', 'Zertodata987!')
client_id = _env('CLIENT_ID', 'zerto-client')
client_secret = _env('CLIENT_SECRET', 'fcYMFuA5TkIUwp6b3hDUxim0f32z8erk')

# Polling / logging
scrape_speed = int(_env('SCRAPE_SPEED', 30))
api_timeout = int(_env('API_TIMEOUT', 5))
LOGLEVEL = _env('LOGLEVEL', 'DEBUG').upper()
DISABLE_STATS = _env('DISABLE_STATS', 'FALSE').upper()
version = str(VERSION)

# vCenter connection (used for VRA CPU / memory usage)
vcenter_host = _env('VCENTER_HOST', '192.168.50.20')
vcenter_user = _env('VCENTER_USER', 'administrator@vsphere.local')
vcenter_pwd = _env('VCENTER_PASSWORD', 'Zertodata987!')
|
|
|
|
# ---------------------------------------------------------------------------
# Prometheus Gauge definitions
# All metrics are served thread-safely at http://host:<LISTEN_PORT>/metrics
# ---------------------------------------------------------------------------

# Encryption / stats metrics (GetStatsFunc)
_STATS_LABELS = ['VpgIdentifier', 'VmIdentifier', 'VmName', 'SiteIdentifier', 'SiteName']


def _stats_gauge(name, documentation):
    """Create a Gauge that carries the _STATS_LABELS label set."""
    return Gauge(name, documentation, _STATS_LABELS)


g_vm_iops_counter = _stats_gauge('vm_IoOperationsCounter', 'VM IO Operations Counter')
g_vm_write_counter = _stats_gauge('vm_WriteCounterInMBs', 'VM Write Counter In MBs')
g_vm_sync_counter = _stats_gauge('vm_SyncCounterInMBs', 'VM Sync Counter In MBs')
g_vm_network_counter = _stats_gauge('vm_NetworkTrafficCounterInMBs', 'VM Network Traffic Counter In MBs')
g_vm_encrypted_lbs = _stats_gauge('vm_EncryptedDataInLBs', 'VM Encrypted Data In LBs')
g_vm_unencrypted_lbs = _stats_gauge('vm_UnencryptedDataInLBs', 'VM Unencrypted Data In LBs')
g_vm_total_lbs = _stats_gauge('vm_TotalDataInLBs', 'VM Total Data In LBs')
g_vm_percent_encrypted = _stats_gauge('vm_PercentEncrypted', 'VM Percent Encrypted')
g_vm_trend_change_level = _stats_gauge('vm_TrendChangeLevel', 'VM Trend Change Level')
|
|
|
|
# VPG metrics (GetDataFunc)
_VPG_LABELS = ['VpgIdentifier', 'VpgName', 'VpgPriority', 'SiteIdentifier', 'SiteName']


def _vpg_gauge(name, documentation):
    """Create a Gauge that carries the _VPG_LABELS label set."""
    return Gauge(name, documentation, _VPG_LABELS)


g_vpg_storage_used = _vpg_gauge('vpg_storage_used_in_mb', 'VPG Storage Used In MB')
g_vpg_actual_rpo = _vpg_gauge('vpg_actual_rpo', 'VPG Actual RPO')
g_vpg_throughput = _vpg_gauge('vpg_throughput_in_mb', 'VPG Throughput In MB')
g_vpg_iops = _vpg_gauge('vpg_iops', 'VPG IOPs')
g_vpg_provisioned_storage = _vpg_gauge('vpg_provisioned_storage_in_mb', 'VPG Provisioned Storage In MB')
g_vpg_vms_count = _vpg_gauge('vpg_vms_count', 'VPG VMs Count')
g_vpg_configured_rpo = _vpg_gauge('vpg_configured_rpo_seconds', 'VPG Configured RPO Seconds')
g_vpg_actual_history = _vpg_gauge('vpg_actual_history_in_minutes', 'VPG Actual History In Minutes')
g_vpg_configured_history = _vpg_gauge('vpg_configured_history_in_minutes', 'VPG Configured History In Minutes')
g_vpg_failsafe_actual = _vpg_gauge('vpg_failsafe_history_in_minutes_actual', 'VPG Failsafe History In Minutes Actual')
g_vpg_failsafe_configured = _vpg_gauge('vpg_failsafe_history_in_minutes_configured', 'VPG Failsafe History In Minutes Configured')
g_vpg_status = _vpg_gauge('vpg_status', 'VPG Status')
g_vpg_substatus = _vpg_gauge('vpg_substatus', 'VPG Sub-Status')
g_vpg_alert_status = _vpg_gauge('vpg_alert_status', 'VPG Alert Status')
|
|
|
|
# Datastore metrics (GetDataFunc)
_DS_LABELS = ['datastoreIdentifier', 'DatastoreName', 'SiteIdentifier', 'SiteName']


def _ds_gauge(name, documentation):
    """Create a Gauge that carries the _DS_LABELS label set."""
    return Gauge(name, documentation, _DS_LABELS)


# Counts reported per datastore
g_ds_vras = _ds_gauge('datastore_vras', 'Datastore VRAs')
g_ds_incoming_vms = _ds_gauge('datastore_incoming_vms', 'Datastore Incoming VMs')
g_ds_outgoing_vms = _ds_gauge('datastore_outgoing_vms', 'Datastore Outgoing VMs')

# Overall datastore usage
g_ds_capacity = _ds_gauge('datastore_usage_capacityinbytes', 'Datastore Capacity In Bytes')
g_ds_free = _ds_gauge('datastore_usage_freeinbytes', 'Datastore Free In Bytes')
g_ds_used = _ds_gauge('datastore_usage_usedinbytes', 'Datastore Used In Bytes')
g_ds_provisioned = _ds_gauge('datastore_usage_provisionedinbytes', 'Datastore Provisioned In Bytes')

# Zerto usage, one used/provisioned pair per category
g_ds_zerto_protected_used = _ds_gauge('datastore_usage_zerto_protected_usedinbytes', 'Datastore Zerto Protected Used In Bytes')
g_ds_zerto_protected_provisioned = _ds_gauge('datastore_usage_zerto_protected_provisionedinbytes', 'Datastore Zerto Protected Provisioned In Bytes')
g_ds_zerto_recovery_used = _ds_gauge('datastore_usage_zerto_recovery_usedinbytes', 'Datastore Zerto Recovery Used In Bytes')
g_ds_zerto_recovery_provisioned = _ds_gauge('datastore_usage_zerto_recovery_provisionedinbytes', 'Datastore Zerto Recovery Provisioned In Bytes')
g_ds_zerto_journal_used = _ds_gauge('datastore_usage_zerto_journal_usedinbytes', 'Datastore Zerto Journal Used In Bytes')
g_ds_zerto_journal_provisioned = _ds_gauge('datastore_usage_zerto_journal_provisionedinbytes', 'Datastore Zerto Journal Provisioned In Bytes')
g_ds_zerto_scratch_used = _ds_gauge('datastore_usage_zerto_scratch_usedinbytes', 'Datastore Zerto Scratch Used In Bytes')
g_ds_zerto_scratch_provisioned = _ds_gauge('datastore_usage_zerto_scratch_provisionedinbytes', 'Datastore Zerto Scratch Provisioned In Bytes')
g_ds_zerto_appliances_used = _ds_gauge('datastore_usage_zerto_appliances_usedinbytes', 'Datastore Zerto Appliances Used In Bytes')
g_ds_zerto_appliances_provisioned = _ds_gauge('datastore_usage_zerto_appliances_provisionedinbytes', 'Datastore Zerto Appliances Provisioned In Bytes')
|
|
|
|
# VM metrics (GetDataFunc - VMs section)
_VM_LABELS = ['VmIdentifier', 'VmName', 'VmRecoveryVRA', 'VmPriority', 'SiteIdentifier', 'VpgName', 'SiteName']


def _vm_gauge(name, documentation):
    """Create a Gauge that carries the _VM_LABELS label set."""
    return Gauge(name, documentation, _VM_LABELS)


g_vm_actualrpo = _vm_gauge('vm_actualrpo', 'VM Actual RPO')
g_vm_throughput = _vm_gauge('vm_throughput_in_mb', 'VM Throughput In MB')
g_vm_iops = _vm_gauge('vm_iops', 'VM IOPs')
g_vm_journal_hard_limit = _vm_gauge('vm_journal_hard_limit', 'VM Journal Hard Limit')
g_vm_journal_warning_limit = _vm_gauge('vm_journal_warning_limit', 'VM Journal Warning Limit')
g_vm_journal_used_storage = _vm_gauge('vm_journal_used_storage_mb', 'VM Journal Used Storage MB')
g_vm_outgoing_bandwidth = _vm_gauge('vm_outgoing_bandwidth_in_mbps', 'VM Outgoing Bandwidth In Mbps')
g_vm_used_storage = _vm_gauge('vm_used_storage_in_MB', 'VM Used Storage In MB')
g_vm_provisioned_storage = _vm_gauge('vm_provisioned_storage_in_MB', 'VM Provisioned Storage In MB')
g_vm_status = _vm_gauge('vm_status', 'VM Status')
g_vm_substatus = _vm_gauge('vm_substatus', 'VM Sub-Status')
|
|
|
|
# Scratch and journal volume metrics (GetDataFunc - Volumes sections)
_VOL_LABELS = ['ProtectedVm', 'ProtectedVmIdentifier', 'OwningVRA', 'VpgName', 'SiteIdentifier', 'SiteName']


def _vol_gauge(name, documentation):
    """Create a Gauge that carries the _VOL_LABELS label set."""
    return Gauge(name, documentation, _VOL_LABELS)


g_scratch_vol_size = _vol_gauge('scratch_volume_size_in_bytes', 'Scratch Volume Size In Bytes')
g_journal_vol_size = _vol_gauge('vm_journal_volume_size_in_bytes', 'VM Journal Volume Size In Bytes')
g_journal_vol_provisioned = _vol_gauge('vm_journal_volume_provisioned_in_bytes', 'VM Journal Volume Provisioned In Bytes')
g_journal_vol_count = _vol_gauge('vm_journal_volume_count', 'VM Journal Volume Count')
|
|
|
|
# VRA metrics (GetVraMetrics)
_VRA_LABELS = ['VraIdentifierStr', 'VraName', 'VraVersion', 'HostVersion', 'SiteIdentifier', 'SiteName']


def _vra_gauge(name, documentation):
    """Create a Gauge that carries the _VRA_LABELS label set."""
    return Gauge(name, documentation, _VRA_LABELS)


g_vra_memory = _vra_gauge('vra_memory_in_GB', 'VRA Memory In GB')
g_vra_vcpu_count = _vra_gauge('vra_vcpu_count', 'VRA vCPU Count')
g_vra_protected_vms = _vra_gauge('vra_protected_vms', 'VRA Protected VMs')
g_vra_protected_vpgs = _vra_gauge('vra_protected_vpgs', 'VRA Protected VPGs')
g_vra_protected_vols = _vra_gauge('vra_protected_volumes', 'VRA Protected Volumes')
g_vra_recovery_vms = _vra_gauge('vra_recovery_vms', 'VRA Recovery VMs')
g_vra_recovery_vpgs = _vra_gauge('vra_recovery_vpgs', 'VRA Recovery VPGs')
g_vra_recovery_vols = _vra_gauge('vra_recovery_volumes', 'VRA Recovery Volumes')
g_vra_self_protected = _vra_gauge('vra_self_protected_vpgs', 'VRA Self-Protected VPGs')
g_vra_cpu_usage = _vra_gauge('vra_cpu_usage_mhz', 'VRA CPU Usage MHz')
g_vra_memory_usage = _vra_gauge('vra_memory_usage_mb', 'VRA Memory Usage MB')
|
|
|
|
# Exporter / thread health metrics (ThreadProbe)
g_exporter_uptime = Gauge(
    'exporter_uptime',
    'Exporter Uptime In Minutes',
    labelnames=['ExporterInstance'],
)
g_thread_status = Gauge(
    'exporter_thread_status',
    'Exporter Thread Status',
    labelnames=['thread', 'ExporterInstance'],
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Thread which gets VM level encryption statistics from ZVM API
|
|
# ---------------------------------------------------------------------------
|
|
def _encryption_delta(current, previous, key):
    """Return |current[key] - previous[key]|, counting a missing or None value as 0."""
    return abs((current.get(key) or 0) - (previous.get(key) or 0))


def _publish_vm_stats(zvm, tempdb, dbvm, dbvpg, vm, vms_encryption_metrics):
    """Diff one VM statistics record against its cached sample and set the stats gauges.

    ``vm`` is mutated: encryption data, ``VmName`` and ``SiteId`` are merged into
    it before it is stored in ``tempdb`` as the sample for the next scrape.
    """
    vmsiteinfo = zvm.vm(vmidentifier=vm['VmIdentifier'], vpgidentifier=vm['VpgIdentifier'])
    if vmsiteinfo is None:
        # Without the VM details the protected site is unknown; skip instead of crashing the thread.
        log.debug(f"No VM details returned, skipping metrics - {vm['VmIdentifier']}")
        return
    if vmsiteinfo['ProtectedSite']['identifier'] != zvm.site_id:
        log.debug(f"VM is only recovering to this site, skipping metrics - {vm['VmIdentifier']}")
        return
    log.debug(f"VM is protected at this site - {vm['VmIdentifier']}")

    # Covers both a missing key and an explicit null value.
    if not vm.get('EncryptionMetrics'):
        vm['EncryptionMetrics'] = {}
    vm['VmName'] = None
    vm['SiteId'] = zvm.site_id

    # Every delta is reported as 0 the first time a VM is seen.
    CurrentIops = 0
    CurrentWriteCounterInMBs = 0
    CurrentSyncCounterInMBs = 0
    CurrentNetworkTrafficCounterInMBs = 0
    CurrentEncryptedLBs = 0
    CurrentUnencryptedLBs = 0
    CurrentTotalLBs = 0
    CurrentPercentEncrypted = 0
    CurrentTrendChangeLevel = 0

    for vmem in vms_encryption_metrics:
        if vmem['Link']['identifier'] == vm['VmIdentifier']:
            log.debug(f"Aligning VM Stats and Encryption Metrics for {vm['VmIdentifier']} - {vmem['Link']['name']}")
            vm['EncryptionMetrics']['EncryptedData'] = vmem['EncryptionMetrics']['EncryptedData']
            vm['EncryptionMetrics']['NonEncryptedData'] = vmem['EncryptionMetrics']['NonEncryptedData']
            vm['EncryptionMetrics']['TrendChangeLevel'] = vmem['EncryptionMetrics']['TrendChangeLevel']
            vm['VmName'] = vmem['Link']['name']

    log.info("Checking TempDB for VM " + vm['VmIdentifier'] + " in VPG " + vm['VpgIdentifier'])
    same_vm = (dbvm.VmIdentifier == vm['VmIdentifier']) & (dbvpg.VpgIdentifier == vm['VpgIdentifier'])
    oldvmdata = tempdb.search(same_vm)
    if oldvmdata:
        previous = oldvmdata[0]
        log.info(vm['VmIdentifier'] + " Record Found, Updating DB")
        log.debug("Old Data")
        log.debug(oldvmdata)
        log.debug(tempdb.update(vm, same_vm))
        log.debug("New Data")
        log.debug(vm)
        log.debug("!@!@!@!@!@ Stats !@!@!@!@!@")
        log.debug("Current VM " + str(previous['VmName']))

        CurrentIops = abs(vm['IoOperationsCounter'] - previous['IoOperationsCounter'])
        log.debug("CurrentIops " + str(CurrentIops))
        CurrentSyncCounterInMBs = abs(vm['SyncCounterInMBs'] - previous['SyncCounterInMBs'])
        log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs))
        CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - previous['NetworkTrafficCounterInMBs'])
        log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs))
        CurrentWriteCounterInMBs = abs(vm['WriteCounterInMBs'] - previous['WriteCounterInMBs'])
        log.debug("CurrentWriteCounterInMBs " + str(CurrentWriteCounterInMBs))

        # A VM may have no encryption record on either sample (no match in the
        # encryption list); the tolerant delta keeps that from raising KeyError.
        new_enc = vm['EncryptionMetrics']
        old_enc = previous.get('EncryptionMetrics') or {}
        CurrentEncryptedLBs = _encryption_delta(new_enc, old_enc, 'EncryptedData')
        log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs))
        CurrentUnencryptedLBs = _encryption_delta(new_enc, old_enc, 'NonEncryptedData')
        log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs))
        CurrentTrendChangeLevel = _encryption_delta(new_enc, old_enc, 'TrendChangeLevel')
        log.debug("CurrentTrendChangeLevel " + str(CurrentTrendChangeLevel))
        CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs)
        log.debug("CurrentTotalLBs " + str(CurrentTotalLBs))
        if CurrentTotalLBs != 0:
            CurrentPercentEncrypted = (CurrentEncryptedLBs / CurrentTotalLBs) * 100
        else:
            CurrentPercentEncrypted = 0
        log.debug("CurrentPercentEncrypted " + str(CurrentPercentEncrypted))
    else:
        log.info(f"{vm['VmIdentifier']} - {vm['VmName']} - No Record Found, Inserting into DB")
        log.debug(tempdb.insert(vm))

    # Push calculated metrics to Prometheus Gauges
    lbl = dict(
        VpgIdentifier=str(vm['VpgIdentifier']),
        VmIdentifier=str(vm['VmIdentifier']),
        VmName=str(vm['VmName']),
        SiteIdentifier=str(siteId),
        SiteName=str(siteName),
    )
    g_vm_iops_counter.labels(**lbl).set(CurrentIops)
    g_vm_write_counter.labels(**lbl).set(CurrentWriteCounterInMBs)
    g_vm_sync_counter.labels(**lbl).set(CurrentSyncCounterInMBs)
    g_vm_network_counter.labels(**lbl).set(CurrentNetworkTrafficCounterInMBs)
    g_vm_encrypted_lbs.labels(**lbl).set(CurrentEncryptedLBs)
    g_vm_unencrypted_lbs.labels(**lbl).set(CurrentUnencryptedLBs)
    g_vm_total_lbs.labels(**lbl).set(CurrentTotalLBs)
    g_vm_percent_encrypted.labels(**lbl).set(CurrentPercentEncrypted)
    g_vm_trend_change_level.labels(**lbl).set(CurrentTrendChangeLevel)


def GetStatsFunc(zvm_instance):
    """Poll per-VM statistics and encryption metrics forever and publish their deltas.

    While ``zvm_instance`` reports an authenticated session, each pass fetches
    ``vms_statistics()`` and ``encryptiondetection_metrics_vms()``, keeps the
    previous sample per (VM, VPG) in an in-memory TinyDB, and sets every stats
    gauge to the absolute difference between the new and the cached sample
    (0 on the first sample). Only VMs whose protected site is this site are
    reported. Sleeps ``scrape_speed`` seconds between passes, or 1 second
    while waiting for authentication. Never returns.

    Args:
        zvm_instance: the shared ZVM client.
    """
    tempdb = TinyDB(storage=MemoryStorage)
    dbvm = Query()
    dbvpg = Query()

    zvm = zvm_instance
    while True:
        if not zvm.is_authenticated():
            log.debug("Waiting 1 second for Auth Token")
            sleep(1)
            continue

        log.debug("Stats Collector Loop Running")

        statsapi_json = zvm.vms_statistics()
        log.debug(statsapi_json)
        # The statistics result is None-checked below; give the encryption result
        # the same tolerance so a None here cannot kill the thread.
        vms_encryption_metrics = zvm.encryptiondetection_metrics_vms() or []

        if statsapi_json is not None:
            for vm in statsapi_json:
                _publish_vm_stats(zvm, tempdb, dbvm, dbvpg, vm, vms_encryption_metrics)
        else:
            log.debug("No VMS in Stats API")

        log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
        sleep(scrape_speed)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Function which retrieves stats from various ZVM APIs
|
|
# ---------------------------------------------------------------------------
|
|
def _export_vpgs(vpg_json):
    """Set every VPG gauge from the records in ``vpg_json``."""
    log.debug("Got VPG JSON")
    flat_fields = (
        (g_vpg_storage_used, "UsedStorageInMB"),
        (g_vpg_actual_rpo, "ActualRPO"),
        (g_vpg_throughput, "ThroughputInMB"),
        (g_vpg_iops, "IOPs"),
        (g_vpg_provisioned_storage, "ProvisionedStorageInMB"),
        (g_vpg_vms_count, "VmsCount"),
        (g_vpg_configured_rpo, "ConfiguredRpoSeconds"),
    )
    state_fields = (
        (g_vpg_status, "Status"),
        (g_vpg_substatus, "SubStatus"),
        (g_vpg_alert_status, "AlertStatus"),
    )
    for vpg in vpg_json:
        labels = {
            'VpgIdentifier': vpg['VpgIdentifier'],
            'VpgName': vpg['VpgName'],
            'VpgPriority': str(vpg['Priority']),
            'SiteIdentifier': siteId,
            'SiteName': siteName,
        }
        for gauge, field in flat_fields:
            gauge.labels(**labels).set(vpg[field])
        g_vpg_actual_history.labels(**labels).set(vpg["HistoryStatusApi"]["ActualHistoryInMinutes"])
        g_vpg_configured_history.labels(**labels).set(vpg["HistoryStatusApi"]["ConfiguredHistoryInMinutes"])
        if vpg["FailSafeHistory"] is not None:
            g_vpg_failsafe_actual.labels(**labels).set(vpg["FailSafeHistory"]["ActualFailSafeHistory"])
            g_vpg_failsafe_configured.labels(**labels).set(vpg["FailSafeHistory"]["ConfiguredFailSafeHistory"])
        else:
            # No fail-safe history on this VPG: report both values as 0.
            g_vpg_failsafe_actual.labels(**labels).set(0)
            g_vpg_failsafe_configured.labels(**labels).set(0)
        for gauge, field in state_fields:
            gauge.labels(**labels).set(vpg[field])


def _export_datastores(ds_json):
    """Set every datastore gauge from the records in ``ds_json``."""
    log.debug("Got Datastores API")
    count_fields = (
        (g_ds_vras, "NumVRAs"),
        (g_ds_incoming_vms, "NumIncomingVMs"),
        (g_ds_outgoing_vms, "NumOutgoingVMs"),
    )
    usage_fields = (
        (g_ds_capacity, "CapacityInBytes"),
        (g_ds_free, "FreeInBytes"),
        (g_ds_used, "UsedInBytes"),
        (g_ds_provisioned, "ProvisionedInBytes"),
    )
    zerto_categories = (
        ("Protected", g_ds_zerto_protected_used, g_ds_zerto_protected_provisioned),
        ("Recovery", g_ds_zerto_recovery_used, g_ds_zerto_recovery_provisioned),
        ("Journal", g_ds_zerto_journal_used, g_ds_zerto_journal_provisioned),
        ("Scratch", g_ds_zerto_scratch_used, g_ds_zerto_scratch_provisioned),
        ("Appliances", g_ds_zerto_appliances_used, g_ds_zerto_appliances_provisioned),
    )
    for ds in ds_json:
        log.debug(f"Processing {ds['DatastoreName']}")
        labels = {
            'datastoreIdentifier': ds['DatastoreIdentifier'],
            'DatastoreName': ds['DatastoreName'],
            'SiteIdentifier': siteId,
            'SiteName': siteName,
        }
        for gauge, field in count_fields:
            gauge.labels(**labels).set(ds["Stats"][field])
        for gauge, field in usage_fields:
            gauge.labels(**labels).set(ds["Stats"]["Usage"]["Datastore"][field])
        for category, used_gauge, provisioned_gauge in zerto_categories:
            used_gauge.labels(**labels).set(ds["Stats"]["Usage"]["Zerto"][category]["UsedInBytes"])
            provisioned_gauge.labels(**labels).set(ds["Stats"]["Usage"]["Zerto"][category]["ProvisionedInBytes"])


def _export_vms(vms_json):
    """Set the VM gauges for every VM in ``vms_json`` whose protected site is this site."""
    log.debug("Got VMs API")
    leading_fields = (
        (g_vm_actualrpo, "ActualRPO"),
        (g_vm_throughput, "ThroughputInMB"),
        (g_vm_iops, "IOPs"),
    )
    trailing_fields = (
        (g_vm_journal_used_storage, "JournalUsedStorageMb"),
        (g_vm_outgoing_bandwidth, "OutgoingBandWidthInMbps"),
        (g_vm_used_storage, "UsedStorageInMB"),
        (g_vm_provisioned_storage, "ProvisionedStorageInMB"),
        (g_vm_status, "Status"),
        (g_vm_substatus, "SubStatus"),
    )
    for vm in vms_json:
        log.debug("Processing VM: " + str(vm['VmName']))
        log.debug("Checking VM " + vm['VmIdentifier'] + " on Protected Site " + vm['ProtectedSite']['identifier'] + " against " + siteId)

        if siteId != vm['ProtectedSite']['identifier']:
            log.debug("VM " + vm['VmIdentifier'] + " is protected to this site")
            continue

        log.debug("Found VM " + vm['VmIdentifier'] + " on Protected Site")

        # A non-integer RPO is exported as the sentinel -1.
        if not isinstance(vm["ActualRPO"], int):
            vm["ActualRPO"] = -1

        labels = {
            'VmIdentifier': str(vm['VmIdentifier']),
            'VmName': str(vm['VmName']),
            'VmRecoveryVRA': str(vm["RecoveryHostName"]),
            'VmPriority': str(vm['Priority']),
            'SiteIdentifier': str(siteId),
            'VpgName': str(vm['VpgName']),
            'SiteName': str(siteName),
        }
        for gauge, field in leading_fields:
            gauge.labels(**labels).set(vm[field])
        g_vm_journal_hard_limit.labels(**labels).set(vm["JournalHardLimit"]["LimitValue"])
        g_vm_journal_warning_limit.labels(**labels).set(vm["JournalWarningThreshold"]["LimitValue"])
        for gauge, field in trailing_fields:
            gauge.labels(**labels).set(vm[field])
        log.debug("Processed VM: " + str(vm['VmName']))


def _volume_key(volume):
    """Return the (protected VM name, protected VM id, owning VM name, VPG name) grouping key."""
    return (
        volume['ProtectedVm']['Name'],
        volume['ProtectedVm']['Identifier'],
        volume['OwningVm']['Name'],
        volume['Vpg']['Name'],
    )


def _volume_labels(key):
    """Expand a grouping key from _volume_key into the _VOL_LABELS label dict."""
    protected_vm, protected_vm_id, owning_vra, vpg_name = key
    return {
        'ProtectedVm': protected_vm,
        'ProtectedVmIdentifier': protected_vm_id,
        'OwningVRA': owning_vra,
        'VpgName': vpg_name,
        'SiteIdentifier': siteId,
        'SiteName': siteName,
    }


def _export_scratch_volumes(scratch_vols):
    """Sum the used bytes of the scratch volumes per grouping key and set the gauge."""
    log.debug("Got Scratch Volumes API")
    # Accumulate per-VM totals before setting gauges (multiple volumes per VM)
    used_by_key = {}
    for volume in scratch_vols:
        key = _volume_key(volume)
        used_by_key[key] = used_by_key.get(key, 0) + volume["Size"]["UsedInBytes"]
    for key, used in used_by_key.items():
        g_scratch_vol_size.labels(**_volume_labels(key)).set(used)


def _export_journal_volumes(journal_vols):
    """Sum used bytes, provisioned bytes and volume count per grouping key and set the gauges."""
    log.debug("Journal Volumes Exist")
    # Accumulate per-VM totals before setting gauges (multiple volumes per VM)
    totals = {}
    for volume in journal_vols:
        log.debug("Journal Volume: " + volume['ProtectedVm']['Name'] + " Calculating total size...")
        key = _volume_key(volume)
        used, provisioned, count = totals.get(key, (0, 0, 0))
        used = used + volume["Size"]["UsedInBytes"]
        provisioned = provisioned + volume["Size"]["ProvisionedInBytes"]
        totals[key] = (used, provisioned, count + 1)
    for key, (used, provisioned, count) in totals.items():
        labels = _volume_labels(key)
        g_journal_vol_size.labels(**labels).set(used)
        g_journal_vol_provisioned.labels(**labels).set(provisioned)
        g_journal_vol_count.labels(**labels).set(count)


def GetDataFunc(zvm_instance):
    """Poll the VPG, datastore, VM and volume APIs forever and publish their gauges.

    While ``zvm_instance`` reports an authenticated session, each pass queries
    ``vpgs()``, ``datastores()``, ``vms()`` and ``volumes()`` (scratch, then
    journal) and sets the corresponding gauges. A ``None`` result only produces
    a debug message. Sleeps ``scrape_speed`` seconds between passes, or 1 second
    while waiting for authentication. Never returns.
    """
    zvm = zvm_instance
    while True:
        if not zvm.is_authenticated():
            log.debug("Waiting 1 second for Auth Token")
            sleep(1)
            continue

        log.info("Data Collector Loop Running")

        ### VPGs API
        vpg_json = zvm.vpgs()
        if vpg_json is None:
            log.debug("No VPGs Found")
        else:
            _export_vpgs(vpg_json)

        ### Datastores API
        ds_json = zvm.datastores()
        if ds_json is None:
            log.debug("No Datastores Found")
        else:
            _export_datastores(ds_json)

        ## VMs API
        log.debug("Getting VMs API")
        vms_json = zvm.vms()
        if vms_json is None:
            log.debug("No VMs Found")
        else:
            _export_vms(vms_json)

        ## Volumes API - Scratch Volumes
        log.debug("Getting Scratch Volumes")
        scratch_vols = zvm.volumes(volumetype="scratch")
        if scratch_vols is None:
            log.debug("No Scratch Volumes Found")
        else:
            _export_scratch_volumes(scratch_vols)

        ## Volumes API - Journal Volumes
        log.debug("Getting Journal Volumes")
        journal_vols = zvm.volumes(volumetype="journal")
        if journal_vols is None:
            log.debug("No Journal Volumes Exist")
        else:
            _export_journal_volumes(journal_vols)

        log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
        sleep(scrape_speed)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Get VRA CPU and memory usage from vCenter Server
|
|
# ---------------------------------------------------------------------------
|
|
def GetVraMetrics(zvm_instance):
    """Poll VRA inventory (and, if enabled, vCenter CPU/memory usage) and publish the gauges.

    While ``zvm_instance`` reports an authenticated session, each pass calls
    ``vras()`` and sets the VRA gauges for every record. When ``is_vcenter_set``
    is true, ``vc_connection.get_cpu_mem_used(<VRA name>)`` is queried per VRA;
    element 0 of its result is exported as CPU MHz and element 1 as memory MB.
    A failure of that lookup is logged and does not stop the pass.

    Sleeps ``scrape_speed * 2`` seconds between passes, or 1 second while
    waiting for authentication. Any other exception is logged with its
    traceback and ends the function, which ends the thread running it.
    """
    log.debug("GetVraMetrics thread started")
    try:
        zvm = zvm_instance
        while True:
            log.debug("Checking Token in VRA CPU MEM Collector")
            if not zvm.is_authenticated():
                log.debug("Waiting 1 second for Auth Token")
                sleep(1)
                continue

            log.info("VRA CPU MEM Collector Running")

            vras_json = zvm.vras()
            log.debug(vras_json)

            if vras_json is not None:
                log.debug("VRA names: %s", vras_json)
                for vra in vras_json:
                    lbl = dict(
                        VraIdentifierStr=vra['VraIdentifierStr'],
                        VraName=vra['VraName'],
                        VraVersion=vra['VraVersion'],
                        HostVersion=vra['HostVersion'],
                        SiteIdentifier=siteId,
                        SiteName=siteName,
                    )
                    g_vra_memory.labels(**lbl).set(vra["MemoryInGB"])
                    g_vra_vcpu_count.labels(**lbl).set(vra["NumOfCpus"])
                    g_vra_protected_vms.labels(**lbl).set(vra["ProtectedCounters"]["Vms"])
                    g_vra_protected_vpgs.labels(**lbl).set(vra["ProtectedCounters"]["Vpgs"])
                    g_vra_protected_vols.labels(**lbl).set(vra["ProtectedCounters"]["Volumes"])
                    g_vra_recovery_vms.labels(**lbl).set(vra["RecoveryCounters"]["Vms"])
                    g_vra_recovery_vpgs.labels(**lbl).set(vra["RecoveryCounters"]["Vpgs"])
                    g_vra_recovery_vols.labels(**lbl).set(vra["RecoveryCounters"]["Volumes"])
                    g_vra_self_protected.labels(**lbl).set(vra["SelfProtectedVpgs"])

                    log.debug("VRA Name: %s", vra['VraName'])
                    log.info(f"vCenter info: T/F = {is_vcenter_set} Host: {vcenter_host} u: {vcenter_user}")

                    if is_vcenter_set:
                        log.debug("vCenter Info Is Valid... Trying to get CPU and Memory usage for VRAs")
                        try:
                            log.debug("Trying to get stats from vCenter module")
                            vradata = vc_connection.get_cpu_mem_used(vra['VraName'])
                            if vradata is not None:
                                for item in vradata:
                                    log.debug(item)
                                cpu_usage_mhz = vradata[0]
                                memory_usage_mb = vradata[1]
                                log.debug(f"VRA {vra['VraName']} has CPU usage of {cpu_usage_mhz} MHz and memory usage of {memory_usage_mb} MB")
                                g_vra_cpu_usage.labels(**lbl).set(cpu_usage_mhz)
                                g_vra_memory_usage.labels(**lbl).set(memory_usage_mb)
                            else:
                                log.info(f"No data returned for VRA {vra['VraName']} from vCenter")
                        except Exception as e:
                            # Best effort: a failed vCenter lookup must not stop the remaining VRAs.
                            log.info(f"No VM found with name {vra['VraName']}, or unexpected response: {e}")
            else:
                log.debug("No VRAs Found")

            log.debug("Starting Sleep for " + str(int(scrape_speed * 2)) + " seconds")
            sleep(scrape_speed * 2)
    except Exception as e:
        # exc_info attaches the traceback to the record, so the cause of the
        # thread's death is visible in the log output.
        log.error(f"Error in GetVraMetrics: {e}", exc_info=True)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Monitors thread health and exporter uptime
|
|
# ---------------------------------------------------------------------------
|
|
def ThreadProbe():
    """Publish exporter uptime and collector-thread liveness every 30 seconds.

    Uptime is the number of minutes since ``start_time``, rounded to one
    decimal. Each collector thread is reported as 1 when ``is_alive()`` is
    true and 0 otherwise. Never returns.
    """
    while True:
        log.debug("Thread Probe Started")

        minutes_up = round((time() - start_time) / 60, 1)
        g_exporter_uptime.labels(ExporterInstance=container_id).set(minutes_up)

        # The thread globals are read again on every pass so that a restarted
        # (rebound) thread object is the one that gets checked.
        watched = (
            ("DataStats", data_thread),
            ("EncryptionStats", stats_thread),
            ("VraMetrics", vra_metrics_thread),
        )
        for thread_label, worker in watched:
            status = g_thread_status.labels(thread=thread_label, ExporterInstance=container_id)
            status.set(1 if worker.is_alive() else 0)

        log.debug("Probe Thread Going to Sleep")
        sleep(30)
|
|
|
|
|
|
def start_thread(target_func):
    """Run ``target_func`` on a new daemon thread and return the started thread.

    Args:
        target_func: zero-argument callable executed as the thread body.

    Returns:
        The ``threading.Thread`` object, already started.
    """
    # functools.partial objects and callable instances have no __name__;
    # fall back to repr() so logging cannot raise AttributeError.
    label = getattr(target_func, "__name__", None) or repr(target_func)
    log.debug(f"Starting thread for {label}")
    thread = threading.Thread(target=target_func, daemon=True)
    thread.start()
    log.debug(f"Thread {label} started")
    return thread
|
|
|
|
|
|
"""
|
|
Main Program Logic
|
|
"""
|
|
|
|
# Hostname of this machine; used as the ExporterInstance label and in log records.
_hostname = socket.gethostname()
container_id = str(_hostname)
|
|
|
|
class JsonFormatter(logging.Formatter):
    """Serialize each log record as a single-line JSON object.

    The object has the keys ``time``, ``level``, ``thread``, ``message`` and
    ``container`` (the module-level ``container_id``), plus ``exception`` with
    the formatted traceback when the record carries exception info.
    """

    _TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def format(self, record):
        """Return the JSON text for ``record``."""
        payload = dict(
            time=self.formatTime(record, self._TIME_FORMAT),
            level=record.levelname,
            thread=record.threadName,
            message=record.getMessage(),
            container=container_id,
        )
        exc = record.exc_info
        if exc:
            payload["exception"] = self.formatException(exc)
        return json.dumps(payload)
|
|
|
|
# Route all exporter logging to stdout as JSON lines.
log_handler = logging.StreamHandler(sys.stdout)
log_handler.setFormatter(JsonFormatter())
log = logging.getLogger("Node-Exporter")
log.setLevel(LOGLEVEL)
log.addHandler(log_handler)
log.info(f"Zerto-Node-Exporter - Version {version}")
log.info(f"Log Level: {LOGLEVEL}")
# The client secret is a credential and LOGLEVEL defaults to DEBUG, so only
# report whether a secret is configured, never its value.
log.debug(
    "Running with Variables:\nVerify SSL: " + str(verifySSL)
    + "\nZVM Host: " + zvm_url
    + "\nZVM Port: " + zvm_port
    + "\nClient-Id: " + client_id
    + "\nClient Secret: " + ("<set>" if client_secret else "<not set>")
)
|
|
|
|
# Initialize zvmsite instance
zvm_instance = zvmsite(
    host=zvm_url,
    port=int(zvm_port),
    username=zvm_username,
    password=zvm_password,
    client_id=client_id,
    client_secret=client_secret,
    loglevel=LOGLEVEL,
    logger=log,
    stats=(DISABLE_STATS != "TRUE"),
)

# Start the zvmsite authentication thread
zvm_instance.connect()

# Site identity is read by every collector thread as a module global.
siteId = None
siteName = None

# Block until the ZVM session is authenticated and reports a site id.
while siteId is None:
    if zvm_instance.is_authenticated():
        sleep(2)
        log.debug("Trying Set Global Vars")
        siteId = zvm_instance.site_id
        siteName = zvm_instance.site_name
    else:
        sleep(1)

# Check if vCenter is set; if not, disable VRA CPU/memory metrics
is_vcenter_set = True
if vcenter_host == "vcenter.local":
    log.error("vCenter Host not set. Please set the environment variable VCENTER_HOST, turning off VRA CPU and Memory metrics")
    is_vcenter_set = False
    # GetVraMetrics only touches vc_connection when is_vcenter_set is true, so
    # no client is built for the placeholder host.
    vc_connection = None
else:
    # Only announce collection as enabled when it really is.
    log.debug("vCenter data collection is enabled")
    vc_connection = vcsite(vcenter_host, vcenter_user, vcenter_pwd, loglevel="debug", logger=log)
|
|
|
|
# Start prometheus metrics HTTP server (replaces the file-based SimpleHTTPRequestHandler)
# All Gauges from all threads are served thread-safely at http://host:<LISTEN_PORT>/metrics
start_http_server(listen_port)
log.info(f"Prometheus metrics server started on port {listen_port}")

# Starting collection threads
vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance))
data_thread = start_thread(lambda: GetDataFunc(zvm_instance))
stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance))
probe_thread = start_thread(ThreadProbe)
log.debug("All collection threads started")


def _keep_alive(thread, target, died_message, alive_message):
    """Return ``thread`` if it is still running, otherwise log and return a fresh thread for ``target``."""
    if thread.is_alive():
        log.debug(alive_message)
        return thread
    log.error(died_message)
    return start_thread(target)


# Loop indefinitely - monitor and restart any crashed threads
while True:
    sleep(10)
    probe_thread = _keep_alive(
        probe_thread,
        ThreadProbe,
        "Probe Thread Died - Restarting",
        "Probe Thread is alive",
    )
    data_thread = _keep_alive(
        data_thread,
        lambda: GetDataFunc(zvm_instance),
        "Data Thread Died - Restarting",
        "Data API Thread is alive",
    )
    stats_thread = _keep_alive(
        stats_thread,
        lambda: GetStatsFunc(zvm_instance),
        "Stats Thread Died - Restarting",
        "Stats API Thread is alive",
    )
    vra_metrics_thread = _keep_alive(
        vra_metrics_thread,
        lambda: GetVraMetrics(zvm_instance),
        "VRA Metrics Thread Died - Restarting",
        "VRA Metrics Thread is alive",
    )
    sleep(api_timeout)
|