From 2836e1ae10bf2f6322c7b2e829c7ec05aa6064bd Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Fri, 20 Feb 2026 19:05:45 -0500 Subject: [PATCH] Refactor to prometheus_client, JSON logging, and codebase cleanup - Replace file-based metrics and HTTP server with prometheus_client (Gauge + start_http_server), eliminating file I/O race conditions - Fix ThreadProbe and thread restart lambda bugs - Switch logging from RotatingFileHandler to JSON stdout for fluentd/Loki - Add PYTHONUNBUFFERED=1 to Dockerfile for immediate container log output - Upgrade base image from python:3.12.3-slim to python:3.13-slim - Upgrade pyvmomi to 9.0.0.0; pin pydantic>=2.9.0 and typing_extensions>=4.12.2 to use pre-built Python 3.13 wheels (removes Rust toolchain from build) - Remove unused packages: boto3, botocore, s3transfer, jmespath, redis, docopt, pyflakes, posthog - Remove unused imports (Posthog, CaseInsensitiveDict) and dead variables (callhomestats, local_site_info, lastStats) - Fix service_profile() NameError (siteidentifier -> serviceProfileIdentifier) - Remove bare print() in zvma.py __authhandler__, replace with self.log.info() - Remove all commented-out PostHog blocks from zvma10/zvma.py - Delete legacy zvma9_7/ module and app/logs/ directory - Remove deprecated 'version: 3.3' from docker-compose.yml Co-Authored-By: Claude Sonnet 4.6 --- Dockerfile | 11 +- app/logs/readme.txt | 1 - app/python-node-exporter.py | 676 +++++++++++++++++++----------------- app/requirements.txt | 15 +- app/zvma10/zvma.py | 34 +- app/zvma9_7/GetStatsFunc.py | 126 ------- app/zvma9_7/__init__.py | 3 - docker-compose.yml | 14 +- 8 files changed, 364 insertions(+), 516 deletions(-) delete mode 100644 app/logs/readme.txt delete mode 100644 app/zvma9_7/GetStatsFunc.py delete mode 100644 app/zvma9_7/__init__.py diff --git a/Dockerfile b/Dockerfile index 1479756..5d0e48c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,10 @@ -FROM python:3.12.3-slim +FROM python:3.13-slim EXPOSE 9999 # Install system 
dependencies RUN apt-get update \ && apt-get install -y \ - curl \ gcc \ libffi-dev \ libssl-dev \ @@ -13,14 +12,12 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -# Install Rust and Cargo using curl with IPv4 only -RUN CURL_IPRESOLVE=4 curl https://sh.rustup.rs -sSf | sh -s -- -y -ENV PATH="/root/.cargo/bin:${PATH}" - WORKDIR /usr/src/app # Set PYTHONPATH to include /usr/src/app ENV PYTHONPATH=/usr/src/app +# Disable stdout buffering so logs appear immediately in the container console +ENV PYTHONUNBUFFERED=1 # Copy the zerto exporter into the container COPY app /usr/src/app/ @@ -29,8 +26,6 @@ COPY app /usr/src/app/ RUN [ -f uuid.txt ] && rm uuid.txt || echo "No uuid.txt file to delete" # Install Python dependencies -# Set environment variable for PyO3 compatibility -ENV PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1 RUN pip install --upgrade pip RUN pip install --no-cache-dir -r requirements.txt diff --git a/app/logs/readme.txt b/app/logs/readme.txt deleted file mode 100644 index 77e59aa..0000000 --- a/app/logs/readme.txt +++ /dev/null @@ -1 +0,0 @@ -ZVM Exporter Log files will be in this folder \ No newline at end of file diff --git a/app/python-node-exporter.py b/app/python-node-exporter.py index 9c3a7fd..e05d6f2 100644 --- a/app/python-node-exporter.py +++ b/app/python-node-exporter.py @@ -1,23 +1,18 @@ import requests -import http.server -import socketserver import os -import ssl +import sys +import json import logging -from logging.handlers import RotatingFileHandler import threading import socket -from pyVim.connect import SmartConnect, Disconnect -from pyVmomi import vim from time import sleep, time from requests.packages.urllib3.exceptions import InsecureRequestWarning -from requests.structures import CaseInsensitiveDict from tinydb import TinyDB, Query from tinydb.storages import MemoryStorage +from prometheus_client import Gauge, start_http_server from version import VERSION from vmware.vcenter import vcsite from 
zvma10.zvma import zvmsite -from posthog import Posthog requests.packages.urllib3.disable_warnings(InsecureRequestWarning) global start_time @@ -28,9 +23,8 @@ Variables: Normally these are imported from the Docker Container, but alternativ """ listen_port = int(os.getenv('LISTEN_PORT', 9999)) -callhomestats = os.getenv("CALL_HOME_STATS", 'True').lower() in ('false', '0', 'f') verifySSL = os.getenv("VERIFY_SSL", 'False').lower() in ('true', '1', 't') -zvm_url = os.environ.get('ZVM_HOST', '192.168.50.60') +zvm_url = os.environ.get('ZVM_HOST', '192.168.50.30') zvm_port = os.environ.get('ZVM_PORT', '443') zvm_username = os.environ.get('ZVM_USERNAME', 'admin') zvm_password = os.environ.get('ZVM_PASSWORD', 'Zertodata987!') @@ -41,31 +35,121 @@ api_timeout = int(os.environ.get('API_TIMEOUT', 5)) LOGLEVEL = os.environ.get('LOGLEVEL', 'DEBUG').upper() DISABLE_STATS = os.environ.get('DISABLE_STATS', 'FALSE').upper() version = str(VERSION) -vcenter_host = os.environ.get('VCENTER_HOST', '192.168.50.50') +vcenter_host = os.environ.get('VCENTER_HOST', '192.168.50.20') vcenter_user = os.environ.get('VCENTER_USER', 'administrator@vsphere.local') vcenter_pwd = os.environ.get('VCENTER_PASSWORD', 'Zertodata987!') -# Thread which gets VM level encryption statistics from ZVM API +# --------------------------------------------------------------------------- +# Prometheus Gauge definitions +# All metrics are served thread-safely at http://host:LISTEN_PORT/metrics +# --------------------------------------------------------------------------- +# Encryption / stats metrics (GetStatsFunc) +_STATS_LABELS = ['VpgIdentifier', 'VmIdentifier', 'VmName', 'SiteIdentifier', 'SiteName'] +g_vm_iops_counter = Gauge('vm_IoOperationsCounter', 'VM IO Operations Counter', _STATS_LABELS) +g_vm_write_counter = Gauge('vm_WriteCounterInMBs', 'VM Write Counter In MBs', _STATS_LABELS) +g_vm_sync_counter = Gauge('vm_SyncCounterInMBs', 'VM Sync Counter In MBs', _STATS_LABELS) +g_vm_network_counter = 
Gauge('vm_NetworkTrafficCounterInMBs', 'VM Network Traffic Counter In MBs', _STATS_LABELS) +g_vm_encrypted_lbs = Gauge('vm_EncryptedDataInLBs', 'VM Encrypted Data In LBs', _STATS_LABELS) +g_vm_unencrypted_lbs = Gauge('vm_UnencryptedDataInLBs', 'VM Unencrypted Data In LBs', _STATS_LABELS) +g_vm_total_lbs = Gauge('vm_TotalDataInLBs', 'VM Total Data In LBs', _STATS_LABELS) +g_vm_percent_encrypted = Gauge('vm_PercentEncrypted', 'VM Percent Encrypted', _STATS_LABELS) +g_vm_trend_change_level = Gauge('vm_TrendChangeLevel', 'VM Trend Change Level', _STATS_LABELS) + +# VPG metrics (GetDataFunc) +_VPG_LABELS = ['VpgIdentifier', 'VpgName', 'VpgPriority', 'SiteIdentifier', 'SiteName'] +g_vpg_storage_used = Gauge('vpg_storage_used_in_mb', 'VPG Storage Used In MB', _VPG_LABELS) +g_vpg_actual_rpo = Gauge('vpg_actual_rpo', 'VPG Actual RPO', _VPG_LABELS) +g_vpg_throughput = Gauge('vpg_throughput_in_mb', 'VPG Throughput In MB', _VPG_LABELS) +g_vpg_iops = Gauge('vpg_iops', 'VPG IOPs', _VPG_LABELS) +g_vpg_provisioned_storage = Gauge('vpg_provisioned_storage_in_mb', 'VPG Provisioned Storage In MB', _VPG_LABELS) +g_vpg_vms_count = Gauge('vpg_vms_count', 'VPG VMs Count', _VPG_LABELS) +g_vpg_configured_rpo = Gauge('vpg_configured_rpo_seconds', 'VPG Configured RPO Seconds', _VPG_LABELS) +g_vpg_actual_history = Gauge('vpg_actual_history_in_minutes', 'VPG Actual History In Minutes', _VPG_LABELS) +g_vpg_configured_history = Gauge('vpg_configured_history_in_minutes', 'VPG Configured History In Minutes', _VPG_LABELS) +g_vpg_failsafe_actual = Gauge('vpg_failsafe_history_in_minutes_actual', 'VPG Failsafe History In Minutes Actual', _VPG_LABELS) +g_vpg_failsafe_configured = Gauge('vpg_failsafe_history_in_minutes_configured', 'VPG Failsafe History In Minutes Configured', _VPG_LABELS) +g_vpg_status = Gauge('vpg_status', 'VPG Status', _VPG_LABELS) +g_vpg_substatus = Gauge('vpg_substatus', 'VPG Sub-Status', _VPG_LABELS) +g_vpg_alert_status = Gauge('vpg_alert_status', 'VPG Alert Status', _VPG_LABELS) 
+ +# Datastore metrics (GetDataFunc) +_DS_LABELS = ['datastoreIdentifier', 'DatastoreName', 'SiteIdentifier', 'SiteName'] +g_ds_vras = Gauge('datastore_vras', 'Datastore VRAs', _DS_LABELS) +g_ds_incoming_vms = Gauge('datastore_incoming_vms', 'Datastore Incoming VMs', _DS_LABELS) +g_ds_outgoing_vms = Gauge('datastore_outgoing_vms', 'Datastore Outgoing VMs', _DS_LABELS) +g_ds_capacity = Gauge('datastore_usage_capacityinbytes', 'Datastore Capacity In Bytes', _DS_LABELS) +g_ds_free = Gauge('datastore_usage_freeinbytes', 'Datastore Free In Bytes', _DS_LABELS) +g_ds_used = Gauge('datastore_usage_usedinbytes', 'Datastore Used In Bytes', _DS_LABELS) +g_ds_provisioned = Gauge('datastore_usage_provisionedinbytes', 'Datastore Provisioned In Bytes', _DS_LABELS) +g_ds_zerto_protected_used = Gauge('datastore_usage_zerto_protected_usedinbytes', 'Datastore Zerto Protected Used In Bytes', _DS_LABELS) +g_ds_zerto_protected_provisioned = Gauge('datastore_usage_zerto_protected_provisionedinbytes', 'Datastore Zerto Protected Provisioned In Bytes', _DS_LABELS) +g_ds_zerto_recovery_used = Gauge('datastore_usage_zerto_recovery_usedinbytes', 'Datastore Zerto Recovery Used In Bytes', _DS_LABELS) +g_ds_zerto_recovery_provisioned = Gauge('datastore_usage_zerto_recovery_provisionedinbytes', 'Datastore Zerto Recovery Provisioned In Bytes', _DS_LABELS) +g_ds_zerto_journal_used = Gauge('datastore_usage_zerto_journal_usedinbytes', 'Datastore Zerto Journal Used In Bytes', _DS_LABELS) +g_ds_zerto_journal_provisioned = Gauge('datastore_usage_zerto_journal_provisionedinbytes', 'Datastore Zerto Journal Provisioned In Bytes', _DS_LABELS) +g_ds_zerto_scratch_used = Gauge('datastore_usage_zerto_scratch_usedinbytes', 'Datastore Zerto Scratch Used In Bytes', _DS_LABELS) +g_ds_zerto_scratch_provisioned = Gauge('datastore_usage_zerto_scratch_provisionedinbytes', 'Datastore Zerto Scratch Provisioned In Bytes', _DS_LABELS) +g_ds_zerto_appliances_used = Gauge('datastore_usage_zerto_appliances_usedinbytes', 
'Datastore Zerto Appliances Used In Bytes', _DS_LABELS) +g_ds_zerto_appliances_provisioned = Gauge('datastore_usage_zerto_appliances_provisionedinbytes', 'Datastore Zerto Appliances Provisioned In Bytes', _DS_LABELS) + +# VM metrics (GetDataFunc - VMs section) +_VM_LABELS = ['VmIdentifier', 'VmName', 'VmRecoveryVRA', 'VmPriority', 'SiteIdentifier', 'VpgName', 'SiteName'] +g_vm_actualrpo = Gauge('vm_actualrpo', 'VM Actual RPO', _VM_LABELS) +g_vm_throughput = Gauge('vm_throughput_in_mb', 'VM Throughput In MB', _VM_LABELS) +g_vm_iops = Gauge('vm_iops', 'VM IOPs', _VM_LABELS) +g_vm_journal_hard_limit = Gauge('vm_journal_hard_limit', 'VM Journal Hard Limit', _VM_LABELS) +g_vm_journal_warning_limit = Gauge('vm_journal_warning_limit', 'VM Journal Warning Limit', _VM_LABELS) +g_vm_journal_used_storage = Gauge('vm_journal_used_storage_mb', 'VM Journal Used Storage MB', _VM_LABELS) +g_vm_outgoing_bandwidth = Gauge('vm_outgoing_bandwidth_in_mbps', 'VM Outgoing Bandwidth In Mbps', _VM_LABELS) +g_vm_used_storage = Gauge('vm_used_storage_in_MB', 'VM Used Storage In MB', _VM_LABELS) +g_vm_provisioned_storage = Gauge('vm_provisioned_storage_in_MB', 'VM Provisioned Storage In MB', _VM_LABELS) +g_vm_status = Gauge('vm_status', 'VM Status', _VM_LABELS) +g_vm_substatus = Gauge('vm_substatus', 'VM Sub-Status', _VM_LABELS) + +# Scratch and journal volume metrics (GetDataFunc - Volumes sections) +_VOL_LABELS = ['ProtectedVm', 'ProtectedVmIdentifier', 'OwningVRA', 'VpgName', 'SiteIdentifier', 'SiteName'] +g_scratch_vol_size = Gauge('scratch_volume_size_in_bytes', 'Scratch Volume Size In Bytes', _VOL_LABELS) +g_journal_vol_size = Gauge('vm_journal_volume_size_in_bytes', 'VM Journal Volume Size In Bytes', _VOL_LABELS) +g_journal_vol_provisioned = Gauge('vm_journal_volume_provisioned_in_bytes','VM Journal Volume Provisioned In Bytes', _VOL_LABELS) +g_journal_vol_count = Gauge('vm_journal_volume_count', 'VM Journal Volume Count', _VOL_LABELS) + +# VRA metrics (GetVraMetrics) +_VRA_LABELS = 
['VraIdentifierStr', 'VraName', 'VraVersion', 'HostVersion', 'SiteIdentifier', 'SiteName'] +g_vra_memory = Gauge('vra_memory_in_GB', 'VRA Memory In GB', _VRA_LABELS) +g_vra_vcpu_count = Gauge('vra_vcpu_count', 'VRA vCPU Count', _VRA_LABELS) +g_vra_protected_vms = Gauge('vra_protected_vms', 'VRA Protected VMs', _VRA_LABELS) +g_vra_protected_vpgs = Gauge('vra_protected_vpgs', 'VRA Protected VPGs', _VRA_LABELS) +g_vra_protected_vols = Gauge('vra_protected_volumes', 'VRA Protected Volumes', _VRA_LABELS) +g_vra_recovery_vms = Gauge('vra_recovery_vms', 'VRA Recovery VMs', _VRA_LABELS) +g_vra_recovery_vpgs = Gauge('vra_recovery_vpgs', 'VRA Recovery VPGs', _VRA_LABELS) +g_vra_recovery_vols = Gauge('vra_recovery_volumes', 'VRA Recovery Volumes', _VRA_LABELS) +g_vra_self_protected = Gauge('vra_self_protected_vpgs', 'VRA Self-Protected VPGs', _VRA_LABELS) +g_vra_cpu_usage = Gauge('vra_cpu_usage_mhz', 'VRA CPU Usage MHz', _VRA_LABELS) +g_vra_memory_usage = Gauge('vra_memory_usage_mb', 'VRA Memory Usage MB', _VRA_LABELS) + +# Exporter / thread health metrics (ThreadProbe) +g_exporter_uptime = Gauge('exporter_uptime', 'Exporter Uptime In Minutes', ['ExporterInstance']) +g_thread_status = Gauge('exporter_thread_status', 'Exporter Thread Status', ['thread', 'ExporterInstance']) + + +# --------------------------------------------------------------------------- +# Thread which gets VM level encryption statistics from ZVM API +# --------------------------------------------------------------------------- def GetStatsFunc(zvm_instance): - tempdb = TinyDB(storage=MemoryStorage) # ('./db.json') #(storage=MemoryStorage) used for storing db on disk for debugging + tempdb = TinyDB(storage=MemoryStorage) dbvm = Query() dbvpg = Query() - dbsite = Query() zvm = zvm_instance - while (True) : + while True: global siteId global siteName - if (zvm.is_authenticated()): + if zvm.is_authenticated(): log.debug("Stats Collector Loop Running") - - metricsDictionary = {} - - ## Statistics API - 
statsapi_json = None - statsapi_json = zvm.vms_statistics() + + statsapi_json = zvm.vms_statistics() log.debug(statsapi_json) vms_encryption_metrics = zvm.encryptiondetection_metrics_vms() @@ -74,8 +158,7 @@ def GetStatsFunc(zvm_instance): vmsiteinfo = zvm.vm(vmidentifier=vm['VmIdentifier'], vpgidentifier=vm['VpgIdentifier']) if vmsiteinfo['ProtectedSite']['identifier'] == zvm.site_id: log.debug(f"VM is protected at this site - {vm['VmIdentifier']}") - oldvmdata = dict() - # this part of the dictionary will never exist, so not sure why i need this as i set the key/values below in the vmem section. + if 'EncryptionMetrics' not in vm: vm['EncryptionMetrics'] = {} vm['VmName'] = None @@ -95,15 +178,14 @@ def GetStatsFunc(zvm_instance): for vmem in vms_encryption_metrics: if vmem['Link']['identifier'] == vm['VmIdentifier']: log.debug(f"Aligning VM Stats and Encryption Metrics for {vm['VmIdentifier']} - {vmem['Link']['name']}") - #print(f"Aligning VM Stats and Encryption Metrics for {vm['VmIdentifier']} - {vmem['Link']['name']}") - vm['EncryptionMetrics']['EncryptedData'] = vmem['EncryptionMetrics']['EncryptedData'] + vm['EncryptionMetrics']['EncryptedData'] = vmem['EncryptionMetrics']['EncryptedData'] vm['EncryptionMetrics']['NonEncryptedData'] = vmem['EncryptionMetrics']['NonEncryptedData'] vm['EncryptionMetrics']['TrendChangeLevel'] = vmem['EncryptionMetrics']['TrendChangeLevel'] vm['VmName'] = vmem['Link']['name'] log.info("Checking TempDB for VM " + vm['VmIdentifier'] + " in VPG " + vm['VpgIdentifier']) oldvmdata = tempdb.search((dbvm.VmIdentifier == vm['VmIdentifier']) & (dbvpg.VpgIdentifier == vm['VpgIdentifier'])) - if (oldvmdata): + if oldvmdata: log.info(vm['VmIdentifier'] + " Record Found, Updating DB") log.debug("Old Data") log.debug(oldvmdata) @@ -119,139 +201,137 @@ def GetStatsFunc(zvm_instance): log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs)) CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - 
oldvmdata[0]['NetworkTrafficCounterInMBs']) log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs)) - CurrentWriteCounterInMBs = abs(vm['WriteCounterInMBs'] - oldvmdata[0]['WriteCounterInMBs']) + CurrentWriteCounterInMBs = abs(vm['WriteCounterInMBs'] - oldvmdata[0]['WriteCounterInMBs']) log.debug("CurrentWriteCounterInMBs " + str(CurrentWriteCounterInMBs)) CurrentEncryptedLBs = abs(vm['EncryptionMetrics']['EncryptedData'] - oldvmdata[0]['EncryptionMetrics']['EncryptedData']) log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs)) CurrentUnencryptedLBs = abs(vm['EncryptionMetrics']['NonEncryptedData'] - oldvmdata[0]['EncryptionMetrics']['NonEncryptedData']) log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs)) - CurrentTrendChangeLevel = abs(vm['EncryptionMetrics']['TrendChangeLevel'] - oldvmdata[0]['EncryptionMetrics']['TrendChangeLevel']) + CurrentTrendChangeLevel = abs(vm['EncryptionMetrics']['TrendChangeLevel'] - oldvmdata[0]['EncryptionMetrics']['TrendChangeLevel']) log.debug("CurrentTrendChangeLevel " + str(CurrentTrendChangeLevel)) CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs) log.debug("CurrentTotalLBs " + str(CurrentTotalLBs)) if CurrentTotalLBs != 0: - CurrentPercentEncrypted = ((CurrentEncryptedLBs / CurrentTotalLBs) * 100) + CurrentPercentEncrypted = (CurrentEncryptedLBs / CurrentTotalLBs) * 100 else: CurrentPercentEncrypted = 0 log.debug("CurrentPercentEncrypted " + str(CurrentPercentEncrypted)) else: log.info(f"{vm['VmIdentifier']} - {vm['VmName']} - No Record Found, Inserting into DB") - #insert original VM record to tempdb log.debug(tempdb.insert(vm)) - # Store Calculated Metrics - metricsDictionary["vm_IoOperationsCounter{VpgIdentifier=\"" + str(vm['VpgIdentifier']) + "\",VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + str(siteId) + "\",SiteName=\"" + str(siteName) + "\"}"] = CurrentIops - 
metricsDictionary["vm_WriteCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentWriteCounterInMBs - metricsDictionary["vm_SyncCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentSyncCounterInMBs - metricsDictionary["vm_NetworkTrafficCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentNetworkTrafficCounterInMBs - metricsDictionary["vm_EncryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentEncryptedLBs - metricsDictionary["vm_UnencryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentUnencryptedLBs - metricsDictionary["vm_TotalDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTotalLBs - metricsDictionary["vm_PercentEncrypted{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentPercentEncrypted - metricsDictionary["vm_TrendChangeLevel{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) 
+ "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTrendChangeLevel + # Push calculated metrics to Prometheus Gauges + lbl = dict( + VpgIdentifier=str(vm['VpgIdentifier']), + VmIdentifier=str(vm['VmIdentifier']), + VmName=str(vm['VmName']), + SiteIdentifier=str(siteId), + SiteName=str(siteName) + ) + g_vm_iops_counter.labels(**lbl).set(CurrentIops) + g_vm_write_counter.labels(**lbl).set(CurrentWriteCounterInMBs) + g_vm_sync_counter.labels(**lbl).set(CurrentSyncCounterInMBs) + g_vm_network_counter.labels(**lbl).set(CurrentNetworkTrafficCounterInMBs) + g_vm_encrypted_lbs.labels(**lbl).set(CurrentEncryptedLBs) + g_vm_unencrypted_lbs.labels(**lbl).set(CurrentUnencryptedLBs) + g_vm_total_lbs.labels(**lbl).set(CurrentTotalLBs) + g_vm_percent_encrypted.labels(**lbl).set(CurrentPercentEncrypted) + g_vm_trend_change_level.labels(**lbl).set(CurrentTrendChangeLevel) else: log.debug(f"VM is only recovering to this site, skipping metrics - {vm['VmIdentifier']}") - #print(f"VM is only recovering to this site, skipping metrics - {vm['VmIdentifier']}") else: log.debug("No VMS in Stats API") - - - ## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus - file_object = open('statsmetrics', 'w') - txt_object = open('statsmetrics.txt', 'w') - for item in metricsDictionary : - file_object.write(item) - file_object.write(" ") - file_object.write(str(metricsDictionary[item])) - file_object.write("\n") - txt_object.write(item) - txt_object.write(" ") - txt_object.write(str(metricsDictionary[item])) - txt_object.write("\n") - file_object.close() - txt_object.close() - log.debug("Starting Sleep for " + str(scrape_speed) + " seconds") sleep(scrape_speed) else: log.debug("Waiting 1 second for Auth Token") sleep(1) -# Function which retrieves stats from various ZVM APIs and stores them in a metrics file + +# --------------------------------------------------------------------------- +# Function which 
retrieves stats from various ZVM APIs +# --------------------------------------------------------------------------- def GetDataFunc(zvm_instance): - tempdb = TinyDB(storage=MemoryStorage) - dbvm = Query() zvm = zvm_instance - while (True) : + while True: global siteId global siteName - if (zvm.is_authenticated()): + if zvm.is_authenticated(): log.info("Data Collector Loop Running") - metricsDictionary = {} ### VPGs API - vpg_json = None - vpg_json = zvm.vpgs() - if(vpg_json is not None): + vpg_json = zvm.vpgs() + if vpg_json is not None: log.debug("Got VPG JSON") - for vpg in vpg_json : - metricsDictionary["vpg_storage_used_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["UsedStorageInMB"] - metricsDictionary["vpg_actual_rpo{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ActualRPO"] - metricsDictionary["vpg_throughput_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ThroughputInMB"] - metricsDictionary["vpg_iops{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["IOPs"] - metricsDictionary["vpg_provisioned_storage_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ProvisionedStorageInMB"] - metricsDictionary["vpg_vms_count{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + 
"\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["VmsCount"] - metricsDictionary["vpg_configured_rpo_seconds{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ConfiguredRpoSeconds"] - metricsDictionary["vpg_actual_history_in_minutes{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["HistoryStatusApi"]["ActualHistoryInMinutes"] - metricsDictionary["vpg_configured_history_in_minutes{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["HistoryStatusApi"]["ConfiguredHistoryInMinutes"] - if(vpg["FailSafeHistory"] is None): - metricsDictionary["vpg_failsafe_history_in_minutes_actual{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = 0 - metricsDictionary["vpg_failsafe_history_in_minutes_configured{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = 0 + for vpg in vpg_json: + lbl = dict( + VpgIdentifier=vpg['VpgIdentifier'], + VpgName=vpg['VpgName'], + VpgPriority=str(vpg['Priority']), + SiteIdentifier=siteId, + SiteName=siteName + ) + g_vpg_storage_used.labels(**lbl).set(vpg["UsedStorageInMB"]) + g_vpg_actual_rpo.labels(**lbl).set(vpg["ActualRPO"]) + g_vpg_throughput.labels(**lbl).set(vpg["ThroughputInMB"]) + g_vpg_iops.labels(**lbl).set(vpg["IOPs"]) + 
g_vpg_provisioned_storage.labels(**lbl).set(vpg["ProvisionedStorageInMB"]) + g_vpg_vms_count.labels(**lbl).set(vpg["VmsCount"]) + g_vpg_configured_rpo.labels(**lbl).set(vpg["ConfiguredRpoSeconds"]) + g_vpg_actual_history.labels(**lbl).set(vpg["HistoryStatusApi"]["ActualHistoryInMinutes"]) + g_vpg_configured_history.labels(**lbl).set(vpg["HistoryStatusApi"]["ConfiguredHistoryInMinutes"]) + if vpg["FailSafeHistory"] is None: + g_vpg_failsafe_actual.labels(**lbl).set(0) + g_vpg_failsafe_configured.labels(**lbl).set(0) else: - metricsDictionary["vpg_failsafe_history_in_minutes_actual{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["FailSafeHistory"]["ActualFailSafeHistory"] - metricsDictionary["vpg_failsafe_history_in_minutes_configured{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["FailSafeHistory"]["ConfiguredFailSafeHistory"] - metricsDictionary["vpg_status{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["Status"] - metricsDictionary["vpg_substatus{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["SubStatus"] - metricsDictionary["vpg_alert_status{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["AlertStatus"] + g_vpg_failsafe_actual.labels(**lbl).set(vpg["FailSafeHistory"]["ActualFailSafeHistory"]) + 
g_vpg_failsafe_configured.labels(**lbl).set(vpg["FailSafeHistory"]["ConfiguredFailSafeHistory"]) + g_vpg_status.labels(**lbl).set(vpg["Status"]) + g_vpg_substatus.labels(**lbl).set(vpg["SubStatus"]) + g_vpg_alert_status.labels(**lbl).set(vpg["AlertStatus"]) else: log.debug("No VPGs Found") - ### Datastores APIs - ds_json = None - ds_json = zvm.datastores() - if(ds_json is not None): + ### Datastores API + ds_json = zvm.datastores() + if ds_json is not None: log.debug("Got Datastores API") - for ds in ds_json : + for ds in ds_json: log.debug(f"Processing {ds['DatastoreName']}") - metricsDictionary["datastore_vras{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumVRAs"] - metricsDictionary["datastore_incoming_vms{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumIncomingVMs"] - metricsDictionary["datastore_outgoing_vms{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumOutgoingVMs"] - metricsDictionary["datastore_usage_capacityinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["CapacityInBytes"] - metricsDictionary["datastore_usage_freeinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["FreeInBytes"] - metricsDictionary["datastore_usage_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + 
ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["UsedInBytes"] - metricsDictionary["datastore_usage_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["ProvisionedInBytes"] - metricsDictionary["datastore_usage_zerto_protected_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Protected"]["UsedInBytes"] - metricsDictionary["datastore_usage_zerto_protected_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Protected"]["ProvisionedInBytes"] - metricsDictionary["datastore_usage_zerto_recovery_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Recovery"]["UsedInBytes"] - metricsDictionary["datastore_usage_zerto_recovery_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Recovery"]["ProvisionedInBytes"] - metricsDictionary["datastore_usage_zerto_journal_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Journal"]["UsedInBytes"] - 
metricsDictionary["datastore_usage_zerto_journal_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Journal"]["ProvisionedInBytes"] - metricsDictionary["datastore_usage_zerto_scratch_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Scratch"]["UsedInBytes"] - metricsDictionary["datastore_usage_zerto_scratch_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Scratch"]["ProvisionedInBytes"] - metricsDictionary["datastore_usage_zerto_appliances_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Appliances"]["UsedInBytes"] - metricsDictionary["datastore_usage_zerto_appliances_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Appliances"]["ProvisionedInBytes"] + lbl = dict( + datastoreIdentifier=ds['DatastoreIdentifier'], + DatastoreName=ds['DatastoreName'], + SiteIdentifier=siteId, + SiteName=siteName + ) + g_ds_vras.labels(**lbl).set(ds["Stats"]["NumVRAs"]) + g_ds_incoming_vms.labels(**lbl).set(ds["Stats"]["NumIncomingVMs"]) + g_ds_outgoing_vms.labels(**lbl).set(ds["Stats"]["NumOutgoingVMs"]) + g_ds_capacity.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["CapacityInBytes"]) + 
g_ds_free.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["FreeInBytes"]) + g_ds_used.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["UsedInBytes"]) + g_ds_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["ProvisionedInBytes"]) + g_ds_zerto_protected_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Protected"]["UsedInBytes"]) + g_ds_zerto_protected_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Protected"]["ProvisionedInBytes"]) + g_ds_zerto_recovery_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Recovery"]["UsedInBytes"]) + g_ds_zerto_recovery_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Recovery"]["ProvisionedInBytes"]) + g_ds_zerto_journal_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Journal"]["UsedInBytes"]) + g_ds_zerto_journal_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Journal"]["ProvisionedInBytes"]) + g_ds_zerto_scratch_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Scratch"]["UsedInBytes"]) + g_ds_zerto_scratch_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Scratch"]["ProvisionedInBytes"]) + g_ds_zerto_appliances_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Appliances"]["UsedInBytes"]) + g_ds_zerto_appliances_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Appliances"]["ProvisionedInBytes"]) else: log.debug("No Datastores Found") ## VMs API log.debug("Getting VMs API") - scratch_vols = None - scratch_vols = zvm.vms() - if(scratch_vols is not None): + vms_json = zvm.vms() + if vms_json is not None: log.debug("Got VMs API") - for vm in scratch_vols: + for vm in vms_json: log.debug("Processing VM: " + str(vm['VmName'])) log.debug("Checking VM " + vm['VmIdentifier'] + " on Protected Site " + vm['ProtectedSite']['identifier'] + " against " + siteId) @@ -260,181 +340,160 @@ def GetDataFunc(zvm_instance): if not isinstance(vm["ActualRPO"], int): vm["ActualRPO"] = -1 - metricsDictionary["vm_actualrpo{VmIdentifier=\"" + 
str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + str(siteName) + "\"}"] = vm["ActualRPO"] - metricsDictionary["vm_throughput_in_mb{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + str(siteName) + "\"}"] = vm["ThroughputInMB"] - metricsDictionary["vm_iops{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["IOPs"] - metricsDictionary["vm_journal_hard_limit{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + str(siteName) + "\"}"] = vm["JournalHardLimit"]["LimitValue"] - metricsDictionary["vm_journal_warning_limit{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalWarningThreshold"]["LimitValue"] - metricsDictionary["vm_journal_used_storage_mb{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + 
"\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalUsedStorageMb"] - metricsDictionary["vm_outgoing_bandwidth_in_mbps{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["OutgoingBandWidthInMbps"] - metricsDictionary["vm_used_storage_in_MB{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["UsedStorageInMB"] - metricsDictionary["vm_provisioned_storage_in_MB{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["ProvisionedStorageInMB"] - metricsDictionary["vm_status{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["Status"] - metricsDictionary["vm_substatus{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["SubStatus"] - log.debug("Processed VM: " + str(vm['VmName'])) + lbl = dict( + 
VmIdentifier=str(vm['VmIdentifier']), + VmName=str(vm['VmName']), + VmRecoveryVRA=str(vm["RecoveryHostName"]), + VmPriority=str(vm['Priority']), + SiteIdentifier=str(siteId), + VpgName=str(vm['VpgName']), + SiteName=str(siteName) + ) + g_vm_actualrpo.labels(**lbl).set(vm["ActualRPO"]) + g_vm_throughput.labels(**lbl).set(vm["ThroughputInMB"]) + g_vm_iops.labels(**lbl).set(vm["IOPs"]) + g_vm_journal_hard_limit.labels(**lbl).set(vm["JournalHardLimit"]["LimitValue"]) + g_vm_journal_warning_limit.labels(**lbl).set(vm["JournalWarningThreshold"]["LimitValue"]) + g_vm_journal_used_storage.labels(**lbl).set(vm["JournalUsedStorageMb"]) + g_vm_outgoing_bandwidth.labels(**lbl).set(vm["OutgoingBandWidthInMbps"]) + g_vm_used_storage.labels(**lbl).set(vm["UsedStorageInMB"]) + g_vm_provisioned_storage.labels(**lbl).set(vm["ProvisionedStorageInMB"]) + g_vm_status.labels(**lbl).set(vm["Status"]) + g_vm_substatus.labels(**lbl).set(vm["SubStatus"]) + log.debug("Processed VM: " + str(vm['VmName'])) else: log.debug("VM " + vm['VmIdentifier'] + " is protected to this site") else: log.debug("No VMs Found") - - ## Volumes API for Scratch Volumes + ## Volumes API - Scratch Volumes log.debug("Getting Scratch Volumes") - scratch_vols = None scratch_vols = zvm.volumes(volumetype="scratch") - - if(scratch_vols is not None): + if scratch_vols is not None: log.debug("Got Scratch Volumes API") + # Accumulate per-VM totals before setting gauges (multiple volumes per VM) + scratch_accumulator = {} for volume in scratch_vols: - #metricsDictionary["scratch_volume_provisioned_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = volume["Size"]["ProvisionedInBytes"] - # Determine the key for a given VM, then see if the key is already in the dictionary, if it is add the next disk to the total. If not, create a new key. 
- metrickey = "scratch_volume_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}" - if (metrickey in metricsDictionary): - metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["UsedInBytes"] - else: - metricsDictionary[metrickey] = volume["Size"]["UsedInBytes"] - percentage_used = (volume["Size"]["UsedInBytes"] / volume["Size"]["ProvisionedInBytes"] * 100) - percentage_used = round(percentage_used, 1) - #metricsDictionary["scratch_volume_percentage_used{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = percentage_used + key = ( + volume['ProtectedVm']['Name'], + volume['ProtectedVm']['Identifier'], + volume['OwningVm']['Name'], + volume['Vpg']['Name'] + ) + scratch_accumulator[key] = scratch_accumulator.get(key, 0) + volume["Size"]["UsedInBytes"] + for (pvm, pvmid, owning_vra, vpg_name), size in scratch_accumulator.items(): + g_scratch_vol_size.labels( + ProtectedVm=pvm, ProtectedVmIdentifier=pvmid, + OwningVRA=owning_vra, VpgName=vpg_name, + SiteIdentifier=siteId, SiteName=siteName + ).set(size) else: log.debug("No Scratch Volumes Found") - ## Volumes API for Journal Volumes + ## Volumes API - Journal Volumes log.debug("Getting Journal Volumes") - journal_vols = None journal_vols = zvm.volumes(volumetype="journal") - - if(journal_vols is not None): + if journal_vols is not None: log.debug("Journal Volumes Exist") - for volume in journal_vols : + # Accumulate per-VM totals before setting gauges (multiple volumes per VM) + journal_size_acc = {} + journal_prov_acc = {} + journal_count_acc = {} + for volume in journal_vols: log.debug("Journal Volume: " + 
volume['ProtectedVm']['Name'] + " Calculating total size...") - #metricsDictionary["scratch_volume_provisioned_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = volume["Size"]["ProvisionedInBytes"] - # Determine the key for a given VM, then see if the key is already in the dictionary, if it is add the next disk to the total. If not, create a new key. - metrickey = "vm_journal_volume_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}" - if (metrickey in metricsDictionary): - metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["UsedInBytes"] - else: - metricsDictionary[metrickey] = volume["Size"]["UsedInBytes"] - - metrickey = "vm_journal_volume_provisioned_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}" - if (metrickey in metricsDictionary): - metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["ProvisionedInBytes"] - else: - metricsDictionary[metrickey] = volume["Size"]["ProvisionedInBytes"] - - metrickey = "vm_journal_volume_count{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}" - if (metrickey in metricsDictionary): - metricsDictionary[metrickey] = 
metricsDictionary[metrickey] + 1 - else: - metricsDictionary[metrickey] = 1 + key = ( + volume['ProtectedVm']['Name'], + volume['ProtectedVm']['Identifier'], + volume['OwningVm']['Name'], + volume['Vpg']['Name'] + ) + journal_size_acc[key] = journal_size_acc.get(key, 0) + volume["Size"]["UsedInBytes"] + journal_prov_acc[key] = journal_prov_acc.get(key, 0) + volume["Size"]["ProvisionedInBytes"] + journal_count_acc[key] = journal_count_acc.get(key, 0) + 1 + for key in journal_size_acc: + pvm, pvmid, owning_vra, vpg_name = key + lbl = dict( + ProtectedVm=pvm, ProtectedVmIdentifier=pvmid, + OwningVRA=owning_vra, VpgName=vpg_name, + SiteIdentifier=siteId, SiteName=siteName + ) + g_journal_vol_size.labels(**lbl).set(journal_size_acc[key]) + g_journal_vol_provisioned.labels(**lbl).set(journal_prov_acc[key]) + g_journal_vol_count.labels(**lbl).set(journal_count_acc[key]) else: log.debug("No Journal Volumes Exist") - - ## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus - log.debug("Writing metrics to file") - file_object = open('metrics', 'w') - txt_object = open('metrics.txt', 'w') - for item in metricsDictionary : - file_object.write(item) - file_object.write(" ") - file_object.write(str(metricsDictionary[item])) - file_object.write("\n") - txt_object.write(item) - txt_object.write(" ") - txt_object.write(str(metricsDictionary[item])) - txt_object.write("\n") - - file_object.close() - txt_object.close() - log.debug("Metrics written to file") - # This function will get data every 10 seconds log.debug("Starting Sleep for " + str(scrape_speed) + " seconds") sleep(scrape_speed) else: log.debug("Waiting 1 second for Auth Token") sleep(1) -# get VRA CPU and memory usage from vCenter Server + +# --------------------------------------------------------------------------- +# Get VRA CPU and memory usage from vCenter Server +# --------------------------------------------------------------------------- def 
GetVraMetrics(zvm_instance): log.debug("GetVraMetrics thread started") try: - - metricsDictionary = {} zvm = zvm_instance while True: - vra_names = [] - vras = [] global siteId global siteName log.debug("Checking Token in VRA CPU MEM Collector") - if (zvm.is_authenticated()): + if zvm.is_authenticated(): log.info("VRA CPU MEM Collector Running") - ### VRA API - vras_json = None vras_json = zvm.vras() log.debug(vras_json) - - if (vras_json is not None): + + if vras_json is not None: log.debug("VRA names: %s", vras_json) - log.debug(type(vras)) - for vra in vras_json : - # Gather other VRA Metrics from Zerto API into Metrics Diectionary - metricsDictionary["vra_memory_in_GB{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["MemoryInGB"] - metricsDictionary["vra_vcpu_count{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["NumOfCpus"] - metricsDictionary["vra_protected_vms{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Vms"] - metricsDictionary["vra_protected_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Vpgs"] - metricsDictionary["vra_protected_volumes{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + 
"\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Volumes"] - metricsDictionary["vra_recovery_vms{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Vms"] - metricsDictionary["vra_recovery_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Vpgs"] - metricsDictionary["vra_recovery_volumes{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Volumes"] - metricsDictionary["vra_self_protected_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["SelfProtectedVpgs"] + for vra in vras_json: + lbl = dict( + VraIdentifierStr=vra['VraIdentifierStr'], + VraName=vra['VraName'], + VraVersion=vra['VraVersion'], + HostVersion=vra['HostVersion'], + SiteIdentifier=siteId, + SiteName=siteName + ) + g_vra_memory.labels(**lbl).set(vra["MemoryInGB"]) + g_vra_vcpu_count.labels(**lbl).set(vra["NumOfCpus"]) + g_vra_protected_vms.labels(**lbl).set(vra["ProtectedCounters"]["Vms"]) + g_vra_protected_vpgs.labels(**lbl).set(vra["ProtectedCounters"]["Vpgs"]) + g_vra_protected_vols.labels(**lbl).set(vra["ProtectedCounters"]["Volumes"]) 
+ g_vra_recovery_vms.labels(**lbl).set(vra["RecoveryCounters"]["Vms"]) + g_vra_recovery_vpgs.labels(**lbl).set(vra["RecoveryCounters"]["Vpgs"]) + g_vra_recovery_vols.labels(**lbl).set(vra["RecoveryCounters"]["Volumes"]) + g_vra_self_protected.labels(**lbl).set(vra["SelfProtectedVpgs"]) log.debug("VRA Name: %s", vra['VraName']) log.info(f"vCenter info: T/F = {is_vcenter_set} Host: {vcenter_host} u: {vcenter_user}") - # get the CPU and memory usage for each VRA if is_vcenter_set: - log.debug(f"vCenter Info Is Valid... Trying to get CPU and Memory usage for VRAs") + log.debug("vCenter Info Is Valid... Trying to get CPU and Memory usage for VRAs") try: log.debug("Trying to get stats from vCenter module") vradata = vc_connection.get_cpu_mem_used(vra['VraName']) - for item in vradata: - log.debug(item) - # get the CPU usage and memory usage for the VM - cpu_usage_mhz = vradata[0] - memory_usage_mb = vradata[1] - - # print the CPU and memory usage for the VM - log.debug(f"VRA {vra['VraName']}) has CPU usage of {cpu_usage_mhz} MHz and memory usage of {memory_usage_mb} MB") - metricsDictionary["vra_cpu_usage_mhz{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = cpu_usage_mhz - metricsDictionary["vra_memory_usage_mb{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = memory_usage_mb - except: - log.info(f"No VM found with name {vra['VraName']}, or unexpected response.") + if vradata is not None: + for item in vradata: + log.debug(item) + cpu_usage_mhz = vradata[0] + memory_usage_mb = vradata[1] + log.debug(f"VRA {vra['VraName']}) has CPU usage of {cpu_usage_mhz} MHz and memory usage of 
{memory_usage_mb} MB") + g_vra_cpu_usage.labels(**lbl).set(cpu_usage_mhz) + g_vra_memory_usage.labels(**lbl).set(memory_usage_mb) + else: + log.info(f"No data returned for VRA {vra['VraName']} from vCenter") + except Exception as e: + log.info(f"No VM found with name {vra['VraName']}, or unexpected response: {e}") else: log.debug("No VRAs Found") - - ## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus - file_object = open('vrametrics', 'w') - txt_object = open('vrametrics.txt', 'w') - for item in metricsDictionary : - file_object.write(item) - file_object.write(" ") - file_object.write(str(metricsDictionary[item])) - file_object.write("\n") - txt_object.write(item) - txt_object.write(" ") - txt_object.write(str(metricsDictionary[item])) - txt_object.write("\n") - - file_object.close() - txt_object.close() - # This function will get data every 10 seconds - log.debug("Starting Sleep for " + str(int(scrape_speed *2)) + " seconds") + log.debug("Starting Sleep for " + str(int(scrape_speed * 2)) + " seconds") sleep(scrape_speed * 2) else: log.debug("Waiting 1 second for Auth Token") @@ -442,65 +501,25 @@ def GetVraMetrics(zvm_instance): except Exception as e: log.error(f"Error in GetVraMetrics: {e}") -# function which monitors the threads and restarts them if they die + +# --------------------------------------------------------------------------- +# Monitors thread health and exporter uptime +# --------------------------------------------------------------------------- def ThreadProbe(): global container_id while True: log.debug("Thread Probe Started") - metricsDictionary = {} uptime = round((time() - start_time) / 60, 1) - metricsDictionary["exporter_uptime{ExporterInstance=\"" + container_id + "\"}"] = uptime - if data_thread.is_alive(): - log.debug("Data Thread Is Alive") - metricsDictionary["exporter_thread_status{thread=\"" + "DataStats" + "\",ExporterInstance=\"" + container_id + "\"}"] = 1 - else: - 
log.debug("Data Thread Is NOT Alive") - metricsDictionary["exporter_thread_status{thread=\"" + "DataStats" + "\",ExporterInstance=\"" + container_id + "\"}"] = 0 + g_exporter_uptime.labels(ExporterInstance=container_id).set(uptime) - if stats_thread.is_alive(): - log.debug("Stats Thread Is Alive") - metricsDictionary["exporter_thread_status{thread=\"" + "EncryptionStats" + "\",ExporterInstance=\"" + container_id + "\"}"] = 1 - else: - log.debug("Stats Thread Is NOT Alive") - metricsDictionary["exporter_thread_status{thread=\"" + "EncryptionStats" + "\",ExporterInstance=\"" + container_id + "\"}"] = 0 - - if vra_metrics_thread.is_alive(): - log.debug("VRA Metrics Thread Is Alive") - metricsDictionary["exporter_thread_status{thread=\"" + "VraMetrics" + "\",ExporterInstance=\"" + container_id + "\"}"] = 1 - else: - log.debug("VRA Metrics Thread Is NOT Alive") - metricsDictionary["exporter_thread_status{thread=\"" + "VraMetrics" + "\",ExporterInstance=\"" + container_id + "\"}"] = 0 - - log.debug("Writing Thread data to files") - file_object = open('threads', 'w') - txt_object = open('threads.txt', 'w') - for item in metricsDictionary : - file_object.write(item) - file_object.write(" ") - file_object.write(str(metricsDictionary[item])) - file_object.write("\n") - txt_object.write(item) - txt_object.write(" ") - txt_object.write(str(metricsDictionary[item])) - txt_object.write("\n") - - log.debug("Trying to close Thread txt files") - file_object.close() - txt_object.close() + g_thread_status.labels(thread="DataStats", ExporterInstance=container_id).set(1 if data_thread.is_alive() else 0) + g_thread_status.labels(thread="EncryptionStats",ExporterInstance=container_id).set(1 if stats_thread.is_alive() else 0) + g_thread_status.labels(thread="VraMetrics", ExporterInstance=container_id).set(1 if vra_metrics_thread.is_alive() else 0) log.debug("Probe Thread Going to Sleep") sleep(30) -#----------------run http server on port ----------------- -def WebServer(port): - 
log.info(f"Web Server Starting on port {port}") - - Handler = http.server.SimpleHTTPRequestHandler - - with socketserver.TCPServer(("", port), Handler) as httpd: - log.info(f"Webserver running on port {port}") - httpd.serve_forever() def start_thread(target_func): log.debug(f"Starting thread for {target_func.__name__}") @@ -510,6 +529,7 @@ def start_thread(target_func): log.debug(f"Thread {target_func.__name__} started") return thread + """ Main Program Logic """ @@ -517,10 +537,22 @@ Main Program Logic # Get the hostname of the machine container_id = str(socket.gethostname()) -#set log line format including container_id -log_formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(threadName)s;%(message)s", "%Y-%m-%d %H:%M:%S") -log_handler = RotatingFileHandler(filename=f"./logs/Log-{container_id}.log", maxBytes=1024*1024*100, backupCount=5) -log_handler.setFormatter(log_formatter) +class JsonFormatter(logging.Formatter): + """Formats log records as single-line JSON for container stdout / fluentd ingestion.""" + def format(self, record): + log_entry = { + "time": self.formatTime(record, "%Y-%m-%d %H:%M:%S"), + "level": record.levelname, + "thread": record.threadName, + "message": record.getMessage(), + "container": container_id, + } + if record.exc_info: + log_entry["exception"] = self.formatException(record.exc_info) + return json.dumps(log_entry) + +log_handler = logging.StreamHandler(sys.stdout) +log_handler.setFormatter(JsonFormatter()) log = logging.getLogger("Node-Exporter") log.setLevel(LOGLEVEL) log.addHandler(log_handler) @@ -530,37 +562,33 @@ log.debug("Running with Variables:\nVerify SSL: " + str(verifySSL) + "\nZVM Host # Initialize zvmsite instance zvm_instance = zvmsite( - host=zvm_url, - port=zvm_port, + host=zvm_url, + port=int(zvm_port), username=zvm_username, password=zvm_password, - client_id=client_id, + client_id=client_id, client_secret=client_secret, loglevel=LOGLEVEL, logger=log, - stats=DISABLE_STATS + stats=(DISABLE_STATS != "TRUE") ) 
-# grant_type="client_credentials", -# Start the zvmsite authentication thread +# Start the zvmsite authentication thread zvm_instance.connect() -""" -Global Variables used by the program -""" -local_site_info = None + siteId = None siteName = None -while(siteId is None): +while siteId is None: if zvm_instance.is_authenticated(): sleep(2) log.debug("Trying Set Global Vars") siteId = zvm_instance.site_id siteName = zvm_instance.site_name + else: + sleep(1) -lastStats = CaseInsensitiveDict() - -# Check if vCenter is set, if not disable VRA metrics +# Check if vCenter is set; if not, disable VRA CPU/memory metrics is_vcenter_set = True if vcenter_host == "vcenter.local": log.error("vCenter Host not set. Please set the environment variable VCENTER_HOST, turning off VRA CPU and Memory metrics") @@ -568,47 +596,39 @@ if vcenter_host == "vcenter.local": log.debug("vCenter data collection is enabled") vc_connection = vcsite(vcenter_host, vcenter_user, vcenter_pwd, loglevel="debug", logger=log) -# Starting threads -vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance)) -data_thread = start_thread(lambda: GetDataFunc(zvm_instance)) -stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance)) -log.debug("Starting VRA Metrics") -webserver_thread = start_thread(lambda: WebServer(listen_port)) -probe_thread = start_thread(lambda: ThreadProbe) -log.debug(f"ThreadProbe just started on PID {probe_thread}") +# Start prometheus metrics HTTP server (replaces the file-based SimpleHTTPRequestHandler) +# All Gauges from all threads are served thread-safely at http://host:/metrics +start_http_server(listen_port) +log.info(f"Prometheus metrics server started on port {listen_port}") -# loop indefinitely +# Starting collection threads +vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance)) +data_thread = start_thread(lambda: GetDataFunc(zvm_instance)) +stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance)) +probe_thread = start_thread(ThreadProbe) 
+log.debug("All collection threads started") + +# Loop indefinitely - monitor and restart any crashed threads while True: - # check if any thread has crashed sleep(10) if not probe_thread.is_alive(): - # restart the thread log.error("Probe Thread Died - Restarting") probe_thread = start_thread(ThreadProbe) else: - print("Probe Thread is alive") + log.debug("Probe Thread is alive") if not data_thread.is_alive(): - # restart the thread log.error("Data Thread Died - Restarting") - data_thread = start_thread(GetDataFunc(zvm_instance)) + data_thread = start_thread(lambda: GetDataFunc(zvm_instance)) else: - print("Data API Thread is alive") + log.debug("Data API Thread is alive") if not stats_thread.is_alive(): - # restart the thread log.error("Stats Thread Died - Restarting") stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance)) else: - print("Stats API Thread is alive") + log.debug("Stats API Thread is alive") if not vra_metrics_thread.is_alive(): - # restart the thread log.error("VRA Metrics Thread Died - Restarting") - vra_metrics_thread = start_thread(GetVraMetrics(zvm_instance)) + vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance)) else: - print("VRA Metrics Thread is alive") - if not webserver_thread.is_alive(): - # restart the thread - log.error("Webserver Thread Died - Restarting") - webserver_thread = start_thread(WebServer(listen_port)) - else: - print("WebServer Thread is alive") + log.debug("VRA Metrics Thread is alive") sleep(api_timeout) diff --git a/app/requirements.txt b/app/requirements.txt index f059f6f..ac01332 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1,28 +1,21 @@ +prometheus_client>=0.19.0 annotated-types==0.6.0 async-timeout==4.0.3 backoff==2.2.1 -boto3==1.28.63 -botocore==1.31.63 cachetools==5.3.1 certifi==2023.7.22 charset-normalizer==3.3.0 -docopt==0.6.2 idna==3.4 -jmespath==1.0.1 monotonic==1.6 -posthog==3.0.2 prompt-toolkit==3.0.39 -pydantic -pyflakes==3.1.0 +pydantic>=2.9.0 Pygments==2.16.1 
python-dateutil==2.8.2 pyvim==3.0.3 -pyvmomi==8.0.2.0 -redis==5.0.1 +pyvmomi==9.0.0.0 requests==2.32.0 -s3transfer==0.7.0 six==1.16.0 tinydb==4.8.0 -typing_extensions==4.8.0 +typing_extensions>=4.12.2 urllib3==2.0.6 wcwidth==0.2.8 diff --git a/app/zvma10/zvma.py b/app/zvma10/zvma.py index 59b5db2..bf926f6 100644 --- a/app/zvma10/zvma.py +++ b/app/zvma10/zvma.py @@ -17,7 +17,6 @@ from dateutil import parser from typing import List, Dict, Tuple, Union, Any, Optional from requests.structures import CaseInsensitiveDict from logging.handlers import RotatingFileHandler -#from posthog import Posthog import uuid from requests import Request, Session from .version import VERSION @@ -74,12 +73,6 @@ class zvmsite: # Get UUID self.uuid = self.load_or_generate_uuid() - # Posthog stats setup - #if self.stats: - # self.setup_posthog() - # self.posthog.capture(self.uuid, 'ZVMA10 Python Module Loaded') - # self.log.debug("Sent PostHog Hook") - def __authhandler__(self) -> None: self.log.info(f"Log Level set to {self.LOGLEVEL}") if not self.__connected__: @@ -102,9 +95,11 @@ class zvmsite: } if self.grant_type == "client_credentials": data["client_secret"] = self.client_secret + data["scope"] = "openid" else: data["username"] = self.username data["password"] = self.password + data["scope"] = "openid" uri = self.construct_url(path="auth/realms/zerto/protocol/openid-connect/token") @@ -129,7 +124,6 @@ class zvmsite: self.expiresIn -= 10 else: self.log.info("Authentication thread is already running") - print(f"Auth thread already running") def is_authenticated(self) -> bool: # Assuming self.token is the authentication token and it's set upon successful authentication @@ -172,11 +166,6 @@ class zvmsite: file.write(new_uuid) return new_uuid - #def setup_posthog(self) -> None: - # self.posthog = Posthog(project_api_key='phc_HflqUkx9majhzm8DZva8pTwXFRnOn99onA9xPpK5HaQ', host='https://posthog.jpaul.io') - # self.posthog.debug = True - # self.posthog.identify(distinct_id=self.uuid) - def 
construct_url(self, path="", params=None) -> str: full_url = f"{self.base_url}/{path}" if params: @@ -235,23 +224,6 @@ class zvmsite: response.raise_for_status() self.log.debug(f'API Request: {method} - {url}') - # Posthog stats setup - #if self.stats: - # temp_base, temp_path = self.deconstruct_url(url) - # self.posthog.capture( self.uuid, 'API REQUEST', - # { - # "url": temp_base, - # "port": self.port, - # "endpoint": temp_path, - # "method": method, - # "response_time_ms": int(elapsed_time_ms), - # "verify_ssl": self.verify_ssl, - # "grant_type": self.grant_type, - # "status_code": str(response.status_code), - # "sdk_version": self.__version__ - # }) - # self.log.debug("Sent PostHog Hook") - return response.json() except requests.exceptions.RequestException as e: self.log.error(f"Error while sending API request: {e}") @@ -794,7 +766,7 @@ class zvmsite: return self.make_api_request("GET", uri, headers=self.apiheader) def service_profile(self, serviceProfileIdentifier=None) -> Dict[str, Any]: - if siteidentifier is None: + if serviceProfileIdentifier is None: self.log.error("Service Profile identifier is required for get site function.") raise ValueError("Service Profile identifier is required.") diff --git a/app/zvma9_7/GetStatsFunc.py b/app/zvma9_7/GetStatsFunc.py deleted file mode 100644 index 9d694a0..0000000 --- a/app/zvma9_7/GetStatsFunc.py +++ /dev/null @@ -1,126 +0,0 @@ - -import requests -from requests.packages.urllib3.exceptions import InsecureRequestWarning -from requests.structures import CaseInsensitiveDict -from tinydb import TinyDB, Query -from tinydbstorage.storage import MemoryStorage -from logging.handlers import RotatingFileHandler - -# Function to get VM Encryption Data from ZVMa version 9.7 -def GetStatsFunc(): - tempdb = TinyDB(storage=MemoryStorage) # ('./db.json') used for storing db on disk for debugging - dbvm = Query() - dbvpg = Query() - while (True) : - global token - global siteId - global siteName - - if (token != ""): - 
log.info("Got Auth Token!") - log.debug("token: " + str(token)) - log.debug("Stats Collector Loop Running") - - metricsDictionary = {} - - h2 = CaseInsensitiveDict() - h2["Accept"] = "application/json" - h2["Authorization"] = "Bearer " + token - - ## Statistics API - uri = "https://" + zvm_url + ":" + zvm_port + "/v1/statistics/vms/" - statsapi = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL) - statsapi_json = statsapi.json() - #log.debug(statsapi_json) - - for vm in statsapi_json: - oldvmdata = dict() - - CurrentIops = 0 - CurrentWriteCounterInMBs = 0 - CurrentSyncCounterInMBs = 0 - CurrentNetworkTrafficCounterInMBs = 0 - CurrentEncryptedLBs = 0 - CurrentUnencryptedLBs = 0 - CurrentTotalLBs = 0 - CurrentPercentEncrypted = 0 - VMName = "NA" - - oldvmdata = tempdb.search(dbvm.VmIdentifier == vm['VmIdentifier'] and dbvpg.VpgIdentifier == vm['VpgIdentifier']) - - log.info("Checking TempDB for VM " + vm['VmIdentifier'] + " in VPG " + vm['VpgIdentifier']) - if (oldvmdata): - log.info(vm['VmIdentifier'] + " Record Found, Updating DB") - log.debug(oldvmdata[0]) - log.debug(tempdb.update(vm, dbvm.VmIdentifier == vm['VmIdentifier'] and dbvpg.VpgIdentifier == vm['VpgIdentifier'])) - - log.debug("!@!@!@!@!@ Stats !@!@!@!@!@") - VMName = oldvmdata[0]['VmName'] - log.debug("Current VM " + str(VMName)) - CurrentIops = abs(vm['IoOperationsCounter'] - oldvmdata[0]['IoOperationsCounter']) - log.debug("CurrentIops " + str(CurrentIops)) - CurrentSyncCounterInMBs = abs(vm['SyncCounterInMBs'] - oldvmdata[0]['SyncCounterInMBs']) - log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs)) - CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - oldvmdata[0]['NetworkTrafficCounterInMBs']) - log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs)) - CurrentEncryptedLBs = abs(vm['EncryptionStatistics']['EncryptedDataInLBs'] - oldvmdata[0]['EncryptionStatistics']['EncryptedDataInLBs']) - 
log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs)) - CurrentUnencryptedLBs = abs(vm['EncryptionStatistics']['UnencryptedDataInLBs'] - oldvmdata[0]['EncryptionStatistics']['UnencryptedDataInLBs']) - log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs)) - CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs) - log.debug("CurrentTotalLBs " + str(CurrentTotalLBs)) - if CurrentTotalLBs != 0: - CurrentPercentEncrypted = ((CurrentEncryptedLBs / CurrentTotalLBs) * 100) - else: - CurrentPercentEncrypted = 0 - log.debug("CurrentPercentEncrypted " + str(CurrentPercentEncrypted)) - - else: - log.info(vm['VmIdentifier'] + " No Record Found, Inserting into DB") - #insert original VM record to tempdb - log.debug(tempdb.insert(vm)) - - # update database with VM name, for easier display in Grafana Legends - uri = "https://" + zvm_url + ":" + zvm_port + "/v1/vms/" + vm['VmIdentifier'] +"?vpgIdentifier=" + vm['VpgIdentifier'] - try: - vapi = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL) - vapi_json = vapi.json() - except Exception as e: - log.error("Error while sending api request: " + str(e)) - VMName = "Unknown" - else: - log.debug("vapi_json: " + str(vapi_json)) - tempdb.update({'VmName': vapi_json['VmName']}, dbvm.VmIdentifier == vm['VmIdentifier']) - log.info("Added vm to tempdb " + vm['VmIdentifier'] + " - " + vapi_json['VmName']) - VMName = vapi_json['VmName'] - - # Store Calculated Metrics - metricsDictionary["vm_IoOperationsCounter{VpgIdentifier=\"" + str(vm['VpgIdentifier']) + "\",VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(VMName) + "\",SiteIdentifier=\"" + str(siteId) + "\",SiteName=\"" + str(siteName) + "\"}"] = CurrentIops - metricsDictionary["vm_WriteCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentWriteCounterInMBs - 
metricsDictionary["vm_SyncCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentSyncCounterInMBs - metricsDictionary["vm_NetworkTrafficCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentNetworkTrafficCounterInMBs - metricsDictionary["vm_EncryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentEncryptedLBs - metricsDictionary["vm_UnencryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentUnencryptedLBs - metricsDictionary["vm_TotalDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTotalLBs - metricsDictionary["vm_PercentEncrypted{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentPercentEncrypted - - ## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus - file_object = open('statsmetrics', 'w') - txt_object = open('statsmetrics.txt', 'w') - for item in metricsDictionary : - file_object.write(item) - file_object.write(" ") - file_object.write(str(metricsDictionary[item])) - file_object.write("\n") - txt_object.write(item) - txt_object.write(" ") - txt_object.write(str(metricsDictionary[item])) - txt_object.write("\n") - 
file_object.close() - txt_object.close() - - log.debug("Starting Sleep for " + str(scrape_speed) + " seconds") - sleep(scrape_speed) - else: - log.debug("Waiting 1 second for Auth Token") - sleep(1) \ No newline at end of file diff --git a/app/zvma9_7/__init__.py b/app/zvma9_7/__init__.py deleted file mode 100644 index 0231b62..0000000 --- a/app/zvma9_7/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -print("Initializing zvma9_7 package...") - -from .GetStatsFunc import GetStatsFunc \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index f75259b..da2e082 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: "3.3" - services: zerto-exporter: build: . @@ -7,13 +5,13 @@ services: - "9999:9999" environment: - VERIFY_SSL=False - - ZVM_HOST=192.168.50.60 + - ZVM_HOST=192.168.50.30 - ZVM_PORT=443 - - CLIENT_ID=api-script - - CLIENT_SECRET=fcYMFuA5TkIUwp6b3hDUxim0f32z8erk + #- CLIENT_ID=api-script + #- CLIENT_SECRET=fcYMFuA5TkIUwp6b3hDUxim0f32z8erk + - ZVM_USERNAME=admin + - ZVM_PASSWORD=Zertodata987! - LOGLEVEL=INFO #Valid settings are CRITICAL, ERROR, WARNING, INFO, DEBUG - - VCENTER_HOST=192.168.50.50 + - VCENTER_HOST=192.168.50.20 - VCENTER_USER=administrator@vsphere.local - VCENTER_PASSWORD=Zertodata987! - volumes: - - "./logs:/usr/src/app/logs/"