Refactor to prometheus_client, JSON logging, and codebase cleanup

- Replace file-based metrics and HTTP server with prometheus_client
  (Gauge + start_http_server), eliminating file I/O race conditions
- Fix ThreadProbe and thread restart lambda bugs
- Switch logging from RotatingFileHandler to JSON stdout for fluentd/Loki
- Add PYTHONUNBUFFERED=1 to Dockerfile for immediate container log output
- Upgrade base image from python:3.12.3-slim to python:3.13-slim
- Upgrade pyvmomi to 9.0.0.0; require pydantic>=2.9.0 and typing_extensions>=4.12.2
  so that pre-built Python 3.13 wheels are used (removes the Rust toolchain from the build)
- Remove unused packages: boto3, botocore, s3transfer, jmespath, redis,
  docopt, pyflakes, posthog
- Remove unused imports (Posthog, CaseInsensitiveDict) and dead variables
  (callhomestats, local_site_info, lastStats)
- Fix service_profile() NameError (siteidentifier -> serviceProfileIdentifier)
- Replace the bare print() in zvma.py __authhandler__ with self.log.info()
- Remove all commented-out PostHog blocks from zvma10/zvma.py
- Delete legacy zvma9_7/ module and app/logs/ directory
- Remove deprecated 'version: 3.3' from docker-compose.yml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-20 19:05:45 -05:00
parent a0afcefc78
commit 2836e1ae10
8 changed files with 364 additions and 516 deletions
+3 -8
View File
@@ -1,11 +1,10 @@
FROM python:3.12.3-slim
FROM python:3.13-slim
EXPOSE 9999
# Install system dependencies
RUN apt-get update \
&& apt-get install -y \
curl \
gcc \
libffi-dev \
libssl-dev \
@@ -13,14 +12,12 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Install Rust and Cargo using curl with IPv4 only
RUN CURL_IPRESOLVE=4 curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /usr/src/app
# Set PYTHONPATH to include /usr/src/app
ENV PYTHONPATH=/usr/src/app
# Disable stdout buffering so logs appear immediately in the container console
ENV PYTHONUNBUFFERED=1
# Copy the zerto exporter into the container
COPY app /usr/src/app/
@@ -29,8 +26,6 @@ COPY app /usr/src/app/
RUN [ -f uuid.txt ] && rm uuid.txt || echo "No uuid.txt file to delete"
# Install Python dependencies
# Set environment variable for PyO3 compatibility
ENV PYO3_USE_ABI3_FORWARD_COMPATIBILITY=1
RUN pip install --upgrade pip
RUN pip install --no-cache-dir -r requirements.txt
-1
View File
@@ -1 +0,0 @@
ZVM Exporter Log files will be in this folder
+344 -324
View File
@@ -1,23 +1,18 @@
import requests
import http.server
import socketserver
import os
import ssl
import sys
import json
import logging
from logging.handlers import RotatingFileHandler
import threading
import socket
from pyVim.connect import SmartConnect, Disconnect
from pyVmomi import vim
from time import sleep, time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from requests.structures import CaseInsensitiveDict
from tinydb import TinyDB, Query
from tinydb.storages import MemoryStorage
from prometheus_client import Gauge, start_http_server
from version import VERSION
from vmware.vcenter import vcsite
from zvma10.zvma import zvmsite
from posthog import Posthog
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
global start_time
@@ -28,9 +23,8 @@ Variables: Normally these are imported from the Docker Container, but alternativ
"""
listen_port = int(os.getenv('LISTEN_PORT', 9999))
callhomestats = os.getenv("CALL_HOME_STATS", 'True').lower() in ('false', '0', 'f')
verifySSL = os.getenv("VERIFY_SSL", 'False').lower() in ('true', '1', 't')
zvm_url = os.environ.get('ZVM_HOST', '192.168.50.60')
zvm_url = os.environ.get('ZVM_HOST', '192.168.50.30')
zvm_port = os.environ.get('ZVM_PORT', '443')
zvm_username = os.environ.get('ZVM_USERNAME', 'admin')
zvm_password = os.environ.get('ZVM_PASSWORD', 'Zertodata987!')
@@ -41,31 +35,121 @@ api_timeout = int(os.environ.get('API_TIMEOUT', 5))
LOGLEVEL = os.environ.get('LOGLEVEL', 'DEBUG').upper()
DISABLE_STATS = os.environ.get('DISABLE_STATS', 'FALSE').upper()
version = str(VERSION)
vcenter_host = os.environ.get('VCENTER_HOST', '192.168.50.50')
vcenter_host = os.environ.get('VCENTER_HOST', '192.168.50.20')
vcenter_user = os.environ.get('VCENTER_USER', 'administrator@vsphere.local')
vcenter_pwd = os.environ.get('VCENTER_PASSWORD', 'Zertodata987!')
# Thread which gets VM level encryption statistics from ZVM API
# ---------------------------------------------------------------------------
# Prometheus Gauge definitions
# All metrics are served thread-safely at http://host:<LISTEN_PORT>/metrics
# ---------------------------------------------------------------------------
# Encryption / stats metrics (GetStatsFunc)
# Label names shared by every gauge in this group. GetStatsFunc builds one
# label dict per VM and applies it with .labels(**lbl) before calling .set().
_STATS_LABELS = ['VpgIdentifier', 'VmIdentifier', 'VmName', 'SiteIdentifier', 'SiteName']
# The write, sync, network, encrypted, unencrypted and trend values set on
# these gauges are abs(current - previous) deltas against the sample cached in
# GetStatsFunc's in-memory TinyDB, not the raw counters returned by the API.
# NOTE(review): the IO operations value is presumably computed the same way;
# that calculation is not visible here - confirm.
g_vm_iops_counter = Gauge('vm_IoOperationsCounter', 'VM IO Operations Counter', _STATS_LABELS)
g_vm_write_counter = Gauge('vm_WriteCounterInMBs', 'VM Write Counter In MBs', _STATS_LABELS)
g_vm_sync_counter = Gauge('vm_SyncCounterInMBs', 'VM Sync Counter In MBs', _STATS_LABELS)
g_vm_network_counter = Gauge('vm_NetworkTrafficCounterInMBs', 'VM Network Traffic Counter In MBs', _STATS_LABELS)
g_vm_encrypted_lbs = Gauge('vm_EncryptedDataInLBs', 'VM Encrypted Data In LBs', _STATS_LABELS)
g_vm_unencrypted_lbs = Gauge('vm_UnencryptedDataInLBs', 'VM Unencrypted Data In LBs', _STATS_LABELS)
# Set to abs(encrypted delta + unencrypted delta) by GetStatsFunc.
g_vm_total_lbs = Gauge('vm_TotalDataInLBs', 'VM Total Data In LBs', _STATS_LABELS)
# Set to (encrypted delta / total delta) * 100, or 0 when the total delta is 0.
g_vm_percent_encrypted = Gauge('vm_PercentEncrypted', 'VM Percent Encrypted', _STATS_LABELS)
g_vm_trend_change_level = Gauge('vm_TrendChangeLevel', 'VM Trend Change Level', _STATS_LABELS)
# VPG metrics (GetDataFunc)
# Label names shared by every VPG gauge. GetDataFunc fills them from each entry
# returned by zvm.vpgs() (VpgPriority is str(vpg['Priority'])) plus the global
# siteId / siteName.
_VPG_LABELS = ['VpgIdentifier', 'VpgName', 'VpgPriority', 'SiteIdentifier', 'SiteName']
# Unless noted otherwise, each gauge is set to the like-named field of the VPG
# entry without any transformation.
g_vpg_storage_used = Gauge('vpg_storage_used_in_mb', 'VPG Storage Used In MB', _VPG_LABELS)
g_vpg_actual_rpo = Gauge('vpg_actual_rpo', 'VPG Actual RPO', _VPG_LABELS)
g_vpg_throughput = Gauge('vpg_throughput_in_mb', 'VPG Throughput In MB', _VPG_LABELS)
g_vpg_iops = Gauge('vpg_iops', 'VPG IOPs', _VPG_LABELS)
g_vpg_provisioned_storage = Gauge('vpg_provisioned_storage_in_mb', 'VPG Provisioned Storage In MB', _VPG_LABELS)
g_vpg_vms_count = Gauge('vpg_vms_count', 'VPG VMs Count', _VPG_LABELS)
g_vpg_configured_rpo = Gauge('vpg_configured_rpo_seconds', 'VPG Configured RPO Seconds', _VPG_LABELS)
# The two history gauges read vpg["HistoryStatusApi"][...].
g_vpg_actual_history = Gauge('vpg_actual_history_in_minutes', 'VPG Actual History In Minutes', _VPG_LABELS)
g_vpg_configured_history = Gauge('vpg_configured_history_in_minutes', 'VPG Configured History In Minutes', _VPG_LABELS)
# Both failsafe gauges are set to 0 when vpg["FailSafeHistory"] is None.
g_vpg_failsafe_actual = Gauge('vpg_failsafe_history_in_minutes_actual', 'VPG Failsafe History In Minutes Actual', _VPG_LABELS)
g_vpg_failsafe_configured = Gauge('vpg_failsafe_history_in_minutes_configured', 'VPG Failsafe History In Minutes Configured', _VPG_LABELS)
# NOTE(review): Status / SubStatus / AlertStatus are passed to .set() as returned
# by the API; assumes they are numeric codes - TODO confirm.
g_vpg_status = Gauge('vpg_status', 'VPG Status', _VPG_LABELS)
g_vpg_substatus = Gauge('vpg_substatus', 'VPG Sub-Status', _VPG_LABELS)
g_vpg_alert_status = Gauge('vpg_alert_status', 'VPG Alert Status', _VPG_LABELS)
# Datastore metrics (GetDataFunc)
# Label names shared by every datastore gauge. Note that 'datastoreIdentifier'
# starts with a lower-case letter, unlike the other label names in this module;
# GetDataFunc fills it from ds['DatastoreIdentifier'].
_DS_LABELS = ['datastoreIdentifier', 'DatastoreName', 'SiteIdentifier', 'SiteName']
# Counts read from ds["Stats"].
g_ds_vras = Gauge('datastore_vras', 'Datastore VRAs', _DS_LABELS)
g_ds_incoming_vms = Gauge('datastore_incoming_vms', 'Datastore Incoming VMs', _DS_LABELS)
g_ds_outgoing_vms = Gauge('datastore_outgoing_vms', 'Datastore Outgoing VMs', _DS_LABELS)
# Whole-datastore usage, read from ds["Stats"]["Usage"]["Datastore"].
g_ds_capacity = Gauge('datastore_usage_capacityinbytes', 'Datastore Capacity In Bytes', _DS_LABELS)
g_ds_free = Gauge('datastore_usage_freeinbytes', 'Datastore Free In Bytes', _DS_LABELS)
g_ds_used = Gauge('datastore_usage_usedinbytes', 'Datastore Used In Bytes', _DS_LABELS)
g_ds_provisioned = Gauge('datastore_usage_provisionedinbytes', 'Datastore Provisioned In Bytes', _DS_LABELS)
# Zerto usage, read from ds["Stats"]["Usage"]["Zerto"][<category>], where
# <category> is Protected, Recovery, Journal, Scratch or Appliances.
g_ds_zerto_protected_used = Gauge('datastore_usage_zerto_protected_usedinbytes', 'Datastore Zerto Protected Used In Bytes', _DS_LABELS)
g_ds_zerto_protected_provisioned = Gauge('datastore_usage_zerto_protected_provisionedinbytes', 'Datastore Zerto Protected Provisioned In Bytes', _DS_LABELS)
g_ds_zerto_recovery_used = Gauge('datastore_usage_zerto_recovery_usedinbytes', 'Datastore Zerto Recovery Used In Bytes', _DS_LABELS)
g_ds_zerto_recovery_provisioned = Gauge('datastore_usage_zerto_recovery_provisionedinbytes', 'Datastore Zerto Recovery Provisioned In Bytes', _DS_LABELS)
g_ds_zerto_journal_used = Gauge('datastore_usage_zerto_journal_usedinbytes', 'Datastore Zerto Journal Used In Bytes', _DS_LABELS)
g_ds_zerto_journal_provisioned = Gauge('datastore_usage_zerto_journal_provisionedinbytes', 'Datastore Zerto Journal Provisioned In Bytes', _DS_LABELS)
g_ds_zerto_scratch_used = Gauge('datastore_usage_zerto_scratch_usedinbytes', 'Datastore Zerto Scratch Used In Bytes', _DS_LABELS)
g_ds_zerto_scratch_provisioned = Gauge('datastore_usage_zerto_scratch_provisionedinbytes', 'Datastore Zerto Scratch Provisioned In Bytes', _DS_LABELS)
g_ds_zerto_appliances_used = Gauge('datastore_usage_zerto_appliances_usedinbytes', 'Datastore Zerto Appliances Used In Bytes', _DS_LABELS)
g_ds_zerto_appliances_provisioned = Gauge('datastore_usage_zerto_appliances_provisionedinbytes', 'Datastore Zerto Appliances Provisioned In Bytes', _DS_LABELS)
# VM metrics (GetDataFunc - VMs section)
# Label names shared by every per-VM gauge.
# NOTE(review): presumably VmRecoveryVRA is filled from vm["RecoveryHostName"]
# and VmPriority from vm['Priority'], as in the previous hand-built metric
# strings - confirm against the VMs section of GetDataFunc.
_VM_LABELS = ['VmIdentifier', 'VmName', 'VmRecoveryVRA', 'VmPriority', 'SiteIdentifier', 'VpgName', 'SiteName']
# GetDataFunc replaces a non-integer vm["ActualRPO"] with -1 before reporting it.
g_vm_actualrpo = Gauge('vm_actualrpo', 'VM Actual RPO', _VM_LABELS)
g_vm_throughput = Gauge('vm_throughput_in_mb', 'VM Throughput In MB', _VM_LABELS)
g_vm_iops = Gauge('vm_iops', 'VM IOPs', _VM_LABELS)
g_vm_journal_hard_limit = Gauge('vm_journal_hard_limit', 'VM Journal Hard Limit', _VM_LABELS)
g_vm_journal_warning_limit = Gauge('vm_journal_warning_limit', 'VM Journal Warning Limit', _VM_LABELS)
g_vm_journal_used_storage = Gauge('vm_journal_used_storage_mb', 'VM Journal Used Storage MB', _VM_LABELS)
g_vm_outgoing_bandwidth = Gauge('vm_outgoing_bandwidth_in_mbps', 'VM Outgoing Bandwidth In Mbps', _VM_LABELS)
# The next two metric names end in upper-case "_MB", unlike the lower-case
# "_mb" suffix used elsewhere; renaming them would change the exported series.
g_vm_used_storage = Gauge('vm_used_storage_in_MB', 'VM Used Storage In MB', _VM_LABELS)
g_vm_provisioned_storage = Gauge('vm_provisioned_storage_in_MB', 'VM Provisioned Storage In MB', _VM_LABELS)
g_vm_status = Gauge('vm_status', 'VM Status', _VM_LABELS)
g_vm_substatus = Gauge('vm_substatus', 'VM Sub-Status', _VM_LABELS)
# Scratch and journal volume metrics (GetDataFunc - Volumes sections)
# Label names shared by the scratch-volume and journal-volume gauges.
# NOTE(review): the code that populates these gauges is not visible from here;
# which API fields feed each label and value should be confirmed in GetDataFunc.
_VOL_LABELS = ['ProtectedVm', 'ProtectedVmIdentifier', 'OwningVRA', 'VpgName', 'SiteIdentifier', 'SiteName']
# The scratch gauge's metric name has no "vm_" prefix, unlike the journal gauges.
g_scratch_vol_size = Gauge('scratch_volume_size_in_bytes', 'Scratch Volume Size In Bytes', _VOL_LABELS)
g_journal_vol_size = Gauge('vm_journal_volume_size_in_bytes', 'VM Journal Volume Size In Bytes', _VOL_LABELS)
g_journal_vol_provisioned = Gauge('vm_journal_volume_provisioned_in_bytes','VM Journal Volume Provisioned In Bytes', _VOL_LABELS)
g_journal_vol_count = Gauge('vm_journal_volume_count', 'VM Journal Volume Count', _VOL_LABELS)
# VRA metrics (GetVraMetrics)
# Label names shared by every VRA gauge.
# NOTE(review): GetVraMetrics is not visible from here; the source of each
# label and value should be confirmed there.
_VRA_LABELS = ['VraIdentifierStr', 'VraName', 'VraVersion', 'HostVersion', 'SiteIdentifier', 'SiteName']
# Configuration of the VRA appliance.
g_vra_memory = Gauge('vra_memory_in_GB', 'VRA Memory In GB', _VRA_LABELS)
g_vra_vcpu_count = Gauge('vra_vcpu_count', 'VRA vCPU Count', _VRA_LABELS)
# Protected-side workload counts.
g_vra_protected_vms = Gauge('vra_protected_vms', 'VRA Protected VMs', _VRA_LABELS)
g_vra_protected_vpgs = Gauge('vra_protected_vpgs', 'VRA Protected VPGs', _VRA_LABELS)
g_vra_protected_vols = Gauge('vra_protected_volumes', 'VRA Protected Volumes', _VRA_LABELS)
# Recovery-side workload counts.
g_vra_recovery_vms = Gauge('vra_recovery_vms', 'VRA Recovery VMs', _VRA_LABELS)
g_vra_recovery_vpgs = Gauge('vra_recovery_vpgs', 'VRA Recovery VPGs', _VRA_LABELS)
g_vra_recovery_vols = Gauge('vra_recovery_volumes', 'VRA Recovery Volumes', _VRA_LABELS)
g_vra_self_protected = Gauge('vra_self_protected_vpgs', 'VRA Self-Protected VPGs', _VRA_LABELS)
# Resource usage.
# NOTE(review): presumably sourced from vCenter rather than the ZVM API - confirm.
g_vra_cpu_usage = Gauge('vra_cpu_usage_mhz', 'VRA CPU Usage MHz', _VRA_LABELS)
g_vra_memory_usage = Gauge('vra_memory_usage_mb', 'VRA Memory Usage MB', _VRA_LABELS)
# Exporter / thread health metrics (ThreadProbe)
# NOTE(review): ThreadProbe is not visible from here; the values it assigns to
# the 'ExporterInstance' and 'thread' labels, and the encoding used for the
# thread status value, should be confirmed there.
# Uptime is described as minutes, not seconds.
g_exporter_uptime = Gauge('exporter_uptime', 'Exporter Uptime In Minutes', ['ExporterInstance'])
# One series per (thread, ExporterInstance) pair.
g_thread_status = Gauge('exporter_thread_status', 'Exporter Thread Status', ['thread', 'ExporterInstance'])
# ---------------------------------------------------------------------------
# Thread which gets VM level encryption statistics from ZVM API
# ---------------------------------------------------------------------------
def GetStatsFunc(zvm_instance):
tempdb = TinyDB(storage=MemoryStorage) # ('./db.json') #(storage=MemoryStorage) used for storing db on disk for debugging
tempdb = TinyDB(storage=MemoryStorage)
dbvm = Query()
dbvpg = Query()
dbsite = Query()
zvm = zvm_instance
while (True) :
while True:
global siteId
global siteName
if (zvm.is_authenticated()):
if zvm.is_authenticated():
log.debug("Stats Collector Loop Running")
metricsDictionary = {}
## Statistics API
statsapi_json = None
statsapi_json = zvm.vms_statistics()
statsapi_json = zvm.vms_statistics()
log.debug(statsapi_json)
vms_encryption_metrics = zvm.encryptiondetection_metrics_vms()
@@ -74,8 +158,7 @@ def GetStatsFunc(zvm_instance):
vmsiteinfo = zvm.vm(vmidentifier=vm['VmIdentifier'], vpgidentifier=vm['VpgIdentifier'])
if vmsiteinfo['ProtectedSite']['identifier'] == zvm.site_id:
log.debug(f"VM is protected at this site - {vm['VmIdentifier']}")
oldvmdata = dict()
# this part of the dictionary will never exist, so not sure why i need this as i set the key/values below in the vmem section.
if 'EncryptionMetrics' not in vm:
vm['EncryptionMetrics'] = {}
vm['VmName'] = None
@@ -95,15 +178,14 @@ def GetStatsFunc(zvm_instance):
for vmem in vms_encryption_metrics:
if vmem['Link']['identifier'] == vm['VmIdentifier']:
log.debug(f"Aligning VM Stats and Encryption Metrics for {vm['VmIdentifier']} - {vmem['Link']['name']}")
#print(f"Aligning VM Stats and Encryption Metrics for {vm['VmIdentifier']} - {vmem['Link']['name']}")
vm['EncryptionMetrics']['EncryptedData'] = vmem['EncryptionMetrics']['EncryptedData']
vm['EncryptionMetrics']['EncryptedData'] = vmem['EncryptionMetrics']['EncryptedData']
vm['EncryptionMetrics']['NonEncryptedData'] = vmem['EncryptionMetrics']['NonEncryptedData']
vm['EncryptionMetrics']['TrendChangeLevel'] = vmem['EncryptionMetrics']['TrendChangeLevel']
vm['VmName'] = vmem['Link']['name']
log.info("Checking TempDB for VM " + vm['VmIdentifier'] + " in VPG " + vm['VpgIdentifier'])
oldvmdata = tempdb.search((dbvm.VmIdentifier == vm['VmIdentifier']) & (dbvpg.VpgIdentifier == vm['VpgIdentifier']))
if (oldvmdata):
if oldvmdata:
log.info(vm['VmIdentifier'] + " Record Found, Updating DB")
log.debug("Old Data")
log.debug(oldvmdata)
@@ -119,139 +201,137 @@ def GetStatsFunc(zvm_instance):
log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs))
CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - oldvmdata[0]['NetworkTrafficCounterInMBs'])
log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs))
CurrentWriteCounterInMBs = abs(vm['WriteCounterInMBs'] - oldvmdata[0]['WriteCounterInMBs'])
CurrentWriteCounterInMBs = abs(vm['WriteCounterInMBs'] - oldvmdata[0]['WriteCounterInMBs'])
log.debug("CurrentWriteCounterInMBs " + str(CurrentWriteCounterInMBs))
CurrentEncryptedLBs = abs(vm['EncryptionMetrics']['EncryptedData'] - oldvmdata[0]['EncryptionMetrics']['EncryptedData'])
log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs))
CurrentUnencryptedLBs = abs(vm['EncryptionMetrics']['NonEncryptedData'] - oldvmdata[0]['EncryptionMetrics']['NonEncryptedData'])
log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs))
CurrentTrendChangeLevel = abs(vm['EncryptionMetrics']['TrendChangeLevel'] - oldvmdata[0]['EncryptionMetrics']['TrendChangeLevel'])
CurrentTrendChangeLevel = abs(vm['EncryptionMetrics']['TrendChangeLevel'] - oldvmdata[0]['EncryptionMetrics']['TrendChangeLevel'])
log.debug("CurrentTrendChangeLevel " + str(CurrentTrendChangeLevel))
CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs)
log.debug("CurrentTotalLBs " + str(CurrentTotalLBs))
if CurrentTotalLBs != 0:
CurrentPercentEncrypted = ((CurrentEncryptedLBs / CurrentTotalLBs) * 100)
CurrentPercentEncrypted = (CurrentEncryptedLBs / CurrentTotalLBs) * 100
else:
CurrentPercentEncrypted = 0
log.debug("CurrentPercentEncrypted " + str(CurrentPercentEncrypted))
else:
log.info(f"{vm['VmIdentifier']} - {vm['VmName']} - No Record Found, Inserting into DB")
#insert original VM record to tempdb
log.debug(tempdb.insert(vm))
# Store Calculated Metrics
metricsDictionary["vm_IoOperationsCounter{VpgIdentifier=\"" + str(vm['VpgIdentifier']) + "\",VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + str(siteId) + "\",SiteName=\"" + str(siteName) + "\"}"] = CurrentIops
metricsDictionary["vm_WriteCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentWriteCounterInMBs
metricsDictionary["vm_SyncCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentSyncCounterInMBs
metricsDictionary["vm_NetworkTrafficCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentNetworkTrafficCounterInMBs
metricsDictionary["vm_EncryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentEncryptedLBs
metricsDictionary["vm_UnencryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentUnencryptedLBs
metricsDictionary["vm_TotalDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTotalLBs
metricsDictionary["vm_PercentEncrypted{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentPercentEncrypted
metricsDictionary["vm_TrendChangeLevel{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTrendChangeLevel
# Push calculated metrics to Prometheus Gauges
lbl = dict(
VpgIdentifier=str(vm['VpgIdentifier']),
VmIdentifier=str(vm['VmIdentifier']),
VmName=str(vm['VmName']),
SiteIdentifier=str(siteId),
SiteName=str(siteName)
)
g_vm_iops_counter.labels(**lbl).set(CurrentIops)
g_vm_write_counter.labels(**lbl).set(CurrentWriteCounterInMBs)
g_vm_sync_counter.labels(**lbl).set(CurrentSyncCounterInMBs)
g_vm_network_counter.labels(**lbl).set(CurrentNetworkTrafficCounterInMBs)
g_vm_encrypted_lbs.labels(**lbl).set(CurrentEncryptedLBs)
g_vm_unencrypted_lbs.labels(**lbl).set(CurrentUnencryptedLBs)
g_vm_total_lbs.labels(**lbl).set(CurrentTotalLBs)
g_vm_percent_encrypted.labels(**lbl).set(CurrentPercentEncrypted)
g_vm_trend_change_level.labels(**lbl).set(CurrentTrendChangeLevel)
else:
log.debug(f"VM is only recovering to this site, skipping metrics - {vm['VmIdentifier']}")
#print(f"VM is only recovering to this site, skipping metrics - {vm['VmIdentifier']}")
else:
log.debug("No VMS in Stats API")
## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus
file_object = open('statsmetrics', 'w')
txt_object = open('statsmetrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
sleep(scrape_speed)
else:
log.debug("Waiting 1 second for Auth Token")
sleep(1)
# Function which retrieves stats from various ZVM APIs and stores them in a metrics file
# ---------------------------------------------------------------------------
# Function which retrieves stats from various ZVM APIs
# ---------------------------------------------------------------------------
def GetDataFunc(zvm_instance):
tempdb = TinyDB(storage=MemoryStorage)
dbvm = Query()
zvm = zvm_instance
while (True) :
while True:
global siteId
global siteName
if (zvm.is_authenticated()):
if zvm.is_authenticated():
log.info("Data Collector Loop Running")
metricsDictionary = {}
### VPGs API
vpg_json = None
vpg_json = zvm.vpgs()
if(vpg_json is not None):
vpg_json = zvm.vpgs()
if vpg_json is not None:
log.debug("Got VPG JSON")
for vpg in vpg_json :
metricsDictionary["vpg_storage_used_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["UsedStorageInMB"]
metricsDictionary["vpg_actual_rpo{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ActualRPO"]
metricsDictionary["vpg_throughput_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ThroughputInMB"]
metricsDictionary["vpg_iops{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["IOPs"]
metricsDictionary["vpg_provisioned_storage_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ProvisionedStorageInMB"]
metricsDictionary["vpg_vms_count{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["VmsCount"]
metricsDictionary["vpg_configured_rpo_seconds{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ConfiguredRpoSeconds"]
metricsDictionary["vpg_actual_history_in_minutes{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["HistoryStatusApi"]["ActualHistoryInMinutes"]
metricsDictionary["vpg_configured_history_in_minutes{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["HistoryStatusApi"]["ConfiguredHistoryInMinutes"]
if(vpg["FailSafeHistory"] is None):
metricsDictionary["vpg_failsafe_history_in_minutes_actual{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = 0
metricsDictionary["vpg_failsafe_history_in_minutes_configured{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = 0
for vpg in vpg_json:
lbl = dict(
VpgIdentifier=vpg['VpgIdentifier'],
VpgName=vpg['VpgName'],
VpgPriority=str(vpg['Priority']),
SiteIdentifier=siteId,
SiteName=siteName
)
g_vpg_storage_used.labels(**lbl).set(vpg["UsedStorageInMB"])
g_vpg_actual_rpo.labels(**lbl).set(vpg["ActualRPO"])
g_vpg_throughput.labels(**lbl).set(vpg["ThroughputInMB"])
g_vpg_iops.labels(**lbl).set(vpg["IOPs"])
g_vpg_provisioned_storage.labels(**lbl).set(vpg["ProvisionedStorageInMB"])
g_vpg_vms_count.labels(**lbl).set(vpg["VmsCount"])
g_vpg_configured_rpo.labels(**lbl).set(vpg["ConfiguredRpoSeconds"])
g_vpg_actual_history.labels(**lbl).set(vpg["HistoryStatusApi"]["ActualHistoryInMinutes"])
g_vpg_configured_history.labels(**lbl).set(vpg["HistoryStatusApi"]["ConfiguredHistoryInMinutes"])
if vpg["FailSafeHistory"] is None:
g_vpg_failsafe_actual.labels(**lbl).set(0)
g_vpg_failsafe_configured.labels(**lbl).set(0)
else:
metricsDictionary["vpg_failsafe_history_in_minutes_actual{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["FailSafeHistory"]["ActualFailSafeHistory"]
metricsDictionary["vpg_failsafe_history_in_minutes_configured{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["FailSafeHistory"]["ConfiguredFailSafeHistory"]
metricsDictionary["vpg_status{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["Status"]
metricsDictionary["vpg_substatus{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["SubStatus"]
metricsDictionary["vpg_alert_status{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["AlertStatus"]
g_vpg_failsafe_actual.labels(**lbl).set(vpg["FailSafeHistory"]["ActualFailSafeHistory"])
g_vpg_failsafe_configured.labels(**lbl).set(vpg["FailSafeHistory"]["ConfiguredFailSafeHistory"])
g_vpg_status.labels(**lbl).set(vpg["Status"])
g_vpg_substatus.labels(**lbl).set(vpg["SubStatus"])
g_vpg_alert_status.labels(**lbl).set(vpg["AlertStatus"])
else:
log.debug("No VPGs Found")
### Datastores APIs
ds_json = None
ds_json = zvm.datastores()
if(ds_json is not None):
### Datastores API
ds_json = zvm.datastores()
if ds_json is not None:
log.debug("Got Datastores API")
for ds in ds_json :
for ds in ds_json:
log.debug(f"Processing {ds['DatastoreName']}")
metricsDictionary["datastore_vras{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumVRAs"]
metricsDictionary["datastore_incoming_vms{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumIncomingVMs"]
metricsDictionary["datastore_outgoing_vms{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumOutgoingVMs"]
metricsDictionary["datastore_usage_capacityinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["CapacityInBytes"]
metricsDictionary["datastore_usage_freeinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["FreeInBytes"]
metricsDictionary["datastore_usage_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["UsedInBytes"]
metricsDictionary["datastore_usage_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_protected_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Protected"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_protected_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Protected"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_recovery_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Recovery"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_recovery_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Recovery"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_journal_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Journal"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_journal_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Journal"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_scratch_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Scratch"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_scratch_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Scratch"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_appliances_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Appliances"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_appliances_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Appliances"]["ProvisionedInBytes"]
lbl = dict(
datastoreIdentifier=ds['DatastoreIdentifier'],
DatastoreName=ds['DatastoreName'],
SiteIdentifier=siteId,
SiteName=siteName
)
g_ds_vras.labels(**lbl).set(ds["Stats"]["NumVRAs"])
g_ds_incoming_vms.labels(**lbl).set(ds["Stats"]["NumIncomingVMs"])
g_ds_outgoing_vms.labels(**lbl).set(ds["Stats"]["NumOutgoingVMs"])
g_ds_capacity.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["CapacityInBytes"])
g_ds_free.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["FreeInBytes"])
g_ds_used.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["UsedInBytes"])
g_ds_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Datastore"]["ProvisionedInBytes"])
g_ds_zerto_protected_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Protected"]["UsedInBytes"])
g_ds_zerto_protected_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Protected"]["ProvisionedInBytes"])
g_ds_zerto_recovery_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Recovery"]["UsedInBytes"])
g_ds_zerto_recovery_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Recovery"]["ProvisionedInBytes"])
g_ds_zerto_journal_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Journal"]["UsedInBytes"])
g_ds_zerto_journal_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Journal"]["ProvisionedInBytes"])
g_ds_zerto_scratch_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Scratch"]["UsedInBytes"])
g_ds_zerto_scratch_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Scratch"]["ProvisionedInBytes"])
g_ds_zerto_appliances_used.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Appliances"]["UsedInBytes"])
g_ds_zerto_appliances_provisioned.labels(**lbl).set(ds["Stats"]["Usage"]["Zerto"]["Appliances"]["ProvisionedInBytes"])
else:
log.debug("No Datastores Found")
## VMs API
log.debug("Getting VMs API")
scratch_vols = None
scratch_vols = zvm.vms()
if(scratch_vols is not None):
vms_json = zvm.vms()
if vms_json is not None:
log.debug("Got VMs API")
for vm in scratch_vols:
for vm in vms_json:
log.debug("Processing VM: " + str(vm['VmName']))
log.debug("Checking VM " + vm['VmIdentifier'] + " on Protected Site " + vm['ProtectedSite']['identifier'] + " against " + siteId)
@@ -260,181 +340,160 @@ def GetDataFunc(zvm_instance):
if not isinstance(vm["ActualRPO"], int):
vm["ActualRPO"] = -1
metricsDictionary["vm_actualrpo{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + str(siteName) + "\"}"] = vm["ActualRPO"]
metricsDictionary["vm_throughput_in_mb{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + str(siteName) + "\"}"] = vm["ThroughputInMB"]
metricsDictionary["vm_iops{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["IOPs"]
metricsDictionary["vm_journal_hard_limit{VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + str(siteName) + "\"}"] = vm["JournalHardLimit"]["LimitValue"]
metricsDictionary["vm_journal_warning_limit{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalWarningThreshold"]["LimitValue"]
metricsDictionary["vm_journal_used_storage_mb{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalUsedStorageMb"]
metricsDictionary["vm_outgoing_bandwidth_in_mbps{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["OutgoingBandWidthInMbps"]
metricsDictionary["vm_used_storage_in_MB{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["UsedStorageInMB"]
metricsDictionary["vm_provisioned_storage_in_MB{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["ProvisionedStorageInMB"]
metricsDictionary["vm_status{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["Status"]
metricsDictionary["vm_substatus{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + str(vm['VmName']) + "\",VmRecoveryVRA=\"" + str(vm["RecoveryHostName"]) + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + str(siteId) + "\",VpgName=\"" + str(vm['VpgName']) + "\",SiteName=\"" + siteName + "\"}"] = vm["SubStatus"]
log.debug("Processed VM: " + str(vm['VmName']))
lbl = dict(
VmIdentifier=str(vm['VmIdentifier']),
VmName=str(vm['VmName']),
VmRecoveryVRA=str(vm["RecoveryHostName"]),
VmPriority=str(vm['Priority']),
SiteIdentifier=str(siteId),
VpgName=str(vm['VpgName']),
SiteName=str(siteName)
)
g_vm_actualrpo.labels(**lbl).set(vm["ActualRPO"])
g_vm_throughput.labels(**lbl).set(vm["ThroughputInMB"])
g_vm_iops.labels(**lbl).set(vm["IOPs"])
g_vm_journal_hard_limit.labels(**lbl).set(vm["JournalHardLimit"]["LimitValue"])
g_vm_journal_warning_limit.labels(**lbl).set(vm["JournalWarningThreshold"]["LimitValue"])
g_vm_journal_used_storage.labels(**lbl).set(vm["JournalUsedStorageMb"])
g_vm_outgoing_bandwidth.labels(**lbl).set(vm["OutgoingBandWidthInMbps"])
g_vm_used_storage.labels(**lbl).set(vm["UsedStorageInMB"])
g_vm_provisioned_storage.labels(**lbl).set(vm["ProvisionedStorageInMB"])
g_vm_status.labels(**lbl).set(vm["Status"])
g_vm_substatus.labels(**lbl).set(vm["SubStatus"])
log.debug("Processed VM: " + str(vm['VmName']))
else:
log.debug("VM " + vm['VmIdentifier'] + " is protected to this site")
else:
log.debug("No VMs Found")
## Volumes API for Scratch Volumes
## Volumes API - Scratch Volumes
log.debug("Getting Scratch Volumes")
scratch_vols = None
scratch_vols = zvm.volumes(volumetype="scratch")
if(scratch_vols is not None):
if scratch_vols is not None:
log.debug("Got Scratch Volumes API")
# Accumulate per-VM totals before setting gauges (multiple volumes per VM)
scratch_accumulator = {}
for volume in scratch_vols:
#metricsDictionary["scratch_volume_provisioned_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = volume["Size"]["ProvisionedInBytes"]
# Determine the key for a given VM, then see if the key is already in the dictionary, if it is add the next disk to the total. If not, create a new key.
metrickey = "scratch_volume_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["UsedInBytes"]
else:
metricsDictionary[metrickey] = volume["Size"]["UsedInBytes"]
percentage_used = (volume["Size"]["UsedInBytes"] / volume["Size"]["ProvisionedInBytes"] * 100)
percentage_used = round(percentage_used, 1)
#metricsDictionary["scratch_volume_percentage_used{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = percentage_used
key = (
volume['ProtectedVm']['Name'],
volume['ProtectedVm']['Identifier'],
volume['OwningVm']['Name'],
volume['Vpg']['Name']
)
scratch_accumulator[key] = scratch_accumulator.get(key, 0) + volume["Size"]["UsedInBytes"]
for (pvm, pvmid, owning_vra, vpg_name), size in scratch_accumulator.items():
g_scratch_vol_size.labels(
ProtectedVm=pvm, ProtectedVmIdentifier=pvmid,
OwningVRA=owning_vra, VpgName=vpg_name,
SiteIdentifier=siteId, SiteName=siteName
).set(size)
else:
log.debug("No Scratch Volumes Found")
## Volumes API for Journal Volumes
## Volumes API - Journal Volumes
log.debug("Getting Journal Volumes")
journal_vols = None
journal_vols = zvm.volumes(volumetype="journal")
if(journal_vols is not None):
if journal_vols is not None:
log.debug("Journal Volumes Exist")
for volume in journal_vols :
# Accumulate per-VM totals before setting gauges (multiple volumes per VM)
journal_size_acc = {}
journal_prov_acc = {}
journal_count_acc = {}
for volume in journal_vols:
log.debug("Journal Volume: " + volume['ProtectedVm']['Name'] + " Calculating total size...")
#metricsDictionary["scratch_volume_provisioned_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = volume["Size"]["ProvisionedInBytes"]
# Determine the key for a given VM, then see if the key is already in the dictionary, if it is add the next disk to the total. If not, create a new key.
metrickey = "vm_journal_volume_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["UsedInBytes"]
else:
metricsDictionary[metrickey] = volume["Size"]["UsedInBytes"]
metrickey = "vm_journal_volume_provisioned_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["ProvisionedInBytes"]
else:
metricsDictionary[metrickey] = volume["Size"]["ProvisionedInBytes"]
metrickey = "vm_journal_volume_count{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",VpgName=\"" + str(volume['Vpg']['Name']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + 1
else:
metricsDictionary[metrickey] = 1
key = (
volume['ProtectedVm']['Name'],
volume['ProtectedVm']['Identifier'],
volume['OwningVm']['Name'],
volume['Vpg']['Name']
)
journal_size_acc[key] = journal_size_acc.get(key, 0) + volume["Size"]["UsedInBytes"]
journal_prov_acc[key] = journal_prov_acc.get(key, 0) + volume["Size"]["ProvisionedInBytes"]
journal_count_acc[key] = journal_count_acc.get(key, 0) + 1
for key in journal_size_acc:
pvm, pvmid, owning_vra, vpg_name = key
lbl = dict(
ProtectedVm=pvm, ProtectedVmIdentifier=pvmid,
OwningVRA=owning_vra, VpgName=vpg_name,
SiteIdentifier=siteId, SiteName=siteName
)
g_journal_vol_size.labels(**lbl).set(journal_size_acc[key])
g_journal_vol_provisioned.labels(**lbl).set(journal_prov_acc[key])
g_journal_vol_count.labels(**lbl).set(journal_count_acc[key])
else:
log.debug("No Journal Volumes Exist")
## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus
log.debug("Writing metrics to file")
file_object = open('metrics', 'w')
txt_object = open('metrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
log.debug("Metrics written to file")
# This function will get data every 10 seconds
log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
sleep(scrape_speed)
else:
log.debug("Waiting 1 second for Auth Token")
sleep(1)
# get VRA CPU and memory usage from vCenter Server
# ---------------------------------------------------------------------------
# Get VRA CPU and memory usage from vCenter Server
# ---------------------------------------------------------------------------
def GetVraMetrics(zvm_instance):
log.debug("GetVraMetrics thread started")
try:
metricsDictionary = {}
zvm = zvm_instance
while True:
vra_names = []
vras = []
global siteId
global siteName
log.debug("Checking Token in VRA CPU MEM Collector")
if (zvm.is_authenticated()):
if zvm.is_authenticated():
log.info("VRA CPU MEM Collector Running")
### VRA API
vras_json = None
vras_json = zvm.vras()
log.debug(vras_json)
if (vras_json is not None):
if vras_json is not None:
log.debug("VRA names: %s", vras_json)
log.debug(type(vras))
for vra in vras_json :
# Gather other VRA Metrics from Zerto API into Metrics Diectionary
metricsDictionary["vra_memory_in_GB{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["MemoryInGB"]
metricsDictionary["vra_vcpu_count{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["NumOfCpus"]
metricsDictionary["vra_protected_vms{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Vms"]
metricsDictionary["vra_protected_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Vpgs"]
metricsDictionary["vra_protected_volumes{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Volumes"]
metricsDictionary["vra_recovery_vms{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Vms"]
metricsDictionary["vra_recovery_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Vpgs"]
metricsDictionary["vra_recovery_volumes{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Volumes"]
metricsDictionary["vra_self_protected_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["SelfProtectedVpgs"]
for vra in vras_json:
lbl = dict(
VraIdentifierStr=vra['VraIdentifierStr'],
VraName=vra['VraName'],
VraVersion=vra['VraVersion'],
HostVersion=vra['HostVersion'],
SiteIdentifier=siteId,
SiteName=siteName
)
g_vra_memory.labels(**lbl).set(vra["MemoryInGB"])
g_vra_vcpu_count.labels(**lbl).set(vra["NumOfCpus"])
g_vra_protected_vms.labels(**lbl).set(vra["ProtectedCounters"]["Vms"])
g_vra_protected_vpgs.labels(**lbl).set(vra["ProtectedCounters"]["Vpgs"])
g_vra_protected_vols.labels(**lbl).set(vra["ProtectedCounters"]["Volumes"])
g_vra_recovery_vms.labels(**lbl).set(vra["RecoveryCounters"]["Vms"])
g_vra_recovery_vpgs.labels(**lbl).set(vra["RecoveryCounters"]["Vpgs"])
g_vra_recovery_vols.labels(**lbl).set(vra["RecoveryCounters"]["Volumes"])
g_vra_self_protected.labels(**lbl).set(vra["SelfProtectedVpgs"])
log.debug("VRA Name: %s", vra['VraName'])
log.info(f"vCenter info: T/F = {is_vcenter_set} Host: {vcenter_host} u: {vcenter_user}")
# get the CPU and memory usage for each VRA
if is_vcenter_set:
log.debug(f"vCenter Info Is Valid... Trying to get CPU and Memory usage for VRAs")
log.debug("vCenter Info Is Valid... Trying to get CPU and Memory usage for VRAs")
try:
log.debug("Trying to get stats from vCenter module")
vradata = vc_connection.get_cpu_mem_used(vra['VraName'])
for item in vradata:
log.debug(item)
# get the CPU usage and memory usage for the VM
cpu_usage_mhz = vradata[0]
memory_usage_mb = vradata[1]
# print the CPU and memory usage for the VM
log.debug(f"VRA {vra['VraName']}) has CPU usage of {cpu_usage_mhz} MHz and memory usage of {memory_usage_mb} MB")
metricsDictionary["vra_cpu_usage_mhz{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = cpu_usage_mhz
metricsDictionary["vra_memory_usage_mb{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = memory_usage_mb
except:
log.info(f"No VM found with name {vra['VraName']}, or unexpected response.")
if vradata is not None:
for item in vradata:
log.debug(item)
cpu_usage_mhz = vradata[0]
memory_usage_mb = vradata[1]
log.debug(f"VRA {vra['VraName']}) has CPU usage of {cpu_usage_mhz} MHz and memory usage of {memory_usage_mb} MB")
g_vra_cpu_usage.labels(**lbl).set(cpu_usage_mhz)
g_vra_memory_usage.labels(**lbl).set(memory_usage_mb)
else:
log.info(f"No data returned for VRA {vra['VraName']} from vCenter")
except Exception as e:
log.info(f"No VM found with name {vra['VraName']}, or unexpected response: {e}")
else:
log.debug("No VRAs Found")
## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus
file_object = open('vrametrics', 'w')
txt_object = open('vrametrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
# This function will get data every 10 seconds
log.debug("Starting Sleep for " + str(int(scrape_speed *2)) + " seconds")
log.debug("Starting Sleep for " + str(int(scrape_speed * 2)) + " seconds")
sleep(scrape_speed * 2)
else:
log.debug("Waiting 1 second for Auth Token")
@@ -442,65 +501,25 @@ def GetVraMetrics(zvm_instance):
except Exception as e:
log.error(f"Error in GetVraMetrics: {e}")
# function which monitors the threads and restarts them if they die
# ---------------------------------------------------------------------------
# Monitors thread health and exporter uptime
# ---------------------------------------------------------------------------
def ThreadProbe():
    """Publish exporter uptime and collector-thread liveness as Prometheus gauges.

    Runs forever: every 30 seconds it sets ``g_exporter_uptime`` (minutes since
    ``start_time``, rounded to one decimal) and one ``g_thread_status`` sample
    per collector thread (1 = alive, 0 = dead), all labelled with this
    exporter's ``container_id``.
    """
    while True:
        log.debug("Thread Probe Started")
        uptime = round((time() - start_time) / 60, 1)
        g_exporter_uptime.labels(ExporterInstance=container_id).set(uptime)

        # Look the thread handles up on every pass rather than once outside the
        # loop: the module-level monitor loop rebinds these names when it
        # restarts a dead thread, and the probe must report on the current one.
        monitored = (
            ("DataStats", data_thread),
            ("EncryptionStats", stats_thread),
            ("VraMetrics", vra_metrics_thread),
        )
        for thread_label, thread in monitored:
            g_thread_status.labels(
                thread=thread_label, ExporterInstance=container_id
            ).set(1 if thread.is_alive() else 0)

        log.debug("Probe Thread Going to Sleep")
        sleep(30)
#----------------run http server on port -----------------
def WebServer(port):
    """Serve files over HTTP on ``port`` until the process exits.

    Binds a ``socketserver.TCPServer`` on all interfaces and hands every
    request to ``http.server.SimpleHTTPRequestHandler``. This call blocks in
    ``serve_forever()``, so it is meant to be run on its own thread.
    """
    log.info(f"Web Server Starting on port {port}")
    bind_address = ("", port)
    with socketserver.TCPServer(bind_address, http.server.SimpleHTTPRequestHandler) as server:
        log.info(f"Webserver running on port {port}")
        server.serve_forever()
def start_thread(target_func):
log.debug(f"Starting thread for {target_func.__name__}")
@@ -510,6 +529,7 @@ def start_thread(target_func):
log.debug(f"Thread {target_func.__name__} started")
return thread
"""
Main Program Logic
"""
@@ -517,10 +537,22 @@ Main Program Logic
# Get the hostname of the machine
container_id = str(socket.gethostname())
#set log line format including container_id
log_formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(threadName)s;%(message)s", "%Y-%m-%d %H:%M:%S")
log_handler = RotatingFileHandler(filename=f"./logs/Log-{container_id}.log", maxBytes=1024*1024*100, backupCount=5)
log_handler.setFormatter(log_formatter)
class JsonFormatter(logging.Formatter):
    """Formats log records as single-line JSON for container stdout / fluentd ingestion.

    Each record becomes one JSON object with the keys ``time``, ``level``,
    ``thread``, ``message`` and ``container``; ``exception`` and ``stack`` are
    added only when the record carries that information.

    Args:
        *args, **kwargs: forwarded unchanged to ``logging.Formatter``.
        container: optional value for the ``container`` field. When omitted,
            the module-level ``container_id`` is read at format time.
    """

    def __init__(self, *args, container=None, **kwargs):
        super().__init__(*args, **kwargs)
        self._container = container

    def format(self, record):
        """Return ``record`` serialized as a single-line JSON string."""
        log_entry = {
            "time": self.formatTime(record, "%Y-%m-%d %H:%M:%S"),
            "level": record.levelname,
            "thread": record.threadName,
            "message": record.getMessage(),
            # Resolve the global lazily so the class can be defined before
            # container_id is assigned.
            "container": container_id if self._container is None else self._container,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        if record.stack_info:
            # Keep stack traces requested via stack_info=True instead of
            # silently dropping them.
            log_entry["stack"] = self.formatStack(record.stack_info)
        # json.dumps escapes embedded newlines, so a multi-line traceback
        # still yields exactly one output line.
        return json.dumps(log_entry)
log_handler = logging.StreamHandler(sys.stdout)
log_handler.setFormatter(JsonFormatter())
log = logging.getLogger("Node-Exporter")
log.setLevel(LOGLEVEL)
log.addHandler(log_handler)
@@ -531,36 +563,32 @@ log.debug("Running with Variables:\nVerify SSL: " + str(verifySSL) + "\nZVM Host
# Initialize zvmsite instance
zvm_instance = zvmsite(
host=zvm_url,
port=zvm_port,
port=int(zvm_port),
username=zvm_username,
password=zvm_password,
client_id=client_id,
client_secret=client_secret,
loglevel=LOGLEVEL,
logger=log,
stats=DISABLE_STATS
stats=(DISABLE_STATS != "TRUE")
)
# grant_type="client_credentials",
# Start the zvmsite authentication thread
# Start the zvmsite authentication thread
zvm_instance.connect()
"""
Global Variables used by the program
"""
local_site_info = None
siteId = None
siteName = None
while(siteId is None):
while siteId is None:
if zvm_instance.is_authenticated():
sleep(2)
log.debug("Trying Set Global Vars")
siteId = zvm_instance.site_id
siteName = zvm_instance.site_name
else:
sleep(1)
lastStats = CaseInsensitiveDict()
# Check if vCenter is set, if not disable VRA metrics
# Check if vCenter is set; if not, disable VRA CPU/memory metrics
is_vcenter_set = True
if vcenter_host == "vcenter.local":
log.error("vCenter Host not set. Please set the environment variable VCENTER_HOST, turning off VRA CPU and Memory metrics")
@@ -568,47 +596,39 @@ if vcenter_host == "vcenter.local":
log.debug("vCenter data collection is enabled")
vc_connection = vcsite(vcenter_host, vcenter_user, vcenter_pwd, loglevel="debug", logger=log)
# Starting threads
vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance))
data_thread = start_thread(lambda: GetDataFunc(zvm_instance))
stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance))
log.debug("Starting VRA Metrics")
webserver_thread = start_thread(lambda: WebServer(listen_port))
probe_thread = start_thread(lambda: ThreadProbe)
log.debug(f"ThreadProbe just started on PID {probe_thread}")
# Start prometheus metrics HTTP server (replaces the file-based SimpleHTTPRequestHandler)
# All Gauges from all threads are served thread-safely at http://host:<LISTEN_PORT>/metrics
start_http_server(listen_port)
log.info(f"Prometheus metrics server started on port {listen_port}")
# loop indefinitely
# Starting collection threads
vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance))
data_thread = start_thread(lambda: GetDataFunc(zvm_instance))
stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance))
probe_thread = start_thread(ThreadProbe)
log.debug("All collection threads started")
# Loop indefinitely - monitor and restart any crashed threads
while True:
# check if any thread has crashed
sleep(10)
if not probe_thread.is_alive():
# restart the thread
log.error("Probe Thread Died - Restarting")
probe_thread = start_thread(ThreadProbe)
else:
print("Probe Thread is alive")
log.debug("Probe Thread is alive")
if not data_thread.is_alive():
# restart the thread
log.error("Data Thread Died - Restarting")
data_thread = start_thread(GetDataFunc(zvm_instance))
data_thread = start_thread(lambda: GetDataFunc(zvm_instance))
else:
print("Data API Thread is alive")
log.debug("Data API Thread is alive")
if not stats_thread.is_alive():
# restart the thread
log.error("Stats Thread Died - Restarting")
stats_thread = start_thread(lambda: GetStatsFunc(zvm_instance))
else:
print("Stats API Thread is alive")
log.debug("Stats API Thread is alive")
if not vra_metrics_thread.is_alive():
# restart the thread
log.error("VRA Metrics Thread Died - Restarting")
vra_metrics_thread = start_thread(GetVraMetrics(zvm_instance))
vra_metrics_thread = start_thread(lambda: GetVraMetrics(zvm_instance))
else:
print("VRA Metrics Thread is alive")
if not webserver_thread.is_alive():
# restart the thread
log.error("Webserver Thread Died - Restarting")
webserver_thread = start_thread(WebServer(listen_port))
else:
print("WebServer Thread is alive")
log.debug("VRA Metrics Thread is alive")
sleep(api_timeout)
+4 -11
View File
@@ -1,28 +1,21 @@
prometheus_client>=0.19.0
annotated-types==0.6.0
async-timeout==4.0.3
backoff==2.2.1
boto3==1.28.63
botocore==1.31.63
cachetools==5.3.1
certifi==2023.7.22
charset-normalizer==3.3.0
docopt==0.6.2
idna==3.4
jmespath==1.0.1
monotonic==1.6
posthog==3.0.2
prompt-toolkit==3.0.39
pydantic
pyflakes==3.1.0
pydantic>=2.9.0
Pygments==2.16.1
python-dateutil==2.8.2
pyvim==3.0.3
pyvmomi==8.0.2.0
redis==5.0.1
pyvmomi==9.0.0.0
requests==2.32.0
s3transfer==0.7.0
six==1.16.0
tinydb==4.8.0
typing_extensions==4.8.0
typing_extensions>=4.12.2
urllib3==2.0.6
wcwidth==0.2.8
+3 -31
View File
@@ -17,7 +17,6 @@ from dateutil import parser
from typing import List, Dict, Tuple, Union, Any, Optional
from requests.structures import CaseInsensitiveDict
from logging.handlers import RotatingFileHandler
#from posthog import Posthog
import uuid
from requests import Request, Session
from .version import VERSION
@@ -74,12 +73,6 @@ class zvmsite:
# Get UUID
self.uuid = self.load_or_generate_uuid()
# Posthog stats setup
#if self.stats:
# self.setup_posthog()
# self.posthog.capture(self.uuid, 'ZVMA10 Python Module Loaded')
# self.log.debug("Sent PostHog Hook")
def __authhandler__(self) -> None:
self.log.info(f"Log Level set to {self.LOGLEVEL}")
if not self.__connected__:
@@ -102,9 +95,11 @@ class zvmsite:
}
if self.grant_type == "client_credentials":
data["client_secret"] = self.client_secret
data["scope"] = "openid"
else:
data["username"] = self.username
data["password"] = self.password
data["scope"] = "openid"
uri = self.construct_url(path="auth/realms/zerto/protocol/openid-connect/token")
@@ -129,7 +124,6 @@ class zvmsite:
self.expiresIn -= 10
else:
self.log.info("Authentication thread is already running")
print(f"Auth thread already running")
def is_authenticated(self) -> bool:
# Assuming self.token is the authentication token and it's set upon successful authentication
@@ -172,11 +166,6 @@ class zvmsite:
file.write(new_uuid)
return new_uuid
#def setup_posthog(self) -> None:
# self.posthog = Posthog(project_api_key='phc_HflqUkx9majhzm8DZva8pTwXFRnOn99onA9xPpK5HaQ', host='https://posthog.jpaul.io')
# self.posthog.debug = True
# self.posthog.identify(distinct_id=self.uuid)
def construct_url(self, path="", params=None) -> str:
full_url = f"{self.base_url}/{path}"
if params:
@@ -235,23 +224,6 @@ class zvmsite:
response.raise_for_status()
self.log.debug(f'API Request: {method} - {url}')
# Posthog stats setup
#if self.stats:
# temp_base, temp_path = self.deconstruct_url(url)
# self.posthog.capture( self.uuid, 'API REQUEST',
# {
# "url": temp_base,
# "port": self.port,
# "endpoint": temp_path,
# "method": method,
# "response_time_ms": int(elapsed_time_ms),
# "verify_ssl": self.verify_ssl,
# "grant_type": self.grant_type,
# "status_code": str(response.status_code),
# "sdk_version": self.__version__
# })
# self.log.debug("Sent PostHog Hook")
return response.json()
except requests.exceptions.RequestException as e:
self.log.error(f"Error while sending API request: {e}")
@@ -794,7 +766,7 @@ class zvmsite:
return self.make_api_request("GET", uri, headers=self.apiheader)
def service_profile(self, serviceProfileIdentifier=None) -> Dict[str, Any]:
if siteidentifier is None:
if serviceProfileIdentifier is None:
self.log.error("Service Profile identifier is required for get site function.")
raise ValueError("Service Profile identifier is required.")
-126
View File
@@ -1,126 +0,0 @@
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from requests.structures import CaseInsensitiveDict
from tinydb import TinyDB, Query
from tinydbstorage.storage import MemoryStorage
from logging.handlers import RotatingFileHandler
# Function to get VM Encryption Data from ZVMa version 9.7
def GetStatsFunc():
tempdb = TinyDB(storage=MemoryStorage) # ('./db.json') used for storing db on disk for debugging
dbvm = Query()
dbvpg = Query()
while (True) :
global token
global siteId
global siteName
if (token != ""):
log.info("Got Auth Token!")
log.debug("token: " + str(token))
log.debug("Stats Collector Loop Running")
metricsDictionary = {}
h2 = CaseInsensitiveDict()
h2["Accept"] = "application/json"
h2["Authorization"] = "Bearer " + token
## Statistics API
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/statistics/vms/"
statsapi = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL)
statsapi_json = statsapi.json()
#log.debug(statsapi_json)
for vm in statsapi_json:
oldvmdata = dict()
CurrentIops = 0
CurrentWriteCounterInMBs = 0
CurrentSyncCounterInMBs = 0
CurrentNetworkTrafficCounterInMBs = 0
CurrentEncryptedLBs = 0
CurrentUnencryptedLBs = 0
CurrentTotalLBs = 0
CurrentPercentEncrypted = 0
VMName = "NA"
oldvmdata = tempdb.search(dbvm.VmIdentifier == vm['VmIdentifier'] and dbvpg.VpgIdentifier == vm['VpgIdentifier'])
log.info("Checking TempDB for VM " + vm['VmIdentifier'] + " in VPG " + vm['VpgIdentifier'])
if (oldvmdata):
log.info(vm['VmIdentifier'] + " Record Found, Updating DB")
log.debug(oldvmdata[0])
log.debug(tempdb.update(vm, dbvm.VmIdentifier == vm['VmIdentifier'] and dbvpg.VpgIdentifier == vm['VpgIdentifier']))
log.debug("!@!@!@!@!@ Stats !@!@!@!@!@")
VMName = oldvmdata[0]['VmName']
log.debug("Current VM " + str(VMName))
CurrentIops = abs(vm['IoOperationsCounter'] - oldvmdata[0]['IoOperationsCounter'])
log.debug("CurrentIops " + str(CurrentIops))
CurrentSyncCounterInMBs = abs(vm['SyncCounterInMBs'] - oldvmdata[0]['SyncCounterInMBs'])
log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs))
CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - oldvmdata[0]['NetworkTrafficCounterInMBs'])
log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs))
CurrentEncryptedLBs = abs(vm['EncryptionStatistics']['EncryptedDataInLBs'] - oldvmdata[0]['EncryptionStatistics']['EncryptedDataInLBs'])
log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs))
CurrentUnencryptedLBs = abs(vm['EncryptionStatistics']['UnencryptedDataInLBs'] - oldvmdata[0]['EncryptionStatistics']['UnencryptedDataInLBs'])
log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs))
CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs)
log.debug("CurrentTotalLBs " + str(CurrentTotalLBs))
if CurrentTotalLBs != 0:
CurrentPercentEncrypted = ((CurrentEncryptedLBs / CurrentTotalLBs) * 100)
else:
CurrentPercentEncrypted = 0
log.debug("CurrentPercentEncrypted " + str(CurrentPercentEncrypted))
else:
log.info(vm['VmIdentifier'] + " No Record Found, Inserting into DB")
#insert original VM record to tempdb
log.debug(tempdb.insert(vm))
# Look up the VM name via the ZVM API and store it on the tempdb record, for easier display in Grafana legends
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/vms/" + vm['VmIdentifier'] +"?vpgIdentifier=" + vm['VpgIdentifier']
try:
vapi = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL)
vapi_json = vapi.json()
except Exception as e:
log.error("Error while sending api request: " + str(e))
VMName = "Unknown"
else:
log.debug("vapi_json: " + str(vapi_json))
tempdb.update({'VmName': vapi_json['VmName']}, dbvm.VmIdentifier == vm['VmIdentifier'])
log.info("Added vm to tempdb " + vm['VmIdentifier'] + " - " + vapi_json['VmName'])
VMName = vapi_json['VmName']
# Store Calculated Metrics
metricsDictionary["vm_IoOperationsCounter{VpgIdentifier=\"" + str(vm['VpgIdentifier']) + "\",VmIdentifier=\"" + str(vm['VmIdentifier']) + "\",VmName=\"" + str(VMName) + "\",SiteIdentifier=\"" + str(siteId) + "\",SiteName=\"" + str(siteName) + "\"}"] = CurrentIops
metricsDictionary["vm_WriteCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentWriteCounterInMBs
metricsDictionary["vm_SyncCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentSyncCounterInMBs
metricsDictionary["vm_NetworkTrafficCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentNetworkTrafficCounterInMBs
metricsDictionary["vm_EncryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentEncryptedLBs
metricsDictionary["vm_UnencryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentUnencryptedLBs
metricsDictionary["vm_TotalDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTotalLBs
metricsDictionary["vm_PercentEncrypted{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentPercentEncrypted
## Write the same metrics to two files: a human-readable statsmetrics.txt and an extensionless statsmetrics file that is easy to get in Prometheus
file_object = open('statsmetrics', 'w')
txt_object = open('statsmetrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
sleep(scrape_speed)
else:
log.debug("Waiting 1 second for Auth Token")
sleep(1)
-3
View File
@@ -1,3 +0,0 @@
# Trace message written to stdout when this module is loaded (runs once per import).
print("Initializing zvma9_7 package...")
from .GetStatsFunc import GetStatsFunc
+6 -8
View File
@@ -1,5 +1,3 @@
version: "3.3"
services:
zerto-exporter:
build: .
@@ -7,13 +5,13 @@ services:
- "9999:9999"
environment:
- VERIFY_SSL=False
- ZVM_HOST=192.168.50.60
- ZVM_HOST=192.168.50.30
- ZVM_PORT=443
- CLIENT_ID=api-script
- CLIENT_SECRET=fcYMFuA5TkIUwp6b3hDUxim0f32z8erk
#- CLIENT_ID=api-script
#- CLIENT_SECRET=fcYMFuA5TkIUwp6b3hDUxim0f32z8erk
- ZVM_USERNAME=admin
- ZVM_PASSWORD=Zertodata987!
- LOGLEVEL=INFO #Valid settings are CRITICAL, ERROR, WARNING, INFO, DEBUG
- VCENTER_HOST=192.168.50.50
- VCENTER_HOST=192.168.50.20
- VCENTER_USER=administrator@vsphere.local
- VCENTER_PASSWORD=Zertodata987!
volumes:
- "./logs:/usr/src/app/logs/"