Files
Zerto_Exporter/app/python-node-exporter.py
T
2023-03-23 12:17:21 -04:00

659 lines
45 KiB
Python

import requests
import http.server
import socketserver
import os
import ssl
import logging
import threading
import socket
from pyVim.connect import SmartConnect, Disconnect
from pyVmomi import vim
from time import sleep
from logging.handlers import RotatingFileHandler
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from requests.structures import CaseInsensitiveDict
from tinydb import TinyDB, Query
from tinydbstorage.storage import MemoryStorage
from version import VERSION
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
verifySSL = os.getenv("VERIFY_SSL", 'False').lower() in ('true', '1', 't')
zvm_url = os.environ.get('ZVM_HOST', '192.168.50.60')
zvm_port = os.environ.get('ZVM_PORT', '443')
client_id = os.environ.get('CLIENT_ID', 'api-script')
client_secret = os.environ.get('CLIENT_SECRET', 'js51tDM8oappYUGRJBhF7bcsedNoHA5j')
scrape_speed = int(os.environ.get('SCRAPE_SPEED', 30))
api_timeout = int(os.environ.get('API_TIMEOUT', 5))
LOGLEVEL = os.environ.get('LOGLEVEL', 'INFO').upper()
version = str(VERSION)
vcenter_host = os.environ.get('VCENTER_HOST', 'vcenter.local')
vcenter_user = os.environ.get('VCENTER_USER', 'administrator@vsphere.local')
vcenter_pwd = os.environ.get('VCENTER_PASSWORD', 'supersecret')
# Get the hostname of the machine
container_id = str(socket.gethostname())
#set log line format including container_id
log_formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(threadName)s;%(message)s", "%Y-%m-%d %H:%M:%S")
log_handler = RotatingFileHandler(filename=f"../logs/Log-Main-{container_id}.log", maxBytes=1024*1024*100, backupCount=5)
log_handler.setFormatter(log_formatter)
log = logging.getLogger("Node-Exporter")
log.setLevel(LOGLEVEL)
log.addHandler(log_handler)
log.info(f"Zerto-Node-Exporter - Version {version}")
log.info(f"Log Level: {LOGLEVEL}")
log.debug("Running with Variables:\nVerify SSL: " + str(verifySSL) + "\nZVM Host: " + zvm_url + "\nZVM Port: " + zvm_port + "\nClient-Id: " + client_id + "\nClient Secret: " + client_secret)
# Global Variables
token = ""
siteId = "NotSet"
siteName = "NotSet"
lastStats = CaseInsensitiveDict()
# Authentication Thread which handles authentication and token refresh for ZVM API
def ZvmAuthHandler():
log.debug("ZVMAuthHandler Thread Started")
expiresIn = 0
global token
global siteId
global siteName
retries = 0
while True:
if expiresIn < 30:
h = CaseInsensitiveDict()
h["Content-Type"] = "application/x-www-form-urlencoded"
d = CaseInsensitiveDict()
d["client_id"] = client_id
d["client_secret"] = client_secret
d["grant_type"] = "client_credentials"
uri = "https://" + zvm_url + ":" + zvm_port + "/auth/realms/zerto/protocol/openid-connect/token"
delay = 0
try:
response = requests.post(url=uri, data=d, headers=h, verify=verifySSL)
response.raise_for_status()
except requests.exceptions.RequestException as e:
retries += 1
delay = 2 ** retries
log.error("Error while sending authentication request: " + str(e) + ". Retrying in " + str(delay) + " seconds")
sleep(delay)
continue
else:
retries = 0
responseJSON = response.json()
if 'access_token' not in responseJSON or 'expires_in' not in responseJSON:
log.error("Authentication response does not contain expected keys")
delay = 2 ** retries
sleep(delay)
retries += 1
continue
token = str(responseJSON.get('access_token'))
expiresIn = int(responseJSON.get('expires_in'))
log.info("Authentication successful. Token expires in " + str(expiresIn) + " seconds")
if response.status_code != 200:
log.error("Authentication request failed with status code " + str(response.status_code))
delay = 2 ** retries
sleep(delay)
retries += 1
continue
# Get Site ID and Name
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/localsite"
delay = 0
try:
log.debug("Getting Site ID and Name")
h2 = CaseInsensitiveDict()
h2["Accept"] = "application/json"
h2["Authorization"] = "Bearer " + token
response = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL)
response.raise_for_status()
except requests.exceptions.RequestException as e:
retries += 1
delay = 2 ** retries
log.error("Error while sending authentication request: " + str(e) + ". Retrying in " + str(delay) + " seconds")
sleep(delay)
continue
else:
retries = 0
responseJSON = response.json()
log.debug(responseJSON)
if 'SiteIdentifier' not in responseJSON or 'SiteName' not in responseJSON:
log.error("LocalSite API response does not contain expected keys")
delay = 2 ** retries
sleep(delay)
retries += 1
continue
else:
siteId = str(responseJSON.get('SiteIdentifier'))
siteName = str(responseJSON.get('SiteName'))
log.info("Site ID: " + siteId + " Site Name: " + siteName)
expiresIn -= 10 + delay
log.debug("Token Expires in " + str(expiresIn) + " seconds")
sleep(10)
# Thread which gets VM level encryption statistics from ZVM API
def GetStatsFunc():
tempdb = TinyDB(storage=MemoryStorage)
dbvm = Query()
while (True) :
global token
global siteId
global siteName
if (token != ""):
log.info("Got Auth Token!")
log.debug("token: " + str(token))
log.debug("Stats Collector Loop Running")
metricsDictionary = {}
h2 = CaseInsensitiveDict()
h2["Accept"] = "application/json"
h2["Authorization"] = "Bearer " + token
## Statistics API
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/statistics/vms/"
statsapi = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL)
statsapi_json = statsapi.json()
#log.debug(statsapi_json)
for vm in statsapi_json:
#log.debug(vm)
oldvmdata = dict()
CurrentIops = 0
CurrentWriteCounterInMBs = 0
CurrentSyncCounterInMBs = 0
CurrentNetworkTrafficCounterInMBs = 0
CurrentEncryptedLBs = 0
CurrentUnencryptedLBs = 0
CurrentTotalLBs = 0
CurrentPercentEncrypted = 0
VMName = "NA"
oldvmdata = tempdb.search(dbvm.VmIdentifier == vm['VmIdentifier'])
log.info("Checking TempDB for VM " + vm['VmIdentifier'])
if (oldvmdata):
log.info(vm['VmIdentifier'] + " Record Found")
log.debug(oldvmdata[0])
log.debug(tempdb.update(vm, dbvm.VmIdentifier == vm['VmIdentifier']))
log.debug("!@!@!@!@!@ Stats !@!@!@!@!@")
VMName = oldvmdata[0]['VmName']
log.debug("Current VM " + str(VMName))
CurrentIops = abs(vm['IoOperationsCounter'] - oldvmdata[0]['IoOperationsCounter'])
log.debug("CurrentIops " + str(CurrentIops))
CurrentSyncCounterInMBs = abs(vm['SyncCounterInMBs'] - oldvmdata[0]['SyncCounterInMBs'])
log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs))
CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - oldvmdata[0]['NetworkTrafficCounterInMBs'])
log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs))
CurrentEncryptedLBs = abs(vm['EncryptionStatistics']['EncryptedDataInLBs'] - oldvmdata[0]['EncryptionStatistics']['EncryptedDataInLBs'])
log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs))
CurrentUnencryptedLBs = abs(vm['EncryptionStatistics']['UnencryptedDataInLBs'] - oldvmdata[0]['EncryptionStatistics']['UnencryptedDataInLBs'])
log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs))
CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs)
log.debug("CurrentTotalLBs " + str(CurrentTotalLBs))
if CurrentTotalLBs != 0:
CurrentPercentEncrypted = ((CurrentEncryptedLBs / CurrentTotalLBs) * 100)
else:
CurrentPercentEncrypted = 0
log.debug("CurrentPercentEncrypted " + str(CurrentPercentEncrypted))
else:
log.info(vm['VmIdentifier'] + " No Record Found, Inserting into DB")
#insert original VM record to tempdb
log.debug(tempdb.insert(vm))
# update database with VM name, for easier display in Grafana Legends
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/vms/" + vm['VmIdentifier']
try:
vapi = requests.get(url=uri, timeout=3, headers=h2, verify=verifySSL)
vapi_json = vapi.json()
except Exception as e:
log.error("Error while sending api request: " + str(e))
VMName = "Unknown"
else:
log.debug("vapi_json: " + str(vapi_json))
tempdb.update({'VmName': vapi_json['VmName']}, dbvm.VmIdentifier == vm['VmIdentifier'])
log.info("Added to temp vm db" + vm['VmIdentifier'] + " - " + vapi_json['VmName'])
VMName = vapi_json['VmName']
# Store Calculated Metrics
metricsDictionary["vm_IoOperationsCounter{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentIops
metricsDictionary["vm_WriteCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentWriteCounterInMBs
metricsDictionary["vm_SyncCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentSyncCounterInMBs
metricsDictionary["vm_NetworkTrafficCounterInMBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentNetworkTrafficCounterInMBs
metricsDictionary["vm_EncryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentEncryptedLBs
metricsDictionary["vm_UnencryptedDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentUnencryptedLBs
metricsDictionary["vm_TotalDataInLBs{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentTotalLBs
metricsDictionary["vm_PercentEncrypted{VpgIdentifier=\"" + vm['VpgIdentifier'] + "\",VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + VMName + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = CurrentPercentEncrypted
## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus
file_object = open('statsmetrics', 'w')
txt_object = open('statsmetrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
sleep(scrape_speed)
else:
log.debug("Waiting 1 second for Auth Token")
sleep(1)
# Function which retrieves stats from various ZVM APIs and stores them in a metrics file
def GetDataFunc():
tempdb = TinyDB(storage=MemoryStorage)
dbvm = Query()
while (True) :
global token
global siteId
global siteName
if (token != ""):
log.debug("Got Auth Token!")
log.debug("token: " + str(token))
log.info("Data Collector Loop Running")
metricsDictionary = {}
h2 = CaseInsensitiveDict()
h2["Accept"] = "application/json"
h2["Authorization"] = "Bearer " + token
### VPGs API
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/vpgs/"
service = requests.get(url=uri, timeout=api_timeout, headers=h2, verify=verifySSL)
vpg_json = service.json()
#log.debug(vpg_json)
for vpg in vpg_json :
metricsDictionary["vpg_storage_used_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["UsedStorageInMB"]
metricsDictionary["vpg_actual_rpo{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ActualRPO"]
metricsDictionary["vpg_throughput_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ThroughputInMB"]
metricsDictionary["vpg_iops{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["IOPs"]
metricsDictionary["vpg_provisioned_storage_in_mb{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ProvisionedStorageInMB"]
metricsDictionary["vpg_vms_count{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["VmsCount"]
metricsDictionary["vpg_configured_rpo_seconds{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["ConfiguredRpoSeconds"]
metricsDictionary["vpg_actual_history_in_minutes{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["HistoryStatusApi"]["ActualHistoryInMinutes"]
metricsDictionary["vpg_configured_history_in_minutes{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["HistoryStatusApi"]["ConfiguredHistoryInMinutes"]
metricsDictionary["vpg_failsafe_history_in_minutes_actual{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["FailSafeHistory"]["ActualFailSafeHistory"]
metricsDictionary["vpg_failsafe_history_in_minutes_configured{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["FailSafeHistory"]["ConfiguredFailSafeHistory"]
metricsDictionary["vpg_status{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["Status"]
metricsDictionary["vpg_substatus{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["SubStatus"]
metricsDictionary["vpg_alert_status{VpgIdentifier=\"" + vpg['VpgIdentifier'] + "\",VpgName=\"" + vpg['VpgName'] + "\",VpgPriority=\"" + str(vpg['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vpg["AlertStatus"]
### Datastores APIs
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/datastores/"
service = requests.get(url=uri, timeout=api_timeout, headers=h2, verify=verifySSL)
ds_json = service.json()
#log.debug(ds_json)
for ds in ds_json :
log.debug(f"Processing {ds['DatastoreName']}")
metricsDictionary["datastore_vras{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumVRAs"]
metricsDictionary["datastore_incoming_vms{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumIncomingVMs"]
metricsDictionary["datastore_outgoing_vms{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["NumOutgoingVMs"]
metricsDictionary["datastore_usage_capacityinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["CapacityInBytes"]
metricsDictionary["datastore_usage_freeinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["FreeInBytes"]
metricsDictionary["datastore_usage_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["UsedInBytes"]
metricsDictionary["datastore_usage_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Datastore"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_protected_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Protected"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_protected_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Protected"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_recovery_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Recovery"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_recovery_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Recovery"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_journal_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Journal"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_journal_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Journal"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_scratch_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Scratch"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_scratch_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Scratch"]["ProvisionedInBytes"]
metricsDictionary["datastore_usage_zerto_appliances_usedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Appliances"]["UsedInBytes"]
metricsDictionary["datastore_usage_zerto_appliances_provisionedinbytes{datastoreIdentifier=\"" + ds['DatastoreIdentifier'] + "\",DatastoreName=\"" + ds['DatastoreName'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = ds["Stats"]["Usage"]["Zerto"]["Appliances"]["ProvisionedInBytes"]
## VMs API
log.debug("Getting VMs API")
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/vms/"
vmapi = requests.get(url=uri, timeout=api_timeout, headers=h2, verify=verifySSL)
vmapi_json = vmapi.json()
#log.debug(vmapi_json)
for vm in vmapi_json :
metricsDictionary["vm_actualrpo{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["ActualRPO"]
metricsDictionary["vm_throughput_in_mb{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["ThroughputInMB"]
metricsDictionary["vm_iops{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["IOPs"]
metricsDictionary["vm_journal_hard_limit{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalHardLimit"]["LimitValue"]
metricsDictionary["vm_journal_warning_limit{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalWarningThreshold"]["LimitValue"]
metricsDictionary["vm_journal_used_storage_mb{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["JournalUsedStorageMb"]
metricsDictionary["vm_outgoing_bandwidth_in_mbps{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["OutgoingBandWidthInMbps"]
metricsDictionary["vm_used_storage_in_MB{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["UsedStorageInMB"]
metricsDictionary["vm_provisioned_storage_in_MB{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["ProvisionedStorageInMB"]
metricsDictionary["vm_status{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["Status"]
metricsDictionary["vm_substatus{VmIdentifier=\"" + vm['VmIdentifier'] + "\",VmName=\"" + vm['VmName'] + "\",VmRecoveryVRA=\"" + vm["RecoveryHostName"] + "\",VmPriority=\"" + str(vm['Priority']) + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vm["SubStatus"]
## Volumes API for Scratch Volumes
log.debug("Getting Scratch Volumes")
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/volumes?volumeType=scratch"
volapi = requests.get(url=uri, timeout=api_timeout, headers=h2, verify=verifySSL)
volapi_json = volapi.json()
if(bool(volapi_json)):
for volume in volapi_json :
#metricsDictionary["scratch_volume_provisioned_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = volume["Size"]["ProvisionedInBytes"]
# Determine the key for a given VM, then see if the key is already in the dictionary, if it is add the next disk to the total. If not, create a new key.
metrickey = "scratch_volume_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["UsedInBytes"]
else:
metricsDictionary[metrickey] = volume["Size"]["UsedInBytes"]
percentage_used = (volume["Size"]["UsedInBytes"] / volume["Size"]["ProvisionedInBytes"] * 100)
percentage_used = round(percentage_used, 1)
#metricsDictionary["scratch_volume_percentage_used{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = percentage_used
## Volumes API for Journal Volumes
log.debug("Getting Journal Volumes")
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/volumes?volumeType=journal"
volapi = requests.get(url=uri, timeout=api_timeout, headers=h2, verify=verifySSL)
volapi_json = volapi.json()
if(bool(volapi_json)):
log.debug("Journal Volumes Exist")
for volume in volapi_json :
log.debug("Journal Volume: " + volume['ProtectedVm']['Name'] + "Calculating total size")
#metricsDictionary["scratch_volume_provisioned_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\"}"] = volume["Size"]["ProvisionedInBytes"]
# Determine the key for a given VM, then see if the key is already in the dictionary, if it is add the next disk to the total. If not, create a new key.
metrickey = "vm_journal_volume_size_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["UsedInBytes"]
else:
metricsDictionary[metrickey] = volume["Size"]["UsedInBytes"]
metrickey = "vm_journal_volume_provisioned_in_bytes{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + volume["Size"]["ProvisionedInBytes"]
else:
metricsDictionary[metrickey] = volume["Size"]["ProvisionedInBytes"]
metrickey = "vm_journal_volume_count{ProtectedVm=\"" + volume['ProtectedVm']['Name'] + "\", ProtectedVmIdentifier=\"" + volume['ProtectedVm']['Identifier'] + "\", OwningVRA=\"" + volume['OwningVm']['Name'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"
if (metrickey in metricsDictionary):
metricsDictionary[metrickey] = metricsDictionary[metrickey] + 1
else:
metricsDictionary[metrickey] = 1
## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus
log.debug("Writing metrics to file")
file_object = open('metrics', 'w')
txt_object = open('metrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
log.debug("Metrics written to file")
# This function will get data every 10 seconds
log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
sleep(scrape_speed)
else:
log.debug("Waiting 1 second for Auth Token")
sleep(1)
# get VRA CPU and memory usage from vCenter Server
def GetVraMetrics():
# set up API endpoint and headers
log.debug("GetVraCpuMemory() called")
metricsDictionary = {}
while True:
vra_names = []
vras = []
global token
global siteId
global siteName
log.debug("Checking Token in VRA CPU MEM Collector")
if (token != ""):
log.debug("Auth Token Valid!")
log.debug("token: " + str(token))
log.info("VRA CPU MEM Collector Running")
h2 = CaseInsensitiveDict()
h2["Accept"] = "application/json"
h2["Authorization"] = "Bearer " + token
### VRA API
uri = "https://" + zvm_url + ":" + zvm_port + "/v1/vras"
# make API call to get list of VRAs
try:
response = requests.get(url=uri, timeout=api_timeout, headers=h2, verify=verifySSL)
except Exception as e:
log.error(f"Error connecting to {endpoint}: {e}")
return
else:
log.debug("Response from GET /v1/vras: %s", response.text)
# parse JSON response and get the name of each VRA
# Disable SSL certificate verification
context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
context.verify_mode = ssl.CERT_NONE
# connect to vCenter Server
si = None
try:
si = SmartConnect(host=vcenter_host, user=vcenter_user, pwd=vcenter_pwd, sslContext=context)
log.debug("Connected to vCenter Server %s", vcenter_host)
except Exception as e:
log.error(f"Error connecting to vCenter Server: {e}")
return
# get the root folder of the vCenter Server
content = si.RetrieveContent()
root_folder = content.rootFolder
# create a view for all VMs on the vCenter Server
view_manager = content.viewManager
vm_view = view_manager.CreateContainerView(root_folder, [vim.VirtualMachine], True)
vras = response.json()
log.debug("VRA names: %s", vras)
log.debug(type(vras))
for vra in vras :
#vra_names.append(vra['VraName'])
# Gather other VRA Metrics from Zerto API into Metrics Diectionary
metricsDictionary["vra_memory_in_GB{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["MemoryInGB"]
metricsDictionary["vra_vcpu_count{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["NumOfCpus"]
metricsDictionary["vra_protected_vms{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Vms"]
metricsDictionary["vra_protected_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Vpgs"]
metricsDictionary["vra_protected_volumes{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["ProtectedCounters"]["Volumes"]
metricsDictionary["vra_recovery_vms{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Vms"]
metricsDictionary["vra_recovery_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Vpgs"]
metricsDictionary["vra_recovery_volumes{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["RecoveryCounters"]["Volumes"]
metricsDictionary["vra_self_protected_vpgs{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = vra["SelfProtectedVpgs"]
log.debug("VRA Name: %s", vra['VraName'])
# get the CPU and memory usage for each VRA
vm = None
for vm_obj in vm_view.view:
if vm_obj.name == vra['VraName']:
vm = vm_obj
break
if vm is not None:
log.debug("Found VRA VM in vCenter with name %s", vra['VraName'])
# get the CPU usage and memory usage for the VM
cpu_usage_mhz = vm.summary.quickStats.overallCpuUsage
memory_usage_mb = vm.summary.quickStats.guestMemoryUsage
# print the CPU and memory usage for the VM
log.info(f"VM {vm.name} (name: {vra['VraName']}) has CPU usage of {cpu_usage_mhz} MHz and memory usage of {memory_usage_mb} MB")
metricsDictionary["vra_cpu_usage_mhz{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = cpu_usage_mhz
metricsDictionary["vra_memory_usage_mb{VraIdentifierStr=\"" + vra['VraIdentifierStr'] + "\",VraName=\"" + vra['VraName'] + "\",VraVersion=\"" + vra['VraVersion'] + "\",HostVersion=\"" + vra['HostVersion'] + "\",SiteIdentifier=\"" + siteId + "\",SiteName=\"" + siteName + "\"}"] = memory_usage_mb
else:
log.info(f"No VM found with name {vra_name}")
# Disconnect from vCenter
Disconnect(si)
## Write metrics to a human readable metrics.txt file as well as a metrics file that is easy to get in prometheus
file_object = open('vrametrics', 'w')
txt_object = open('vrametrics.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
file_object.close()
txt_object.close()
# This function will get data every 10 seconds
log.debug("Starting Sleep for " + str(scrape_speed) + " seconds")
sleep(scrape_speed * 2)
else:
log.debug("Waiting 1 second for Auth Token")
sleep(1)
# function which monitors the threads and restarts them if they die
def ThreadProbe():
while True:
log.debug("Thread Probe Started")
metricsDictionary = {}
log.debug("Is Auth Thread Alive")
if auth_thread.is_alive():
metricsDictionary["exporter_thread_status{thread=\"" + "AuthHandler" + "\"}"] = 1
else:
metricsDictionary["exporter_thread_status{thread=\"" + "AuthHandler" + "\"}"] = 0
log.debug("Is Data Thread Alive")
if data_thread.is_alive():
metricsDictionary["exporter_thread_status{thread=\"" + "DataStats" + "\"}"] = 1
else:
metricsDictionary["exporter_thread_status{thread=\"" + "DataStats" + "\"}"] = 0
log.debug("Is Stats Thread Alive")
if stats_thread.is_alive():
metricsDictionary["exporter_thread_status{thread=\"" + "EncryptionStats" + "\"}"] = 1
else:
metricsDictionary["exporter_thread_status{thread=\"" + "EncryptionStats" + "\"}"] = 0
log.debug("Is VRA Metrics Thread Alive")
if vra_metrics_thread.is_alive():
metricsDictionary["exporter_thread_status{thread=\"" + "VraMetrics" + "\"}"] = 1
else:
metricsDictionary["exporter_thread_status{thread=\"" + "VraMetrics" + "\"}"] = 0
log.debug("Writing Probe data to files")
file_object = open('threads', 'w')
txt_object = open('threads.txt', 'w')
for item in metricsDictionary :
file_object.write(item)
file_object.write(" ")
file_object.write(str(metricsDictionary[item]))
file_object.write("\n")
txt_object.write(item)
txt_object.write(" ")
txt_object.write(str(metricsDictionary[item]))
txt_object.write("\n")
log.debug("Trying to Close probe txt files")
file_object.close()
txt_object.close()
log.debug("Probe Thread Going to Sleep")
sleep(30)
#----------------run http server on port 9999-----------------
def WebServer():
log.info("Web Server Started")
PORT = 9999
Handler = http.server.SimpleHTTPRequestHandler
with socketserver.TCPServer(("", PORT), Handler) as httpd:
log.info(f"Webserver running on port {PORT}")
httpd.serve_forever()
def start_thread(target_func):
# start a new thread
thread = threading.Thread(target=target_func)
thread.start()
# return the thread object
return thread
# start the threads
auth_thread = start_thread(ZvmAuthHandler)
data_thread = start_thread(GetDataFunc)
stats_thread = start_thread(GetStatsFunc)
vra_metrics_thread = start_thread(GetVraMetrics)
webserver_thread = start_thread(WebServer)
# loop indefinitely
while True:
# check if any thread has crashed
sleep(2)
if not probe_thread.is_alive():
# restart the thread
log.error("Probe Thread Died - Restarting")
probe_thread = start_thread(ThreadProbe)
if not auth_thread.is_alive():
# restart the thread
log.error("Authentication Thread Died - Restarting")
auth_thread = start_thread(ZvmAuthHandler)
if not data_thread.is_alive():
# restart the thread
log.error("Data Thread Died - Restarting")
data_thread = start_thread(GetDataFunc)
if not stats_thread.is_alive():
# restart the thread
log.error("Stats Thread Died - Restarting")
stats_thread = start_thread(GetStatsFunc)
if not vra_metrics_thread.is_alive():
# restart the thread
log.error("VRA Metrics Thread Died - Restarting")
vra_metrics_thread = start_thread(GetVraMetrics)
if not webserver_thread.is_alive():
# restart the thread
log.error("Webserver Thread Died - Restarting")
webserver_thread = start_thread(WebServer)
sleep(api_timeout)