From a4ff29e1b58aa6dbd216f0cbc5d4160cb5eae962 Mon Sep 17 00:00:00 2001 From: Justin Paul Date: Fri, 20 Feb 2026 19:47:35 -0500 Subject: [PATCH] Fix counter spike/negative on ZVM reboot (closes #12) When a ZVM reboots its cumulative counters (IOPs, write/sync/network MBs, encrypted/unencrypted LBs) reset to zero. The previous abs() approach converted the large negative delta into an equally wrong positive spike. Introduces _counter_delta(new, old, vm_id, metric) which: - Returns new - old when new >= old (normal incrementing counter) - Returns new as-is when new < old (counter reset / reboot detected) and logs a WARNING identifying the VM and metric that reset Publishing the raw new value on reset correctly reflects the restarted counter state, suppresses the false spike, and produces a visible dip in Grafana that naturally indicates the ZVM reboot event. Co-Authored-By: Claude Sonnet 4.6 --- app/python-node-exporter.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/app/python-node-exporter.py b/app/python-node-exporter.py index e05d6f2..bed89f7 100644 --- a/app/python-node-exporter.py +++ b/app/python-node-exporter.py @@ -136,6 +136,16 @@ g_thread_status = Gauge('exporter_thread_status', 'Exporter Thread Status', # --------------------------------------------------------------------------- # Thread which gets VM level encryption statistics from ZVM API # --------------------------------------------------------------------------- +def _counter_delta(new, old, vm_id, metric): + """Return new-old normally; if new < old, the ZVM counter reset (reboot). + In that case return new as-is and log a warning so the spike is suppressed.""" + if new >= old: + return new - old + log.warning(f"Counter reset detected for VM {vm_id} metric '{metric}' " + f"(old={old}, new={new}) - ZVM may have rebooted. Publishing raw value.") + return new + + def GetStatsFunc(zvm_instance): tempdb = TinyDB(storage=MemoryStorage) dbvm = Query() @@ -194,22 +204,23 @@ def GetStatsFunc(zvm_instance): log.debug(vm) log.debug("!@!@!@!@!@ Stats !@!@!@!@!@") VMName = oldvmdata[0]['VmName'] + vid = vm['VmIdentifier'] log.debug("Current VM " + str(VMName)) - CurrentIops = abs(vm['IoOperationsCounter'] - oldvmdata[0]['IoOperationsCounter']) + CurrentIops = _counter_delta(vm['IoOperationsCounter'], oldvmdata[0]['IoOperationsCounter'], vid, 'IoOperationsCounter') log.debug("CurrentIops " + str(CurrentIops)) - CurrentSyncCounterInMBs = abs(vm['SyncCounterInMBs'] - oldvmdata[0]['SyncCounterInMBs']) + CurrentSyncCounterInMBs = _counter_delta(vm['SyncCounterInMBs'], oldvmdata[0]['SyncCounterInMBs'], vid, 'SyncCounterInMBs') log.debug("CurrentSyncCounterInMBs " + str(CurrentSyncCounterInMBs)) - CurrentNetworkTrafficCounterInMBs = abs(vm['NetworkTrafficCounterInMBs'] - oldvmdata[0]['NetworkTrafficCounterInMBs']) + CurrentNetworkTrafficCounterInMBs = _counter_delta(vm['NetworkTrafficCounterInMBs'], oldvmdata[0]['NetworkTrafficCounterInMBs'], vid, 'NetworkTrafficCounterInMBs') log.debug("CurrentNetworkTrafficCounterInMBs " + str(CurrentNetworkTrafficCounterInMBs)) - CurrentWriteCounterInMBs = abs(vm['WriteCounterInMBs'] - oldvmdata[0]['WriteCounterInMBs']) + CurrentWriteCounterInMBs = _counter_delta(vm['WriteCounterInMBs'], oldvmdata[0]['WriteCounterInMBs'], vid, 'WriteCounterInMBs') log.debug("CurrentWriteCounterInMBs " + str(CurrentWriteCounterInMBs)) - CurrentEncryptedLBs = abs(vm['EncryptionMetrics']['EncryptedData'] - oldvmdata[0]['EncryptionMetrics']['EncryptedData']) + CurrentEncryptedLBs = _counter_delta(vm['EncryptionMetrics']['EncryptedData'], oldvmdata[0]['EncryptionMetrics']['EncryptedData'], vid, 'EncryptedData') log.debug("CurrentEncryptedLBs " + str(CurrentEncryptedLBs)) - CurrentUnencryptedLBs = abs(vm['EncryptionMetrics']['NonEncryptedData'] - oldvmdata[0]['EncryptionMetrics']['NonEncryptedData']) + CurrentUnencryptedLBs = _counter_delta(vm['EncryptionMetrics']['NonEncryptedData'], oldvmdata[0]['EncryptionMetrics']['NonEncryptedData'], vid, 'NonEncryptedData') log.debug("CurrentUnencryptedLBs " + str(CurrentUnencryptedLBs)) - CurrentTrendChangeLevel = abs(vm['EncryptionMetrics']['TrendChangeLevel'] - oldvmdata[0]['EncryptionMetrics']['TrendChangeLevel']) + CurrentTrendChangeLevel = _counter_delta(vm['EncryptionMetrics']['TrendChangeLevel'], oldvmdata[0]['EncryptionMetrics']['TrendChangeLevel'], vid, 'TrendChangeLevel') log.debug("CurrentTrendChangeLevel " + str(CurrentTrendChangeLevel)) - CurrentTotalLBs = abs(CurrentEncryptedLBs + CurrentUnencryptedLBs) + CurrentTotalLBs = CurrentEncryptedLBs + CurrentUnencryptedLBs log.debug("CurrentTotalLBs " + str(CurrentTotalLBs)) if CurrentTotalLBs != 0: CurrentPercentEncrypted = (CurrentEncryptedLBs / CurrentTotalLBs) * 100