Files
wsfc_zvml_monitor/zerto_wsfc_monitor.ps1
Kosta Mushkin 8ec5c02ffe initial commit
2025-10-31 13:25:37 -04:00

457 lines
19 KiB
PowerShell

# Legal Disclaimer
# This script is an example script and is not supported under any Zerto support program or service.
# The author and Zerto further disclaim all implied warranties including, without limitation,
# any implied warranties of merchantability or of fitness for a particular purpose.
# In no event shall Zerto, its authors or anyone else involved in the creation,
# production or delivery of the scripts be liable for any damages whatsoever (including,
# without limitation, damages for loss of business profits, business interruption, loss of business
# information, or other pecuniary loss) arising out of the use of or the inability to use the sample
# scripts or documentation, even if the author or Zerto has been advised of the possibility of such damages.
# The entire risk arising out of the use or performance of the sample scripts and documentation remains with you.
# Purpose: pause/resume two Zerto VPGs so that only the VPG mapped to the node that
# currently owns the WSFC resources keeps replicating.
# Enhanced version with comprehensive logging and disk/group ownership detection.
# HARD-CODED VALUES (edit the block below to your env)
# --- HARD-CODED CONFIG ---
# NOTE(review): the passwords below are stored in plain text in this file and are
# converted to PSCredential objects in the credential setup section. Restrict access
# to this file accordingly.
$ZvmHost = 'XXXX' # ZVM hostname or IP address, i.e. 192.168.222.20
$ZvmUser = 'admin' # ZVM username
$ZvmPassPlain = 'XXXX' # ZVM password
$NodeUser = 'XXX' # WSFC admin username
$NodePassPlain = 'XXX' # WSFC admin password
# NOTE(review): $ClusterFqdn does not appear to be referenced elsewhere in the visible script.
$ClusterFqdn = 'TOR-HV01.lab.local' # WSFC cluster FQDN
$WSFCGroupName = 'Cluster Group' # WSFC cluster group name, i.e. Cluster Group
# The two node names are probed for ownership and compared against the detected owner.
$Node1 = 'TOR-HV01-N1' # WSFC node 1 FQDN, i.e. TOR-HV01-N1
$Node2 = 'TOR-HV01-N2' # WSFC node 2 FQDN, i.e. TOR-HV01-N2
# OWNERSHIP DETECTION MODE
# 'Disk' => determine active owner by shared disk ownership (preferred)
# 'Group' => determine active owner by Cluster Group ownership (legacy behavior)
$OwnershipSource = 'Disk' # ✅ Set to 'Disk' or 'Group'
# DISK MONITORING CONFIGURATION (used when OwnershipSource = 'Disk')
# These three values are passed to Get-ActiveOwner when the active owner is determined.
$SharedDiskUniqueId = '6589CFC000000307DC30980C15CE8818' # ✅ Your 10GB VMDK UniqueId
$SharedDiskLabel = 'WSFCData1' # Optional helper label
$DiskResourceName = 'Cluster Disk 1' # Optional cluster resource name
# VPG-to-node mapping: $VpgName1 is the VPG for $Node1, $VpgName2 the VPG for $Node2.
$VpgName1 = 'Node1' # mapped to Node1
$VpgName2 = 'node2' # mapped to Node2 (lowercase in your env)
# --- LOGGING CONFIGURATION ---
$LogLevel = 'INFO' # DEBUG, INFO, WARN, ERROR
# One log file per run: the timestamp is evaluated once, when this line executes.
$LogFile = "C:\Temp\PwshCompat\cluster-monitor-$(Get-Date -Format 'yyyyMMdd-HHmmss').log"
# --- LOGGING FUNCTIONS ---
function Write-Log {
    <#
    .SYNOPSIS
        Writes one formatted log entry to the console (colour-coded) and to $LogFile.
    .DESCRIPTION
        Reads two variables from the calling scope:
          $LogLevel - minimum severity to emit (DEBUG, INFO, WARN, ERROR). Entries below
                      it are dropped. A missing or unrecognised value emits everything.
          $LogFile  - path of the log file. Its parent directory is created on demand.
        A failure to write the file is reported on the console and never aborts the caller.
    .PARAMETER Message
        Text of the log entry.
    .PARAMETER Level
        Severity of the entry: DEBUG, INFO, WARN or ERROR. Defaults to INFO.
    .PARAMETER Component
        Short tag naming the part of the script that produced the entry. Defaults to MAIN.
    #>
    param(
        [string]$Message,
        [ValidateSet('DEBUG','INFO','WARN','ERROR')]$Level = 'INFO',
        [string]$Component = 'MAIN'
    )

    # Honour the configured threshold. Hashtable literals are case-insensitive,
    # so 'info' and 'INFO' are treated the same.
    $severity = @{ DEBUG = 0; INFO = 1; WARN = 2; ERROR = 3 }
    $threshold = 0
    if ($LogLevel -and $severity.ContainsKey("$LogLevel")) {
        $threshold = $severity["$LogLevel"]
    }
    if ($severity["$Level"] -lt $threshold) {
        return
    }

    $timestamp = Get-Date -Format 'yyyy-MM-dd HH:mm:ss.fff'
    $logEntry = "[$timestamp] [$Level] [$Component] $Message"

    # Write to console with color coding
    switch ($Level) {
        'DEBUG' { Write-Host $logEntry -ForegroundColor Gray }
        'INFO'  { Write-Host $logEntry -ForegroundColor White }
        'WARN'  { Write-Host $logEntry -ForegroundColor Yellow }
        'ERROR' { Write-Host $logEntry -ForegroundColor Red }
    }

    # Write to log file. -ErrorAction Stop turns file errors into terminating errors
    # so the catch block below can actually report them.
    try {
        $logDir = Split-Path -Path $LogFile -Parent
        if ($logDir -and -not (Test-Path -LiteralPath $logDir)) {
            # Add-Content does not create missing parent directories.
            New-Item -ItemType Directory -Path $logDir -Force -ErrorAction Stop | Out-Null
        }
        Add-Content -LiteralPath $LogFile -Value $logEntry -ErrorAction Stop
    } catch {
        Write-Host "Failed to write to log file: $($_.Exception.Message)" -ForegroundColor Red
    }
}
function Write-SectionHeader {
    # Logs a three-line INFO banner: a 60-character '=' rule, the title
    # (prefixed with one space), and the rule again.
    param([string]$Title)

    $rule = [string]::new([char]'=', 60)
    foreach ($bannerLine in @($rule, " $Title", $rule)) {
        Write-Log $bannerLine 'INFO'
    }
}
# --- HOUSEKEEPING (keeps WinPS compat/temp out of locked profile paths) ---
# The working directory is created BEFORE the first log call: $LogFile lives inside it,
# and entries written while the directory is missing would never reach the file.
New-Item -ItemType Directory -Path "C:\Temp\PwshCompat" -Force | Out-Null
# Redirect the process-level temp locations to the working directory.
$env:TEMP = 'C:\Temp\PwshCompat'; $env:TMP = 'C:\Temp\PwshCompat'
Write-SectionHeader "INITIALIZATION"
Write-Log "Creating temporary directory and setting environment variables" 'INFO' 'INIT'
Write-Log "Temporary directory created: C:\Temp\PwshCompat" 'DEBUG' 'INIT'
Write-Log "Log file will be written to: $LogFile" 'INFO' 'INIT'
Write-Log "Ownership detection mode: $OwnershipSource" 'INFO' 'INIT'
# --- MODULES ---
Write-SectionHeader "MODULE LOADING"

# Each entry describes one required module: the name shown while loading, the name
# shown in the success/failure messages, and the Import-Module arguments to splat.
$requiredModules = @(
    @{
        FullName   = 'FailoverClusters'
        ShortName  = 'FailoverClusters'
        ImportArgs = @{ Name = 'FailoverClusters'; SkipEditionCheck = $true }
    }
    @{
        FullName   = 'Zerto.ZvmLinux.Commandlets'
        ShortName  = 'Zerto'
        ImportArgs = @{ Name = 'Zerto.ZvmLinux.Commandlets'; Force = $true }
    }
)

foreach ($moduleSpec in $requiredModules) {
    $importArgs = $moduleSpec.ImportArgs
    try {
        Write-Log "Loading $($moduleSpec.FullName) module..." 'INFO' 'MODULES'
        Import-Module @importArgs -ErrorAction Stop
        Write-Log "$($moduleSpec.ShortName) module loaded successfully" 'INFO' 'MODULES'
    } catch {
        # A missing module is fatal: log it, then re-throw the original error.
        Write-Log "Failed to load $($moduleSpec.ShortName) module: $($_.Exception.Message)" 'ERROR' 'MODULES'
        throw
    }
}

# Turning off certificate validation is best-effort: a failure is only a warning.
try {
    Write-Log "Disabling SSL certificate validation for ZVM connection..." 'INFO' 'MODULES'
    Remove-ZvmSslCheck
    Write-Log "SSL certificate validation disabled" 'INFO' 'MODULES'
} catch {
    Write-Log "Failed to disable SSL check: $($_.Exception.Message)" 'WARN' 'MODULES'
}
# --- CREDENTIALS (from hard-coded strings) ---
Write-SectionHeader "CREDENTIAL SETUP"
Write-Log "Creating credentials for ZVM and cluster nodes..." 'INFO' 'CREDS'
try {
    # ZVM account: wrap the plain-text password in a SecureString, then build the credential.
    $zvmSecret = ConvertTo-SecureString -String $ZvmPassPlain -AsPlainText -Force
    $ZvmCredential = New-Object -TypeName pscredential -ArgumentList $ZvmUser, $zvmSecret
    Write-Log "ZVM credentials created for user: $ZvmUser" 'DEBUG' 'CREDS'

    # Cluster node account, built the same way.
    $nodeSecret = ConvertTo-SecureString -String $NodePassPlain -AsPlainText -Force
    $NodeCredential = New-Object -TypeName pscredential -ArgumentList $NodeUser, $nodeSecret
    Write-Log "Node credentials created for user: $NodeUser" 'DEBUG' 'CREDS'

    Write-Log "All credentials created successfully" 'INFO' 'CREDS'
} catch {
    # Without credentials nothing else can work: log and re-throw.
    Write-Log "Failed to create credentials: $($_.Exception.Message)" 'ERROR' 'CREDS'
    throw
}
# --- CONNECT TO ZVM ---
Write-SectionHeader "ZVM CONNECTION"
Write-Log "Attempting to connect to ZVM at: $ZvmHost" 'INFO' 'ZVM'
$zvmLogin = @{
    HostName   = $ZvmHost
    Credential = $ZvmCredential
}
try {
    # The cmdlet's output is discarded; only success or failure matters here.
    $null = Connect-ZVM @zvmLogin
    Write-Log "Successfully connected to ZVM at $ZvmHost" 'INFO' 'ZVM'
} catch {
    # A failed connection is fatal: log it and re-throw the original error.
    Write-Log "Failed to connect to ZVM: $($_.Exception.Message)" 'ERROR' 'ZVM'
    throw
}
# --- LOOK UP VPGs & IDs ---
Write-SectionHeader "VPG DISCOVERY"
Write-Log "Retrieving VPG information from ZVM..." 'INFO' 'VPG'
# Properties kept from each VPG object; Link carries the identifier used by the actions.
$vpgFields = 'VpgName', 'Status', 'SubStatus', 'Link'
try {
    # First VPG (mapped to Node1). A missing VPG is fatal.
    Write-Log "Looking up VPG: $VpgName1" 'DEBUG' 'VPG'
    $vpg1 = Get-ZvmVpg -VpgName $VpgName1 | Select-Object -Property $vpgFields
    if (-not $vpg1) {
        Write-Log "VPG1 not found: $VpgName1" 'ERROR' 'VPG'
        throw "VPG '$VpgName1' not found"
    }
    Write-Log "Found VPG1: $($vpg1.VpgName) - Status: $($vpg1.Status) - SubStatus: $($vpg1.SubStatus)" 'INFO' 'VPG'

    # Second VPG (mapped to Node2), same rules.
    Write-Log "Looking up VPG: $VpgName2" 'DEBUG' 'VPG'
    $vpg2 = Get-ZvmVpg -VpgName $VpgName2 | Select-Object -Property $vpgFields
    if (-not $vpg2) {
        Write-Log "VPG2 not found: $VpgName2" 'ERROR' 'VPG'
        throw "VPG '$VpgName2' not found"
    }
    Write-Log "Found VPG2: $($vpg2.VpgName) - Status: $($vpg2.Status) - SubStatus: $($vpg2.SubStatus)" 'INFO' 'VPG'

    # Identifiers passed to the pause/resume/force-sync cmdlets later on.
    $vpgid1 = $vpg1.Link.Identifier
    $vpgid2 = $vpg2.Link.Identifier
    Write-Log "VPG IDs retrieved - VPG1: $vpgid1, VPG2: $vpgid2" 'DEBUG' 'VPG'
} catch {
    # Covers both cmdlet failures and the "not found" throws above.
    Write-Log "Failed to retrieve VPG information: $($_.Exception.Message)" 'ERROR' 'VPG'
    throw
}

Write-Log "VPG Summary:" 'INFO' 'VPG'
Write-Log " Node1=$Node1 VPG=$($vpg1.VpgName) Id=$vpgid1 SubStatus=$($vpg1.SubStatus)" 'INFO' 'VPG'
Write-Log " Node2=$Node2 VPG=$($vpg2.VpgName) Id=$vpgid2 SubStatus=$($vpg2.SubStatus)" 'INFO' 'VPG'
# --- OWNERSHIP DETECTION FUNCTIONS ---
# Logs a section banner only. The function definitions that follow do no work
# until the "DETERMINE ACTIVE OWNER" section calls Get-ActiveOwner.
Write-SectionHeader "OWNERSHIP DETECTION FUNCTIONS"
function Get-ClusterDiskOwnerByResource {
    <#
    .SYNOPSIS
        Returns the owner node of a cluster 'Physical Disk' resource, or $null.
    .DESCRIPTION
        Opens a DCOM CIM session to each probe node in turn and queries
        root\MSCluster : MSCluster_Resource for resources of type 'Physical Disk'.
        The first node that yields a matching resource with a non-empty OwnerNode wins.
        A node that cannot be reached or queried is logged at DEBUG level and skipped.
    .PARAMETER ResourceName
        Name of the disk resource. An exact name match is preferred; if there is none,
        the first resource whose name contains this text is used. When empty, the first
        Physical Disk resource returned is used.
    .PARAMETER Cred
        Credential used to open the CIM sessions.
    .PARAMETER ProbeNode1
        First node to query.
    .PARAMETER ProbeNode2
        Second node to query if the first gives no answer.
    .OUTPUTS
        The OwnerNode value of the single selected resource, or $null if no owner was found.
    #>
    param(
        [string]$ResourceName,
        [pscredential]$Cred,
        [string]$ProbeNode1,
        [string]$ProbeNode2
    )

    Write-Log "Attempting to find disk owner by cluster resource: $ResourceName" 'DEBUG' 'DISK'
    $opt = New-CimSessionOption -Protocol Dcom

    foreach ($n in @($ProbeNode1, $ProbeNode2)) {
        $s = $null
        try {
            Write-Log "Checking cluster resources on node: $n" 'DEBUG' 'DISK'
            $s = New-CimSession -ComputerName $n -Credential $Cred -SessionOption $opt -ErrorAction Stop
            $r = Get-CimInstance -Namespace root\MSCluster -ClassName MSCluster_Resource -CimSession $s `
                -Filter "Type='Physical Disk'"

            if ($r) {
                $match = $null
                if ($ResourceName) {
                    # Prefer the exact name so 'Cluster Disk 1' never also selects
                    # 'Cluster Disk 10'; fall back to a substring match only if needed.
                    $match = $r | Where-Object { $_.Name -eq $ResourceName } | Select-Object -First 1
                    if (-not $match) {
                        $match = $r | Where-Object { $_.Name -like "*$ResourceName*" } | Select-Object -First 1
                    }
                } else {
                    $match = $r | Select-Object -First 1
                }

                # Exactly one resource is selected, so OwnerNode is a single value.
                if ($match -and $match.OwnerNode) {
                    Write-Log "Found disk owner via cluster resource: $($match.OwnerNode)" 'INFO' 'DISK'
                    return $match.OwnerNode
                }
            }
        } catch {
            Write-Log "Failed to check cluster resources on $n : $($_.Exception.Message)" 'DEBUG' 'DISK'
        } finally {
            # Always release the session, including on early return or query failure.
            if ($s) {
                Remove-CimSession -CimSession $s -ErrorAction SilentlyContinue
            }
        }
    }

    Write-Log "No disk owner found via cluster resource method" 'WARN' 'DISK'
    return $null
}
function Get-ActiveClusterNodeCim {
    <#
    .SYNOPSIS
        Returns the owner node of a cluster resource group, queried over CIM/DCOM, or $null.
    .DESCRIPTION
        Tries the preferred node first, then the fallback node. On each, it queries
        root\MSCluster : MSCluster_ResourceGroup for the group with the given name and
        returns its OwnerNode. Connection or query failures are logged at DEBUG level
        and the next node is tried.
    .PARAMETER PreferredNode
        Node to query first.
    .PARAMETER FallbackNode
        Node to query if the preferred node gives no answer.
    .PARAMETER GroupName
        Name of the cluster resource group.
    .PARAMETER Cred
        Credential used to open the CIM sessions.
    .OUTPUTS
        The group's OwnerNode value, or $null if it could not be determined.
    #>
    param([string]$PreferredNode,[string]$FallbackNode,[string]$GroupName,[pscredential]$Cred)

    Write-Log "Attempting CIM connection to determine cluster group owner..." 'DEBUG' 'GROUP'
    $opt = New-CimSessionOption -Protocol Dcom

    # WQL string literals use backslash escaping; escape backslashes and single quotes
    # so a group name containing either cannot break (or alter) the filter.
    $wqlGroupName = $GroupName.Replace('\', '\\').Replace("'", "\'")
    $groupFilter = "Name='$wqlGroupName'"

    foreach ($n in @($PreferredNode, $FallbackNode)) {
        $sess = $null
        try {
            Write-Log "Trying CIM connection to node: $n" 'DEBUG' 'GROUP'
            $sess = New-CimSession -ComputerName $n -Credential $Cred -SessionOption $opt -ErrorAction Stop
            Write-Log "CIM session established to $n" 'DEBUG' 'GROUP'
            $rg = Get-CimInstance -Namespace root\MSCluster -ClassName MSCluster_ResourceGroup -CimSession $sess -Filter $groupFilter
            if ($rg -and $rg.OwnerNode) {
                Write-Log "Cluster group owner determined via CIM: $($rg.OwnerNode)" 'INFO' 'GROUP'
                return $rg.OwnerNode
            }
        } catch {
            Write-Log "CIM connection failed to $n : $($_.Exception.Message)" 'DEBUG' 'GROUP'
        } finally {
            # Always release the session, including on early return or query failure.
            if ($sess) {
                Remove-CimSession -CimSession $sess -ErrorAction SilentlyContinue
            }
        }
    }

    Write-Log "CIM method failed to determine cluster group owner" 'WARN' 'GROUP'
    return $null
}
function Get-ActiveOwner {
    <#
    .SYNOPSIS
        Determines which cluster node is currently the active owner.
    .DESCRIPTION
        Detection order:
          1. Mode 'Disk' only: owner of the cluster Physical Disk resource
             (Get-ClusterDiskOwnerByResource).
          2. Owner of the cluster group over CIM (Get-ActiveClusterNodeCim).
          3. Owner of the cluster group over WinRM (Invoke-Command + Get-ClusterGroup),
             tried against Node1 and then Node2.
        Mode 'Group' starts at step 2. Mode 'Disk' falls through to steps 2 and 3 when
        step 1 finds no owner.
    .PARAMETER Mode
        'Disk' or 'Group'.
    .PARAMETER GroupName
        Cluster group whose owner is queried in steps 2 and 3.
    .PARAMETER Cred
        Credential used for the CIM and WinRM connections.
    .PARAMETER Node1
        First node to probe.
    .PARAMETER Node2
        Second node to probe.
    .PARAMETER DiskResourceName
        Cluster disk resource name used in step 1.
    .PARAMETER DiskUniqueId
        Accepted for interface compatibility; not used by this function.
    .PARAMETER DiskLabel
        Accepted for interface compatibility; not used by this function.
    .PARAMETER DnsSuffix
        Suffix appended to a node name for the WinRM fallback when the name contains no
        dot. Defaults to 'lab.local'. Names that already contain a dot (FQDNs, IP
        addresses) are used unchanged. An empty value disables the suffix.
    .OUTPUTS
        The owner node name, or $null if every detection method failed.
    #>
    param(
        [ValidateSet('Disk','Group')]$Mode,
        [string]$GroupName,
        [pscredential]$Cred,
        [string]$Node1,[string]$Node2,
        [string]$DiskResourceName,
        [string]$DiskUniqueId,
        [string]$DiskLabel,
        [string]$DnsSuffix = 'lab.local'
    )

    Write-Log "Determining active owner using mode: $Mode" 'INFO' 'OWNER'

    if ($Mode -eq 'Disk') {
        Write-Log "Using disk-based ownership detection" 'INFO' 'OWNER'
        # 1) Prefer cluster Physical Disk resource owner (if present)
        Write-Log "Step 1: Checking cluster Physical Disk resource owner..." 'DEBUG' 'OWNER'
        $owner = Get-ClusterDiskOwnerByResource -ResourceName $DiskResourceName `
            -Cred $Cred -ProbeNode1 $Node1 -ProbeNode2 $Node2
        if ($owner) {
            Write-Log "Active owner determined via cluster disk resource: $owner" 'INFO' 'OWNER'
            return $owner
        }
        Write-Log "Disk-based detection failed, falling back to group-based detection" 'WARN' 'OWNER'
    }

    # Group owner detection (CIM first, then WinRM fallback)
    Write-Log "Using group-based ownership detection" 'INFO' 'OWNER'

    # Group owner (CIM)
    $owner = Get-ActiveClusterNodeCim -PreferredNode $Node1 -FallbackNode $Node2 -GroupName $GroupName -Cred $Cred
    if ($owner) {
        Write-Log "Active owner determined via CIM group detection: $owner" 'INFO' 'OWNER'
        return $owner
    }

    # Group owner (WinRM fallback)
    Write-Log "CIM method failed, trying WinRM fallback..." 'INFO' 'OWNER'
    $suffix = "$DnsSuffix".Trim().TrimStart('.')
    foreach ($n in @($Node1, $Node2)) {
        # Append the DNS suffix only to bare host names; a name that already contains
        # a dot would otherwise become an unresolvable 'host.domain.suffix'.
        $target = if ($suffix -and ($n -notlike '*.*')) { "$n.$suffix" } else { $n }
        try {
            Write-Log "Attempting WinRM connection to: $target" 'DEBUG' 'OWNER'
            $owner = Invoke-Command -ComputerName $target -Authentication Negotiate -Credential $Cred -ScriptBlock {
                Import-Module FailoverClusters
                (Get-ClusterGroup -Name $using:GroupName).OwnerNode.Name
            } -ErrorAction Stop
            if ($owner) {
                Write-Log "Active owner determined via WinRM group detection: $owner" 'INFO' 'OWNER'
                return $owner
            }
        } catch {
            Write-Log "WinRM connection failed to $n : $($_.Exception.Message)" 'DEBUG' 'OWNER'
        }
    }

    Write-Log "All ownership detection methods failed" 'ERROR' 'OWNER'
    return $null
}
# --- DETERMINE ACTIVE OWNER ---
Write-SectionHeader "ACTIVE OWNER DETERMINATION"
Write-Log "Ownership detection mode: $OwnershipSource" 'INFO' 'CLUSTER'

# Arguments for Get-ActiveOwner, collected in one table and splatted.
$ownerLookup = @{
    Mode             = $OwnershipSource
    GroupName        = $WSFCGroupName
    Cred             = $NodeCredential
    Node1            = $Node1
    Node2            = $Node2
    DiskResourceName = $DiskResourceName
    DiskUniqueId     = $SharedDiskUniqueId
    DiskLabel        = $SharedDiskLabel
}
$activeNode = Get-ActiveOwner @ownerLookup

if ($activeNode) {
    Write-Log "Active owner determined (mode=$OwnershipSource): $activeNode" 'INFO' 'CLUSTER'
} else {
    # Without a known owner no VPG may be touched: log and abort.
    Write-Log "Unable to determine active owner using mode '$OwnershipSource'" 'ERROR' 'CLUSTER'
    throw "Unable to determine active owner using mode '$OwnershipSource'."
}
# --- VPG STATE ASSESSMENT ---
Write-SectionHeader "VPG STATE ASSESSMENT"

# Map the detected owner onto Node1/Node2. An exact (case-insensitive) match is tried
# first; otherwise the first DNS label is compared, so 'HOST' and 'host.domain.tld'
# are treated as the same node. On a match $activeNode is set to the configured node
# name, which keeps the '-eq $Node1' / '-eq $Node2' checks below reliable.
$detectedOwner = "$activeNode".Trim()
$detectedHost  = ($detectedOwner -split '\.')[0]
$ownerMapped   = $true
if ($detectedOwner -eq $Node1) {
    $activeNode = $Node1; $desiredOwner = $VpgName1
} elseif ($detectedOwner -eq $Node2) {
    $activeNode = $Node2; $desiredOwner = $VpgName2
} elseif ($detectedHost -eq ("$Node1" -split '\.')[0]) {
    $activeNode = $Node1; $desiredOwner = $VpgName1
} elseif ($detectedHost -eq ("$Node2" -split '\.')[0]) {
    $activeNode = $Node2; $desiredOwner = $VpgName2
} else {
    $ownerMapped = $false; $desiredOwner = $null
}

if (-not $ownerMapped) {
    # An owner that maps to neither node would make every "need" flag below false and
    # the script would wrongly report that no change is needed. Fail loudly instead.
    Write-Log "Owner '$detectedOwner' does not match Node1 ($Node1) or Node2 ($Node2); VPG state cannot be assessed" 'ERROR' 'ASSESSMENT'
    throw "Active owner '$detectedOwner' does not match Node1 ($Node1) or Node2 ($Node2)."
}

Write-Log "Owner '$activeNode' maps to VPG '$desiredOwner'" 'INFO' 'ASSESSMENT'
Write-Log "Current VPG states:" 'INFO' 'ASSESSMENT'
Write-Log " VPG1 ($VpgName1): Status=$($vpg1.Status), SubStatus=$($vpg1.SubStatus)" 'INFO' 'ASSESSMENT'
Write-Log " VPG2 ($VpgName2): Status=$($vpg2.Status), SubStatus=$($vpg2.SubStatus)" 'INFO' 'ASSESSMENT'

# Desired states:
# - If owner = Node1 => VPG1 should be active (NOT paused), VPG2 should be paused
# - If owner = Node2 => VPG2 should be active (NOT paused), VPG1 should be paused
# Only the SubStatus 'ReplicationPausedUserInitiated' is treated as "paused".
$needResume1 = ($activeNode -eq $Node1) -and ($vpg1.SubStatus -eq 'ReplicationPausedUserInitiated')
$needPause1 = ($activeNode -eq $Node2) -and ($vpg1.SubStatus -ne 'ReplicationPausedUserInitiated')
$needResume2 = ($activeNode -eq $Node2) -and ($vpg2.SubStatus -eq 'ReplicationPausedUserInitiated')
$needPause2 = ($activeNode -eq $Node1) -and ($vpg2.SubStatus -ne 'ReplicationPausedUserInitiated')

Write-Log "Required actions analysis:" 'INFO' 'ASSESSMENT'
Write-Log " VPG1 Resume needed: $needResume1" 'DEBUG' 'ASSESSMENT'
Write-Log " VPG1 Pause needed: $needPause1" 'DEBUG' 'ASSESSMENT'
Write-Log " VPG2 Resume needed: $needResume2" 'DEBUG' 'ASSESSMENT'
Write-Log " VPG2 Pause needed: $needPause2" 'DEBUG' 'ASSESSMENT'

if (-not ($needResume1 -or $needPause1 -or $needResume2 -or $needPause2)) {
    Write-Log "✅ Owner matches active VPG; no change is needed." 'INFO' 'ASSESSMENT'
} else {
    Write-Log "🔧 Adjustments required to match owner:" 'INFO' 'ASSESSMENT'
    if ($needResume1) { Write-Log " - Resume $($vpg1.VpgName)" 'INFO' 'ASSESSMENT' }
    if ($needPause1) { Write-Log " - Pause $($vpg1.VpgName)" 'INFO' 'ASSESSMENT' }
    if ($needResume2) { Write-Log " - Resume $($vpg2.VpgName)" 'INFO' 'ASSESSMENT' }
    if ($needPause2) { Write-Log " - Pause $($vpg2.VpgName)" 'INFO' 'ASSESSMENT' }
}
# --- ACTIONS (idempotent) ---
Write-SectionHeader "VPG ACTIONS"

# One entry per VPG, processed in order (VPG1, then VPG2). For each VPG the resume
# step is evaluated before the pause step.
#   OwnNode  - node this VPG is mapped to (it should replicate when that node is active)
#   PeerNode - the other node (this VPG should be paused when that node is active)
$vpgWorkItems = @(
    @{ Vpg = $vpg1; Id = $vpgid1; Resume = $needResume1; Pause = $needPause1; OwnNode = $Node1; PeerNode = $Node2 }
    @{ Vpg = $vpg2; Id = $vpgid2; Resume = $needResume2; Pause = $needPause2; OwnNode = $Node2; PeerNode = $Node1 }
)

foreach ($item in $vpgWorkItems) {
    $vpgLabel = $item.Vpg.VpgName
    $vpgKey = $item.Id

    # Resume followed by a force sync, or report that replication is already running.
    if ($item.Resume) {
        Write-Log "Resuming $vpgLabel..." 'INFO' 'ACTION'
        try {
            Start-ZvmVpgResume -VpgId $vpgKey
            Write-Log "Successfully initiated resume for $vpgLabel" 'INFO' 'ACTION'
            Write-Log "Waiting 10 seconds before force sync..." 'INFO' 'ACTION'
            Start-Sleep -Seconds 10
            Write-Log "Initiating force sync for $vpgLabel..." 'INFO' 'ACTION'
            Start-ZvmVpgForceSync -VpgId $vpgKey
            Write-Log "Successfully initiated force sync for $vpgLabel" 'INFO' 'ACTION'
        } catch {
            # Failures are logged; the remaining actions still run.
            Write-Log "Failed to resume/sync $vpgLabel`: $($_.Exception.Message)" 'ERROR' 'ACTION'
        }
    } elseif ($activeNode -eq $item.OwnNode) {
        Write-Log "$vpgLabel already replicating (no action needed)" 'INFO' 'ACTION'
    }

    # Pause, or report that the VPG is already paused.
    if ($item.Pause) {
        Write-Log "Pausing $vpgLabel..." 'INFO' 'ACTION'
        try {
            Start-ZvmVpgPause -VpgId $vpgKey
            Write-Log "Successfully initiated pause for $vpgLabel" 'INFO' 'ACTION'
        } catch {
            Write-Log "Failed to pause $vpgLabel`: $($_.Exception.Message)" 'ERROR' 'ACTION'
        }
    } elseif ($activeNode -eq $item.PeerNode) {
        Write-Log "$vpgLabel already paused (no action needed)" 'INFO' 'ACTION'
    }
}

Write-Log "Waiting 10 seconds before final status check..." 'INFO' 'ACTION'
Start-Sleep -Seconds 10
# --- FINAL STATUS ---
Write-SectionHeader "FINAL STATUS"
Write-Log "Retrieving final VPG status..." 'INFO' 'STATUS'
# Columns shown in both the log lines and the tables below.
$statusColumns = 'VpgName', 'Status', 'SubStatus'
try {
    # Re-query both VPGs so the report reflects the state after the actions above.
    $finalVpg1 = Get-ZvmVpg -VpgName $VpgName1 | Select-Object -Property $statusColumns
    $finalVpg2 = Get-ZvmVpg -VpgName $VpgName2 | Select-Object -Property $statusColumns

    Write-Log "Final VPG Status:" 'INFO' 'STATUS'
    Write-Log " VPG1 ($($finalVpg1.VpgName)): Status=$($finalVpg1.Status), SubStatus=$($finalVpg1.SubStatus)" 'INFO' 'STATUS'
    Write-Log " VPG2 ($($finalVpg2.VpgName)): Status=$($finalVpg2.Status), SubStatus=$($finalVpg2.SubStatus)" 'INFO' 'STATUS'

    # One table per VPG, emitted to the output stream (not to the log file).
    Write-Log "Detailed VPG Status Tables:" 'INFO' 'STATUS'
    foreach ($vpgSnapshot in @($finalVpg1, $finalVpg2)) {
        $vpgSnapshot | Format-Table -Property $statusColumns -AutoSize
    }
} catch {
    # A failed status query is reported but does not abort the script.
    Write-Log "Failed to retrieve final VPG status: $($_.Exception.Message)" 'ERROR' 'STATUS'
}

Write-SectionHeader "SCRIPT COMPLETION"
Write-Log "Cluster monitoring script completed successfully" 'INFO' 'COMPLETE'
Write-Log "Ownership detection mode used: $OwnershipSource" 'INFO' 'COMPLETE'
Write-Log "Log file saved to: $LogFile" 'INFO' 'COMPLETE'