diff --git a/pkg/clioptions/clusterdiscovery/cluster.go b/pkg/clioptions/clusterdiscovery/cluster.go index b594d9859d13..abdf6f1bec28 100644 --- a/pkg/clioptions/clusterdiscovery/cluster.go +++ b/pkg/clioptions/clusterdiscovery/cluster.go @@ -30,6 +30,13 @@ import ( "github.com/openshift/origin/test/extended/util/azure" ) +// HypervisorConfig contains configuration for hypervisor-based recovery operations +type HypervisorConfig struct { + HypervisorIP string `json:"hypervisorIP"` + SSHUser string `json:"sshUser"` + PrivateKeyPath string `json:"privateKeyPath"` +} + type ClusterConfiguration struct { ProviderName string `json:"type"` @@ -76,6 +83,9 @@ type ClusterConfiguration struct { // IsNoOptionalCapabilities indicates the cluster has no optional capabilities enabled HasNoOptionalCapabilities bool + // HypervisorConfig contains SSH configuration for hypervisor-based recovery operations + HypervisorConfig *HypervisorConfig + // APIGroups contains the set of API groups available in the cluster APIGroups sets.Set[string] `json:"-"` // EnabledFeatureGates contains the set of enabled feature gates in the cluster diff --git a/pkg/cmd/openshift-tests/run/flags.go b/pkg/cmd/openshift-tests/run/flags.go index 106e71ce8266..618667344a00 100644 --- a/pkg/cmd/openshift-tests/run/flags.go +++ b/pkg/cmd/openshift-tests/run/flags.go @@ -1,6 +1,8 @@ package run import ( + "encoding/json" + "fmt" "os" "github.com/openshift-eng/openshift-tests-extension/pkg/extension" @@ -28,9 +30,6 @@ type RunSuiteFlags struct { ToImage string TestOptions []string - // Shared by initialization code - config *clusterdiscovery.ClusterConfiguration - genericclioptions.IOStreams } @@ -84,7 +83,7 @@ func (f *RunSuiteFlags) ToOptions(args []string, availableSuites []*testginkgo.T // shallow copy to mutate ginkgoOptions := f.GinkgoRunSuiteOptions - providerConfig, err := f.SuiteWithKubeTestInitializationPreSuite() + clusterConfig, err := f.SuiteWithKubeTestInitializationPreSuite() if err != nil { return nil, err } @@ -95,13 +94,39 @@ func (f *RunSuiteFlags) ToOptions(args []string, availableSuites []*testginkgo.T return nil, err } + // Parse hypervisor configuration if provided and set it in environment for test context + if f.GinkgoRunSuiteOptions.WithHypervisorConfigJSON != "" { + // Validate the JSON format + var hypervisorConfig clusterdiscovery.HypervisorConfig + if err := json.Unmarshal([]byte(f.GinkgoRunSuiteOptions.WithHypervisorConfigJSON), &hypervisorConfig); err != nil { + return nil, fmt.Errorf("failed to parse hypervisor configuration JSON: %v", err) + } + + // Validate required fields + if hypervisorConfig.HypervisorIP == "" { + return nil, fmt.Errorf("hypervisorIP is required in hypervisor configuration") + } + if hypervisorConfig.SSHUser == "" { + return nil, fmt.Errorf("sshUser is required in hypervisor configuration") + } + if hypervisorConfig.PrivateKeyPath == "" { + return nil, fmt.Errorf("privateKeyPath is required in hypervisor configuration") + } + + // Set the hypervisor configuration in the cluster config + clusterConfig.HypervisorConfig = &hypervisorConfig + + // Also set it in environment for test context access + os.Setenv("HYPERVISOR_CONFIG", f.GinkgoRunSuiteOptions.WithHypervisorConfigJSON) + } + o := &RunSuiteOptions{ GinkgoRunSuiteOptions: ginkgoOptions, Suite: suite, Extension: internalExtension, - ClusterConfig: providerConfig, + ClusterConfig: clusterConfig, FromRepository: f.FromRepository, - CloudProviderJSON: providerConfig.ToJSONString(), + CloudProviderJSON: 
clusterConfig.ToJSONString(), CloseFn: closeFn, IOStreams: f.IOStreams, } diff --git a/pkg/cmd/openshift-tests/run/options.go b/pkg/cmd/openshift-tests/run/options.go index 13b32ab944a4..c5f9715a47f4 100644 --- a/pkg/cmd/openshift-tests/run/options.go +++ b/pkg/cmd/openshift-tests/run/options.go @@ -31,6 +31,11 @@ type RunSuiteOptions struct { CloseFn iooptions.CloseFunc genericclioptions.IOStreams + // HypervisorConfig contains SSH configuration for hypervisor-based recovery operations + // If set, will run recovery tests that require the hypervisor-based recovery, such as + // the node replacement test in the two_node recovery suite. + HypervisorConfig *clusterdiscovery.HypervisorConfig + // ClusterConfig contains cluster-specific configuration for filtering tests ClusterConfig *clusterdiscovery.ClusterConfiguration diff --git a/pkg/test/filters/cluster_state.go b/pkg/test/filters/cluster_state.go index 2372f75c4081..4e1aa80372bc 100644 --- a/pkg/test/filters/cluster_state.go +++ b/pkg/test/filters/cluster_state.go @@ -67,6 +67,10 @@ func NewClusterStateFilter(config *clusterdiscovery.ClusterConfiguration) *Clust skips = append(skips, "[Skipped:NoOptionalCapabilities]") } + if config.HypervisorConfig == nil { + skips = append(skips, "[Requires:HypervisorSSHConfig]") + } + logrus.WithField("skips", skips).Info("Generated skips for cluster state") return &ClusterStateFilter{ diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go index 76ed19950eea..347e4623333b 100644 --- a/pkg/test/ginkgo/cmd_runsuite.go +++ b/pkg/test/ginkgo/cmd_runsuite.go @@ -96,6 +96,9 @@ type GinkgoRunSuiteOptions struct { // RetryStrategy controls retry behavior and final outcome decisions RetryStrategy RetryStrategy + + // WithHypervisorConfigJSON contains JSON configuration for hypervisor-based recovery operations + WithHypervisorConfigJSON string } func NewGinkgoRunSuiteOptions(streams genericclioptions.IOStreams) *GinkgoRunSuiteOptions { @@ -133,6 +136,7 @@ func (o *GinkgoRunSuiteOptions) BindFlags(flags *pflag.FlagSet) { flags.StringVar(&o.ShardStrategy, "shard-strategy", o.ShardStrategy, "Which strategy to use for sharding (hash)") availableStrategies := getAvailableRetryStrategies() flags.Var(newRetryStrategyFlag(&o.RetryStrategy), "retry-strategy", fmt.Sprintf("Test retry strategy (available: %s, default: %s)", strings.Join(availableStrategies, ", "), defaultRetryStrategy)) + flags.StringVar(&o.WithHypervisorConfigJSON, "with-hypervisor-json", os.Getenv("HYPERVISOR_CONFIG"), "JSON configuration for hypervisor-based recovery operations. Must contain hypervisorIP, sshUser, and privateKeyPath fields.") } func (o *GinkgoRunSuiteOptions) Validate() error { diff --git a/test/extended/two_node/utils/hypervisor.go b/test/extended/two_node/utils/hypervisor.go new file mode 100644 index 000000000000..f6cb8d760d4a --- /dev/null +++ b/test/extended/two_node/utils/hypervisor.go @@ -0,0 +1,125 @@ +// Package utils provides hypervisor configuration and validation utilities for two-node cluster testing. +// +// Tests requiring hypervisor access should include the [Requires:HypervisorSSHConfig] annotation. 
+// +// Configuration can be provided via command-line flag or environment variable: +// +// openshift-tests run openshift/two-node --with-hypervisor-json='{ +// "hypervisorIP": "192.168.111.1", +// "sshUser": "root", +// "privateKeyPath": "/path/to/private/key" +// }' +// +// Or: +// +// export HYPERVISOR_CONFIG='{"hypervisorIP":"192.168.111.1","sshUser":"root","privateKeyPath":"/path/to/key"}' +// openshift-tests run openshift/two-node +// +// Usage example: +// +// if !exutil.HasHypervisorConfig() { +// utils.PrintHypervisorConfigUsage() +// return +// } +// config := exutil.GetHypervisorConfig() +// utils.VerifyHypervisorAvailability(&config, knownHostsPath) +package utils + +import ( + "fmt" + "strings" + + g "github.com/onsi/ginkgo/v2" + "k8s.io/klog/v2" +) + +// PrintHypervisorConfigUsage prints usage instructions for configuring hypervisor SSH access. +// Call this when HasHypervisorConfig() returns false to provide configuration guidance. +func PrintHypervisorConfigUsage() { + usageMessage := ` +================================================================================ +Two-Node Test Suite - Hypervisor Configuration Required +================================================================================ + +This test requires hypervisor SSH configuration to manage virtual machines +and perform node operations. The [Requires:HypervisorSSHConfig] annotation +indicates this requirement. + +CONFIGURATION METHODS: + +1. Command-Line Flag (recommended for interactive testing): + + openshift-tests run openshift/two-node --with-hypervisor-json='{ + "hypervisorIP": "192.168.111.1", + "sshUser": "root", + "privateKeyPath": "/path/to/private/key" + }' + +2. Environment Variable (recommended for CI/CD): + + export HYPERVISOR_CONFIG='{"hypervisorIP":"192.168.111.1","sshUser":"root","privateKeyPath":"/path/to/key"}' + openshift-tests run openshift/two-node + +CONFIGURATION FIELDS: + +- hypervisorIP: IP address or hostname of the hypervisor +- sshUser: SSH username (typically "root") +- privateKeyPath: Absolute path to SSH private key file + +TROUBLESHOOTING: + +If configuration fails: +1. Verify JSON syntax is valid +2. Check that the private key file exists +3. Test SSH connectivity: ssh -i <privateKeyPath> <sshUser>@<hypervisorIP> +4. Verify virsh is available: ssh <sshUser>@<hypervisorIP> 'virsh version' + +================================================================================ +` + g.GinkgoT().Log(usageMessage) +} + +// VerifyHypervisorAvailability verifies SSH connectivity to the hypervisor and checks +// that virsh and libvirt are available. 
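+//
+// Illustrative sketch only (the SSHConfig values shown are hypothetical, and the
+// known_hosts file is assumed to have been created with PrepareLocalKnownHostsFile):
+//
+//    cfg := &SSHConfig{IP: "192.168.111.1", User: "root", PrivateKeyPath: "/path/to/key"}
+//    if err := VerifyHypervisorAvailability(cfg, knownHostsPath); err != nil {
+//        return fmt.Errorf("hypervisor is not usable for recovery tests: %w", err)
+//    }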
+func VerifyHypervisorAvailability(sshConfig *SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("Verifying hypervisor connectivity to %s@%s", sshConfig.User, sshConfig.IP) + + // Test basic SSH connectivity + output, _, err := VerifyConnectivity(sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "Failed to establish SSH connection to hypervisor", + "user", sshConfig.User, + "host", sshConfig.IP, + "output", output) + klog.ErrorS(nil, "Ensure the hypervisor is accessible and SSH key is correct") + return fmt.Errorf("failed to establish SSH connection to hypervisor %s@%s: %w", sshConfig.User, sshConfig.IP, err) + } + klog.V(2).Infof("SSH connectivity verified: %s", strings.TrimSpace(output)) + + // Test virsh availability and basic functionality + output, err = VerifyVirsh(sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "virsh is not available or not working on hypervisor", + "user", sshConfig.User, + "host", sshConfig.IP, + "output", output) + klog.ErrorS(nil, "Ensure libvirt and virsh are installed on the hypervisor") + return fmt.Errorf("virsh is not available or not working on hypervisor %s@%s: %w", sshConfig.User, sshConfig.IP, err) + } + klog.V(2).Infof("virsh availability verified: %s", strings.TrimSpace(output)) + + // Test libvirt connection by listing VMs + output, err = VirshListAllVMs(sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "Failed to connect to libvirt on hypervisor", + "user", sshConfig.User, + "host", sshConfig.IP, + "output", output) + klog.ErrorS(nil, "Ensure libvirtd service is running and user has access") + return fmt.Errorf("failed to connect to libvirt on hypervisor %s@%s: %w", sshConfig.User, sshConfig.IP, err) + } + klog.V(2).Infof("libvirt connection verified, found VMs: %s", strings.TrimSpace(output)) + + klog.V(2).Infof("Hypervisor connectivity verification completed successfully") + return nil +} diff --git a/test/extended/two_node/utils/pacemaker.go b/test/extended/two_node/utils/pacemaker.go new file mode 100644 index 000000000000..189415db66a9 --- /dev/null +++ b/test/extended/two_node/utils/pacemaker.go @@ -0,0 +1,394 @@ +// Package utils provides Pacemaker cluster management utilities for two-node OpenShift cluster testing. +// +// This package enables management and recovery operations for Pacemaker-managed etcd clusters in +// two-node OpenShift deployments. It provides high-level functions for cluster operations, resource +// management, and disaster recovery scenarios. +// +// Background: +// +// Two-node OpenShift clusters use Pacemaker to manage etcd quorum and provide high availability. +// Pacemaker uses the PCS (Pacemaker Configuration System) command-line tool for cluster management. +// This package wraps PCS commands and provides utilities specific to two-node cluster recovery. +// +// Key Features: +// - Pacemaker cluster status monitoring +// - etcd resource management (start, stop, debug operations) +// - STONITH (node fencing) control +// - Cluster membership management (add/remove nodes) +// - etcd revision file restoration +// - Node job cleanup for test scenarios +// - Retry utilities for handling transient failures +// +// Error Handling: +// +// All functions in this package return errors instead of using assertions (o.Expect). +// This makes them suitable for use as library functions. Calling code should check +// and handle errors appropriately, typically using o.Expect() in test code. +// +// Common Usage Patterns: +// +// 1. 
Monitoring Cluster Status: +// +// status, stderr, err := PcsStatus(remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// resourceStatus, stderr, err := PcsResourceStatus("master-0", remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// journal, stderr, err := PcsJournal(pcsJournalTailLines, remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// +// 2. Quorum Recovery Operations: +// +// // Disable STONITH before recovery +// _, _, err := PcsDisableStonith(remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// +// // Restore etcd quorum on remote node +// _, _, err := PcsDebugRestart(remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// +// // Re-enable STONITH after recovery +// _, _, err := PcsEnableStonith(remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// +// 3. Node Replacement Operations: +// +// // Remove old node and add replacement +// err := CycleRemovedNode(failedNodeName, failedNodeIP, runningNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) +// if err != nil { +// return fmt.Errorf("failed to cycle node: %w", err) +// } +// +// // Restore etcd revision on new node +// err = RestoreEtcdRevision(nodeName, remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath, oc) +// if err != nil { +// return fmt.Errorf("failed to restore etcd revision: %w", err) +// } +// +// // Clean up old jobs +// err = DeleteNodeJobs(authJobName, afterSetupJobName, oc) +// if err != nil { +// return fmt.Errorf("failed to delete jobs: %w", err) +// } +// +// STONITH (Shoot The Other Node In The Head): +// +// STONITH is Pacemaker's node-level fencing mechanism that ensures cluster integrity by forcefully +// powering off or isolating unresponsive nodes. During recovery operations, STONITH is typically +// disabled to prevent automatic fencing, then re-enabled after the cluster is stable. +// +// Two-Node Quorum Challenge: +// +// In a two-node cluster, losing one node means losing quorum (majority). If fencing is properly enabled, +// Pacemaker will restore quorum automatically by fencing the failed node and restarting the running node +// as a cluster of one. However, if fencing fails, the PcsDebugRestart function can be used to +// bypass normal cluster checks and force etcd to start on the running node, restoring cluster operations +// until the failed node can be recovered or replaced. +// +// All PCS commands are executed on cluster nodes via two-hop SSH connections through a hypervisor, +// using the SSH utilities from this package. 
+package utils + +import ( + "fmt" + "time" + + exutil "github.com/openshift/origin/test/extended/util" + "k8s.io/klog/v2" +) + +// Pacemaker-related constants +const ( + superuserPrefix = "sudo" + pcsExecutable = "pcs" + noEnvVars = "" + + // PCS commands + pcsClusterNodeAdd = "cluster node add %s addr=%s --start --enable" + pcsResourceDebugStop = "resource debug-stop etcd --full" + pcsResourceDebugStartEnvVars = "OCF_RESKEY_CRM_meta_notify_start_resource='etcd'" + pcsResourceDebugStart = "resource debug-start etcd --full" + pcsDisableStonith = "property set stonith-enabled=false" + pcsEnableStonith = "property set stonith-enabled=true" + pcsClusterNodeRemove = "cluster node remove %s" + pcsResourceStatus = "resource status etcd node=%s" + pcsStatus = "status" + + etcdNamespace = "openshift-etcd" + mkdirEtcdDir = "sudo mkdir /var/lib/etcd" + chmodEtcdDir = "sudo chmod %o /var/lib/etcd" + revisionJSONTemplate = `{"clusterId":"0","raftIndex":{"https://%s:%s":0},"maxRaftIndex":0,"created":""}` + etcdDirPermissions = 0766 + etcdFilePermissions = 0644 + etcdPort = "2379" + chmodRevisionJSON = "sudo chmod %o /var/lib/etcd/revision.json" +) + +func formatPcsCommandString(command string, envVars string) string { + if envVars != "" { + return fmt.Sprintf("%s %s %s %s", superuserPrefix, envVars, pcsExecutable, command) + } + + return fmt.Sprintf("%s %s %s", superuserPrefix, pcsExecutable, command) +} + +// PcsDebugRestart restores etcd quorum on a node by performing a debug stop and start. +// This is used in single-node quorum recovery scenarios after a node failure. +// +// The function performs the following operations: +// 1. Stops the etcd resource using "pcs resource debug-stop etcd --full" +// 2. Starts the etcd resource using "pcs resource debug-start etcd --full" with notify metadata +// 3. Verifies the operation by checking pacemaker status +// +// This is critical for two-node clusters where losing one node would normally prevent etcd from +// achieving quorum. The debug-start bypasses normal cluster checks to force etcd to start. 
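+//
+// For reference, with the command constants defined above, the chain executed on the
+// node resolves to approximately:
+//
+//    sudo pcs resource debug-stop etcd --full && sudo OCF_RESKEY_CRM_meta_notify_start_resource='etcd' pcs resource debug-start etcd --full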
+// +// Parameters: +// - remoteNodeIP: IP address of the remote node to restore etcd on +// - sshConfig: SSH configuration for connecting to the hypervisor +// - localKnownHostsPath: Path to the known_hosts file for the hypervisor connection +// - remoteKnownHostsPath: Path to the known_hosts file on the hypervisor for the node connection +// +// Returns: +// - string: Command stdout +// - string: Command stderr +// - error: Any error that occurred during the restart operation +func PcsDebugRestart(remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + klog.V(2).Infof("Restoring etcd quorum on remote node: %s", remoteNodeIP) + + // SSH to hypervisor, then to remote node to run pcs debug-start + // We need to chain the SSH commands: host -> hypervisor -> remote node + output, stderr, err := ExecuteRemoteSSHCommand(remoteNodeIP, fmt.Sprintf("%s && %s", formatPcsCommandString(pcsResourceDebugStop, noEnvVars), formatPcsCommandString(pcsResourceDebugStart, pcsResourceDebugStartEnvVars)), sshConfig, localKnownHostsPath, remoteKnownHostsPath) + if err != nil { + klog.ErrorS(err, "Failed to restart etcd", "node", remoteNodeIP, "stderr", stderr) + return output, stderr, err + } + + // Log pacemaker status to check if etcd has been started on the remote node + pcsStatusOutput, stderr, err := PcsStatus(remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) + if err != nil { + klog.Warning("Failed to get pacemaker status on remote node", "node", remoteNodeIP, "error", err) + } else { + klog.V(4).Infof("Pacemaker status on remote node %s:\n%s", remoteNodeIP, pcsStatusOutput) + } + + klog.V(2).Infof("Successfully restored etcd quorum on remote node: %s", remoteNodeIP) + return output, stderr, nil +} + +// PcsDebugStart restores etcd quorum on a node by performing a debug start. +// This is used in single-node quorum recovery scenarios after a node failure. +// +// The function performs the following operations: +// 1. Starts the etcd resource using "pcs resource debug-start etcd --full" with notify metadata +// 2. Verifies the operation by checking pacemaker status +// +// This is critical for two-node clusters where losing one node would normally prevent etcd from +// achieving quorum. The debug-start bypasses normal cluster checks to force etcd to start. 
+// +// Parameters: +// - remoteNodeIP: IP address of the remote node to restore etcd on +// - sshConfig: SSH configuration for connecting to the hypervisor +// - localKnownHostsPath: Path to the known_hosts file for the hypervisor connection +// - remoteKnownHostsPath: Path to the known_hosts file on the hypervisor for the node connection +// +// Returns: +// - string: Command stdout +// - string: Command stderr +// - error: Any error that occurred during the restart operation +func PcsDebugStart(remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + klog.V(2).Infof("Restoring etcd quorum on remote node: %s", remoteNodeIP) + + // SSH to hypervisor, then to remote node to run pcs debug-start + // We need to chain the SSH commands: host -> hypervisor -> remote node + output, stderr, err := ExecuteRemoteSSHCommand(remoteNodeIP, formatPcsCommandString(pcsResourceDebugStart, pcsResourceDebugStartEnvVars), sshConfig, localKnownHostsPath, remoteKnownHostsPath) + if err != nil { + klog.ErrorS(err, "Failed to restart etcd", "node", remoteNodeIP, "stderr", stderr) + return output, stderr, err + } + + // Log pacemaker status to check if etcd has been started on the remote node + pcsStatusOutput, stderr, err := PcsStatus(remoteNodeIP, sshConfig, localKnownHostsPath, remoteKnownHostsPath) + if err != nil { + klog.Warning("Failed to get pacemaker status on remote node", "node", remoteNodeIP, "error", err) + } else { + klog.V(4).Infof("Pacemaker status on remote node %s:\n%s", remoteNodeIP, pcsStatusOutput) + } + + klog.V(2).Infof("Successfully restored etcd quorum on remote node: %s", remoteNodeIP) + return output, stderr, nil +} + +// PcsStatus retrieves the overall pacemaker cluster status. +// This shows the state of all cluster resources, nodes, and any failures. +func PcsStatus(remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + return ExecuteRemoteSSHCommand(remoteNodeIP, formatPcsCommandString(pcsStatus, noEnvVars), sshConfig, localKnownHostsPath, remoteKnownHostsPath) +} + +// PcsResourceStatus retrieves the status of a specific pacemaker resource (etcd) on a node. +// This is more targeted than PcsStatus and shows whether the etcd resource is started/stopped. +func PcsResourceStatus(nodeName, remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + return ExecuteRemoteSSHCommand(remoteNodeIP, formatPcsCommandString(fmt.Sprintf(pcsResourceStatus, nodeName), noEnvVars), sshConfig, localKnownHostsPath, remoteKnownHostsPath) +} + +// PcsDisableStonith disables STONITH (Shoot The Other Node In The Head) in the pacemaker cluster. +// This is typically done during maintenance or recovery operations to prevent automatic fencing. +func PcsDisableStonith(remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + return ExecuteRemoteSSHCommand(remoteNodeIP, formatPcsCommandString(pcsDisableStonith, noEnvVars), sshConfig, localKnownHostsPath, remoteKnownHostsPath) +} + +// PcsEnableStonith re-enables STONITH in the pacemaker cluster after maintenance is complete. +// STONITH provides node-level fencing to ensure cluster integrity. 
+func PcsEnableStonith(remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + return ExecuteRemoteSSHCommand(remoteNodeIP, formatPcsCommandString(pcsEnableStonith, noEnvVars), sshConfig, localKnownHostsPath, remoteKnownHostsPath) +} + +// PcsJournal retrieves the last pcsJournalTailLines lines of the pacemaker systemd journal logs. +// This is useful for debugging pacemaker behavior and troubleshooting cluster issues. +func PcsJournal(pcsJournalTailLines int, remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + return ExecuteRemoteSSHCommand(remoteNodeIP, fmt.Sprintf("sudo journalctl -u pacemaker --no-pager | grep podman-etcd | tail -n %d", pcsJournalTailLines), sshConfig, localKnownHostsPath, remoteKnownHostsPath) +} + +// RestoreEtcdRevision restores the etcd revision.json file on a replacement node and triggers etcd redeployment. +// This is a critical step in node replacement to ensure the new node can join the etcd cluster correctly. +// +// The function performs the following steps: +// 1. Creates /var/lib/etcd directory on the new node +// 2. Sets appropriate permissions on the directory (0766) +// 3. Creates revision.json with cluster metadata pointing to the new node's IP +// 4. Sets file permissions on revision.json (0644) +// 5. Triggers an etcd redeployment via the etcd operator using forceRedeploymentReason +// +// Parameters: +// - nodeName: Name of the replacement OpenShift node (unused but kept for clarity) +// - remoteNodeIP: IP address of the replacement node +// - sshConfig: SSH configuration for connecting to the hypervisor +// - localKnownHostsPath: Path to the known_hosts file for the hypervisor connection +// - remoteKnownHostsPath: Path to the known_hosts file on the hypervisor for the node connection +// - oc: OpenShift CLI client for patching the etcd operator +// +// Returns: +// - error: Any error encountered during revision file creation or etcd redeployment +func RestoreEtcdRevision(nodeName, remoteNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string, oc *exutil.CLI) error { + // Create the revision.json file on the new node using constants + revisionScript := fmt.Sprintf(` + %s + %s + echo '%s' | sudo tee -a /var/lib/etcd/revision.json + %s + `, mkdirEtcdDir, fmt.Sprintf(chmodEtcdDir, etcdDirPermissions), fmt.Sprintf(revisionJSONTemplate, remoteNodeIP, etcdPort), fmt.Sprintf(chmodRevisionJSON, etcdFilePermissions)) + + _, _, err := ExecuteRemoteSSHCommand(remoteNodeIP, revisionScript, sshConfig, localKnownHostsPath, remoteKnownHostsPath) + if err != nil { + return fmt.Errorf("failed to create etcd revision.json on node %s: %w", remoteNodeIP, err) + } + + // Redeploy etcd with a force redeployment reason + forceRedeploymentReason := fmt.Sprintf("recovery-%s", time.Now().Format(time.RFC3339Nano)) + _, err = oc.AsAdmin().Run("patch").Args("etcd", "cluster", "-p", fmt.Sprintf(`{"spec": {"forceRedeploymentReason": "%s"}}`, forceRedeploymentReason), "--type=merge").Output() + if err != nil { + return fmt.Errorf("failed to trigger etcd redeployment: %w", err) + } + + klog.V(2).Infof("Successfully restored etcd revision on node %s and triggered redeployment", remoteNodeIP) + return nil +} + +// CycleRemovedNode removes and re-adds a node in the pacemaker cluster configuration. +// This is necessary when replacing a failed node to update the cluster membership. 
+// +// The function executes two pcs commands on the remote node: +// 1. "pcs cluster node remove " - removes the old/failed node +// 2. "pcs cluster node add addr= --start --enable" - adds the replacement node +// +// Parameters: +// - failedNodeName: Name of the replacement node to cycle +// - failedNodeIP: IP address of the replacement node +// - runningNodeIP: IP address of the remote node where commands are executed +// - sshConfig: SSH configuration for connecting to the hypervisor +// - localKnownHostsPath: Path to the known_hosts file for the hypervisor connection +// - remoteKnownHostsPath: Path to the known_hosts file on the hypervisor for the node connection +// +// Returns: +// - error: Any error encountered during node removal or addition +func CycleRemovedNode(failedNodeName, failedNodeIP, runningNodeIP string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) error { + // Remove and re-add the node in pacemaker using constants + pcsScript := fmt.Sprintf(` + %s + %s + `, + formatPcsCommandString(fmt.Sprintf(pcsClusterNodeRemove, failedNodeName), noEnvVars), + formatPcsCommandString(fmt.Sprintf(pcsClusterNodeAdd, failedNodeName, failedNodeIP), noEnvVars), + ) + + _, _, err := ExecuteRemoteSSHCommand(runningNodeIP, pcsScript, sshConfig, localKnownHostsPath, remoteKnownHostsPath) + if err != nil { + return fmt.Errorf("failed to cycle node %s in pacemaker cluster: %w", failedNodeName, err) + } + + klog.V(2).Infof("Successfully cycled node %s in pacemaker cluster", failedNodeName) + return nil +} + +// DeleteNodeJobs deletes TNF (Two Node Federation) related jobs for node authentication and setup. +// These jobs need to be cleaned up during node replacement to allow new jobs to be created. +// +// Parameters: +// - authJobName: Name of the TNF authentication job to delete (e.g., "tnf-auth-job-master-0") +// - afterSetupJobName: Name of the TNF after-setup job to delete (e.g., "tnf-after-setup-job-master-0") +// - oc: OpenShift CLI client for deleting the jobs +// +// Returns: +// - error: Any error encountered during job deletion +func DeleteNodeJobs(authJobName, afterSetupJobName string, oc *exutil.CLI) error { + // Delete the old tnf-auth-job using dynamic name + _, err := oc.AsAdmin().Run("delete").Args("job", authJobName, "-n", etcdNamespace).Output() + if err != nil { + return fmt.Errorf("failed to delete job %s: %w", authJobName, err) + } + klog.V(2).Infof("Deleted job %s", authJobName) + + // Delete the old tnf-after-setup-job using dynamic name + _, err = oc.AsAdmin().Run("delete").Args("job", afterSetupJobName, "-n", etcdNamespace).Output() + if err != nil { + return fmt.Errorf("failed to delete job %s: %w", afterSetupJobName, err) + } + klog.V(2).Infof("Deleted job %s", afterSetupJobName) + + return nil +} + +// RetryOperationWithTimeout retries an operation until it succeeds or times out. +// This is a general-purpose retry utility used throughout the two-node test utilities. +// +// The function polls the operation at regular intervals until either: +// - The operation succeeds (returns nil error) +// - The timeout is exceeded +// +// This is useful for operations that may fail temporarily due to cluster state transitions, +// API server unavailability, or resource propagation delays. 
+// +// Parameters: +// - operation: Function to execute that returns an error (nil on success) +// - timeout: Maximum time to wait for the operation to succeed +// - pollInterval: Time to wait between retry attempts +// - operationName: Descriptive name for logging purposes +// +// Returns: +// - error: nil if operation succeeded, timeout error if it failed within the timeout period +// +// Example: +// +// err := RetryOperationWithTimeout(func() error { +// _, err := oc.AsAdmin().Run("get").Args("node", "master-0").Output() +// return err +// }, 5*time.Minute, 10*time.Second, "get node master-0") +func RetryOperationWithTimeout(operation func() error, timeout, pollInterval time.Duration, operationName string) error { + startTime := time.Now() + + for time.Since(startTime) < timeout { + err := operation() + if err == nil { + klog.V(2).Infof("Operation %s succeeded after %v", operationName, time.Since(startTime)) + return nil + } + + klog.V(4).Infof("Operation %s failed, retrying in %v: %v", operationName, pollInterval, err) + time.Sleep(pollInterval) + } + + return fmt.Errorf("operation %s failed after %v timeout", operationName, timeout) +} diff --git a/test/extended/two_node/utils/ssh.go b/test/extended/two_node/utils/ssh.go new file mode 100644 index 000000000000..20cb0dd9e7d1 --- /dev/null +++ b/test/extended/two_node/utils/ssh.go @@ -0,0 +1,305 @@ +// Package utils provides SSH utilities for remote command execution in two-node cluster tests. +// +// Supports direct SSH connections (local → hypervisor) and two-hop connections (local → hypervisor → node). +// +// Usage example: +// +// // Prepare known_hosts files +// localKnownHostsPath, err := PrepareLocalKnownHostsFile(hypervisorConfig) +// remoteKnownHostsPath, err := PrepareRemoteKnownHostsFile(remoteNodeIP, hypervisorConfig, localKnownHostsPath) +// +// // Execute commands +// output, stderr, err := ExecuteSSHCommand("virsh list --all", hypervisorConfig, localKnownHostsPath) +// output, stderr, err := ExecuteRemoteSSHCommand(remoteNodeIP, "oc get nodes", hypervisorConfig, localKnownHostsPath, remoteKnownHostsPath) +// +// // Cleanup +// CleanupRemoteKnownHostsFile(hypervisorConfig, localKnownHostsPath, remoteKnownHostsPath) +// CleanupLocalKnownHostsFile(hypervisorConfig, localKnownHostsPath) +package utils + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "strings" + "time" + + "k8s.io/klog/v2" +) + +// SSHConfig contains the configuration needed to establish SSH connections to remote hosts +type SSHConfig struct { + IP string // IP address of the remote host + User string // SSH username for authentication + PrivateKeyPath string // Path to the SSH private key file +} + +// SSH-related constants +const ( + // SSH command patterns + sshStrictHostKeyChecking = "StrictHostKeyChecking=no" + userKnownHostsFile = "UserKnownHostsFile" + sshKeyscanCommand = "ssh-keyscan" + sshConnectivityTest = "echo 'SSH connectivity test successful'" + + // Startup operation timeouts and intervals + vmStartTimeout = 2 * time.Minute // Maximum time to wait for VM startup + vmStartPollInterval = 15 * time.Second // Interval between VM state checks + + // File paths + knownHostsTempPrefix = "known_hosts_" // Prefix for temporary known_hosts files + remoteInfix = "remote_" // Infix for remote known_hosts files +) + +// PrepareLocalKnownHostsFile creates a temporary known_hosts file and scans the SSH host key. +// This prevents "permanently added" warnings that cause SSH commands to fail. 
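+//
+// A typical pairing (sketch) is to create the file once and remove it when finished:
+//
+//    knownHostsPath, err := PrepareLocalKnownHostsFile(sshConfig)
+//    if err != nil {
+//        return err
+//    }
+//    defer CleanupLocalKnownHostsFile(sshConfig, knownHostsPath)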
+// +// Parameters: +// - sshConfig: SSH configuration for the host to scan +// +// Returns: +// - string: Path to the created temporary known_hosts file +// - error: Any error that occurred during file creation or host key scanning +func PrepareLocalKnownHostsFile(sshConfig *SSHConfig) (string, error) { + klog.V(2).Infof("Preparing local known_hosts file for %q", sshConfig.IP) + + // Create a temporary known hosts file + tempFile, err := os.CreateTemp("", knownHostsTempPrefix+"*") + if err != nil { + klog.ErrorS(err, "Failed to create temporary known_hosts file") + return "", err + } + + knownHostsPath := tempFile.Name() + tempFile.Close() + + // Use ssh-keyscan to get the host key and add it to our known hosts file + keyscanCmd := exec.Command(sshKeyscanCommand, "-H", sshConfig.IP) + keyscanOutput, err := keyscanCmd.Output() + if err != nil { + klog.ErrorS(err, "Failed to scan host key", "host", sshConfig.IP) + return "", err + } + + // Write the host key to our known hosts file with secure permissions (0600) + err = os.WriteFile(knownHostsPath, []byte(keyscanOutput), 0600) + if err != nil { + klog.ErrorS(err, "Failed to write known_hosts file") + return "", err + } + + klog.V(2).Infof("Successfully created local known_hosts file: %q", knownHostsPath) + return knownHostsPath, nil +} + +// PrepareRemoteKnownHostsFile creates a known_hosts file on the proxy node for accessing the remote node. +// Used for two-hop SSH connections (local → proxy → remote). +// +// Parameters: +// - remoteNodeIP: IP address of the remote node to scan +// - proxyNodeSSHConfig: SSH configuration for the proxy node (hypervisor) +// - localKnownHostsPath: Path to the local known_hosts file for connecting to the proxy node +// +// Returns: +// - string: Path to the created remote known_hosts file on the proxy node +// - error: Any error that occurred during file creation or host key scanning +func PrepareRemoteKnownHostsFile(remoteNodeIP string, proxyNodeSSHConfig *SSHConfig, localKnownHostsPath string) (string, error) { + klog.V(2).Infof("Preparing remote known_hosts file on proxy node %q for remote node %q", proxyNodeSSHConfig.IP, remoteNodeIP) + + // Create a temporary known hosts file on the proxy node for the remote node + knownHostsPath := fmt.Sprintf("/tmp/%s%s%s", knownHostsTempPrefix, remoteInfix, remoteNodeIP) + + // Use ssh-keyscan on the proxy node to get the remote node's host key and create the file + // Capture stderr for logging instead of suppressing it + keyscanCmd := fmt.Sprintf(`ssh-keyscan -H %s`, remoteNodeIP) + keyscanOutput, stderr, err := ExecuteSSHCommand(keyscanCmd, proxyNodeSSHConfig, localKnownHostsPath) + if err != nil { + klog.ErrorS(err, "Failed to scan host key for remote node", "remoteNode", remoteNodeIP, "stderr", stderr) + return "", err + } + + // Log any warnings from ssh-keyscan + if stderr != "" { + klog.V(4).Infof("ssh-keyscan warnings for %s: %s", remoteNodeIP, stderr) + } + + // Create the known hosts file on the proxy node with secure permissions + createKnownHostsCmd := fmt.Sprintf(`echo '%s' > %s && chmod 600 %s`, strings.TrimSpace(keyscanOutput), knownHostsPath, knownHostsPath) + _, _, err = ExecuteSSHCommand(createKnownHostsCmd, proxyNodeSSHConfig, localKnownHostsPath) + if err != nil { + klog.ErrorS(err, "Failed to create known_hosts file on proxy node") + return "", err + } + + klog.V(2).Infof("Successfully created remote known_hosts file: %q", knownHostsPath) + return knownHostsPath, nil +} + +// ExecuteSSHCommand executes a command on a remote host via SSH. 
+// +// Parameters: +// - command: The command to execute on the remote host +// - sshConfig: SSH configuration for the remote host +// - knownHostsPath: Path to the known_hosts file to use for the connection +// +// Returns: +// - string: Standard output from the command +// - string: Standard error from the command +// - error: Any error that occurred (only non-zero exit codes are treated as errors) +func ExecuteSSHCommand(command string, sshConfig *SSHConfig, knownHostsPath string) (string, string, error) { + // Build the SSH command to run directly on the host + sshArgs := []string{ + "-i", sshConfig.PrivateKeyPath, + "-o", sshStrictHostKeyChecking, + "-o", fmt.Sprintf("%s=%s", userKnownHostsFile, knownHostsPath), + fmt.Sprintf("%s@%s", sshConfig.User, sshConfig.IP), + command, + } + + // Log the SSH command being executed + klog.V(4).Infof("Executing SSH command on %q: ssh %s", sshConfig.IP, strings.Join(sshArgs, " ")) + + // Execute SSH command directly on the host + cmd := exec.Command("ssh", sshArgs...) + + // Capture stdout and stderr separately + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + + // Log the output for debugging (debug level) + if stdout.Len() > 0 { + klog.V(5).Infof("SSH stdout: %q", stdout.String()) + } + if stderr.Len() > 0 { + klog.V(5).Infof("SSH stderr: %q", stderr.String()) + } + + // Only treat non-zero exit codes as errors + // stderr may contain warnings or informational messages that don't indicate failure + if err != nil { + klog.ErrorS(err, "SSH command failed", "host", sshConfig.IP, "stderr", stderr.String()) + return stdout.String(), stderr.String(), fmt.Errorf("SSH command failed: %v, stderr: %q, stdout: %q", err, stderr.String(), stdout.String()) + } + + klog.V(4).Infof("SSH command completed successfully on %q", sshConfig.IP) + return stdout.String(), stderr.String(), nil +} + +// ExecuteRemoteSSHCommand executes a command on an OpenShift node via two-hop SSH (local → hypervisor → node). +// Uses 'core' user for the node connection. 
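+//
+// For example (illustrative node IP), a call against 192.168.111.20 is executed on the
+// hypervisor as a nested SSH invocation of roughly this shape:
+//
+//    ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=<remoteKnownHostsPath> [email protected] '<command>'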
+// +// Parameters: +// - remoteNodeIP: IP address of the remote node to execute the command on +// - command: The command to execute on the remote node +// - sshConfig: SSH configuration for the proxy node (hypervisor) +// - localKnownHostsPath: Path to the local known_hosts file for connecting to the proxy node +// - remoteKnownHostsPath: Path to the remote known_hosts file on the proxy node for connecting to the remote node +// +// Returns: +// - string: Standard output from the command +// - string: Standard error from the command +// - error: Any error that occurred during command execution +func ExecuteRemoteSSHCommand(remoteNodeIP, command string, sshConfig *SSHConfig, localKnownHostsPath, remoteKnownHostsPath string) (string, string, error) { + // Build the nested SSH command that will be executed on the hypervisor to reach the node + // This creates: ssh -i key -o options -o UserKnownHostsFile= core@remoteNodeIP 'command' + nestedSSHCommand := fmt.Sprintf("ssh -o %s -o %s=%s core@%s '%s'", + sshStrictHostKeyChecking, + userKnownHostsFile, + remoteKnownHostsPath, + remoteNodeIP, + strings.ReplaceAll(command, "'", "'\\''"), // Escape single quotes in the command + ) + + // Log the full two-hop SSH command being executed + klog.V(4).Infof("Executing two-hop SSH command to node %q via hypervisor %q", remoteNodeIP, sshConfig.IP) + + // Execute the nested SSH command on the hypervisor (which will SSH to the node) + stdout, stderr, err := ExecuteSSHCommand(nestedSSHCommand, sshConfig, localKnownHostsPath) + if err != nil { + klog.ErrorS(err, "Remote SSH command to node failed", "node", remoteNodeIP, "stderr", stderr, "stdout", stdout) + } else { + klog.V(4).Infof("Successfully executed command on remote node %q", remoteNodeIP) + } + + return stdout, stderr, err +} + +// CleanupRemoteKnownHostsFile removes the temporary known_hosts file from the proxy node. +// Errors are logged but not critical. +// +// Parameters: +// - sshConfig: SSH configuration for the proxy node +// - localKnownHostsPath: Path to the local known_hosts file for connecting to the proxy node +// - remoteKnownHostsPath: Path to the remote known_hosts file on the proxy node to remove +// +// Returns: +// - error: Any error that occurred during cleanup (logged as warning, not critical) +func CleanupRemoteKnownHostsFile(sshConfig *SSHConfig, localKnownHostsPath string, remoteKnownHostsPath string) error { + // Clean up the known hosts file on the proxy node (while we still have connectivity) + if remoteKnownHostsPath == "" { + klog.V(2).Info("No remote known_hosts file to clean up") + return nil + } + + klog.V(2).Infof("Cleaning up remote known_hosts file: %q", remoteKnownHostsPath) + + // Clean up the known hosts file on the proxy node + _, _, err := ExecuteSSHCommand(fmt.Sprintf("rm -f %s", remoteKnownHostsPath), sshConfig, localKnownHostsPath) + if err != nil { + klog.Warning("Failed to clean up remote known_hosts file", "error", err) + return err + } + + klog.V(2).Info("Successfully cleaned up remote known_hosts file") + return nil +} + +// CleanupLocalKnownHostsFile removes the temporary local known hosts file. +// This should be called after completing operations that required the local known_hosts file. +// +// The function performs a non-critical cleanup operation. If the cleanup fails, it logs a warning +// but does not fail the test, as the temporary file will eventually be cleaned up by the system. 
+// +// Parameters: +// - sshConfig: SSH configuration (used for logging context) +// - knownHostsPath: Path to the local known_hosts file to remove +// +// Returns: +// - error: Any error that occurred during cleanup (logged as warning, not critical) +func CleanupLocalKnownHostsFile(sshConfig *SSHConfig, knownHostsPath string) error { + // Clean up the local known hosts file + if knownHostsPath == "" { + klog.V(2).Info("No local known_hosts file to clean up") + return nil + } + + klog.V(2).Infof("Cleaning up local known_hosts file: %q", knownHostsPath) + + err := os.Remove(knownHostsPath) + if err != nil { + klog.Warning("Failed to clean up local known_hosts file", "error", err) + return err + } + + klog.V(2).Info("Successfully cleaned up local known_hosts file") + return nil +} + +// VerifyConnectivity tests SSH connectivity to a remote host by executing a simple echo command. +// This is useful for verifying that SSH is properly configured before attempting more complex operations. +// +// Parameters: +// - sshConfig: SSH configuration for the host to test connectivity to +// - knownHostsPath: Path to the known_hosts file to use for the connection +// +// Returns: +// - string: Standard output from the connectivity test command +// - string: Standard error from the connectivity test command +// - error: Any error that occurred during the connectivity test +func VerifyConnectivity(sshConfig *SSHConfig, knownHostsPath string) (string, string, error) { + return ExecuteSSHCommand(sshConnectivityTest, sshConfig, knownHostsPath) +} diff --git a/test/extended/two_node/utils/virsh.go b/test/extended/two_node/utils/virsh.go new file mode 100644 index 000000000000..f250aa2e87db --- /dev/null +++ b/test/extended/two_node/utils/virsh.go @@ -0,0 +1,676 @@ +// Package utils provides libvirt/virsh utilities for managing virtual machines in two-node cluster testing. +// +// This package enables VM lifecycle management, inspection, and configuration through the virsh +// command-line tool. It supports operations on remote hypervisors via SSH, making it suitable +// for test environments where VMs are managed on a separate hypervisor host. +// +// Key Features: +// - VM lifecycle operations (define, start, stop, destroy, autostart) +// - VM inspection (list VMs, get UUID, dump XML configuration) +// - XML parsing for extracting network configuration (MAC addresses, bridges) +// - VM discovery by MAC address correlation +// - VM recreation from saved XML configurations +// - Wait utilities for VM state transitions +// +// Error Handling: +// +// All functions return errors instead of using assertions. Virsh command failures, +// XML parsing errors, and timeout conditions are returned as errors for the calling +// code to handle appropriately. +// +// Common Usage Patterns: +// +// 1. Listing and Inspecting VMs: +// +// vms, err := VirshListAllVMs(sshConfig, knownHostsPath) +// uuid, err := VirshGetVMUUID("master-0", sshConfig, knownHostsPath) +// xml, err := VirshDumpXML("master-0", sshConfig, knownHostsPath) +// +// 2. VM Lifecycle Management: +// +// err := VirshStartVM("master-0", sshConfig, knownHostsPath) +// err := WaitForVMToStart("master-0", sshConfig, knownHostsPath) +// err := VirshDestroyVM("master-0", sshConfig, knownHostsPath) +// err := VirshUndefineVM("master-0", sshConfig, knownHostsPath) +// +// 3. 
VM Network Configuration: +// +// mac, err := ExtractMACAddressFromXML(xmlContent, "ostestbm") +// vmName, err := GetVMNameByMACMatch("master-0", "52:54:00:12:34:56", "ostestpr", sshConfig, knownHostsPath) +// uuid, mac, err := GetVMNetworkInfo("master-0", "ostestpr", sshConfig, knownHostsPath) +// +// 4. VM Recovery Operations: +// +// err := RecreateVMFromXML("master-0", xmlContent, sshConfig, knownHostsPath) +// +// All virsh commands are executed on a remote hypervisor via SSH. The functions in this package +// wrap the low-level SSH utilities from this package to provide a higher-level API for VM management. +// +// Retry Utilities: +// +// Some operations like WaitForVMToStart use the RetryOperationWithTimeout utility from the +// pacemaker utilities package to handle transient failures and wait for state transitions. +// +// XML Parsing: +// +// The package includes structures for parsing libvirt domain XML, focusing on network configuration. +// The Domain, Devices, Interface, MAC, and Source types map to libvirt XML elements and enable +// programmatic extraction of VM configuration details. +package utils + +import ( + "encoding/xml" + "fmt" + "strings" + + "k8s.io/klog/v2" +) + +// Domain represents a libvirt domain (virtual machine) configuration +// It maps to the root element in libvirt XML +type Domain struct { + XMLName xml.Name `xml:"domain"` + Name string `xml:"name"` + UUID string `xml:"uuid"` + Devices Devices `xml:"devices"` +} + +// Devices contains the hardware devices attached to a VM +type Devices struct { + Interfaces []Interface `xml:"interface"` +} + +// Interface represents a network interface configuration in libvirt XML +type Interface struct { + Type string `xml:"type,attr"` + MAC MAC `xml:"mac"` + Source Source `xml:"source"` +} + +// MAC contains the MAC address of a network interface +type MAC struct { + Address string `xml:"address,attr"` +} + +// Source specifies the network source (bridge, network, etc) for an interface +type Source struct { + Bridge string `xml:"bridge,attr"` +} + +// Constants for virsh commands +const ( + virshCommand = "virsh" + virshListAllName = "list --all --name" + virshConnectionOption = "-c qemu:///system" +) + +// VerifyVirsh checks if virsh is available and working on the target host +// by executing 'virsh version' command +// +// Parameters: +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - string: The virsh version output +// - error: Any error that occurred during the check +func VerifyVirsh(sshConfig *SSHConfig, knownHostsPath string) (string, error) { + klog.V(4).Infof("VerifyVirsh: Checking virsh availability on %s", sshConfig.IP) + output, err := VirshCommand("version", sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VerifyVirsh failed", "host", sshConfig.IP) + } else { + klog.V(2).Infof("VerifyVirsh: Success - %s", output) + } + return output, err +} + +// VirshCommand executes a virsh command on the remote hypervisor via SSH +// +// Parameters: +// - command: The virsh command to execute (without 'virsh' prefix) +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - string: The command output +// - error: Any error that occurred during execution +func VirshCommand(command string, sshConfig *SSHConfig, knownHostsPath string) (string, error) { + fullCommand := fmt.Sprintf("%s %s %s", virshCommand, 
virshConnectionOption, command) + klog.V(4).Infof("VirshCommand: Executing '%s' on %s", fullCommand, sshConfig.IP) + output, _, err := ExecuteSSHCommand(fullCommand, sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshCommand failed", "command", fullCommand, "host", sshConfig.IP) + } else { + klog.V(4).Infof("VirshCommand: Success - output length: %d bytes", len(output)) + } + return output, err +} + +// VirshDumpXML retrieves the XML configuration of a VM +// +// Parameters: +// - vmName: Name of the VM to dump XML for +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - string: The VM's XML configuration +// - error: Any error that occurred during retrieval +func VirshDumpXML(vmName string, sshConfig *SSHConfig, knownHostsPath string) (string, error) { + klog.V(4).Infof("VirshDumpXML: Getting XML for VM '%s'", vmName) + output, err := VirshCommand(fmt.Sprintf("dumpxml %s", vmName), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshDumpXML failed", "vm", vmName) + } else { + klog.V(4).Infof("VirshDumpXML: Success for VM '%s' - XML length: %d bytes", vmName, len(output)) + } + return output, err +} + +// VirshListAllVMs lists all VMs (running and stopped) on the hypervisor +// +// Parameters: +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - string: Newline-separated list of VM names +// - error: Any error that occurred during listing +func VirshListAllVMs(sshConfig *SSHConfig, knownHostsPath string) (string, error) { + klog.V(4).Infof("VirshListAllVMs: Listing all VMs on %s", sshConfig.IP) + output, err := VirshCommand(virshListAllName, sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshListAllVMs failed", "host", sshConfig.IP) + } else { + vmCount := len(strings.Fields(output)) + klog.V(2).Infof("VirshListAllVMs: Found %d VMs", vmCount) + } + return output, err +} + +// VirshVMExists checks if a VM with the given name exists on the hypervisor +// +// Parameters: +// - vmName: Name of the VM to check +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - string: Command output (empty if VM doesn't exist) +// - error: Error if VM doesn't exist or command fails +func VirshVMExists(vmName string, sshConfig *SSHConfig, knownHostsPath string) (string, error) { + klog.V(4).Infof("VirshVMExists: Checking if VM '%s' exists", vmName) + output, err := VirshCommand(fmt.Sprintf("%s | grep -q %s", virshListAllName, vmName), sshConfig, knownHostsPath) + if err != nil { + klog.V(4).Infof("VirshVMExists: VM '%s' does not exist or grep failed - %v", vmName, err) + } else { + klog.V(2).Infof("VirshVMExists: VM '%s' exists", vmName) + } + return output, err +} + +// VirshGetVMUUID retrieves the UUID of a VM +// +// Parameters: +// - vmName: Name of the VM to get UUID for +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - string: The VM's UUID (trimmed of whitespace) +// - error: Any error that occurred during retrieval +func VirshGetVMUUID(vmName string, sshConfig *SSHConfig, knownHostsPath string) (string, error) { + klog.V(4).Infof("VirshGetVMUUID: Getting UUID for VM '%s'", vmName) + output, err := VirshCommand(fmt.Sprintf("domuuid %s", vmName), sshConfig, 
knownHostsPath) + uuid := strings.TrimSpace(output) + if err != nil { + klog.ErrorS(err, "VirshGetVMUUID failed", "vm", vmName) + } else { + klog.V(2).Infof("VirshGetVMUUID: VM '%s' has UUID: %s", vmName, uuid) + } + return uuid, err +} + +// VirshUndefineVM undefines (removes the configuration of) a VM +// Note: This does not delete the VM's disk images, only the libvirt configuration +// +// Parameters: +// - vmName: Name of the VM to undefine +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - error: Any error that occurred during undefine operation +func VirshUndefineVM(vmName string, sshConfig *SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("VirshUndefineVM: Undefining VM '%s' (including NVRAM)", vmName) + _, err := VirshCommand(fmt.Sprintf("undefine %s --nvram", vmName), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshUndefineVM failed", "vm", vmName) + } else { + klog.V(2).Infof("VirshUndefineVM: Successfully undefined VM '%s'", vmName) + } + return err +} + +// VirshDestroyVM forcefully stops (destroys) a running VM +// This is equivalent to pulling the power plug on a physical machine +// +// Parameters: +// - vmName: Name of the VM to destroy +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - error: Any error that occurred during destroy operation +func VirshDestroyVM(vmName string, sshConfig *SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("VirshDestroyVM: Forcefully stopping VM '%s'", vmName) + _, err := VirshCommand(fmt.Sprintf("destroy %s", vmName), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshDestroyVM failed", "vm", vmName) + } else { + klog.V(2).Infof("VirshDestroyVM: Successfully destroyed VM '%s'", vmName) + } + return err +} + +// VirshDefineVM defines (registers) a new VM from an XML configuration file +// +// Parameters: +// - xmlFilePath: Path to the XML file on the hypervisor containing VM configuration +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - error: Any error that occurred during define operation +func VirshDefineVM(xmlFilePath string, sshConfig *SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("VirshDefineVM: Defining VM from XML file '%s'", xmlFilePath) + _, err := VirshCommand(fmt.Sprintf("define %s", xmlFilePath), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshDefineVM failed", "xmlFile", xmlFilePath) + } else { + klog.V(2).Infof("VirshDefineVM: Successfully defined VM from '%s'", xmlFilePath) + } + return err +} + +// VirshStartVM starts a defined VM +// +// Parameters: +// - vmName: Name of the VM to start +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - error: Any error that occurred during start operation +func VirshStartVM(vmName string, sshConfig *SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("VirshStartVM: Starting VM '%s'", vmName) + _, err := VirshCommand(fmt.Sprintf("start %s", vmName), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshStartVM failed", "vm", vmName) + } else { + klog.V(2).Infof("VirshStartVM: Successfully started VM '%s'", vmName) + } + return err +} + +// VirshAutostartVM enables autostart for a 
VM (starts automatically on hypervisor boot) +// +// Parameters: +// - vmName: Name of the VM to enable autostart for +// - sshConfig: SSH configuration for connecting to the hypervisor +// - knownHostsPath: Path to the known_hosts file for SSH +// +// Returns: +// - error: Any error that occurred during autostart enable operation +func VirshAutostartVM(vmName string, sshConfig *SSHConfig, knownHostsPath string) error { + klog.V(2).Infof("VirshAutostartVM: Enabling autostart for VM '%s'", vmName) + _, err := VirshCommand(fmt.Sprintf("autostart %s", vmName), sshConfig, knownHostsPath) + if err != nil { + klog.ErrorS(err, "VirshAutostartVM failed", "vm", vmName) + } else { + klog.V(2).Infof("VirshAutostartVM: Successfully enabled autostart for VM '%s'", vmName) + } + return err +} + +// ExtractIPFromVMXML attempts to extract the IP address for a VM from its XML configuration +// Note: This typically does not work as IP addresses are usually assigned dynamically by DHCP +// and are not stored in the domain XML. This function is kept for reference but may need +// to be replaced with a different IP discovery mechanism (e.g., checking DHCP leases). +// +// Parameters: +// - xmlContent: The VM's XML configuration as a string +// - networkName: The name of the network bridge to find the interface for +// +// Returns: +// - string: The IP address (typically empty as IPs aren't stored in XML) +// - error: Error indicating IP addresses are not in domain XML or parsing failed +func ExtractIPFromVMXML(xmlContent, networkName string) (string, error) { + klog.V(4).Infof("ExtractIPFromVMXML: Attempting to extract IP for network '%s'", networkName) + + var domain Domain + err := xml.Unmarshal([]byte(xmlContent), &domain) + if err != nil { + klog.ErrorS(err, "ExtractIPFromVMXML failed to parse domain XML") + return "", fmt.Errorf("failed to parse domain XML: %v", err) + } + + klog.V(4).Infof("ExtractIPFromVMXML: Parsed domain '%s', checking %d interfaces", domain.Name, len(domain.Devices.Interfaces)) + + // Look for the interface with the specified network + for _, iface := range domain.Devices.Interfaces { + klog.V(4).Infof("ExtractIPFromVMXML: Checking interface with bridge '%s'", iface.Source.Bridge) + if iface.Source.Bridge == networkName { + // Note: IP addresses are typically not stored in the domain XML + // They are assigned dynamically by DHCP. This function might need + // to be updated to get IP from a different source. + klog.Warningf("Found interface for network '%s', but IPs are not in domain XML", networkName) + klog.V(2).Infof("Found interface for network %s, but IP addresses are not stored in domain XML", networkName) + return "", fmt.Errorf("interface found for network %s, but IP addresses are not stored in domain XML", networkName) + } + } + + klog.Warningf("No interface found for network '%s'", networkName) + return "", fmt.Errorf("no interface found for network %s", networkName) +} + +// ExtractMACAddressFromXML extracts the MAC address for a specific network bridge from VM XML. +// This parses the libvirt domain XML to find the network interface attached to the specified +// bridge and returns its MAC address. 
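+//
+// For instance, given an interface element like the following (sample values taken from
+// the package examples above), ExtractMACAddressFromXML(xmlContent, "ostestbm") would
+// return "52:54:00:12:34:56":
+//
+//    <interface type='bridge'>
+//      <mac address='52:54:00:12:34:56'/>
+//      <source bridge='ostestbm'/>
+//    </interface>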
+//
+// The function is commonly used to:
+// - Correlate VMs with OpenShift nodes by matching MAC addresses
+// - Retrieve network configuration for node replacement operations
+// - Discover VM network topology
+//
+// Parameters:
+// - xmlContent: The VM's XML configuration as a string (from virsh dumpxml)
+// - networkBridge: The name of the network bridge to find the MAC address for (e.g., "ostestbm", "ostestpr")
+//
+// Returns:
+// - string: The MAC address in standard format (e.g., "52:54:00:12:34:56")
+// - error: Error if parsing fails or no interface is found on the specified bridge
+func ExtractMACAddressFromXML(xmlContent string, networkBridge string) (string, error) {
+	klog.V(4).Infof("ExtractMACAddressFromXML: Extracting MAC for bridge '%s'", networkBridge)
+
+	var domain Domain
+	err := xml.Unmarshal([]byte(xmlContent), &domain)
+	if err != nil {
+		klog.ErrorS(err, "ExtractMACAddressFromXML failed to parse domain XML")
+		return "", fmt.Errorf("failed to parse domain XML: %v", err)
+	}
+
+	klog.V(4).Infof("ExtractMACAddressFromXML: Parsed domain '%s', checking %d interfaces", domain.Name, len(domain.Devices.Interfaces))
+
+	// Look for the interface attached to the specified network bridge
+	for _, iface := range domain.Devices.Interfaces {
+		klog.V(4).Infof("ExtractMACAddressFromXML: Checking interface with bridge '%s', MAC '%s'", iface.Source.Bridge, iface.MAC.Address)
+		if iface.Source.Bridge == networkBridge {
+			klog.V(2).Infof("ExtractMACAddressFromXML: Found MAC address '%s' for bridge '%s'", iface.MAC.Address, networkBridge)
+			klog.V(2).Infof("Found %s interface with MAC: %s", networkBridge, iface.MAC.Address)
+			return iface.MAC.Address, nil
+		}
+	}
+
+	klog.ErrorS(nil, "ExtractMACAddressFromXML: No interface found for bridge", "bridge", networkBridge)
+	return "", fmt.Errorf("no %s interface found in domain XML", networkBridge)
+}
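+
+// Example (illustrative sketch): the XML is typically obtained from VirshDumpXML; the
+// VM name and bridge below are placeholders, and sshCfg/knownHosts are assumed to be
+// prepared by the caller:
+//
+//	xmlOut, err := VirshDumpXML("ostest-master-1", sshCfg, knownHosts)
+//	if err != nil {
+//		return err
+//	}
+//	mac, err := ExtractMACAddressFromXML(xmlOut, "ostestbm")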
+
+// GetVMNameByMACMatch finds the VM name that has a specific MAC address on a given network bridge.
+// This is used to correlate OpenShift nodes (identified by MAC address) with their underlying VMs.
+//
+// The function performs an exhaustive search by:
+// 1. Listing all VMs on the hypervisor (both running and stopped)
+// 2. For each VM, retrieving its XML configuration via virsh dumpxml
+// 3. Parsing the XML to extract MAC addresses for interfaces on the specified bridge
+// 4. Comparing the extracted MAC with the target MAC address
+// 5. Returning the VM name when a match is found
+//
+// This is useful in node replacement scenarios where you need to find which VM corresponds
+// to a specific OpenShift node based on its BareMetalHost MAC address.
+//
+// Parameters:
+// - nodeName: Name of the OpenShift node (used for logging and error messages)
+// - nodeMAC: The MAC address to search for (in format "52:54:00:xx:xx:xx")
+// - networkBridge: The network bridge name to check (e.g., "ostestpr" for provisioning network)
+// - sshConfig: SSH configuration for connecting to the hypervisor
+// - knownHostsPath: Path to the known_hosts file for SSH
+//
+// Returns:
+// - string: The name of the matching VM
+// - error: Error if no VM found with the specified MAC or if any operation fails
+func GetVMNameByMACMatch(nodeName, nodeMAC string, networkBridge string, sshConfig *SSHConfig, knownHostsPath string) (string, error) {
+	klog.V(4).Infof("GetVMNameByMACMatch: Searching for VM with MAC '%s' on bridge '%s' (node: %s)", nodeMAC, networkBridge, nodeName)
+
+	// Get list of all VMs using SSH to hypervisor
+	vmListOutput, err := VirshListAllVMs(sshConfig, knownHostsPath)
+	klog.V(4).Infof("VirshListAllVMs output: %s", vmListOutput)
+	if err != nil {
+		klog.ErrorS(err, "GetVMNameByMACMatch failed to get VM list")
+		return "", fmt.Errorf("failed to get VM list: %v", err)
+	}
+
+	vmNames := strings.Fields(vmListOutput)
+	klog.V(4).Infof("GetVMNameByMACMatch: Found %d VMs to check: %v", len(vmNames), vmNames)
+	klog.V(2).Infof("Found VMs: %v", vmNames)
+
+	// Check each VM to find the one with matching MAC address
+	for i, vmName := range vmNames {
+		if vmName == "" {
+			klog.V(4).Infof("GetVMNameByMACMatch: Skipping empty VM name at index %d", i)
+			continue
+		}
+
+		klog.V(4).Infof("GetVMNameByMACMatch: Checking VM %d/%d: '%s'", i+1, len(vmNames), vmName)
+
+		// Get VM XML configuration using SSH to hypervisor
+		vmXML, err := VirshDumpXML(vmName, sshConfig, knownHostsPath)
+		klog.V(4).Infof("Getting XML for VM: %s", vmName)
+		if err != nil {
+			klog.Warningf("Could not get XML for VM '%s', skipping - %v", vmName, err)
+			continue
+		}
+
+		// Extract the MAC address from the VM XML for the requested bridge
+		vmMAC, err := ExtractMACAddressFromXML(vmXML, networkBridge)
+		if err != nil {
+			klog.Warningf("Could not extract MAC from VM '%s', skipping - %v", vmName, err)
+			continue
+		}
+
+		klog.V(4).Infof("GetVMNameByMACMatch: VM '%s' has MAC '%s'", vmName, vmMAC)
+		klog.V(2).Infof("VM %s has MAC %s", vmName, vmMAC)
+		klog.V(4).Infof("Comparing VM MAC %s with target MAC %s", vmMAC, nodeMAC)
+
+		// Check if this VM's MAC matches the node's MAC
+		if vmMAC == nodeMAC {
+			klog.V(2).Infof("GetVMNameByMACMatch: Found matching VM '%s' with MAC '%s'", vmName, vmMAC)
+			klog.V(2).Infof("Found matching VM: %s (MAC: %s)", vmName, vmMAC)
+			return vmName, nil
+		}
+	}
+
+	klog.ErrorS(nil, "GetVMNameByMACMatch: No VM found with MAC", "mac", nodeMAC, "node", nodeName)
+	return "", fmt.Errorf("no VM found with MAC address %s for node %s", nodeMAC, nodeName)
+}
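+
+// Example (illustrative sketch): correlating a BareMetalHost's boot MAC with its VM;
+// the node name, MAC, and bridge are placeholders, and sshCfg/knownHosts are assumed
+// to come from the hypervisor SSH configuration:
+//
+//	vmName, err := GetVMNameByMACMatch("master-1", "52:54:00:aa:bb:cc", "ostestbm", sshCfg, knownHosts)
+//	if err != nil {
+//		// no VM on the hypervisor owns that MAC
+//	}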
+
+// GetVMNetworkInfo retrieves the UUID and MAC address for a VM's network interface
+//
+// Parameters:
+// - vmName: Name of the VM to get network info for
+// - networkBridge: The network bridge name to extract MAC address from
+// - sshConfig: SSH configuration for connecting to the hypervisor
+// - knownHostsPath: Path to the known_hosts file for SSH
+//
+// Returns:
+// - string: The VM's UUID
+// - string: The MAC address for the specified network bridge
+// - error: Any error that occurred during retrieval
+func GetVMNetworkInfo(vmName string, networkBridge string, sshConfig *SSHConfig, knownHostsPath string) (string, string, error) {
+	klog.V(4).Infof("GetVMNetworkInfo: Getting network info for VM '%s' on bridge '%s'", vmName, networkBridge)
+
+	newUUID, err := VirshGetVMUUID(vmName, sshConfig, knownHostsPath)
+	if err != nil {
+		klog.ErrorS(err, "GetVMNetworkInfo failed to get UUID", "vm", vmName)
+		return "", "", fmt.Errorf("failed to get VM UUID: %v", err)
+	}
+
+	newXMLOutput, err := VirshDumpXML(vmName, sshConfig, knownHostsPath)
+	if err != nil {
+		klog.ErrorS(err, "GetVMNetworkInfo failed to get XML", "vm", vmName)
+		return "", "", fmt.Errorf("failed to get VM XML: %v", err)
+	}
+
+	newMACAddress, err := ExtractMACAddressFromXML(newXMLOutput, networkBridge)
+	if err != nil {
+		klog.ErrorS(err, "GetVMNetworkInfo failed to extract MAC", "vm", vmName)
+		return "", "", fmt.Errorf("failed to find MAC address in VM XML: %v", err)
+	}
+
+	klog.V(2).Infof("GetVMNetworkInfo: Successfully retrieved info for VM '%s': UUID=%s, MAC=%s", vmName, newUUID, newMACAddress)
+	return newUUID, newMACAddress, nil
+}
+
+// RecreateVMFromXML recreates a VM from its XML configuration.
+// This is typically used during node replacement or disaster recovery scenarios.
+//
+// The function performs the following steps:
+// 1. Validates VM name to prevent command injection
+// 2. Checks if the VM already exists (skips recreation if it does)
+// 3. Creates a temporary XML file on the hypervisor (/tmp/<vmName>.xml)
+// 4. Defines the VM in libvirt using the XML configuration
+// 5. Starts the VM
+// 6. Enables autostart so the VM starts automatically on hypervisor boot
+// 7. Cleans up the temporary XML file
+//
+// Security: The VM name is validated to prevent shell command injection attacks.
+//
+// Parameters:
+// - vmName: Name of the VM to recreate (must not contain shell metacharacters)
+// - xmlContent: The complete libvirt XML configuration for the VM
+// - sshConfig: SSH configuration for connecting to the hypervisor
+// - knownHostsPath: Path to the known_hosts file for SSH
+//
+// Returns:
+// - error: Any error that occurred during recreation (nil if VM already exists)
+func RecreateVMFromXML(vmName, xmlContent string, sshConfig *SSHConfig, knownHostsPath string) error {
+	klog.V(2).Infof("RecreateVMFromXML: Starting recreation of VM '%s'", vmName)
+
+	// Validate VM name to prevent command injection
+	if strings.ContainsAny(vmName, ";&|$`\\\"'<>()[]{}!*?~") {
+		klog.ErrorS(nil, "RecreateVMFromXML: Invalid VM name contains shell metacharacters", "vmName", vmName)
+		return fmt.Errorf("invalid VM name contains shell metacharacters: %s", vmName)
+	}
+
+	// Check if VM already exists using the dedicated function
+	_, err := VirshVMExists(vmName, sshConfig, knownHostsPath)
+	if err == nil {
+		klog.V(2).Infof("RecreateVMFromXML: VM '%s' already exists, skipping recreation", vmName)
+		klog.V(2).Infof("VM %s already exists, skipping recreation", vmName)
+		return nil
+	}
+	klog.V(4).Infof("RecreateVMFromXML: VM '%s' does not exist, proceeding with recreation", vmName)
+
+	// Create a temporary file on the hypervisor with the XML content
+	createXMLCommand := fmt.Sprintf(`cat > /tmp/%s.xml <<'XML_EOF'
+%s
+XML_EOF`, vmName, xmlContent)
+
+	klog.V(4).Infof("RecreateVMFromXML: Creating temporary XML file /tmp/%s.xml", vmName)
+	_, _, err = ExecuteSSHCommand(createXMLCommand, sshConfig, knownHostsPath)
+	if err != nil {
+		klog.ErrorS(err, "RecreateVMFromXML failed to create XML file")
+		return fmt.Errorf("failed to create XML file on hypervisor: %v", err)
+	}
+
+	// Redefine the VM using the backed up XML (using helper function)
+	klog.V(4).Infof("RecreateVMFromXML: Defining VM '%s' from XML", vmName)
+	err = VirshDefineVM(fmt.Sprintf("/tmp/%s.xml", vmName), sshConfig, knownHostsPath)
+	if err != nil {
+		klog.ErrorS(err, "RecreateVMFromXML failed to define VM")
+		return fmt.Errorf("failed to define VM: %v", err)
+	}
+
+	// Start the VM (using helper function)
+	klog.V(4).Infof("RecreateVMFromXML: Starting VM '%s'", vmName)
+	err = VirshStartVM(vmName, sshConfig, knownHostsPath)
+	if err != nil {
+		klog.ErrorS(err, "RecreateVMFromXML failed to start VM")
+		return fmt.Errorf("failed to start VM: %v", err)
+	}
+
+	// Enable autostart (using helper function)
+	klog.V(4).Infof("RecreateVMFromXML: Enabling autostart for VM '%s'", vmName)
+	err = VirshAutostartVM(vmName, sshConfig, knownHostsPath)
+	if err != nil {
+		klog.Warningf("Failed to enable autostart (non-fatal) - %v", err)
+	}
+
+	// Clean up temporary XML file
+	klog.V(4).Infof("RecreateVMFromXML: Cleaning up temporary XML file /tmp/%s.xml", vmName)
+	_, _, err = ExecuteSSHCommand(fmt.Sprintf("rm -f /tmp/%s.xml", vmName), sshConfig, knownHostsPath)
+	if err != nil {
+		klog.Warningf("Failed to clean up XML file (non-fatal) - %v", err)
+	}
+
+	klog.V(2).Infof("RecreateVMFromXML: Successfully recreated VM '%s'", vmName)
+	klog.V(2).Infof("Recreated VM: %s", vmName)
+	return nil
+}
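+
+// Example (illustrative sketch): restoring a VM from previously captured XML and
+// blocking until libvirt reports it running; backedUpXML is assumed to hold the
+// output of an earlier VirshDumpXML call:
+//
+//	if err := RecreateVMFromXML(vmName, backedUpXML, sshCfg, knownHosts); err != nil {
+//		return err
+//	}
+//	if err := WaitForVMToStart(vmName, sshCfg, knownHosts); err != nil {
+//		return err
+//	}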
+
+// WaitForVMToStart waits for a VM to reach running state with retry logic.
+// This polls the VM state periodically until it reports as "running" or the timeout is exceeded.
+//
+// The function performs two checks:
+// 1. Verifies the VM exists in the virsh VM list
+// 2. Checks that the VM's domain state is "running" (not just defined or paused)
+//
+// Parameters:
+// - vmName: Name of the VM to wait for
+// - sshConfig: SSH configuration for connecting to the hypervisor
+// - knownHostsPath: Path to the known_hosts file for SSH
+//
+// Returns:
+// - error: Error if VM doesn't start within timeout period (vmStartTimeout) or if any operation fails
+func WaitForVMToStart(vmName string, sshConfig *SSHConfig, knownHostsPath string) error {
+	klog.V(2).Infof("WaitForVMToStart: Starting wait for VM '%s' to reach running state", vmName)
+	klog.V(2).Infof("Waiting for VM %s to start...", vmName)
+
+	err := RetryOperationWithTimeout(func() error {
+		klog.V(4).Infof("WaitForVMToStart: Checking if VM '%s' is running (retry iteration)", vmName)
+
+		// Check if VM exists using VirshVMExists helper
+		_, err := VirshVMExists(vmName, sshConfig, knownHostsPath)
+		if err != nil {
+			klog.V(4).Infof("WaitForVMToStart: VM '%s' not found in VM list - %v", vmName, err)
+			return fmt.Errorf("VM %s not yet running: %v", vmName, err)
+		}
+
+		// Check if VM is actually running (not just defined)
+		statusOutput, err := VirshCommand(fmt.Sprintf("domstate %s", vmName), sshConfig, knownHostsPath)
+		if err != nil {
+			klog.ErrorS(err, "WaitForVMToStart failed to check VM state", "vm", vmName)
+			return fmt.Errorf("failed to check VM %s state: %v", vmName, err)
+		}
+
+		statusOutput = strings.TrimSpace(statusOutput)
+		klog.V(4).Infof("WaitForVMToStart: VM '%s' current state: %s", vmName, statusOutput)
+
+		if !strings.Contains(statusOutput, "running") {
+			return fmt.Errorf("VM %s is not running, current state: %s", vmName, statusOutput)
+		}
+
+		klog.V(2).Infof("WaitForVMToStart: VM '%s' is confirmed running", vmName)
+		klog.V(2).Infof("VM %s is now running", vmName)
+		return nil
+	}, vmStartTimeout, vmStartPollInterval, fmt.Sprintf("VM %s startup", vmName))
+
+	if err != nil {
+		klog.ErrorS(err, "WaitForVMToStart timeout or error", "vm", vmName)
+	} else {
+		klog.V(2).Infof("WaitForVMToStart: Successfully confirmed VM '%s' is running", vmName)
+	}
+
+	return err
+}
diff --git a/test/extended/util/test_setup.go b/test/extended/util/test_setup.go
index 4a64234625f2..be05160c7dbf 100644
--- a/test/extended/util/test_setup.go
+++ b/test/extended/util/test_setup.go
@@ -2,6 +2,7 @@ package util
 
 import (
 	"context"
+	"encoding/json"
 	"flag"
 	"fmt"
 	"os"
@@ -56,6 +57,36 @@ func InitStandardFlags() {
 func InitTest(dryRun bool) error {
 	InitDefaultEnvironmentVariables()
 
+	// Set hypervisor configuration in TestContext if available
+	hypervisorConfigJSON := os.Getenv("HYPERVISOR_CONFIG")
+	if hypervisorConfigJSON != "" {
+		// Parse and validate hypervisor configuration
+		var hypervisorConfig struct {
+			HypervisorIP   string `json:"hypervisorIP"`
+			SSHUser        string `json:"sshUser"`
+			PrivateKeyPath string `json:"privateKeyPath"`
+		}
+		if err := json.Unmarshal([]byte(hypervisorConfigJSON), &hypervisorConfig); err != nil {
+			return fmt.Errorf("failed to parse hypervisor configuration JSON: %v", err)
+		}
+
+		// Validate required fields
+		if hypervisorConfig.HypervisorIP == "" {
+			return fmt.Errorf("hypervisorIP is required in hypervisor configuration")
+		}
+		if hypervisorConfig.SSHUser == "" {
+			return fmt.Errorf("sshUser is required in hypervisor configuration")
+		}
+		if hypervisorConfig.PrivateKeyPath == "" {
+			return fmt.Errorf("privateKeyPath is required in hypervisor configuration")
+		}
+
+		// Store the hypervisor configuration in TestContext for tests to access
+		// We'll use the existing CloudConfig.ConfigFile field to store the JSON
+		// This is a workaround since we can't extend TestContextType directly
+		TestContext.CloudConfig.ConfigFile = hypervisorConfigJSON
+	}
+
 	TestContext.DeleteNamespace = os.Getenv("DELETE_NAMESPACE") != "false"
 	TestContext.VerifyServiceAccount = true
 	testfiles.AddFileSource(e2etestingmanifests.GetE2ETestingManifestsFS())
@@ -228,3 +259,31 @@ func addRoleToE2EServiceAccounts(rbacClient rbacv1client.RbacV1Interface, namesp
 		FatalErr(err)
 	}
 }
+
+// GetHypervisorConfig returns the hypervisor configuration if available
+func GetHypervisorConfig() *struct {
+	HypervisorIP   string `json:"hypervisorIP"`
+	SSHUser        string `json:"sshUser"`
+	PrivateKeyPath string `json:"privateKeyPath"`
+} {
+	hypervisorConfigJSON := TestContext.CloudConfig.ConfigFile
+	if hypervisorConfigJSON == "" {
+		return nil
+	}
+
+	var hypervisorConfig struct {
+		HypervisorIP   string `json:"hypervisorIP"`
+		SSHUser        string `json:"sshUser"`
+		PrivateKeyPath string `json:"privateKeyPath"`
+	}
+	if err := json.Unmarshal([]byte(hypervisorConfigJSON), &hypervisorConfig); err != nil {
+		return nil
+	}
+
+	return &hypervisorConfig
+}
+
+// HasHypervisorConfig returns true if hypervisor configuration is available
+func HasHypervisorConfig() bool {
+	return TestContext.CloudConfig.ConfigFile != ""
+}
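+
+// Example (illustrative sketch): a two_node recovery test could gate on the hypervisor
+// configuration and read it back like this, assuming this package is imported as exutil
+// and g is the ginkgo package alias:
+//
+//	if !exutil.HasHypervisorConfig() {
+//		g.Skip("hypervisor SSH configuration not provided")
+//	}
+//	cfg := exutil.GetHypervisorConfig()
+//	// cfg.HypervisorIP, cfg.SSHUser, and cfg.PrivateKeyPath feed the SSH setup used by
+//	// the virsh helpers in test/extended/two_node/utils.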