
Commit f32ae4c

Add initial version of etcd kill and recovery test
1 parent: f00cea8


1 file changed: +41 -0 lines changed


test/extended/two_node/tnf_recovery.go

Lines changed: 41 additions & 0 deletions
@@ -169,6 +169,47 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			memberPromotedVotingTimeout, pollInterval)
 
 	})
+
+	g.It("should recover from etcd process crash", func() {
+		// Note: This test kills the etcd process/container on one node to simulate
+		// a process crash, testing Pacemaker's ability to detect and restart etcd
+		survivedNode := peerNode
+		g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
+			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)
+
+		g.By(fmt.Sprintf("Killing etcd process/container on %s", targetNode.Name))
+		// Try multiple methods to kill etcd - container kill, process kill, or service stop
+		_, err := util.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
+			"bash", "-c", "podman kill etcd 2>/dev/null || pkill -9 etcd 2>/dev/null || systemctl stop etcd 2>/dev/null || true")
+		o.Expect(err).To(o.BeNil(), "Expected to kill etcd process without command errors")
+
+		g.By("Waiting for Pacemaker to detect etcd failure and begin recovery")
+		// Give Pacemaker time to detect the failure and start recovery
+		time.Sleep(30 * time.Second)
+
+		g.By(fmt.Sprintf("Ensuring %s becomes leader and %s rejoins as learner", peerNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s is promoted back to voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
+
+		g.By("Ensuring etcd cluster operator is healthy after recovery")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+	})
 })
 
 func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
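
For readers reproducing the recovery checks by hand, here is a minimal, hypothetical sketch (not part of this commit) of the kind of member-state inspection that validateEtcdRecoveryState is expected to perform: list the etcd members and report, for each one, whether it has started and whether it is still a learner. The endpoint, the TLS-free client config, and the file name check_members.go are illustrative assumptions; the real helper lives elsewhere in tnf_recovery.go and uses the test's etcdClientFactory.

// check_members.go - illustrative only; assumes a reachable etcd endpoint.
package main

import (
	"context"
	"fmt"
	"time"

	clientv3 "go.etcd.io/etcd/client/v3"
)

func main() {
	// Assumption: the endpoint and the omitted TLS settings are placeholders;
	// an OpenShift cluster requires the etcd serving CA and client certificates.
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{"https://etcd.example:2379"},
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// MemberList returns every member the cluster currently knows about.
	resp, err := cli.MemberList(ctx)
	if err != nil {
		panic(err)
	}
	for _, m := range resp.Members {
		// A member that has not (re)started yet typically advertises no client
		// URLs; IsLearner distinguishes learners from voting members.
		started := len(m.ClientURLs) > 0
		fmt.Printf("member=%s started=%t learner=%t\n", m.Name, started, m.IsLearner)
	}
}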

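Similarly, a hedged sketch of the final assertion: ensureEtcdOperatorHealthy is not shown in this diff, but a check of this shape, reading the "etcd" ClusterOperator and requiring Available=True, Progressing=False, Degraded=False, would satisfy the o.Eventually poll above. The standalone program, its kubeconfig handling, and the file name are assumptions for illustration, not the test's actual helper.

// operator_health.go - illustrative only; not the helper used by the test.
package main

import (
	"context"
	"fmt"
	"os"

	configv1 "github.com/openshift/api/config/v1"
	configclient "github.com/openshift/client-go/config/clientset/versioned"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Assumption: KUBECONFIG points at an admin kubeconfig for the cluster.
	cfg, err := clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG"))
	if err != nil {
		panic(err)
	}
	client := configclient.NewForConfigOrDie(cfg)

	co, err := client.ConfigV1().ClusterOperators().Get(context.Background(), "etcd", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}

	// The operator is considered healthy when it is Available and neither
	// Progressing nor Degraded.
	healthy := true
	for _, cond := range co.Status.Conditions {
		switch cond.Type {
		case configv1.OperatorAvailable:
			healthy = healthy && cond.Status == configv1.ConditionTrue
		case configv1.OperatorProgressing, configv1.OperatorDegraded:
			healthy = healthy && cond.Status == configv1.ConditionFalse
		}
	}
	fmt.Printf("etcd cluster operator healthy: %t\n", healthy)
}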