@@ -169,6 +169,47 @@ var _ = g.Describe("[sig-etcd][apigroup:config.openshift.io][OCPFeatureGate:Dual
 			memberPromotedVotingTimeout, pollInterval)
 
 	})
+
+	g.It("should recover from etcd process crash", func() {
+		// Note: This test kills the etcd process/container on one node to simulate
+		// a process crash, testing Pacemaker's ability to detect and restart etcd.
+		survivedNode := peerNode
+		g.GinkgoT().Printf("Randomly selected %s (%s) for etcd process crash and %s (%s) to survive\n",
+			targetNode.Name, targetNode.Status.Addresses[0].Address, peerNode.Name, peerNode.Status.Addresses[0].Address)
+
+		g.By(fmt.Sprintf("Killing etcd process/container on %s", targetNode.Name))
+		// Try multiple methods to kill etcd: container kill, process kill, or service stop.
+		_, err := util.DebugNodeRetryWithOptionsAndChroot(oc, targetNode.Name, "openshift-etcd",
+			"bash", "-c", "podman kill etcd 2>/dev/null || pkill -9 etcd 2>/dev/null || systemctl stop etcd 2>/dev/null || true")
+		o.Expect(err).To(o.BeNil(), "Expected to kill etcd process without command errors")
+
+		g.By("Waiting for Pacemaker to detect etcd failure and begin recovery")
+		// Give Pacemaker time to detect the failure and start recovery.
+		time.Sleep(30 * time.Second)
+
+		g.By(fmt.Sprintf("Ensuring %s becomes leader and %s is re-added as learner", peerNode.Name, targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, false, true, // targetNode expected started == false, learner == true
+			memberIsLeaderTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s rejoins as learner", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, true, // targetNode expected started == true, learner == true
+			memberRejoinedLearnerTimeout, pollInterval)
+
+		g.By(fmt.Sprintf("Ensuring %s is promoted back to voting member", targetNode.Name))
+		validateEtcdRecoveryState(etcdClientFactory,
+			&survivedNode, true, false, // survivedNode expected started == true, learner == false
+			&targetNode, true, false, // targetNode expected started == true, learner == false
+			memberPromotedVotingTimeout, pollInterval)
+
+		g.By("Ensuring etcd cluster operator is healthy after recovery")
+		o.Eventually(func() error {
+			return ensureEtcdOperatorHealthy(oc)
+		}, etcdOperatorIsHealthyTimeout, pollInterval).ShouldNot(o.HaveOccurred(), "etcd cluster operator should be healthy after recovery")
+	})
 })
 
 func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
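
The validateEtcdRecoveryState helper exercised above is not part of this hunk. For context, here is a minimal sketch of what it could look like, with the signature inferred from the call sites; the member-matching logic (treating an empty Member.Name as "added but not yet started" and falling back to PeerURLs for lookup) and the extra strings/corev1 imports are assumptions, not the file's actual implementation:

// Assumes imports: "strings", corev1 "k8s.io/api/core/v1", plus the fmt, time,
// etcdserverpb, helpers, and gomega (o) imports this file already uses.
func validateEtcdRecoveryState(etcdClientFactory helpers.EtcdClientCreator,
	nodeA *corev1.Node, nodeAStarted, nodeALearner bool,
	nodeB *corev1.Node, nodeBStarted, nodeBLearner bool,
	timeout, interval time.Duration) {
	expectations := []struct {
		node    *corev1.Node
		started bool
		learner bool
	}{
		{nodeA, nodeAStarted, nodeALearner},
		{nodeB, nodeBStarted, nodeBLearner},
	}
	o.Eventually(func() error {
		members, err := getMembers(etcdClientFactory)
		if err != nil {
			return err
		}
		for _, want := range expectations {
			var member *etcdserverpb.Member
			for _, m := range members {
				// A member that was added but has not started reports an empty
				// Name, so fall back to matching the node address in PeerURLs.
				if m.Name == want.node.Name ||
					(m.Name == "" && strings.Contains(strings.Join(m.PeerURLs, ","), want.node.Status.Addresses[0].Address)) {
					member = m
					break
				}
			}
			if member == nil {
				return fmt.Errorf("no etcd member found for node %s", want.node.Name)
			}
			started := member.Name != ""
			if started != want.started || member.IsLearner != want.learner {
				return fmt.Errorf("member for node %s: started=%v learner=%v, want started=%v learner=%v",
					want.node.Name, started, member.IsLearner, want.started, want.learner)
			}
		}
		return nil
	}, timeout, interval).ShouldNot(o.HaveOccurred())
}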
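Likewise, ensureEtcdOperatorHealthy is defined elsewhere in the file. A rough sketch, under the assumption that it inspects the etcd ClusterOperator's Available/Progressing/Degraded conditions through the test CLI's admin config client; the exact conditions checked are an assumption:

// Assumes imports: "context", metav1 "k8s.io/apimachinery/pkg/apis/meta/v1",
// and configv1 "github.com/openshift/api/config/v1".
func ensureEtcdOperatorHealthy(oc *util.CLI) error {
	co, err := oc.AdminConfigClient().ConfigV1().ClusterOperators().Get(context.TODO(), "etcd", metav1.GetOptions{})
	if err != nil {
		return err
	}
	for _, cond := range co.Status.Conditions {
		switch cond.Type {
		case configv1.OperatorAvailable:
			if cond.Status != configv1.ConditionTrue {
				return fmt.Errorf("etcd operator is not Available: %s", cond.Message)
			}
		case configv1.OperatorDegraded:
			if cond.Status != configv1.ConditionFalse {
				return fmt.Errorf("etcd operator is Degraded: %s", cond.Message)
			}
		case configv1.OperatorProgressing:
			if cond.Status != configv1.ConditionFalse {
				return fmt.Errorf("etcd operator is still Progressing: %s", cond.Message)
			}
		}
	}
	return nil
}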