@@ -18,12 +18,14 @@ package rediscluster
1818
1919import (
2020 "context"
21+ "fmt"
2122 "time"
2223
2324 "github.com/OT-CONTAINER-KIT/redis-operator/api/status"
2425 redisv1beta2 "github.com/OT-CONTAINER-KIT/redis-operator/api/v1beta2"
2526 intctrlutil "github.com/OT-CONTAINER-KIT/redis-operator/pkg/controllerutil"
2627 "github.com/OT-CONTAINER-KIT/redis-operator/pkg/k8sutils"
28+ retry "github.com/avast/retry-go"
2729 "github.com/go-logr/logr"
2830 appsv1 "k8s.io/api/apps/v1"
2931 "k8s.io/apimachinery/pkg/runtime"
@@ -193,11 +195,26 @@ func (r *RedisClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request
193195 }
194196 if int (totalReplicas ) > 1 && unhealthyNodeCount >= int (totalReplicas )- 1 {
195197 reqLogger .Info ("healthy leader count does not match desired; attempting to repair disconnected masters" )
196- if err := k8sutils .RepairDisconnectedMasters (ctx , r .K8sClient , r .Log , instance ); err == nil {
197- // requeue after 30 seconds, allowing cluster time to become healthy after issuing CLUSTER MEET
198- return intctrlutil .RequeueAfter (reqLogger , time .Second * 30 , "successfully repaired disconnected masters" )
198+ if err := k8sutils .RepairDisconnectedMasters (ctx , r .K8sClient , r .Log , instance ); err != nil {
199+ reqLogger .Error (err , "failed to repair disconnected masters" )
199200 }
200- reqLogger .Info ("failed to repair disconnected masters; starting failover" )
201+
202+ err := retry .Do (func () error {
203+ unhealthyNodeCount , err := k8sutils .UnhealthyNodesInCluster (ctx , r .K8sClient , r .Log , instance )
204+ if err != nil {
205+ return err
206+ }
207+ if unhealthyNodeCount == 0 {
208+ return nil
209+ }
210+ return fmt .Errorf ("%d unhealthy nodes" , unhealthyNodeCount )
211+ }, retry .Attempts (3 ), retry .Delay (time .Second * 5 ))
212+
213+ if err == nil {
214+ reqLogger .Info ("repairing unhealthy masters successful, no unhealthy masters left" )
215+ return intctrlutil .RequeueAfter (reqLogger , time .Second * 30 , "no unhealthy nodes found after repairing disconnected masters" )
216+ }
217+ reqLogger .Info ("unhealthy nodes exist after attempting to repair disconnected masters; starting failover" )
201218 if err := k8sutils .ExecuteFailoverOperation (ctx , r .K8sClient , r .Log , instance ); err != nil {
202219 return intctrlutil .RequeueWithError (err , reqLogger , "" )
203220 }
0 commit comments