Skip to content

Commit 4e2b4d7

Browse files
committed
handle failover
1 parent 63eb8c5 commit 4e2b4d7

File tree

3 files changed

+24
-4
lines changed

3 files changed

+24
-4
lines changed

go.mod

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@ require (
2020

2121
require (
2222
emperror.dev/errors v0.8.0 // indirect
23+
github.com/avast/retry-go v3.0.0+incompatible // indirect
2324
github.com/beorn7/perks v1.0.1 // indirect
2425
github.com/cespare/xxhash/v2 v2.2.0 // indirect
2526
github.com/davecgh/go-spew v1.1.1 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,8 @@ github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbt
77
github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
88
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
99
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
10+
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
11+
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
1012
github.com/banzaicloud/k8s-objectmatcher v1.8.0 h1:Nugn25elKtPMTA2br+JgHNeSQ04sc05MDPmpJnd1N2A=
1113
github.com/banzaicloud/k8s-objectmatcher v1.8.0/go.mod h1:p2LSNAjlECf07fbhDyebTkPUIYnU05G+WfGgkTmgeMg=
1214
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=

pkg/controllers/rediscluster/rediscluster_controller.go

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -18,12 +18,14 @@ package rediscluster
1818

1919
import (
2020
"context"
21+
"fmt"
2122
"time"
2223

2324
"github.com/OT-CONTAINER-KIT/redis-operator/api/status"
2425
redisv1beta2 "github.com/OT-CONTAINER-KIT/redis-operator/api/v1beta2"
2526
intctrlutil "github.com/OT-CONTAINER-KIT/redis-operator/pkg/controllerutil"
2627
"github.com/OT-CONTAINER-KIT/redis-operator/pkg/k8sutils"
28+
retry "github.com/avast/retry-go"
2729
"github.com/go-logr/logr"
2830
appsv1 "k8s.io/api/apps/v1"
2931
"k8s.io/apimachinery/pkg/runtime"
@@ -193,11 +195,26 @@ func (r *RedisClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request
193195
}
194196
if int(totalReplicas) > 1 && unhealthyNodeCount >= int(totalReplicas)-1 {
195197
reqLogger.Info("healthy leader count does not match desired; attempting to repair disconnected masters")
196-
if err := k8sutils.RepairDisconnectedMasters(ctx, r.K8sClient, r.Log, instance); err == nil {
197-
// requeue after 30 seconds, allowing cluster time to become healthy after issuing CLUSTER MEET
198-
return intctrlutil.RequeueAfter(reqLogger, time.Second*30, "successfully repaired disconnected masters")
198+
if err := k8sutils.RepairDisconnectedMasters(ctx, r.K8sClient, r.Log, instance); err != nil {
199+
reqLogger.Error(err, "failed to repair disconnected masters")
199200
}
200-
reqLogger.Info("failed to repair disconnected masters; starting failover")
201+
202+
err := retry.Do(func() error {
203+
unhealthyNodeCount, err := k8sutils.UnhealthyNodesInCluster(ctx, r.K8sClient, r.Log, instance)
204+
if err != nil {
205+
return err
206+
}
207+
if unhealthyNodeCount == 0 {
208+
return nil
209+
}
210+
return fmt.Errorf("%d unhealthy nodes", unhealthyNodeCount)
211+
}, retry.Attempts(3), retry.Delay(time.Second*5))
212+
213+
if err == nil {
214+
reqLogger.Info("repairing unhealthy masters successful, no unhealthy masters left")
215+
return intctrlutil.RequeueAfter(reqLogger, time.Second*30, "no unhealthy nodes found after repairing disconnected masters")
216+
}
217+
reqLogger.Info("unhealthy nodes exist after attempting to repair disconnected masters; starting failover")
201218
if err := k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, r.Log, instance); err != nil {
202219
return intctrlutil.RequeueWithError(err, reqLogger, "")
203220
}

0 commit comments

Comments
 (0)