@@ -468,9 +468,8 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
468468 }
469469
470470 // Ensures the number of etcd members is in sync with the number of machines/nodes.
471- // NOTE: This is usually required after a machine deletion.
472- if err := r .reconcileEtcdMembers (ctx , controlPlane ); err != nil {
473- return ctrl.Result {}, err
471+ if result , err := r .reconcileEtcdMembers (ctx , controlPlane ); err != nil || ! result .IsZero () {
472+ return result , err
474473 }
475474
476475 // Handle machines in deletion phase; when drain and wait for volume detach completed, forward etcd leadership
@@ -1207,62 +1206,109 @@ func maxTime(t1, t2 time.Time) time.Time {
12071206 return t2
12081207}
12091208
1210- // reconcileEtcdMembers ensures the number of etcd members is in sync with the number of machines/nodes.
1211- // This is usually required after a machine deletion.
1209+ // reconcileEtcdMembers ensures that the list of etcd members is consistent with the list of Machine/Node.
1210+ // If etcd members are not consistent with the list of Machine/Node "unknown" etcd members are removed and a new
1211+ // reconcile is triggered.
12121212//
1213- // NOTE: this func uses KCP conditions, it is required to call reconcileControlPlaneAndMachinesConditions before this.
1214- func (r * KubeadmControlPlaneReconciler ) reconcileEtcdMembers (ctx context.Context , controlPlane * internal.ControlPlane ) error {
1213+ // NOTE: This operation is a safeguard in case for some reason the etcd member removal operation that is performed
1214+ // as part of the Machine deletion workflow doesn't work as expected (see reconcilePreTerminateHook).
1215+ // It is also a safeguard against "unmanaged" etcd members being added to the etcd cluster.
1216+ func (r * KubeadmControlPlaneReconciler ) reconcileEtcdMembers (ctx context.Context , controlPlane * internal.ControlPlane ) (ctrl.Result , error ) {
12151217 log := ctrl .LoggerFrom (ctx )
12161218
12171219 // If etcd is not managed by KCP this is a no-op.
12181220 if ! controlPlane .IsEtcdManaged () {
1219- return nil
1221+ return ctrl. Result {}, nil
12201222 }
12211223
12221224 // If there is no KCP-owned control-plane machines, then control-plane has not been initialized yet.
12231225 if controlPlane .Machines .Len () == 0 {
1224- return nil
1226+ return ctrl. Result {}, nil
12251227 }
12261228
12271229 // No op if for any reason the etcdMember list is not populated at this stage.
1228- if controlPlane .EtcdMembers == nil {
1229- return nil
1230+ if len ( controlPlane .EtcdMembers ) == 0 {
1231+ return ctrl. Result {}, nil
12301232 }
12311233
1232- // Potential inconsistencies between the list of members and the list of machines/nodes are
1234+ // Potential inconsistencies between the list of members and the list of Machine/Node are
12331235 // surfaced using the EtcdClusterHealthyCondition; if this condition is true, meaning no inconsistencies exists, return early.
12341236 if conditions .IsTrue (controlPlane .KCP , controlplanev1 .KubeadmControlPlaneEtcdClusterHealthyCondition ) {
1235- return nil
1237+ return ctrl. Result {}, nil
12361238 }
12371239
1238- // Collect all the node names.
1239- // Note: EtcdClusterHealthyCondition true also implies that there are no machines still provisioning,
1240- // so we can ignore this case.
1240+ // Collect all the Node names from control plane Machines.
12411241 nodeNames := []string {}
12421242 for _ , machine := range controlPlane .Machines {
12431243 if ! machine .Status .NodeRef .IsDefined () {
1244- // If there are provisioning machines (machines without a node yet), return.
1245- return nil
1244+ // If there are provisioning machines (machines without a Node yet), KCP must wait for the control plane
1245+ // to become stable before acting on etcd members.
1246+ return ctrl.Result {}, nil
12461247 }
12471248 nodeNames = append (nodeNames , machine .Status .NodeRef .Name )
12481249 }
12491250
12501251 workloadCluster , err := controlPlane .GetWorkloadCluster (ctx )
12511252 if err != nil {
1252- // Failing at connecting to the workload cluster can mean workload cluster is unhealthy for a variety of reasons such as etcd quorum loss.
1253- return errors .Wrap (err , "cannot get remote client to workload cluster" )
1253+ return ctrl.Result {}, errors .Wrap (err , "cannot get remote client to workload cluster" )
12541254 }
12551255
1256- removedMembers , err := workloadCluster .ReconcileEtcdMembersAndControlPlaneNodes (ctx , controlPlane .EtcdMembers , nodeNames )
1257- if err != nil {
1258- return errors .Wrap (err , "failed attempt to reconcile etcd members" )
1256+ // Check if there are etcd member without the corresponding Node/Machines.
1257+ // If any, delete it with best effort
1258+ removedMembers := []string {}
1259+ allErrors := []error {}
1260+
1261+ for _ , member := range controlPlane .EtcdMembers {
1262+ // If this member is just added, it has a empty name until the etcd pod starts. Ignore this member until a name will show up.
1263+ if member .Name == "" {
1264+ continue
1265+ }
1266+
1267+ // If the member have a corresponding Node/Machine, continue.
1268+ nodeFound := false
1269+ for _ , nodeName := range nodeNames {
1270+ if member .Name == nodeName {
1271+ nodeFound = true
1272+ break
1273+ }
1274+ }
1275+ if nodeFound {
1276+ continue
1277+ }
1278+
1279+ // If the Node/Machineg corresponding to an etcd member cannot be found, KCP must remove it.
1280+ //
1281+ // Before deleting the etcd member, KCP should assess the potential effects of this operation.
1282+ //
1283+ // Most specifically KCP should determine if this operation is going to leave the Kubernetes control plane components
1284+ // and the etcd cluster in operational state or not.
1285+ //
1286+ // In this case, the Target cluster will have the same list of machines as the current clusters
1287+ // but we are going to remove the etcd members without a corresponding Node/Machine (no etcd member are going to be added).
1288+ etcdMemberToBeDeleted := member .Name
1289+ addEtcdMember := false
1290+ if ! r .targetEtcdClusterHealthy (ctx , controlPlane , addEtcdMember , etcdMemberToBeDeleted ) {
1291+ allErrors = append (allErrors , errors .Errorf ("etcd member %s does not have a corresponding Machine, it must be removed from the cluster but this operation can lead to quorum loss. Please check the etcd status" , etcdMemberToBeDeleted ))
1292+ break
1293+ }
1294+
1295+ if err := workloadCluster .RemoveEtcdMember (ctx , member .Name ); err != nil {
1296+ allErrors = append (allErrors , err )
1297+ continue
1298+ }
1299+ removedMembers = append (removedMembers , member .Name )
12591300 }
12601301
12611302 if len (removedMembers ) > 0 {
1262- log .Info ("Etcd members without nodes removed from the cluster" , "members" , removedMembers )
1303+ log .Info ("Etcd members without a corresponding Machines removed from the cluster" , "members" , removedMembers )
1304+
1305+ // If there are no errors, force a new reconcile after removing an etcd member.
1306+ if len (allErrors ) == 0 {
1307+ return ctrl.Result {RequeueAfter : 1 * time .Second }, nil
1308+ }
12631309 }
12641310
1265- return nil
1311+ return ctrl. Result {}, kerrors . NewAggregate ( allErrors )
12661312}
12671313
12681314func (r * KubeadmControlPlaneReconciler ) reconcilePreTerminateHook (ctx context.Context , controlPlane * internal.ControlPlane ) (ctrl.Result , error ) {
@@ -1307,34 +1353,97 @@ func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Co
13071353 return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
13081354 }
13091355
1310- // The following will execute and remove the pre-terminate hook from the Machine.
1356+ // If removing the last control plane machine, no need of additional checks/operations.
1357+ if controlPlane .Machines .Len () <= 1 {
1358+ if err := r .removePreTerminateHookAnnotationFromMachine (ctx , deletingMachine ); err != nil {
1359+ return ctrl.Result {}, err
1360+ }
13111361
1312- // If we have more than 1 Machine and etcd is managed we forward etcd leadership and remove the member
1313- // to keep the etcd cluster healthy.
1314- if controlPlane .Machines .Len () > 1 && controlPlane .IsEtcdManaged () {
1315- workloadCluster , err := controlPlane .GetWorkloadCluster (ctx )
1316- if err != nil {
1317- return ctrl.Result {}, errors .Wrapf (err , "failed to remove etcd member for deleting Machine %s: failed to create client to workload cluster" , klog .KObj (deletingMachine ))
1362+ log .Info ("Waiting for Machines to be deleted" , "machines" , strings .Join (controlPlane .Machines .Filter (collections .HasDeletionTimestamp ).Names (), ", " ))
1363+ return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
1364+ }
1365+
1366+ // Before removing the etcd member/removing the pre-terminate hook on KCP machines
1367+ // KCP determines again if deleting this machine is going to leave the Kubernetes control plane components
1368+ // and the etcd cluster in operational state or not.
1369+ //
1370+ // NOTE: This check acts as additional safeguard preventing deletion to complete.
1371+ // Similar checks are also performed before triggering deletion when remediating unhealthy machines and
1372+ // also before triggering deletion when scaling down the number of control plane replicas.
1373+ // (in case of scale down KCP actually checks that everything must be in healthy state).
1374+ //
1375+ // Target list of machines will have current machines -1 machine (the deletingMachine).
1376+ // As a consequence:
1377+ // - KubernetesControlPlane on the deletingMachine is going to be deleted, no KubernetesControlPlane is going to be added.
1378+ kubernetesControlPlaneToBeDeleted := deletingMachine .Name
1379+ addKubernetesControlPlane := false
1380+
1381+ // Check target Kubernetes control plane components.
1382+ if ! r .targetKubernetesControlPlaneComponentsHealthy (ctx , controlPlane , addKubernetesControlPlane , kubernetesControlPlaneToBeDeleted ) {
1383+ log .Info (fmt .Sprintf ("Cannot delete control plane Machine %s when there are no control plane Machines with all Kubernetes control plane components in healthy state. Please check Kubernetes control plane component status" , klog .KObj (deletingMachine )))
1384+ return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
1385+ }
1386+
1387+ // The following will execute and remove the pre-terminate hook from the Machine.
1388+ if ! controlPlane .IsEtcdManaged () {
1389+ if err := r .removePreTerminateHookAnnotationFromMachine (ctx , deletingMachine ); err != nil {
1390+ return ctrl.Result {}, err
13181391 }
13191392
1320- // Note: In regular deletion cases (remediation, scale down) the leader should have been already moved.
1321- // We're doing this again here in case the Machine became leader again or the Machine deletion was
1322- // triggered in another way (e.g. a user running kubectl delete machine)
1323- etcdLeaderCandidate := controlPlane .Machines .Filter (collections .Not (collections .HasDeletionTimestamp )).Newest ()
1324- if etcdLeaderCandidate != nil {
1325- if err := workloadCluster .ForwardEtcdLeadership (ctx , deletingMachine , etcdLeaderCandidate ); err != nil {
1326- return ctrl.Result {}, errors .Wrapf (err , "failed to move leadership to candidate Machine %s" , etcdLeaderCandidate .Name )
1327- }
1328- } else {
1329- log .Info ("Skip forwarding etcd leadership, because there is no other control plane Machine without a deletionTimestamp" )
1393+ log .Info ("Waiting for Machines to be deleted" , "machines" , strings .Join (controlPlane .Machines .Filter (collections .HasDeletionTimestamp ).Names (), ", " ))
1394+ return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
1395+ }
1396+
1397+ if len (controlPlane .EtcdMembers ) == 0 {
1398+ log .Info ("Cannot check etcd cluster health before remediation, etcd member list is empty" )
1399+ return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
1400+ }
1401+
1402+ // If etcd is managed by KCP, check target etcd cluster.
1403+ // Target list of machines will have current machines -1 machine (the deletingMachine).
1404+ // As a consequence:
1405+ // - etcd member on the deletingMachine is going to be deleted, no etcd member are going to be added.
1406+ etcdMemberToBeDeleted := r .tryGetEtcdMemberName (ctx , controlPlane , deletingMachine )
1407+ addEtcdMember := false
1408+
1409+ // If it was not possible to get the etcd member, no other checks can be performed. Continue with deletion.
1410+ // Note: reconcileEtcMembers acts a safeguard if an etcdMember is added/reported after this point.
1411+ if etcdMemberToBeDeleted == "" {
1412+ if err := r .removePreTerminateHookAnnotationFromMachine (ctx , deletingMachine ); err != nil {
1413+ return ctrl.Result {}, err
13301414 }
13311415
1332- // Note: Removing the etcd member will lead to the etcd and the kube-apiserver Pod on the Machine shutting down.
1333- // If ControlPlaneKubeletLocalMode is used, the kubelet is communicating with the local apiserver and thus now
1334- // won't be able to see any updates to e.g. Pods anymore.
1335- if err := workloadCluster .RemoveEtcdMemberForMachine (ctx , deletingMachine ); err != nil {
1336- return ctrl.Result {}, errors .Wrapf (err , "failed to remove etcd member for deleting Machine %s" , klog .KObj (deletingMachine ))
1416+ log .Info ("Waiting for Machines to be deleted" , "machines" , strings .Join (controlPlane .Machines .Filter (collections .HasDeletionTimestamp ).Names (), ", " ))
1417+ return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
1418+ }
1419+
1420+ if ! r .targetEtcdClusterHealthy (ctx , controlPlane , addEtcdMember , etcdMemberToBeDeleted ) {
1421+ log .Info (fmt .Sprintf ("Cannot delete control plane Machine %s, the operation can lead to etcd quorum loss. Please check the etcd status" , klog .KObj (deletingMachine )))
1422+ return ctrl.Result {RequeueAfter : deleteRequeueAfter }, nil
1423+ }
1424+
1425+ workloadCluster , err := controlPlane .GetWorkloadCluster (ctx )
1426+ if err != nil {
1427+ return ctrl.Result {}, errors .Wrapf (err , "failed to remove etcd member for deleting Machine %s: failed to create client to workload cluster" , klog .KObj (deletingMachine ))
1428+ }
1429+
1430+ // Note: In regular deletion cases (remediation, scale down) the leader should have been already moved.
1431+ // We're doing this again here in case the Machine became leader again or the Machine deletion was
1432+ // triggered in another way (e.g. a user running kubectl delete machine)
1433+ etcdLeaderCandidate := controlPlane .Machines .Filter (collections .Not (collections .HasDeletionTimestamp )).Newest ()
1434+ if etcdLeaderCandidate != nil {
1435+ if err := workloadCluster .ForwardEtcdLeadership (ctx , deletingMachine , etcdLeaderCandidate ); err != nil {
1436+ return ctrl.Result {}, errors .Wrapf (err , "failed to move leadership to candidate Machine %s" , etcdLeaderCandidate .Name )
13371437 }
1438+ } else {
1439+ log .Info ("Skip forwarding etcd leadership, there is no other control plane Machine without a deletionTimestamp" )
1440+ }
1441+
1442+ // Note: Removing the etcd member will lead to the etcd and the kube-apiserver Pod on the Machine shutting down.
1443+ // If ControlPlaneKubeletLocalMode is used, the kubelet is communicating with the local apiserver and thus now
1444+ // won't be able to see any updates to e.g. Pods anymore.
1445+ if err := workloadCluster .RemoveEtcdMember (ctx , etcdMemberToBeDeleted ); err != nil {
1446+ return ctrl.Result {}, errors .Wrapf (err , "failed to remove etcd member for deleting Machine %s" , klog .KObj (deletingMachine ))
13381447 }
13391448
13401449 if err := r .removePreTerminateHookAnnotationFromMachine (ctx , deletingMachine ); err != nil {
0 commit comments