Skip to content

Commit 5ac5356

Browse files
committed
refact/lb/sg: isolate sg deletion fragments to be reused
Isolating security group deletion fragments from EnsureLoadBalancerDeleted to buildSecurityGroupsToDelete and deleteSecurityGroupsWithBackoff, so the envaluation criteria and backof deletion can be reused in future implementations, i.e. NLB with Security Groups.
1 parent 31a4ec4 commit 5ac5356

1 file changed

Lines changed: 154 additions & 97 deletions

File tree

pkg/providers/v1/aws.go

Lines changed: 154 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"errors"
2222
"fmt"
2323
"io"
24+
"math"
2425
"net"
2526
"regexp"
2627
"sort"
@@ -2862,6 +2863,154 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
28622863
return nil
28632864
}
28642865

2866+
// deleteSecurityGroupsWithBackoff deletes a list of security group IDs with retries and exponential backoff.
2867+
// The function attempts to delete each security group in the provided list, handling potential dependency violations
2868+
// caused by resources still being associated with the security groups (e.g., load balancers in the process of deletion).
2869+
//
2870+
// Parameters:
2871+
// - `ctx`: The context for the operation.
2872+
// - `svcName`: The name of the service associated with the security groups.
2873+
// - `securityGroupIDs`: A map of security group IDs to be deleted.
2874+
//
2875+
// Behavior:
2876+
// - If the list of security group IDs is empty, the function returns immediately.
2877+
// - The function retries deletion for up to 10 minutes, with an initial backoff of 5 seconds that doubles with each retry.
2878+
// - Dependency violations are logged and ignored, allowing retries until the timeout is reached.
2879+
// - If all security groups are successfully deleted, the function exits.
2880+
// - If the timeout is reached and some security groups remain, an error is returned.
2881+
//
2882+
// Returns:
2883+
// - `error`: An error if any security groups could not be deleted within the timeout period.
2884+
func (c *Cloud) deleteSecurityGroupsWithBackoff(ctx context.Context, svcName string, securityGroupIDs map[string]struct{}) error {
2885+
if len(securityGroupIDs) == 0 {
2886+
return nil
2887+
}
2888+
// Configurable backoff parameters
2889+
const (
2890+
initialBackoff = 5 * time.Second
2891+
maxBackoff = 60 * time.Second
2892+
)
2893+
retry := 1
2894+
backoff := initialBackoff
2895+
timeoutAt := time.Now().Add(10 * time.Minute)
2896+
2897+
// Loop through and try to delete them
2898+
for {
2899+
for securityGroupID := range securityGroupIDs {
2900+
_, err := c.ec2.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{
2901+
GroupId: &securityGroupID,
2902+
})
2903+
if err == nil {
2904+
delete(securityGroupIDs, securityGroupID)
2905+
continue
2906+
}
2907+
ignore := false
2908+
if awsError, ok := err.(awserr.Error); ok {
2909+
if awsError.Code() == "DependencyViolation" {
2910+
klog.V(2).Infof("Ignoring DependencyViolation while deleting load-balancer security group (%s), assuming because LB is in process of deleting", securityGroupID)
2911+
ignore = true
2912+
}
2913+
}
2914+
if !ignore {
2915+
return fmt.Errorf("error while deleting load balancer security group (%s): %q", securityGroupID, err)
2916+
}
2917+
}
2918+
2919+
if len(securityGroupIDs) == 0 {
2920+
klog.V(2).Info("Deleted all security groups for load balancer: ", svcName)
2921+
break
2922+
}
2923+
2924+
if time.Now().After(timeoutAt) {
2925+
ids := []string{}
2926+
for id := range securityGroupIDs {
2927+
ids = append(ids, id)
2928+
}
2929+
2930+
return fmt.Errorf("timed out deleting ELB: %s. Could not delete security groups %v after %d retries", svcName, strings.Join(ids, ","), retry)
2931+
}
2932+
2933+
klog.V(2).Infof("Waiting for load-balancer %q to delete so we can delete security groups: %v", svcName, securityGroupIDs)
2934+
time.Sleep(backoff)
2935+
2936+
// Increment backoff with a cap
2937+
backoff = time.Duration(math.Min(float64(backoff*2), float64(maxBackoff)))
2938+
retry++
2939+
}
2940+
return nil
2941+
}
2942+
2943+
// buildSecurityGroupsToDelete evaluates all deletion criteria and creates a list of valid security group IDs to be deleted.
2944+
// It returns two maps:
2945+
// - `securityGroupIDs`: A map of security group IDs that are eligible for deletion.
2946+
// - `taggedLBSecurityGroups`: A map of security group IDs that are tagged and associated with the load balancer.
2947+
// The function filters security groups based on the following criteria:
2948+
// - Excludes security groups defined in the Cloud Configuration.
2949+
// - Excludes security groups with no cluster tags.
2950+
// - Excludes security groups annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups` or
2951+
// `service.beta.kubernetes.io/aws-load-balancer-extra-security-groups`.
2952+
//
2953+
// Parameters:
2954+
// - `ctx`: The context for the operation.
2955+
// - `service`: The Kubernetes service object.
2956+
// - `lbSecurityGroups`: A list of security group IDs associated with the load balancer.
2957+
// Returns:
2958+
// - `securityGroupIDs`: A map of security group IDs to be deleted.
2959+
// - `taggedLBSecurityGroups`: A map of tagged security group IDs.
2960+
// - `error`: An error if the operation fails.
2961+
func (c *Cloud) buildSecurityGroupsToDelete(ctx context.Context, service *v1.Service, lbSecurityGroups []string) (map[string]struct{}, map[string]struct{}, error) {
2962+
securityGroupIDs := map[string]struct{}{}
2963+
taggedLBSecurityGroups := map[string]struct{}{}
2964+
2965+
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2966+
describeRequest.Filters = []ec2types.Filter{
2967+
newEc2Filter("group-id", lbSecurityGroups...),
2968+
}
2969+
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2970+
if err != nil {
2971+
return nil, nil, fmt.Errorf("error querying security groups for ELB: %q", err)
2972+
}
2973+
2974+
annotatedSgSet := map[string]bool{}
2975+
annotatedSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerSecurityGroups])
2976+
annotatedExtraSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerExtraSecurityGroups])
2977+
annotatedSgsList = append(annotatedSgsList, annotatedExtraSgsList...)
2978+
2979+
for _, sg := range annotatedSgsList {
2980+
annotatedSgSet[sg] = true
2981+
}
2982+
2983+
for _, sg := range response {
2984+
sgID := aws.StringValue(sg.GroupId)
2985+
2986+
if sgID == c.cfg.Global.ElbSecurityGroup {
2987+
//We don't want to delete a security group that was defined in the Cloud Configuration.
2988+
continue
2989+
}
2990+
if sgID == "" {
2991+
klog.Warningf("Ignoring empty security group in %s", service.Name)
2992+
continue
2993+
}
2994+
2995+
if !c.tagging.hasClusterTag(sg.Tags) {
2996+
klog.Warningf("Ignoring security group with no cluster tag in %s", service.Name)
2997+
continue
2998+
} else {
2999+
taggedLBSecurityGroups[sgID] = struct{}{}
3000+
}
3001+
3002+
// This is an extra protection of deletion of non provisioned Security Group which is annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups`.
3003+
if _, ok := annotatedSgSet[sgID]; ok {
3004+
klog.Warningf("Ignoring security group with annotation `service.beta.kubernetes.io/aws-load-balancer-security-groups` or service.beta.kubernetes.io/aws-load-balancer-extra-security-groups in %s", service.Name)
3005+
continue
3006+
}
3007+
3008+
securityGroupIDs[sgID] = struct{}{}
3009+
}
3010+
3011+
return securityGroupIDs, taggedLBSecurityGroups, nil
3012+
}
3013+
28653014
// EnsureLoadBalancerDeleted implements LoadBalancer.EnsureLoadBalancerDeleted.
28663015
func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName string, service *v1.Service) error {
28673016
if isLBExternal(service.Annotations) {
@@ -2933,58 +3082,11 @@ func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName strin
29333082
// if the load balancer security group is being deleted.
29343083
securityGroupIDs := map[string]struct{}{}
29353084
taggedLBSecurityGroups := map[string]struct{}{}
2936-
{
2937-
// Delete the security group(s) for the load balancer
2938-
// Note that this is annoying: the load balancer disappears from the API immediately, but it is still
2939-
// deleting in the background. We get a DependencyViolation until the load balancer has deleted itself
2940-
2941-
var loadBalancerSGs = aws.StringValueSlice(lb.SecurityGroups)
2942-
2943-
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2944-
describeRequest.Filters = []ec2types.Filter{
2945-
newEc2Filter("group-id", loadBalancerSGs...),
2946-
}
2947-
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2948-
if err != nil {
2949-
return fmt.Errorf("error querying security groups for ELB: %q", err)
2950-
}
2951-
annotatedSgSet := map[string]bool{}
2952-
annotatedSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerSecurityGroups])
2953-
annotatedExtraSgsList := getSGListFromAnnotation(service.Annotations[ServiceAnnotationLoadBalancerExtraSecurityGroups])
2954-
annotatedSgsList = append(annotatedSgsList, annotatedExtraSgsList...)
2955-
2956-
for _, sg := range annotatedSgsList {
2957-
annotatedSgSet[sg] = true
2958-
}
2959-
2960-
for _, sg := range response {
2961-
sgID := aws.StringValue(sg.GroupId)
2962-
2963-
if sgID == c.cfg.Global.ElbSecurityGroup {
2964-
//We don't want to delete a security group that was defined in the Cloud Configuration.
2965-
continue
2966-
}
2967-
if sgID == "" {
2968-
klog.Warningf("Ignoring empty security group in %s", service.Name)
2969-
continue
2970-
}
29713085

2972-
if !c.tagging.hasClusterTag(sg.Tags) {
2973-
klog.Warningf("Ignoring security group with no cluster tag in %s", service.Name)
2974-
continue
2975-
} else {
2976-
taggedLBSecurityGroups[sgID] = struct{}{}
2977-
}
2978-
2979-
// This is an extra protection of deletion of non provisioned Security Group which is annotated with `service.beta.kubernetes.io/aws-load-balancer-security-groups`.
2980-
if _, ok := annotatedSgSet[sgID]; ok {
2981-
klog.Warningf("Ignoring security group with annotation `service.beta.kubernetes.io/aws-load-balancer-security-groups` or service.beta.kubernetes.io/aws-load-balancer-extra-security-groups in %s", service.Name)
2982-
continue
2983-
}
2984-
2985-
securityGroupIDs[sgID] = struct{}{}
2986-
}
2987-
}
3086+
// Delete the security group(s) for the load balancer
3087+
// Note that this is annoying: the load balancer disappears from the API immediately, but it is still
3088+
// deleting in the background. We get a DependencyViolation until the load balancer has deleted itself
3089+
securityGroupIDs, taggedLBSecurityGroups, err = c.buildSecurityGroupsToDelete(ctx, service, aws.StringValueSlice(lb.SecurityGroups))
29883090

29893091
{
29903092
// Determine the load balancer security group id
@@ -3018,52 +3120,7 @@ func (c *Cloud) EnsureLoadBalancerDeleted(ctx context.Context, clusterName strin
30183120
}
30193121
}
30203122

3021-
{
3022-
3023-
// Loop through and try to delete them
3024-
timeoutAt := time.Now().Add(time.Second * 600)
3025-
for {
3026-
for securityGroupID := range securityGroupIDs {
3027-
request := &ec2.DeleteSecurityGroupInput{}
3028-
request.GroupId = &securityGroupID
3029-
_, err := c.ec2.DeleteSecurityGroup(ctx, request)
3030-
if err == nil {
3031-
delete(securityGroupIDs, securityGroupID)
3032-
} else {
3033-
ignore := false
3034-
if awsError, ok := err.(awserr.Error); ok {
3035-
if awsError.Code() == "DependencyViolation" {
3036-
klog.V(2).Infof("Ignoring DependencyViolation while deleting load-balancer security group (%s), assuming because LB is in process of deleting", securityGroupID)
3037-
ignore = true
3038-
}
3039-
}
3040-
if !ignore {
3041-
return fmt.Errorf("error while deleting load balancer security group (%s): %q", securityGroupID, err)
3042-
}
3043-
}
3044-
}
3045-
3046-
if len(securityGroupIDs) == 0 {
3047-
klog.V(2).Info("Deleted all security groups for load balancer: ", service.Name)
3048-
break
3049-
}
3050-
3051-
if time.Now().After(timeoutAt) {
3052-
ids := []string{}
3053-
for id := range securityGroupIDs {
3054-
ids = append(ids, id)
3055-
}
3056-
3057-
return fmt.Errorf("timed out deleting ELB: %s. Could not delete security groups %v", service.Name, strings.Join(ids, ","))
3058-
}
3059-
3060-
klog.V(2).Info("Waiting for load-balancer to delete so we can delete security groups: ", service.Name)
3061-
3062-
time.Sleep(10 * time.Second)
3063-
}
3064-
}
3065-
3066-
return nil
3123+
return c.deleteSecurityGroupsWithBackoff(ctx, service.Name, securityGroupIDs)
30673124
}
30683125

30693126
// UpdateLoadBalancer implements LoadBalancer.UpdateLoadBalancer

0 commit comments

Comments
 (0)