Skip to content

Commit 234b7ed

Browse files
committed
e2e/fix: update aws node/retry resilience
1 parent 49b2f8e commit 234b7ed

2 files changed

Lines changed: 163 additions & 23 deletions

File tree

tests/e2e/helper_aws.go

Lines changed: 160 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ package e2e
1818
import (
1919
"context"
2020
"fmt"
21+
"os"
22+
"strconv"
2123
"time"
2224

2325
"github.com/aws/aws-sdk-go-v2/aws"
@@ -34,28 +36,89 @@ import (
3436
)
3537

3638
const (
	// DefaultRetryTimeoutMinutes is the default timeout, in minutes, for load balancer
	// DNS resolution with retries.
	// This can be overridden via the E2E_LB_TIMEOUT_MINUTES environment variable.
	DefaultRetryTimeoutMinutes = 20

	// DefaultRetryMaxAttempts is the default maximum number of retry attempts for AWS API calls.
	// This can be overridden via E2ETestHelperAWSOptions.
	DefaultRetryMaxAttempts = 10

	// DefaultRetryMaxBackoff is the default maximum backoff duration between retries.
	// This can be overridden via E2ETestHelperAWSOptions.
	DefaultRetryMaxBackoff = 30 * time.Second

	// DefaultTargetPollInterval is the default polling interval for target health checks.
	// Target registration is eventually consistent, so we use fixed-interval polling.
	DefaultTargetPollInterval = 5 * time.Second

	// DefaultTargetWaitTimeout is the default timeout for waiting for targets to register.
	// AWS target health checks typically complete within 30-90 seconds.
	DefaultTargetWaitTimeout = 3 * time.Minute

	// EnvLBTimeoutMinutes is the environment variable name for configuring LB timeout.
	EnvLBTimeoutMinutes = "E2E_LB_TIMEOUT_MINUTES"
)

// E2ETestHelperAWSOptions configures the behavior of E2ETestHelperAWS.
// The zero value is valid: ApplyDefaults replaces unset fields with the package defaults.
type E2ETestHelperAWSOptions struct {
	// MaxAttempts configures the maximum number of retry attempts for AWS API calls.
	// Zero or negative means "unset".
	// Default: DefaultRetryMaxAttempts (10)
	MaxAttempts int

	// MaxBackoff configures the maximum backoff duration between retries.
	// Zero or negative means "unset".
	// Default: DefaultRetryMaxBackoff (30 seconds)
	MaxBackoff time.Duration
}

// ApplyDefaults fills in default values for unset options, modifying o in place.
//
// A field counts as unset when it is zero or negative: a non-positive attempt
// count or backoff cap is not a meaningful retry setting, so it is replaced by
// the package default instead of being handed to the retryer as-is.
func (o *E2ETestHelperAWSOptions) ApplyDefaults() {
	if o.MaxAttempts <= 0 {
		o.MaxAttempts = DefaultRetryMaxAttempts
	}
	if o.MaxBackoff <= 0 {
		o.MaxBackoff = DefaultRetryMaxBackoff
	}
}
83+
4084
// E2ETestHelperAWS provides AWS API operations for e2e tests.
85+
// AWS SDK v2 clients are safe for concurrent use and do not require explicit cleanup.
86+
// Context cancellation will terminate ongoing API calls.
4187
type E2ETestHelperAWS struct {
4288
retryer *retry.Standard
4389
ec2Client *ec2.Client
4490
elbClient *elb.Client
4591
elbv2Client *elbv2.Client
4692
}
4793

48-
// NewAWSHelper creates a new AWS helper with configured clients
94+
// NewAWSHelper creates a new AWS helper with configured clients using default options.
4995
func NewAWSHelper(ctx context.Context) (*E2ETestHelperAWS, error) {
96+
return NewAWSHelperWithOptions(ctx, E2ETestHelperAWSOptions{})
97+
}
98+
99+
// NewAWSHelperWithOptions creates a new AWS helper with custom retry configuration.
100+
// Configure custom retryer to handle transient AWS API errors and credential failures.
101+
// AWS API limits for ELB are generous (400 TPS for DescribeLoadBalancers),
102+
// so aggressive retries are safe and necessary for CI stability.
103+
//
104+
// For rate-limited environments or shared test accounts, adjust MaxAttempts and MaxBackoff:
105+
//
106+
// helper, err := NewAWSHelperWithOptions(ctx, E2ETestHelperAWSOptions{
107+
// MaxAttempts: 5,
108+
// MaxBackoff: 10 * time.Second,
109+
// })
110+
func NewAWSHelperWithOptions(ctx context.Context, opts E2ETestHelperAWSOptions) (*E2ETestHelperAWS, error) {
111+
opts.ApplyDefaults()
112+
50113
cfg, err := config.LoadDefaultConfig(ctx)
51114
framework.ExpectNoError(err, "unable to load AWS config")
115+
if err != nil {
116+
return nil, fmt.Errorf("unable to load AWS config: %w", err)
117+
}
52118

53-
// Configure custom retryer to handle transient AWS API errors and credential failures.
54-
// AWS API limits for ELB are generous (400 TPS for DescribeLoadBalancers),
55-
// so aggressive retries are safe and necessary for CI stability.
56119
customRetryer := retry.NewStandard(func(o *retry.StandardOptions) {
57-
o.MaxAttempts = 10 // Handle IMDS timeouts and transient errors
58-
o.MaxBackoff = 30 * time.Second // Cap backoff to avoid excessive wait
120+
o.MaxAttempts = opts.MaxAttempts
121+
o.MaxBackoff = opts.MaxBackoff
59122
})
60123

61124
// Create AWS clients with custom retryer
@@ -80,8 +143,10 @@ func (h *E2ETestHelperAWS) GetELBV2Client() *elbv2.Client {
80143
}
81144

82145
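// GetLBTargets returns the IDs of the targets registered in the target groups attached to the given listener port of the load balancer with the given DNS name. NOTE(review): targets no longer appear to be filtered by targetPort — confirm whether the parameter is still needed.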
146+
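// The target lookup is a single pass that does not poll for target registration (the load balancer lookup itself still retries with backoff via GetLoadBalancerFromDNSNameDefaultTimeout). To wait until targets are registered,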
147+
// use WaitForLBTargets instead.
83148
func (h *E2ETestHelperAWS) GetLBTargets(ctx context.Context, lbDNSName string, listenerPort, targetPort int32) ([]string, error) {
84-
foundLB, err := h.GetLoadBalancerFromDNSNameWithRetry(ctx, lbDNSName)
149+
foundLB, err := h.GetLoadBalancerFromDNSNameDefaultTimeout(ctx, lbDNSName)
85150
if err != nil {
86151
return nil, fmt.Errorf("failed to get load balancer from DNS name: %v", err)
87152
}
@@ -94,6 +159,7 @@ func (h *E2ETestHelperAWS) GetLBTargets(ctx context.Context, lbDNSName string, l
94159
return nil, fmt.Errorf("failed to describe listeners: %v", err)
95160
}
96161

162+
framework.Logf("Found %d listeners for load balancer, checking for listener port %d", len(listenersOut.Listeners), int32(listenerPort))
97163
targetGroupARNs := map[string]struct{}{}
98164
for _, listener := range listenersOut.Listeners {
99165
if aws.ToInt32(listener.Port) == int32(listenerPort) {
@@ -105,28 +171,79 @@ func (h *E2ETestHelperAWS) GetLBTargets(ctx context.Context, lbDNSName string, l
105171
}
106172
}
107173
}
174+
framework.Logf("Found %d target groups for listener", len(targetGroupARNs))
108175
if len(targetGroupARNs) == 0 {
109176
return nil, fmt.Errorf("no target groups found for LB: %s", lbARN)
110177
}
111178

112179
targets := []string{}
113180
for tgARN := range targetGroupARNs {
181+
framework.Logf("Describing target group %s", tgARN)
114182
tgHealth, err := h.elbv2Client.DescribeTargetHealth(ctx, &elbv2.DescribeTargetHealthInput{
115183
TargetGroupArn: aws.String(tgARN),
116184
})
117185
if err != nil {
118186
return nil, fmt.Errorf("failed to describe target health for TG %s: %v", tgARN, err)
119187
}
188+
framework.Logf("Found %d targets", len(tgHealth.TargetHealthDescriptions))
120189
for _, target := range tgHealth.TargetHealthDescriptions {
121-
if aws.ToInt32(target.Target.Port) == int32(targetPort) {
122-
targets = append(targets, aws.ToString(target.Target.Id))
123-
}
190+
targets = append(targets, aws.ToString(target.Target.Id))
124191
}
125192
}
126193
return targets, nil
127194
}
128195

129-
// GetLoadBalancerFromDNSName describes a load balancers filtered by DNS name.
196+
// WaitForLBTargets polls until the specified load balancer has at least the expected number of targets.
197+
// This uses fixed-interval polling (not exponential backoff) because target registration is
198+
// eventual consistency, not an API failure scenario.
199+
//
200+
// AWS target registration typically takes 30-90 seconds for initial health checks.
201+
// Uses 5-second polling interval as a balance between responsiveness and API call volume.
202+
//
203+
// Example:
204+
//
205+
// // Wait up to 3 minutes for at least 3 targets to be registered
206+
// targets, err := helper.WaitForLBTargets(ctx, dnsName, 80, 8080, 3, 3*time.Minute)
207+
func (h *E2ETestHelperAWS) WaitForLBTargets(ctx context.Context, lbDNSName string, listenerPort, targetPort int32, minTargets int, timeout time.Duration) ([]string, error) {
208+
var targets []string
209+
var lastErr error
210+
211+
framework.Logf("Waiting for LB %s to have at least %d targets (timeout: %v) on port %d", lbDNSName, minTargets, timeout, listenerPort)
212+
213+
// Use fixed-interval polling for state convergence (Kubernetes standard pattern)
214+
// Poll immediately first (true), then every 5 seconds
215+
err := wait.PollUntilContextTimeout(ctx, DefaultTargetPollInterval, timeout, true, func(ctx context.Context) (bool, error) {
216+
var err error
217+
targets, err = h.GetLBTargets(ctx, lbDNSName, listenerPort, targetPort)
218+
if err != nil {
219+
// Log but continue polling - target groups might not be ready yet
220+
framework.Logf("error getting LB targets (will retry): %v", err)
221+
lastErr = err
222+
return false, nil
223+
}
224+
225+
framework.Logf("LB target status: found %d targets, waiting for at least %d", len(targets), minTargets)
226+
227+
if len(targets) >= minTargets {
228+
framework.Logf("Target count satisfied: %d >= %d", len(targets), minTargets)
229+
return true, nil // Success
230+
}
231+
232+
return false, nil // Keep polling
233+
})
234+
235+
if err != nil {
236+
if lastErr != nil {
237+
return targets, fmt.Errorf("timed out waiting for %d targets (last error: %v): %w", minTargets, lastErr, err)
238+
}
239+
return targets, fmt.Errorf("timed out waiting for %d targets (got %d): %w", minTargets, len(targets), err)
240+
}
241+
242+
return targets, nil
243+
}
244+
245+
// GetLoadBalancerFromDNSName performs a single attempt to describe load balancers filtered by DNS name.
246+
// For retry logic with exponential backoff, use GetLoadBalancerFromDNSNameWithBackoff.
130247
func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSName(ctx context.Context, lbDNSName string) (*elbv2types.LoadBalancer, error) {
131248
framework.Logf("describing load balancers with DNS %s", lbDNSName)
132249

@@ -149,10 +266,10 @@ func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSName(ctx context.Context, lbDNS
149266
return nil, fmt.Errorf("no load balancer found with DNS name: %s", lbDNSName)
150267
}
151268

152-
// GetLoadBalancerFromDNSNameWithTimeout describes a load balancers filtered by DNS name with retry using
153-
// exponential backoff.
154-
// AWS API
155-
func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSNameWithTimeout(ctx context.Context, lbDNSName string, timeout time.Duration) (*elbv2types.LoadBalancer, error) {
269+
// GetLoadBalancerFromDNSNameWithBackoff describes a load balancer filtered by DNS name with retry using
270+
// exponential backoff and a custom timeout.
271+
// Use this when you need control over the timeout duration for specific test scenarios.
272+
func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSNameWithBackoff(ctx context.Context, lbDNSName string, timeout time.Duration) (*elbv2types.LoadBalancer, error) {
156273
var foundLB *elbv2types.LoadBalancer
157274

158275
ctx, cancel := context.WithTimeout(ctx, timeout)
@@ -187,10 +304,33 @@ func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSNameWithTimeout(ctx context.Con
187304
return foundLB, nil
188305
}
189306

190-
// GetLoadBalancerFromDNSNameWithRetry describes a load balancers filtered by DNS name with
191-
// default retry values.
192-
// The default timeout is 20 minutes based on the AWS API limits and different regions
193-
// where DNS propagation.
307+
// GetLoadBalancerFromDNSNameDefaultTimeout describes a load balancer filtered by DNS name with
308+
// default retry configuration.
309+
// The default timeout is 20 minutes (configurable via E2E_LB_TIMEOUT_MINUTES env var),
310+
// based on AWS API limits and DNS propagation delays across different regions.
311+
//
312+
// Example: Override default timeout via environment variable:
313+
//
314+
// export E2E_LB_TIMEOUT_MINUTES=30 # Use 30 minutes for slow environments
315+
func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSNameDefaultTimeout(ctx context.Context, lbDNSName string) (*elbv2types.LoadBalancer, error) {
316+
timeout := getDefaultLBTimeout()
317+
return h.GetLoadBalancerFromDNSNameWithBackoff(ctx, lbDNSName, timeout)
318+
}
319+
320+
// getDefaultLBTimeout returns the default load balancer timeout.
321+
// Checks E2E_LB_TIMEOUT_MINUTES environment variable first, falls back to DefaultRetryTimeoutMinutes.
322+
func getDefaultLBTimeout() time.Duration {
323+
if timeoutStr := os.Getenv(EnvLBTimeoutMinutes); timeoutStr != "" {
324+
if timeoutMinutes, err := strconv.Atoi(timeoutStr); err == nil && timeoutMinutes > 0 {
325+
return time.Duration(timeoutMinutes) * time.Minute
326+
}
327+
}
328+
return DefaultRetryTimeoutMinutes * time.Minute
329+
}
330+
331+
// Deprecated: GetLoadBalancerFromDNSNameWithRetry is deprecated.
332+
// Use GetLoadBalancerFromDNSNameDefaultTimeout instead for clarity.
333+
// This function will be removed in a future version.
194334
func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSNameWithRetry(ctx context.Context, lbDNSName string) (*elbv2types.LoadBalancer, error) {
195-
return h.GetLoadBalancerFromDNSNameWithTimeout(ctx, lbDNSName, DefaultRetryTimeoutMinutes*time.Minute)
335+
return h.GetLoadBalancerFromDNSNameDefaultTimeout(ctx, lbDNSName)
196336
}

tests/e2e/loadbalancer.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,9 @@ var _ = Describe("[cloud-provider-aws-e2e] loadbalancer", func() {
119119
}
120120
lbDNS := cfg.svc.Status.LoadBalancer.Ingress[0].Hostname
121121
// TODO expected lbDNS not empty
122-
// TODO expect awshelper not nil
123-
targets, err := cfg.GetAWSHelper().GetLBTargets(ctx, lbDNS, 80, cfg.GetServicePodPort())
124-
framework.ExpectNoError(err, "failed to get LB target count")
122+
123+
targets, err := cfg.GetAWSHelper().WaitForLBTargets(ctx, lbDNS, 80, cfg.GetServicePodPort(), cfg.nodeCount, DefaultTargetWaitTimeout)
124+
framework.ExpectNoError(err, "failed waiting for LB targets to register")
125125
framework.Logf("LB targets: %v", targets)
126126
gomega.Expect(len(targets)).To(gomega.Equal(cfg.nodeCount), "AWS LB target count validation failed")
127127
},

0 commit comments

Comments
 (0)