@@ -18,6 +18,8 @@ package e2e
1818import (
1919 "context"
2020 "fmt"
21+ "os"
22+ "strconv"
2123 "time"
2224
2325 "github.com/aws/aws-sdk-go-v2/aws"
@@ -34,28 +36,89 @@ import (
3436)
3537
const (
	// DefaultRetryTimeoutMinutes is the number of minutes spent retrying a load
	// balancer lookup by DNS name when the E2E_LB_TIMEOUT_MINUTES environment
	// variable does not provide a usable value.
	DefaultRetryTimeoutMinutes = 20

	// DefaultRetryMaxAttempts is the value given to the AWS retryer's
	// MaxAttempts setting when E2ETestHelperAWSOptions.MaxAttempts is unset.
	DefaultRetryMaxAttempts = 10

	// DefaultRetryMaxBackoff is the value given to the AWS retryer's MaxBackoff
	// setting when E2ETestHelperAWSOptions.MaxBackoff is unset.
	DefaultRetryMaxBackoff = time.Second * 30

	// DefaultTargetPollInterval is the fixed delay between two consecutive
	// target checks. Target registration converges eventually, so polling uses
	// a constant interval rather than a growing backoff.
	DefaultTargetPollInterval = time.Second * 5

	// DefaultTargetWaitTimeout is the default upper bound for waiting on target
	// registration. AWS target health checks typically complete within 30-90
	// seconds.
	DefaultTargetWaitTimeout = time.Minute * 3

	// EnvLBTimeoutMinutes is the name of the environment variable that
	// overrides the load balancer lookup timeout, expressed in minutes.
	EnvLBTimeoutMinutes = "E2E_LB_TIMEOUT_MINUTES"
)
3962
// E2ETestHelperAWSOptions holds the tunable retry settings used when building
// an E2ETestHelperAWS. Fields left at their zero value are replaced with the
// package defaults by ApplyDefaults.
type E2ETestHelperAWSOptions struct {
	// MaxAttempts is the attempt limit handed to the AWS API retryer.
	// When unset, DefaultRetryMaxAttempts (10) is used.
	MaxAttempts int

	// MaxBackoff is the longest delay the retryer may wait between attempts.
	// When unset, DefaultRetryMaxBackoff (30 seconds) is used.
	MaxBackoff time.Duration
}
73+
74+ // ApplyDefaults fills in default values for unset options.
75+ func (o * E2ETestHelperAWSOptions ) ApplyDefaults () {
76+ if o .MaxAttempts == 0 {
77+ o .MaxAttempts = DefaultRetryMaxAttempts
78+ }
79+ if o .MaxBackoff == 0 {
80+ o .MaxBackoff = DefaultRetryMaxBackoff
81+ }
82+ }
83+
4084// E2ETestHelperAWS provides AWS API operations for e2e tests.
85+ // AWS SDK v2 clients are safe for concurrent use and do not require explicit cleanup.
86+ // Context cancellation will terminate ongoing API calls.
4187type E2ETestHelperAWS struct {
4288 retryer * retry.Standard
4389 ec2Client * ec2.Client
4490 elbClient * elb.Client
4591 elbv2Client * elbv2.Client
4692}
4793
48- // NewAWSHelper creates a new AWS helper with configured clients
94+ // NewAWSHelper creates a new AWS helper with configured clients using default options.
4995func NewAWSHelper (ctx context.Context ) (* E2ETestHelperAWS , error ) {
96+ return NewAWSHelperWithOptions (ctx , E2ETestHelperAWSOptions {})
97+ }
98+
99+ // NewAWSHelperWithOptions creates a new AWS helper with custom retry configuration.
100+ // Configure custom retryer to handle transient AWS API errors and credential failures.
101+ // AWS API limits for ELB are generous (400 TPS for DescribeLoadBalancers),
102+ // so aggressive retries are safe and necessary for CI stability.
103+ //
104+ // For rate-limited environments or shared test accounts, adjust MaxAttempts and MaxBackoff:
105+ //
106+ // helper, err := NewAWSHelperWithOptions(ctx, E2ETestHelperAWSOptions{
107+ // MaxAttempts: 5,
108+ // MaxBackoff: 10 * time.Second,
109+ // })
110+ func NewAWSHelperWithOptions (ctx context.Context , opts E2ETestHelperAWSOptions ) (* E2ETestHelperAWS , error ) {
111+ opts .ApplyDefaults ()
112+
50113 cfg , err := config .LoadDefaultConfig (ctx )
51114 framework .ExpectNoError (err , "unable to load AWS config" )
115+ if err != nil {
116+ return nil , fmt .Errorf ("unable to load AWS config: %w" , err )
117+ }
52118
53- // Configure custom retryer to handle transient AWS API errors and credential failures.
54- // AWS API limits for ELB are generous (400 TPS for DescribeLoadBalancers),
55- // so aggressive retries are safe and necessary for CI stability.
56119 customRetryer := retry .NewStandard (func (o * retry.StandardOptions ) {
57- o .MaxAttempts = 10 // Handle IMDS timeouts and transient errors
58- o .MaxBackoff = 30 * time . Second // Cap backoff to avoid excessive wait
120+ o .MaxAttempts = opts . MaxAttempts
121+ o .MaxBackoff = opts . MaxBackoff
59122 })
60123
61124 // Create AWS clients with custom retryer
@@ -80,8 +143,10 @@ func (h *E2ETestHelperAWS) GetELBV2Client() *elbv2.Client {
80143}
81144
82145// GetLBTargets returns the targets for a given LB DNS name, listener port, and target port.
146+ // This performs a single check without retry. For waiting until targets are registered,
147+ // use WaitForLBTargets instead.
83148func (h * E2ETestHelperAWS ) GetLBTargets (ctx context.Context , lbDNSName string , listenerPort , targetPort int32 ) ([]string , error ) {
84- foundLB , err := h .GetLoadBalancerFromDNSNameWithRetry (ctx , lbDNSName )
149+ foundLB , err := h .GetLoadBalancerFromDNSNameDefaultTimeout (ctx , lbDNSName )
85150 if err != nil {
86151 return nil , fmt .Errorf ("failed to get load balancer from DNS name: %v" , err )
87152 }
@@ -94,6 +159,7 @@ func (h *E2ETestHelperAWS) GetLBTargets(ctx context.Context, lbDNSName string, l
94159 return nil , fmt .Errorf ("failed to describe listeners: %v" , err )
95160 }
96161
162+ framework .Logf ("Found %d listeners for load balancer, checking for listener port %d" , len (listenersOut .Listeners ), int32 (listenerPort ))
97163 targetGroupARNs := map [string ]struct {}{}
98164 for _ , listener := range listenersOut .Listeners {
99165 if aws .ToInt32 (listener .Port ) == int32 (listenerPort ) {
@@ -105,28 +171,79 @@ func (h *E2ETestHelperAWS) GetLBTargets(ctx context.Context, lbDNSName string, l
105171 }
106172 }
107173 }
174+ framework .Logf ("Found %d target groups for listener" , len (targetGroupARNs ))
108175 if len (targetGroupARNs ) == 0 {
109176 return nil , fmt .Errorf ("no target groups found for LB: %s" , lbARN )
110177 }
111178
112179 targets := []string {}
113180 for tgARN := range targetGroupARNs {
181+ framework .Logf ("Describing target group %s" , tgARN )
114182 tgHealth , err := h .elbv2Client .DescribeTargetHealth (ctx , & elbv2.DescribeTargetHealthInput {
115183 TargetGroupArn : aws .String (tgARN ),
116184 })
117185 if err != nil {
118186 return nil , fmt .Errorf ("failed to describe target health for TG %s: %v" , tgARN , err )
119187 }
188+ framework .Logf ("Found %d targets" , len (tgHealth .TargetHealthDescriptions ))
120189 for _ , target := range tgHealth .TargetHealthDescriptions {
121- if aws .ToInt32 (target .Target .Port ) == int32 (targetPort ) {
122- targets = append (targets , aws .ToString (target .Target .Id ))
123- }
190+ targets = append (targets , aws .ToString (target .Target .Id ))
124191 }
125192 }
126193 return targets , nil
127194}
128195
129- // GetLoadBalancerFromDNSName describes a load balancers filtered by DNS name.
196+ // WaitForLBTargets polls until the specified load balancer has at least the expected number of targets.
197+ // This uses fixed-interval polling (not exponential backoff) because target registration is
198+ // eventual consistency, not an API failure scenario.
199+ //
200+ // AWS target registration typically takes 30-90 seconds for initial health checks.
201+ // Uses 5-second polling interval as a balance between responsiveness and API call volume.
202+ //
203+ // Example:
204+ //
205+ // // Wait up to 3 minutes for at least 3 targets to be registered
206+ // targets, err := helper.WaitForLBTargets(ctx, dnsName, 80, 8080, 3, 3*time.Minute)
207+ func (h * E2ETestHelperAWS ) WaitForLBTargets (ctx context.Context , lbDNSName string , listenerPort , targetPort int32 , minTargets int , timeout time.Duration ) ([]string , error ) {
208+ var targets []string
209+ var lastErr error
210+
211+ framework .Logf ("Waiting for LB %s to have at least %d targets (timeout: %v) on port %d" , lbDNSName , minTargets , timeout , listenerPort )
212+
213+ // Use fixed-interval polling for state convergence (Kubernetes standard pattern)
214+ // Poll immediately first (true), then every 5 seconds
215+ err := wait .PollUntilContextTimeout (ctx , DefaultTargetPollInterval , timeout , true , func (ctx context.Context ) (bool , error ) {
216+ var err error
217+ targets , err = h .GetLBTargets (ctx , lbDNSName , listenerPort , targetPort )
218+ if err != nil {
219+ // Log but continue polling - target groups might not be ready yet
220+ framework .Logf ("error getting LB targets (will retry): %v" , err )
221+ lastErr = err
222+ return false , nil
223+ }
224+
225+ framework .Logf ("LB target status: found %d targets, waiting for at least %d" , len (targets ), minTargets )
226+
227+ if len (targets ) >= minTargets {
228+ framework .Logf ("Target count satisfied: %d >= %d" , len (targets ), minTargets )
229+ return true , nil // Success
230+ }
231+
232+ return false , nil // Keep polling
233+ })
234+
235+ if err != nil {
236+ if lastErr != nil {
237+ return targets , fmt .Errorf ("timed out waiting for %d targets (last error: %v): %w" , minTargets , lastErr , err )
238+ }
239+ return targets , fmt .Errorf ("timed out waiting for %d targets (got %d): %w" , minTargets , len (targets ), err )
240+ }
241+
242+ return targets , nil
243+ }
244+
245+ // GetLoadBalancerFromDNSName performs a single attempt to describe load balancers filtered by DNS name.
246+ // For retry logic with exponential backoff, use GetLoadBalancerFromDNSNameWithBackoff.
130247func (h * E2ETestHelperAWS ) GetLoadBalancerFromDNSName (ctx context.Context , lbDNSName string ) (* elbv2types.LoadBalancer , error ) {
131248 framework .Logf ("describing load balancers with DNS %s" , lbDNSName )
132249
@@ -149,10 +266,10 @@ func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSName(ctx context.Context, lbDNS
149266 return nil , fmt .Errorf ("no load balancer found with DNS name: %s" , lbDNSName )
150267}
151268
152- // GetLoadBalancerFromDNSNameWithTimeout describes a load balancers filtered by DNS name with retry using
153- // exponential backoff.
154- // AWS API
155- func (h * E2ETestHelperAWS ) GetLoadBalancerFromDNSNameWithTimeout (ctx context.Context , lbDNSName string , timeout time.Duration ) (* elbv2types.LoadBalancer , error ) {
269+ // GetLoadBalancerFromDNSNameWithBackoff describes a load balancer filtered by DNS name with retry using
270+ // exponential backoff and a custom timeout .
271+ // Use this when you need control over the timeout duration for specific test scenarios.
272+ func (h * E2ETestHelperAWS ) GetLoadBalancerFromDNSNameWithBackoff (ctx context.Context , lbDNSName string , timeout time.Duration ) (* elbv2types.LoadBalancer , error ) {
156273 var foundLB * elbv2types.LoadBalancer
157274
158275 ctx , cancel := context .WithTimeout (ctx , timeout )
@@ -187,10 +304,33 @@ func (h *E2ETestHelperAWS) GetLoadBalancerFromDNSNameWithTimeout(ctx context.Con
187304 return foundLB , nil
188305}
189306
190- // GetLoadBalancerFromDNSNameWithRetry describes a load balancers filtered by DNS name with
191- // default retry values.
192- // The default timeout is 20 minutes based on the AWS API limits and different regions
193- // where DNS propagation.
307+ // GetLoadBalancerFromDNSNameDefaultTimeout describes a load balancer filtered by DNS name with
308+ // default retry configuration.
309+ // The default timeout is 20 minutes (configurable via E2E_LB_TIMEOUT_MINUTES env var),
310+ // based on AWS API limits and DNS propagation delays across different regions.
311+ //
312+ // Example: Override default timeout via environment variable:
313+ //
314+ // export E2E_LB_TIMEOUT_MINUTES=30 # Use 30 minutes for slow environments
315+ func (h * E2ETestHelperAWS ) GetLoadBalancerFromDNSNameDefaultTimeout (ctx context.Context , lbDNSName string ) (* elbv2types.LoadBalancer , error ) {
316+ timeout := getDefaultLBTimeout ()
317+ return h .GetLoadBalancerFromDNSNameWithBackoff (ctx , lbDNSName , timeout )
318+ }
319+
320+ // getDefaultLBTimeout returns the default load balancer timeout.
321+ // Checks E2E_LB_TIMEOUT_MINUTES environment variable first, falls back to DefaultRetryTimeoutMinutes.
322+ func getDefaultLBTimeout () time.Duration {
323+ if timeoutStr := os .Getenv (EnvLBTimeoutMinutes ); timeoutStr != "" {
324+ if timeoutMinutes , err := strconv .Atoi (timeoutStr ); err == nil && timeoutMinutes > 0 {
325+ return time .Duration (timeoutMinutes ) * time .Minute
326+ }
327+ }
328+ return DefaultRetryTimeoutMinutes * time .Minute
329+ }
330+
331+ // Deprecated: GetLoadBalancerFromDNSNameWithRetry is deprecated.
332+ // Use GetLoadBalancerFromDNSNameDefaultTimeout instead for clarity.
333+ // This function will be removed in a future version.
194334func (h * E2ETestHelperAWS ) GetLoadBalancerFromDNSNameWithRetry (ctx context.Context , lbDNSName string ) (* elbv2types.LoadBalancer , error ) {
195- return h .GetLoadBalancerFromDNSNameWithTimeout (ctx , lbDNSName , DefaultRetryTimeoutMinutes * time . Minute )
335+ return h .GetLoadBalancerFromDNSNameDefaultTimeout (ctx , lbDNSName )
196336}
0 commit comments