Skip to content

Error status handler improvements #742

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 8, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions docs/documentation/features.md
Original file line number Diff line number Diff line change
@@ -166,19 +166,21 @@ A successful execution resets the retry.

### Setting Error Status After Last Retry Attempt

In order to facilitate error reporting in case a last retry attempt fails, Reconciler can implement the following
In order to facilitate error reporting Reconciler can implement the following
[interface](https://github.com/java-operator-sdk/java-operator-sdk/blob/main/operator-framework-core/src/main/java/io/javaoperatorsdk/operator/api/reconciler/ErrorStatusHandler.java):

```java
public interface ErrorStatusHandler<T extends CustomResource<?, ?>> {
public interface ErrorStatusHandler<T extends HasMetadata> {

T updateErrorStatus(T resource, RuntimeException e);
Optional<T> updateErrorStatus(T resource, RetryInfo retryInfo, RuntimeException e);

}
```

The `updateErrorStatus` resource is called when it's the last retry attempt according the retry configuration and the
reconciler execution still resulted in a runtime exception.
The `updateErrorStatus` method is called in case an exception is thrown from the reconciler. It is also called when
there is no retry configured, just after the reconciler execution.
In the first call the `RetryInfo.getAttemptCount()` is always zero, since it is not a result of a retry
(regardless if retry is configured or not).

The result of the method call is used to make a status update on the custom resource. This is always a sub-resource
update request, so no update on custom resource itself (like spec of metadata) happens. Note that this update request
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package io.javaoperatorsdk.operator.api.reconciler;

import java.util.Optional;

import io.fabric8.kubernetes.api.model.HasMetadata;

public interface ErrorStatusHandler<T extends HasMetadata> {
@@ -19,9 +21,10 @@ public interface ErrorStatusHandler<T extends HasMetadata> {
* should not be updates on custom resource after it is marked for deletion.
*
* @param resource to update the status on
* @param retryInfo
* @param e exception thrown from the reconciler
* @return the updated resource
*/
T updateErrorStatus(T resource, RuntimeException e);
Optional<T> updateErrorStatus(T resource, RetryInfo retryInfo, RuntimeException e);

}
Original file line number Diff line number Diff line change
@@ -11,13 +11,7 @@
import io.fabric8.kubernetes.client.dsl.Resource;
import io.javaoperatorsdk.operator.api.ObservedGenerationAware;
import io.javaoperatorsdk.operator.api.config.ControllerConfiguration;
import io.javaoperatorsdk.operator.api.reconciler.BaseControl;
import io.javaoperatorsdk.operator.api.reconciler.Context;
import io.javaoperatorsdk.operator.api.reconciler.DefaultContext;
import io.javaoperatorsdk.operator.api.reconciler.DeleteControl;
import io.javaoperatorsdk.operator.api.reconciler.ErrorStatusHandler;
import io.javaoperatorsdk.operator.api.reconciler.Reconciler;
import io.javaoperatorsdk.operator.api.reconciler.UpdateControl;
import io.javaoperatorsdk.operator.api.reconciler.*;
import io.javaoperatorsdk.operator.processing.Controller;

import static io.javaoperatorsdk.operator.processing.KubernetesResourceUtils.getName;
@@ -114,7 +108,7 @@ private PostExecutionControl<R> handleReconcile(
cloneResourceForErrorStatusHandlerIfNeeded(originalResource, context);
return reconcileExecution(executionScope, resourceForExecution, originalResource, context);
} catch (RuntimeException e) {
handleLastAttemptErrorStatusHandler(originalResource, context, e);
handleErrorStatusHandler(originalResource, context, e);
throw e;
}
}
@@ -128,7 +122,7 @@ private PostExecutionControl<R> handleReconcile(
* place for status update.
*/
private R cloneResourceForErrorStatusHandlerIfNeeded(R resource, Context context) {
if (isLastAttemptOfRetryAndErrorStatusHandlerPresent(context) ||
if (isErrorStatusHandlerPresent() ||
shouldUpdateObservedGenerationAutomatically(resource)) {
return configuration().getConfigurationService().getResourceCloner().clone(resource);
} else {
@@ -164,26 +158,32 @@ && shouldUpdateObservedGenerationAutomatically(resourceForExecution)) {
return createPostExecutionControl(updatedCustomResource, updateControl);
}

private void handleLastAttemptErrorStatusHandler(R resource, Context context,
private void handleErrorStatusHandler(R resource, Context context,
RuntimeException e) {
if (isLastAttemptOfRetryAndErrorStatusHandlerPresent(context)) {
if (isErrorStatusHandlerPresent()) {
try {
var retryInfo = context.getRetryInfo().orElse(new RetryInfo() {
@Override
public int getAttemptCount() {
return 0;
}

@Override
public boolean isLastAttempt() {
return controller.getConfiguration().getRetryConfiguration() == null;
}
});
var updatedResource = ((ErrorStatusHandler<R>) controller.getReconciler())
.updateErrorStatus(resource, e);
customResourceFacade.updateStatus(updatedResource);
.updateErrorStatus(resource, retryInfo, e);
updatedResource.ifPresent(customResourceFacade::updateStatus);
} catch (RuntimeException ex) {
log.error("Error during error status handling.", ex);
}
}
}

private boolean isLastAttemptOfRetryAndErrorStatusHandlerPresent(Context context) {
if (context.getRetryInfo().isPresent()) {
return context.getRetryInfo().get().isLastAttempt()
&& controller.getReconciler() instanceof ErrorStatusHandler;
} else {
return false;
}
private boolean isErrorStatusHandlerPresent() {
return controller.getReconciler() instanceof ErrorStatusHandler;
}

private R updateStatusGenerationAware(R resource) {
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package io.javaoperatorsdk.operator.processing.event;

import java.util.ArrayList;
import java.util.Optional;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -11,6 +12,7 @@
import io.fabric8.kubernetes.api.model.ObjectMeta;
import io.fabric8.kubernetes.client.CustomResource;
import io.javaoperatorsdk.operator.TestUtils;
import io.javaoperatorsdk.operator.api.config.Cloner;
import io.javaoperatorsdk.operator.api.config.ConfigurationService;
import io.javaoperatorsdk.operator.api.config.ControllerConfiguration;
import io.javaoperatorsdk.operator.api.monitoring.Metrics;
@@ -56,9 +58,12 @@ private <R extends HasMetadata> ReconciliationDispatcher<R> init(R customResourc
when(configuration.getName()).thenReturn("EventDispatcherTestController");
when(configService.getMetrics()).thenReturn(Metrics.NOOP);
when(configuration.getConfigurationService()).thenReturn(configService);
when(configService.getResourceCloner()).thenReturn(ConfigurationService.DEFAULT_CLONER);
when(reconciler.reconcile(eq(customResource), any()))
.thenReturn(UpdateControl.updateResource(customResource));
when(configService.getResourceCloner()).thenReturn(new Cloner() {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this changed?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make sure to make easier to do when(..).thenReturn() with mockito. Since otherwise its a different instance.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but then you're not really testing the actual behavior?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be understood as a form of a mocking of that clone method / cloner. The goal here is not to test the cloner but only if it is called in case it's relevant.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand but the problem is what if the behavior changes when the instance is not actually cloned?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually not, is just what instance we move around, for some special cases.

@Override
public <R extends HasMetadata> R clone(R object) {
return object;
}
});
when(reconciler.cleanup(eq(customResource), any()))
.thenReturn(DeleteControl.defaultDelete());
when(customResourceFacade.replaceWithLock(any())).thenReturn(null);
@@ -351,9 +356,9 @@ void callErrorStatusHandlerIfImplemented() {

when(reconciler.reconcile(any(), any()))
.thenThrow(new IllegalStateException("Error Status Test"));
when(((ErrorStatusHandler) reconciler).updateErrorStatus(any(), any())).then(a -> {
when(((ErrorStatusHandler) reconciler).updateErrorStatus(any(), any(), any())).then(a -> {
testCustomResource.getStatus().setConfigMapStatus(ERROR_MESSAGE);
return testCustomResource;
return Optional.of(testCustomResource);
});

reconciliationDispatcher.handleExecution(
@@ -373,7 +378,25 @@ public boolean isLastAttempt() {

verify(customResourceFacade, times(1)).updateStatus(testCustomResource);
verify(((ErrorStatusHandler) reconciler), times(1)).updateErrorStatus(eq(testCustomResource),
any());
any(), any());
}

@Test
void callErrorStatusHandlerEvenOnFirstError() {
testCustomResource.addFinalizer(DEFAULT_FINALIZER);

when(reconciler.reconcile(any(), any()))
.thenThrow(new IllegalStateException("Error Status Test"));
when(((ErrorStatusHandler) reconciler).updateErrorStatus(any(), any(), any())).then(a -> {
testCustomResource.getStatus().setConfigMapStatus(ERROR_MESSAGE);
return Optional.of(testCustomResource);
});
reconciliationDispatcher.handleExecution(
new ExecutionScope(
testCustomResource, null));
verify(customResourceFacade, times(1)).updateStatus(testCustomResource);
verify(((ErrorStatusHandler) reconciler), times(1)).updateErrorStatus(eq(testCustomResource),
any(), any());
}

private ObservedGenCustomResource createObservedGenCustomResource() {
Original file line number Diff line number Diff line change
@@ -17,13 +17,15 @@

public class ErrorStatusHandlerIT {

public static final int MAX_RETRY_ATTEMPTS = 3;
ErrorStatusHandlerTestReconciler reconciler = new ErrorStatusHandlerTestReconciler();

@RegisterExtension
OperatorExtension operator =
OperatorExtension.builder()
.withConfigurationService(DefaultConfigurationService.instance())
.withReconciler(reconciler, new GenericRetry().setMaxAttempts(3).withLinearRetry())
.withReconciler(reconciler,
new GenericRetry().setMaxAttempts(MAX_RETRY_ATTEMPTS).withLinearRetry())
.build();

@Test
@@ -32,16 +34,18 @@ public void testErrorMessageSetEventually() {
operator.create(ErrorStatusHandlerTestCustomResource.class, createCustomResource());

await()
.atMost(15, TimeUnit.SECONDS)
.pollInterval(1L, TimeUnit.SECONDS)
.atMost(10, TimeUnit.SECONDS)
.pollInterval(250, TimeUnit.MICROSECONDS)
.untilAsserted(
() -> {
ErrorStatusHandlerTestCustomResource res =
operator.get(ErrorStatusHandlerTestCustomResource.class,
resource.getMetadata().getName());
assertThat(res.getStatus()).isNotNull();
assertThat(res.getStatus().getMessage())
.isEqualTo(ErrorStatusHandlerTestReconciler.ERROR_STATUS_MESSAGE);
for (int i = 0; i < MAX_RETRY_ATTEMPTS + 1; i++) {
assertThat(res.getStatus().getMessages())
.contains(ErrorStatusHandlerTestReconciler.ERROR_STATUS_MESSAGE + i);
}
});
}

Original file line number Diff line number Diff line change
@@ -20,15 +20,18 @@

public class RetryIT {
public static final int RETRY_INTERVAL = 150;
public static final int MAX_RETRY_ATTEMPTS = 5;

public static final int NUMBER_FAILED_EXECUTIONS = 3;

@RegisterExtension
OperatorExtension operator =
OperatorExtension.builder()
.withConfigurationService(DefaultConfigurationService.instance())
.withReconciler(
new RetryTestCustomReconciler(),
new RetryTestCustomReconciler(NUMBER_FAILED_EXECUTIONS),
new GenericRetry().setInitialInterval(RETRY_INTERVAL).withLinearRetry()
.setMaxAttempts(5))
.setMaxAttempts(MAX_RETRY_ATTEMPTS))
.build();


@@ -40,7 +43,7 @@ public void retryFailedExecution() {

await("cr status updated")
.pollDelay(
RETRY_INTERVAL * (RetryTestCustomReconciler.NUMBER_FAILED_EXECUTIONS + 2),
RETRY_INTERVAL * (NUMBER_FAILED_EXECUTIONS + 2),
TimeUnit.MILLISECONDS)
.pollInterval(
RETRY_INTERVAL,
@@ -49,7 +52,7 @@ public void retryFailedExecution() {
.untilAsserted(() -> {
assertThat(
TestUtils.getNumberOfExecutions(operator))
.isEqualTo(RetryTestCustomReconciler.NUMBER_FAILED_EXECUTIONS + 1);
.isEqualTo(NUMBER_FAILED_EXECUTIONS + 1);

RetryTestCustomResource finalResource =
operator.get(RetryTestCustomResource.class,
@@ -59,7 +62,7 @@ public void retryFailedExecution() {
});
}

public RetryTestCustomResource createTestCustomResource(String id) {
public static RetryTestCustomResource createTestCustomResource(String id) {
RetryTestCustomResource resource = new RetryTestCustomResource();
resource.setMetadata(
new ObjectMetaBuilder()
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package io.javaoperatorsdk.operator;

import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.RegisterExtension;

import io.javaoperatorsdk.operator.config.runtime.DefaultConfigurationService;
import io.javaoperatorsdk.operator.junit.OperatorExtension;
import io.javaoperatorsdk.operator.processing.retry.GenericRetry;
import io.javaoperatorsdk.operator.sample.retry.RetryTestCustomReconciler;
import io.javaoperatorsdk.operator.sample.retry.RetryTestCustomResource;

import static io.javaoperatorsdk.operator.RetryIT.createTestCustomResource;
import static org.assertj.core.api.Assertions.assertThat;

public class RetryMaxAttemptIT {

public static final int MAX_RETRY_ATTEMPTS = 3;
public static final int RETRY_INTERVAL = 100;
public static final int ALL_EXECUTION_TO_FAIL = 99;

RetryTestCustomReconciler reconciler = new RetryTestCustomReconciler(ALL_EXECUTION_TO_FAIL);

@RegisterExtension
OperatorExtension operator =
OperatorExtension.builder()
.withConfigurationService(DefaultConfigurationService.instance())
.withReconciler(reconciler,
new GenericRetry().setInitialInterval(RETRY_INTERVAL).withLinearRetry()
.setMaxAttempts(MAX_RETRY_ATTEMPTS))
.build();


@Test
public void retryFailedExecution() throws InterruptedException {
RetryTestCustomResource resource = createTestCustomResource("max-retry");

operator.create(RetryTestCustomResource.class, resource);

Thread.sleep((MAX_RETRY_ATTEMPTS + 2) * RETRY_INTERVAL);
assertThat(reconciler.getNumberOfExecutions()).isEqualTo(4);
}
}
Original file line number Diff line number Diff line change
@@ -1,15 +1,21 @@
package io.javaoperatorsdk.operator.sample.errorstatushandler;

import java.util.ArrayList;
import java.util.List;

public class ErrorStatusHandlerTestCustomResourceStatus {

private String message;
private List<String> messages;

public String getMessage() {
return message;
public List<String> getMessages() {
if (messages == null) {
messages = new ArrayList<>();
}
return messages;
}

public ErrorStatusHandlerTestCustomResourceStatus setMessage(String message) {
this.message = message;
public ErrorStatusHandlerTestCustomResourceStatus setMessages(List<String> messages) {
this.messages = messages;
return this;
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package io.javaoperatorsdk.operator.sample.errorstatushandler;

import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;

import org.slf4j.Logger;
@@ -45,11 +46,11 @@ public int getNumberOfExecutions() {
}

@Override
public ErrorStatusHandlerTestCustomResource updateErrorStatus(
ErrorStatusHandlerTestCustomResource resource, RuntimeException e) {
public Optional<ErrorStatusHandlerTestCustomResource> updateErrorStatus(
ErrorStatusHandlerTestCustomResource resource, RetryInfo retryInfo, RuntimeException e) {
log.info("Setting status.");
ensureStatusExists(resource);
resource.getStatus().setMessage(ERROR_STATUS_MESSAGE);
return resource;
resource.getStatus().getMessages().add(ERROR_STATUS_MESSAGE + retryInfo.getAttemptCount());
return Optional.of(resource);
}
}
Original file line number Diff line number Diff line change
@@ -17,15 +17,19 @@
public class RetryTestCustomReconciler
implements Reconciler<RetryTestCustomResource>, TestExecutionInfoProvider {

public static final int NUMBER_FAILED_EXECUTIONS = 2;

public static final String FINALIZER_NAME =
ControllerUtils.getDefaultFinalizerName(
CustomResource.getCRDName(RetryTestCustomResource.class));
private static final Logger log =
LoggerFactory.getLogger(RetryTestCustomReconciler.class);
private final AtomicInteger numberOfExecutions = new AtomicInteger(0);

private int numberOfExecutionFails;

public RetryTestCustomReconciler(int numberOfExecutionFails) {
this.numberOfExecutionFails = numberOfExecutionFails;
}

@Override
public UpdateControl<RetryTestCustomResource> reconcile(
RetryTestCustomResource resource, Context context) {
@@ -36,7 +40,7 @@ public UpdateControl<RetryTestCustomResource> reconcile(
}
log.info("Value: " + resource.getSpec().getValue());

if (numberOfExecutions.get() < NUMBER_FAILED_EXECUTIONS + 1) {
if (numberOfExecutions.get() < numberOfExecutionFails + 1) {
throw new RuntimeException("Testing Retry");
}
if (context.getRetryInfo().isEmpty() || context.getRetryInfo().get().isLastAttempt()) {
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
@@ -173,8 +174,9 @@ private <T> T loadYaml(Class<T> clazz, String yaml) {
}

@Override
public WebPage updateErrorStatus(WebPage resource, RuntimeException e) {
public Optional<WebPage> updateErrorStatus(WebPage resource, RetryInfo retryInfo,
RuntimeException e) {
resource.getStatus().setErrorMessage("Error: " + e.getMessage());
return resource;
return Optional.of(resource);
}
}