diff --git a/changelogs/unreleased/9646-kaovilai b/changelogs/unreleased/9646-kaovilai new file mode 100644 index 0000000000..f7599e912c --- /dev/null +++ b/changelogs/unreleased/9646-kaovilai @@ -0,0 +1 @@ +Fix backup-finalizer: do not set backup phase to Completed before PutBackupMetadata succeeds diff --git a/pkg/controller/backup_finalizer_controller.go b/pkg/controller/backup_finalizer_controller.go index b24c132fa9..aaf9fc8b9a 100644 --- a/pkg/controller/backup_finalizer_controller.go +++ b/pkg/controller/backup_finalizer_controller.go @@ -202,25 +202,31 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } backupScheduleName := backupRequest.GetLabels()[velerov1api.ScheduleNameLabel] + + // Determine the final phase and completion timestamp, but do NOT set them + // on the in-memory backup object yet. We first need to upload metadata and + // contents to object storage. If the upload fails, the deferred patch must + // NOT write a terminal phase to the API server so the controller can retry. + var finalPhase velerov1api.BackupPhase switch backup.Status.Phase { case velerov1api.BackupPhaseFinalizing: - backup.Status.Phase = velerov1api.BackupPhaseCompleted - r.metrics.RegisterBackupSuccess(backupScheduleName) - r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc) + finalPhase = velerov1api.BackupPhaseCompleted case velerov1api.BackupPhaseFinalizingPartiallyFailed: - backup.Status.Phase = velerov1api.BackupPhasePartiallyFailed - r.metrics.RegisterBackupPartialFailure(backupScheduleName) - r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) + finalPhase = velerov1api.BackupPhasePartiallyFailed } - backup.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()} - backup.Status.CSIVolumeSnapshotsCompleted = updateCSIVolumeSnapshotsCompleted(operations) + completionTimestamp := &metav1.Time{Time: r.clock.Now()} + csiVolumeSnapshotsCompleted := updateCSIVolumeSnapshotsCompleted(operations) - recordBackupMetrics(log, backup, outBackupFile, r.metrics, true) + // Encode backup JSON with the final phase for object storage, so that the + // metadata in storage reflects the completed state. + backupForUpload := backup.DeepCopy() + backupForUpload.Status.Phase = finalPhase + backupForUpload.Status.CompletionTimestamp = completionTimestamp + backupForUpload.Status.CSIVolumeSnapshotsCompleted = csiVolumeSnapshotsCompleted - // update backup metadata in object store backupJSON := new(bytes.Buffer) - if err := encode.To(backup, "json", backupJSON); err != nil { + if err := encode.To(backupForUpload, "json", backupJSON); err != nil { return ctrl.Result{}, errors.Wrap(err, "error encoding backup json") } err = backupStore.PutBackupMetadata(backup.Name, backupJSON) @@ -233,6 +239,24 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents") } } + + // Uploads succeeded — now safe to set the final phase on the in-memory + // backup object so the deferred patch writes it to the API server. + backup.Status.Phase = finalPhase + backup.Status.CompletionTimestamp = completionTimestamp + backup.Status.CSIVolumeSnapshotsCompleted = csiVolumeSnapshotsCompleted + + switch finalPhase { + case velerov1api.BackupPhaseCompleted: + r.metrics.RegisterBackupSuccess(backupScheduleName) + r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc) + case velerov1api.BackupPhasePartiallyFailed: + r.metrics.RegisterBackupPartialFailure(backupScheduleName) + r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) + } + + recordBackupMetrics(log, backup, outBackupFile, r.metrics, true) + return ctrl.Result{}, nil } diff --git a/pkg/controller/backup_finalizer_controller_test.go b/pkg/controller/backup_finalizer_controller_test.go index ade3ce2cd7..32c0812a73 100644 --- a/pkg/controller/backup_finalizer_controller_test.go +++ b/pkg/controller/backup_finalizer_controller_test.go @@ -18,6 +18,7 @@ package controller import ( "bytes" + "fmt" "io" "testing" "time" @@ -39,8 +40,10 @@ import ( "github.com/vmware-tanzu/velero/pkg/itemoperation" "github.com/vmware-tanzu/velero/pkg/kuberesource" "github.com/vmware-tanzu/velero/pkg/metrics" + persistencemocks "github.com/vmware-tanzu/velero/pkg/persistence/mocks" "github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt" "github.com/vmware-tanzu/velero/pkg/plugin/framework" + pluginmocks "github.com/vmware-tanzu/velero/pkg/plugin/mocks" "github.com/vmware-tanzu/velero/pkg/plugin/velero" velerotest "github.com/vmware-tanzu/velero/pkg/test" ) @@ -242,3 +245,151 @@ func TestBackupFinalizerReconcile(t *testing.T) { }) } } + +func TestBackupFinalizerReconcile_PutBackupMetadataFail(t *testing.T) { + tests := []struct { + name string + initialPhase velerov1api.BackupPhase + }{ + { + name: "Finalizing backup stays Finalizing when PutBackupMetadata fails", + initialPhase: velerov1api.BackupPhaseFinalizing, + }, + { + name: "FinalizingPartiallyFailed backup stays FinalizingPartiallyFailed when PutBackupMetadata fails", + initialPhase: velerov1api.BackupPhaseFinalizingPartiallyFailed, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + fakeClock := testclocks.NewFakeClock(time.Now()) + defaultBackupLocation := builder.ForBackupStorageLocation(velerov1api.DefaultNamespace, "default").Result() + + backup := builder.ForBackup(velerov1api.DefaultNamespace, "backup-meta-fail"). + StorageLocation("default"). + ObjectMeta(builder.WithUID("foo")). + StartTimestamp(fakeClock.Now()). + Phase(test.initialPhase).Result() + + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + fakeGlobalClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + + // Use local mocks to avoid interference with other tests + localPluginManager := &pluginmocks.Manager{} + localBackupStore := &persistencemocks.BackupStore{} + + backupper := new(fakeBackupper) + reconciler := NewBackupFinalizerReconciler( + fakeClient, + fakeGlobalClient, + fakeClock, + backupper, + func(logrus.FieldLogger) clientmgmt.Manager { return localPluginManager }, + NewBackupTracker(), + NewFakeSingleObjectBackupStoreGetter(localBackupStore), + logrus.StandardLogger(), + metrics.NewServerMetrics(), + 10*time.Minute, + ) + + localPluginManager.On("CleanupClients").Return(nil) + localBackupStore.On("GetBackupItemOperations", backup.Name).Return(nil, nil) + // PutBackupMetadata fails + localBackupStore.On("PutBackupMetadata", mock.Anything, mock.Anything).Return(fmt.Errorf("object lock prevented upload")) + localBackupStore.On("GetBackupVolumeInfos", mock.Anything).Return(nil, nil) + localBackupStore.On("PutBackupVolumeInfos", mock.Anything, mock.Anything).Return(nil) + + _, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}}) + assert.Error(t, err, "reconcile should return error when PutBackupMetadata fails") + + backupAfter := velerov1api.Backup{} + err = fakeClient.Get(t.Context(), types.NamespacedName{ + Namespace: backup.Namespace, + Name: backup.Name, + }, &backupAfter) + require.NoError(t, err) + assert.Equal(t, test.initialPhase, backupAfter.Status.Phase, + "backup phase should remain %s when PutBackupMetadata fails", test.initialPhase) + assert.Nil(t, backupAfter.Status.CompletionTimestamp, + "CompletionTimestamp should not be set when PutBackupMetadata fails") + }) + } +} + +func TestBackupFinalizerReconcile_PutBackupContentsFail(t *testing.T) { + fakeClock := testclocks.NewFakeClock(time.Now()) + metav1Now := metav1.NewTime(fakeClock.Now()) + defaultBackupLocation := builder.ForBackupStorageLocation(velerov1api.DefaultNamespace, "default").Result() + + backup := builder.ForBackup(velerov1api.DefaultNamespace, "backup-contents-fail"). + StorageLocation("default"). + ObjectMeta(builder.WithUID("foo")). + StartTimestamp(fakeClock.Now()). + Phase(velerov1api.BackupPhaseFinalizing).Result() + + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + fakeGlobalClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + + localPluginManager := &pluginmocks.Manager{} + localBackupStore := &persistencemocks.BackupStore{} + + backupper := new(fakeBackupper) + reconciler := NewBackupFinalizerReconciler( + fakeClient, + fakeGlobalClient, + fakeClock, + backupper, + func(logrus.FieldLogger) clientmgmt.Manager { return localPluginManager }, + NewBackupTracker(), + NewFakeSingleObjectBackupStoreGetter(localBackupStore), + logrus.StandardLogger(), + metrics.NewServerMetrics(), + 10*time.Minute, + ) + + operations := []*itemoperation.BackupOperation{ + { + Spec: itemoperation.BackupOperationSpec{ + BackupName: "backup-contents-fail", + BackupUID: "foo", + BackupItemAction: "foo", + ResourceIdentifier: velero.ResourceIdentifier{ + GroupResource: kuberesource.Pods, + Namespace: "ns-1", + Name: "pod-1", + }, + OperationID: "operation-1", + }, + Status: itemoperation.OperationStatus{ + Phase: itemoperation.OperationPhaseCompleted, + Created: &metav1Now, + }, + }, + } + + localPluginManager.On("CleanupClients").Return(nil) + localPluginManager.On("GetBackupItemActionsV2").Return(nil, nil) + localBackupStore.On("GetBackupItemOperations", backup.Name).Return(operations, nil) + localBackupStore.On("GetBackupContents", mock.Anything).Return(io.NopCloser(bytes.NewReader([]byte("hello world"))), nil) + localBackupStore.On("PutBackupMetadata", mock.Anything, mock.Anything).Return(nil) + // PutBackupContents fails + localBackupStore.On("PutBackupContents", mock.Anything, mock.Anything).Return(fmt.Errorf("object lock prevented upload")) + localBackupStore.On("GetBackupVolumeInfos", mock.Anything).Return(nil, nil) + localBackupStore.On("PutBackupVolumeInfos", mock.Anything, mock.Anything).Return(nil) + backupper.On("FinalizeBackup", mock.Anything, mock.Anything, mock.Anything, mock.Anything, framework.BackupItemActionResolverV2{}, mock.Anything, mock.Anything).Return(nil) + + _, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}}) + assert.Error(t, err, "reconcile should return error when PutBackupContents fails") + + backupAfter := velerov1api.Backup{} + err = fakeClient.Get(t.Context(), types.NamespacedName{ + Namespace: backup.Namespace, + Name: backup.Name, + }, &backupAfter) + require.NoError(t, err) + assert.Equal(t, velerov1api.BackupPhaseFinalizing, backupAfter.Status.Phase, + "backup phase should remain Finalizing when PutBackupContents fails") + assert.Nil(t, backupAfter.Status.CompletionTimestamp, + "CompletionTimestamp should not be set when PutBackupContents fails") +}