From dfcb6120218faa10410da75c2aff64c36a25f1fe Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Tue, 24 Mar 2026 18:19:59 -0400 Subject: [PATCH 1/2] Fix backup-finalizer: do not set backup phase to Completed before PutBackupMetadata succeeds Previously, the backup finalizer controller set backup.Status.Phase to Completed/PartiallyFailed in-memory BEFORE calling PutBackupMetadata and PutBackupContents. When these uploads failed (e.g., due to object lock or immutability), the deferred patch function still wrote the terminal phase to the Kubernetes API server, preventing the controller from retrying the upload on the next reconcile. This fix moves the phase assignment to AFTER both uploads succeed. A DeepCopy of the backup is used to encode the JSON with the final phase for object storage, while the in-memory backup object retains the Finalizing phase until uploads complete. Caveats: - CompletionTimestamp is now captured before upload but only committed to the API server after upload succeeds. On retry after a transient failure, a new timestamp is generated, so the completion time reflects when the upload finally succeeded rather than when finalization processing completed. - Metrics (RegisterBackupSuccess/RegisterBackupPartialFailure) are now recorded after uploads succeed, so they accurately reflect only fully persisted backups. - The metadata uploaded to object storage contains the final phase and completion timestamp via DeepCopy, so storage state is correct even before the API server is patched. Fixes #9645 Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy Signed-off-by: Tiger Kaovilai --- pkg/controller/backup_finalizer_controller.go | 46 ++++-- .../backup_finalizer_controller_test.go | 151 ++++++++++++++++++ 2 files changed, 186 insertions(+), 11 deletions(-) diff --git a/pkg/controller/backup_finalizer_controller.go b/pkg/controller/backup_finalizer_controller.go index b24c132fa9..aaf9fc8b9a 100644 --- a/pkg/controller/backup_finalizer_controller.go +++ b/pkg/controller/backup_finalizer_controller.go @@ -202,25 +202,31 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ } } backupScheduleName := backupRequest.GetLabels()[velerov1api.ScheduleNameLabel] + + // Determine the final phase and completion timestamp, but do NOT set them + // on the in-memory backup object yet. We first need to upload metadata and + // contents to object storage. If the upload fails, the deferred patch must + // NOT write a terminal phase to the API server so the controller can retry. + var finalPhase velerov1api.BackupPhase switch backup.Status.Phase { case velerov1api.BackupPhaseFinalizing: - backup.Status.Phase = velerov1api.BackupPhaseCompleted - r.metrics.RegisterBackupSuccess(backupScheduleName) - r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc) + finalPhase = velerov1api.BackupPhaseCompleted case velerov1api.BackupPhaseFinalizingPartiallyFailed: - backup.Status.Phase = velerov1api.BackupPhasePartiallyFailed - r.metrics.RegisterBackupPartialFailure(backupScheduleName) - r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) + finalPhase = velerov1api.BackupPhasePartiallyFailed } - backup.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()} - backup.Status.CSIVolumeSnapshotsCompleted = updateCSIVolumeSnapshotsCompleted(operations) + completionTimestamp := &metav1.Time{Time: r.clock.Now()} + csiVolumeSnapshotsCompleted := updateCSIVolumeSnapshotsCompleted(operations) - recordBackupMetrics(log, backup, outBackupFile, r.metrics, true) + // Encode backup JSON with the final phase for object storage, so that the + // metadata in storage reflects the completed state. + backupForUpload := backup.DeepCopy() + backupForUpload.Status.Phase = finalPhase + backupForUpload.Status.CompletionTimestamp = completionTimestamp + backupForUpload.Status.CSIVolumeSnapshotsCompleted = csiVolumeSnapshotsCompleted - // update backup metadata in object store backupJSON := new(bytes.Buffer) - if err := encode.To(backup, "json", backupJSON); err != nil { + if err := encode.To(backupForUpload, "json", backupJSON); err != nil { return ctrl.Result{}, errors.Wrap(err, "error encoding backup json") } err = backupStore.PutBackupMetadata(backup.Name, backupJSON) @@ -233,6 +239,24 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents") } } + + // Uploads succeeded — now safe to set the final phase on the in-memory + // backup object so the deferred patch writes it to the API server. + backup.Status.Phase = finalPhase + backup.Status.CompletionTimestamp = completionTimestamp + backup.Status.CSIVolumeSnapshotsCompleted = csiVolumeSnapshotsCompleted + + switch finalPhase { + case velerov1api.BackupPhaseCompleted: + r.metrics.RegisterBackupSuccess(backupScheduleName) + r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc) + case velerov1api.BackupPhasePartiallyFailed: + r.metrics.RegisterBackupPartialFailure(backupScheduleName) + r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure) + } + + recordBackupMetrics(log, backup, outBackupFile, r.metrics, true) + return ctrl.Result{}, nil } diff --git a/pkg/controller/backup_finalizer_controller_test.go b/pkg/controller/backup_finalizer_controller_test.go index ade3ce2cd7..32c0812a73 100644 --- a/pkg/controller/backup_finalizer_controller_test.go +++ b/pkg/controller/backup_finalizer_controller_test.go @@ -18,6 +18,7 @@ package controller import ( "bytes" + "fmt" "io" "testing" "time" @@ -39,8 +40,10 @@ import ( "github.com/vmware-tanzu/velero/pkg/itemoperation" "github.com/vmware-tanzu/velero/pkg/kuberesource" "github.com/vmware-tanzu/velero/pkg/metrics" + persistencemocks "github.com/vmware-tanzu/velero/pkg/persistence/mocks" "github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt" "github.com/vmware-tanzu/velero/pkg/plugin/framework" + pluginmocks "github.com/vmware-tanzu/velero/pkg/plugin/mocks" "github.com/vmware-tanzu/velero/pkg/plugin/velero" velerotest "github.com/vmware-tanzu/velero/pkg/test" ) @@ -242,3 +245,151 @@ func TestBackupFinalizerReconcile(t *testing.T) { }) } } + +func TestBackupFinalizerReconcile_PutBackupMetadataFail(t *testing.T) { + tests := []struct { + name string + initialPhase velerov1api.BackupPhase + }{ + { + name: "Finalizing backup stays Finalizing when PutBackupMetadata fails", + initialPhase: velerov1api.BackupPhaseFinalizing, + }, + { + name: "FinalizingPartiallyFailed backup stays FinalizingPartiallyFailed when PutBackupMetadata fails", + initialPhase: velerov1api.BackupPhaseFinalizingPartiallyFailed, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + fakeClock := testclocks.NewFakeClock(time.Now()) + defaultBackupLocation := builder.ForBackupStorageLocation(velerov1api.DefaultNamespace, "default").Result() + + backup := builder.ForBackup(velerov1api.DefaultNamespace, "backup-meta-fail"). + StorageLocation("default"). + ObjectMeta(builder.WithUID("foo")). + StartTimestamp(fakeClock.Now()). + Phase(test.initialPhase).Result() + + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + fakeGlobalClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + + // Use local mocks to avoid interference with other tests + localPluginManager := &pluginmocks.Manager{} + localBackupStore := &persistencemocks.BackupStore{} + + backupper := new(fakeBackupper) + reconciler := NewBackupFinalizerReconciler( + fakeClient, + fakeGlobalClient, + fakeClock, + backupper, + func(logrus.FieldLogger) clientmgmt.Manager { return localPluginManager }, + NewBackupTracker(), + NewFakeSingleObjectBackupStoreGetter(localBackupStore), + logrus.StandardLogger(), + metrics.NewServerMetrics(), + 10*time.Minute, + ) + + localPluginManager.On("CleanupClients").Return(nil) + localBackupStore.On("GetBackupItemOperations", backup.Name).Return(nil, nil) + // PutBackupMetadata fails + localBackupStore.On("PutBackupMetadata", mock.Anything, mock.Anything).Return(fmt.Errorf("object lock prevented upload")) + localBackupStore.On("GetBackupVolumeInfos", mock.Anything).Return(nil, nil) + localBackupStore.On("PutBackupVolumeInfos", mock.Anything, mock.Anything).Return(nil) + + _, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}}) + assert.Error(t, err, "reconcile should return error when PutBackupMetadata fails") + + backupAfter := velerov1api.Backup{} + err = fakeClient.Get(t.Context(), types.NamespacedName{ + Namespace: backup.Namespace, + Name: backup.Name, + }, &backupAfter) + require.NoError(t, err) + assert.Equal(t, test.initialPhase, backupAfter.Status.Phase, + "backup phase should remain %s when PutBackupMetadata fails", test.initialPhase) + assert.Nil(t, backupAfter.Status.CompletionTimestamp, + "CompletionTimestamp should not be set when PutBackupMetadata fails") + }) + } +} + +func TestBackupFinalizerReconcile_PutBackupContentsFail(t *testing.T) { + fakeClock := testclocks.NewFakeClock(time.Now()) + metav1Now := metav1.NewTime(fakeClock.Now()) + defaultBackupLocation := builder.ForBackupStorageLocation(velerov1api.DefaultNamespace, "default").Result() + + backup := builder.ForBackup(velerov1api.DefaultNamespace, "backup-contents-fail"). + StorageLocation("default"). + ObjectMeta(builder.WithUID("foo")). + StartTimestamp(fakeClock.Now()). + Phase(velerov1api.BackupPhaseFinalizing).Result() + + fakeClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + fakeGlobalClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation) + + localPluginManager := &pluginmocks.Manager{} + localBackupStore := &persistencemocks.BackupStore{} + + backupper := new(fakeBackupper) + reconciler := NewBackupFinalizerReconciler( + fakeClient, + fakeGlobalClient, + fakeClock, + backupper, + func(logrus.FieldLogger) clientmgmt.Manager { return localPluginManager }, + NewBackupTracker(), + NewFakeSingleObjectBackupStoreGetter(localBackupStore), + logrus.StandardLogger(), + metrics.NewServerMetrics(), + 10*time.Minute, + ) + + operations := []*itemoperation.BackupOperation{ + { + Spec: itemoperation.BackupOperationSpec{ + BackupName: "backup-contents-fail", + BackupUID: "foo", + BackupItemAction: "foo", + ResourceIdentifier: velero.ResourceIdentifier{ + GroupResource: kuberesource.Pods, + Namespace: "ns-1", + Name: "pod-1", + }, + OperationID: "operation-1", + }, + Status: itemoperation.OperationStatus{ + Phase: itemoperation.OperationPhaseCompleted, + Created: &metav1Now, + }, + }, + } + + localPluginManager.On("CleanupClients").Return(nil) + localPluginManager.On("GetBackupItemActionsV2").Return(nil, nil) + localBackupStore.On("GetBackupItemOperations", backup.Name).Return(operations, nil) + localBackupStore.On("GetBackupContents", mock.Anything).Return(io.NopCloser(bytes.NewReader([]byte("hello world"))), nil) + localBackupStore.On("PutBackupMetadata", mock.Anything, mock.Anything).Return(nil) + // PutBackupContents fails + localBackupStore.On("PutBackupContents", mock.Anything, mock.Anything).Return(fmt.Errorf("object lock prevented upload")) + localBackupStore.On("GetBackupVolumeInfos", mock.Anything).Return(nil, nil) + localBackupStore.On("PutBackupVolumeInfos", mock.Anything, mock.Anything).Return(nil) + backupper.On("FinalizeBackup", mock.Anything, mock.Anything, mock.Anything, mock.Anything, framework.BackupItemActionResolverV2{}, mock.Anything, mock.Anything).Return(nil) + + _, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}}) + assert.Error(t, err, "reconcile should return error when PutBackupContents fails") + + backupAfter := velerov1api.Backup{} + err = fakeClient.Get(t.Context(), types.NamespacedName{ + Namespace: backup.Namespace, + Name: backup.Name, + }, &backupAfter) + require.NoError(t, err) + assert.Equal(t, velerov1api.BackupPhaseFinalizing, backupAfter.Status.Phase, + "backup phase should remain Finalizing when PutBackupContents fails") + assert.Nil(t, backupAfter.Status.CompletionTimestamp, + "CompletionTimestamp should not be set when PutBackupContents fails") +} From cd866893386b92cdb33571081845a8e42b9f327a Mon Sep 17 00:00:00 2001 From: Tiger Kaovilai Date: Tue, 24 Mar 2026 18:20:29 -0400 Subject: [PATCH 2/2] Add changelog for #9646 Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy Signed-off-by: Tiger Kaovilai --- changelogs/unreleased/9646-kaovilai | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelogs/unreleased/9646-kaovilai diff --git a/changelogs/unreleased/9646-kaovilai b/changelogs/unreleased/9646-kaovilai new file mode 100644 index 0000000000..f7599e912c --- /dev/null +++ b/changelogs/unreleased/9646-kaovilai @@ -0,0 +1 @@ +Fix backup-finalizer: do not set backup phase to Completed before PutBackupMetadata succeeds