Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/unreleased/9646-kaovilai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix backup-finalizer: do not set backup phase to Completed before PutBackupMetadata succeeds
46 changes: 35 additions & 11 deletions pkg/controller/backup_finalizer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,25 +202,31 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}
}
backupScheduleName := backupRequest.GetLabels()[velerov1api.ScheduleNameLabel]

// Determine the final phase and completion timestamp, but do NOT set them
// on the in-memory backup object yet. We first need to upload metadata and
// contents to object storage. If the upload fails, the deferred patch must
// NOT write a terminal phase to the API server so the controller can retry.
var finalPhase velerov1api.BackupPhase
switch backup.Status.Phase {
case velerov1api.BackupPhaseFinalizing:
backup.Status.Phase = velerov1api.BackupPhaseCompleted
r.metrics.RegisterBackupSuccess(backupScheduleName)
r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc)
finalPhase = velerov1api.BackupPhaseCompleted
case velerov1api.BackupPhaseFinalizingPartiallyFailed:
backup.Status.Phase = velerov1api.BackupPhasePartiallyFailed
r.metrics.RegisterBackupPartialFailure(backupScheduleName)
r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
finalPhase = velerov1api.BackupPhasePartiallyFailed
}

backup.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()}
backup.Status.CSIVolumeSnapshotsCompleted = updateCSIVolumeSnapshotsCompleted(operations)
completionTimestamp := &metav1.Time{Time: r.clock.Now()}
csiVolumeSnapshotsCompleted := updateCSIVolumeSnapshotsCompleted(operations)

recordBackupMetrics(log, backup, outBackupFile, r.metrics, true)
// Encode backup JSON with the final phase for object storage, so that the
// metadata in storage reflects the completed state.
backupForUpload := backup.DeepCopy()
backupForUpload.Status.Phase = finalPhase
backupForUpload.Status.CompletionTimestamp = completionTimestamp
backupForUpload.Status.CSIVolumeSnapshotsCompleted = csiVolumeSnapshotsCompleted
Comment on lines +206 to +226
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

finalPhase has no default assignment and is used unconditionally. If backup.Status.Phase is not one of the two handled values in this switch on this code path, finalPhase will be the zero value and could be uploaded/patched as an empty phase. A concrete fix is to add an explicit default/guard (e.g., return early) or initialize finalPhase from backup.Status.Phase and only override it for the handled Finalizing* phases.

Copilot uses AI. Check for mistakes.

// update backup metadata in object store
backupJSON := new(bytes.Buffer)
if err := encode.To(backup, "json", backupJSON); err != nil {
if err := encode.To(backupForUpload, "json", backupJSON); err != nil {
return ctrl.Result{}, errors.Wrap(err, "error encoding backup json")
}
err = backupStore.PutBackupMetadata(backup.Name, backupJSON)
Expand All @@ -233,6 +239,24 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents")
}
}

// Uploads succeeded — now safe to set the final phase on the in-memory
// backup object so the deferred patch writes it to the API server.
backup.Status.Phase = finalPhase
backup.Status.CompletionTimestamp = completionTimestamp
backup.Status.CSIVolumeSnapshotsCompleted = csiVolumeSnapshotsCompleted

switch finalPhase {
case velerov1api.BackupPhaseCompleted:
r.metrics.RegisterBackupSuccess(backupScheduleName)
r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusSucc)
case velerov1api.BackupPhasePartiallyFailed:
r.metrics.RegisterBackupPartialFailure(backupScheduleName)
r.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
}

recordBackupMetrics(log, backup, outBackupFile, r.metrics, true)

return ctrl.Result{}, nil
}

Expand Down
151 changes: 151 additions & 0 deletions pkg/controller/backup_finalizer_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import (
"bytes"
"fmt"
"io"
"testing"
"time"
Expand All @@ -39,8 +40,10 @@
"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/kuberesource"
"github.com/vmware-tanzu/velero/pkg/metrics"
persistencemocks "github.com/vmware-tanzu/velero/pkg/persistence/mocks"
"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
"github.com/vmware-tanzu/velero/pkg/plugin/framework"
pluginmocks "github.com/vmware-tanzu/velero/pkg/plugin/mocks"
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
)
Expand Down Expand Up @@ -242,3 +245,151 @@
})
}
}

func TestBackupFinalizerReconcile_PutBackupMetadataFail(t *testing.T) {
tests := []struct {
name string
initialPhase velerov1api.BackupPhase
}{
{
name: "Finalizing backup stays Finalizing when PutBackupMetadata fails",
initialPhase: velerov1api.BackupPhaseFinalizing,
},
{
name: "FinalizingPartiallyFailed backup stays FinalizingPartiallyFailed when PutBackupMetadata fails",
initialPhase: velerov1api.BackupPhaseFinalizingPartiallyFailed,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
fakeClock := testclocks.NewFakeClock(time.Now())
defaultBackupLocation := builder.ForBackupStorageLocation(velerov1api.DefaultNamespace, "default").Result()

backup := builder.ForBackup(velerov1api.DefaultNamespace, "backup-meta-fail").
StorageLocation("default").
ObjectMeta(builder.WithUID("foo")).
StartTimestamp(fakeClock.Now()).
Phase(test.initialPhase).Result()

fakeClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation)
fakeGlobalClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation)

// Use local mocks to avoid interference with other tests
localPluginManager := &pluginmocks.Manager{}
localBackupStore := &persistencemocks.BackupStore{}

backupper := new(fakeBackupper)
reconciler := NewBackupFinalizerReconciler(
fakeClient,
fakeGlobalClient,
fakeClock,
backupper,
func(logrus.FieldLogger) clientmgmt.Manager { return localPluginManager },
NewBackupTracker(),
NewFakeSingleObjectBackupStoreGetter(localBackupStore),
logrus.StandardLogger(),
metrics.NewServerMetrics(),
10*time.Minute,
)

localPluginManager.On("CleanupClients").Return(nil)
localBackupStore.On("GetBackupItemOperations", backup.Name).Return(nil, nil)
// PutBackupMetadata fails
localBackupStore.On("PutBackupMetadata", mock.Anything, mock.Anything).Return(fmt.Errorf("object lock prevented upload"))
localBackupStore.On("GetBackupVolumeInfos", mock.Anything).Return(nil, nil)
localBackupStore.On("PutBackupVolumeInfos", mock.Anything, mock.Anything).Return(nil)

_, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}})
assert.Error(t, err, "reconcile should return error when PutBackupMetadata fails")

Check failure on line 304 in pkg/controller/backup_finalizer_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

require-error: for error assertions use require (testifylint)

backupAfter := velerov1api.Backup{}
err = fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: backup.Namespace,
Name: backup.Name,
}, &backupAfter)
require.NoError(t, err)
assert.Equal(t, test.initialPhase, backupAfter.Status.Phase,
"backup phase should remain %s when PutBackupMetadata fails", test.initialPhase)
assert.Nil(t, backupAfter.Status.CompletionTimestamp,
"CompletionTimestamp should not be set when PutBackupMetadata fails")
})
}
}

func TestBackupFinalizerReconcile_PutBackupContentsFail(t *testing.T) {
fakeClock := testclocks.NewFakeClock(time.Now())
metav1Now := metav1.NewTime(fakeClock.Now())
defaultBackupLocation := builder.ForBackupStorageLocation(velerov1api.DefaultNamespace, "default").Result()

backup := builder.ForBackup(velerov1api.DefaultNamespace, "backup-contents-fail").
StorageLocation("default").
ObjectMeta(builder.WithUID("foo")).
StartTimestamp(fakeClock.Now()).
Phase(velerov1api.BackupPhaseFinalizing).Result()
Comment on lines +320 to +329
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new reconciler logic computes a finalPhase for both Finalizing and FinalizingPartiallyFailed, but the PutBackupContents failure test only covers Finalizing. Add a table-driven variant (similar to TestBackupFinalizerReconcile_PutBackupMetadataFail) that also exercises the FinalizingPartiallyFailed starting phase and asserts the phase/timestamp remain unchanged on PutBackupContents failure.

Copilot uses AI. Check for mistakes.

fakeClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation)
fakeGlobalClient := velerotest.NewFakeControllerRuntimeClient(t, backup, defaultBackupLocation)

localPluginManager := &pluginmocks.Manager{}
localBackupStore := &persistencemocks.BackupStore{}

backupper := new(fakeBackupper)
reconciler := NewBackupFinalizerReconciler(
fakeClient,
fakeGlobalClient,
fakeClock,
backupper,
func(logrus.FieldLogger) clientmgmt.Manager { return localPluginManager },
NewBackupTracker(),
NewFakeSingleObjectBackupStoreGetter(localBackupStore),
logrus.StandardLogger(),
metrics.NewServerMetrics(),
10*time.Minute,
)

operations := []*itemoperation.BackupOperation{
{
Spec: itemoperation.BackupOperationSpec{
BackupName: "backup-contents-fail",
BackupUID: "foo",
BackupItemAction: "foo",
ResourceIdentifier: velero.ResourceIdentifier{
GroupResource: kuberesource.Pods,
Namespace: "ns-1",
Name: "pod-1",
},
OperationID: "operation-1",
},
Status: itemoperation.OperationStatus{
Phase: itemoperation.OperationPhaseCompleted,
Created: &metav1Now,
},
},
}

localPluginManager.On("CleanupClients").Return(nil)
localPluginManager.On("GetBackupItemActionsV2").Return(nil, nil)
localBackupStore.On("GetBackupItemOperations", backup.Name).Return(operations, nil)
localBackupStore.On("GetBackupContents", mock.Anything).Return(io.NopCloser(bytes.NewReader([]byte("hello world"))), nil)
localBackupStore.On("PutBackupMetadata", mock.Anything, mock.Anything).Return(nil)
// PutBackupContents fails
localBackupStore.On("PutBackupContents", mock.Anything, mock.Anything).Return(fmt.Errorf("object lock prevented upload"))
localBackupStore.On("GetBackupVolumeInfos", mock.Anything).Return(nil, nil)
localBackupStore.On("PutBackupVolumeInfos", mock.Anything, mock.Anything).Return(nil)
backupper.On("FinalizeBackup", mock.Anything, mock.Anything, mock.Anything, mock.Anything, framework.BackupItemActionResolverV2{}, mock.Anything, mock.Anything).Return(nil)
Copy link

Copilot AI Mar 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This mock expectation hard-codes framework.BackupItemActionResolverV2{} as an exact argument match, which is brittle if the reconciler passes a non-zero resolver value. Prefer matching that parameter with mock.Anything or mock.MatchedBy(...) so the test only asserts what it actually cares about (that FinalizeBackup is called and returns nil).

Suggested change
backupper.On("FinalizeBackup", mock.Anything, mock.Anything, mock.Anything, mock.Anything, framework.BackupItemActionResolverV2{}, mock.Anything, mock.Anything).Return(nil)
backupper.On("FinalizeBackup", mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil)

Copilot uses AI. Check for mistakes.

_, err := reconciler.Reconcile(t.Context(), ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}})
assert.Error(t, err, "reconcile should return error when PutBackupContents fails")

Check failure on line 383 in pkg/controller/backup_finalizer_controller_test.go

View workflow job for this annotation

GitHub Actions / Run Linter Check

require-error: for error assertions use require (testifylint)

backupAfter := velerov1api.Backup{}
err = fakeClient.Get(t.Context(), types.NamespacedName{
Namespace: backup.Namespace,
Name: backup.Name,
}, &backupAfter)
require.NoError(t, err)
assert.Equal(t, velerov1api.BackupPhaseFinalizing, backupAfter.Status.Phase,
"backup phase should remain Finalizing when PutBackupContents fails")
assert.Nil(t, backupAfter.Status.CompletionTimestamp,
"CompletionTimestamp should not be set when PutBackupContents fails")
}
Loading