Skip to content

Commit 8a6ac7a

Browse files
authored
fix: backup deletion silently succeeds when tarball download fails (#9693)
* Enhance backup deletion logic to handle tarball download failures and clean up associated CSI VolumeSnapshotContents Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * added changelog Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * Refactor error handling in backup deletion Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * Refactor backup deletion logic to skip CSI snapshot cleanup on tarball download failure Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * prevent backup deletion when errors occur Signed-off-by: Priyansh Choudhary <im1706@gmail.com> * added logger Signed-off-by: Priyansh Choudhary <im1706@gmail.com>
1 parent 15db9d2 commit 8a6ac7a

File tree

3 files changed

+127
-21
lines changed

3 files changed

+127
-21
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Enhance backup deletion logic to handle tarball download failures

pkg/controller/backup_deletion_controller.go

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"encoding/json"
2222
"fmt"
23+
"strings"
2324
"time"
2425

2526
jsonpatch "github.com/evanphx/json-patch/v5"
@@ -267,8 +268,17 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
267268

268269
if err != nil {
269270
log.WithError(err).Errorf("Unable to download tarball for backup %s, skipping associated DeleteItemAction plugins", backup.Name)
271+
// for backups which failed before tarball object could be uploaded we do offline cleanup
270272
log.Info("Cleaning up CSI volumesnapshots")
271273
r.deleteCSIVolumeSnapshotsIfAny(ctx, backup, log)
274+
275+
// If the tarball simply does not exist (HTTP 404 / not found), the download
276+
// failure is permanent and not retryable, so we let deletion proceed.
277+
// For transient errors (throttling, auth failures, network issues), record
278+
// the error to fail the deletion so it can be retried later.
279+
if !isTarballNotFoundError(err) {
280+
errs = append(errs, errors.Wrapf(err, "error downloading backup tarball, CSI snapshot cleanup was skipped").Error())
281+
}
272282
} else {
273283
defer closeAndRemoveFile(backupFile, r.logger)
274284
deleteCtx := &delete.Context{
@@ -351,11 +361,13 @@ func (r *backupDeletionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
351361
}
352362
}
353363

354-
if backupStore != nil {
364+
if backupStore != nil && len(errs) == 0 {
355365
log.Info("Removing backup from backup storage")
356366
if err := backupStore.DeleteBackup(backup.Name); err != nil {
357367
errs = append(errs, err.Error())
358368
}
369+
} else if len(errs) > 0 {
370+
log.Info("Skipping removal of backup from backup storage due to previous errors")
359371
}
360372

361373
log.Info("Removing restores")
@@ -691,3 +703,28 @@ func batchDeleteSnapshots(ctx context.Context, repoEnsurer *repository.Ensurer,
691703

692704
return errs
693705
}
706+
707+
// isTarballNotFoundError reports whether err indicates that the backup tarball
708+
// does not exist in object storage (e.g. HTTP 404 / not-found). Such errors are
709+
// permanent and not retryable, so callers should let deletion proceed (skipping
710+
// DeleteItemAction plugins) rather than failing the entire deletion.
711+
//
712+
// Transient errors (throttling, auth failures, network timeouts) return false so
713+
// the deletion is failed and can be retried once the storage is reachable again.
714+
func isTarballNotFoundError(err error) bool {
715+
if err == nil {
716+
return false
717+
}
718+
// Lower-case once for all comparisons.
719+
msg := strings.ToLower(err.Error())
720+
// Common "not found" indicators across cloud providers:
721+
// - "not found" / "does not exist": generic, in-memory object store
722+
// - "nosuchkey": AWS S3
723+
// - "blobnotfound": Azure Blob Storage
724+
// - "objectnotexist": GCS
725+
return strings.Contains(msg, "not found") ||
726+
strings.Contains(msg, "does not exist") ||
727+
strings.Contains(msg, "nosuchkey") ||
728+
strings.Contains(msg, "blobnotfound") ||
729+
strings.Contains(msg, "objectnotexist")
730+
}

pkg/controller/backup_deletion_controller_test.go

Lines changed: 88 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ import (
2525
"reflect"
2626
"time"
2727

28-
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
29-
3028
"context"
3129

3230
"github.com/sirupsen/logrus"
@@ -606,7 +604,7 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
606604
// Make sure snapshot was deleted
607605
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
608606
})
609-
t.Run("backup is still deleted if downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
607+
t.Run("backup deletion fails with error when downloading tarball fails for DeleteItemAction plugins", func(t *testing.T) {
610608
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
611609
backup.UID = "uid"
612610
backup.Spec.StorageLocation = "primary"
@@ -672,38 +670,108 @@ func TestBackupDeletionControllerReconcile(t *testing.T) {
672670

673671
td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
674672
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("error downloading tarball"))
675-
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)
676673

677674
_, err := td.controller.Reconcile(t.Context(), td.req)
678675
require.NoError(t, err)
679676

680677
td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
681-
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
678+
// DeleteBackup (removing backup data from object storage) must NOT be called
679+
// when there are errors, so that the deletion can be retried later.
680+
td.backupStore.AssertNotCalled(t, "DeleteBackup", input.Spec.BackupName)
682681

683-
// the dbr should be deleted
682+
// the dbr should still exist and be marked Processed with errors
684683
res := &velerov1api.DeleteBackupRequest{}
685684
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
686-
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
687-
if err == nil {
688-
t.Logf("status of the dbr: %s, errors in dbr: %v", res.Status.Phase, res.Status.Errors)
689-
}
685+
require.NoError(t, err, "Expected DBR to still exist after tarball download failure")
686+
assert.Equal(t, velerov1api.DeleteBackupRequestPhaseProcessed, res.Status.Phase)
687+
require.Len(t, res.Status.Errors, 1)
688+
assert.Contains(t, res.Status.Errors[0], "error downloading backup tarball, CSI snapshot cleanup was skipped")
690689

691-
// backup CR should be deleted
690+
// backup CR should NOT be deleted
692691
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
693692
Namespace: velerov1api.DefaultNamespace,
694693
Name: backup.Name,
695694
}, &velerov1api.Backup{})
696-
assert.True(t, apierrors.IsNotFound(err), "Expected not found error, but actual value of error: %v", err)
695+
require.NoError(t, err, "Expected backup CR to still exist after tarball download failure")
696+
})
697+
t.Run("backup is still deleted if downloading tarball returns a not-found error", func(t *testing.T) {
698+
backup := builder.ForBackup(velerov1api.DefaultNamespace, "foo").Result()
699+
backup.UID = "uid"
700+
backup.Spec.StorageLocation = "primary"
697701

698-
// leaked CSI snapshot should be deleted
699-
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
700-
Namespace: "user-ns",
701-
Name: "vs-1",
702-
}, &snapshotv1api.VolumeSnapshot{})
703-
assert.True(t, apierrors.IsNotFound(err), "Expected not found error for the leaked CSI snapshot, but actual value of error: %v", err)
702+
input := defaultTestDbr()
703+
input.Labels = nil
704704

705-
// Make sure snapshot was deleted
706-
assert.Equal(t, 0, td.volumeSnapshotter.SnapshotsTaken.Len())
705+
location := &velerov1api.BackupStorageLocation{
706+
ObjectMeta: metav1.ObjectMeta{
707+
Namespace: backup.Namespace,
708+
Name: backup.Spec.StorageLocation,
709+
},
710+
Spec: velerov1api.BackupStorageLocationSpec{
711+
Provider: "objStoreProvider",
712+
StorageType: velerov1api.StorageType{
713+
ObjectStorage: &velerov1api.ObjectStorageLocation{
714+
Bucket: "bucket",
715+
},
716+
},
717+
},
718+
Status: velerov1api.BackupStorageLocationStatus{
719+
Phase: velerov1api.BackupStorageLocationPhaseAvailable,
720+
},
721+
}
722+
723+
snapshotLocation := &velerov1api.VolumeSnapshotLocation{
724+
ObjectMeta: metav1.ObjectMeta{
725+
Namespace: backup.Namespace,
726+
Name: "vsl-1",
727+
},
728+
Spec: velerov1api.VolumeSnapshotLocationSpec{
729+
Provider: "provider-1",
730+
},
731+
}
732+
733+
td := setupBackupDeletionControllerTest(t, defaultTestDbr(), backup, location, snapshotLocation)
734+
td.volumeSnapshotter.SnapshotsTaken.Insert("snap-1")
735+
736+
snapshots := []*volume.Snapshot{
737+
{
738+
Spec: volume.SnapshotSpec{
739+
Location: "vsl-1",
740+
},
741+
Status: volume.SnapshotStatus{
742+
ProviderSnapshotID: "snap-1",
743+
},
744+
},
745+
}
746+
747+
pluginManager := &pluginmocks.Manager{}
748+
pluginManager.On("GetVolumeSnapshotter", "provider-1").Return(td.volumeSnapshotter, nil)
749+
pluginManager.On("GetDeleteItemActions").Return([]velero.DeleteItemAction{new(mocks.DeleteItemAction)}, nil)
750+
pluginManager.On("CleanupClients")
751+
td.controller.newPluginManager = func(logrus.FieldLogger) clientmgmt.Manager { return pluginManager }
752+
753+
td.backupStore.On("GetBackupVolumeSnapshots", input.Spec.BackupName).Return(snapshots, nil)
754+
// Simulate a 404/not-found error (tarball has already been removed from storage)
755+
td.backupStore.On("GetBackupContents", input.Spec.BackupName).Return(nil, fmt.Errorf("key not found"))
756+
td.backupStore.On("DeleteBackup", input.Spec.BackupName).Return(nil)
757+
758+
_, err := td.controller.Reconcile(t.Context(), td.req)
759+
require.NoError(t, err)
760+
761+
td.backupStore.AssertCalled(t, "GetBackupContents", input.Spec.BackupName)
762+
td.backupStore.AssertCalled(t, "DeleteBackup", input.Spec.BackupName)
763+
764+
// the dbr should be deleted (not-found is treated as permanent, deletion proceeds)
765+
res := &velerov1api.DeleteBackupRequest{}
766+
err = td.fakeClient.Get(ctx, td.req.NamespacedName, res)
767+
assert.True(t, apierrors.IsNotFound(err), "Expected DBR to be deleted after not-found tarball error, but actual error: %v", err)
768+
769+
// backup CR should be deleted because there are no errors in errs
770+
err = td.fakeClient.Get(t.Context(), types.NamespacedName{
771+
Namespace: velerov1api.DefaultNamespace,
772+
Name: backup.Name,
773+
}, &velerov1api.Backup{})
774+
assert.True(t, apierrors.IsNotFound(err), "Expected backup CR to be deleted after not-found tarball error, but actual error: %v", err)
707775
})
708776
t.Run("Expired request will be deleted if the status is processed", func(t *testing.T) {
709777
expired := time.Date(2018, 4, 3, 12, 0, 0, 0, time.UTC)

0 commit comments

Comments
 (0)