Skip to content

Commit 5ad4e60

Browse files
[release-1.18] Fix VolumeGroupSnapshot restore failure with Ceph RBD CSI driver (#9687)
* Fix VolumeGroupSnapshot restore failure with Ceph RBD CSI driver (#9516) * Fix VolumeGroupSnapshot restore on Ceph RBD This PR fixes two related issues affecting CSI snapshot restore on Ceph RBD: 1. VolumeGroupSnapshot restore fails because Ceph RBD populates volumeGroupSnapshotHandle on pre-provisioned VSCs, but Velero doesn't create the required VGSC during restore. 2. CSI snapshot restore fails because VolumeSnapshotClassName is removed from restored VSCs, preventing the CSI controller from getting credentials for snapshot verification. Changes: - Capture volumeGroupSnapshotHandle during backup as VS annotation - Create stub VGSC during restore with matching handle in status - Look up VolumeSnapshotClass by driver and set on restored VSC Fixes #9512 Fixes #9515 Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Add changelog for VGS restore fix Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Fix gofmt import order Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Add changelog for VGS restore fix Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Fix import alias corev1 to corev1api per lint config Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Fix: Add snapshot handles to existing stub VGSC and add unit tests When multiple VolumeSnapshots from the same VolumeGroupSnapshot are restored, they share the same VolumeGroupSnapshotHandle but have different individual snapshot handles. This commit: 1. Fixes incomplete logic where existing VGSC wasn't updated with new snapshot handles (addresses review feedback) 2. Fixes race condition where Create returning AlreadyExists would skip adding the snapshot handle 3. Adds comprehensive unit tests for ensureStubVGSCExists (5 cases) and addSnapshotHandleToVGSC (4 cases) functions Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Clean up stub VolumeGroupSnapshotContents during restore finalization Add cleanup logic for stub VGSCs created during VolumeGroupSnapshot restore. The stub VGSCs are temporary objects needed to satisfy CSI controller validation during VSC reconciliation. Once all related VSCs become ReadyToUse, the stub VGSCs are no longer needed and should be removed. The cleanup runs in the restore finalizer controller's execute() phase. Before deleting each VGSC, it polls until all related VolumeSnapshotContents (correlated by snapshot handle) are ReadyToUse, with a timeout fallback. Deletion failures and CRD-not-installed scenarios are treated as warnings rather than errors to avoid failing the restore. Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Fix lint: remove unused nolint directive and simplify cleanupStubVGSC return The cleanupStubVGSC function only produces warnings (not errors), so simplify its return signature. Also remove the now-unused nolint:unparam directive on execute() since warnings are no longer always nil. Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> --------- Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> * Rename changelog file to match cherry-pick PR number Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com> --------- Signed-off-by: Shubham Pampattiwar <spampatt@redhat.com>
1 parent f854a06 commit 5ad4e60

File tree

10 files changed

+805
-13
lines changed

10 files changed

+805
-13
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix VolumeGroupSnapshot restore failure with Ceph RBD CSI driver by creating stub VolumeGroupSnapshotContent during restore and looking up VolumeSnapshotClass by driver for credential support

internal/volume/volumes_information.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ type CSISnapshotInfo struct {
146146

147147
// The VolumeSnapshot's Status.ReadyToUse value
148148
ReadyToUse *bool
149+
150+
// The VolumeGroupSnapshotHandle from VSC status, used to create stub VGSC during restore
151+
// for CSI drivers that populate this field (e.g., Ceph RBD).
152+
VolumeGroupSnapshotHandle string `json:"volumeGroupSnapshotHandle,omitempty"`
149153
}
150154

151155
// SnapshotDataMovementInfo is used for displaying the snapshot data mover status.
@@ -456,6 +460,10 @@ func (v *BackupVolumesInformation) generateVolumeInfoForCSIVolumeSnapshot() {
456460
if volumeSnapshotContent.Status.SnapshotHandle != nil {
457461
snapshotHandle = *volumeSnapshotContent.Status.SnapshotHandle
458462
}
463+
volumeGroupSnapshotHandle := ""
464+
if volumeSnapshotContent.Status != nil && volumeSnapshotContent.Status.VolumeGroupSnapshotHandle != nil {
465+
volumeGroupSnapshotHandle = *volumeSnapshotContent.Status.VolumeGroupSnapshotHandle
466+
}
459467
if pvcPVInfo := v.pvMap.retrieve("", *volumeSnapshot.Spec.Source.PersistentVolumeClaimName, volumeSnapshot.Namespace); pvcPVInfo != nil {
460468
volumeInfo := &BackupVolumeInfo{
461469
BackupMethod: CSISnapshot,
@@ -466,12 +474,13 @@ func (v *BackupVolumesInformation) generateVolumeInfoForCSIVolumeSnapshot() {
466474
SnapshotDataMoved: false,
467475
PreserveLocalSnapshot: true,
468476
CSISnapshotInfo: &CSISnapshotInfo{
469-
VSCName: *volumeSnapshot.Status.BoundVolumeSnapshotContentName,
470-
Size: size,
471-
Driver: volumeSnapshotContent.Spec.Driver,
472-
SnapshotHandle: snapshotHandle,
473-
OperationID: operation.Spec.OperationID,
474-
ReadyToUse: volumeSnapshot.Status.ReadyToUse,
477+
VSCName: *volumeSnapshot.Status.BoundVolumeSnapshotContentName,
478+
Size: size,
479+
Driver: volumeSnapshotContent.Spec.Driver,
480+
SnapshotHandle: snapshotHandle,
481+
OperationID: operation.Spec.OperationID,
482+
ReadyToUse: volumeSnapshot.Status.ReadyToUse,
483+
VolumeGroupSnapshotHandle: volumeGroupSnapshotHandle,
475484
},
476485
PVInfo: &PVInfo{
477486
ReclaimPolicy: string(pvcPVInfo.PV.Spec.PersistentVolumeReclaimPolicy),

pkg/apis/velero/v1/labels_annotations.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ const (
141141
VolumeSnapshotRestoreSize = "velero.io/csi-volumesnapshot-restore-size"
142142
DriverNameAnnotation = "velero.io/csi-driver-name"
143143
VSCDeletionPolicyAnnotation = "velero.io/csi-vsc-deletion-policy"
144+
VolumeGroupSnapshotHandleAnnotation = "velero.io/csi-volumegroupsnapshot-handle"
144145
VolumeSnapshotClassSelectorLabel = "velero.io/csi-volumesnapshot-class"
145146
VolumeSnapshotClassDriverBackupAnnotationPrefix = "velero.io/csi-volumesnapshot-class"
146147
VolumeSnapshotClassDriverPVCAnnotation = "velero.io/csi-volumesnapshot-class"

pkg/backup/actions/csi/volumesnapshot_action.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,12 @@ func (p *volumeSnapshotBackupItemAction) Execute(
151151
annotations[velerov1api.VolumeSnapshotRestoreSize] = resource.NewQuantity(
152152
*vsc.Status.RestoreSize, resource.BinarySI).String()
153153
}
154+
155+
// Capture VolumeGroupSnapshotHandle to create stub VGSC during restore
156+
// for CSI drivers that populate this field (e.g., Ceph RBD).
157+
if vsc.Status.VolumeGroupSnapshotHandle != nil {
158+
annotations[velerov1api.VolumeGroupSnapshotHandleAnnotation] = *vsc.Status.VolumeGroupSnapshotHandle
159+
}
154160
}
155161

156162
p.log.Infof("Patching VolumeSnapshotContent %s with velero BackupNameLabel",

pkg/controller/restore_finalizer_controller.go

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ import (
2222
"sync"
2323
"time"
2424

25+
volumegroupsnapshotv1beta1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumegroupsnapshot/v1beta1"
26+
snapshotv1api "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
2527
"github.com/pkg/errors"
2628
"github.com/sirupsen/logrus"
2729
corev1api "k8s.io/api/core/v1"
@@ -43,6 +45,7 @@ import (
4345
"github.com/vmware-tanzu/velero/pkg/persistence"
4446
"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
4547
"github.com/vmware-tanzu/velero/pkg/plugin/velero"
48+
"github.com/vmware-tanzu/velero/pkg/util/boolptr"
4649
kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube"
4750
"github.com/vmware-tanzu/velero/pkg/util/results"
4851
)
@@ -291,13 +294,16 @@ type finalizerContext struct {
291294
resourceTimeout time.Duration
292295
}
293296

294-
func (ctx *finalizerContext) execute() (results.Result, results.Result) { //nolint:unparam //temporarily ignore the lint report: result 0 is always nil (unparam)
297+
func (ctx *finalizerContext) execute() (results.Result, results.Result) {
295298
warnings, errs := results.Result{}, results.Result{}
296299

297300
// implement finalization tasks
298301
pdpErrs := ctx.patchDynamicPVWithVolumeInfo()
299302
errs.Merge(&pdpErrs)
300303

304+
vgscWarnings := ctx.cleanupStubVGSC()
305+
warnings.Merge(&vgscWarnings)
306+
301307
rehErrs := ctx.WaitRestoreExecHook()
302308
errs.Merge(&rehErrs)
303309

@@ -443,6 +449,93 @@ func (ctx *finalizerContext) patchDynamicPVWithVolumeInfo() (errs results.Result
443449
return errs
444450
}
445451

452+
// cleanupStubVGSC deletes stub VolumeGroupSnapshotContent objects that were
453+
// created during restore to satisfy CSI controller validation. These stubs are
454+
// labeled with velero.io/restore-name for identification.
455+
// Before deleting each VGSC, it waits for all related VolumeSnapshotContents
456+
// to become ReadyToUse, since the CSI controller needs the VGSC during VSC reconciliation.
457+
func (ctx *finalizerContext) cleanupStubVGSC() (warnings results.Result) {
458+
ctx.logger.Info("cleaning up stub VolumeGroupSnapshotContents")
459+
460+
vgscList := &volumegroupsnapshotv1beta1.VolumeGroupSnapshotContentList{}
461+
err := ctx.crClient.List(
462+
context.Background(),
463+
vgscList,
464+
client.MatchingLabels{velerov1api.RestoreNameLabel: ctx.restore.Name},
465+
)
466+
if err != nil {
467+
// If the CRD is not installed, listing will fail. This is expected
468+
// on clusters without VolumeGroupSnapshot support, so treat as warning.
469+
ctx.logger.WithError(err).Warn("failed to list stub VolumeGroupSnapshotContents, skipping cleanup")
470+
warnings.Add("cluster", errors.Wrap(err, "failed to list stub VolumeGroupSnapshotContents"))
471+
return warnings
472+
}
473+
474+
if len(vgscList.Items) == 0 {
475+
ctx.logger.Info("no stub VolumeGroupSnapshotContents to clean up")
476+
return warnings
477+
}
478+
479+
for i := range vgscList.Items {
480+
vgsc := &vgscList.Items[i]
481+
log := ctx.logger.WithField("vgsc", vgsc.Name)
482+
483+
// Collect the snapshot handles associated with this VGSC
484+
snapshotHandles := map[string]bool{}
485+
if vgsc.Spec.Source.GroupSnapshotHandles != nil {
486+
for _, h := range vgsc.Spec.Source.GroupSnapshotHandles.VolumeSnapshotHandles {
487+
snapshotHandles[h] = true
488+
}
489+
}
490+
491+
if len(snapshotHandles) > 0 {
492+
// Wait for related VSCs to become ReadyToUse before deleting the VGSC
493+
log.Infof("waiting for %d related VolumeSnapshotContents to become ReadyToUse", len(snapshotHandles))
494+
err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, ctx.resourceTimeout, true, func(context.Context) (bool, error) {
495+
vscList := &snapshotv1api.VolumeSnapshotContentList{}
496+
if err := ctx.crClient.List(context.Background(), vscList, client.MatchingLabels{velerov1api.RestoreNameLabel: ctx.restore.Name}); err != nil {
497+
log.WithError(err).Warn("failed to list VolumeSnapshotContents")
498+
return false, nil
499+
}
500+
501+
for j := range vscList.Items {
502+
vsc := &vscList.Items[j]
503+
if vsc.Spec.Source.SnapshotHandle == nil {
504+
continue
505+
}
506+
if !snapshotHandles[*vsc.Spec.Source.SnapshotHandle] {
507+
continue
508+
}
509+
// This VSC is related to our VGSC
510+
if vsc.Status == nil || !boolptr.IsSetToTrue(vsc.Status.ReadyToUse) {
511+
log.Debugf("VolumeSnapshotContent %s not yet ReadyToUse", vsc.Name)
512+
return false, nil
513+
}
514+
}
515+
return true, nil
516+
})
517+
if err != nil {
518+
log.WithError(err).Warn("timed out waiting for related VolumeSnapshotContents to become ReadyToUse, proceeding with VGSC deletion")
519+
warnings.Add("cluster", errors.Wrapf(err, "timed out waiting for VSCs related to VGSC %s", vgsc.Name))
520+
}
521+
}
522+
523+
log.Info("deleting stub VolumeGroupSnapshotContent")
524+
if err := ctx.crClient.Delete(context.Background(), vgsc); err != nil {
525+
if apierrors.IsNotFound(err) {
526+
log.Info("stub VolumeGroupSnapshotContent already deleted")
527+
continue
528+
}
529+
log.WithError(err).Warn("failed to delete stub VolumeGroupSnapshotContent")
530+
warnings.Add("cluster", errors.Wrapf(err, "failed to delete stub VolumeGroupSnapshotContent %s", vgsc.Name))
531+
} else {
532+
log.Info("deleted stub VolumeGroupSnapshotContent")
533+
}
534+
}
535+
536+
return warnings
537+
}
538+
446539
func needPatch(newPV *corev1api.PersistentVolume, pvInfo *volume.PVInfo) bool {
447540
if newPV.Spec.PersistentVolumeReclaimPolicy != corev1api.PersistentVolumeReclaimPolicy(pvInfo.ReclaimPolicy) {
448541
return true

0 commit comments

Comments
 (0)