Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions pkg/controller/restore_finalizer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -559,8 +559,21 @@ func (ctx *finalizerContext) WaitRestoreExecHook() (errs results.Result) {
log := ctx.logger.WithField("restore", ctx.restore.Name)
log.Info("Waiting for restore exec hooks starts")

// wait for restore exec hooks to finish
err := wait.PollUntilContextCancel(context.Background(), 1*time.Second, true, func(context.Context) (bool, error) {
// Bound the wait by resourceTimeout (the same budget Velero already
// applies to other finalizer phases). Previously this poll had no
// deadline, so a hook that was registered via Add() but never
// recorded as executed (pod evicted, container never ready,
// goroutine panic, leaked tracker entry) left the restore stuck in
// Finalizing forever and — because the finalizer controller is a
// single-threaded controller-runtime reconciler — blocked every
// other restore on the cluster.
timeout := ctx.resourceTimeout
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we need to check for < 0; this should be sufficient:

// Bound the wait so we never poll forever on orphaned hook tracker state.
	waitCtx, waitCancel := context.WithTimeout(context.Background(), ctx.resourceTimeout)
	defer waitCancel()

	// wait for restore exec hooks to finish
	err := wait.PollUntilContextCancel(waitCtx, 1*time.Second, true, func(pollCtx context.Context) (bool, error) {

if timeout <= 0 {
timeout = 10 * time.Minute
}
pollCtx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
err := wait.PollUntilContextCancel(pollCtx, 1*time.Second, true, func(context.Context) (bool, error) {
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can also add a check here: if the restore CR is missing, skip it

log.Debug("Checking the progress of hooks execution")
if ctx.multiHookTracker.IsComplete(ctx.restore.Name) {
return true, nil
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion

// Check that the Restore CR still exists; abort if it has been deleted.
		restore := &velerov1api.Restore{}
		if err := ctx.crClient.Get(pollCtx, client.ObjectKey{
			Namespace: ctx.restore.Namespace,
			Name:      ctx.restore.Name,
		}, restore); err != nil {
			if apierrors.IsNotFound(err) {
				log.Warn("Restore CR no longer exists, aborting hook wait")
				return true, nil
			}
			log.WithError(err).Warn("Error checking restore CR existence during hook wait")
		}

Expand Down
26 changes: 26 additions & 0 deletions pkg/controller/restore_finalizer_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,13 @@ func TestWaitRestoreExecHook(t *testing.T) {
hookFailed, hookErr := true, fmt.Errorf("hook failed")
hookTracker3.Add(restoreName3, podNs, podName, container, source, hookName, hook.PhasePre, 0)

// hookTracker4: an entry that is added but never recorded. This
// reproduces the hang that motivated the resourceTimeout guard —
// without the timeout, WaitRestoreExecHook would block forever.
hookTracker4 := hook.NewMultiHookTracker()
restoreName4 := "restore4"
hookTracker4.Add(restoreName4, "ns", "pod", "con1", "s1", "h1", hook.PhasePre, 0)

tests := []struct {
name string
hookTracker *hook.MultiHookTracker
Expand All @@ -497,6 +504,8 @@ func TestWaitRestoreExecHook(t *testing.T) {
hookName string
hookFailed bool
hookErr error
resourceTimeout time.Duration
expectTimeoutErr bool
}{
{
name: "no restore exec hooks",
Expand Down Expand Up @@ -530,6 +539,16 @@ func TestWaitRestoreExecHook(t *testing.T) {
hookFailed: hookFailed,
hookErr: hookErr,
},
{
name: "hook never recorded should timeout instead of hanging",
hookTracker: hookTracker4,
restore: builder.ForRestore(velerov1api.DefaultNamespace, restoreName4).Result(),
expectedHooksAttempted: 0,
expectedHooksFailed: 0,
expectedHookErrs: 1,
resourceTimeout: 3 * time.Second,
expectTimeoutErr: true,
},
}

for _, tc := range tests {
Expand All @@ -542,6 +561,7 @@ func TestWaitRestoreExecHook(t *testing.T) {
crClient: fakeClient,
restore: tc.restore,
multiHookTracker: tc.hookTracker,
resourceTimeout: tc.resourceTimeout,
}
require.NoError(t, ctx.crClient.Create(t.Context(), tc.restore))

Expand All @@ -553,6 +573,12 @@ func TestWaitRestoreExecHook(t *testing.T) {
}

errs := ctx.WaitRestoreExecHook()
if tc.expectTimeoutErr {
// The poll should be bounded by resourceTimeout and surface
// a timeout error rather than hanging the reconciler.
assert.NotEmpty(t, errs.Namespaces, "expected timeout error but got none")
continue
}
assert.Len(t, errs.Namespaces, tc.expectedHookErrs)

updated := &velerov1api.Restore{}
Expand Down
Loading