
Commit 856e7d2

utam0k authored and macsko committed
scheduler: Stop clearing NominatedNodeName on all cases
Signed-off-by: utam0k <[email protected]>
1 parent 461ba83 commit 856e7d2


4 files changed: +170 -25 lines changed

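In short: the scheduling-failure paths used to hand the FailureHandler a package-level clearNominatedNode value that always wiped .status.nominatedNodeName; they now call a per-Scheduler helper that returns nil (leave the field alone) when the NominatedNodeNameForExpectation feature gate is enabled. A minimal standalone sketch of that decision, using local stand-in types rather than the real framework package:

// Sketch only: illustrates the behavior introduced by this commit with
// stand-in types; this is not the scheduler's real API.
package main

import "fmt"

type NominatingInfo struct {
	NominatingMode    string
	NominatedNodeName string
}

// failureNominatingInfo mirrors newFailureNominatingInfo below: when the
// NominatedNodeNameForExpectation gate is on, return nil so an existing
// .status.nominatedNodeName is kept; otherwise return an override that clears it.
func failureNominatingInfo(gateEnabled bool) *NominatingInfo {
	if gateEnabled {
		return nil
	}
	return &NominatingInfo{NominatingMode: "Override", NominatedNodeName: ""}
}

func main() {
	fmt.Println(failureNominatingInfo(true))  // <nil>: keep the nominated node
	fmt.Println(failureNominatingInfo(false)) // &{Override }: clear it
}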

pkg/scheduler/schedule_one.go

Lines changed: 17 additions & 9 deletions

@@ -136,7 +136,15 @@ func (sched *Scheduler) ScheduleOne(ctx context.Context) {
 	}()
 }
 
-var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}
+// newFailureNominatingInfo returns the appropriate NominatingInfo for scheduling failures.
+// When NominatedNodeNameForExpectation feature is enabled, it returns nil (no clearing).
+// Otherwise, it returns NominatingInfo to clear the pod's nominated node.
+func (sched *Scheduler) newFailureNominatingInfo() *framework.NominatingInfo {
+	if sched.nominatedNodeNameForExpectationEnabled {
+		return nil
+	}
+	return &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}
+}
 
 // schedulingCycle tries to schedule a single Pod.
 func (sched *Scheduler) schedulingCycle(
@@ -156,13 +164,13 @@ func (sched *Scheduler) schedulingCycle(
 		}()
 		if err == ErrNoNodesAvailable {
 			status := fwk.NewStatus(fwk.UnschedulableAndUnresolvable).WithError(err)
-			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
+			return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, status
 		}
 
 		fitError, ok := err.(*framework.FitError)
 		if !ok {
 			logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
-			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.AsStatus(err)
+			return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, fwk.AsStatus(err)
 		}
 
 		// SchedulePod() may have failed because the pod would not fit on any host, so we try to
@@ -205,7 +213,7 @@ func (sched *Scheduler) schedulingCycle(
 		// This relies on the fact that Error will check if the pod has been bound
 		// to a node and if so will not add it back to the unscheduled pods queue
 		// (otherwise this would cause an infinite loop).
-		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.AsStatus(err)
+		return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.AsStatus(err)
 	}
 
 	// Run the Reserve method of reserve plugins.
@@ -226,9 +234,9 @@ func (sched *Scheduler) schedulingCycle(
 			}
 			fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, sts)
 			fitErr.Diagnosis.AddPluginStatus(sts)
-			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr)
+			return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr)
 		}
-		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
+		return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, sts
 	}
 
 	// Run "permit" plugins.
@@ -250,10 +258,10 @@ func (sched *Scheduler) schedulingCycle(
 			}
 			fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, runPermitStatus)
 			fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
-			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr)
+			return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr)
		}
 
-		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
+		return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, runPermitStatus
 	}
 
 	// At the end of a successful scheduling cycle, pop and move up Pods if needed.
@@ -385,7 +393,7 @@ func (sched *Scheduler) handleBindingCycleError(
 		}
 	}
 
-	sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
+	sched.FailureHandler(ctx, fwk, podInfo, status, sched.newFailureNominatingInfo(), start)
 }
 
 func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {

pkg/scheduler/schedule_one_test.go

Lines changed: 87 additions & 2 deletions

@@ -916,8 +916,31 @@ func TestSchedulerScheduleOne(t *testing.T) {
 			mockScheduleResult: emptyScheduleResult,
 			eventReason: "FailedScheduling",
 		},
+		{
+			name: "pod with existing nominated node name on scheduling error",
+			sendPod: func() *v1.Pod {
+				p := podWithID("foo", "")
+				p.Status.NominatedNodeName = "existing-node"
+				return p
+			}(),
+			injectSchedulingError: schedulingErr,
+			mockScheduleResult: scheduleResultOk,
+			expectError: schedulingErr,
+			expectErrorPod: func() *v1.Pod {
+				p := podWithID("foo", "")
+				p.Status.NominatedNodeName = "existing-node"
+				return p
+			}(),
+			expectPodInBackoffQ: func() *v1.Pod {
+				p := podWithID("foo", "")
+				p.Status.NominatedNodeName = "existing-node"
+				return p
+			}(),
+			eventReason: "FailedScheduling",
+		},
 	}
 
+	// Test with QueueingHints and NominatedNodeNameForExpectation feature gates
 	for _, qHintEnabled := range []bool{true, false} {
 		for _, item := range table {
 			asyncAPICallsEnabled := []bool{true, false}
@@ -930,8 +953,8 @@ func TestSchedulerScheduleOne(t *testing.T) {
 				nominatedNodeNameForExpectationEnabled = []bool{*item.nominatedNodeNameForExpectationEnabled}
 			}
 			for _, nominatedNodeNameForExpectationEnabled := range nominatedNodeNameForExpectationEnabled {
-				if nominatedNodeNameForExpectationEnabled && !qHintEnabled {
-					// If the QHint feature gate is disabled, NominatedNodeNameForExpectation cannot be enabled
+				if (asyncAPICallsEnabled || nominatedNodeNameForExpectationEnabled) && !qHintEnabled {
+					// If the QHint feature gate is disabled, NominatedNodeNameForExpectation and SchedulerAsyncAPICalls cannot be enabled
 					// because that means users set the emulation version to 1.33 or later.
 					continue
 				}
@@ -946,6 +969,8 @@ func TestSchedulerScheduleOne(t *testing.T) {
 				var gotForgetPod *v1.Pod
 				var gotAssumedPod *v1.Pod
 				var gotBinding *v1.Binding
+				var gotNominatingInfo *framework.NominatingInfo
+
 				client := clientsetfake.NewClientset(item.sendPod)
 				informerFactory := informers.NewSharedInformerFactory(client, 0)
 				client.PrependReactor("create", "pods", func(action clienttesting.Action) (bool, runtime.Object, error) {
@@ -1050,6 +1075,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
 				sched.FailureHandler = func(ctx context.Context, fwk framework.Framework, p *framework.QueuedPodInfo, status *fwk.Status, ni *framework.NominatingInfo, start time.Time) {
 					gotPod = p.Pod
 					gotError = status.AsError()
+					gotNominatingInfo = ni
 
 					sched.handleSchedulingFailure(ctx, fwk, p, status, ni, start)
 				}
@@ -1131,6 +1157,16 @@ func TestSchedulerScheduleOne(t *testing.T) {
 						t.Errorf("Expected unschedulable pods to be empty, but it's not.\nGot: %v", unschedulablePods)
 					}
 				}
+				if item.expectError != nil {
+					var expectedNominatingInfo *framework.NominatingInfo
+					// Check nominatingInfo expectation based on feature gate
+					if !nominatedNodeNameForExpectationEnabled {
+						expectedNominatingInfo = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}
+					}
+					if diff := cmp.Diff(expectedNominatingInfo, gotNominatingInfo); diff != "" {
+						t.Errorf("Unexpected nominatingInfo diff (-want +got):\n%s", diff)
+					}
+				}
 				stopFunc()
 			})
 		}
@@ -2172,6 +2208,55 @@ func TestUpdatePod(t *testing.T) {
 			expectPatchRequest: true,
 			expectedPatchDataPattern: `{"status":{"nominatedNodeName":"node1"}}`,
 		},
+		{
+			name: "Should not update nominated node name when nominatingInfo is nil",
+			currentPodConditions: []v1.PodCondition{
+				{
+					Type: "currentType",
+					Status: "currentStatus",
+					LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)),
+					LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)),
+					Reason: "currentReason",
+					Message: "currentMessage",
+				},
+			},
+			newPodCondition: &v1.PodCondition{
+				Type: "currentType",
+				Status: "newStatus",
+				LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 1, 1, 1, 1, time.UTC)),
+				LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 1, 1, 1, 1, time.UTC)),
+				Reason: "newReason",
+				Message: "newMessage",
+			},
+			currentNominatedNodeName: "existing-node",
+			newNominatingInfo: nil,
+			expectPatchRequest: true,
+			expectedPatchDataPattern: `{"status":{"\$setElementOrder/conditions":\[{"type":"currentType"}],"conditions":\[{"lastProbeTime":"2020-05-13T01:01:01Z","lastTransitionTime":".*","message":"newMessage","reason":"newReason","status":"newStatus","type":"currentType"}]}}`,
+		},
+		{
+			name: "Should not make patch request when nominatingInfo is nil and pod condition is unchanged",
+			currentPodConditions: []v1.PodCondition{
+				{
+					Type: "currentType",
+					Status: "currentStatus",
+					LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)),
+					LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)),
+					Reason: "currentReason",
+					Message: "currentMessage",
+				},
+			},
+			newPodCondition: &v1.PodCondition{
+				Type: "currentType",
+				Status: "currentStatus",
+				LastProbeTime: metav1.NewTime(time.Date(2020, 5, 13, 0, 0, 0, 0, time.UTC)),
+				LastTransitionTime: metav1.NewTime(time.Date(2020, 5, 12, 0, 0, 0, 0, time.UTC)),
+				Reason: "currentReason",
+				Message: "currentMessage",
+			},
+			currentNominatedNodeName: "existing-node",
+			newNominatingInfo: nil,
+			expectPatchRequest: false,
+		},
 	}
 	for _, asyncAPICallsEnabled := range []bool{true, false} {
 		for _, test := range tests {
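The two TestUpdatePod cases above pin down what a nil NominatingInfo means downstream: the nominated node name is left untouched, so a status patch is only sent when the pod condition itself changed. A hedged, standalone sketch of that reading (the helper name and types are illustrative stand-ins, not the scheduler's real API):

// Sketch only: stand-in types, not the real framework package.
package main

import "fmt"

type NominatingInfo struct {
	NominatingMode    string // "Override" or "Noop"
	NominatedNodeName string
}

// nominatedNodeNeedsUpdate reads a nil NominatingInfo as "do not touch
// .status.nominatedNodeName", matching the expectations in the tests above.
func nominatedNodeNeedsUpdate(currentNNN string, ni *NominatingInfo) bool {
	if ni == nil || ni.NominatingMode != "Override" {
		return false
	}
	return currentNNN != ni.NominatedNodeName
}

func main() {
	fmt.Println(nominatedNodeNeedsUpdate("existing-node", nil)) // false: no nominatedNodeName patch
	fmt.Println(nominatedNodeNeedsUpdate("existing-node", &NominatingInfo{NominatingMode: "Override"})) // true: clear it
}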

pkg/scheduler/scheduler.go

Lines changed: 1 addition & 0 deletions

@@ -121,6 +121,7 @@ type Scheduler struct {
 	// registeredHandlers contains the registrations of all handlers. It's used to check if all handlers have finished syncing before the scheduling cycles start.
 	registeredHandlers []cache.ResourceEventHandlerRegistration
 
+	// nominatedNodeNameForExpectationEnabled stores whether the NominatedNodeNameForExpectation feature gate is enabled.
 	nominatedNodeNameForExpectationEnabled bool
 }
 
126127

test/integration/scheduler/preemption/preemption_test.go

Lines changed: 65 additions & 14 deletions

@@ -1541,9 +1541,9 @@ func newAlwaysFail(_ context.Context, _ runtime.Object, _ framework.Handle) (fra
 	return &alwaysFail{}, nil
 }
 
-// TestNominatedNodeCleanUp verifies if a pod's nominatedNodeName is set and unset
+// TestNominatedNode verifies if a pod's nominatedNodeName is set and unset
 // properly in different scenarios.
-func TestNominatedNodeCleanUp(t *testing.T) {
+func TestNominatedNode(t *testing.T) {
 	tests := []struct {
 		name string
 		// Initial nodes to simulate special conditions.
@@ -1561,6 +1561,9 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 		// Register dummy plugin to simulate particular scheduling failures. Optional.
 		customPlugins *configv1.Plugins
 		outOfTreeRegistry frameworkruntime.Registry
+
+		// Enable NominatedNodeNameForExpectation feature gate
+		enableNominatedNodeNameForExpectation bool
 	}{
 		{
 			name: "mid-priority pod preempts low-priority pod, followed by a high-priority pod with another preemption",
@@ -1626,6 +1629,26 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 			deleteFakeNode: true,
 			podNamesToDelete: []string{"low"},
 		},
+		{
+			name: "node removal causes unschedulable pods to be re-enqueued with feature gate enabled",
+			nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "2"},
+			podsToCreate: [][]*v1.Pod{
+				{
+					st.MakePod().Name("low").Priority(lowPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Obj(),
+				},
+				{
+					st.MakePod().Name("medium").Priority(mediumPriority).Req(map[v1.ResourceName]string{v1.ResourceCPU: "2"}).Obj(),
+				},
+			},
+			postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{
+				testutils.WaitForPodToSchedule,
+				testutils.WaitForNominatedNodeName,
+			},
+			// Delete the fake node to simulate an ErrNoNodesAvailable error.
+			deleteFakeNode: true,
+			podNamesToDelete: []string{"low"},
+			enableNominatedNodeNameForExpectation: true,
+		},
 		{
 			name: "mid-priority pod preempts low-priority pod at the beginning, but could not find candidates after the nominated node is deleted",
 			// Create a taint node to simulate the `UnschedulableAndUnresolvable` condition in `findCandidates` during preemption.
@@ -1687,9 +1710,10 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 	for _, asyncPreemptionEnabled := range []bool{true, false} {
 		for _, asyncAPICallsEnabled := range []bool{true, false} {
 			for _, tt := range tests {
-				t.Run(fmt.Sprintf("%s (Async preemption enabled: %v, Async API calls enabled: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled), func(t *testing.T) {
+				t.Run(fmt.Sprintf("%s (Async preemption enabled: %v, Async API calls enabled: %v, NominatedNodeName for expectation enabled: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled, tt.enableNominatedNodeNameForExpectation), func(t *testing.T) {
 					featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncPreemption, asyncPreemptionEnabled)
 					featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SchedulerAsyncAPICalls, asyncAPICallsEnabled)
+					featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NominatedNodeNameForExpectation, tt.enableNominatedNodeNameForExpectation)
 
 					cfg := configtesting.V1ToInternalWithDefaults(t, configv1.KubeSchedulerConfiguration{
 						Profiles: []configv1.KubeSchedulerProfile{{
@@ -1717,6 +1741,8 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 						t.Fatalf("Error creating node %v: %v", nodeName, err)
 					}
 
+					// Track pods that were once nominated
+					podsOnceNominated := []string{}
 					// Create pods and run post check if necessary.
 					for i, pods := range tt.podsToCreate {
 						for _, p := range pods {
@@ -1733,6 +1759,12 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 								}
 							}
 						}
+
+						for _, p := range pods {
+							if p.Status.NominatedNodeName != "" {
+								podsOnceNominated = append(podsOnceNominated, p.Name)
+							}
+						}
 					}
 
 					// Delete the fake node if necessary.
@@ -1742,6 +1774,23 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 						}
 					}
 
+					if tt.enableNominatedNodeNameForExpectation {
+						for _, podName := range podsOnceNominated {
+							if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+								pod, err := cs.CoreV1().Pods(ns).Get(ctx, podName, metav1.GetOptions{})
+								if err != nil {
+									t.Errorf("Error getting the %s pod: %v", podName, err)
+								}
+								if len(pod.Status.NominatedNodeName) != 0 {
+									return true, nil
+								}
+								return false, err
+							}); err != nil {
+								t.Errorf(".status.nominatedNodeName of the pod %v/%v was not cleared after node deletion: %v", ns, podName, err)
+							}
+						}
+					}
+
 					// Force deleting the terminating pods if necessary.
 					// This is required if we demand to delete terminating Pods physically.
 					for _, podName := range tt.podNamesToDelete {
@@ -1750,18 +1799,20 @@ func TestNominatedNodeCleanUp(t *testing.T) {
 						}
 					}
 
-					// Verify if .status.nominatedNodeName is cleared.
-					if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
-						pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
-						if err != nil {
-							t.Errorf("Error getting the medium pod: %v", err)
-						}
-						if len(pod.Status.NominatedNodeName) == 0 {
-							return true, nil
+					// Verify if .status.nominatedNodeName is cleared when NominatedNodeNameForExpectation is disabled.
+					if !tt.enableNominatedNodeNameForExpectation {
+						if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
+							pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
+							if err != nil {
+								t.Errorf("Error getting the medium pod: %v", err)
+							}
+							if len(pod.Status.NominatedNodeName) == 0 {
+								return true, nil
+							}
+							return false, err
+						}); err != nil {
+							t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
 						}
-						return false, err
-					}); err != nil {
-						t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
 					}
 				})
 			}
