Skip to content

Commit 6e78fc2

Browse files
author
Aaron Lehmann
committed
allocator: Less aggressive retry
Instead of retrying unallocated tasks, services, and networks every time data changes in the store, limit these retries to at most once every 5 minutes. When a repeated attempt to allocate one of these objects fails, log it at the debug log level to reduce noise in the logs.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
1 parent e4762bc commit 6e78fc2

File tree

2 files changed

+54
-19
lines changed

2 files changed

+54
-19
lines changed

manager/allocator/allocator_test.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,11 @@ import (
1616
"github.com/stretchr/testify/require"
1717
)
1818

19+
func init() {
20+
// set artificially low retry interval for testing
21+
retryInterval = 5 * time.Millisecond
22+
}
23+
1924
func TestAllocator(t *testing.T) {
2025
s := store.NewMemoryStore(nil)
2126
assert.NotNil(t, s)

manager/allocator/network.go

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@ const (
2626
allocatedStatusMessage = "pending task scheduling"
2727
)
2828

29-
var errNoChanges = errors.New("task unchanged")
29+
var (
30+
errNoChanges = errors.New("task unchanged")
31+
32+
retryInterval = 5 * time.Minute
33+
)
3034

3135
func newIngressNetwork() *api.Network {
3236
return &api.Network{
@@ -57,19 +61,28 @@ type networkContext struct {
5761
// the actual network allocation.
5862
nwkAllocator *networkallocator.NetworkAllocator
5963

60-
// A table of unallocated tasks which will be revisited if any thing
64+
// A set of tasks which are ready to be allocated as a batch. This is
65+
// distinct from "unallocatedTasks" which are tasks that failed to
66+
// allocate on the first try, being held for a future retry.
67+
pendingTasks map[string]*api.Task
68+
69+
// A set of unallocated tasks which will be revisited (at most once per
6170
// retryInterval) if anything changes in system state that might help task allocation.
6271
unallocatedTasks map[string]*api.Task
6372

64-
// A table of unallocated services which will be revisited if
73+
// A set of unallocated services which will be revisited (at most
6574
// once per retryInterval) if anything changes in system state that
6675
// might help service allocation.
6776
unallocatedServices map[string]*api.Service
6877

69-
// A table of unallocated networks which will be revisited if
78+
// A set of unallocated networks which will be revisited (at most
7079
// once per retryInterval) if anything changes in system state that
7180
// might help network allocation.
7281
unallocatedNetworks map[string]*api.Network
82+
83+
// lastRetry is the last timestamp when unallocated
84+
// tasks/services/networks were retried.
85+
lastRetry time.Time
7386
}
7487

7588
func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
@@ -80,10 +93,12 @@ func (a *Allocator) doNetworkInit(ctx context.Context) (err error) {
8093

8194
nc := &networkContext{
8295
nwkAllocator: na,
96+
pendingTasks: make(map[string]*api.Task),
8397
unallocatedTasks: make(map[string]*api.Task),
8498
unallocatedServices: make(map[string]*api.Service),
8599
unallocatedNetworks: make(map[string]*api.Network),
86100
ingressNetwork: newIngressNetwork(),
101+
lastRetry: time.Now(),
87102
}
88103
a.netCtx = nc
89104
defer func() {
@@ -401,12 +416,22 @@ func (a *Allocator) doNetworkAlloc(ctx context.Context, ev events.Event) {
401416
case state.EventCreateNode, state.EventUpdateNode, state.EventDeleteNode:
402417
a.doNodeAlloc(ctx, ev)
403418
case state.EventCreateTask, state.EventUpdateTask, state.EventDeleteTask:
404-
a.doTaskAlloc(ctx, ev)
419+
a.doTaskAlloc(ctx, ev, nc.pendingTasks)
405420
case state.EventCommit:
406-
a.procUnallocatedNetworks(ctx)
407-
a.procUnallocatedServices(ctx)
408-
a.procUnallocatedTasksNetwork(ctx)
409-
return
421+
a.procTasksNetwork(ctx, nc.pendingTasks, false)
422+
423+
if time.Since(nc.lastRetry) > retryInterval {
424+
a.procUnallocatedNetworks(ctx)
425+
a.procUnallocatedServices(ctx)
426+
a.procTasksNetwork(ctx, nc.unallocatedTasks, true)
427+
nc.lastRetry = time.Now()
428+
}
429+
430+
// Any leftover tasks are moved to the unallocated set
431+
for _, t := range nc.pendingTasks {
432+
nc.unallocatedTasks[t.ID] = t
433+
}
434+
nc.pendingTasks = make(map[string]*api.Task)
410435
}
411436
}
412437

@@ -551,7 +576,7 @@ func (a *Allocator) taskCreateNetworkAttachments(t *api.Task, s *api.Service) {
551576
taskUpdateNetworks(t, networks)
552577
}
553578

554-
func (a *Allocator) doTaskAlloc(ctx context.Context, ev events.Event) {
579+
func (a *Allocator) doTaskAlloc(ctx context.Context, ev events.Event, toAllocate map[string]*api.Task) {
555580
var (
556581
isDelete bool
557582
t *api.Task
@@ -579,14 +604,16 @@ func (a *Allocator) doTaskAlloc(ctx context.Context, ev events.Event) {
579604
}
580605
}
581606

582-
// Cleanup any task references that might exist in unallocatedTasks
607+
// Cleanup any task references that might exist
608+
delete(toAllocate, t.ID)
583609
delete(nc.unallocatedTasks, t.ID)
584610
return
585611
}
586612

587613
// If we are already in allocated state, there is
588614
// absolutely nothing else to do.
589615
if t.Status.State >= api.TaskStatePending {
616+
delete(toAllocate, t.ID)
590617
delete(nc.unallocatedTasks, t.ID)
591618
return
592619
}
@@ -616,7 +643,7 @@ func (a *Allocator) doTaskAlloc(ctx context.Context, ev events.Event) {
616643
// based on service spec.
617644
a.taskCreateNetworkAttachments(t, s)
618645

619-
nc.unallocatedTasks[t.ID] = t
646+
toAllocate[t.ID] = t
620647
}
621648

622649
func (a *Allocator) allocateNode(ctx context.Context, node *api.Node) error {
@@ -948,15 +975,18 @@ func (a *Allocator) procUnallocatedServices(ctx context.Context) {
948975
}
949976
}
950977

951-
func (a *Allocator) procUnallocatedTasksNetwork(ctx context.Context) {
952-
nc := a.netCtx
953-
allocatedTasks := make([]*api.Task, 0, len(nc.unallocatedTasks))
978+
func (a *Allocator) procTasksNetwork(ctx context.Context, toAllocate map[string]*api.Task, quiet bool) {
979+
allocatedTasks := make([]*api.Task, 0, len(toAllocate))
954980

955-
for _, t := range nc.unallocatedTasks {
981+
for _, t := range toAllocate {
956982
if err := a.allocateTask(ctx, t); err == nil {
957983
allocatedTasks = append(allocatedTasks, t)
958984
} else if err != errNoChanges {
959-
log.G(ctx).WithError(err).Error("task allocation failure")
985+
if quiet {
986+
log.G(ctx).WithError(err).Debug("task allocation failure")
987+
} else {
988+
log.G(ctx).WithError(err).Error("task allocation failure")
989+
}
960990
}
961991
}
962992

@@ -978,11 +1008,11 @@ func (a *Allocator) procUnallocatedTasksNetwork(ctx context.Context) {
9781008
})
9791009

9801010
if err != nil {
981-
log.G(ctx).WithError(err).Error("failed a store batch operation while processing unallocated tasks")
1011+
log.G(ctx).WithError(err).Error("failed a store batch operation while processing tasks")
9821012
}
9831013

9841014
for _, t := range allocatedTasks[:committed] {
985-
delete(nc.unallocatedTasks, t.ID)
1015+
delete(toAllocate, t.ID)
9861016
}
9871017
}
9881018

0 commit comments

Comments
 (0)