Skip to content

Commit bad744e

Browse files
committed
ClusterOperators should not go Progressing only for a node reboot
This is to cover the node rebooting case from the rule [1] that is introduced recently: ``` Operators should not report Progressing only because DaemonSets owned by them are adjusting to a new node from cluster scaleup or a node rebooting from cluster upgrade. ``` The test fails if - `co/machine-config` never became Progressing=True during a cluster upgrade, or - some CO left Progressing=False during the upgrade after `machine-config` became Progressing=True. This should not have taken place as `machine-config` was rebooting the nodes which was the only thing ongoing to the cluster during that time. [1]. https://github.com/openshift/api/blob/61248d910ff74aef020492922d14e6dadaba598b/config/v1/types_cluster_operator.go#L163-L164
1 parent 92f0f58 commit bad744e

File tree

2 files changed

+136
-7
lines changed

2 files changed

+136
-7
lines changed

pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
4545
isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
4646
if isUpgrade {
4747
junits = append(junits, testUpgradeOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
48+
junits = append(junits, clusterOperatorIsNotProgressingWhenMachineConfigIs(finalIntervals)...)
4849
} else {
4950
junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
5051
}

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go

Lines changed: 135 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,21 @@ import (
66
"strings"
77
"time"
88

9-
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
9+
configv1 "github.com/openshift/api/config/v1"
10+
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
11+
"github.com/sirupsen/logrus"
1012
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/util/sets"
1114
"k8s.io/client-go/kubernetes"
15+
"k8s.io/client-go/rest"
1216

13-
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
14-
"github.com/sirupsen/logrus"
15-
16-
configv1 "github.com/openshift/api/config/v1"
17-
clientconfigv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
1817
"github.com/openshift/origin/pkg/monitor/monitorapi"
1918
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
2019
platformidentification2 "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
20+
"github.com/openshift/origin/pkg/monitortestlibrary/utility"
21+
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
2122
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
2223
exutil "github.com/openshift/origin/test/extended/util"
23-
"k8s.io/client-go/rest"
2424
)
2525

2626
// exceptionCallback consumes a suspicious condition and returns an
@@ -599,6 +599,134 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
599599
return ret
600600
}
601601

602+
func clusterOperatorIsNotProgressingWhenMachineConfigIs(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
603+
var ret []*junitapi.JUnitTestCase
604+
upgradeWindows := getUpgradeWindows(events)
605+
606+
var machineConfigProgressing time.Time
607+
var eventsInUpgradeWindows monitorapi.Intervals
608+
609+
var start, stop time.Time
610+
for _, event := range events {
611+
if !isInUpgradeWindow(upgradeWindows, event) {
612+
continue
613+
}
614+
eventsInUpgradeWindows = append(eventsInUpgradeWindows, event)
615+
if start.IsZero() || event.From.Before(start) {
616+
start = event.From
617+
}
618+
if stop.IsZero() || event.To.After(stop) {
619+
stop = event.To
620+
}
621+
}
622+
duration := stop.Sub(start).Seconds()
623+
624+
eventsByOperator := getEventsByOperator(eventsInUpgradeWindows)
625+
for _, mcEvent := range eventsByOperator["machine-config"] {
626+
condition := monitorapi.GetOperatorConditionStatus(mcEvent)
627+
if condition == nil {
628+
continue // ignore non-condition intervals
629+
}
630+
if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue {
631+
machineConfigProgressing = mcEvent.To
632+
}
633+
}
634+
635+
mcTestCase := &junitapi.JUnitTestCase{
636+
Name: fmt.Sprintf("[bz-Machine Config Operator] clusteroperator/machine-config must go Progressing=True during an upgrade test"),
637+
Duration: duration,
638+
}
639+
if machineConfigProgressing.IsZero() {
640+
mcTestCase.FailureOutput = &junitapi.FailureOutput{
641+
Output: fmt.Sprintf("machine-config was never Progressing=True during the upgrade window from %s to %s", start.Format(time.RFC3339), stop.Format(time.RFC3339)),
642+
}
643+
return []*junitapi.JUnitTestCase{mcTestCase}
644+
} else {
645+
mcTestCase.SystemOut = fmt.Sprintf("machine-config became Progressing=True at %s during the upgrade window from %s to %s", machineConfigProgressing.Format(time.RFC3339), start.Format(time.RFC3339), stop.Format(time.RFC3339))
646+
}
647+
ret = append(ret, mcTestCase)
648+
649+
for _, operatorName := range platformidentification.KnownOperators.Difference(sets.NewString("machine-config")).List() {
650+
bzComponent := platformidentification.GetBugzillaComponentForOperator(operatorName)
651+
if bzComponent == "Unknown" {
652+
bzComponent = operatorName
653+
}
654+
testName := fmt.Sprintf("[bz-%v] clusteroperator/%v should stay Progressing=False while MCO is Progressing=True", bzComponent, operatorName)
655+
operatorEvents := eventsByOperator[operatorName]
656+
if len(operatorEvents) == 0 {
657+
ret = append(ret, &junitapi.JUnitTestCase{
658+
Name: testName,
659+
Duration: duration,
660+
})
661+
continue
662+
}
663+
664+
except := func(co string, condition *configv1.ClusterOperatorStatusCondition) string {
665+
return ""
666+
}
667+
668+
var excepted, fatal []string
669+
for _, operatorEvent := range operatorEvents {
670+
if operatorEvent.From.Before(machineConfigProgressing) {
671+
continue
672+
}
673+
condition := monitorapi.GetOperatorConditionStatus(operatorEvent)
674+
if condition == nil {
675+
continue // ignore non-condition intervals
676+
}
677+
if condition.Type == "" {
678+
fatal = append(fatal, fmt.Sprintf("failed to convert %v into a condition with a type", operatorEvent))
679+
continue
680+
}
681+
682+
if condition.Type != configv1.OperatorProgressing || condition.Status == configv1.ConditionFalse {
683+
continue
684+
}
685+
686+
// if there was any switch, it was wrong/unexpected at some point
687+
failure := fmt.Sprintf("%v", operatorEvent)
688+
689+
exception := except(operatorName, condition)
690+
if exception == "" {
691+
fatal = append(fatal, failure)
692+
} else {
693+
excepted = append(excepted, fmt.Sprintf("%s (exception: %s)", failure, exception))
694+
}
695+
}
696+
697+
output := fmt.Sprintf("%d unexpected clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", len(fatal), start.Format(time.RFC3339), stop.Format(time.RFC3339))
698+
if len(fatal) > 0 {
699+
output = fmt.Sprintf("%s. These did not match any known exceptions, so they cause this test-case to fail:\n\n%v\n", output, strings.Join(fatal, "\n"))
700+
} else {
701+
output = fmt.Sprintf("%s, as desired.", output)
702+
}
703+
output = fmt.Sprintf("%s\n%d unwelcome but acceptable clusteroperator state transitions while machine-config is progressing during the upgrade window from %s to %s", output, len(excepted), start.Format(time.RFC3339), stop.Format(time.RFC3339))
704+
if len(excepted) > 0 {
705+
output = fmt.Sprintf("%s. These should not happen, but because they are tied to exceptions, the fact that they did happen is not sufficient to cause this test-case to fail:\n\n%v\n", output, strings.Join(excepted, "\n"))
706+
} else {
707+
output = fmt.Sprintf("%s, as desired.", output)
708+
}
709+
710+
if len(fatal) > 0 || len(excepted) > 0 {
711+
ret = append(ret, &junitapi.JUnitTestCase{
712+
Name: testName,
713+
Duration: duration,
714+
SystemOut: output,
715+
FailureOutput: &junitapi.FailureOutput{
716+
Output: output,
717+
},
718+
})
719+
}
720+
721+
if len(fatal) == 0 {
722+
// add a success so we flake (or pass) and don't fail
723+
ret = append(ret, &junitapi.JUnitTestCase{Name: testName, Duration: duration, SystemOut: output})
724+
}
725+
}
726+
727+
return ret
728+
}
729+
602730
type startedStaged struct {
603731
// OSUpdateStarted is the event Reason emitted by the machine config operator when a node begins extracting
604732
// it's OS content.

0 commit comments

Comments
 (0)