Skip to content

Commit ab9854e

Browse files
committed
MCO-1828: Fix pathological tests for disruptive suite
This commit adds a behavior change to the way matchers work to allow some matchers to apply during "disruptive periods" in disruptive runs. During disruptive periods (delimited by start-end matchers) a new type of matcher can target a given set of events to "skip" its occurence.
1 parent e7b4f7e commit ab9854e

File tree

9 files changed

+198
-25
lines changed

9 files changed

+198
-25
lines changed

pkg/defaultmonitortests/types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ func newUniversalMonitorTests(info monitortestframework.MonitorTestInitializatio
199199
monitorTestRegistry.AddMonitorTestOrDie("additional-events-collector", "Test Framework", additionaleventscollector.NewIntervalSerializer())
200200
monitorTestRegistry.AddMonitorTestOrDie("known-image-checker", "Test Framework", knownimagechecker.NewEnsureValidImages())
201201
monitorTestRegistry.AddMonitorTestOrDie("e2e-test-analyzer", "Test Framework", e2etestanalyzer.NewAnalyzer())
202-
monitorTestRegistry.AddMonitorTestOrDie("event-collector", "Test Framework", watchevents.NewEventWatcher())
202+
monitorTestRegistry.AddMonitorTestOrDie("event-collector", "Test Framework", watchevents.NewEventWatcher(info))
203203
monitorTestRegistry.AddMonitorTestOrDie("clusteroperator-collector", "Test Framework", watchclusteroperators.NewOperatorWatcher())
204204
monitorTestRegistry.AddMonitorTestOrDie("initial-and-final-operator-log-scraper", "Test Framework", operatorloganalyzer.InitialAndFinalOperatorLogScraper())
205205
monitorTestRegistry.AddMonitorTestOrDie("lease-checker", "Test Framework", operatorloganalyzer.OperatorLeaseCheck())

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
v1 "github.com/openshift/api/config/v1"
1212
configclient "github.com/openshift/client-go/config/clientset/versioned"
1313
operatorv1client "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1"
14+
"github.com/openshift/origin/pkg/monitortestframework"
1415
"github.com/sirupsen/logrus"
1516
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1617
"k8s.io/client-go/kubernetes"
@@ -170,9 +171,7 @@ func (r *AllowedPathologicalEventRegistry) MatchesAny(i monitorapi.Interval) (bo
170171
// AllowedByAny checks if the given event locator and message match any registered matcher, and if
171172
// the repeating event should be allowed.
172173
// Returns true if so, the matcher name, and the matcher itself.
173-
func (r *AllowedPathologicalEventRegistry) AllowedByAny(
174-
i monitorapi.Interval,
175-
topology v1.TopologyMode) (bool, EventMatcher) {
174+
func (r *AllowedPathologicalEventRegistry) AllowedByAny(i monitorapi.Interval, topology v1.TopologyMode) (bool, EventMatcher) {
176175
l := i.Locator
177176
msg := i.Message
178177
for k, m := range r.matchers {
@@ -196,7 +195,11 @@ func (r *AllowedPathologicalEventRegistry) GetMatcherByName(name string) (EventM
196195

197196
// AllowedPathologicalEvents is the list of all allowed duplicate events on all jobs. Upgrade has an additional
198197
// list which is combined with this one.
199-
func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals monitorapi.Intervals) *AllowedPathologicalEventRegistry {
198+
func NewUniversalPathologicalEventMatchers(
199+
kubeConfig *rest.Config,
200+
finalIntervals monitorapi.Intervals,
201+
clusterStabilityDuringTest *monitortestframework.ClusterStabilityDuringTest,
202+
) *AllowedPathologicalEventRegistry {
200203
registry := &AllowedPathologicalEventRegistry{matchers: map[string]EventMatcher{}}
201204

202205
// [sig-apps] StatefulSet Basic StatefulSet functionality [StatefulSetBasic] should not deadlock when a pod's predecessor fails [Suite:openshift/conformance/parallel] [Suite:k8s]
@@ -502,15 +505,56 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
502505
twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
503506
registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)
504507

508+
// Add disruption matchers for Disruptive suites only
509+
if clusterStabilityDuringTest != nil && *clusterStabilityDuringTest == monitortestframework.Disruptive {
510+
disruptivePeriodList := NewDisruptivePeriodListCordonedPeriods(finalIntervals)
511+
512+
// cluster-storage-operator reports some events that may look pathological but are expected if the
513+
// nodes are rebooting/being upgraded
514+
registry.AddPathologicalEventMatcherOrDie(
515+
&DisruptionAwarePathologicalEventMatcher{
516+
disruptivePeriodList: disruptivePeriodList,
517+
delegate: &SimplePathologicalEventMatcher{
518+
name: "DriverOperatorCRProgressing",
519+
messageReasonRegex: regexp.MustCompile("^OperatorStatusChanged$"),
520+
messageHumanRegex: regexp.MustCompile(`^Status for clusteroperator/storage changed: Progressing changed from (True|False) to (True|False)`),
521+
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
522+
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^cluster-storage-operator$`),
523+
monitorapi.LocatorDeploymentKey: regexp.MustCompile(`^openshift-cluster-storage-operator$`),
524+
},
525+
},
526+
},
527+
)
528+
529+
// TopologyAwareHints events need to be discarded during node reboot/upgrade
530+
registry.AddPathologicalEventMatcherOrDie(
531+
&DisruptionAwarePathologicalEventMatcher{
532+
disruptivePeriodList: disruptivePeriodList,
533+
delegate: &SimplePathologicalEventMatcher{
534+
name: "TopologyAwareHints",
535+
messageReasonRegex: regexp.MustCompile("^TopologyAwareHints(Enabled|Disabled)$"),
536+
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
537+
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-dns$`),
538+
monitorapi.LocatorServiceKey: regexp.MustCompile(`^dns-default$`),
539+
},
540+
},
541+
},
542+
)
543+
}
544+
505545
return registry
506546
}
507547

508548
// NewUpgradePathologicalEventMatchers creates the registry for allowed events during upgrade.
509549
// Contains everything in the universal set as well.
510-
func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals monitorapi.Intervals) *AllowedPathologicalEventRegistry {
550+
func NewUpgradePathologicalEventMatchers(
551+
kubeConfig *rest.Config,
552+
finalIntervals monitorapi.Intervals,
553+
clusterStabilityDuringTest *monitortestframework.ClusterStabilityDuringTest,
554+
) *AllowedPathologicalEventRegistry {
511555

512556
// Start with the main list of matchers:
513-
registry := NewUniversalPathologicalEventMatchers(kubeConfig, finalIntervals)
557+
registry := NewUniversalPathologicalEventMatchers(kubeConfig, finalIntervals, clusterStabilityDuringTest)
514558

515559
// Now add in the matchers we only want to apply during upgrade:
516560

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events.go

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"strconv"
77
"strings"
88

9+
"github.com/openshift/origin/pkg/monitortestframework"
910
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
1011
"github.com/sirupsen/logrus"
1112

@@ -21,8 +22,12 @@ import (
2122
"k8s.io/client-go/rest"
2223
)
2324

24-
func TestDuplicatedEventForUpgrade(events monitorapi.Intervals, kubeClientConfig *rest.Config) []*junitapi.JUnitTestCase {
25-
registry := NewUpgradePathologicalEventMatchers(kubeClientConfig, events)
25+
func TestDuplicatedEventForUpgrade(
26+
events monitorapi.Intervals,
27+
kubeClientConfig *rest.Config,
28+
clusterStabilityDuringTest *monitortestframework.ClusterStabilityDuringTest,
29+
) []*junitapi.JUnitTestCase {
30+
registry := NewUpgradePathologicalEventMatchers(kubeClientConfig, events, clusterStabilityDuringTest)
2631

2732
evaluator := duplicateEventsEvaluator{
2833
registry: registry,
@@ -43,8 +48,12 @@ func TestDuplicatedEventForUpgrade(events monitorapi.Intervals, kubeClientConfig
4348
return tests
4449
}
4550

46-
func TestDuplicatedEventForStableSystem(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
47-
registry := NewUniversalPathologicalEventMatchers(clientConfig, events)
51+
func TestDuplicatedEventForStableSystem(
52+
events monitorapi.Intervals,
53+
clientConfig *rest.Config,
54+
clusterStabilityDuringTest *monitortestframework.ClusterStabilityDuringTest,
55+
) []*junitapi.JUnitTestCase {
56+
registry := NewUniversalPathologicalEventMatchers(clientConfig, events, clusterStabilityDuringTest)
4857

4958
evaluator := duplicateEventsEvaluator{
5059
registry: registry,
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
package pathologicaleventlibrary
2+
3+
import (
4+
"fmt"
5+
"regexp"
6+
"time"
7+
8+
v1 "github.com/openshift/api/config/v1"
9+
"github.com/openshift/origin/pkg/monitor/monitorapi"
10+
"github.com/sirupsen/logrus"
11+
)
12+
13+
type DisruptivePeriod struct {
14+
start time.Time
15+
end time.Time
16+
}
17+
18+
func (p *DisruptivePeriod) In(interval monitorapi.Interval) bool {
19+
return interval.From.After(p.start) && interval.To.Before(p.end)
20+
}
21+
22+
func (p *DisruptivePeriod) String() string {
23+
return fmt.Sprintf("Disruptive Period from %s to %s", p.start.Format(time.RFC3339Nano), p.end.Format(time.RFC3339Nano))
24+
}
25+
26+
type DisruptivePeriodList []DisruptivePeriod
27+
28+
func (l *DisruptivePeriodList) GetMatchingPeriod(interval monitorapi.Interval) *DisruptivePeriod {
29+
for _, disruptionPeriod := range *l {
30+
if disruptionPeriod.In(interval) {
31+
return &disruptionPeriod
32+
}
33+
}
34+
return nil
35+
}
36+
37+
type DisruptionAwarePathologicalEventMatcher struct {
38+
// delegate is a normal event matcher.
39+
delegate *SimplePathologicalEventMatcher
40+
41+
disruptivePeriodList DisruptivePeriodList
42+
}
43+
44+
func (ade *DisruptionAwarePathologicalEventMatcher) Name() string {
45+
return ade.delegate.Name()
46+
}
47+
48+
func (ade *DisruptionAwarePathologicalEventMatcher) Matches(i monitorapi.Interval) bool {
49+
return ade.delegate.Matches(i)
50+
}
51+
52+
func (ade *DisruptionAwarePathologicalEventMatcher) Allows(i monitorapi.Interval, topology v1.TopologyMode) bool {
53+
54+
// Check the delegate matcher first, if it matches, proceed to additional checks
55+
if !ade.delegate.Allows(i, topology) {
56+
return false
57+
}
58+
59+
// Match the pathological event if it happens in-between a disruption period
60+
disruptionPeriod := ade.disruptivePeriodList.GetMatchingPeriod(i)
61+
if disruptionPeriod != nil {
62+
logrus.Infof("ignoring %s pathological event as they fall within range of the disruption period: %s", i, disruptionPeriod)
63+
return true
64+
}
65+
66+
return false
67+
}
68+
69+
func NewDisruptivePeriodList(startMatcher EventMatcher, stopMatcher EventMatcher, events monitorapi.Intervals) DisruptivePeriodList {
70+
periods := DisruptivePeriodList{}
71+
var currentPeriod *DisruptivePeriod
72+
for _, event := range events {
73+
if startMatcher.Matches(event) {
74+
currentPeriod = &DisruptivePeriod{
75+
start: event.From,
76+
}
77+
} else if stopMatcher.Matches(event) {
78+
if currentPeriod != nil {
79+
currentPeriod.end = event.To
80+
periods = append(periods, *currentPeriod)
81+
}
82+
currentPeriod = nil
83+
}
84+
}
85+
return periods
86+
}
87+
88+
func NewDisruptivePeriodListCordonedPeriods(events monitorapi.Intervals) DisruptivePeriodList {
89+
startMatcher := &SimplePathologicalEventMatcher{
90+
name: "CordonStartMatcher",
91+
messageReasonRegex: regexp.MustCompile(`^Cordon$`),
92+
messageHumanRegex: regexp.MustCompile(`^Cordoned node to apply update$`),
93+
}
94+
stopMatcher := &SimplePathologicalEventMatcher{
95+
name: "CordonStartMatcher",
96+
messageReasonRegex: regexp.MustCompile(`^Uncordon$`),
97+
messageHumanRegex: regexp.MustCompile(`and node has been uncordoned$`),
98+
}
99+
return NewDisruptivePeriodList(startMatcher, stopMatcher, events)
100+
}

pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ func TestAllowedRepeatedEvents(t *testing.T) {
176176
},
177177
}
178178
for _, test := range tests {
179-
registry := NewUpgradePathologicalEventMatchers(nil, nil)
179+
registry := NewUpgradePathologicalEventMatchers(nil, nil, nil)
180180
t.Run(test.name, func(t *testing.T) {
181181
i := monitorapi.Interval{
182182
Condition: monitorapi.Condition{
@@ -387,7 +387,7 @@ func TestPathologicalEventsWithNamespaces(t *testing.T) {
387387
events := monitorapi.Intervals(test.intervals)
388388

389389
// Using upgrade for now, this has everything:
390-
registry := NewUpgradePathologicalEventMatchers(nil, test.intervals)
390+
registry := NewUpgradePathologicalEventMatchers(nil, test.intervals, nil)
391391

392392
evaluator := duplicateEventsEvaluator{
393393
registry: registry,
@@ -645,7 +645,7 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
645645

646646
events := monitorapi.Intervals(test.intervals)
647647
evaluator := duplicateEventsEvaluator{
648-
registry: NewUniversalPathologicalEventMatchers(nil, events),
648+
registry: NewUniversalPathologicalEventMatchers(nil, events, nil),
649649
}
650650

651651
testName := "events should not repeat"

pkg/monitortests/testframework/legacytestframeworkmonitortests/monitortest.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
5757

5858
isUpgrade := platformidentification.DidUpgradeHappenDuringCollection(finalIntervals, time.Time{}, time.Time{})
5959
if isUpgrade {
60-
junits = append(junits, pathologicaleventlibrary.TestDuplicatedEventForUpgrade(finalIntervals, w.adminRESTConfig)...)
60+
junits = append(junits, pathologicaleventlibrary.TestDuplicatedEventForUpgrade(finalIntervals, w.adminRESTConfig, w.clusterStabilityDuringTest)...)
6161
junits = append(junits, testAlerts(finalIntervals, alerts.AllowedAlertsDuringUpgrade, jobType, w.clusterStabilityDuringTest,
6262
w.adminRESTConfig, w.duration, w.recordedResources)...)
6363
} else {
64-
junits = append(junits, pathologicaleventlibrary.TestDuplicatedEventForStableSystem(finalIntervals, w.adminRESTConfig)...)
64+
junits = append(junits, pathologicaleventlibrary.TestDuplicatedEventForStableSystem(finalIntervals, w.adminRESTConfig, w.clusterStabilityDuringTest)...)
6565
junits = append(junits, testAlerts(finalIntervals, alerts.AllowedAlertsDuringConformance, jobType, w.clusterStabilityDuringTest,
6666
w.adminRESTConfig, w.duration, w.recordedResources)...)
6767
}

pkg/monitortests/testframework/watchevents/event.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
v1 "github.com/openshift/api/config/v1"
12+
"github.com/openshift/origin/pkg/monitortestframework"
1213
"github.com/sirupsen/logrus"
1314

1415
"github.com/openshift/origin/pkg/monitortestlibrary/pathologicaleventlibrary"
@@ -26,7 +27,7 @@ import (
2627

2728
var reMatchFirstQuote = regexp.MustCompile(`"([^"]+)"( in (\d+(\.\d+)?(s|ms)$))?`)
2829

29-
func startEventMonitoring(ctx context.Context, m monitorapi.RecorderWriter, adminRESTConfig *rest.Config, client kubernetes.Interface) {
30+
func startEventMonitoring(ctx context.Context, m monitorapi.RecorderWriter, adminRESTConfig *rest.Config, client kubernetes.Interface, info monitortestframework.MonitorTestInitializationInfo) {
3031

3132
// filter out events written "now" but with significantly older start times (events
3233
// created in test jobs are the most common)
@@ -64,7 +65,15 @@ func startEventMonitoring(ctx context.Context, m monitorapi.RecorderWriter, admi
6465
return nil
6566
}
6667
if processedEventUIDs[event.UID] != event.ResourceVersion {
67-
recordAddOrUpdateEvent(ctx, m, topology, client, significantlyBeforeNow, event)
68+
recordAddOrUpdateEvent(
69+
ctx,
70+
m,
71+
topology,
72+
client,
73+
significantlyBeforeNow,
74+
event,
75+
&info.ClusterStabilityDuringTest,
76+
)
6877
processedEventUIDs[event.UID] = event.ResourceVersion
6978
}
7079
return nil
@@ -75,7 +84,15 @@ func startEventMonitoring(ctx context.Context, m monitorapi.RecorderWriter, admi
7584
return nil
7685
}
7786
if processedEventUIDs[event.UID] != event.ResourceVersion {
78-
recordAddOrUpdateEvent(ctx, m, topology, client, significantlyBeforeNow, event)
87+
recordAddOrUpdateEvent(
88+
ctx,
89+
m,
90+
topology,
91+
client,
92+
significantlyBeforeNow,
93+
event,
94+
&info.ClusterStabilityDuringTest,
95+
)
7996
processedEventUIDs[event.UID] = event.ResourceVersion
8097
}
8198
return nil
@@ -91,7 +108,9 @@ func recordAddOrUpdateEvent(
91108
topology v1.TopologyMode,
92109
client kubernetes.Interface,
93110
significantlyBeforeNow time.Time,
94-
obj *corev1.Event) {
111+
obj *corev1.Event,
112+
clusterStabilityDuringTest *monitortestframework.ClusterStabilityDuringTest,
113+
) {
95114

96115
recorder.RecordResource("events", obj)
97116

@@ -188,7 +207,7 @@ func recordAddOrUpdateEvent(
188207
// times it occurred. We include upgrade allowances here. (the upgrade set contains both)
189208
// We do not pass a Kubeconfig or list of final intervals (as final intervals obviously do not exist), so a small subset of more matchers will not be active,
190209
// and will not get flagged as "interesting" as a result.
191-
registry := pathologicaleventlibrary.NewUpgradePathologicalEventMatchers(nil, nil)
210+
registry := pathologicaleventlibrary.NewUpgradePathologicalEventMatchers(nil, nil, clusterStabilityDuringTest)
192211

193212
intervalBuilder := monitorapi.NewInterval(monitorapi.SourceKubeEvent, level)
194213

pkg/monitortests/testframework/watchevents/event_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ func Test_recordAddOrUpdateEvent(t *testing.T) {
145145
}
146146
t.Run(tt.name, func(t *testing.T) {
147147
significantlyBeforeNow := now.UTC().Add(-15 * time.Minute)
148-
recordAddOrUpdateEvent(tt.args.ctx, tt.args.m, "", nil, significantlyBeforeNow, tt.args.kubeEvent)
148+
recordAddOrUpdateEvent(tt.args.ctx, tt.args.m, "", nil, significantlyBeforeNow, tt.args.kubeEvent, nil)
149149
intervals := tt.args.m.Intervals(now.Add(-10*time.Minute), now.Add(10*time.Minute))
150150
assert.Equal(t, 1, len(intervals))
151151
interval := intervals[0]

pkg/monitortests/testframework/watchevents/monitortest.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@ import (
1313
)
1414

1515
type eventWatcher struct {
16+
info monitortestframework.MonitorTestInitializationInfo
1617
}
1718

18-
func NewEventWatcher() monitortestframework.MonitorTest {
19-
return &eventWatcher{}
19+
func NewEventWatcher(info monitortestframework.MonitorTestInitializationInfo) monitortestframework.MonitorTest {
20+
return &eventWatcher{info: info}
2021
}
2122

2223
func (w *eventWatcher) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
@@ -29,7 +30,7 @@ func (w *eventWatcher) StartCollection(ctx context.Context, adminRESTConfig *res
2930
return err
3031
}
3132

32-
startEventMonitoring(ctx, recorder, adminRESTConfig, kubeClient)
33+
startEventMonitoring(ctx, recorder, adminRESTConfig, kubeClient, w.info)
3334

3435
return nil
3536
}

0 commit comments

Comments
 (0)