diff --git a/pkg/monitor/monitorapi/types.go b/pkg/monitor/monitorapi/types.go
index 85eb203b3751..e320ad03a72c 100644
--- a/pkg/monitor/monitorapi/types.go
+++ b/pkg/monitor/monitorapi/types.go
@@ -247,8 +247,9 @@ const (
 	LeaseAcquiringStarted IntervalReason = "StartedAcquiring"
 	LeaseAcquired         IntervalReason = "Acquired"
 
-	ReasonBadOperatorApply  IntervalReason = "BadOperatorApply"
-	ReasonKubeAPIServer500s IntervalReason = "KubeAPIServer500s"
+	ReasonBadOperatorApply      IntervalReason = "BadOperatorApply"
+	ReasonKubeAPIServer500s     IntervalReason = "KubeAPIServer500s"
+	ReasonKubeAPIServerNoLeader IntervalReason = "KubeAPIServerNoLeader"
 
 	ReasonHighGeneration    IntervalReason = "HighGeneration"
 	ReasonInvalidGeneration IntervalReason = "GenerationViolation"
diff --git a/pkg/monitortests/kubeapiserver/auditloganalyzer/handle_count_tracking.go b/pkg/monitortests/kubeapiserver/auditloganalyzer/handle_count_tracking.go
index 9effa47d0868..bc8e9d02b6bf 100644
--- a/pkg/monitortests/kubeapiserver/auditloganalyzer/handle_count_tracking.go
+++ b/pkg/monitortests/kubeapiserver/auditloganalyzer/handle_count_tracking.go
@@ -4,18 +4,19 @@ import (
 	"bytes"
 	"encoding/csv"
 	"fmt"
-	"gonum.org/v1/plot"
-	"gonum.org/v1/plot/plotter"
-	"gonum.org/v1/plot/vg"
 	"image/color"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
 	"os"
 	"path"
 	"strconv"
 	"strings"
 	"sync"
 	"time"
+
+	"gonum.org/v1/plot"
+	"gonum.org/v1/plot/plotter"
+	"gonum.org/v1/plot/vg"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
 )
 
 type CountForSecond struct {
@@ -27,6 +28,10 @@ type CountForSecond struct {
 
 	NumberOfRequestsReceived int
 
+	NumberOfRequestsReceivedThatAccessedEtcd int
+	NumberOfRequestsReceivedDidNotAccessEtcd int
+	NumberOfRequestsReceivedSawNoLeader      int
+
 	// NumberOfRequestsReceivedThatLaterGot500 is calculated based on when the request was received instead of completed
 	// because different requests have different timeouts, so it's more useful to categorize based on received time
 	NumberOfRequestsReceivedThatLaterGot500 int
@@ -78,6 +83,15 @@ func (s *countTracking) HandleAuditLogEvent(auditEvent *auditv1.Event, beginning
 	for i := receivedIndex; i <= completionIndex; i++ {
 		s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfConcurrentRequestsBeingHandled++
 	}
+
+	if _, accessedEtcd := auditEvent.Annotations["apiserver.latency.k8s.io/etcd"]; accessedEtcd {
+		s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfRequestsReceivedThatAccessedEtcd++
+	} else {
+		s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfRequestsReceivedDidNotAccessEtcd++
+	}
+	if _, gotNoLeaderError := auditEvent.Annotations["apiserver.internal.openshift.io/no-leader"]; gotNoLeaderError {
+		s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfRequestsReceivedSawNoLeader++
+	}
 }
 
 // returns received index, completion index, status code, and false if the index is out of bounds
@@ -115,16 +129,26 @@
 func (c *CountsForRun) ToCSV() ([]byte, error) {
 	out := &bytes.Buffer{}
 	csvWriter := csv.NewWriter(out)
-	if err := csvWriter.Write([]string{"seconds after cluster start", "number of requests being handled", "number of requests received", "number of requests received this second getting 500"}); err != nil {
+	if err := csvWriter.Write([]string{
+		"seconds after cluster start", "number of requests being handled", "number of requests received", "number of requests received this second getting 500",
+		"number of requests received accessing etcd", "number of requests received NOT accessing etcd", "number of requests received with no leader", "percentage of requests received with no leader"}); err != nil {
 		return nil, fmt.Errorf("failed writing headers: %w", err)
 	}
 
 	for i, curr := range c.CountsForEachSecond {
+		percentNoLeader := 0
+		if curr.NumberOfRequestsReceivedThatAccessedEtcd > 0 {
+			percentNoLeader = int(float32(curr.NumberOfRequestsReceivedSawNoLeader) / float32(curr.NumberOfRequestsReceivedThatAccessedEtcd) * 100.0)
+		}
 		record := []string{
 			strconv.Itoa(i),
 			strconv.Itoa(curr.NumberOfConcurrentRequestsBeingHandled),
 			strconv.Itoa(curr.NumberOfRequestsReceived),
 			strconv.Itoa(curr.NumberOfRequestsReceivedThatLaterGot500),
+			strconv.Itoa(curr.NumberOfRequestsReceivedThatAccessedEtcd),
+			strconv.Itoa(curr.NumberOfRequestsReceivedDidNotAccessEtcd),
+			strconv.Itoa(curr.NumberOfRequestsReceivedSawNoLeader),
+			strconv.Itoa(percentNoLeader),
 		}
 
 		if err := csvWriter.Write(record); err != nil {
@@ -187,7 +211,7 @@ func (c *CountsForRun) SubsetDataAtTime(endTime metav1.Time) *CountsForRun {
 	return ret
 }
 
-func (c *CountsForRun) ToLineChart() (*plot.Plot, error) {
+func (c *CountsForRun) ToConcurrentRequestsLineChart() (*plot.Plot, error) {
 	p := plot.New()
 	p.Title.Text = "Requests by Second of Cluster Life"
 	p.Y.Label.Text = "Number of Requests in that Second"
@@ -232,6 +256,35 @@
 	return p, nil
 }
 
+func (c *CountsForRun) ToEtcdNoLeaderLineChart() (*plot.Plot, error) {
+	p := plot.New()
+	p.Title.Text = "Percentage of Requests Each Second with Etcd No Leader"
+	p.Y.Label.Text = "Percentage of No Leader Responses in that Second"
+	p.X.Label.Text = "Seconds of Cluster Life"
+	p.X.Tick.Marker = plot.TimeTicks{}
+	plotter.DefaultLineStyle.Width = vg.Points(1)
+
+	percentageOfNoLeaderResponses := plotter.XYs{}
+	for i, requestCounts := range c.CountsForEachSecond {
+		timeOfSecond := c.EstimatedStartOfCluster.Add(time.Duration(i) * time.Second)
+		percentNoLeader := float64(0.0)
+		if requestCounts.NumberOfRequestsReceivedThatAccessedEtcd > 0 {
+			percentNoLeader = float64(requestCounts.NumberOfRequestsReceivedSawNoLeader) / float64(requestCounts.NumberOfRequestsReceivedThatAccessedEtcd) * 100.0
+		}
+		percentageOfNoLeaderResponses = append(percentageOfNoLeaderResponses, plotter.XY{X: float64(timeOfSecond.Unix()), Y: percentNoLeader})
+	}
+
+	lineOfPercentageOfNoLeaderResponses, err := plotter.NewLine(percentageOfNoLeaderResponses)
+	if err != nil {
+		return nil, err
+	}
+	lineOfPercentageOfNoLeaderResponses.LineStyle.Color = color.RGBA{R: 255, G: 0, B: 0, A: 255}
+	p.Add(lineOfPercentageOfNoLeaderResponses)
+	p.Legend.Add("Percentage of No Leader Responses in that Second", lineOfPercentageOfNoLeaderResponses)
+
+	return p, nil
+}
+
 func (c *CountsForRun) WriteContentToStorage(storageDir, name, timeSuffix string) error {
 	csvContent, err := c.ToCSV()
 	if err != nil {
@@ -241,7 +294,7 @@
 	if err := os.WriteFile(requestCountsByTimeFile, csvContent, 0644); err != nil {
 		return err
 	}
-	requestCountsForEntireRunPlot, err := c.ToLineChart()
+	requestCountsForEntireRunPlot, err := c.ToConcurrentRequestsLineChart()
 	if err != nil {
 		return err
 	}
@@ -251,5 +304,15 @@
 		return err
 	}
 
+	noEtcdLeaderByTimeEntireRunPlot, err := c.ToEtcdNoLeaderLineChart()
+	if err != nil {
+		return err
+	}
+	noEtcdLeaderByTimeTimeGraphFile := path.Join(storageDir, fmt.Sprintf("%s_%s.png", name, timeSuffix))
+	err = noEtcdLeaderByTimeEntireRunPlot.Save(vg.Length(c.NumberOfSeconds), 500, noEtcdLeaderByTimeTimeGraphFile)
+	if err != nil {
+		return err
+	}
+
 	return nil
 }
diff --git a/pkg/monitortests/kubeapiserver/auditloganalyzer/monitortest.go b/pkg/monitortests/kubeapiserver/auditloganalyzer/monitortest.go
index 2eed46b25ffa..cd4576dbbf21 100644
--- a/pkg/monitortests/kubeapiserver/auditloganalyzer/monitortest.go
+++ b/pkg/monitortests/kubeapiserver/auditloganalyzer/monitortest.go
@@ -153,39 +153,139 @@ func (w *auditLogAnalyzer) CollectData(ctx context.Context, storageDir string, b
 	return retIntervals, nil, err
 }
 
-func (*auditLogAnalyzer) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
-	return nil, nil
+func (w *auditLogAnalyzer) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
+	ret := monitorapi.Intervals{}
+	if w.requestCountTracking != nil {
+		requestCountIntervals := func() monitorapi.Intervals {
+			w.requestCountTracking.lock.Lock()
+			defer w.requestCountTracking.lock.Unlock()
+
+			retIntervals := monitorapi.Intervals{}
+
+			lastSecondSawNoLeader := false
+			currTotalRequestsToEtcd := 0
+			currTotalRequestsWithNoLeader := 0
+			startOfNoLeaderInterval := time.Time{}
+			for second, counts := range w.requestCountTracking.CountsForRun.CountsForEachSecond {
+				switch {
+				case !lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader > 0:
+					lastSecondSawNoLeader = true
+					currTotalRequestsToEtcd += counts.NumberOfRequestsReceivedThatAccessedEtcd
+					currTotalRequestsWithNoLeader += counts.NumberOfRequestsReceivedSawNoLeader
+					startOfNoLeaderInterval = w.requestCountTracking.CountsForRun.EstimatedStartOfCluster.Add(time.Second * time.Duration(second))
+
+				case lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader > 0:
+					// continuing to have no leader problems
+					currTotalRequestsToEtcd += counts.NumberOfRequestsReceivedThatAccessedEtcd
+					currTotalRequestsWithNoLeader += counts.NumberOfRequestsReceivedSawNoLeader
+
+				case !lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader == 0:
+					// continuing to be no leader problem-free, do nothing
+
+				case lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader == 0:
+					endOfNoLeaderInterval := w.requestCountTracking.CountsForRun.EstimatedStartOfCluster.Add(time.Second * time.Duration(second))
+					failurePercentage := int((float32(currTotalRequestsWithNoLeader) / float32(currTotalRequestsToEtcd)) * 100)
+					retIntervals = append(retIntervals,
+						monitorapi.NewInterval(monitorapi.SourceAuditLog, monitorapi.Error).
+							Locator(monitorapi.NewLocator().KubeAPIServerWithLB("any")).
+							Message(monitorapi.NewMessage().
+								Reason(monitorapi.ReasonKubeAPIServerNoLeader).
+								WithAnnotation(monitorapi.AnnotationCount, strconv.Itoa(currTotalRequestsWithNoLeader)).
+								WithAnnotation(monitorapi.AnnotationPercentage, strconv.Itoa(failurePercentage)).
+								HumanMessagef("%d requests made during this time failed with 'no leader' out of %d total", currTotalRequestsWithNoLeader, currTotalRequestsToEtcd),
+							).
+							Display().
+							Build(startOfNoLeaderInterval, endOfNoLeaderInterval))
+
+					lastSecondSawNoLeader = false
+					currTotalRequestsToEtcd = 0
+					currTotalRequestsWithNoLeader = 0
+					startOfNoLeaderInterval = time.Time{}
+				}
+			}
+
+			return retIntervals
+		}()
+		ret = append(ret, requestCountIntervals...)
+	}
+
+	return ret, nil
 }
 
 func (w *auditLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
 	ret := []*junitapi.JUnitTestCase{}
 
-	fiveHundredsTestName := "[Jira:kube-apiserver] kube-apiserver should not have internal failures"
-	apiserver500s := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
-		return eventInterval.Message.Reason == monitorapi.ReasonKubeAPIServer500s
-	})
-	totalDurationOf500s := 0
-	fiveHundredsFailures := []string{}
-	for _, interval := range apiserver500s {
-		totalDurationOf500s += int(interval.To.Sub(interval.From).Seconds()) + 1
-		fiveHundredsFailures = append(fiveHundredsFailures, interval.String())
-	}
-	if totalDurationOf500s > 60 {
-		ret = append(ret, &junitapi.JUnitTestCase{
-			Name: fiveHundredsTestName,
-			FailureOutput: &junitapi.FailureOutput{
-				Message: strings.Join(fiveHundredsFailures, "\n"),
-				Output: fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOf500s),
-			},
+	{
+		testName := "[Jira:kube-apiserver] kube-apiserver should not have internal failures"
+		offendingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+			return eventInterval.Message.Reason == monitorapi.ReasonKubeAPIServer500s
 		})
-		// flake for now
-		ret = append(ret, &junitapi.JUnitTestCase{
-			Name: fiveHundredsTestName,
-		})
-	} else {
-		ret = append(ret, &junitapi.JUnitTestCase{
-			Name: fiveHundredsTestName,
+		totalDurationOfOffense := 0
+		offenses := []string{}
+		for _, interval := range offendingIntervals {
+			totalDurationOfOffense += int(interval.To.Sub(interval.From).Seconds()) + 1
+			offenses = append(offenses, interval.String())
+		}
+		if totalDurationOfOffense > 60 {
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+				FailureOutput: &junitapi.FailureOutput{
+					Message: strings.Join(offenses, "\n"),
+					Output: fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOfOffense),
+				},
+			})
+			// flake for now
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+			})
+		} else {
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+			})
+		}
+	}
+
+	{
+		testName := "[Jira:etcd] kube-apiserver should not have etcd no leader problems"
+		offendingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+			return eventInterval.Message.Reason == monitorapi.ReasonKubeAPIServerNoLeader
 		})
+		totalDurationOfOffense := 0
+		offenses := []string{}
+		for _, interval := range offendingIntervals {
+			totalDurationOfOffense += int(interval.To.Sub(interval.From).Seconds()) + 1
+			offenses = append(offenses, interval.String())
+		}
+		if totalDurationOfOffense > 100 {
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+				FailureOutput: &junitapi.FailureOutput{
+					Message: strings.Join(offenses, "\n"),
+					Output: fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOfOffense),
+				},
+			})
+			// surely we don't have more than 100 seconds of no leader, start of failure?
+			//ret = append(ret, &junitapi.JUnitTestCase{
+			//	Name: testName,
+			//})
+		} else if totalDurationOfOffense > 0 {
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+				FailureOutput: &junitapi.FailureOutput{
+					Message: strings.Join(offenses, "\n"),
+					Output: fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOfOffense),
+				},
+			})
+			// flake for now
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+			})
+
+		} else {
+			ret = append(ret, &junitapi.JUnitTestCase{
+				Name: testName,
+			})
+		}
 	}
 
 	allPlatformNamespaces, err := watchnamespaces.GetAllPlatformNamespaces()