start tracking number of seconds without an etcd leader
deads2k committed Jan 14, 2025
1 parent: b573a5b · commit: e32a270
Showing 3 changed files with 200 additions and 36 deletions.
5 changes: 3 additions & 2 deletions pkg/monitor/monitorapi/types.go
@@ -247,8 +247,9 @@ const (
LeaseAcquiringStarted IntervalReason = "StartedAcquiring"
LeaseAcquired IntervalReason = "Acquired"

ReasonBadOperatorApply IntervalReason = "BadOperatorApply"
ReasonKubeAPIServer500s IntervalReason = "KubeAPIServer500s"
ReasonBadOperatorApply IntervalReason = "BadOperatorApply"
ReasonKubeAPIServer500s IntervalReason = "KubeAPIServer500s"
ReasonKubeAPIServerNoLeader IntervalReason = "KubeAPIServerNoLeader"

ReasonHighGeneration IntervalReason = "HighGeneration"
ReasonInvalidGeneration IntervalReason = "GenerationViolation"
@@ -4,18 +4,19 @@ import (
"bytes"
"encoding/csv"
"fmt"
"gonum.org/v1/plot"
"gonum.org/v1/plot/plotter"
"gonum.org/v1/plot/vg"
"image/color"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
"os"
"path"
"strconv"
"strings"
"sync"
"time"

"gonum.org/v1/plot"
"gonum.org/v1/plot/plotter"
"gonum.org/v1/plot/vg"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
)

type CountForSecond struct {
@@ -27,6 +28,10 @@ type CountForSecond struct {

NumberOfRequestsReceived int

NumberOfRequestsReceivedThatAccessedEtcd int
NumberOfRequestsReceivedDidNotAccessEtcd int
NumberOfRequestsReceivedSawNoLeader int

// NumberOfRequestsReceivedThatLaterGot500 is calculated based on when the request was received instead of completed
// because different requests have different timeouts, so it's more useful to categorize based on received time
NumberOfRequestsReceivedThatLaterGot500 int
@@ -78,6 +83,15 @@ func (s *countTracking) HandleAuditLogEvent(auditEvent *auditv1.Event, beginning
for i := receivedIndex; i <= completionIndex; i++ {
s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfConcurrentRequestsBeingHandled++
}

if _, accessedEtcd := auditEvent.Annotations["apiserver.latency.k8s.io/etcd"]; accessedEtcd {
s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfRequestsReceivedThatAccessedEtcd++
} else {
s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfRequestsReceivedDidNotAccessEtcd++
}
if _, gotNoLeaderError := auditEvent.Annotations["apiserver.internal.openshift.io/no-leader"]; gotNoLeaderError {
s.CountsForRun.CountsForEachSecond[receivedIndex].NumberOfRequestsReceivedSawNoLeader++
}
}
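
Note (not part of the commit): a minimal, hypothetical sketch of the kind of audit event HandleAuditLogEvent now counts. The two annotation keys are taken from the diff above; the request details and annotation values are invented, since only the presence of the keys is checked.

package main

import (
    "fmt"
    "time"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
)

func main() {
    // hypothetical request that touched etcd and observed a "no leader" error
    event := &auditv1.Event{
        Verb:                     "get",
        RequestURI:               "/api/v1/namespaces/default/pods/example",
        RequestReceivedTimestamp: metav1.NewMicroTime(time.Now()),
        StageTimestamp:           metav1.NewMicroTime(time.Now().Add(50 * time.Millisecond)),
        Annotations: map[string]string{
            "apiserver.latency.k8s.io/etcd":             "45ms",                  // counted as NumberOfRequestsReceivedThatAccessedEtcd
            "apiserver.internal.openshift.io/no-leader": "etcdserver: no leader", // counted as NumberOfRequestsReceivedSawNoLeader
        },
    }

    _, accessedEtcd := event.Annotations["apiserver.latency.k8s.io/etcd"]
    _, sawNoLeader := event.Annotations["apiserver.internal.openshift.io/no-leader"]
    fmt.Printf("accessedEtcd=%v sawNoLeader=%v\n", accessedEtcd, sawNoLeader)
}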

// returns received index, completion index, status code, and false if the index is out of bounds
@@ -115,16 +129,26 @@ func (c *CountsForRun) countIndexesFromAuditTime(in *auditv1.Event) (int, int, i
func (c *CountsForRun) ToCSV() ([]byte, error) {
out := &bytes.Buffer{}
csvWriter := csv.NewWriter(out)
if err := csvWriter.Write([]string{"seconds after cluster start", "number of requests being handled", "number of requests received", "number of requests received this second getting 500"}); err != nil {
if err := csvWriter.Write([]string{
"seconds after cluster start", "number of requests being handled", "number of requests received", "number of requests received this second getting 500",
"number of requests received accessing etcd", "number of requests received NOT accessing etcd", "number of requests received with no leader", "percentage of requests received with no leader"}); err != nil {
return nil, fmt.Errorf("failed writing headers: %w", err)
}

for i, curr := range c.CountsForEachSecond {
percentNoLeader := 0
if curr.NumberOfRequestsReceivedThatAccessedEtcd > 0 {
percentNoLeader = int(float32(curr.NumberOfRequestsReceivedSawNoLeader) / float32(curr.NumberOfRequestsReceivedThatAccessedEtcd) * 100.0)
}
record := []string{
strconv.Itoa(i),
strconv.Itoa(curr.NumberOfConcurrentRequestsBeingHandled),
strconv.Itoa(curr.NumberOfRequestsReceived),
strconv.Itoa(curr.NumberOfRequestsReceivedThatLaterGot500),
strconv.Itoa(curr.NumberOfRequestsReceivedThatAccessedEtcd),
strconv.Itoa(curr.NumberOfRequestsReceivedDidNotAccessEtcd),
strconv.Itoa(curr.NumberOfRequestsReceivedSawNoLeader),
strconv.Itoa(percentNoLeader),
}

if err := csvWriter.Write(record); err != nil {
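
Note (not part of the commit): a worked example of the new "percentage of requests received with no leader" column, using invented counts. The percentage is computed with float32 math and truncated to an int, matching the loop above.

package main

import "fmt"

func main() {
    // hypothetical second: 5 of 8 etcd-bound requests saw "no leader"
    sawNoLeader := 5
    accessedEtcd := 8
    percentNoLeader := 0
    if accessedEtcd > 0 {
        percentNoLeader = int(float32(sawNoLeader) / float32(accessedEtcd) * 100.0)
    }
    fmt.Println(percentNoLeader) // 62 (62.5 truncated), written as the last CSV column for that second
}
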
@@ -187,7 +211,7 @@ func (c *CountsForRun) SubsetDataAtTime(endTime metav1.Time) *CountsForRun {
return ret
}

func (c *CountsForRun) ToLineChart() (*plot.Plot, error) {
func (c *CountsForRun) ToConcurrentRequestsLineChart() (*plot.Plot, error) {
p := plot.New()
p.Title.Text = "Requests by Second of Cluster Life"
p.Y.Label.Text = "Number of Requests in that Second"
@@ -232,6 +256,35 @@ func (c *CountsForRun) ToLineChart() (*plot.Plot, error) {
return p, nil
}

func (c *CountsForRun) ToEtcdNoLeaderLineChart() (*plot.Plot, error) {
p := plot.New()
p.Title.Text = "Percentage of Requests Each Second with Etcd No Leader"
p.Y.Label.Text = "Percentage of No Leader Responses in that Second"
p.X.Label.Text = "Seconds of Cluster Life"
p.X.Tick.Marker = plot.TimeTicks{}
plotter.DefaultLineStyle.Width = vg.Points(1)

percentageOfNoLeaderResponses := plotter.XYs{}
for i, requestCounts := range c.CountsForEachSecond {
timeOfSecond := c.EstimatedStartOfCluster.Add(time.Duration(i) * time.Second)
percentNoLeader := float64(0.0)
if requestCounts.NumberOfRequestsReceivedThatAccessedEtcd > 0 {
percentNoLeader = float64(requestCounts.NumberOfRequestsReceivedSawNoLeader) / float64(requestCounts.NumberOfRequestsReceivedThatAccessedEtcd) * 100.0
}
percentageOfNoLeaderResponses = append(percentageOfNoLeaderResponses, plotter.XY{X: float64(timeOfSecond.Unix()), Y: percentNoLeader})
}

lineOfPercentageOfNoLeaderResponses, err := plotter.NewLine(percentageOfNoLeaderResponses)
if err != nil {
return nil, err
}
lineOfPercentageOfNoLeaderResponses.LineStyle.Color = color.RGBA{R: 255, G: 0, B: 0, A: 255}
p.Add(lineOfPercentageOfNoLeaderResponses)
p.Legend.Add("Percentage of No Leader Responses in that Second", lineOfPercentageOfNoLeaderResponses)

return p, nil
}

func (c *CountsForRun) WriteContentToStorage(storageDir, name, timeSuffix string) error {
csvContent, err := c.ToCSV()
if err != nil {
@@ -241,7 +294,7 @@ func (c *CountsForRun) WriteContentToStorage(storageDir, name, timeSuffix string
if err := os.WriteFile(requestCountsByTimeFile, csvContent, 0644); err != nil {
return err
}
requestCountsForEntireRunPlot, err := c.ToLineChart()
requestCountsForEntireRunPlot, err := c.ToConcurrentRequestsLineChart()
if err != nil {
return err
}
@@ -251,5 +304,15 @@ func (c *CountsForRun) WriteContentToStorage(storageDir, name, timeSuffix string
return err
}

noEtcdLeaderByTimeEntireRunPlot, err := c.ToEtcdNoLeaderLineChart()
if err != nil {
return err
}
noEtcdLeaderByTimeTimeGraphFile := path.Join(storageDir, fmt.Sprintf("%s_%s.png", name, timeSuffix))
err = noEtcdLeaderByTimeEntireRunPlot.Save(vg.Length(c.NumberOfSeconds), 500, noEtcdLeaderByTimeTimeGraphFile)
if err != nil {
return err
}

return nil
}
152 changes: 126 additions & 26 deletions pkg/monitortests/kubeapiserver/auditloganalyzer/monitortest.go
@@ -153,39 +153,139 @@ func (w *auditLogAnalyzer) CollectData(ctx context.Context, storageDir string, b
return retIntervals, nil, err
}

func (*auditLogAnalyzer) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
return nil, nil
func (w *auditLogAnalyzer) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
ret := monitorapi.Intervals{}
if w.requestCountTracking != nil {
requestCountIntervals := func() monitorapi.Intervals {
w.requestCountTracking.lock.Lock()
defer w.requestCountTracking.lock.Unlock()

retIntervals := monitorapi.Intervals{}

lastSecondSawNoLeader := false
currTotalRequestsToEtcd := 0
currTotalRequestsWithNoLeader := 0
startOfNoLeaderInterval := time.Time{}
for second, counts := range w.requestCountTracking.CountsForRun.CountsForEachSecond {
switch {
case !lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader > 0:
lastSecondSawNoLeader = true
currTotalRequestsToEtcd += counts.NumberOfRequestsReceivedThatAccessedEtcd
currTotalRequestsWithNoLeader += counts.NumberOfRequestsReceivedSawNoLeader
startOfNoLeaderInterval = w.requestCountTracking.CountsForRun.EstimatedStartOfCluster.Add(time.Second * time.Duration(second))

case lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader > 0:
// continuing to have no leader problems
currTotalRequestsToEtcd += counts.NumberOfRequestsReceivedThatAccessedEtcd
currTotalRequestsWithNoLeader += counts.NumberOfRequestsReceivedSawNoLeader

case !lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader == 0:
// continuing to be no leader problem-free, do nothing

case lastSecondSawNoLeader && counts.NumberOfRequestsReceivedSawNoLeader == 0:
endOfNoLeaderInterval := w.requestCountTracking.CountsForRun.EstimatedStartOfCluster.Add(time.Second * time.Duration(second))
failurePercentage := int((float32(currTotalRequestsWithNoLeader) / float32(currTotalRequestsToEtcd)) * 100)
retIntervals = append(retIntervals,
monitorapi.NewInterval(monitorapi.SourceAuditLog, monitorapi.Error).
Locator(monitorapi.NewLocator().KubeAPIServerWithLB("any")).
Message(monitorapi.NewMessage().
Reason(monitorapi.ReasonKubeAPIServerNoLeader).
WithAnnotation(monitorapi.AnnotationCount, strconv.Itoa(currTotalRequestsWithNoLeader)).
WithAnnotation(monitorapi.AnnotationPercentage, strconv.Itoa(failurePercentage)).
HumanMessagef("%d requests made during this time failed with 'no leader' out of %d total", currTotalRequestsWithNoLeader, currTotalRequestsToEtcd),
).
Display().
Build(startOfNoLeaderInterval, endOfNoLeaderInterval))

lastSecondSawNoLeader = false
currTotalRequestsToEtcd = 0
currTotalRequestsWithNoLeader = 0
startOfNoLeaderInterval = time.Time{}
}
}

return retIntervals
}()
ret = append(ret, requestCountIntervals...)
}

return ret, nil
}
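
Note (not part of the commit): the loop above is a small state machine that merges consecutive seconds containing "no leader" responses into one interval and reports an aggregate failure percentage for that window. A standalone sketch with invented types and numbers, mirroring the switch:

package main

import "fmt"

// toySecond stands in for CountForSecond, keeping only the two fields the state machine reads.
type toySecond struct {
    etcdRequests int
    noLeader     int
}

func main() {
    // seconds 2-4 see "no leader" responses; they should collapse into a single window
    perSecond := []toySecond{
        {etcdRequests: 10, noLeader: 0},
        {etcdRequests: 12, noLeader: 0},
        {etcdRequests: 9, noLeader: 4},
        {etcdRequests: 11, noLeader: 11},
        {etcdRequests: 10, noLeader: 2},
        {etcdRequests: 12, noLeader: 0},
    }

    inWindow := false
    start, totalEtcd, totalNoLeader := 0, 0, 0
    for second, counts := range perSecond {
        switch {
        case !inWindow && counts.noLeader > 0: // window opens
            inWindow, start = true, second
            totalEtcd, totalNoLeader = counts.etcdRequests, counts.noLeader
        case inWindow && counts.noLeader > 0: // window continues
            totalEtcd += counts.etcdRequests
            totalNoLeader += counts.noLeader
        case inWindow && counts.noLeader == 0: // first clean second closes the window
            pct := int(float32(totalNoLeader) / float32(totalEtcd) * 100)
            fmt.Printf("no-leader window: seconds %d-%d, %d/%d etcd requests failed (%d%%)\n",
                start, second, totalNoLeader, totalEtcd, pct)
            inWindow = false
        }
    }
    // prints: no-leader window: seconds 2-5, 17/30 etcd requests failed (56%)
    // note: as in the original loop, a window still open when the data runs out is not emitted
}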

func (w *auditLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
ret := []*junitapi.JUnitTestCase{}

    fiveHundredsTestName := "[Jira:kube-apiserver] kube-apiserver should not have internal failures"
    apiserver500s := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
        return eventInterval.Message.Reason == monitorapi.ReasonKubeAPIServer500s
    })
    totalDurationOf500s := 0
    fiveHundredsFailures := []string{}
    for _, interval := range apiserver500s {
        totalDurationOf500s += int(interval.To.Sub(interval.From).Seconds()) + 1
        fiveHundredsFailures = append(fiveHundredsFailures, interval.String())
    }
    if totalDurationOf500s > 60 {
        ret = append(ret, &junitapi.JUnitTestCase{
            Name: fiveHundredsTestName,
            FailureOutput: &junitapi.FailureOutput{
                Message: strings.Join(fiveHundredsFailures, "\n"),
                Output:  fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOf500s),
            },
        })
        // flake for now
        ret = append(ret, &junitapi.JUnitTestCase{
            Name: fiveHundredsTestName,
        })
    } else {
        ret = append(ret, &junitapi.JUnitTestCase{
            Name: fiveHundredsTestName,
        })
    }

    {
        testName := "[Jira:kube-apiserver] kube-apiserver should not have internal failures"
        offendingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
            return eventInterval.Message.Reason == monitorapi.ReasonKubeAPIServer500s
        })
        totalDurationOfOffense := 0
        offenses := []string{}
        for _, interval := range offendingIntervals {
            totalDurationOfOffense += int(interval.To.Sub(interval.From).Seconds()) + 1
            offenses = append(offenses, interval.String())
        }
        if totalDurationOfOffense > 60 {
            ret = append(ret, &junitapi.JUnitTestCase{
                Name: testName,
                FailureOutput: &junitapi.FailureOutput{
                    Message: strings.Join(offenses, "\n"),
                    Output:  fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOfOffense),
                },
            })
            // flake for now
            ret = append(ret, &junitapi.JUnitTestCase{
                Name: testName,
            })
        } else {
            ret = append(ret, &junitapi.JUnitTestCase{
                Name: testName,
            })
        }
    }

{
testName := "[Jira:etcd] kube-apiserver should not have etcd no leader problems"
offendingIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Message.Reason == monitorapi.ReasonKubeAPIServerNoLeader
})
totalDurationOfOffense := 0
offenses := []string{}
for _, interval := range offendingIntervals {
totalDurationOfOffense += int(interval.To.Sub(interval.From).Seconds()) + 1
offenses = append(offenses, interval.String())
}
if totalDurationOfOffense > 100 {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
FailureOutput: &junitapi.FailureOutput{
Message: strings.Join(offenses, "\n"),
Output: fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOfOffense),
},
})
// surely we don't have more than 100 seconds of no leader, start of failure?
//ret = append(ret, &junitapi.JUnitTestCase{
// Name: testName,
//})
} else if totalDurationOfOffense > 0 {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
FailureOutput: &junitapi.FailureOutput{
Message: strings.Join(offenses, "\n"),
Output: fmt.Sprintf("kube-apiserver had internal errors for %v seconds total", totalDurationOfOffense),
},
})
// flake for now
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
})

} else {
ret = append(ret, &junitapi.JUnitTestCase{
Name: testName,
})
}
}

allPlatformNamespaces, err := watchnamespaces.GetAllPlatformNamespaces()