From 4f176d9dafb2dded06ba6f7dfc64c8095c71ee80 Mon Sep 17 00:00:00 2001 From: Nate Mortensen Date: Thu, 30 Jan 2025 14:11:11 -0800 Subject: [PATCH] Optimize poller counting Instead of iterating through the entire history cache, use the count where the values aren't needed. Additionally remove the tasklist isolation specific logic from emitMisconfiguredPartitionMetrics. This could add additional CPU usage when enabling tasklist isolation and accurate when tasklist <-> isolation group assignment is implemented. Generally we need to revisit this metric as the source of truth for partition count is moving from the dynamic config to persistence, but this metric still relies only on dynamic config. If partition autoscaling is enabled then it seems strange to alert in this scenario. --- service/matching/poller/history.go | 5 +++++ service/matching/poller/history_test.go | 10 ++++++++++ service/matching/tasklist/task_list_manager.go | 12 ++---------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/service/matching/poller/history.go b/service/matching/poller/history.go index 30b55eeff2c..8bc93b44c75 100644 --- a/service/matching/poller/history.go +++ b/service/matching/poller/history.go @@ -46,6 +46,7 @@ type ( History interface { UpdatePollerInfo(id Identity, info Info) HasPollerAfter(earliestAccessTime time.Time) bool + GetPollerCount() int GetPollerInfo(earliestAccessTime time.Time) []*types.PollerInfo GetPollerIsolationGroups(earliestAccessTime time.Time) map[string]int } @@ -106,6 +107,10 @@ func (pollers *history) HasPollerAfter(earliestAccessTime time.Time) bool { return false } +func (pollers *history) GetPollerCount() int { + return pollers.historyCache.Size() +} + func (pollers *history) GetPollerInfo(earliestAccessTime time.Time) []*types.PollerInfo { var result []*types.PollerInfo // optimistic size get, it can change before Iterator call. diff --git a/service/matching/poller/history_test.go b/service/matching/poller/history_test.go index 88d1c4b3b75..986e55163e2 100644 --- a/service/matching/poller/history_test.go +++ b/service/matching/poller/history_test.go @@ -120,6 +120,16 @@ func TestHistory_HasPollerAfter(t *testing.T) { }) } +func TestGetPollerCount(t *testing.T) { + mockCtrl := gomock.NewController(t) + mockCache := cache.NewMockCache(mockCtrl) + mockCache.EXPECT().Size().Return(10) + p := &history{ + historyCache: mockCache, + } + assert.Equal(t, 10, p.GetPollerCount()) +} + func TestGetPollerInfo(t *testing.T) { t.Run("with_time_filter", func(t *testing.T) { mockCtrl := gomock.NewController(t) diff --git a/service/matching/tasklist/task_list_manager.go b/service/matching/tasklist/task_list_manager.go index ba968a1d563..85e1de1c69e 100644 --- a/service/matching/tasklist/task_list_manager.go +++ b/service/matching/tasklist/task_list_manager.go @@ -206,7 +206,7 @@ func NewManager( tlMgr.pollerHistory = poller.NewPollerHistory(func() { scope.UpdateGauge(metrics.PollerPerTaskListCounter, - float64(len(tlMgr.pollerHistory.GetPollerInfo(time.Time{})))) + float64(tlMgr.pollerHistory.GetPollerCount())) }, timeSource) livenessInterval := taskListConfig.IdleTasklistCheckInterval() @@ -945,15 +945,7 @@ func (c *taskListManagerImpl) emitMisconfiguredPartitionMetrics() { if c.config.NumReadPartitions() != c.config.NumWritePartitions() { c.scope.UpdateGauge(metrics.TaskListReadWritePartitionMismatchGauge, 1) } - pollerCount := len(c.pollerHistory.GetPollerInfo(time.Time{})) - if c.enableIsolation { // if isolation enabled, get the minimum poller count among the isolation groups - pollerCountsByIsolationGroup := c.pollerHistory.GetPollerIsolationGroups(time.Time{}) - for _, count := range pollerCountsByIsolationGroup { - if count < pollerCount { - pollerCount = count - } - } - } + pollerCount := c.pollerHistory.GetPollerCount() if pollerCount < c.config.NumReadPartitions() || pollerCount < c.config.NumWritePartitions() { c.scope.Tagged(metrics.IsolationEnabledTag(c.enableIsolation)).UpdateGauge(metrics.TaskListPollerPartitionMismatchGauge, 1) }