From 0d2c701799d93f8811957f2b7e862e38b145b9b0 Mon Sep 17 00:00:00 2001 From: Charles Cheng Date: Wed, 27 Nov 2024 21:31:02 +0000 Subject: [PATCH 1/4] Add metrics to acs for eni provisioning workflow monitoring --- ecs-agent/acs/session/session.go | 25 ++++++ ecs-agent/acs/session/session_test.go | 125 +++++++++++++++++++++++++- ecs-agent/metrics/constants.go | 5 ++ 3 files changed, 153 insertions(+), 2 deletions(-) diff --git a/ecs-agent/acs/session/session.go b/ecs-agent/acs/session/session.go index 6a3f733cf99..04f0ecc36dc 100644 --- a/ecs-agent/acs/session/session.go +++ b/ecs-agent/acs/session/session.go @@ -99,6 +99,8 @@ type session struct { disconnectJitter time.Duration inactiveInstanceReconnectDelay time.Duration lastConnectedTime time.Time + firstACSConnectionTime time.Time + firstDiscoverPollEndpointTime time.Time } // NewSession creates a new Session. @@ -158,6 +160,8 @@ func NewSession(containerInstanceARN string, disconnectJitter: wsclient.DisconnectJitterMax, inactiveInstanceReconnectDelay: inactiveInstanceReconnectDelay, lastConnectedTime: time.Time{}, + firstACSConnectionTime: time.Time{}, + firstDiscoverPollEndpointTime: time.Time{}, } } @@ -234,7 +238,14 @@ func (s *session) Start(ctx context.Context) error { // startSessionOnce creates a session with ACS and handles requests using the passed // in arguments. 
func (s *session) startSessionOnce(ctx context.Context) error { + if s.GetFirstDiscoverPollEndpointTime().IsZero() { + s.firstDiscoverPollEndpointTime = time.Now() + } + + discoverPollEndpointMetric := s.metricsFactory.New(metrics.ACSDiscoverPollEndpointDurationName) acsEndpoint, err := s.ecsClient.DiscoverPollEndpoint(s.containerInstanceARN) + discoverPollEndpointMetric.Done(err) + if err != nil { logger.Error("ACS: Unable to discover poll endpoint", logger.Fields{ "containerInstanceARN": s.containerInstanceARN, @@ -253,6 +264,7 @@ func (s *session) startSessionOnce(ctx context.Context) error { // Invoke Connect method as soon as we create client. This will ensure all the // request handlers to be associated with this client have a valid connection. + acsConnectionMetric := s.metricsFactory.New(metrics.ACSConnectionMetricDurationName) disconnectTimer, err := client.Connect(metrics.ACSDisconnectTimeoutMetricName, s.disconnectTimeout, s.disconnectJitter) if err != nil { @@ -262,8 +274,13 @@ func (s *session) startSessionOnce(ctx context.Context) error { }) return err } + acsConnectionMetric.Done(err) defer disconnectTimer.Stop() + if s.GetFirstACSConnectionTime().IsZero() { + s.firstACSConnectionTime = time.Now() + } + // Record the timestamp of the last connection to ACS. 
s.lastConnectedTime = time.Now() @@ -475,3 +492,11 @@ func formatDockerVersion(dockerVersionValue string) string { func (s *session) GetLastConnectedTime() time.Time { return s.lastConnectedTime } + +func (s *session) GetFirstACSConnectionTime() time.Time { + return s.firstACSConnectionTime +} + +func (s *session) GetFirstDiscoverPollEndpointTime() time.Time { + return s.firstDiscoverPollEndpointTime +} diff --git a/ecs-agent/acs/session/session_test.go b/ecs-agent/acs/session/session_test.go index 170cfccd898..1cd742773be 100644 --- a/ecs-agent/acs/session/session_test.go +++ b/ecs-agent/acs/session/session_test.go @@ -39,6 +39,7 @@ import ( "github.com/aws/amazon-ecs-agent/ecs-agent/doctor" "github.com/aws/amazon-ecs-agent/ecs-agent/eventstream" metricsfactory "github.com/aws/amazon-ecs-agent/ecs-agent/metrics" + mock_metrics "github.com/aws/amazon-ecs-agent/ecs-agent/metrics/mocks" "github.com/aws/amazon-ecs-agent/ecs-agent/utils/retry" mock_retry "github.com/aws/amazon-ecs-agent/ecs-agent/utils/retry/mock" "github.com/aws/amazon-ecs-agent/ecs-agent/wsclient" @@ -224,6 +225,16 @@ func TestSessionReconnectsOnConnectErrors(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -243,7 +254,7 @@ func 
TestSessionReconnectsOnConnectErrors(t *testing.T) { // Connect fails 10 times. mockWsClient.EXPECT().Connect(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, io.EOF).Times(10), // Cancel trying to connect to ACS on the 11th attempt. - // Failure to retry on Connect() errors should cause the test to time out as the context is never canceled. + // Failure to retry on Connect() errors should cause the test to time out as the context is never canceled. mockWsClient.EXPECT().Connect(gomock.Any(), gomock.Any(), gomock.Any()).Do(func(interface{}, interface{}, interface{}) { cancel() @@ -253,6 +264,7 @@ func TestSessionReconnectsOnConnectErrors(t *testing.T) { containerInstanceARN: testconst.ContainerInstanceARN, ecsClient: ecsClient, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -345,6 +357,16 @@ func TestSessionReconnectsWithoutBackoffOnEOFError(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -377,6 +399,7 @@ func TestSessionReconnectsWithoutBackoffOnEOFError(t *testing.T) { inactiveInstanceCB: noopFunc, backoff: mockBackoff, clientFactory: 
mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -394,6 +417,16 @@ func TestSessionReconnectsWithBackoffOnNonEOFError(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -427,6 +460,7 @@ func TestSessionReconnectsWithBackoffOnNonEOFError(t *testing.T) { inactiveInstanceCB: noopFunc, backoff: mockBackoff, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -444,6 +478,16 @@ func TestSessionCallsInactiveInstanceCB(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + 
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -480,6 +524,7 @@ func TestSessionCallsInactiveInstanceCB(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: inactiveInstanceCB, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -499,6 +544,16 @@ func TestSessionReconnectDelayForInactiveInstanceError(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -540,6 +595,7 @@ func TestSessionReconnectDelayForInactiveInstanceError(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: noopFunc, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -559,6 +615,16 @@ func TestSessionReconnectsOnServeErrors(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + 
mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -590,6 +656,7 @@ func TestSessionReconnectsOnServeErrors(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: noopFunc, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -608,6 +675,16 @@ func TestSessionStopsWhenContextIsCanceled(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).AnyTimes() + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).AnyTimes() + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).AnyTimes() + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -635,6 +712,7 @@ func TestSessionStopsWhenContextIsCanceled(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: noopFunc, clientFactory: 
mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -653,6 +731,16 @@ func TestSessionStopsWhenContextIsErrorDueToTimeout(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry) + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()) + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()) + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry) + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -677,6 +765,7 @@ func TestSessionStopsWhenContextIsErrorDueToTimeout(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: noopFunc, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, inactiveInstanceReconnectDelay: 1 * time.Hour, @@ -694,6 +783,16 @@ func TestSessionReconnectsOnDiscoverPollEndpointError(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).Times(2) + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).Times(2) + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()) + 
mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry) + ecsClient := mock_ecs.NewMockECSClient(ctrl) ctx, cancel := context.WithCancel(context.Background()) @@ -725,6 +824,7 @@ func TestSessionReconnectsOnDiscoverPollEndpointError(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: noopFunc, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -756,6 +856,16 @@ func TestConnectionIsClosedOnIdle(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry) + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()) + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()) + ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() ctx, cancel := context.WithCancel(context.Background()) @@ -783,6 +893,7 @@ func TestConnectionIsClosedOnIdle(t *testing.T) { ecsClient: ecsClient, inactiveInstanceCB: noopFunc, clientFactory: mockClientFactory, + metricsFactory: mockMetricsFactory, heartbeatTimeout: 20 * time.Millisecond, heartbeatJitter: 10 * time.Millisecond, disconnectTimeout: 30 * time.Millisecond, @@ -1003,6 +1114,16 @@ func TestSessionCorrectlySetsSendCredentials(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() + mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl) + + mockDiscoverPollEndpointEntry := 
mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.DiscoverPollEndpointDuration").Return(mockDiscoverPollEndpointEntry).Times(10) + mockDiscoverPollEndpointEntry.EXPECT().Done(gomock.Any()).Times(10) + + mockACSConnectEndpointEntry := mock_metrics.NewMockEntry(ctrl) + mockMetricsFactory.EXPECT().New("ACSStartSession.ACSConnectEndpointDuration").Return(mockACSConnectEndpointEntry).Times(10) + mockACSConnectEndpointEntry.EXPECT().Done(gomock.Any()).Times(10) + const numInvocations = 10 ecsClient := mock_ecs.NewMockECSClient(ctrl) ecsClient.EXPECT().DiscoverPollEndpoint(gomock.Any()).Return(acsURL, nil).AnyTimes() @@ -1025,7 +1146,7 @@ func TestSessionCorrectlySetsSendCredentials(t *testing.T) { nil, noopFunc, mockClientFactory, - metricsfactory.NewNopEntryFactory(), + mockMetricsFactory, agentVersion, agentGitShortHash, dockerVersion, diff --git a/ecs-agent/metrics/constants.go b/ecs-agent/metrics/constants.go index 9537a58fd7a..021dacd944a 100644 --- a/ecs-agent/metrics/constants.go +++ b/ecs-agent/metrics/constants.go @@ -46,6 +46,11 @@ const ( ACSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".ACSDisconnectTimeout" TCSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".TCSDisconnectTimeout" + // ACS Session Metrics + acsStartSessionNamespace = "ACSStartSession" + ACSDiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration" + ACSConnectionMetricDurationName = acsStartSessionNamespace + ".ACSConnectEndpointDuration" + dbClientMetricNamespace = "Data" GetNetworkConfigurationByTaskMetricName = dbClientMetricNamespace + ".GetNetworkConfigurationByTask" SaveNetworkNamespaceMetricName = dbClientMetricNamespace + ".SaveNetworkNamespace" From 7a2750bcd7281c8aebce8be6ecc7f572deb66229 Mon Sep 17 00:00:00 2001 From: Charles Cheng Date: Wed, 27 Nov 2024 21:31:02 +0000 Subject: [PATCH 2/4] Add metrics to acs for eni provisioning workflow monitoring --- .../ecs-agent/acs/session/session.go 
| 25 +++++++++++++++++++ .../ecs-agent/metrics/constants.go | 5 ++++ 2 files changed, 30 insertions(+) diff --git a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go index 6a3f733cf99..04f0ecc36dc 100644 --- a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go +++ b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go @@ -99,6 +99,8 @@ type session struct { disconnectJitter time.Duration inactiveInstanceReconnectDelay time.Duration lastConnectedTime time.Time + firstACSConnectionTime time.Time + firstDiscoverPollEndpointTime time.Time } // NewSession creates a new Session. @@ -158,6 +160,8 @@ func NewSession(containerInstanceARN string, disconnectJitter: wsclient.DisconnectJitterMax, inactiveInstanceReconnectDelay: inactiveInstanceReconnectDelay, lastConnectedTime: time.Time{}, + firstACSConnectionTime: time.Time{}, + firstDiscoverPollEndpointTime: time.Time{}, } } @@ -234,7 +238,14 @@ func (s *session) Start(ctx context.Context) error { // startSessionOnce creates a session with ACS and handles requests using the passed // in arguments. func (s *session) startSessionOnce(ctx context.Context) error { + if s.GetFirstDiscoverPollEndpointTime().IsZero() { + s.firstDiscoverPollEndpointTime = time.Now() + } + + discoverPollEndpointMetric := s.metricsFactory.New(metrics.ACSDiscoverPollEndpointDurationName) acsEndpoint, err := s.ecsClient.DiscoverPollEndpoint(s.containerInstanceARN) + discoverPollEndpointMetric.Done(err) + if err != nil { logger.Error("ACS: Unable to discover poll endpoint", logger.Fields{ "containerInstanceARN": s.containerInstanceARN, @@ -253,6 +264,7 @@ func (s *session) startSessionOnce(ctx context.Context) error { // Invoke Connect method as soon as we create client. This will ensure all the // request handlers to be associated with this client have a valid connection. 
+ acsConnectionMetric := s.metricsFactory.New(metrics.ACSConnectionMetricDurationName) disconnectTimer, err := client.Connect(metrics.ACSDisconnectTimeoutMetricName, s.disconnectTimeout, s.disconnectJitter) if err != nil { @@ -262,8 +274,13 @@ func (s *session) startSessionOnce(ctx context.Context) error { }) return err } + acsConnectionMetric.Done(err) defer disconnectTimer.Stop() + if s.GetFirstACSConnectionTime().IsZero() { + s.firstACSConnectionTime = time.Now() + } + // Record the timestamp of the last connection to ACS. s.lastConnectedTime = time.Now() @@ -475,3 +492,11 @@ func formatDockerVersion(dockerVersionValue string) string { func (s *session) GetLastConnectedTime() time.Time { return s.lastConnectedTime } + +func (s *session) GetFirstACSConnectionTime() time.Time { + return s.firstACSConnectionTime +} + +func (s *session) GetFirstDiscoverPollEndpointTime() time.Time { + return s.firstDiscoverPollEndpointTime +} diff --git a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go index 9537a58fd7a..021dacd944a 100644 --- a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go +++ b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go @@ -46,6 +46,11 @@ const ( ACSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".ACSDisconnectTimeout" TCSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".TCSDisconnectTimeout" + // ACS Session Metrics + acsStartSessionNamespace = "ACSStartSession" + ACSDiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration" + ACSConnectionMetricDurationName = acsStartSessionNamespace + ".ACSConnectEndpointDuration" + dbClientMetricNamespace = "Data" GetNetworkConfigurationByTaskMetricName = dbClientMetricNamespace + ".GetNetworkConfigurationByTask" SaveNetworkNamespaceMetricName = dbClientMetricNamespace + ".SaveNetworkNamespace" From 
9410bbf07307e4bf0fc3171ab07d3423ed7acc0e Mon Sep 17 00:00:00 2001 From: Charles Cheng Date: Wed, 27 Nov 2024 21:31:02 +0000 Subject: [PATCH 3/4] Add metrics to acs for eni provisioning workflow monitoring --- ecs-agent/acs/session/session.go | 3 +-- ecs-agent/api/ecs/client/ecs_client.go | 8 +++++++- ecs-agent/api/ecs/interface.go | 4 ++++ ecs-agent/metrics/constants.go | 6 +++--- go.mod | 3 +++ 5 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 go.mod diff --git a/ecs-agent/acs/session/session.go b/ecs-agent/acs/session/session.go index 04f0ecc36dc..4cc8dc7c922 100644 --- a/ecs-agent/acs/session/session.go +++ b/ecs-agent/acs/session/session.go @@ -242,9 +242,7 @@ func (s *session) startSessionOnce(ctx context.Context) error { s.firstDiscoverPollEndpointTime = time.Now() } - discoverPollEndpointMetric := s.metricsFactory.New(metrics.ACSDiscoverPollEndpointDurationName) acsEndpoint, err := s.ecsClient.DiscoverPollEndpoint(s.containerInstanceARN) - discoverPollEndpointMetric.Done(err) if err != nil { logger.Error("ACS: Unable to discover poll endpoint", logger.Fields{ @@ -253,6 +251,7 @@ func (s *session) startSessionOnce(ctx context.Context) error { }) return err } + s.metricsFactory.New(metrics.DiscoverPollEndpointDurationName).WithGauge(s.ecsClient.GetDiscoverPollEndpointDuration()).Done(nil) client := s.clientFactory.New( s.acsURL(acsEndpoint), diff --git a/ecs-agent/api/ecs/client/ecs_client.go b/ecs-agent/api/ecs/client/ecs_client.go index 6e57ba6b27e..eee36f04a89 100644 --- a/ecs-agent/api/ecs/client/ecs_client.go +++ b/ecs-agent/api/ecs/client/ecs_client.go @@ -77,6 +77,7 @@ type ecsClient struct { shouldExcludeIPv6PortBinding bool sascCustomRetryBackoff func(func() error) error stscAttachmentCustomRetryBackoff func(func() error) error + discoverPollEndpointDuration time.Duration } // NewECSClient creates a new ECSClient interface object. 
@@ -747,7 +748,7 @@ func (client *ecsClient) discoverPollEndpoint(containerInstanceArn string, } } } - + discoverPollEndpointStartTime := time.Now() // Cache miss or expired, invoke the ECS DiscoverPollEndpoint API. logger.Debug("Invoking DiscoverPollEndpoint", logger.Fields{ field.ContainerInstanceARN: containerInstanceArn, @@ -777,6 +778,7 @@ func (client *ecsClient) discoverPollEndpoint(containerInstanceArn string, return nil, err } + client.discoverPollEndpointDuration = time.Since(discoverPollEndpointStartTime) // Cache the response from ECS. client.pollEndpointCache.Set(containerInstanceArn, output) return output, nil @@ -870,3 +872,7 @@ func trimString(inputString string, maxLen int) string { return inputString } } + +func (client *ecsClient) GetDiscoverPollEndpointDuration() time.Duration { + return client.discoverPollEndpointDuration +} diff --git a/ecs-agent/api/ecs/interface.go b/ecs-agent/api/ecs/interface.go index fb3b3fab53f..6c962dc2b90 100644 --- a/ecs-agent/api/ecs/interface.go +++ b/ecs-agent/api/ecs/interface.go @@ -14,6 +14,8 @@ package ecs import ( + "time" + "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/request" @@ -62,6 +64,8 @@ type ECSClient interface { UpdateContainerInstancesState(instanceARN, status string) error // GetHostResources retrieves a map that map the resource name to the corresponding resource GetHostResources() (map[string]*ecs.Resource, error) + // GetDiscoverPollEndpointDuration retrieves the time it takes for the DiscoverPollEndpoint call + GetDiscoverPollEndpointDuration() time.Duration } // ECSSDK is an interface that specifies the subset of the AWS Go SDK's ECS diff --git a/ecs-agent/metrics/constants.go b/ecs-agent/metrics/constants.go index 021dacd944a..0be004b5deb 100644 --- a/ecs-agent/metrics/constants.go +++ b/ecs-agent/metrics/constants.go @@ -47,9 +47,9 @@ const ( TCSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".TCSDisconnectTimeout" // ACS Session Metrics - 
acsStartSessionNamespace = "ACSStartSession" - ACSDiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration" - ACSConnectionMetricDurationName = acsStartSessionNamespace + ".ACSConnectEndpointDuration" + acsStartSessionNamespace = "ACSStartSession" + DiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration" + ACSConnectionMetricDurationName = acsStartSessionNamespace + ".ACSConnectEndpointDuration" dbClientMetricNamespace = "Data" GetNetworkConfigurationByTaskMetricName = dbClientMetricNamespace + ".GetNetworkConfigurationByTask" diff --git a/go.mod b/go.mod new file mode 100644 index 00000000000..bd833852a62 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/aws/amazon-ecs-agent/ecs-agent + +go 1.22.7 From 70c17a40c323b0224860250b1bdee1131938deb8 Mon Sep 17 00:00:00 2001 From: Charles Cheng Date: Wed, 27 Nov 2024 21:31:02 +0000 Subject: [PATCH 4/4] Add metrics to acs for eni provisioning workflow monitoring --- .../aws/amazon-ecs-agent/ecs-agent/acs/session/session.go | 3 +-- .../ecs-agent/api/ecs/client/ecs_client.go | 8 +++++++- .../aws/amazon-ecs-agent/ecs-agent/api/ecs/interface.go | 4 ++++ .../aws/amazon-ecs-agent/ecs-agent/metrics/constants.go | 6 +++--- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go index 04f0ecc36dc..4cc8dc7c922 100644 --- a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go +++ b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/acs/session/session.go @@ -242,9 +242,7 @@ func (s *session) startSessionOnce(ctx context.Context) error { s.firstDiscoverPollEndpointTime = time.Now() } - discoverPollEndpointMetric := s.metricsFactory.New(metrics.ACSDiscoverPollEndpointDurationName) acsEndpoint, err := s.ecsClient.DiscoverPollEndpoint(s.containerInstanceARN) - 
discoverPollEndpointMetric.Done(err) if err != nil { logger.Error("ACS: Unable to discover poll endpoint", logger.Fields{ @@ -253,6 +251,7 @@ func (s *session) startSessionOnce(ctx context.Context) error { }) return err } + s.metricsFactory.New(metrics.DiscoverPollEndpointDurationName).WithGauge(s.ecsClient.GetDiscoverPollEndpointDuration()).Done(nil) client := s.clientFactory.New( s.acsURL(acsEndpoint), diff --git a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/client/ecs_client.go b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/client/ecs_client.go index 6e57ba6b27e..eee36f04a89 100644 --- a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/client/ecs_client.go +++ b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/client/ecs_client.go @@ -77,6 +77,7 @@ type ecsClient struct { shouldExcludeIPv6PortBinding bool sascCustomRetryBackoff func(func() error) error stscAttachmentCustomRetryBackoff func(func() error) error + discoverPollEndpointDuration time.Duration } // NewECSClient creates a new ECSClient interface object. @@ -747,7 +748,7 @@ func (client *ecsClient) discoverPollEndpoint(containerInstanceArn string, } } } - + discoverPollEndpointStartTime := time.Now() // Cache miss or expired, invoke the ECS DiscoverPollEndpoint API. logger.Debug("Invoking DiscoverPollEndpoint", logger.Fields{ field.ContainerInstanceARN: containerInstanceArn, @@ -777,6 +778,7 @@ func (client *ecsClient) discoverPollEndpoint(containerInstanceArn string, return nil, err } + client.discoverPollEndpointDuration = time.Since(discoverPollEndpointStartTime) // Cache the response from ECS. 
client.pollEndpointCache.Set(containerInstanceArn, output) return output, nil @@ -870,3 +872,7 @@ func trimString(inputString string, maxLen int) string { return inputString } } + +func (client *ecsClient) GetDiscoverPollEndpointDuration() time.Duration { + return client.discoverPollEndpointDuration +} diff --git a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/interface.go b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/interface.go index fb3b3fab53f..6c962dc2b90 100644 --- a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/interface.go +++ b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/api/ecs/interface.go @@ -14,6 +14,8 @@ package ecs import ( + "time" + "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/request" @@ -62,6 +64,8 @@ type ECSClient interface { UpdateContainerInstancesState(instanceARN, status string) error // GetHostResources retrieves a map that map the resource name to the corresponding resource GetHostResources() (map[string]*ecs.Resource, error) + // GetDiscoverPollEndpointDuration retrieves the time it takes for the DiscoverPollEndpoint call + GetDiscoverPollEndpointDuration() time.Duration } // ECSSDK is an interface that specifies the subset of the AWS Go SDK's ECS diff --git a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go index 021dacd944a..0be004b5deb 100644 --- a/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go +++ b/agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go @@ -47,9 +47,9 @@ const ( TCSDisconnectTimeoutMetricName = agentAvailabilityNamespace + ".TCSDisconnectTimeout" // ACS Session Metrics - acsStartSessionNamespace = "ACSStartSession" - ACSDiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration" - ACSConnectionMetricDurationName = acsStartSessionNamespace + 
".ACSConnectEndpointDuration" + acsStartSessionNamespace = "ACSStartSession" + DiscoverPollEndpointDurationName = acsStartSessionNamespace + ".DiscoverPollEndpointDuration" + ACSConnectionMetricDurationName = acsStartSessionNamespace + ".ACSConnectEndpointDuration" dbClientMetricNamespace = "Data" GetNetworkConfigurationByTaskMetricName = dbClientMetricNamespace + ".GetNetworkConfigurationByTask"