From ddc4deb0b5f4b5a41a2c32562ca7a469e269971f Mon Sep 17 00:00:00 2001 From: BinBin He Date: Fri, 27 Dec 2024 17:52:17 +0000 Subject: [PATCH] Terminally exit on unrecoverable exceptions for RCI. --- agent/app/agent.go | 31 ++++++--- agent/app/agent_test.go | 142 ++++++++++++++++++++++++---------------- agent/app/errors.go | 13 ++-- 3 files changed, 112 insertions(+), 74 deletions(-) diff --git a/agent/app/agent.go b/agent/app/agent.go index 6a4d0bc7ab3..6197bfdeeb8 100644 --- a/agent/app/agent.go +++ b/agent/app/agent.go @@ -447,10 +447,15 @@ func (agent *ecsAgent) doStart(containerChangeEventStream *eventstream.EventStre // Register the container instance err = agent.registerContainerInstance(client, vpcSubnetAttributes) if err != nil { - if isTransient(err) { - return exitcodes.ExitError + if isTerminal(err) { + // On unrecoverable error codes, agent should terminally exit. + logger.Critical("Agent will terminally exit, unable to register container instance:", logger.Fields{ + field.Error: err, + }) + return exitcodes.ExitTerminal } - return exitcodes.ExitTerminal + // Other errors are considered recoverable and will be retried. + return exitcodes.ExitError } // Load Managed Daemon images asynchronously @@ -834,13 +839,19 @@ func (agent *ecsAgent) registerContainerInstance( field.Error: err, }) if retriable, ok := err.(apierrors.Retriable); ok && !retriable.Retry() { - return err + return terminalError{err} } if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeInvalidParameterException) { logger.Critical("Instance registration attempt with an invalid parameter", logger.Fields{ field.Error: err, }) - return err + return terminalError{err} + } + if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeClientException) { + logger.Critical("Instance registration attempt with client performing invalid action", logger.Fields{ + field.Error: err, + }) + return terminalError{err} } if _, ok := err.(apierrors.AttributeError); ok { attributeErrorMsg := "" @@ -850,9 +861,9 @@ func (agent *ecsAgent) registerContainerInstance( logger.Critical("Instance registration attempt with invalid attribute(s)", logger.Fields{ field.Error: attributeErrorMsg, }) - return err + return terminalError{err} } - return transientError{err} + return err } logger.Info("Instance registration completed successfully", logger.Fields{ "instanceArn": containerInstanceArn, @@ -882,7 +893,7 @@ func (agent *ecsAgent) reregisterContainerInstance(client ecs.ECSClient, capabil }) if apierrors.IsInstanceTypeChangedError(err) { seelog.Criticalf(instanceTypeMismatchErrorFormat, err) - return err + return terminalError{err} } if _, ok := err.(apierrors.AttributeError); ok { attributeErrorMsg := "" @@ -892,9 +903,9 @@ func (agent *ecsAgent) reregisterContainerInstance(client ecs.ECSClient, capabil logger.Critical("Instance re-registration attempt with invalid attribute(s)", logger.Fields{ field.Error: attributeErrorMsg, }) - return err + return terminalError{err} } - return transientError{err} + return err } // startAsyncRoutines starts all background methods diff --git a/agent/app/agent_test.go b/agent/app/agent_test.go index 9dc8bbafa36..842edb31651 100644 --- a/agent/app/agent_test.go +++ b/agent/app/agent_test.go @@ -222,7 +222,7 @@ func TestDoStartNewTaskEngineError(t *testing.T) { assert.Equal(t, exitcodes.ExitTerminal, exitCode) } -func TestDoStartRegisterContainerInstanceErrorTerminal(t *testing.T) { +func TestDoStartRegisterContainerInstanceErrorTransient(t *testing.T) { ctrl, credentialsManager, state, imageManager, client, dockerClient, _, _, execCmdMgr, _ := setup(t) defer ctrl.Finish() @@ -282,7 +282,7 @@ func TestDoStartRegisterContainerInstanceErrorTerminal(t *testing.T) { exitCode := agent.doStart(eventstream.NewEventStream("events", ctx), credentialsManager, state, imageManager, client, execCmdMgr) - assert.Equal(t, exitcodes.ExitTerminal, exitCode) + assert.Equal(t, exitcodes.ExitError, exitCode) } func TestDoStartRegisterContainerInstanceErrorNonTerminal(t *testing.T) { @@ -1438,67 +1438,93 @@ func TestRegisterContainerInstanceWhenContainerInstanceARNIsNotSetAttributeError assert.False(t, isTransient(err)) } -func TestRegisterContainerInstanceInvalidParameterTerminalError(t *testing.T) { - ctrl, credentialsManager, state, imageManager, client, - dockerClient, _, _, execCmdMgr, _ := setup(t) - defer ctrl.Finish() - - mockCredentialsProvider := app_mocks.NewMockCredentialsProvider(ctrl) - mockMobyPlugins := mock_mobypkgwrapper.NewMockPlugins(ctrl) - mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl) - mockPauseLoader := mock_loader.NewMockLoader(ctrl) - - mockPauseLoader.EXPECT().IsLoaded(gomock.Any()).Return(false, nil).AnyTimes() - mockPauseLoader.EXPECT().LoadImage(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() - mockEC2Metadata.EXPECT().PrimaryENIMAC().Return("mac", nil) - mockEC2Metadata.EXPECT().VPCID(gomock.Eq("mac")).Return("vpc-id", nil) - mockEC2Metadata.EXPECT().SubnetID(gomock.Eq("mac")).Return("subnet-id", nil) - mockServiceConnectManager := mock_serviceconnect.NewMockManager(ctrl) - mockServiceConnectManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() - mockServiceConnectManager.EXPECT().GetLoadedAppnetVersion().AnyTimes() - mockServiceConnectManager.EXPECT().GetCapabilitiesForAppnetInterfaceVersion("").AnyTimes() - mockServiceConnectManager.EXPECT().SetECSClient(gomock.Any(), gomock.Any()).AnyTimes() - - mockDaemonManager := mock_daemonmanager.NewMockDaemonManager(ctrl) - mockDaemonManagers := map[string]dm.DaemonManager{md.EbsCsiDriver: mockDaemonManager} - mockDaemonManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() - mockDaemonManager.EXPECT().LoadImage(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() +func TestRegisterContainerInstanceExceptionsErrorHandling(t *testing.T) { + testCases := []struct { + name string + regError error + expectedExit int + }{ + { + name: "ThrottlingException", + regError: awserr.New("ThrottlingException", "", nil), + expectedExit: exitcodes.ExitError, + }, + { + name: "ClientException", + regError: awserr.New("ClientException", "", nil), + expectedExit: exitcodes.ExitTerminal, + }, + { + name: "InvalidParameterException", + regError: awserr.New("InvalidParameterException", "", nil), + expectedExit: exitcodes.ExitTerminal, + }, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctrl, credentialsManager, state, imageManager, client, + dockerClient, _, _, execCmdMgr, _ := setup(t) + defer ctrl.Finish() - dockerClient.EXPECT().SupportedVersions().Return(apiVersions).AnyTimes() + mockCredentialsProvider := app_mocks.NewMockCredentialsProvider(ctrl) + mockMobyPlugins := mock_mobypkgwrapper.NewMockPlugins(ctrl) + mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl) + mockPauseLoader := mock_loader.NewMockLoader(ctrl) + + mockPauseLoader.EXPECT().IsLoaded(gomock.Any()).Return(false, nil).AnyTimes() + mockPauseLoader.EXPECT().LoadImage(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + mockEC2Metadata.EXPECT().PrimaryENIMAC().Return("mac", nil) + mockEC2Metadata.EXPECT().VPCID(gomock.Eq("mac")).Return("vpc-id", nil) + mockEC2Metadata.EXPECT().SubnetID(gomock.Eq("mac")).Return("subnet-id", nil) + mockServiceConnectManager := mock_serviceconnect.NewMockManager(ctrl) + mockServiceConnectManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() + mockServiceConnectManager.EXPECT().GetLoadedAppnetVersion().AnyTimes() + mockServiceConnectManager.EXPECT().GetCapabilitiesForAppnetInterfaceVersion("").AnyTimes() + mockServiceConnectManager.EXPECT().SetECSClient(gomock.Any(), gomock.Any()).AnyTimes() + + mockDaemonManager := mock_daemonmanager.NewMockDaemonManager(ctrl) + mockDaemonManagers := map[string]dm.DaemonManager{md.EbsCsiDriver: mockDaemonManager} + mockDaemonManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() + mockDaemonManager.EXPECT().LoadImage(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + + dockerClient.EXPECT().SupportedVersions().Return(apiVersions).AnyTimes() + + gomock.InOrder( + client.EXPECT().GetHostResources().Return(testHostResource, nil), + mockCredentialsProvider.EXPECT().Retrieve(gomock.Any()).Return(awsv2.Credentials{}, nil), + mockMobyPlugins.EXPECT().Scan().AnyTimes().Return([]string{}, nil), + dockerClient.EXPECT().ListPluginsWithFilters(gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any()).AnyTimes().Return([]string{}, nil), + client.EXPECT().RegisterContainerInstance(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any()).Return("", "", tc.), + ) + mockEC2Metadata.EXPECT().OutpostARN().Return("", nil) - gomock.InOrder( - client.EXPECT().GetHostResources().Return(testHostResource, nil), - mockCredentialsProvider.EXPECT().Retrieve(gomock.Any()).Return(awsv2.Credentials{}, nil), - mockMobyPlugins.EXPECT().Scan().AnyTimes().Return([]string{}, nil), - dockerClient.EXPECT().ListPluginsWithFilters(gomock.Any(), gomock.Any(), gomock.Any(), - gomock.Any()).AnyTimes().Return([]string{}, nil), - client.EXPECT().RegisterContainerInstance(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), - gomock.Any()).Return("", "", awserr.New("InvalidParameterException", "", nil)), - ) - mockEC2Metadata.EXPECT().OutpostARN().Return("", nil) + cfg := getTestConfig() + ctx, cancel := context.WithCancel(context.TODO()) + // Cancel the context to cancel async routines + defer cancel() + agent := &ecsAgent{ + ctx: ctx, + ec2MetadataClient: mockEC2Metadata, + cfg: &cfg, + pauseLoader: mockPauseLoader, + credentialsCache: awsv2.NewCredentialsCache(mockCredentialsProvider), + dockerClient: dockerClient, + mobyPlugins: mockMobyPlugins, + terminationHandler: func(taskEngineState dockerstate.TaskEngineState, dataClient data.Client, taskEngine engine.TaskEngine, cancel context.CancelFunc) { + }, + serviceconnectManager: mockServiceConnectManager, + daemonManagers: mockDaemonManagers, + } - cfg := getTestConfig() - ctx, cancel := context.WithCancel(context.TODO()) - // Cancel the context to cancel async routines - defer cancel() - agent := &ecsAgent{ - ctx: ctx, - ec2MetadataClient: mockEC2Metadata, - cfg: &cfg, - pauseLoader: mockPauseLoader, - credentialsCache: awsv2.NewCredentialsCache(mockCredentialsProvider), - dockerClient: dockerClient, - mobyPlugins: mockMobyPlugins, - terminationHandler: func(taskEngineState dockerstate.TaskEngineState, dataClient data.Client, taskEngine engine.TaskEngine, cancel context.CancelFunc) { - }, - serviceconnectManager: mockServiceConnectManager, - daemonManagers: mockDaemonManagers, + exitCode := agent.doStart(eventstream.NewEventStream("events", ctx), + credentialsManager, state, imageManager, client, execCmdMgr) + assert.Equal(t, tc.expectedExit, exitCode) + }) } - - exitCode := agent.doStart(eventstream.NewEventStream("events", ctx), - credentialsManager, state, imageManager, client, execCmdMgr) - assert.Equal(t, exitcodes.ExitTerminal, exitCode) } + func TestMergeTags(t *testing.T) { ec2Key := "ec2Key" ec2Value := "ec2Value" diff --git a/agent/app/errors.go b/agent/app/errors.go index 06b02349b84..66545904bc8 100644 --- a/agent/app/errors.go +++ b/agent/app/errors.go @@ -13,15 +13,16 @@ package app -// transientError represents a transient error when executing the ECS Agent -type transientError struct { +type terminalError struct { error } -// isTransient returns true if the error is transient -func isTransient(err error) bool { - _, ok := err.(transientError) - return ok +// isTerminal returns true if the error is already wrapped as an unrecoverable condition +// which will allow agent to exit terminally. +func isTerminal(err error) bool { + // Check if the error is already wrapped as a terminalError + _, terminal := err.(terminalError) + return terminal } // clusterMismatchError represents a mismatch in cluster name between the