Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prometheus add nodes gauge for SQS mode #1083

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion cmd/node-termination-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ func main() {
}
log.Debug().Msgf("AWS Credentials retrieved from provider: %s", creds.ProviderName)

ec2Client := ec2.New(sess)

if nthConfig.EnablePrometheus {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a nil check for metrics here, in case observability.InitMetrics returns an error?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree

Copy link
Author

@phuhung273 phuhung273 Jan 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to, but the compiler doesn't like it. Can you guys have a try?

go metrics.InitNodeMetrics(nthConfig, node, ec2Client)
}

completeLifecycleActionDelay := time.Duration(nthConfig.CompleteLifecycleActionDelaySeconds) * time.Second
sqsMonitor := sqsevent.SQSMonitor{
CheckIfManaged: nthConfig.CheckTagBeforeDraining,
Expand All @@ -224,7 +230,7 @@ func main() {
CancelChan: cancelChan,
SQS: sqsevent.GetSqsClient(sess),
ASG: autoscaling.New(sess),
EC2: ec2.New(sess),
EC2: ec2Client,
BeforeCompleteLifecycleAction: func() { <-time.After(completeLifecycleActionDelay) },
}
monitoringFns[sqsEvents] = sqsMonitor
Expand Down
98 changes: 98 additions & 0 deletions pkg/ec2helper/ec2helper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package ec2helper

import (
"fmt"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
)

// IEC2Helper is the interface for looking up EC2 instance IDs by tag key.
// EC2Helper in this package provides a method with the same signature.
type IEC2Helper interface {
// GetInstanceIdsMapByTagKey returns a map keyed by instance ID for the
// instances that carry the given tag key, or an error.
GetInstanceIdsMapByTagKey(tag string) (map[string]bool, error)
}

// EC2Helper wraps an EC2 API client and exposes tag-based instance lookups.
type EC2Helper struct {
// ec2ServiceClient is the EC2 API implementation that DescribeInstances
// calls are issued against.
ec2ServiceClient ec2iface.EC2API
}

// New builds an EC2Helper that issues its requests through the supplied
// EC2 API client.
func New(ec2 ec2iface.EC2API) EC2Helper {
	var helper EC2Helper
	helper.ec2ServiceClient = ec2
	return helper
}

func (h EC2Helper) GetInstanceIdsByTagKey(tag string) ([]string, error) {
ids := []string{}
var nextToken string

for {
result, err := h.ec2ServiceClient.DescribeInstances(&ec2.DescribeInstancesInput{
Filters: []*ec2.Filter{
phuhung273 marked this conversation as resolved.
Show resolved Hide resolved
{
Name: aws.String("tag-key"),
Values: []*string{aws.String(tag)},
},
},
NextToken: &nextToken,
})

if err != nil {
return nil, err
}

if result == nil || result.Reservations == nil {
return nil, fmt.Errorf("describe instances success but return empty response for tag key: %s", tag)
}

for _, reservation := range result.Reservations {
if reservation.Instances == nil {
continue
}
for _, instance := range reservation.Instances {
if instance == nil || instance.InstanceId == nil {
continue
}
ids = append(ids, *instance.InstanceId)
}
}

if result.NextToken == nil {
break
}
nextToken = *result.NextToken
}

return ids, nil
}

func (h EC2Helper) GetInstanceIdsMapByTagKey(tag string) (map[string]bool, error) {
phuhung273 marked this conversation as resolved.
Show resolved Hide resolved
idMap := map[string]bool{}
ids, err := h.GetInstanceIdsByTagKey(tag)
if err != nil {
return nil, err
}

if ids == nil {
phuhung273 marked this conversation as resolved.
Show resolved Hide resolved
return nil, fmt.Errorf("get instance ids success but return empty response for tag key: %s", tag)
}

for _, id := range ids {
idMap[id] = true
}

return idMap, nil
}
154 changes: 154 additions & 0 deletions pkg/ec2helper/ec2helper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package ec2helper_test
phuhung273 marked this conversation as resolved.
Show resolved Hide resolved

import (
"testing"

"github.com/aws/aws-node-termination-handler/pkg/ec2helper"
h "github.com/aws/aws-node-termination-handler/pkg/test"
"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/service/ec2"
)

// Instance IDs placed in the mocked DescribeInstances responses built by the
// tests in this file.
const (
instanceId1 = "i-1"
instanceId2 = "i-2"
)

func TestGetInstanceIdsByTagKey(t *testing.T) {
ec2Mock := h.MockedEC2{
DescribeInstancesResp: getDescribeInstancesResp(),
}
ec2Helper := ec2helper.New(ec2Mock)
instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
h.Ok(t, err)

h.Equals(t, 2, len(instanceIds))
h.Equals(t, instanceId1, instanceIds[0])
h.Equals(t, instanceId2, instanceIds[1])
}

func TestGetInstanceIdsByTagKeyAPIError(t *testing.T) {
ec2Mock := h.MockedEC2{
DescribeInstancesResp: getDescribeInstancesResp(),
DescribeInstancesErr: awserr.New("ThrottlingException", "Rate exceeded", nil),
}
ec2Helper := ec2helper.New(ec2Mock)
_, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
h.Nok(t, err)
}

func TestGetInstanceIdsByTagKeyNilResponse(t *testing.T) {
ec2Mock := h.MockedEC2{}
ec2Helper := ec2helper.New(ec2Mock)
_, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
h.Nok(t, err)
}

func TestGetInstanceIdsByTagKeyNilReservations(t *testing.T) {
ec2Mock := h.MockedEC2{
DescribeInstancesResp: ec2.DescribeInstancesOutput{
Reservations: nil,
},
}
ec2Helper := ec2helper.New(ec2Mock)
_, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
h.Nok(t, err)
}

func TestGetInstanceIdsByTagKeyEmptyReservation(t *testing.T) {
ec2Mock := h.MockedEC2{
DescribeInstancesResp: ec2.DescribeInstancesOutput{
Reservations: []*ec2.Reservation{},
},
}
ec2Helper := ec2helper.New(ec2Mock)
instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
h.Ok(t, err)
h.Equals(t, 0, len(instanceIds))
}

func TestGetInstanceIdsByTagKeyEmptyInstances(t *testing.T) {
ec2Mock := h.MockedEC2{
DescribeInstancesResp: ec2.DescribeInstancesOutput{
Reservations: []*ec2.Reservation{
{
Instances: []*ec2.Instance{},
},
},
},
}
ec2Helper := ec2helper.New(ec2Mock)
instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
h.Ok(t, err)
h.Equals(t, 0, len(instanceIds))
}

// TestGetInstanceIdsByTagKeyNilInstancesId verifies that an instance with a
// nil InstanceId is skipped while the remaining instance is still returned.
func TestGetInstanceIdsByTagKeyNilInstancesId(t *testing.T) {
	ec2Mock := h.MockedEC2{
		DescribeInstancesResp: ec2.DescribeInstancesOutput{
			Reservations: []*ec2.Reservation{
				{
					Instances: []*ec2.Instance{
						{
							InstanceId: nil,
						},
						{
							InstanceId: aws.String(instanceId1),
						},
					},
				},
			},
		},
	}
	ec2Helper := ec2helper.New(ec2Mock)
	instanceIds, err := ec2Helper.GetInstanceIdsByTagKey("myNTHManagedTag")
	h.Ok(t, err)
	h.Equals(t, 1, len(instanceIds))
	// Checking the length alone would not catch the wrong element being kept.
	h.Equals(t, instanceId1, instanceIds[0])
}

func TestGetInstanceIdsMapByTagKey(t *testing.T) {
ec2Mock := h.MockedEC2{
DescribeInstancesResp: getDescribeInstancesResp(),
}
ec2Helper := ec2helper.New(ec2Mock)
instanceIdsMap, err := ec2Helper.GetInstanceIdsMapByTagKey("myNTHManagedTag")
h.Ok(t, err)

_, exist := instanceIdsMap[instanceId1]
h.Equals(t, true, exist)
_, exist = instanceIdsMap[instanceId2]
h.Equals(t, true, exist)
_, exist = instanceIdsMap["non-existent instance id"]
h.Equals(t, false, exist)
}

// getDescribeInstancesResp builds a DescribeInstances response holding a
// single reservation with two instances: instanceId1 followed by instanceId2.
func getDescribeInstancesResp() ec2.DescribeInstancesOutput {
	instances := []*ec2.Instance{
		{InstanceId: aws.String(instanceId1)},
		{InstanceId: aws.String(instanceId2)},
	}
	reservation := &ec2.Reservation{Instances: instances}

	return ec2.DescribeInstancesOutput{
		Reservations: []*ec2.Reservation{reservation},
	}
}
39 changes: 39 additions & 0 deletions pkg/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"context"
"encoding/json"
"fmt"
"regexp"
"strconv"
"strings"
"time"
Expand Down Expand Up @@ -74,6 +75,7 @@ const (
var (
// maxRetryDeadline is a 5-second duration.
// NOTE(review): presumably the overall deadline for retried API calls; its use is outside this view — confirm.
maxRetryDeadline time.Duration = 5 * time.Second
// conflictRetryInterval is a 750-millisecond duration.
// NOTE(review): presumably the pause between retries after an update conflict; its use is outside this view — confirm.
conflictRetryInterval time.Duration = 750 * time.Millisecond
// instanceIDRegex matches any string that begins with "i-". It is used by
// FetchKubernetesNodeInstanceIds to validate the last providerID segment.
instanceIDRegex = regexp.MustCompile(`^i-.*`)
)

// Node represents a kubernetes node with functions to manipulate its state via the kubernetes api server
Expand Down Expand Up @@ -635,6 +637,43 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) {
return &matchingNodes.Items[0], nil
}

// fetchKubernetesNode will send an http request to the k8s api server and return list of AWS EC2 instance id
func (n Node) FetchKubernetesNodeInstanceIds() ([]string, error) {
ids := []string{}

if n.nthConfig.DryRun {
log.Info().Msgf("Would have retrieved nodes, but dry-run flag was set")
return ids, nil
}
matchingNodes, err := n.drainHelper.Client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
if err != nil {
log.Warn().Msgf("Unable to list Nodes")
return nil, err
}

if matchingNodes == nil || matchingNodes.Items == nil {
return nil, fmt.Errorf("list nodes success but return empty response")
}

for _, node := range matchingNodes.Items {
// sample providerID: aws:///us-west-2a/i-0abcd1234efgh5678
parts := strings.Split(node.Spec.ProviderID, "/")
if len(parts) != 5 {
log.Warn().Msgf("Invalid providerID format found for node %s: %s (expected format: aws:///region/instance-id)", node.Name, node.Spec.ProviderID)
continue
}

instanceId := parts[len(parts)-1]
if instanceIDRegex.MatchString(instanceId) {
ids = append(ids, parts[len(parts)-1])
phuhung273 marked this conversation as resolved.
Show resolved Hide resolved
} else {
log.Warn().Msgf("Invalid instance id format found for node %s: %s (expected format: ^i-.*)", node.Name, instanceId)
}
}

return ids, nil
}

func (n Node) fetchAllPods(nodeName string) (*corev1.PodList, error) {
if n.nthConfig.DryRun {
log.Info().Msgf("Would have retrieved running pod list on node %s, but dry-run flag was set", nodeName)
Expand Down
Loading
Loading