Skip to content

Commit

Permalink
chore: Add more granular condition reason and messages (#7625)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Jan 24, 2025
1 parent 447b057 commit e0e7a58
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 31 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ metadata:
{{- with .Values.additionalAnnotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
controller-gen.kubebuilder.io/version: v0.16.5
controller-gen.kubebuilder.io/version: v0.17.1
name: ec2nodeclasses.karpenter.k8s.aws
spec:
group: karpenter.k8s.aws
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ require (
k8s.io/klog/v2 v2.130.1
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738
sigs.k8s.io/controller-runtime v0.20.0
sigs.k8s.io/karpenter v1.1.2-0.20250117235835-ff44f7325bf0
sigs.k8s.io/karpenter v1.1.2-0.20250122224037-af00eb443545
sigs.k8s.io/yaml v1.4.0
)

Expand Down Expand Up @@ -114,9 +114,9 @@ require (
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/cloud-provider v0.32.0 // indirect
k8s.io/cloud-provider v0.32.1 // indirect
k8s.io/component-base v0.32.1 // indirect
k8s.io/csi-translation-lib v0.32.0 // indirect
k8s.io/csi-translation-lib v0.32.1 // indirect
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -329,12 +329,12 @@ k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs=
k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE=
k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU=
k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg=
k8s.io/cloud-provider v0.32.0 h1:QXYJGmwME2q2rprymbmw2GroMChQYc/MWN6l/I4Kgp8=
k8s.io/cloud-provider v0.32.0/go.mod h1:cz3gVodkhgwi2ugj/JUPglIruLSdDaThxawuDyCHfr8=
k8s.io/cloud-provider v0.32.1 h1:74rRhnfca3o4CsjjnIp/C3ARVuSmyNsxgWPtH0yc9Z0=
k8s.io/cloud-provider v0.32.1/go.mod h1:GECSanFT+EeZ/ToX3xlasjETzMUI+VFu92zHUDUsGHw=
k8s.io/component-base v0.32.1 h1:/5IfJ0dHIKBWysGV0yKTFfacZ5yNV1sulPh3ilJjRZk=
k8s.io/component-base v0.32.1/go.mod h1:j1iMMHi/sqAHeG5z+O9BFNCF698a1u0186zkjMZQ28w=
k8s.io/csi-translation-lib v0.32.0 h1:RAn9RGgYXHJQtDSb6qQ7zvq6QObOejzmsXDARI+f4OQ=
k8s.io/csi-translation-lib v0.32.0/go.mod h1:TjCJzkTNstdOESAXNnEImrYOMIEzP14aqM7H+vkehqw=
k8s.io/csi-translation-lib v0.32.1 h1:qqlB+eKiIdUM+GGZfJN/4FMNeuIPIELLxfWfv/LWUYk=
k8s.io/csi-translation-lib v0.32.1/go.mod h1:dc7zXqpUW4FykfAe6TqU32tYewsGhrjI63ZwJWQng3k=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y=
Expand All @@ -345,8 +345,8 @@ sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrs
sigs.k8s.io/controller-runtime v0.20.0/go.mod h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8=
sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo=
sigs.k8s.io/karpenter v1.1.2-0.20250117235835-ff44f7325bf0 h1:AAOsDTOzQIScWKWqwVEWsYCOkvtfqU9W+neUDnqYqCg=
sigs.k8s.io/karpenter v1.1.2-0.20250117235835-ff44f7325bf0/go.mod h1:OIjZ34eS462NJtQ2AW8nVBQX4/YKu1B41QJ17BaWBf4=
sigs.k8s.io/karpenter v1.1.2-0.20250122224037-af00eb443545 h1:fdGNMR0MLlQbUppwvSrL4wIulRlpwuBSmEgleA1OHkk=
sigs.k8s.io/karpenter v1.1.2-0.20250122224037-af00eb443545/go.mod h1:646txj32arNTy+K4gySCqWSljYrEdemAdYoBMQmkS7o=
sigs.k8s.io/structured-merge-diff/v4 v4.4.2 h1:MdmvkGuXi/8io6ixD5wud3vOLwc1rj0aNqRlpuvjmwA=
sigs.k8s.io/structured-merge-diff/v4 v4.4.2/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4=
sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E=
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.5
controller-gen.kubebuilder.io/version: v0.17.1
name: ec2nodeclasses.karpenter.k8s.aws
spec:
group: karpenter.k8s.aws
Expand Down
14 changes: 3 additions & 11 deletions pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,12 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
}

nodeClassReady := nodeClass.StatusConditions().Get(status.ConditionReady)
if nodeClassReady.IsFalse() {
if !nodeClassReady.IsTrue() {
return nil, cloudprovider.NewNodeClassNotReadyError(stderrors.New(nodeClassReady.Message))
}
if nodeClassReady.IsUnknown() {
return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving NodeClass readiness, NodeClass is in Ready=Unknown, %s", nodeClassReady.Message), "NodeClass is in Ready=Unknown")
}
instanceTypes, err := c.resolveInstanceTypes(ctx, nodeClaim, nodeClass)
if err != nil {
return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving instance types, %w", err), "Error resolving instance types")
return nil, cloudprovider.NewCreateError(fmt.Errorf("resolving instance types, %w", err), "InstanceTypeResolutionFailed", "Error resolving instance types")
}
if len(instanceTypes) == 0 {
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("all requested instance types were unavailable during launch"))
Expand All @@ -109,12 +106,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
}
instance, err := c.instanceProvider.Create(ctx, nodeClass, nodeClaim, tags, instanceTypes)
if err != nil {
conditionMessage := "Error creating instance"
var createError *cloudprovider.CreateError
if stderrors.As(err, &createError) {
conditionMessage = createError.ConditionMessage
}
return nil, cloudprovider.NewCreateError(fmt.Errorf("creating instance, %w", err), conditionMessage)
return nil, fmt.Errorf("creating instance, %w", err)
}
instanceType, _ := lo.Find(instanceTypes, func(i *cloudprovider.InstanceType) bool {
return i.Name == string(instance.Type)
Expand Down
4 changes: 2 additions & 2 deletions pkg/cloudprovider/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,12 +199,12 @@ var _ = Describe("CloudProvider", func() {
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed())
Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed())
})
It("should not proceed with instance creation if NodeClass is unknown", func() {
It("should return NodeClassNotReady error on creation if NodeClass is unknown", func() {
nodeClass.StatusConditions().SetUnknown(opstatus.ConditionReady)
ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim)
_, err := cloudProvider.Create(ctx, nodeClaim)
Expect(err).To(HaveOccurred())
Expect(corecloudprovider.IsNodeClassNotReadyError(err)).To(BeFalse())
Expect(corecloudprovider.IsNodeClassNotReadyError(err)).To(BeTrue())
})
It("should return NodeClassNotReady error on creation if NodeClass is not ready", func() {
nodeClass.StatusConditions().SetFalse(opstatus.ConditionReady, "NodeClassNotReady", "NodeClass not ready")
Expand Down
52 changes: 52 additions & 0 deletions pkg/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ package errors

import (
"errors"
"strings"

ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/aws/smithy-go"
Expand Down Expand Up @@ -106,3 +107,54 @@ func IsLaunchTemplateNotFound(err error) bool {
}
return false
}

// ToReasonMessage converts an error message from AWS into a well-known condition reason
// and well-known condition message that can be used for Launch failure classification
// nolint:gocyclo
func ToReasonMessage(err error) (string, string) {
if strings.Contains(err.Error(), "AuthFailure.ServiceLinkedRoleCreationNotPermitted") {
return "SpotSLRCreationFailed", "User does not hae sufficient permission to create the Spot ServiceLinkedRole to launch spot instances"
}
if strings.Contains(err.Error(), "UnauthorizedOperation") || strings.Contains(err.Error(), "AccessDenied") || strings.Contains(err.Error(), "AuthFailure") {
if strings.Contains(err.Error(), "with an explicit deny in a permissions boundary") {
return "Unauthorized", "User is not authorized to perform this operation due to a permission boundary"
}
if strings.Contains(err.Error(), "with an explicit deny in a service control policy") {
return "Unauthorized", "User is not authorized to perform this operation due to a service control policy"
}
return "Unauthorized", "User is not authorized to perform this operation because no identity-based policy allows it"
}
if strings.Contains(err.Error(), "iamInstanceProfile.name is invalid") {
return "InstanceProfileNameInvalid", "Instance profile name used from EC2NodeClass status does not exist"
}
if strings.Contains(err.Error(), "InvalidLaunchTemplateId.NotFound") {
return "LaunchTemplateNotFound", "Launch template used for instance launch wasn't found"
}
if strings.Contains(err.Error(), "InvalidAMIID.Malformed") {
return "InvalidAMIID", "AMI used for instance launch is invalid"
}
if strings.Contains(err.Error(), "RequestLimitExceeded") {
return "RequestLimitExceeded", "Request limit exceeded"
}
if strings.Contains(err.Error(), "InternalError") {
return "InternalError", "An internal error has occurred"
}
// ICE Errors come last in this list because we should return a generic ICE error if all of the errors that are returned from
// fleet are ICE errors
if strings.Contains(err.Error(), "MaxFleetCountExceeded") {
return "FleetQuotaExceeded", "A fleet launch was requested but this would exceed your fleet request quota"
}
if strings.Contains(err.Error(), "PendingVerification") {
return "AccountPendingVerification", "An instance launch was requested but the request for launching resources in this region is still being verified"
}
if strings.Contains(err.Error(), "MaxSpotInstanceCountExceeded") {
return "SpotQuotaExceeded", "A spot instance launch was requested but this would exceed your spot instance quota"
}
if strings.Contains(err.Error(), "VcpuLimitExceeded") {
return "VCPULimitExceeded", "An instance was requested that would exceed your VCPU quota"
}
if strings.Contains(err.Error(), "InsufficientFreeAddressesInSubnet") {
return "InsufficientFreeAddressesInSubnet", "There are not enough free IP addresses to launch an instance in this subnet"
}
return "LaunchFailed", "Instance launch failed"
}
16 changes: 9 additions & 7 deletions pkg/providers/instance/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass
}
instanceTypes, err := cloudprovider.InstanceTypes(instanceTypes).Truncate(schedulingRequirements, maxInstanceTypes)
if err != nil {
return nil, cloudprovider.NewCreateError(fmt.Errorf("truncating instance types, %w", err), "Error truncating instance types based on the passed-in requirements")
return nil, cloudprovider.NewCreateError(fmt.Errorf("truncating instance types, %w", err), "InstanceTypeResolutionFailed", "Error truncating instance types based on the passed-in requirements")
}
fleetInstance, err := p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes, tags)
if awserrors.IsLaunchTemplateNotFound(err) {
Expand Down Expand Up @@ -211,13 +211,14 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N
capacityType := p.getCapacityType(nodeClaim, instanceTypes)
zonalSubnets, err := p.subnetProvider.ZonalSubnetsForLaunch(ctx, nodeClass, instanceTypes, capacityType)
if err != nil {
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting subnets, %w", err), "Error getting subnets")
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting subnets, %w", err), "SubnetResolutionFailed", "Error getting subnets")
}

// Get Launch Template Configs, which may differ due to GPU or Architecture requirements
launchTemplateConfigs, err := p.getLaunchTemplateConfigs(ctx, nodeClass, nodeClaim, instanceTypes, zonalSubnets, capacityType, tags)
if err != nil {
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting launch template configs, %w", err), "Error getting launch template configs")
reason, message := awserrors.ToReasonMessage(err)
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("getting launch template configs, %w", err), reason, fmt.Sprintf("Error getting launch template configs: %s", message))
}
if err := p.checkODFallback(nodeClaim, instanceTypes, launchTemplateConfigs); err != nil {
log.FromContext(ctx).Error(err, "failed while checking on-demand fallback")
Expand Down Expand Up @@ -246,18 +247,18 @@ func (p *DefaultProvider) launchInstance(ctx context.Context, nodeClass *v1.EC2N
createFleetOutput, err := p.ec2Batcher.CreateFleet(ctx, createFleetInput)
p.subnetProvider.UpdateInflightIPs(createFleetInput, createFleetOutput, instanceTypes, lo.Values(zonalSubnets), capacityType)
if err != nil {
conditionMessage := "Error creating fleet"
if awserrors.IsLaunchTemplateNotFound(err) {
for _, lt := range launchTemplateConfigs {
p.launchTemplateProvider.InvalidateCache(ctx, aws.ToString(lt.LaunchTemplateSpecification.LaunchTemplateName), aws.ToString(lt.LaunchTemplateSpecification.LaunchTemplateId))
}
return ec2types.CreateFleetInstance{}, fmt.Errorf("creating fleet %w", err)
}
reason, message := awserrors.ToReasonMessage(err)
var reqErr *awshttp.ResponseError
if errors.As(err, &reqErr) {
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet %w (%v)", err, reqErr.ServiceRequestID()), conditionMessage)
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet request, %w (%v)", err, reqErr.ServiceRequestID()), reason, fmt.Sprintf("Error creating fleet request: %s", message))
}
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet %w", err), conditionMessage)
return ec2types.CreateFleetInstance{}, cloudprovider.NewCreateError(fmt.Errorf("creating fleet request, %w", err), reason, fmt.Sprintf("Error creating fleet request: %s", message))
}
p.updateUnavailableOfferingsCache(ctx, createFleetOutput.Errors, capacityType)
if len(createFleetOutput.Instances) == 0 || len(createFleetOutput.Instances[0].InstanceIds) == 0 {
Expand Down Expand Up @@ -502,5 +503,6 @@ func combineFleetErrors(fleetErrs []ec2types.CreateFleetError) (errs error) {
if iceErrorCount == len(fleetErrs) {
return cloudprovider.NewInsufficientCapacityError(fmt.Errorf("with fleet error(s), %w", errs))
}
return cloudprovider.NewCreateError(errs, "Error creating fleet")
reason, message := awserrors.ToReasonMessage(errs)
return cloudprovider.NewCreateError(errs, reason, message)
}

0 comments on commit e0e7a58

Please sign in to comment.