diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index 223fb692e1..0581349809 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.3 + controller-gen.kubebuilder.io/version: v0.17.1 name: nodeclaims.karpenter.sh spec: group: karpenter.sh diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 5fcb05a312..386d1abd9c 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.3 + controller-gen.kubebuilder.io/version: v0.17.1 name: nodepools.karpenter.sh spec: group: karpenter.sh diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index f93470c8b4..b8847ea2d8 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -239,7 +239,8 @@ func (o *Operator) Start(ctx context.Context, cp cloudprovider.CloudProvider) { wg.Add(1) go func() { defer wg.Done() - webhooks.ValidateConversionEnabled(ctx, o.GetClient()) + // Create a direct kubernetes API client so that we don't hit the read cache on subsequent reads during this check + webhooks.ValidateConversionEnabled(ctx, lo.Must(client.New(o.GetConfig(), client.Options{Scheme: o.GetScheme()}))) }() } wg.Wait() diff --git a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml index b8684d22e0..a80a21f279 100644 --- a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml +++ b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.3 + controller-gen.kubebuilder.io/version: v0.17.1 name: testnodeclasses.karpenter.test.sh spec: group: karpenter.test.sh diff --git a/pkg/webhooks/webhooks.go b/pkg/webhooks/webhooks.go index d7cd10992e..6aa4da1e50 100644 --- a/pkg/webhooks/webhooks.go +++ b/pkg/webhooks/webhooks.go @@ -17,7 +17,7 @@ package webhooks import ( "context" "crypto/tls" - "errors" + stderrors "errors" "fmt" "io" "net/http" @@ -27,6 +27,8 @@ import ( "github.com/samber/lo" "go.uber.org/zap" "golang.org/x/sync/errgroup" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/client-go/rest" "knative.dev/pkg/configmap" @@ -43,6 +45,7 @@ import ( "knative.dev/pkg/webhook/resourcesemantics/validation" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/log" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/apis/v1beta1" @@ -184,7 +187,7 @@ func Start(ctx context.Context, cfg *rest.Config, ctors ...knativeinjection.Name <-egCtx.Done() // Don't forward ErrServerClosed as that indicates we're already shutting down. - if err := eg.Wait(); err != nil && !errors.Is(err, http.ErrServerClosed) { + if err := eg.Wait(); err != nil && !stderrors.Is(err, http.ErrServerClosed) { knativelogging.FromContext(ctx).Errorw("Error while running server", zap.Error(err)) } } @@ -218,21 +221,30 @@ func HealthProbe(ctx context.Context) healthz.Checker { } } -func ValidateConversionEnabled(ctx context.Context, kubeclient client.Client) { - // allow context to exist longer than cache sync timeout which has a default of 120 seconds - listCtx, cancel := context.WithTimeout(ctx, 130*time.Second) - defer cancel() - var err error - v1np := &v1.NodePoolList{} +func ValidateConversionEnabled(ctx context.Context, directKubeClient client.Client) { + failureCount := 0 for { - err = kubeclient.List(listCtx, v1np, &client.ListOptions{Limit: 1}) - if err == nil { - return + // We only need to try to get a single NodePool from the API server + err := directKubeClient.List(ctx, &v1.NodePoolList{}, &client.ListOptions{Limit: 1}) + // We only want to bail out with a failure if we know the reason that we are not able to List on this API + // is due to an internal error on the conversion webhook + if errors.IsInternalError(err) && strings.Contains(err.Error(), "conversion webhook for karpenter.sh/v1beta1, Kind=NodePool failed") { + failureCount++ + // If we see more than 2 consecutive failures, then we should panic at that point + if failureCount > 2 { + panic(fmt.Sprintf("validating conversion webhook reachable, %s", err.Error())) + } + } else { + failureCount = 0 + } + // If there's an error that occurs that we don't know about, and it's not about the API not existing, then log the error + if err != nil && !meta.IsNoMatchError(err) { + log.FromContext(ctx).Error(err, "failed validating conversion webhook reachable") } select { - case <-listCtx.Done(): - panic("Conversion webhook enabled but unable to complete call: " + err.Error()) - case <-time.After(10 * time.Second): + case <-ctx.Done(): // process is completing, so just exit + return + case <-time.After(time.Minute): // poll the API every minute } } }