Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Modify conversion webhook check to check for v1 version existence (cherry-pick v0.33.x) #1920

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodeclaims.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.17.1
name: nodeclaims.karpenter.sh
spec:
group: karpenter.sh
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodepools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.17.1
name: nodepools.karpenter.sh
spec:
group: karpenter.sh
Expand Down
3 changes: 2 additions & 1 deletion pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,8 @@ func (o *Operator) Start(ctx context.Context, cp cloudprovider.CloudProvider) {
wg.Add(1)
go func() {
defer wg.Done()
webhooks.ValidateConversionEnabled(ctx, o.GetClient())
// Create a direct kubernetes API client so that we don't hit the read cache on subsequent reads during this check
webhooks.ValidateConversionEnabled(ctx, lo.Must(client.New(o.GetConfig(), client.Options{Scheme: o.GetScheme()})))
}()
}
wg.Wait()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.17.1
name: testnodeclasses.karpenter.test.sh
spec:
group: karpenter.test.sh
Expand Down
40 changes: 26 additions & 14 deletions pkg/webhooks/webhooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ package webhooks
import (
"context"
"crypto/tls"
"errors"
stderrors "errors"
"fmt"
"io"
"net/http"
Expand All @@ -27,6 +27,8 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/rest"
"knative.dev/pkg/configmap"
Expand All @@ -43,6 +45,7 @@ import (
"knative.dev/pkg/webhook/resourcesemantics/validation"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
Expand Down Expand Up @@ -184,7 +187,7 @@ func Start(ctx context.Context, cfg *rest.Config, ctors ...knativeinjection.Name
<-egCtx.Done()

// Don't forward ErrServerClosed as that indicates we're already shutting down.
if err := eg.Wait(); err != nil && !errors.Is(err, http.ErrServerClosed) {
if err := eg.Wait(); err != nil && !stderrors.Is(err, http.ErrServerClosed) {
knativelogging.FromContext(ctx).Errorw("Error while running server", zap.Error(err))
}
}
Expand Down Expand Up @@ -218,21 +221,30 @@ func HealthProbe(ctx context.Context) healthz.Checker {
}
}

func ValidateConversionEnabled(ctx context.Context, kubeclient client.Client) {
// allow context to exist longer than cache sync timeout which has a default of 120 seconds
listCtx, cancel := context.WithTimeout(ctx, 130*time.Second)
defer cancel()
var err error
v1np := &v1.NodePoolList{}
func ValidateConversionEnabled(ctx context.Context, directKubeClient client.Client) {
failureCount := 0
for {
err = kubeclient.List(listCtx, v1np, &client.ListOptions{Limit: 1})
if err == nil {
return
// We only need to try to get a single NodePool from the API server
err := directKubeClient.List(ctx, &v1.NodePoolList{}, &client.ListOptions{Limit: 1})
// We only want to bail out with a failure if we know the reason that we are not able to List on this API
// is due to an internal error on the conversion webhook
if errors.IsInternalError(err) && strings.Contains(err.Error(), "conversion webhook for karpenter.sh/v1beta1, Kind=NodePool failed") {
failureCount++
// If we see more than 2 consecutive failures, then we should panic at that point
if failureCount > 2 {
panic(fmt.Sprintf("validating conversion webhook reachable, %s", err.Error()))
}
} else {
failureCount = 0
}
// If there's an error that occurs that we don't know about, and it's not about the API not existing, then log the error
if err != nil && !meta.IsNoMatchError(err) {
log.FromContext(ctx).Error(err, "failed validating conversion webhook reachable")
}
select {
case <-listCtx.Done():
panic("Conversion webhook enabled but unable to complete call: " + err.Error())
case <-time.After(10 * time.Second):
case <-ctx.Done(): // process is completing, so just exit
return
case <-time.After(time.Minute): // poll the API every minute
}
}
}
Loading