Skip to content

Commit

Permalink
fix: Modify conversion webhook check to check for v1 version existenc…
Browse files Browse the repository at this point in the history
…e (cherry-pick v0.34.x) (#1919)
  • Loading branch information
jonathan-innis authored Jan 15, 2025
1 parent 96e8080 commit cb0a421
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 18 deletions.
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodeclaims.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.17.1
name: nodeclaims.karpenter.sh
spec:
group: karpenter.sh
Expand Down
2 changes: 1 addition & 1 deletion pkg/apis/crds/karpenter.sh_nodepools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.17.1
name: nodepools.karpenter.sh
spec:
group: karpenter.sh
Expand Down
3 changes: 2 additions & 1 deletion pkg/operator/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,8 @@ func (o *Operator) Start(ctx context.Context, cp cloudprovider.CloudProvider) {
wg.Add(1)
go func() {
defer wg.Done()
webhooks.ValidateConversionEnabled(ctx, o.GetClient())
// Create a direct kubernetes API client so that we don't hit the read cache on subsequent reads during this check
webhooks.ValidateConversionEnabled(ctx, lo.Must(client.New(o.GetConfig(), client.Options{Scheme: o.GetScheme()})))
}()
}
wg.Wait()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.16.3
controller-gen.kubebuilder.io/version: v0.17.1
name: testnodeclasses.karpenter.test.sh
spec:
group: karpenter.test.sh
Expand Down
40 changes: 26 additions & 14 deletions pkg/webhooks/webhooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package webhooks
import (
"context"
"crypto/tls"
"errors"
stderrors "errors"
"fmt"
"io"
"net/http"
Expand All @@ -29,6 +29,8 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/rest"
"knative.dev/pkg/configmap"
Expand All @@ -45,6 +47,7 @@ import (
"knative.dev/pkg/webhook/resourcesemantics/validation"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
Expand Down Expand Up @@ -186,7 +189,7 @@ func Start(ctx context.Context, cfg *rest.Config, ctors ...knativeinjection.Name
<-egCtx.Done()

// Don't forward ErrServerClosed as that indicates we're already shutting down.
if err := eg.Wait(); err != nil && !errors.Is(err, http.ErrServerClosed) {
if err := eg.Wait(); err != nil && !stderrors.Is(err, http.ErrServerClosed) {
knativelogging.FromContext(ctx).Errorw("Error while running server", zap.Error(err))
}
}
Expand Down Expand Up @@ -220,21 +223,30 @@ func HealthProbe(ctx context.Context) healthz.Checker {
}
}

func ValidateConversionEnabled(ctx context.Context, kubeclient client.Client) {
// allow context to exist longer than cache sync timeout which has a default of 120 seconds
listCtx, cancel := context.WithTimeout(ctx, 130*time.Second)
defer cancel()
var err error
v1np := &v1.NodePoolList{}
func ValidateConversionEnabled(ctx context.Context, directKubeClient client.Client) {
failureCount := 0
for {
err = kubeclient.List(listCtx, v1np, &client.ListOptions{Limit: 1})
if err == nil {
return
// We only need to try to get a single NodePool from the API server
err := directKubeClient.List(ctx, &v1.NodePoolList{}, &client.ListOptions{Limit: 1})
// We only want to bail out with a failure if we know the reason that we are not able to List on this API
// is due to an internal error on the conversion webhook
if errors.IsInternalError(err) && strings.Contains(err.Error(), "conversion webhook for karpenter.sh/v1beta1, Kind=NodePool failed") {
failureCount++
// If we see more than 2 consecutive failures, then we should panic at that point
if failureCount > 2 {
panic(fmt.Sprintf("validating conversion webhook reachable, %s", err.Error()))
}
} else {
failureCount = 0
}
// If there's an error that occurs that we don't know about, and it's not about the API not existing, then log the error
if err != nil && !meta.IsNoMatchError(err) {
log.FromContext(ctx).Error(err, "failed validating conversion webhook reachable")
}
select {
case <-listCtx.Done():
panic("Conversion webhook enabled but unable to complete call: " + err.Error())
case <-time.After(10 * time.Second):
case <-ctx.Done(): // process is completing, so just exit
return
case <-time.After(time.Minute): // poll the API every minute
}
}
}

0 comments on commit cb0a421

Please sign in to comment.