diff --git a/pkg/apis/provisioning/v1alpha5/taints.go b/pkg/apis/provisioning/v1alpha5/taints.go index 56d73b6a902c..05c84d4a0b8d 100644 --- a/pkg/apis/provisioning/v1alpha5/taints.go +++ b/pkg/apis/provisioning/v1alpha5/taints.go @@ -52,7 +52,7 @@ func (ts Taints) WithPod(pod *v1.Pod) Taints { return ts } -// Has returns true if taints has a taint for the given key +// Has returns true if taints has a taint for the given key and effect func (ts Taints) Has(taint v1.Taint) bool { for _, t := range ts { if t.Key == taint.Key && t.Effect == taint.Effect { @@ -62,6 +62,16 @@ func (ts Taints) Has(taint v1.Taint) bool { return false } +// HasKey returns true if taints has a taint for the given key +func (ts Taints) HasKey(taintKey string) bool { + for _, t := range ts { + if t.Key == taintKey { + return true + } + } + return false +} + // Tolerates returns true if the pod tolerates all taints func (ts Taints) Tolerates(pod *v1.Pod) (errs error) { for i := range ts { diff --git a/pkg/controllers/node/controller.go b/pkg/controllers/node/controller.go index 345773fec150..2879e87174c6 100644 --- a/pkg/controllers/node/controller.go +++ b/pkg/controllers/node/controller.go @@ -41,22 +41,21 @@ const controllerName = "node" // NewController constructs a controller instance func NewController(kubeClient client.Client) *Controller { return &Controller{ - kubeClient: kubeClient, - liveness: &Liveness{kubeClient: kubeClient}, - emptiness: &Emptiness{kubeClient: kubeClient}, - expiration: &Expiration{kubeClient: kubeClient}, + kubeClient: kubeClient, + initialization: &Initialization{kubeClient: kubeClient}, + emptiness: &Emptiness{kubeClient: kubeClient}, + expiration: &Expiration{kubeClient: kubeClient}, } } // Controller manages a set of properties on karpenter provisioned nodes, such as // taints, labels, finalizers. type Controller struct { - kubeClient client.Client - readiness *Readiness - liveness *Liveness - emptiness *Emptiness - expiration *Expiration - finalizer *Finalizer + kubeClient client.Client + initialization *Initialization + emptiness *Emptiness + expiration *Expiration + finalizer *Finalizer } // Reconcile executes a reallocation control loop for the resource @@ -93,8 +92,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco for _, reconciler := range []interface { Reconcile(context.Context, *v1alpha5.Provisioner, *v1.Node) (reconcile.Result, error) }{ - c.readiness, - c.liveness, + c.initialization, c.expiration, c.emptiness, c.finalizer, diff --git a/pkg/controllers/node/initialization.go b/pkg/controllers/node/initialization.go new file mode 100644 index 000000000000..a15682bf269e --- /dev/null +++ b/pkg/controllers/node/initialization.go @@ -0,0 +1,65 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package node + +import ( + "context" + "fmt" + "time" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/utils/injectabletime" + "github.com/aws/karpenter/pkg/utils/node" + v1 "k8s.io/api/core/v1" + "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +const InitializationTimeout = 15 * time.Minute + +// Initialization is a subreconciler that +// 1. Removes the NotReady taint when the node is ready. This taint is originally applied on node creation. +// 2. Terminates nodes that don't transition to ready within InitializationTimeout +type Initialization struct { + kubeClient client.Client +} + +// Reconcile reconciles the node +func (r *Initialization) Reconcile(ctx context.Context, _ *v1alpha5.Provisioner, n *v1.Node) (reconcile.Result, error) { + if !v1alpha5.Taints(n.Spec.Taints).HasKey(v1alpha5.NotReadyTaintKey) { + // At this point, the startup of the node is complete and no more evaluation is necessary. + return reconcile.Result{}, nil + } + + if !node.IsReady(n) { + if age := injectabletime.Now().Sub(n.GetCreationTimestamp().Time); age < InitializationTimeout { + return reconcile.Result{RequeueAfter: InitializationTimeout - age}, nil + } + logging.FromContext(ctx).Infof("Triggering termination for node that failed to become ready") + if err := r.kubeClient.Delete(ctx, n); err != nil { + return reconcile.Result{}, fmt.Errorf("deleting node, %w", err) + } + return reconcile.Result{}, nil + } + taints := []v1.Taint{} + for _, taint := range n.Spec.Taints { + if taint.Key != v1alpha5.NotReadyTaintKey { + taints = append(taints, taint) + } + } + n.Spec.Taints = taints + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/node/liveness.go b/pkg/controllers/node/liveness.go deleted file mode 100644 index 0b47e9f1972f..000000000000 --- a/pkg/controllers/node/liveness.go +++ /dev/null @@ -1,57 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package node - -import ( - "context" - "fmt" - "time" - - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/utils/injectabletime" - "github.com/aws/karpenter/pkg/utils/node" - v1 "k8s.io/api/core/v1" - "knative.dev/pkg/logging" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/reconcile" -) - -const LivenessTimeout = 15 * time.Minute - -// Liveness is a subreconciler that deletes nodes determined to be unrecoverable -type Liveness struct { - kubeClient client.Client -} - -// Reconcile reconciles the node -func (r *Liveness) Reconcile(ctx context.Context, _ *v1alpha5.Provisioner, n *v1.Node) (reconcile.Result, error) { - if timeSinceCreation := injectabletime.Now().Sub(n.GetCreationTimestamp().Time); timeSinceCreation < LivenessTimeout { - return reconcile.Result{RequeueAfter: LivenessTimeout - timeSinceCreation}, nil - } - condition := node.GetCondition(n.Status.Conditions, v1.NodeReady) - // If the reason is "", then the condition has never been set. We expect - // either the kubelet to set this reason, or the kcm's - // node-lifecycle-controller to set the status to NodeStatusNeverUpdated if - // the kubelet cannot connect. Once the value is NodeStatusNeverUpdated and - // the node is beyond the liveness timeout, we will delete the node. - if condition.Reason != "" && condition.Reason != "NodeStatusNeverUpdated" { - return reconcile.Result{}, nil - } - logging.FromContext(ctx).Infof("Triggering termination for node that failed to join") - if err := r.kubeClient.Delete(ctx, n); err != nil { - return reconcile.Result{}, fmt.Errorf("deleting node, %w", err) - } - return reconcile.Result{}, nil -} diff --git a/pkg/controllers/node/readiness.go b/pkg/controllers/node/readiness.go deleted file mode 100644 index 50908284d173..000000000000 --- a/pkg/controllers/node/readiness.go +++ /dev/null @@ -1,42 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package node - -import ( - "context" - - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/utils/node" - v1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/reconcile" -) - -// Readiness is a subreconciler that removes the NotReady taint when the node is ready -type Readiness struct{} - -// Reconcile reconciles the node -func (r *Readiness) Reconcile(_ context.Context, _ *v1alpha5.Provisioner, n *v1.Node) (reconcile.Result, error) { - if !node.IsReady(n) { - return reconcile.Result{}, nil - } - taints := []v1.Taint{} - for _, taint := range n.Spec.Taints { - if taint.Key != v1alpha5.NotReadyTaintKey { - taints = append(taints, taint) - } - } - n.Spec.Taints = taints - return reconcile.Result{}, nil -} diff --git a/pkg/controllers/node/suite_test.go b/pkg/controllers/node/suite_test.go index 02ed0e4788c4..e30aa88ca491 100644 --- a/pkg/controllers/node/suite_test.go +++ b/pkg/controllers/node/suite_test.go @@ -120,7 +120,7 @@ var _ = Describe("Controller", func() { }) }) - Context("Readiness", func() { + Context("Initialization", func() { It("should not remove the readiness taint if not ready", func() { n := test.Node(test.NodeOptions{ ReadyStatus: v1.ConditionUnknown, @@ -181,9 +181,7 @@ var _ = Describe("Controller", func() { n = ExpectNodeExists(ctx, env.Client, n.Name) Expect(n.Spec.Taints).To(Equal(n.Spec.Taints)) }) - }) - Context("Liveness", func() { - It("should delete nodes if NodeStatusNeverUpdated after 5 minutes", func() { + It("should delete nodes if node not ready even after Initialization timeout ", func() { n := test.Node(test.NodeOptions{ ObjectMeta: metav1.ObjectMeta{ Finalizers: []string{v1alpha5.TerminationFinalizer}, @@ -191,42 +189,21 @@ var _ = Describe("Controller", func() { }, ReadyStatus: v1.ConditionUnknown, ReadyReason: "NodeStatusNeverUpdated", - }) - ExpectCreated(ctx, env.Client, provisioner) - ExpectCreatedWithStatus(ctx, env.Client, n) - - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provisioner)) - - // Expect node not be deleted - n = ExpectNodeExists(ctx, env.Client, n.Name) - Expect(n.DeletionTimestamp.IsZero()).To(BeTrue()) - - // Simulate time passing and a n failing to join - injectabletime.Now = func() time.Time { return time.Now().Add(node.LivenessTimeout) } - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(n)) - - n = ExpectNodeExists(ctx, env.Client, n.Name) - Expect(n.DeletionTimestamp.IsZero()).To(BeFalse()) - }) - It("should delete nodes if we never hear anything after 5 minutes", func() { - n := test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Finalizers: []string{v1alpha5.TerminationFinalizer}, - Labels: map[string]string{v1alpha5.ProvisionerNameLabelKey: provisioner.Name}, + Taints: []v1.Taint{ + {Key: v1alpha5.NotReadyTaintKey, Effect: v1.TaintEffectNoSchedule}, }, - ReadyStatus: v1.ConditionUnknown, - ReadyReason: "", }) ExpectCreated(ctx, env.Client, provisioner) ExpectCreatedWithStatus(ctx, env.Client, n) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provisioner)) + // Expect node not be deleted n = ExpectNodeExists(ctx, env.Client, n.Name) Expect(n.DeletionTimestamp.IsZero()).To(BeTrue()) // Simulate time passing and a n failing to join - injectabletime.Now = func() time.Time { return time.Now().Add(node.LivenessTimeout) } + injectabletime.Now = func() time.Time { return time.Now().Add(node.InitializationTimeout) } ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(n)) n = ExpectNodeExists(ctx, env.Client, n.Name)