diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go
index b3137ffa..1b3126a5 100644
--- a/cmd/node-termination-handler.go
+++ b/cmd/node-termination-handler.go
@@ -160,6 +160,7 @@ func watchForCancellationEvents(cancelChan <-chan interruptionevent.Interruption
             log.Log().Msgf("Uncordoning the node failed: %v", err)
         }
         node.RemoveNTHLabels()
+        node.RemoveNTHTaints()
     } else {
         log.Log().Msg("Another interruption event is active, not uncordoning the node")
     }
diff --git a/config/helm/aws-node-termination-handler/README.md b/config/helm/aws-node-termination-handler/README.md
index 688284d5..cb58023e 100644
--- a/config/helm/aws-node-termination-handler/README.md
+++ b/config/helm/aws-node-termination-handler/README.md
@@ -68,6 +68,7 @@ Parameter | Description | Default
 `enableSpotInterruptionDraining` | If true, drain nodes when the spot interruption termination notice is received | `true`
 `metadataTries` | The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3. | `3`
 `cordonOnly` | If true, nodes will be cordoned but not drained when an interruption event occurs. | `false`
+`taintNode` | If true, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance` and `aws-node-termination-handler/spot-itn` | `false`
 `jsonLogging` | If true, use JSON-formatted logs instead of human readable logs. | `false`
 `affinity` | node/pod affinities | None
 `podAnnotations` | annotations to add to each pod | `{}`
diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.yaml
index 25163392..4d3a450d 100644
--- a/config/helm/aws-node-termination-handler/templates/daemonset.yaml
+++ b/config/helm/aws-node-termination-handler/templates/daemonset.yaml
@@ -109,6 +109,8 @@ spec:
             value: {{ .Values.metadataTries | quote }}
           - name: CORDON_ONLY
             value: {{ .Values.cordonOnly | quote }}
+          - name: TAINT_NODE
+            value: {{ .Values.taintNode | quote }}
           - name: JSON_LOGGING
             value: {{ .Values.jsonLogging | quote }}
           - name: WEBHOOK_PROXY
diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml
index 796e370e..acf91766 100644
--- a/config/helm/aws-node-termination-handler/values.yaml
+++ b/config/helm/aws-node-termination-handler/values.yaml
@@ -33,6 +33,8 @@ enableSpotInterruptionDraining: ""
 ## enableScheduledEventDraining [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event
 enableScheduledEventDraining: ""
 
+taintNode: false
+
 ## dryRun tells node-termination-handler to only log calls to kubernetes control plane
 dryRun: false
 
diff --git a/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml b/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml
index 81fbb432..379e1793 100644
--- a/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml
+++ b/config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml
@@ -36,4 +36,8 @@ spec:
             value: {{ .Values.ec2MetadataTestProxy.enableSpotITN | quote }}
           - name: ENABLE_IMDS_V2
             value: {{ .Values.ec2MetadataTestProxy.enableIMDSV2 | quote }}
+      {{- if .Values.ec2MetadataTestProxy.tolerations }}
+      tolerations:
+{{ toYaml .Values.ec2MetadataTestProxy.tolerations | indent 8 }}
+      {{- end }}
 {{- end -}}
diff --git a/config/helm/ec2-metadata-test-proxy/values.yaml b/config/helm/ec2-metadata-test-proxy/values.yaml
index 89739ecf..6451d615 100644
--- a/config/helm/ec2-metadata-test-proxy/values.yaml
+++ b/config/helm/ec2-metadata-test-proxy/values.yaml
@@ -22,8 +22,8 @@ ec2MetadataTestProxy:
   image:
     repository: ec2-metadata-test-proxy
     tag: customtest
+  tolerations: []
   regularPodTest:
     create: true
     label: regular-pod-test
     port: 1339
-
\ No newline at end of file
diff --git a/pkg/config/config.go b/pkg/config/config.go
index ecad64fd..3c8c50d5 100644
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -52,6 +52,7 @@ const (
     metadataTriesConfigKey = "METADATA_TRIES"
     metadataTriesDefault   = 3
     cordonOnly             = "CORDON_ONLY"
+    taintNode              = "TAINT_NODE"
     jsonLoggingConfigKey   = "JSON_LOGGING"
     jsonLoggingDefault     = false
 )
@@ -75,6 +76,7 @@ type Config struct {
     EnableSpotInterruptionDraining bool
     MetadataTries                  int
     CordonOnly                     bool
+    TaintNode                      bool
     JsonLogging                    bool
 }
 
@@ -107,6 +109,7 @@ func ParseCliArgs() (config Config, err error) {
     flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received")
     flag.IntVar(&config.MetadataTries, "metadata-tries", getIntEnv(metadataTriesConfigKey, metadataTriesDefault), "The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3.")
     flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
+    flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
     flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")
     flag.Parse()
 
@@ -142,6 +145,7 @@ func ParseCliArgs() (config Config, err error) {
         "\tenable-spot-interruption-draining: %t,\n"+
         "\tmetadata-tries: %d,\n"+
         "\tcordon-only: %t,\n"+
+        "\ttaint-node: %t,\n"+
         "\tjson-logging: %t,\n"+
         "\twebhook-proxy: %s,\n",
         config.DryRun,
@@ -157,6 +161,7 @@ func ParseCliArgs() (config Config, err error) {
         config.EnableSpotInterruptionDraining,
         config.MetadataTries,
         config.CordonOnly,
+        config.TaintNode,
         config.JsonLogging,
         config.WebhookProxy,
     )
diff --git a/pkg/interruptionevent/scheduled-event.go b/pkg/interruptionevent/scheduled-event.go
index 69e97e9a..cb44f3e7 100644
--- a/pkg/interruptionevent/scheduled-event.go
+++ b/pkg/interruptionevent/scheduled-event.go
@@ -85,20 +85,26 @@ func checkForScheduledEvents(imds *ec2metadata.Service) ([]InterruptionEvent, er
     return events, nil
 }
 
-func uncordonAfterRebootPreDrain(interruptionEvent InterruptionEvent, node node.Node) error {
-    err := node.MarkWithEventID(interruptionEvent.EventID)
+func uncordonAfterRebootPreDrain(interruptionEvent InterruptionEvent, n node.Node) error {
+    err := n.MarkWithEventID(interruptionEvent.EventID)
     if err != nil {
         return fmt.Errorf("Unable to mark node with event ID: %w", err)
     }
+
+    err = n.TaintScheduledMaintenance(interruptionEvent.EventID)
+    if err != nil {
+        return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ScheduledMaintenanceTaint, interruptionEvent.EventID, err)
+    }
+
     // if the node is already marked as unschedulable, then don't do anything
-    unschedulable, err := node.IsUnschedulable()
+    unschedulable, err := n.IsUnschedulable()
     if err == nil && unschedulable {
         log.Log().Msg("Node is already marked unschedulable, not taking any action to add uncordon label.")
         return nil
     } else if err != nil {
         return fmt.Errorf("Encountered an error while checking if the node is unschedulable. Not setting an uncordon label: %w", err)
     }
-    err = node.MarkForUncordonAfterReboot()
+    err = n.MarkForUncordonAfterReboot()
     if err != nil {
         return fmt.Errorf("Unable to mark the node for uncordon: %w", err)
     }
diff --git a/pkg/interruptionevent/scheduled-event_internal_test.go b/pkg/interruptionevent/scheduled-event_internal_test.go
index 3e9bbe00..1eaeeb94 100644
--- a/pkg/interruptionevent/scheduled-event_internal_test.go
+++ b/pkg/interruptionevent/scheduled-event_internal_test.go
@@ -66,15 +66,26 @@ func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node {
 }
 
 func TestUncordonAfterRebootPreDrainSuccess(t *testing.T) {
-    drainEvent := InterruptionEvent{}
+    drainEvent := InterruptionEvent{
+        EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
+    }
     nthConfig := config.Config{
-        DryRun: true,
+        DryRun:   true,
+        NodeName: nodeName,
     }
-    tNode, _ := node.New(nthConfig)
-    err := uncordonAfterRebootPreDrain(drainEvent, *tNode)
+    client := fake.NewSimpleClientset()
+    _, err := client.CoreV1().Nodes().Create(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}})
+    h.Ok(t, err)
+
+    tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client))
+    h.Ok(t, err)
+
+    err = uncordonAfterRebootPreDrain(drainEvent, *tNode)
+
     h.Ok(t, err)
 }
+
 func TestUncordonAfterRebootPreDrainMarkWithEventIDFailure(t *testing.T) {
     resetFlagsForTest()
diff --git a/pkg/interruptionevent/spot-itn-event.go b/pkg/interruptionevent/spot-itn-event.go
index ca5466be..a1a8da01 100644
--- a/pkg/interruptionevent/spot-itn-event.go
+++ b/pkg/interruptionevent/spot-itn-event.go
@@ -19,6 +19,7 @@ import (
     "time"
 
     "github.com/aws/aws-node-termination-handler/pkg/ec2metadata"
+    "github.com/aws/aws-node-termination-handler/pkg/node"
     "github.com/rs/zerolog/log"
 )
 
@@ -59,10 +60,22 @@ func checkForSpotInterruptionNotice(imds *ec2metadata.Service) (*InterruptionEve
     hash := sha256.New()
     hash.Write([]byte(fmt.Sprintf("%v", instanceAction)))
 
+    var preDrainFunc preDrainTask = setInterruptionTaint
+
     return &InterruptionEvent{
-        EventID:     fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
-        Kind:        SpotITNKind,
-        StartTime:   interruptionTime,
-        Description: fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time),
+        EventID:      fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
+        Kind:         SpotITNKind,
+        StartTime:    interruptionTime,
+        Description:  fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time),
+        PreDrainTask: preDrainFunc,
     }, nil
 }
+
+func setInterruptionTaint(interruptionEvent InterruptionEvent, n node.Node) error {
+    err := n.TaintSpotItn(interruptionEvent.EventID)
+    if err != nil {
+        return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.SpotInterruptionTaint, interruptionEvent.EventID, err)
+    }
+
+    return nil
+}
diff --git a/pkg/interruptionevent/spot-itn-event_internal_test.go b/pkg/interruptionevent/spot-itn-event_internal_test.go
new file mode 100644
index 00000000..ab2b5bb9
--- /dev/null
+++ b/pkg/interruptionevent/spot-itn-event_internal_test.go
@@ -0,0 +1,95 @@
+// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"). You may
+// not use this file except in compliance with the License. A copy of the
+// License is located at
+//
+//     http://aws.amazon.com/apache2.0/
+//
+// or in the "license" file accompanying this file. This file is distributed
+// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+// express or implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package interruptionevent
+
+import (
+    "os"
+    "testing"
+    "time"
+
+    "github.com/aws/aws-node-termination-handler/pkg/config"
+    "github.com/aws/aws-node-termination-handler/pkg/node"
+    h "github.com/aws/aws-node-termination-handler/pkg/test"
+    v1 "k8s.io/api/core/v1"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/client-go/kubernetes/fake"
+    "k8s.io/kubectl/pkg/drain"
+)
+
+var spotNodeName = "NAME"
+
+func getSpotDrainHelper(client *fake.Clientset) *drain.Helper {
+    return &drain.Helper{
+        Client:              client,
+        Force:               true,
+        GracePeriodSeconds:  -1,
+        IgnoreAllDaemonSets: true,
+        DeleteLocalData:     true,
+        Timeout:             time.Duration(120) * time.Second,
+        Out:                 os.Stdout,
+        ErrOut:              os.Stderr,
+    }
+}
+
+func TestSetInterruptionTaint(t *testing.T) {
+    drainEvent := InterruptionEvent{
+        EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
+    }
+    nthConfig := config.Config{
+        DryRun:   true,
+        NodeName: spotNodeName,
+    }
+
+    client := fake.NewSimpleClientset()
+    _, err := client.CoreV1().Nodes().Create(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}})
+    h.Ok(t, err)
+
+    tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client))
+    h.Ok(t, err)
+
+    err = setInterruptionTaint(drainEvent, *tNode)
+
+    h.Ok(t, err)
+}
+
+func TestInterruptionTaintAlreadyPresent(t *testing.T) {
+    drainEvent := InterruptionEvent{
+        EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
+    }
+    nthConfig := config.Config{
+        DryRun:   false,
+        NodeName: spotNodeName,
+    }
+
+    client := fake.NewSimpleClientset()
+    newNode := &v1.Node{
+        ObjectMeta: metav1.ObjectMeta{Name: spotNodeName},
+        Spec: v1.NodeSpec{Taints: []v1.Taint{{
+            Key:    node.SpotInterruptionTaint,
+            Value:  drainEvent.EventID[:63],
+            Effect: v1.TaintEffectNoSchedule,
+        },
+        }},
+    }
+
+    _, err := client.CoreV1().Nodes().Create(newNode)
+    h.Ok(t, err)
+
+    tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client))
+    h.Ok(t, err)
+
+    err = setInterruptionTaint(drainEvent, *tNode)
+
+    h.Ok(t, err)
+}
diff --git a/pkg/node/node.go b/pkg/node/node.go
index a4138d4e..18fcea21 100644
--- a/pkg/node/node.go
+++ b/pkg/node/node.go
@@ -25,6 +25,7 @@ import (
     "github.com/aws/aws-node-termination-handler/pkg/config"
     "github.com/rs/zerolog/log"
     corev1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/client-go/kubernetes"
@@ -43,6 +44,20 @@ const (
     EventIDLabelKey = "aws-node-termination-handler/event-id"
 )
 
+const (
+    // SpotInterruptionTaint is the taint used to make a node that received a spot interruption notice unschedulable
+    SpotInterruptionTaint = "aws-node-termination-handler/spot-itn"
+    // ScheduledMaintenanceTaint is the taint used to make a node with a scheduled maintenance event unschedulable
+    ScheduledMaintenanceTaint = "aws-node-termination-handler/scheduled-maintenance"
+
+    maxTaintValueLength = 63
+)
+
+var (
+    maxRetryDeadline      time.Duration = 5 * time.Second
+    conflictRetryInterval time.Duration = 750 * time.Millisecond
+)
+
 var uptimeFile = "/proc/uptime"
 
 // Node represents a kubernetes node with functions to manipulate its state via the kubernetes api server
@@ -255,6 +270,65 @@ func (n Node) removeLabel(key string) error {
     return nil
 }
 
+// TaintSpotItn adds the spot termination notice taint onto a node
+func (n Node) TaintSpotItn(eventID string) error {
+    if !n.nthConfig.TaintNode {
+        return nil
+    }
+
+    k8sNode, err := n.fetchKubernetesNode()
+    if err != nil {
+        return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
+    }
+
+    if len(eventID) > maxTaintValueLength {
+        eventID = eventID[:maxTaintValueLength]
+    }
+
+    return addTaint(k8sNode, n, SpotInterruptionTaint, eventID, corev1.TaintEffectNoSchedule)
+}
+
+// TaintScheduledMaintenance adds the scheduled maintenance taint onto a node
+func (n Node) TaintScheduledMaintenance(eventID string) error {
+    if !n.nthConfig.TaintNode {
+        return nil
+    }
+
+    k8sNode, err := n.fetchKubernetesNode()
+    if err != nil {
+        return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
+    }
+
+    if len(eventID) > maxTaintValueLength {
+        eventID = eventID[:maxTaintValueLength]
+    }
+
+    return addTaint(k8sNode, n, ScheduledMaintenanceTaint, eventID, corev1.TaintEffectNoSchedule)
+}
+
+// RemoveNTHTaints removes NTH-specific taints from a node
+func (n Node) RemoveNTHTaints() error {
+    if !n.nthConfig.TaintNode {
+        return nil
+    }
+
+    k8sNode, err := n.fetchKubernetesNode()
+    if err != nil {
+        return fmt.Errorf("Unable to fetch kubernetes node from API: %w", err)
+    }
+
+    taints := []string{SpotInterruptionTaint, ScheduledMaintenanceTaint}
+
+    for _, taint := range taints {
+        _, err = removeTaint(k8sNode, n.drainHelper.Client, taint)
+        if err != nil {
+            return fmt.Errorf("Unable to clean taint %s from node %s", taint, n.nthConfig.NodeName)
+        }
+    }
+
+    return nil
+}
+
 // IsLabeledWithAction will return true if the current node is labeled with NTH action labels
 func (n Node) IsLabeledWithAction() (bool, error) {
     k8sNode, err := n.fetchKubernetesNode()
@@ -300,6 +374,12 @@ func (n Node) UncordonIfRebooted() error {
         if err != nil {
             return err
         }
+
+        err = n.RemoveNTHTaints()
+        if err != nil {
+            return err
+        }
+
         log.Log().Msgf("Successfully completed action %s.", UncordonAfterRebootLabelVal)
     default:
         log.Log().Msg("There are no label actions to handle.")
@@ -309,7 +389,10 @@ func (n Node) UncordonIfRebooted() error {
 
 // fetchKubernetesNode will send an http request to the k8s api server and return the corev1 model node
 func (n Node) fetchKubernetesNode() (*corev1.Node, error) {
-    node := &corev1.Node{}
+    node := &corev1.Node{
+        ObjectMeta: metav1.ObjectMeta{Name: n.nthConfig.NodeName},
+        Spec:       corev1.NodeSpec{},
+    }
     if n.nthConfig.DryRun {
         return node, nil
     }
@@ -363,3 +446,112 @@ func jsonPatchEscape(value string) string {
     value = strings.Replace(value, "~", "~0", -1)
     return strings.Replace(value, "/", "~1", -1)
 }
+
+func addTaint(node *corev1.Node, nth Node, taintKey string, taintValue string, effect corev1.TaintEffect) error {
+    if nth.nthConfig.DryRun {
+        log.Log().Msgf("Would have added taint (%s=%s:%s) to node %s, but dry-run flag was set", taintKey, taintValue, effect, nth.nthConfig.NodeName)
+        return nil
+    }
+
+    retryDeadline := time.Now().Add(maxRetryDeadline)
+    freshNode := node.DeepCopy()
+    client := nth.drainHelper.Client
+    var err error
+    refresh := false
+    for {
+        if refresh {
+            // Get the newest version of the node.
+            freshNode, err = client.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
+            if err != nil || freshNode == nil {
+                log.Log().Msgf("Error while adding %v taint on node %v: %v", taintKey, node.Name, err)
+                return fmt.Errorf("failed to get node %v: %v", node.Name, err)
+            }
+        }
+
+        if !addTaintToSpec(freshNode, taintKey, taintValue, effect) {
+            if !refresh {
+                // Make sure we have the latest version before skipping update.
+                refresh = true
+                continue
+            }
+            return nil
+        }
+        _, err = client.CoreV1().Nodes().Update(freshNode)
+        if err != nil && errors.IsConflict(err) && time.Now().Before(retryDeadline) {
+            refresh = true
+            time.Sleep(conflictRetryInterval)
+            continue
+        }
+
+        if err != nil {
+            log.Log().Msgf("Error while adding %v taint on node %v: %v", taintKey, node.Name, err)
+            return err
+        }
+        log.Log().Msgf("Successfully added %v on node %v", taintKey, node.Name)
+        return nil
+    }
+}
+
+func addTaintToSpec(node *corev1.Node, taintKey string, taintValue string, effect corev1.TaintEffect) bool {
+    for _, taint := range node.Spec.Taints {
+        if taint.Key == taintKey {
+            log.Log().Msgf("%v already present on node %v, taint: %v", taintKey, node.Name, taint)
+            return false
+        }
+    }
+    node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{
+        Key:    taintKey,
+        Value:  taintValue,
+        Effect: effect,
+    })
+    return true
+}
+
+func removeTaint(node *corev1.Node, client kubernetes.Interface, taintKey string) (bool, error) {
+    retryDeadline := time.Now().Add(maxRetryDeadline)
+    freshNode := node.DeepCopy()
+    var err error
+    refresh := false
+    for {
+        if refresh {
+            // Get the newest version of the node.
+            freshNode, err = client.CoreV1().Nodes().Get(node.Name, metav1.GetOptions{})
+            if err != nil || freshNode == nil {
+                log.Log().Msgf("Error while releasing %v taint on node %v: %v", taintKey, node.Name, err)
+                return false, fmt.Errorf("failed to get node %v: %v", node.Name, err)
+            }
+        }
+        newTaints := make([]corev1.Taint, 0)
+        for _, taint := range freshNode.Spec.Taints {
+            if taint.Key == taintKey {
+                log.Log().Msgf("Releasing taint %+v on node %v", taint, node.Name)
+            } else {
+                newTaints = append(newTaints, taint)
+            }
+        }
+        if len(newTaints) == len(freshNode.Spec.Taints) {
+            if !refresh {
+                // Make sure we have the latest version before skipping update.
+                refresh = true
+                continue
+            }
+            return false, nil
+        }
+
+        freshNode.Spec.Taints = newTaints
+        _, err = client.CoreV1().Nodes().Update(freshNode)
+
+        if err != nil && errors.IsConflict(err) && time.Now().Before(retryDeadline) {
+            refresh = true
+            time.Sleep(conflictRetryInterval)
+            continue
+        }
+
+        if err != nil {
+            log.Log().Msgf("Error while releasing %v taint on node %v: %v", taintKey, node.Name, err)
+            return false, err
+        }
+        log.Log().Msgf("Successfully released %v on node %v", taintKey, node.Name)
+        return true, nil
+    }
+}
diff --git a/test/e2e/maintenance-event-cancellation-test b/test/e2e/maintenance-event-cancellation-test
index f2a3885f..89151b59 100755
--- a/test/e2e/maintenance-event-cancellation-test
+++ b/test/e2e/maintenance-event-cancellation-test
@@ -22,7 +22,8 @@ helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node
     --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \
     --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \
     --set enableSpotInterruptionDraining="true" \
-    --set enableScheduledEventDraining="true"
+    --set enableScheduledEventDraining="true" \
+    --set taintNode="true"
 
 helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \
     --wait \
@@ -32,13 +33,16 @@ helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-meta
     --set ec2MetadataTestProxy.image.tag="$EC2_METADATA_DOCKER_TAG" \
     --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \
     --set ec2MetadataTestProxy.enableSpotITN="false" \
-    --set ec2MetadataTestProxy.port="$IMDS_PORT"
+    --set ec2MetadataTestProxy.port="$IMDS_PORT" \
+    --set 'ec2MetadataTestProxy.tolerations[0].effect=NoSchedule' \
+    --set 'ec2MetadataTestProxy.tolerations[0].operator=Exists'
 
 TAINT_CHECK_CYCLES=15
 TAINT_CHECK_SLEEP=15
 
 DEPLOYED=0
 CORDONED=0
+TAINTED=0
 
 for i in `seq 1 10`; do
     if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
@@ -57,6 +61,12 @@ fi
 for i in `seq 1 $TAINT_CHECK_CYCLES`; do
     if kubectl get nodes $CLUSTER_NAME-worker --no-headers | grep SchedulingDisabled; then
         echo "✅ Verified the worker node was cordoned!"
+
+        if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then
+            echo "✅ Verified the worker node was tainted!"
+            TAINTED=1
+        fi
+
         if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
             echo "✅ Verified the regular-pod-test pod was evicted!"
             CORDONED=1
@@ -71,6 +81,11 @@ if [[ $CORDONED -eq 0 ]]; then
     exit 3
 fi
 
+if [[ $TAINTED -eq 0 ]]; then
+    echo "❌ Failed tainting node for scheduled maintenance event"
+    exit 3
+fi
+
 helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \
     --wait \
     --force \
@@ -80,7 +95,9 @@ helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-meta
     --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \
     --set ec2MetadataTestProxy.enableSpotITN="false" \
     --set ec2MetadataTestProxy.scheduledEventStatus="canceled" \
-    --set ec2MetadataTestProxy.port="$IMDS_PORT"
+    --set ec2MetadataTestProxy.port="$IMDS_PORT" \
+    --set 'ec2MetadataTestProxy.tolerations[0].effect=NoSchedule' \
+    --set 'ec2MetadataTestProxy.tolerations[0].operator=Exists'
 
 for i in `seq 1 $TAINT_CHECK_CYCLES`; do
     if kubectl get nodes $CLUSTER_NAME-worker --no-headers | grep -v SchedulingDisabled; then
diff --git a/test/e2e/maintenance-event-reboot-test b/test/e2e/maintenance-event-reboot-test
index d94e4870..0133bdab 100755
--- a/test/e2e/maintenance-event-reboot-test
+++ b/test/e2e/maintenance-event-reboot-test
@@ -18,7 +18,8 @@ helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node
     --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \
    --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \
     --set enableSpotInterruptionDraining="true" \
-    --set enableScheduledEventDraining="true"
+    --set enableScheduledEventDraining="true" \
+    --set taintNode="true"
 
 helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \
     --wait \
@@ -35,6 +36,7 @@ TAINT_CHECK_SLEEP=15
 
 DEPLOYED=0
 CORDONED=0
+TAINTED=0
 
 for i in `seq 1 10`; do
     if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
@@ -52,7 +54,13 @@ fi
 
 for i in `seq 1 $TAINT_CHECK_CYCLES`; do
     if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then
-        echo "✅ Verified the worker node was cordoned!"
+        echo "✅ Verified the worker node was cordoned for maintenance event reboot!"
+
+        if kubectl get nodes $CLUSTER_NAME-worker -o json | grep "aws-node-termination-handler/scheduled-maintenance"; then
+            echo "✅ Verified the worker node was tainted!"
+            TAINTED=1
+        fi
+
         if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
             echo "✅ Verified the regular-pod-test pod was evicted!"
             CORDONED=1
@@ -67,6 +75,11 @@ if [[ $CORDONED -eq 0 ]]; then
     exit 3
 fi
 
+if [[ $TAINTED -eq 0 ]]; then
+    echo "❌ Failed tainting node for scheduled maintenance event"
+    exit 3
+fi
+
 ## Copy uptime file to Kind k8s nodes
 for node in $(kubectl get nodes -o json | jq -r '.items[].metadata.name'); do
     docker exec $node sh -c "rm -rf /uptime"
@@ -88,12 +101,22 @@ helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node
     --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \
     --set procUptimeFile="/uptime" \
     --set enableSpotInterruptionDraining="true" \
-    --set enableScheduledEventDraining="true"
+    --set enableScheduledEventDraining="true" \
+    --set taintNode="true"
 
 for i in `seq 1 $TAINT_CHECK_CYCLES`; do
     NODE_LINE=$(kubectl get nodes $CLUSTER_NAME-worker | grep -v 'STATUS')
     if [[ -z $(echo $NODE_LINE | grep SchedulingDisabled) ]] && [[ ! -z $(echo $NODE_LINE | grep Ready) ]]; then
         echo "✅ Verified the worker node was uncordoned!"
+
+        if ! kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then
+            echo "✅ Verified the worker node was untainted!"
+            TAINTED=0
+        else
+            echo "❌ Failed clearing the worker node taint after a reboot!"
+            exit 3
+        fi
+
         if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then
             echo "✅ Verified the regular-pod-test pod was rescheduled"
             echo "✅ Scheduled Maintenance Event System Reboot Test Passed $CLUSTER_NAME! ✅"
diff --git a/test/e2e/maintenance-event-test b/test/e2e/maintenance-event-test
index 88376ed0..8f405c47 100755
--- a/test/e2e/maintenance-event-test
+++ b/test/e2e/maintenance-event-test
@@ -22,7 +22,8 @@ helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node
     --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \
     --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \
     --set enableSpotInterruptionDraining="true" \
-    --set enableScheduledEventDraining="true"
+    --set enableScheduledEventDraining="true" \
+    --set taintNode="true"
 
 helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \
 
@@ -53,9 +54,19 @@ if [[ $DEPLOYED -eq 0 ]]; then
     exit 2
 fi
 
+TAINTED=0
+
 for i in `seq 1 $TAINT_CHECK_CYCLES`; do
     if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then
         echo "✅ Verified the worker node was cordoned!"
+
+        if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/scheduled-maintenance"; then
+            echo "✅ Verified the worker node was tainted!"
+        else
+            echo "❌ Failed tainting node for maintenance event"
+            exit 3
+        fi
+
         if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
             echo "✅ Verified the regular-pod-test pod was evicted!"
             echo "✅ Scheduled Maintenance Event Handling Test Passed $CLUSTER_NAME! ✅"
diff --git a/test/e2e/spot-interruption-test b/test/e2e/spot-interruption-test
index b85a6ef0..db530161 100755
--- a/test/e2e/spot-interruption-test
+++ b/test/e2e/spot-interruption-test
@@ -22,7 +22,8 @@ helm upgrade --install $CLUSTER_NAME-anth $SCRIPTPATH/../../config/helm/aws-node
     --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" \
     --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" \
     --set enableScheduledEventDraining="false" \
-    --set enableSpotInterruptionDraining="true"
+    --set enableSpotInterruptionDraining="true" \
+    --set taintNode="true"
 
 helm upgrade --install $CLUSTER_NAME-emtp $SCRIPTPATH/../../config/helm/ec2-metadata-test-proxy/ \
     --wait \
@@ -55,6 +56,14 @@ fi
 for i in `seq 1 $TAINT_CHECK_CYCLES`; do
     if kubectl get nodes $CLUSTER_NAME-worker | grep SchedulingDisabled; then
         echo "✅ Verified the worker node was cordoned!"
+
+        if kubectl get nodes $CLUSTER_NAME-worker -o json | grep -q "aws-node-termination-handler/spot-itn"; then
+            echo "✅ Verified the worker node was tainted!"
+        else
+            echo "❌ Failed tainting node for spot termination event"
+            exit 3
+        fi
+
         if [[ $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
             echo "✅ Verified the regular-pod-test pod was evicted!"
             echo "✅ Spot Interruption Test Passed $CLUSTER_NAME! ✅"
✅" diff --git a/test/e2e/webhook-http-proxy-test b/test/e2e/webhook-http-proxy-test index 1972b9f6..c05d446d 100755 --- a/test/e2e/webhook-http-proxy-test +++ b/test/e2e/webhook-http-proxy-test @@ -16,7 +16,6 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" SQUID_DOCKERHUB_IMG="sameersbn/squid:3.5.27-2@sha256:e98299069f0c6e3d9b9188903518e2f44ac36b1fa5007e879af518e1c0a234af" SQUID_DOCKER_IMG="squid:customtest" - ### LOCAL ONLY TESTS FOR 200 RESPONSE FROM LOCAL CLUSTER, MASTER WILL TEST WITH TRAVIS SECRET URL if [[ -z $(env | grep "WEBHOOK_URL=") ]]; then WEBHOOK_URL="http://127.0.0.1:$IMDS_PORT" diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index a855b8f4..ca00d718 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -52,7 +52,9 @@ function reset_cluster { fi for node in $(kubectl get nodes | tail -n+2 | cut -d' ' -f1); do kubectl uncordon $node - done + kubectl taint node $node aws-node-termination-handler/scheduled-maintenance- || true + kubectl taint node $node aws-node-termination-handler/spot-itn- || true + done remove_labels || : sleep 2 }