diff --git a/README.md b/README.md index 37c1191f..cb3720e1 100644 --- a/README.md +++ b/README.md @@ -83,8 +83,8 @@ When using the EC2 Console or EC2 API to terminate the instance, a state-change | Spot Instance Termination Notifications (ITN) | ✅ | ✅ | | Scheduled Events | ✅ | ✅ | | Instance Rebalance Recommendation | ✅ | ✅ | +| ASG Termination Lifecycle Hooks | ✅ | ✅ | | AZ Rebalance Recommendation | ❌ | ✅ | -| ASG Termination Lifecycle Hooks | ❌ | ✅ | | Instance State Change Events | ❌ | ✅ | ### Kubernetes Compatibility diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 6145e0e5..2d703f86 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -16,6 +16,7 @@ package main import ( "context" "fmt" + "github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle" "os" "os/signal" "strings" @@ -53,6 +54,7 @@ import ( const ( scheduledMaintenance = "Scheduled Maintenance" spotITN = "Spot ITN" + asgLifecycle = "ASG Lifecycle" rebalanceRecommendation = "Rebalance Recommendation" sqsEvents = "SQS Event" timeFormat = "2006/01/02 15:04:05" @@ -188,6 +190,10 @@ func main() { imdsSpotMonitor := spotitn.NewSpotInterruptionMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName) monitoringFns[spotITN] = imdsSpotMonitor } + if nthConfig.EnableASGLifecycleDraining { + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName) + monitoringFns[asgLifecycle] = asgLifecycleMonitor + } if nthConfig.EnableScheduledEventDraining { imdsScheduledEventMonitor := scheduledevent.NewScheduledEventMonitor(imds, interruptionChan, cancelChan, nthConfig.NodeName) monitoringFns[scheduledMaintenance] = imdsScheduledEventMonitor diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml index 95e4b50f..be6385de 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml @@ -143,6 +143,8 @@ spec: {{- end }} - name: ENABLE_SPOT_INTERRUPTION_DRAINING value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_ASG_LIFECYCLE_DRAINING + value: {{ .Values.enableASGLifecycleDraining | quote }} - name: ENABLE_SCHEDULED_EVENT_DRAINING value: {{ .Values.enableScheduledEventDraining | quote }} - name: ENABLE_REBALANCE_MONITORING diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml index 8a9db7bf..95af69d1 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml @@ -143,6 +143,8 @@ spec: {{- end }} - name: ENABLE_SPOT_INTERRUPTION_DRAINING value: {{ .Values.enableSpotInterruptionDraining | quote }} + - name: ENABLE_ASG_LIFECYCLE_DRAINING + value: {{ .Values.enableASGLifecycleDraining | quote }} - name: ENABLE_SCHEDULED_EVENT_DRAINING value: {{ .Values.enableScheduledEventDraining | quote }} - name: ENABLE_REBALANCE_MONITORING diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml index 2d6c84f2..f6c7bf42 100644 --- a/config/helm/aws-node-termination-handler/values.yaml +++ b/config/helm/aws-node-termination-handler/values.yaml @@ -270,6 +270,9 @@ metadataTries: 3 # enableSpotInterruptionDraining If 
false, do not drain nodes when the spot interruption termination notice is received. Only used in IMDS mode. enableSpotInterruptionDraining: true +# enableASGLifecycleDraining If false, do not drain nodes when ASG target lifecycle state Terminated is received. Only used in IMDS mode. +enableASGLifecycleDraining: true + # enableScheduledEventDraining If false, do not drain nodes before the maintenance window starts for an EC2 instance scheduled event. Only used in IMDS mode. enableScheduledEventDraining: true diff --git a/pkg/config/config.go b/pkg/config/config.go index ba843668..e784990b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -53,6 +53,8 @@ const ( enableScheduledEventDrainingDefault = true enableSpotInterruptionDrainingConfigKey = "ENABLE_SPOT_INTERRUPTION_DRAINING" enableSpotInterruptionDrainingDefault = true + enableASGLifecycleDrainingConfigKey = "ENABLE_ASG_LIFECYCLE_DRAINING" + enableASGLifecycleDrainingDefault = true enableSQSTerminationDrainingConfigKey = "ENABLE_SQS_TERMINATION_DRAINING" enableSQSTerminationDrainingDefault = false enableRebalanceMonitoringConfigKey = "ENABLE_REBALANCE_MONITORING" @@ -132,6 +134,7 @@ type Config struct { WebhookProxy string EnableScheduledEventDraining bool EnableSpotInterruptionDraining bool + EnableASGLifecycleDraining bool EnableSQSTerminationDraining bool EnableRebalanceMonitoring bool EnableRebalanceDraining bool @@ -195,6 +198,7 @@ func ParseCliArgs() (config Config, err error) { flag.StringVar(&config.WebhookTemplateFile, "webhook-template-file", getEnv(webhookTemplateFileConfigKey, ""), "If specified, replaces the default webhook message template with content from template file.") flag.BoolVar(&config.EnableScheduledEventDraining, "enable-scheduled-event-draining", getBoolEnv(enableScheduledEventDrainingConfigKey, enableScheduledEventDrainingDefault), "If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event") flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received") + flag.BoolVar(&config.EnableASGLifecycleDraining, "enable-asg-lifecycle-draining", getBoolEnv(enableASGLifecycleDrainingConfigKey, enableASGLifecycleDrainingDefault), "If true, drain nodes when the ASG target lifecycle state Terminated is received") flag.BoolVar(&config.EnableSQSTerminationDraining, "enable-sqs-termination-draining", getBoolEnv(enableSQSTerminationDrainingConfigKey, enableSQSTerminationDrainingDefault), "If true, drain nodes when an SQS termination event is received") flag.BoolVar(&config.EnableRebalanceMonitoring, "enable-rebalance-monitoring", getBoolEnv(enableRebalanceMonitoringConfigKey, enableRebalanceMonitoringDefault), "If true, cordon nodes when the rebalance recommendation notice is received. 
If you'd like to drain the node in addition to cordoning, then also set \"enableRebalanceDraining\".") flag.BoolVar(&config.EnableRebalanceDraining, "enable-rebalance-draining", getBoolEnv(enableRebalanceDrainingConfigKey, enableRebalanceDrainingDefault), "If true, drain nodes when the rebalance recommendation notice is received") diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 79097952..c411b9fd 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -36,6 +36,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) { t.Setenv("DRY_RUN", "true") t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "true") t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "false") + t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "false") t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false") t.Setenv("ENABLE_REBALANCE_MONITORING", "true") t.Setenv("ENABLE_REBALANCE_DRAINING", "true") @@ -62,6 +63,7 @@ func TestParseCliArgsEnvSuccess(t *testing.T) { h.Equals(t, true, nthConfig.DryRun) h.Equals(t, true, nthConfig.EnableScheduledEventDraining) h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining) + h.Equals(t, false, nthConfig.EnableASGLifecycleDraining) h.Equals(t, false, nthConfig.EnableSQSTerminationDraining) h.Equals(t, true, nthConfig.EnableRebalanceMonitoring) h.Equals(t, true, nthConfig.EnableRebalanceDraining) @@ -98,6 +100,7 @@ func TestParseCliArgsSuccess(t *testing.T) { "--dry-run=true", "--enable-scheduled-event-draining=true", "--enable-spot-interruption-draining=false", + "--enable-asg-lifecycle-draining=false", "--enable-sqs-termination-draining=false", "--enable-rebalance-monitoring=true", "--enable-rebalance-draining=true", @@ -124,6 +127,7 @@ func TestParseCliArgsSuccess(t *testing.T) { h.Equals(t, true, nthConfig.DryRun) h.Equals(t, true, nthConfig.EnableScheduledEventDraining) h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining) + h.Equals(t, false, nthConfig.EnableASGLifecycleDraining) h.Equals(t, false, nthConfig.EnableSQSTerminationDraining) h.Equals(t, true, nthConfig.EnableRebalanceMonitoring) h.Equals(t, true, nthConfig.EnableRebalanceDraining) @@ -155,6 +159,7 @@ func TestParseCliArgsOverrides(t *testing.T) { t.Setenv("DRY_RUN", "false") t.Setenv("ENABLE_SCHEDULED_EVENT_DRAINING", "false") t.Setenv("ENABLE_SPOT_INTERRUPTION_DRAINING", "true") + t.Setenv("ENABLE_ASG_LIFECYCLE_DRAINING", "true") t.Setenv("ENABLE_SQS_TERMINATION_DRAINING", "false") t.Setenv("ENABLE_REBALANCE_MONITORING", "true") t.Setenv("ENABLE_REBALANCE_DRAINING", "true") @@ -178,6 +183,7 @@ func TestParseCliArgsOverrides(t *testing.T) { "--dry-run=true", "--enable-scheduled-event-draining=true", "--enable-spot-interruption-draining=false", + "--enable-asg-lifecycle-draining=false", "--enable-sqs-termination-draining=true", "--enable-rebalance-monitoring=false", "--enable-rebalance-draining=false", @@ -205,6 +211,7 @@ func TestParseCliArgsOverrides(t *testing.T) { h.Equals(t, true, nthConfig.DryRun) h.Equals(t, true, nthConfig.EnableScheduledEventDraining) h.Equals(t, false, nthConfig.EnableSpotInterruptionDraining) + h.Equals(t, false, nthConfig.EnableASGLifecycleDraining) h.Equals(t, true, nthConfig.EnableSQSTerminationDraining) h.Equals(t, false, nthConfig.EnableRebalanceMonitoring) h.Equals(t, false, nthConfig.EnableRebalanceDraining) diff --git a/pkg/ec2metadata/ec2metadata.go b/pkg/ec2metadata/ec2metadata.go index f7f5ade8..f64d0cde 100644 --- a/pkg/ec2metadata/ec2metadata.go +++ b/pkg/ec2metadata/ec2metadata.go @@ -30,6 +30,8 @@ import ( const ( // SpotInstanceActionPath is the 
context path to spot/instance-action within IMDS SpotInstanceActionPath = "/latest/meta-data/spot/instance-action" + // ASGTargetLifecycleStatePath path to autoscaling target lifecycle state within IMDS + ASGTargetLifecycleStatePath = "/latest/meta-data/autoscaling/target-lifecycle-state" // ScheduledEventPath is the context path to events/maintenance/scheduled within IMDS ScheduledEventPath = "/latest/meta-data/events/maintenance/scheduled" // RebalanceRecommendationPath is the context path to events/recommendations/rebalance within IMDS @@ -193,6 +195,28 @@ func (e *Service) GetRebalanceRecommendationEvent() (rebalanceRec *RebalanceReco return rebalanceRec, nil } +// GetASGTargetLifecycleState retrieves ASG target lifecycle state from imds. State can be empty string +// if the lifecycle hook is not present on the ASG +func (e *Service) GetASGTargetLifecycleState() (state string, err error) { + resp, err := e.Request(ASGTargetLifecycleStatePath) + // 404s should not happen, but there can be a case if the instance is brand new and the field is not populated yet + if resp != nil && resp.StatusCode == 404 { + return "", nil + } else if resp != nil && (resp.StatusCode < 200 || resp.StatusCode >= 300) { + return "", fmt.Errorf("Metadata request received http status code: %d", resp.StatusCode) + } + if err != nil { + return "", fmt.Errorf("Unable to parse metadata response: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("Unable to parse http response. Status code: %d. %w", resp.StatusCode, err) + } + return string(body), nil +} + // GetMetadataInfo generic function for retrieving ec2 metadata func (e *Service) GetMetadataInfo(path string) (info string, err error) { metadataInfo := "" diff --git a/pkg/ec2metadata/ec2metadata_test.go b/pkg/ec2metadata/ec2metadata_test.go index 6fe58301..4a698c22 100644 --- a/pkg/ec2metadata/ec2metadata_test.go +++ b/pkg/ec2metadata/ec2metadata_test.go @@ -504,6 +504,91 @@ func TestGetRebalanceRecommendationEventRequestFailure(t *testing.T) { h.Assert(t, err != nil, "error expected because no server should be running") } +func TestGetASGTargetLifecycleStateSuccess(t *testing.T) { + requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state" + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100") + if req.URL.String() == "/latest/api/token" { + rw.WriteHeader(200) + _, err := rw.Write([]byte(`token`)) + h.Ok(t, err) + return + } + h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token") + h.Equals(t, req.URL.String(), requestPath) + _, err := rw.Write([]byte("InService")) + h.Ok(t, err) + })) + defer server.Close() + + expectedState := "InService" + + // Use URL from our local test server + imds := ec2metadata.New(server.URL, 1) + + state, err := imds.GetASGTargetLifecycleState() + h.Ok(t, err) + h.Equals(t, expectedState, state) +} + +func TestGetASGTargetLifecycleState404Success(t *testing.T) { + requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state" + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100") + if req.URL.String() == "/latest/api/token" { + rw.WriteHeader(200) + _, err := rw.Write([]byte(`token`)) + h.Ok(t, err) + return + } + h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token") + h.Equals(t, req.URL.String(), 
requestPath) + rw.WriteHeader(404) + })) + defer server.Close() + + // Use URL from our local test server + imds := ec2metadata.New(server.URL, 1) + + state, err := imds.GetASGTargetLifecycleState() + h.Ok(t, err) + h.Assert(t, state == "", "ASG target lifecycle state should be empty") +} + +func TestGetASGTargetLifecycleState500Failure(t *testing.T) { + requestPath := "/latest/meta-data/autoscaling/target-lifecycle-state" + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + rw.Header().Add("X-aws-ec2-metadata-token-ttl-seconds", "100") + if req.URL.String() == "/latest/api/token" { + rw.WriteHeader(200) + _, err := rw.Write([]byte(`token`)) + h.Ok(t, err) + return + } + h.Equals(t, req.Header.Get("X-aws-ec2-metadata-token"), "token") + h.Equals(t, req.URL.String(), requestPath) + rw.WriteHeader(500) + })) + defer server.Close() + + // Use URL from our local test server + imds := ec2metadata.New(server.URL, 1) + + _, err := imds.GetASGTargetLifecycleState() + h.Assert(t, err != nil, "error expected on non-200 or non-404 status code") +} + +func TestGetASGTargetLifecycleStateRequestFailure(t *testing.T) { + // Use URL from our local test server + imds := ec2metadata.New("/some-path-that-will-error", 1) + + _, err := imds.GetASGTargetLifecycleState() + h.Assert(t, err != nil, "error expected because no server should be running") +} + func TestGetMetadataServiceRequest404(t *testing.T) { var requestPath string = "/latest/meta-data/instance-type" diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go new file mode 100644 index 00000000..a623ae96 --- /dev/null +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor.go @@ -0,0 +1,103 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +package asglifecycle + +import ( + "crypto/sha256" + "fmt" + "time" + + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" +) + +// ASGLifecycleMonitorKind is a const to define this monitor kind +const ASGLifecycleMonitorKind = "ASG_LIFECYCLE_MONITOR" + +// ASGLifecycleMonitor is a struct definition which facilitates monitoring of ASG target lifecycle state from IMDS +type ASGLifecycleMonitor struct { + IMDS *ec2metadata.Service + InterruptionChan chan<- monitor.InterruptionEvent + CancelChan chan<- monitor.InterruptionEvent + NodeName string +} + +// NewASGLifecycleMonitor creates an instance of an ASG lifecycle IMDS monitor +func NewASGLifecycleMonitor(imds *ec2metadata.Service, interruptionChan chan<- monitor.InterruptionEvent, cancelChan chan<- monitor.InterruptionEvent, nodeName string) ASGLifecycleMonitor { + return ASGLifecycleMonitor{ + IMDS: imds, + InterruptionChan: interruptionChan, + CancelChan: cancelChan, + NodeName: nodeName, + } +} + +// Monitor continuously monitors metadata for ASG target lifecycle state and sends interruption events to the passed in channel +func (m ASGLifecycleMonitor) Monitor() error { + interruptionEvent, err := m.checkForASGTargetLifecycleStateNotice() + if err != nil { + return err + } + if interruptionEvent != nil && interruptionEvent.Kind == monitor.ASGLifecycleKind { + m.InterruptionChan <- *interruptionEvent + } + return nil +} + +// Kind denotes the kind of monitor +func (m ASGLifecycleMonitor) Kind() string { + return ASGLifecycleMonitorKind +} + +// checkForASGTargetLifecycleStateNotice checks EC2 instance metadata for an ASG lifecycle termination notice +func (m ASGLifecycleMonitor) checkForASGTargetLifecycleStateNotice() (*monitor.InterruptionEvent, error) { + state, err := m.IMDS.GetASGTargetLifecycleState() + if err != nil { + return nil, fmt.Errorf("There was a problem checking for ASG target lifecycle state: %w", err) + } + if state != "Terminated" { + // if the state is not "Terminated", we can skip. State can also be empty (no hook configured). + return nil, nil + } + + nodeName := m.NodeName + // there is no time in the response, we just set time to the latest check + interruptionTime := time.Now() + + // There's no EventID returned, so we'll create it using a hash to prevent duplicates. + hash := sha256.New() + if _, err = hash.Write([]byte(fmt.Sprintf("%s:%s", state, interruptionTime))); err != nil { + return nil, fmt.Errorf("There was a problem creating an event ID from the event: %w", err) + } + + return &monitor.InterruptionEvent{ + EventID: fmt.Sprintf("target-lifecycle-state-terminated-%x", hash.Sum(nil)), + Kind: monitor.ASGLifecycleKind, + Monitor: ASGLifecycleMonitorKind, + StartTime: interruptionTime, + NodeName: nodeName, + Description: "ASG target lifecycle state received. 
Instance will be terminated\n", + PreDrainTask: setInterruptionTaint, + }, nil +} + +func setInterruptionTaint(interruptionEvent monitor.InterruptionEvent, n node.Node) error { + err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID) + if err != nil { + return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID, err) + } + + return nil +} diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go new file mode 100644 index 00000000..539b3b1a --- /dev/null +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_internal_test.go @@ -0,0 +1,99 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package asglifecycle + +import ( + "context" + "testing" + "time" + + "github.com/rs/zerolog/log" + + "github.com/aws/aws-node-termination-handler/pkg/config" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + "github.com/aws/aws-node-termination-handler/pkg/node" + h "github.com/aws/aws-node-termination-handler/pkg/test" + "github.com/aws/aws-node-termination-handler/pkg/uptime" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + "k8s.io/kubectl/pkg/drain" +) + +const nodeName = "NAME" + +func getDrainHelper(client *fake.Clientset) *drain.Helper { + return &drain.Helper{ + Client: client, + Force: true, + GracePeriodSeconds: -1, + IgnoreAllDaemonSets: true, + DeleteEmptyDirData: true, + Timeout: time.Duration(120) * time.Second, + Out: log.Logger, + ErrOut: log.Logger, + } +} + +func TestSetInterruptionTaint(t *testing.T) { + drainEvent := monitor.InterruptionEvent{ + EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", + } + nthConfig := config.Config{ + DryRun: true, + NodeName: nodeName, + } + + client := fake.NewSimpleClientset() + _, err := client.CoreV1().Nodes().Create(context.Background(), &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}, metav1.CreateOptions{}) + h.Ok(t, err) + + tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) + h.Ok(t, err) + + err = setInterruptionTaint(drainEvent, *tNode) + + h.Ok(t, err) +} + +func TestInterruptionTaintAlreadyPresent(t *testing.T) { + drainEvent := monitor.InterruptionEvent{ + EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters", + } + nthConfig := config.Config{ + DryRun: false, + NodeName: nodeName, + } + + client := fake.NewSimpleClientset() + newNode := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: nodeName}, + Spec: v1.NodeSpec{Taints: []v1.Taint{{ + Key: node.ASGLifecycleTerminationTaint, + Value: drainEvent.EventID[:63], + Effect: v1.TaintEffectNoSchedule, + }, + }}, + } + + _, err := client.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{}) + h.Ok(t, err) + + tNode, err := 
node.NewWithValues(nthConfig, getDrainHelper(client), uptime.Uptime) + h.Ok(t, err) + + err = setInterruptionTaint(drainEvent, *tNode) + + h.Ok(t, err) +} diff --git a/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go new file mode 100644 index 00000000..cf6ab6ac --- /dev/null +++ b/pkg/monitor/asglifecycle/asg-lifecycle-monitor_test.go @@ -0,0 +1,125 @@ +// Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"). You may +// not use this file except in compliance with the License. A copy of the +// License is located at +// +// http://aws.amazon.com/apache2.0/ +// +// or in the "license" file accompanying this file. This file is distributed +// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either +// express or implied. See the License for the specific language governing +// permissions and limitations under the License. + +package asglifecycle_test + +import ( + "github.com/aws/aws-node-termination-handler/pkg/monitor/asglifecycle" + "net/http" + "net/http/httptest" + "testing" + + "github.com/aws/aws-node-termination-handler/pkg/ec2metadata" + "github.com/aws/aws-node-termination-handler/pkg/monitor" + h "github.com/aws/aws-node-termination-handler/pkg/test" +) + +const ( + expFormattedTime = "2020-10-26 15:15:15 +0000 UTC" + imdsV2TokenPath = "/latest/api/token" + nodeName = "test-node" +) + +var asgTargetLifecycleStateResponse = []byte("InService") + +func TestMonitor_Success(t *testing.T) { + requestPath := ec2metadata.ASGTargetLifecycleStatePath + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + h.Equals(t, req.URL.String(), requestPath) + _, err := rw.Write(asgTargetLifecycleStateResponse) + h.Ok(t, err) + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + go func() { + result := <-drainChan + h.Equals(t, monitor.ASGLifecycleKind, result.Kind) + h.Equals(t, asglifecycle.ASGLifecycleMonitorKind, result.Monitor) + h.Equals(t, expFormattedTime, result.StartTime.String()) + }() + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Ok(t, err) +} + +func TestMonitor_MetadataParseFailure(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Ok(t, err) +} + +func TestMonitor_404Response(t *testing.T) { + requestPath := ec2metadata.ASGTargetLifecycleStatePath + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + h.Equals(t, req.URL.String(), requestPath) + http.Error(rw, "error", http.StatusNotFound) + })) + defer server.Close() + + drainChan := make(chan 
monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Ok(t, err) +} + +func TestMonitor_500Response(t *testing.T) { + requestPath := ec2metadata.ASGTargetLifecycleStatePath + + server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) { + if imdsV2TokenPath == req.URL.String() { + rw.WriteHeader(403) + return + } + h.Equals(t, req.URL.String(), requestPath) + http.Error(rw, "error", http.StatusInternalServerError) + })) + defer server.Close() + + drainChan := make(chan monitor.InterruptionEvent) + cancelChan := make(chan monitor.InterruptionEvent) + imds := ec2metadata.New(server.URL, 1) + + asgLifecycleMonitor := asglifecycle.NewASGLifecycleMonitor(imds, drainChan, cancelChan, nodeName) + err := asgLifecycleMonitor.Monitor() + h.Assert(t, err != nil, "Failed to return error when 500 response") +} diff --git a/test/e2e/asg-lifecycle-imds-test b/test/e2e/asg-lifecycle-imds-test new file mode 100755 index 00000000..ad3a9de4 --- /dev/null +++ b/test/e2e/asg-lifecycle-imds-test @@ -0,0 +1,138 @@ +#!/bin/bash +set -euo pipefail + +# The purpose of this end-to-end test is to ensure that nodes with an NTH pod will cordon and drain when they receive an ASG Termination event +# through Instance Metadata Service (IMDS) +# This test assumes that AEMM (aws ec2 metadata mock service) provides an endpoint for getting target-lifecycle-state +# For more details on expected behavior +# Reference: https://docs.aws.amazon.com/autoscaling/ec2/userguide/retrieving-target-lifecycle-state-through-imds.html + +# Available env vars: +# $CLUSTER_NAME +# $NODE_TERMINATION_HANDLER_DOCKER_REPO +# $NODE_TERMINATION_HANDLER_DOCKER_TAG +# $WEBHOOK_DOCKER_REPO +# $WEBHOOK_DOCKER_TAG +# $AEMM_URL +# $IMDS_PORT + +function fail_and_exit { + echo "❌ ASG Lifecycle IMDS Test failed $CLUSTER_NAME ❌" + exit "${1:-1}" +} + +echo "Starting ASG Lifecycle IMDS Test for Node Termination Handler" + +SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" + +common_helm_args=() + +anth_helm_args=( + upgrade + --install + --namespace kube-system + "$CLUSTER_NAME-nth" + "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/" + --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}" + --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO" + --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG" + --set taintNode="true" + --set enableASGLifecycleDraining="true" + --set enableSpotInterruptionDraining="false" + --set enableScheduledEventDraining="false" + --set daemonsetTolerations="" + --wait + --force +) +[[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]] && + anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + anth_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${anth_helm_args[@]}" +set +x + +emtp_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-emtp" + "$SCRIPTPATH/../../config/helm/webhook-test-proxy/" + --set webhookTestProxy.image.repository="$WEBHOOK_DOCKER_REPO" + --set webhookTestProxy.image.tag="$WEBHOOK_DOCKER_TAG" + --wait +) +[[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]] && + emtp_helm_args+=(--set webhookTestProxy.image.pullPolicy="$WEBHOOK_DOCKER_PULL_POLICY") +[[ ${#common_helm_args[@]} -gt 0 ]] && + 
emtp_helm_args+=("${common_helm_args[@]}") + +set -x +helm "${emtp_helm_args[@]}" +set +x + +aemm_helm_args=( + upgrade + --install + --namespace default + "$CLUSTER_NAME-aemm" + "$AEMM_DL_URL" + --set servicePort="$IMDS_PORT" + --set 'tolerations[0].effect=NoSchedule' + --set 'tolerations[0].operator=Exists' + --set arguments='{asglifecycle}' + --wait +) +[[ ${#common_helm_args[@]} -gt 0 ]] && + aemm_helm_args+=("${common_helm_args[@]}") + +set -x +retry 5 helm "${aemm_helm_args[@]}" +set +x + +TAINT_CHECK_CYCLES=15 +TAINT_CHECK_SLEEP=15 + +DEPLOYED=0 +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') -eq 0 ]]; then + echo "✅ Verified regular-pod-test pod was scheduled and started!" + DEPLOYED=1 + break + fi + echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $DEPLOYED -eq 0 ]]; then + echo "❌ regular-pod-test pod deployment failed" + fail_and_exit 2 +fi + + +# Check that worker node was cordoned and drained +cordoned=0 +test_node="${TEST_NODE:-$CLUSTER_NAME-worker}" +for i in $(seq 1 $TAINT_CHECK_CYCLES); do + if [[ $cordoned -eq 0 ]] && kubectl get nodes "${test_node}" | grep SchedulingDisabled > /dev/null; then + echo "✅ Verified the worker node was cordoned!" + cordoned=1 + fi + + if [[ $cordoned -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then + echo "✅ Verified the regular-pod-test pod was evicted!" + echo "✅ ASG Lifecycle IMDS Test Passed $CLUSTER_NAME! ✅" + exit 0 + fi + echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" + sleep $TAINT_CHECK_SLEEP +done + +if [[ $cordoned -eq 0 ]]; then + echo "❌ Worker node was not cordoned" +else + echo "❌ regular-pod-test was not evicted" +fi + +fail_and_exit 1 \ No newline at end of file diff --git a/test/eks-cluster-test/run-test b/test/eks-cluster-test/run-test index 949f68d1..a1e3e316 100755 --- a/test/eks-cluster-test/run-test +++ b/test/eks-cluster-test/run-test @@ -6,7 +6,7 @@ SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" PRESERVE=true export TEST_WINDOWS="false" -export AEMM_VERSION="1.8.1" +export AEMM_VERSION="1.12.0" export AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz" export CLUSTER_CONFIG_FILE=$SCRIPTPATH/cluster-spec.yaml diff --git a/test/k8s-local-cluster-test/run-test b/test/k8s-local-cluster-test/run-test index fd80d9fd..7c820cfe 100755 --- a/test/k8s-local-cluster-test/run-test +++ b/test/k8s-local-cluster-test/run-test @@ -18,7 +18,7 @@ WEBHOOK_DOCKER_IMG="" OVERRIDE_PATH=0 K8S_VERSION="1.30" AEMM_URL="amazon-ec2-metadata-mock-service.default.svc.cluster.local" -AEMM_VERSION="1.8.1" +AEMM_VERSION="1.12.0" AEMM_DL_URL="https://github.com/aws/amazon-ec2-metadata-mock/releases/download/v$AEMM_VERSION/amazon-ec2-metadata-mock-$AEMM_VERSION.tgz" WEBHOOK_URL=${WEBHOOK_URL:="http://webhook-test-proxy.default.svc.cluster.local"}
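
A note on the IMDS field that drives the new monitor: the handler polls autoscaling/target-lifecycle-state and only taints and drains the node when the value is exactly "Terminated"; "InService", an empty response, or a 404 (no termination lifecycle hook configured, or a brand-new instance) is treated as a no-op. As a rough illustration of what the monitor reads, and assuming an instance that belongs to an ASG with IMDSv2 enabled (the 169.254.169.254 address, token path, and header names below are standard IMDS conventions, not anything introduced by this patch), the state can be checked by hand:

  # Fetch an IMDSv2 token, then read the ASG target lifecycle state.
  # Typical values are "InService" and "Terminated"; a 404 means the field is not populated yet.
  TOKEN=$(curl -sS -X PUT "http://169.254.169.254/latest/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
  curl -sS -H "X-aws-ec2-metadata-token: $TOKEN" \
    "http://169.254.169.254/latest/meta-data/autoscaling/target-lifecycle-state"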