Skip to content

Commit

Permalink
Taint nodes on spot and scheduled events (#162)
Browse files Browse the repository at this point in the history
* Taint nodes on spot and scheduled events

Fixes #160.

Signed-off-by: Ilya Shaisultanov <[email protected]>
  • Loading branch information
diversario authored May 18, 2020
1 parent b647783 commit 0d528d6
Show file tree
Hide file tree
Showing 18 changed files with 417 additions and 24 deletions.
1 change: 1 addition & 0 deletions cmd/node-termination-handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ func watchForCancellationEvents(cancelChan <-chan interruptionevent.Interruption
log.Log().Msgf("Uncordoning the node failed: %v", err)
}
node.RemoveNTHLabels()
node.RemoveNTHTaints()
} else {
log.Log().Msg("Another interruption event is active, not uncordoning the node")
}
Expand Down
1 change: 1 addition & 0 deletions config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Parameter | Description | Default
`enableSpotInterruptionDraining` | If true, drain nodes when the spot interruption termination notice is received | `true`
`metadataTries` | The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3. | `3`
`cordonOnly` | If true, nodes will be cordoned but not drained when an interruption event occurs. | `false`
`taintNode` | If true, nodes will be tainted when an interruption event occurs. Currently used taint keys are `aws-node-termination-handler/scheduled-maintenance` and `aws-node-termination-handler/spot-itn` | `false`
`jsonLogging` | If true, use JSON-formatted logs instead of human readable logs. | `false`
`affinity` | node/pod affinities | None
`podAnnotations` | annotations to add to each pod | `{}`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ spec:
value: {{ .Values.metadataTries | quote }}
- name: CORDON_ONLY
value: {{ .Values.cordonOnly | quote }}
- name: TAINT_NODE
value: {{ .Values.taintNode | quote }}
- name: JSON_LOGGING
value: {{ .Values.jsonLogging | quote }}
- name: WEBHOOK_PROXY
Expand Down
2 changes: 2 additions & 0 deletions config/helm/aws-node-termination-handler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ enableSpotInterruptionDraining: ""
## enableScheduledEventDraining [EXPERIMENTAL] If true, drain nodes before the maintenance window starts for an EC2 instance scheduled event
enableScheduledEventDraining: ""

taintNode: false

## dryRun tells node-termination-handler to only log calls to kubernetes control plane
dryRun: false

Expand Down
4 changes: 4 additions & 0 deletions config/helm/ec2-metadata-test-proxy/templates/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,8 @@ spec:
value: {{ .Values.ec2MetadataTestProxy.enableSpotITN | quote }}
- name: ENABLE_IMDS_V2
value: {{ .Values.ec2MetadataTestProxy.enableIMDSV2 | quote }}
{{- if .Values.ec2MetadataTestProxy.tolerations }}
tolerations:
{{ toYaml .Values.ec2MetadataTestProxy.tolerations | indent 8 }}
{{- end }}
{{- end -}}
2 changes: 1 addition & 1 deletion config/helm/ec2-metadata-test-proxy/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ ec2MetadataTestProxy:
image:
repository: ec2-metadata-test-proxy
tag: customtest
tolerations: []
regularPodTest:
create: true
label: regular-pod-test
port: 1339

5 changes: 5 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
metadataTriesConfigKey = "METADATA_TRIES"
metadataTriesDefault = 3
cordonOnly = "CORDON_ONLY"
taintNode = "TAINT_NODE"
jsonLoggingConfigKey = "JSON_LOGGING"
jsonLoggingDefault = false
)
Expand All @@ -75,6 +76,7 @@ type Config struct {
EnableSpotInterruptionDraining bool
MetadataTries int
CordonOnly bool
TaintNode bool
JsonLogging bool
}

Expand Down Expand Up @@ -107,6 +109,7 @@ func ParseCliArgs() (config Config, err error) {
flag.BoolVar(&config.EnableSpotInterruptionDraining, "enable-spot-interruption-draining", getBoolEnv(enableSpotInterruptionDrainingConfigKey, enableSpotInterruptionDrainingDefault), "If true, drain nodes when the spot interruption termination notice is received")
flag.IntVar(&config.MetadataTries, "metadata-tries", getIntEnv(metadataTriesConfigKey, metadataTriesDefault), "The number of times to try requesting metadata. If you would like 2 retries, set metadata-tries to 3.")
flag.BoolVar(&config.CordonOnly, "cordon-only", getBoolEnv(cordonOnly, false), "If true, nodes will be cordoned but not drained when an interruption event occurs.")
flag.BoolVar(&config.TaintNode, "taint-node", getBoolEnv(taintNode, false), "If true, nodes will be tainted when an interruption event occurs.")
flag.BoolVar(&config.JsonLogging, "json-logging", getBoolEnv(jsonLoggingConfigKey, jsonLoggingDefault), "If true, use JSON-formatted logs instead of human readable logs.")

flag.Parse()
Expand Down Expand Up @@ -142,6 +145,7 @@ func ParseCliArgs() (config Config, err error) {
"\tenable-spot-interruption-draining: %t,\n"+
"\tmetadata-tries: %d,\n"+
"\tcordon-only: %t,\n"+
"\ttaint-node: %t,\n"+
"\tjson-logging: %t,\n"+
"\twebhook-proxy: %s,\n",
config.DryRun,
Expand All @@ -157,6 +161,7 @@ func ParseCliArgs() (config Config, err error) {
config.EnableSpotInterruptionDraining,
config.MetadataTries,
config.CordonOnly,
config.TaintNode,
config.JsonLogging,
config.WebhookProxy,
)
Expand Down
14 changes: 10 additions & 4 deletions pkg/interruptionevent/scheduled-event.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,20 +85,26 @@ func checkForScheduledEvents(imds *ec2metadata.Service) ([]InterruptionEvent, er
return events, nil
}

func uncordonAfterRebootPreDrain(interruptionEvent InterruptionEvent, node node.Node) error {
err := node.MarkWithEventID(interruptionEvent.EventID)
func uncordonAfterRebootPreDrain(interruptionEvent InterruptionEvent, n node.Node) error {
err := n.MarkWithEventID(interruptionEvent.EventID)
if err != nil {
return fmt.Errorf("Unable to mark node with event ID: %w", err)
}

err = n.TaintScheduledMaintenance(interruptionEvent.EventID)
if err != nil {
return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.ScheduledMaintenanceTaint, interruptionEvent.EventID, err)
}

// if the node is already marked as unschedulable, then don't do anything
unschedulable, err := node.IsUnschedulable()
unschedulable, err := n.IsUnschedulable()
if err == nil && unschedulable {
log.Log().Msg("Node is already marked unschedulable, not taking any action to add uncordon label.")
return nil
} else if err != nil {
return fmt.Errorf("Encountered an error while checking if the node is unschedulable. Not setting an uncordon label: %w", err)
}
err = node.MarkForUncordonAfterReboot()
err = n.MarkForUncordonAfterReboot()
if err != nil {
return fmt.Errorf("Unable to mark the node for uncordon: %w", err)
}
Expand Down
19 changes: 15 additions & 4 deletions pkg/interruptionevent/scheduled-event_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,26 @@ func getNode(t *testing.T, drainHelper *drain.Helper) *node.Node {
}

func TestUncordonAfterRebootPreDrainSuccess(t *testing.T) {
drainEvent := InterruptionEvent{}
drainEvent := InterruptionEvent{
EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
}
nthConfig := config.Config{
DryRun: true,
DryRun: true,
NodeName: nodeName,
}
tNode, _ := node.New(nthConfig)

err := uncordonAfterRebootPreDrain(drainEvent, *tNode)
client := fake.NewSimpleClientset()
_, err := client.CoreV1().Nodes().Create(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}})
h.Ok(t, err)

tNode, err := node.NewWithValues(nthConfig, getDrainHelper(client))
h.Ok(t, err)

err = uncordonAfterRebootPreDrain(drainEvent, *tNode)

h.Ok(t, err)
}

func TestUncordonAfterRebootPreDrainMarkWithEventIDFailure(t *testing.T) {
resetFlagsForTest()

Expand Down
21 changes: 17 additions & 4 deletions pkg/interruptionevent/spot-itn-event.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"time"

"github.com/aws/aws-node-termination-handler/pkg/ec2metadata"
"github.com/aws/aws-node-termination-handler/pkg/node"
"github.com/rs/zerolog/log"
)

Expand Down Expand Up @@ -59,10 +60,22 @@ func checkForSpotInterruptionNotice(imds *ec2metadata.Service) (*InterruptionEve
hash := sha256.New()
hash.Write([]byte(fmt.Sprintf("%v", instanceAction)))

var preDrainFunc preDrainTask = setInterruptionTaint

return &InterruptionEvent{
EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
Kind: SpotITNKind,
StartTime: interruptionTime,
Description: fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time),
EventID: fmt.Sprintf("spot-itn-%x", hash.Sum(nil)),
Kind: SpotITNKind,
StartTime: interruptionTime,
Description: fmt.Sprintf("Spot ITN received. Instance will be interrupted at %s \n", instanceAction.Time),
PreDrainTask: preDrainFunc,
}, nil
}

// setInterruptionTaint is the pre-drain task for spot interruption events:
// it applies the spot-ITN taint (keyed by the event ID) to the node so that
// no new pods are scheduled onto it while the interruption is handled.
func setInterruptionTaint(interruptionEvent InterruptionEvent, n node.Node) error {
	err := n.TaintSpotItn(interruptionEvent.EventID)
	if err != nil {
		// Bug fix: report the taint that actually failed — the spot-ITN
		// taint, not the scheduled-maintenance one.
		return fmt.Errorf("Unable to taint node with taint %s:%s: %w", node.SpotInterruptionTaint, interruptionEvent.EventID, err)
	}

	return nil
}
95 changes: 95 additions & 0 deletions pkg/interruptionevent/spot-itn-event_internal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Copyright 2016-2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package interruptionevent

import (
"os"
"testing"
"time"

"github.com/aws/aws-node-termination-handler/pkg/config"
"github.com/aws/aws-node-termination-handler/pkg/node"
h "github.com/aws/aws-node-termination-handler/pkg/test"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
"k8s.io/kubectl/pkg/drain"
)

var spotNodeName = "NAME"

// getSpotDrainHelper returns a drain.Helper wired to the given fake
// clientset, configured the way the spot-ITN tests expect to drain a node
// (forced, ignoring DaemonSets, deleting local data, 120s timeout).
func getSpotDrainHelper(client *fake.Clientset) *drain.Helper {
	helper := &drain.Helper{
		Client:              client,
		Force:               true,
		IgnoreAllDaemonSets: true,
		DeleteLocalData:     true,
		GracePeriodSeconds:  -1,
		Timeout:             120 * time.Second,
		Out:                 os.Stdout,
		ErrOut:              os.Stderr,
	}
	return helper
}

func TestSetInterruptionTaint(t *testing.T) {
drainEvent := InterruptionEvent{
EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
}
nthConfig := config.Config{
DryRun: true,
NodeName: spotNodeName,
}

client := fake.NewSimpleClientset()
_, err := client.CoreV1().Nodes().Create(&v1.Node{ObjectMeta: metav1.ObjectMeta{Name: spotNodeName}})
h.Ok(t, err)

tNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client))
h.Ok(t, err)

err = setInterruptionTaint(drainEvent, *tNode)

h.Ok(t, err)
}

// TestInterruptionTaintAlreadyPresent verifies that applying the spot-ITN
// taint is a no-op success when the node already carries it (idempotence),
// with DryRun disabled so the real taint path is exercised.
func TestInterruptionTaintAlreadyPresent(t *testing.T) {
	drainEvent := InterruptionEvent{
		EventID: "some-id-that-is-very-long-for-some-reason-and-is-definitely-over-63-characters",
	}
	nthConfig := config.Config{
		DryRun:   false,
		NodeName: spotNodeName,
	}

	// Pre-tainted node: the taint value is the event ID truncated to the
	// 63-character taint-value limit.
	existingTaint := v1.Taint{
		Key:    node.SpotInterruptionTaint,
		Value:  drainEvent.EventID[:63],
		Effect: v1.TaintEffectNoSchedule,
	}
	preTaintedNode := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: spotNodeName},
		Spec:       v1.NodeSpec{Taints: []v1.Taint{existingTaint}},
	}

	client := fake.NewSimpleClientset()
	_, err := client.CoreV1().Nodes().Create(preTaintedNode)
	h.Ok(t, err)

	testNode, err := node.NewWithValues(nthConfig, getSpotDrainHelper(client))
	h.Ok(t, err)

	h.Ok(t, setInterruptionTaint(drainEvent, *testNode))
}
Loading

0 comments on commit 0d528d6

Please sign in to comment.