From 6c789ce12afe4f58033311fd512224976d75e081 Mon Sep 17 00:00:00 2001
From: Matt Merkes
Date: Mon, 21 Dec 2020 07:34:00 -0800
Subject: [PATCH] Adds test for amazon-eks-ami issue 454, a soft lockup bug

---
 eks/amazon-eks-ami-issue-454/README.md        |  17 +
 .../soft-lockup-454.go                        | 334 ++++++++++++++++++
 eks/eks.go                                    |   8 +
 eksconfig/add-on-ami-soft-lockup-issue-454.go |  65 ++++
 eksconfig/config.go                           |   9 +
 eksconfig/env.go                              |  16 +
 6 files changed, 449 insertions(+)
 create mode 100644 eks/amazon-eks-ami-issue-454/README.md
 create mode 100644 eks/amazon-eks-ami-issue-454/soft-lockup-454.go
 create mode 100644 eksconfig/add-on-ami-soft-lockup-issue-454.go

diff --git a/eks/amazon-eks-ami-issue-454/README.md b/eks/amazon-eks-ami-issue-454/README.md
new file mode 100644
index 000000000..c1cfff531
--- /dev/null
+++ b/eks/amazon-eks-ami-issue-454/README.md
@@ -0,0 +1,17 @@
+This is intended to test a soft lockup issue described [here](https://github.com/awslabs/amazon-eks-ami/issues/454).
+This is based off of [this repo](https://github.com/mmerkes/eks-k8s-repro-assistant/tree/master/scenarios/decompression-loop).
+
+### Running
+
+Here's an example command that will run this test:
+
+```
+AWS_K8S_TESTER_EKS_ON_FAILURE_DELETE=true \
+AWS_K8S_TESTER_EKS_ADD_ON_NODE_GROUPS_ENABLE=true \
+AWS_K8S_TESTER_EKS_ADD_ON_NODE_GROUPS_ROLE_CREATE=true \
+AWS_K8S_TESTER_EKS_ADD_ON_NODE_GROUPS_ASGS='{"soft-lockup":{"name":"soft-lockup","remote-access-user-name":"ec2-user","ami-type":"AL2_x86_64","image-id-ssm-parameter":"/aws/service/eks/optimized-ami/1.16/amazon-linux-2/recommended/image_id","instance-types":["m5.2xlarge"],"volume-size":40,"asg-min-size":1,"asg-max-size":1,"asg-desired-capacity":1,"kubelet-extra-args":"--node-labels amazon-ami-issue=454"}}' \
+AWS_K8S_TESTER_EKS_ADD_ON_AMI_SOFT_LOCKUP_ISSUE_454_ENABLE=true \
+AWS_K8S_TESTER_EKS_ADD_ON_AMI_SOFT_LOCKUP_ISSUE_454_DEPLOYMENT_NODE_SELECTOR='{"amazon-ami-issue":"454"}' \
+AWS_K8S_TESTER_EKS_PARAMETERS_REQUEST_HEADER_KEY="x-eks-opts" \
+./bin/aws-k8s-tester-latest-darwin-amd64 eks create cluster --enable-prompt=true --path ./stack/test.yaml
+```

diff --git a/eks/amazon-eks-ami-issue-454/soft-lockup-454.go b/eks/amazon-eks-ami-issue-454/soft-lockup-454.go
new file mode 100644
index 000000000..509be5e69
--- /dev/null
+++ b/eks/amazon-eks-ami-issue-454/soft-lockup-454.go
@@ -0,0 +1,334 @@
+/*
+Package amazoneksamiissue454 is intended to test a soft lockup issue described here:
+https://github.com/awslabs/amazon-eks-ami/issues/454
+This is based off of the following repro:
+https://github.com/mmerkes/eks-k8s-repro-assistant/tree/master/scenarios/decompression-loop
+*/
+package amazoneksamiissue454
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"reflect"
+	"strings"
+	"time"
+
+	eks_tester "github.com/aws/aws-k8s-tester/eks/tester"
+	"github.com/aws/aws-k8s-tester/eksconfig"
+	k8s_client "github.com/aws/aws-k8s-tester/pkg/k8s-client"
+	"github.com/aws/aws-k8s-tester/pkg/timeutil"
+	"github.com/aws/aws-sdk-go/aws"
+	"go.uber.org/zap"
+	appsv1 "k8s.io/api/apps/v1"
+	v1 "k8s.io/api/core/v1"
+	apierrs "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/exec"
+)
+
+// Config defines test configuration.
+type Config struct {
+	Logger    *zap.Logger
+	LogWriter io.Writer
+	Stopc     chan struct{}
+	EKSConfig *eksconfig.Config
+	K8SClient k8s_client.EKS
+}
+
+var pkgName = reflect.TypeOf(tester{}).PkgPath()
+
+func (ts *tester) Name() string { return pkgName }
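+
+// A minimal usage sketch (hypothetical caller, shown for illustration only;
+// the real wiring is the eks/eks.go hunk later in this patch):
+//
+//	ts := amazoneksamiissue454.New(amazoneksamiissue454.Config{
+//		Logger:    lg,        // *zap.Logger
+//		LogWriter: logWriter, // io.Writer that receives kubectl output
+//		Stopc:     stopc,     // channel closed to interrupt waits
+//		EKSConfig: cfg,       // *eksconfig.Config with AddOnAmiSoftLockupIssue454 set
+//		K8SClient: k8sClient, // k8s_client.EKS
+//	})
+//	if err := ts.Create(); err != nil {
+//		// handle error; ts.Delete() cleans up the Deployment and namespace
+//	}
+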
+// New creates a new tester for the soft lockup issue 454 repro.
+func New(cfg Config) eks_tester.Tester {
+	cfg.Logger.Info("creating tester", zap.String("tester", pkgName))
+	return &tester{cfg: cfg}
+}
+
+type tester struct {
+	cfg Config
+}
+
+const (
+	deploymentName = "soft-lockup-454"
+	// decompressionLoopCommand clones aws-sdk-go, tars it once, then untars
+	// the archive every five seconds forever to generate the sustained
+	// decompression load from the linked repro.
+	decompressionLoopCommand = "yum install git -y; git clone https://github.com/aws/aws-sdk-go.git; tar cvf sdk.tar.gz aws-sdk-go; rm -rf aws-sdk-go && while true; do tar xvf sdk.tar.gz; sleep 5; done"
+	nodeCheckWaitSeconds     = 120
+	nodeCheckIntervalSeconds = 5
+)
+
+func (ts *tester) Create() error {
+	if !ts.cfg.EKSConfig.IsEnabledAddOnAmiSoftLockupIssue454() {
+		ts.cfg.Logger.Info("skipping tester.Create", zap.String("tester", pkgName))
+		return nil
+	}
+	if ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Created {
+		ts.cfg.Logger.Info("skipping tester.Create", zap.String("tester", pkgName))
+		return nil
+	}
+
+	ts.cfg.Logger.Info("starting tester.Create", zap.String("tester", pkgName))
+	createStart := time.Now()
+
+	ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Created = true
+	defer func() {
+		createEnd := time.Now()
+		ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.TimeFrameCreate = timeutil.NewTimeFrame(createStart, createEnd)
+		ts.cfg.EKSConfig.Sync()
+	}()
+
+	if err := k8s_client.CreateNamespace(
+		ts.cfg.Logger,
+		ts.cfg.K8SClient.KubernetesClientSet(),
+		ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace,
+	); err != nil {
+		return err
+	}
+	if err := ts.createDeployment(); err != nil {
+		return err
+	}
+	if err := ts.waitDeployment(); err != nil {
+		return err
+	}
+
+	if err := ts.validateNodesStayHealthy(); err != nil {
+		return err
+	}
+
+	return ts.cfg.EKSConfig.Sync()
+}
+
+func (ts *tester) Delete() error {
+	if !ts.cfg.EKSConfig.IsEnabledAddOnAmiSoftLockupIssue454() {
+		ts.cfg.Logger.Info("skipping tester.Delete", zap.String("tester", pkgName))
+		return nil
+	}
+	if !ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Created {
+		ts.cfg.Logger.Info("skipping tester.Delete", zap.String("tester", pkgName))
+		return nil
+	}
+
+	ts.cfg.Logger.Info("starting tester.Delete", zap.String("tester", pkgName))
+	deleteStart := time.Now()
+	defer func() {
+		deleteEnd := time.Now()
+		ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.TimeFrameDelete = timeutil.NewTimeFrame(deleteStart, deleteEnd)
+		ts.cfg.EKSConfig.Sync()
+	}()
+
+	var errs []string
+
+	if err := ts.deleteDeployment(); err != nil {
+		errs = append(errs, fmt.Sprintf("failed to delete soft-lockup-issue-454 Deployment (%v)", err))
+	}
+	ts.cfg.Logger.Info("wait for a minute after deleting Deployment")
+	time.Sleep(time.Minute)
+
+	if err := k8s_client.DeleteNamespaceAndWait(
+		ts.cfg.Logger,
+		ts.cfg.K8SClient.KubernetesClientSet(),
+		ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace,
+		k8s_client.DefaultNamespaceDeletionInterval,
+		k8s_client.DefaultNamespaceDeletionTimeout,
+		k8s_client.WithForceDelete(true),
+	); err != nil {
+		errs = append(errs, fmt.Sprintf("failed to delete soft-lockup-issue-454 namespace (%v)", err))
+	}
+
+	if len(errs) > 0 {
+		return errors.New(strings.Join(errs, ", "))
+	}
+
+	ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Created = false
+	return ts.cfg.EKSConfig.Sync()
+}
+
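+// createDeployment runs the repro workload: every replica executes
+// decompressionLoopCommand in a centos:7 container, producing the sustained
+// untar I/O that the linked issue reports as triggering kernel soft lockups
+// on affected AMIs.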
+func (ts *tester) createDeployment() error {
+	var nodeSelector map[string]string
+	if len(ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentNodeSelector) > 0 {
+		nodeSelector = ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentNodeSelector
+	}
+	ts.cfg.Logger.Info("creating soft-lockup-454 Deployment", zap.Any("node-selector", nodeSelector))
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	_, err := ts.cfg.K8SClient.KubernetesClientSet().
+		AppsV1().
+		Deployments(ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace).
+		Create(
+			ctx,
+			&appsv1.Deployment{
+				TypeMeta: metav1.TypeMeta{
+					APIVersion: "apps/v1",
+					Kind:       "Deployment",
+				},
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      deploymentName,
+					Namespace: ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace,
+					Labels: map[string]string{
+						"app.kubernetes.io/name": deploymentName,
+					},
+				},
+				Spec: appsv1.DeploymentSpec{
+					Replicas: aws.Int32(ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentReplicas),
+					Selector: &metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"app.kubernetes.io/name": deploymentName,
+						},
+					},
+					Template: v1.PodTemplateSpec{
+						ObjectMeta: metav1.ObjectMeta{
+							Labels: map[string]string{
+								"app.kubernetes.io/name": deploymentName,
+							},
+						},
+						Spec: v1.PodSpec{
+							RestartPolicy: v1.RestartPolicyAlways,
+							Containers: []v1.Container{
+								{
+									Name:    deploymentName,
+									Image:   "centos:7",
+									Command: []string{"bash"},
+									Args:    []string{"-c", decompressionLoopCommand},
+								},
+							},
+							NodeSelector: nodeSelector,
+						},
+					},
+				},
+			},
+			metav1.CreateOptions{},
+		)
+	cancel()
+	if err != nil {
+		return fmt.Errorf("failed to create soft-lockup-454 Deployment (%v)", err)
+	}
+
+	ts.cfg.Logger.Info("created soft-lockup-454 Deployment")
+	return ts.cfg.EKSConfig.Sync()
+}
+
+func (ts *tester) deleteDeployment() error {
+	ts.cfg.Logger.Info("deleting soft-lockup-454 Deployment")
+	foreground := metav1.DeletePropagationForeground
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	err := ts.cfg.K8SClient.KubernetesClientSet().
+		AppsV1().
+		Deployments(ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace).
+		Delete(
+			ctx,
+			deploymentName,
+			metav1.DeleteOptions{
+				GracePeriodSeconds: aws.Int64(0),
+				PropagationPolicy:  &foreground,
+			},
+		)
+	cancel()
+	if err != nil && !apierrs.IsNotFound(err) && !strings.Contains(err.Error(), "not found") {
+		ts.cfg.Logger.Warn("failed to delete", zap.Error(err))
+		return fmt.Errorf("failed to delete soft-lockup-454 Deployment (%v)", err)
+	}
+
+	ts.cfg.Logger.Info("deleted soft-lockup-454 Deployment")
+	return ts.cfg.EKSConfig.Sync()
+}
+
+func (ts *tester) waitDeployment() (err error) {
+	timeout := 7*time.Minute + time.Duration(ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentReplicas)*time.Minute
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	_, err = k8s_client.WaitForDeploymentCompletes(
+		ctx,
+		ts.cfg.Logger,
+		ts.cfg.LogWriter,
+		ts.cfg.Stopc,
+		ts.cfg.K8SClient,
+		time.Minute,
+		20*time.Second,
+		ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace,
+		deploymentName,
+		ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentReplicas,
+		k8s_client.WithQueryFunc(func() {
+			descArgs := []string{
+				ts.cfg.EKSConfig.KubectlPath,
+				"--kubeconfig=" + ts.cfg.EKSConfig.KubeConfigPath,
+				"--namespace=" + ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.Namespace,
+				"describe",
+				"deployment",
+				deploymentName,
+			}
+			descCmd := strings.Join(descArgs, " ")
+			ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+			output, err := exec.New().CommandContext(ctx, descArgs[0], descArgs[1:]...).CombinedOutput()
+			cancel()
+			if err != nil {
+				ts.cfg.Logger.Warn("'kubectl describe deployment' failed", zap.Error(err))
+			}
+			out := string(output)
+			fmt.Fprintf(ts.cfg.LogWriter, "\n\n\"%s\" output:\n%s\n\n", descCmd, out)
+		}),
+	)
+	cancel()
+	return err
+}
+
+func (ts *tester) validateNodesStayHealthy() (err error) {
+	nodeSelector := ts.getNodeSelector()
+	start := time.Now()
+
+	for {
+		// Re-list the nodes on every pass so that the Ready condition is
+		// fresh; a single list taken before the loop would never observe
+		// a node going unhealthy.
+		nodes, err := ts.cfg.K8SClient.ListNodesWithSelector(1000, 5*time.Second, nodeSelector)
+		if err != nil {
+			ts.cfg.Logger.Warn("get nodes failed", zap.Error(err))
+			return err
+		}
+
+		for _, node := range nodes {
+			nodeName := node.GetName()
+			ts.cfg.Logger.Info("checking node-info conditions", zap.String("node-name", nodeName))
+			for _, cond := range node.Status.Conditions {
+				if cond.Type != v1.NodeReady {
+					continue
+				}
+
+				ts.cfg.Logger.Info("node info",
+					zap.String("node-name", nodeName),
+					zap.String("type", string(cond.Type)),
+					zap.String("status", string(cond.Status)),
+				)
+
+				if cond.Status != v1.ConditionTrue {
+					return fmt.Errorf("node %s went unhealthy", nodeName)
+				}
+			}
+		}
+
+		// nodeCheckWaitSeconds is a plain integer constant, so it must be
+		// multiplied by time.Second before comparing against a time.Duration;
+		// comparing against the bare constant would mean 120 nanoseconds.
+		if time.Since(start) >= nodeCheckWaitSeconds*time.Second {
+			ts.cfg.Logger.Info("all nodes stayed healthy")
+			return nil
+		}
+
+		time.Sleep(nodeCheckIntervalSeconds * time.Second)
+	}
+}
+
+func (ts *tester) getNodeSelector() string {
+	if len(ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentNodeSelector) == 0 {
+		return ""
+	}
+
+	nodeSelector := ts.cfg.EKSConfig.AddOnAmiSoftLockupIssue454.DeploymentNodeSelector
+	b := new(bytes.Buffer)
+	i := 0
+	for key, value := range nodeSelector {
+		if i != 0 {
+			fmt.Fprintf(b, ",")
+		}
+		fmt.Fprintf(b, "%s=%s", key, value)
+		i++
+	}
+	return b.String()
+}
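+
+// For reference, given the README's example selector {"amazon-ami-issue": "454"},
+// getNodeSelector returns "amazon-ami-issue=454". Go map iteration order is
+// randomized, so a multi-key selector may emit its key=value pairs in any
+// order; label-selector matching is unaffected.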
"github.com/aws/aws-k8s-tester/eks/app-mesh" "github.com/aws/aws-k8s-tester/eks/cluster" "github.com/aws/aws-k8s-tester/eks/cluster-loader/clusterloader2" @@ -813,6 +814,13 @@ func (ts *Tester) createTesters() (err error) { K8SClient: ts.k8sClient, EKSAPI: ts.eksAPI, }), + ami_soft_lockup_issue_454.New(ami_soft_lockup_issue_454.Config{ + Logger: ts.lg, + LogWriter: ts.logWriter, + Stopc: ts.stopCreationCh, + EKSConfig: ts.cfg, + K8SClient: ts.k8sClient, + }), } if serr := ts.cfg.Sync(); serr != nil { fmt.Fprintf(ts.logWriter, ts.color("[light_magenta]cfg.Sync failed [default]%v\n"), serr) diff --git a/eksconfig/add-on-ami-soft-lockup-issue-454.go b/eksconfig/add-on-ami-soft-lockup-issue-454.go new file mode 100644 index 000000000..16abcb4f3 --- /dev/null +++ b/eksconfig/add-on-ami-soft-lockup-issue-454.go @@ -0,0 +1,65 @@ +package eksconfig + +import ( + "errors" + + "github.com/aws/aws-k8s-tester/pkg/timeutil" +) + +// AddOnAmiSoftLockupIssue454 defines parameters for EKS cluster +// add-on NLB hello-world service. +type AddOnAmiSoftLockupIssue454 struct { + // Enable is 'true' to create this add-on. + Enable bool `json:"enable"` + // Created is true when the resource has been created. + // Used for delete operations. + Created bool `json:"created" read-only:"true"` + TimeFrameCreate timeutil.TimeFrame `json:"time-frame-create" read-only:"true"` + TimeFrameDelete timeutil.TimeFrame `json:"time-frame-delete" read-only:"true"` + + // Namespace is the namespace to create objects in. + Namespace string `json:"namespace"` + + // DeploymentReplicas is the number of replicas to deploy using "Deployment" object. + DeploymentReplicas int32 `json:"deployment-replicas"` + // DeploymentNodeSelector is configured to overwrite existing node selector + // for deployment. If left empty, tester sets default selector. + DeploymentNodeSelector map[string]string `json:"deployment-node-selector"` +} + +// EnvironmentVariablePrefixAddOnAmiSoftLockupIssue454 is the environment variable prefix used for "eksconfig". +const EnvironmentVariablePrefixAddOnAmiSoftLockupIssue454 = AWS_K8S_TESTER_EKS_PREFIX + "ADD_ON_AMI_SOFT_LOCKUP_ISSUE_454_" + +// IsEnabledAddOnAmiSoftLockupIssue454 returns true if "AddOnAmiSoftLockupIssue454" is enabled. +// Otherwise, nil the field for "omitempty". +func (cfg *Config) IsEnabledAddOnAmiSoftLockupIssue454() bool { + if cfg.AddOnAmiSoftLockupIssue454 == nil { + return false + } + if cfg.AddOnAmiSoftLockupIssue454.Enable { + return true + } + cfg.AddOnAmiSoftLockupIssue454 = nil + return false +} + +func getDefaultAddOnAmiSoftLockupIssue454() *AddOnAmiSoftLockupIssue454 { + return &AddOnAmiSoftLockupIssue454{ + Enable: false, + DeploymentReplicas: 8, + DeploymentNodeSelector: make(map[string]string), + } +} + +func (cfg *Config) validateAddOnAmiSoftLockupIssue454() error { + if !cfg.IsEnabledAddOnAmiSoftLockupIssue454() { + return nil + } + if !cfg.IsEnabledAddOnNodeGroups() && !cfg.IsEnabledAddOnManagedNodeGroups() { + return errors.New("AddOnAmiSoftLockupIssue454.Enable true but no node group is enabled") + } + if cfg.AddOnAmiSoftLockupIssue454.Namespace == "" { + cfg.AddOnAmiSoftLockupIssue454.Namespace = cfg.Name + "-ami-soft-lockup-issue-454" + } + return nil +} diff --git a/eksconfig/config.go b/eksconfig/config.go index 4814ebac7..e2925c1cf 100644 --- a/eksconfig/config.go +++ b/eksconfig/config.go @@ -343,6 +343,10 @@ type Config struct { // for EKS cluster version upgrade add-on. 
diff --git a/eksconfig/config.go b/eksconfig/config.go
index 4814ebac7..e2925c1cf 100644
--- a/eksconfig/config.go
+++ b/eksconfig/config.go
@@ -343,6 +343,10 @@ type Config struct {
 	// AddOnClusterVersionUpgrade defines parameters
 	// for EKS cluster version upgrade add-on.
 	AddOnClusterVersionUpgrade *AddOnClusterVersionUpgrade `json:"add-on-cluster-version-upgrade,omitempty"`
 
+	// AddOnAmiSoftLockupIssue454 defines parameters
+	// for testing the AMI soft lockup issue.
+	AddOnAmiSoftLockupIssue454 *AddOnAmiSoftLockupIssue454 `json:"add-on-ami-soft-lockup-issue-454,omitempty"`
+
 	// Spec contains addons and other configuration
 	// Note: New addons should be implemented inside spec
 	Spec Spec `json:"spec,omitempty"`
@@ -867,6 +871,7 @@ func NewDefault() *Config {
 		AddOnStresserLocal:         getDefaultAddOnStresserLocal(),
 		AddOnStresserRemote:        getDefaultAddOnStresserRemote(),
 		AddOnClusterVersionUpgrade: getDefaultAddOnClusterVersionUpgrade(),
+		AddOnAmiSoftLockupIssue454: getDefaultAddOnAmiSoftLockupIssue454(),
 
 		// read-only
 		Status: &Status{
@@ -1074,6 +1079,10 @@ func (cfg *Config) ValidateAndSetDefaults() error {
 		return fmt.Errorf("validateAddOnClusterVersionUpgrade failed [%v]", err)
 	}
 
+	if err := cfg.validateAddOnAmiSoftLockupIssue454(); err != nil {
+		return fmt.Errorf("validateAddOnAmiSoftLockupIssue454 failed [%v]", err)
+	}
+
 	return nil
 }

diff --git a/eksconfig/env.go b/eksconfig/env.go
index 77f515801..798407844 100644
--- a/eksconfig/env.go
+++ b/eksconfig/env.go
@@ -554,6 +554,22 @@ func (cfg *Config) UpdateFromEnvs() (err error) {
 		return fmt.Errorf("expected *AddOnClusterVersionUpgrade, got %T", vv)
 	}
 
+	if cfg.AddOnAmiSoftLockupIssue454 == nil {
+		cfg.AddOnAmiSoftLockupIssue454 = &AddOnAmiSoftLockupIssue454{}
+	}
+	vv, err = parseEnvs(EnvironmentVariablePrefixAddOnAmiSoftLockupIssue454, cfg.AddOnAmiSoftLockupIssue454)
+	if err != nil {
+		return err
+	}
+	if av, ok := vv.(*AddOnAmiSoftLockupIssue454); ok {
+		cfg.AddOnAmiSoftLockupIssue454 = av
+	} else {
+		return fmt.Errorf("expected *AddOnAmiSoftLockupIssue454, got %T", vv)
+	}
+
 	return nil
 }