Add security group pods scale test in ginkgo #457

Merged · 3 commits · Sep 4, 2024
147 changes: 147 additions & 0 deletions scripts/test/create-cluster-karpenter.sh
@@ -0,0 +1,147 @@
#!/usr/bin/env bash

# Create EKS cluster with Karpenter using eksctl
set -eo pipefail

SCRIPTS_DIR=$(cd "$(dirname "$0")" || exit 1; pwd)
source "$SCRIPTS_DIR/lib/common.sh"
check_is_installed eksctl
check_is_installed helm
check_is_installed aws


export KARPENTER_NAMESPACE="kube-system"
export KARPENTER_VERSION="1.0.1"
export K8S_VERSION="1.30"

export AWS_PARTITION="aws" # if you are not using the standard partition, you may need to set this to aws-cn / aws-us-gov
export CLUSTER_NAME="${USER}-sgp-scaletest"
export AWS_DEFAULT_REGION="us-west-2"
export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
export TEMPOUT="$(mktemp)"

# Deploy CFN stack to enable Karpenter to create and manage nodes
echo "Deploying Karpenter CFN stack"
curl -fsSL https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > "${TEMPOUT}" \
&& aws cloudformation deploy \
--stack-name "Karpenter-${CLUSTER_NAME}" \
--template-file "${TEMPOUT}" \
--capabilities CAPABILITY_NAMED_IAM \
--parameter-overrides "ClusterName=${CLUSTER_NAME}"

# Create EKS cluster
echo "Creating EKS cluster"
eksctl create cluster -f - <<EOF
---
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: ${CLUSTER_NAME}
  region: ${AWS_DEFAULT_REGION}
  version: "${K8S_VERSION}"
  tags:
    karpenter.sh/discovery: ${CLUSTER_NAME}

iam:
  withOIDC: true
  podIdentityAssociations:
  - namespace: "${KARPENTER_NAMESPACE}"
    serviceAccountName: karpenter
    roleName: ${CLUSTER_NAME}-karpenter
    permissionPolicyARNs:
    - arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}

iamIdentityMappings:
- arn: "arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/KarpenterNodeRole-${CLUSTER_NAME}"
  username: system:node:{{EC2PrivateDNSName}}
  groups:
  - system:bootstrappers
  - system:nodes

managedNodeGroups:
- instanceType: c5.xlarge
  amiFamily: AmazonLinux2
  name: ${CLUSTER_NAME}-ng
  desiredCapacity: 2
  minSize: 1
  maxSize: 10

addons:
- name: eks-pod-identity-agent
EOF

export CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --query "cluster.endpoint" --output text)"
export KARPENTER_IAM_ROLE_ARN="arn:${AWS_PARTITION}:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter"

# Log out of ECR Public registry and perform unauthenticated image pull
docker logout public.ecr.aws
helm registry logout public.ecr.aws
# Install Karpenter
echo "Installing Karpenter"
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version "${KARPENTER_VERSION}" --namespace "${KARPENTER_NAMESPACE}" --create-namespace \
--set "settings.clusterName=${CLUSTER_NAME}" \
--set "settings.interruptionQueue=${CLUSTER_NAME}" \
--set controller.resources.requests.cpu=1 \
--set controller.resources.requests.memory=1Gi \
--set controller.resources.limits.cpu=1 \
--set controller.resources.limits.memory=1Gi \
--wait

# Create NodePool and EC2NodeClass.
# NodePool sets constraints on the nodes that can be created by Karpenter and the pods that can run on those nodes.
# EC2NodeClass is used to configure AWS-specific settings like AMI type, AMI ID, EC2 security groups, etc.

cat <<EOF | envsubst | kubectl apply -f -
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: default
spec:
  template:
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["amd64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand"]
        - key: karpenter.k8s.aws/instance-category
          operator: In
          values: ["c"]
        - key: karpenter.k8s.aws/instance-generation
          operator: Gt
          values: ["2"]
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: default
      expireAfter: 720h
  limits:
    cpu: 1000
  disruption:
    consolidationPolicy: WhenEmptyOrUnderutilized
    consolidateAfter: 1m
---
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: AL2
  role: "KarpenterNodeRole-${CLUSTER_NAME}"
  subnetSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
  securityGroupSelectorTerms:
    - tags:
        karpenter.sh/discovery: "${CLUSTER_NAME}"
  amiSelectorTerms:
    - alias: al2@latest
EOF

echo "Enabling security group for pods on cluster"
kubectl set env daemonset aws-node -n kube-system ENABLE_POD_ENI=true
23 changes: 23 additions & 0 deletions scripts/test/delete-cluster-karpenter.sh
@@ -0,0 +1,23 @@
#!/usr/bin/env bash

# Delete the EKS cluster & related resources created by create-cluster-karpenter.sh
set -eo pipefail

SCRIPTS_DIR=$(cd "$(dirname "$0")" || exit 1; pwd)
source "$SCRIPTS_DIR/lib/common.sh"
check_is_installed helm
check_is_installed eksctl
check_is_installed jq
check_is_installed aws

export KARPENTER_NAMESPACE="kube-system"
export CLUSTER_NAME="${USER}-sgp-scaletest" # Update cluster name if it is different
echo "Uninstalling Karpenter"
helm uninstall karpenter --namespace "${KARPENTER_NAMESPACE}"
echo "Deleting Karpenter CFN stack"
aws cloudformation delete-stack --stack-name "Karpenter-${CLUSTER_NAME}"
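# Delete the launch templates Karpenter created for this cluster (tagged with karpenter.k8s.aws/cluster)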
aws ec2 describe-launch-templates --filters "Name=tag:karpenter.k8s.aws/cluster,Values=${CLUSTER_NAME}" |
    jq -r ".LaunchTemplates[].LaunchTemplateName" |
    xargs -I{} aws ec2 delete-launch-template --launch-template-name {}
echo "Deleting EKS cluster"
eksctl delete cluster --name "${CLUSTER_NAME}"
43 changes: 39 additions & 4 deletions test/README.md
@@ -82,7 +82,42 @@ The Integration test suite provides the following focuses.

This is intended for local development, testing, and CI setup. For more details, refer to the steps provided in `scripts/test/README.md`.

### Future Work
- Once we have more test suites, we can provide a script instead of invoking each suite manually.
- Add Windows tests to the list once support is enabled.
- Move the script-based tests in `integration-test` to Ginkgo-based integration/e2e tests.
### Running Scale Tests

#### Test Pod startup latency
For each release, verify that pod startup latency is comparable to the previous release. This helps detect regressions that impact controller performance in the new release.

To run the test manually:

##### 1. Create EKS cluster and install Karpenter.

Karpenter provides node lifecycle management for Kubernetes clusters. It automates provisioning and deprovisioning of nodes based on the scheduling needs of pods, allowing efficient scaling and cost optimization.

The script provisions all resources required for the test:
1. Deploy the Karpenter CFN stack, which creates the resources Karpenter needs to create and manage nodes
2. Create the EKS cluster using eksctl
3. Install Karpenter on the cluster via Helm
4. Deploy the default NodePool and EC2NodeClass. The NodePool sets constraints on the nodes Karpenter can create and on the pods that can run on those nodes. The EC2NodeClass configures AWS-specific settings such as the AMI type, AMI ID, and EC2 security groups.

Refer to the Karpenter documentation for further details.
```
./scripts/test/create-cluster-karpenter.sh
```
The scripts are located in the `scripts/test` directory.
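
Before running the tests, it can help to confirm the cluster is ready. The checks below are a minimal sketch; they assume the names used by `create-cluster-karpenter.sh` above (Karpenter in `kube-system`, a `default` NodePool and EC2NodeClass, and `ENABLE_POD_ENI=true` on `aws-node`):
```
# Karpenter controller pods should be running in kube-system
kubectl get pods -n kube-system -l app.kubernetes.io/name=karpenter

# The default NodePool and EC2NodeClass created by the script should exist
kubectl get nodepools,ec2nodeclasses

# aws-node should have ENABLE_POD_ENI=true so trunk ENIs are attached for security groups for pods
kubectl describe daemonset aws-node -n kube-system | grep ENABLE_POD_ENI
```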

##### 2. Run the scale tests.

The scale tests are located in the `test/integration/scale` directory. The test creates a deployment with 1000 pods and measures pod startup latency, asserting that all 1000 pods become ready within 5 minutes. The test is run three times in a row and must pass each time.
```
KUBE_CONFIG_PATH=<path-to-kube-config> # Update the kube-config path
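# The variables below are assumed placeholders; set them to match the cluster created above
CLUSTER_NAME=<cluster-name>   # e.g. the ${USER}-sgp-scaletest name used by create-cluster-karpenter.sh
AWS_REGION=<aws-region>       # e.g. us-west-2, the region used by the create script
VPC_ID=<vpc-id>               # e.g. from: aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.resourcesVpcConfig.vpcId" --output text
cd test/integration/scale     # run ginkgo from the scale suite directory (see above)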
ginkgo -v --timeout 30m -- --cluster-kubeconfig=$KUBE_CONFIG_PATH --cluster-name=$CLUSTER_NAME --aws-region=$AWS_REGION --aws-vpc-id=$VPC_ID
```

##### 3. Delete EKS cluster and other resources.

The script below uninstalls Karpenter from the cluster, deletes the Karpenter CFN stack, and finally deletes the EKS cluster.
```
./scripts/test/delete-cluster-karpenter.sh
```
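
To verify the teardown, a quick check (a sketch; the stack and cluster names follow the conventions used by the scripts above, with `us-west-2` as the default region):
```
# The EKS cluster should no longer be listed
eksctl get cluster --region us-west-2

# describe-stacks returns a ValidationError once the Karpenter CFN stack has been fully deleted
aws cloudformation describe-stacks --stack-name "Karpenter-${USER}-sgp-scaletest" --region us-west-2
```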

References:
1. Karpenter Getting Started Guide: https://karpenter.sh/docs/getting-started/getting-started-with-karpenter/
17 changes: 17 additions & 0 deletions test/framework/resource/aws/ec2/manager.go
@@ -280,3 +280,20 @@ func (d *Manager) DeleteNetworkInterface(nwInterfaceID string) error {
    })
    return err
}

func (d *Manager) ReCreateSG(securityGroupName string, ctx context.Context) (string, error) {
    groupID, err := d.GetSecurityGroupID(securityGroupName)
    // If the security group already exists, no error will be returned.
    // We need to delete the security group in this case so ingress/egress
    // rules from the last run don't interfere with the current test.
    if err == nil {
        if err = d.DeleteSecurityGroup(ctx, groupID); err != nil {
            return "", err
        }
    }
    // If err is not nil, the security group doesn't exist, so we create a new one.
    if groupID, err = d.CreateSecurityGroup(securityGroupName); err != nil {
        return "", err
    }
    return groupID, nil
}
1 change: 1 addition & 0 deletions test/framework/utils/resource.go
@@ -15,4 +15,5 @@ package utils

const (
    ResourceNamePrefix = "vpc-resource-controller-integration-"
    TestNameSpace      = "test-ns"
)
25 changes: 4 additions & 21 deletions test/integration/perpodsg/perpodsg_suite_test.go
@@ -47,8 +47,10 @@ var _ = BeforeSuite(func() {
    ctx = context.Background()
    verify = verifier.NewPodVerification(frameWork, ctx)

    securityGroupID1 = reCreateSGIfAlreadyExists(utils.ResourceNamePrefix + "sg-1")
    securityGroupID2 = reCreateSGIfAlreadyExists(utils.ResourceNamePrefix + "sg-2")
    securityGroupID1, err = frameWork.EC2Manager.ReCreateSG(utils.ResourceNamePrefix+"sg-1", ctx)
    Expect(err).ToNot(HaveOccurred())
    securityGroupID2, err = frameWork.EC2Manager.ReCreateSG(utils.ResourceNamePrefix+"sg-2", ctx)
    Expect(err).ToNot(HaveOccurred())

    nodeList = node.GetNodeAndWaitTillCapacityPresent(frameWork.NodeManager, "linux",
        config.ResourceNamePodENI)
@@ -60,22 +62,3 @@ var _ = AfterSuite(func() {
    Expect(frameWork.EC2Manager.DeleteSecurityGroup(ctx, securityGroupID1)).To(Succeed())
    Expect(frameWork.EC2Manager.DeleteSecurityGroup(ctx, securityGroupID2)).To(Succeed())
})

func reCreateSGIfAlreadyExists(securityGroupName string) string {
    groupID, err := frameWork.EC2Manager.GetSecurityGroupID(securityGroupName)
    // If the security group already exists, no error will be returned
    // We need to delete the security Group in this case so ingres/egress
    // rules from last run don't interfere with the current test
    if err == nil {
        By("deleting the older security group" + groupID)
        err = frameWork.EC2Manager.DeleteSecurityGroup(ctx, groupID)
        Expect(err).ToNot(HaveOccurred())
    }
    // If error is not nil, then the Security Group doesn't exists, we need
    // to create new rule
    By("creating a new security group with name " + securityGroupName)
    groupID, err = frameWork.EC2Manager.CreateSecurityGroup(securityGroupName)
    Expect(err).ToNot(HaveOccurred())

    return groupID
}
89 changes: 89 additions & 0 deletions test/integration/scale/pod_scale_test.go
@@ -0,0 +1,89 @@
// Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may
// not use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
// express or implied. See the License for the specific language governing
// permissions and limitations under the License.

package scale_test

import (
    "time"

    "github.com/aws/amazon-vpc-resource-controller-k8s/apis/vpcresources/v1beta1"
    "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/manifest"
    deploymentWrapper "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/resource/k8s/deployment"
    sgpWrapper "github.com/aws/amazon-vpc-resource-controller-k8s/test/framework/resource/k8s/sgp"
    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
    v1 "k8s.io/api/apps/v1"
)

var _ = Describe("Security group per pod scale test", func() {
    var (
        sgpLabelKey         string
        sgpLabelValue       string
        securityGroups      []string
        securityGroupPolicy *v1beta1.SecurityGroupPolicy
        err                 error
    )

    BeforeEach(func() {
        sgpLabelKey = "role"
        sgpLabelValue = "db"
        securityGroups = []string{securityGroupID}
    })

    JustBeforeEach(func() {
        // create SGP
        securityGroupPolicy, err = manifest.NewSGPBuilder().
            Namespace(namespace).
            PodMatchLabel(sgpLabelKey, sgpLabelValue).
            SecurityGroup(securityGroups).Build()
        Expect(err).NotTo(HaveOccurred())
    })

    JustAfterEach(func() {
        By("deleting security group policy")
        err = frameWork.SGPManager.DeleteAndWaitTillSecurityGroupIsDeleted(ctx, securityGroupPolicy)
        Expect(err).NotTo(HaveOccurred())
    })

    Describe("creating deployment", func() {
        var deployment *v1.Deployment

        JustBeforeEach(func() {
            deployment = manifest.NewDefaultDeploymentBuilder().
                Namespace(namespace).
                Replicas(1000).
                PodLabel(sgpLabelKey, sgpLabelValue).Build()
        })

        JustAfterEach(func() {
            By("deleting the deployment")
            err = frameWork.DeploymentManager.DeleteAndWaitUntilDeploymentDeleted(ctx, deployment)
            Expect(err).ToNot(HaveOccurred())
            time.Sleep(time.Minute) // allow time for pods to terminate
        })

        Context("when deployment is created", func() {
            It("should have all the pods running", MustPassRepeatedly(3), func() {
                start := time.Now()
                sgpWrapper.CreateSecurityGroupPolicy(frameWork.K8sClient, ctx, securityGroupPolicy)
                deploymentWrapper.
                    CreateAndWaitForDeploymentToStart(frameWork.DeploymentManager, ctx, deployment)
                duration := time.Since(start)
                verify.VerifyNetworkingOfAllPodUsingENI(namespace, sgpLabelKey, sgpLabelValue,
                    securityGroups)
                Expect(duration.Minutes()).To(BeNumerically("<", 5.0))
            })
        })
    })

})