Skip to content

Commit

Permalink
test: add REBOOT_CONTROL_PLANE_NODES E2E config (Azure#3745)
Browse files Browse the repository at this point in the history
  • Loading branch information
jackfrancis authored and penggu committed Oct 28, 2020
1 parent 4bed1df commit b54db61
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 43 deletions.
61 changes: 31 additions & 30 deletions test/e2e/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,36 +25,37 @@ import (

// Config holds global test configuration
type Config struct {
SkipTest bool `envconfig:"SKIP_TEST" default:"false"`
SkipLogsCollection bool `envconfig:"SKIP_LOGS_COLLECTION" default:"true"`
Orchestrator string `envconfig:"ORCHESTRATOR" default:"kubernetes"`
Name string `envconfig:"NAME" default:""` // Name allows you to set the name of a cluster already created
Location string `envconfig:"LOCATION" default:""` // Location where you want to create the cluster
Regions []string `envconfig:"REGIONS" default:""` // A list of regions to instruct the runner to randomly choose when provisioning IaaS
ClusterDefinition string `envconfig:"CLUSTER_DEFINITION" required:"true" default:"examples/kubernetes.json"` // ClusterDefinition is the path on disk to the json template these are normally located in examples/
CleanUpOnExit bool `envconfig:"CLEANUP_ON_EXIT" default:"false"` // if true the tests will clean up rgs when tests finish
CleanUpIfFail bool `envconfig:"CLEANUP_IF_FAIL" default:"false"`
RetainSSH bool `envconfig:"RETAIN_SSH" default:"true"`
StabilityIterations int `envconfig:"STABILITY_ITERATIONS" default:"3"`
ClusterInitPodName string `envconfig:"CLUSTER_INIT_POD_NAME" default:""`
ClusterInitJobName string `envconfig:"CLUSTER_INIT_JOB_NAME" default:""`
Timeout time.Duration `envconfig:"TIMEOUT" default:"20m"`
LBTimeout time.Duration `envconfig:"LB_TIMEOUT" default:"20m"`
CurrentWorkingDir string
ResourceGroup string `envconfig:"RESOURCE_GROUP" default:""`
SoakClusterName string `envconfig:"SOAK_CLUSTER_NAME" default:""`
ForceDeploy bool `envconfig:"FORCE_DEPLOY" default:"false"`
UseDeployCommand bool `envconfig:"USE_DEPLOY_COMMAND" default:"false"`
GinkgoFocus string `envconfig:"GINKGO_FOCUS" default:""`
GinkgoSkip string `envconfig:"GINKGO_SKIP" default:""`
GinkgoFailFast bool `envconfig:"GINKGO_FAIL_FAST" default:"false"`
DebugAfterSuite bool `envconfig:"DEBUG_AFTERSUITE" default:"false"`
BlockSSHPort bool `envconfig:"BLOCK_SSH" default:"false"`
AddNodePoolInput string `envconfig:"ADD_NODE_POOL_INPUT" default:""`
TestPVC bool `envconfig:"TEST_PVC" default:"false"`
SubscriptionID string `envconfig:"SUBSCRIPTION_ID"`
ClientID string `envconfig:"CLIENT_ID"`
ClientSecret string `envconfig:"CLIENT_SECRET"`
SkipTest bool `envconfig:"SKIP_TEST" default:"false"`
SkipLogsCollection bool `envconfig:"SKIP_LOGS_COLLECTION" default:"true"`
Orchestrator string `envconfig:"ORCHESTRATOR" default:"kubernetes"`
Name string `envconfig:"NAME" default:""` // Name allows you to set the name of a cluster already created
Location string `envconfig:"LOCATION" default:""` // Location where you want to create the cluster
Regions []string `envconfig:"REGIONS" default:""` // A list of regions to instruct the runner to randomly choose when provisioning IaaS
ClusterDefinition string `envconfig:"CLUSTER_DEFINITION" required:"true" default:"examples/kubernetes.json"` // ClusterDefinition is the path on disk to the json template these are normally located in examples/
CleanUpOnExit bool `envconfig:"CLEANUP_ON_EXIT" default:"false"` // if true the tests will clean up rgs when tests finish
CleanUpIfFail bool `envconfig:"CLEANUP_IF_FAIL" default:"false"`
RetainSSH bool `envconfig:"RETAIN_SSH" default:"true"`
StabilityIterations int `envconfig:"STABILITY_ITERATIONS" default:"3"`
ClusterInitPodName string `envconfig:"CLUSTER_INIT_POD_NAME" default:""`
ClusterInitJobName string `envconfig:"CLUSTER_INIT_JOB_NAME" default:""`
Timeout time.Duration `envconfig:"TIMEOUT" default:"20m"`
LBTimeout time.Duration `envconfig:"LB_TIMEOUT" default:"20m"`
CurrentWorkingDir string
ResourceGroup string `envconfig:"RESOURCE_GROUP" default:""`
SoakClusterName string `envconfig:"SOAK_CLUSTER_NAME" default:""`
ForceDeploy bool `envconfig:"FORCE_DEPLOY" default:"false"`
UseDeployCommand bool `envconfig:"USE_DEPLOY_COMMAND" default:"false"`
GinkgoFocus string `envconfig:"GINKGO_FOCUS" default:""`
GinkgoSkip string `envconfig:"GINKGO_SKIP" default:""`
GinkgoFailFast bool `envconfig:"GINKGO_FAIL_FAST" default:"false"`
DebugAfterSuite bool `envconfig:"DEBUG_AFTERSUITE" default:"false"`
BlockSSHPort bool `envconfig:"BLOCK_SSH" default:"false"`
RebootControlPlaneNodes bool `envconfig:"REBOOT_CONTROL_PLANE_NODES" default:"false"`
AddNodePoolInput string `envconfig:"ADD_NODE_POOL_INPUT" default:""`
TestPVC bool `envconfig:"TEST_PVC" default:"false"`
SubscriptionID string `envconfig:"SUBSCRIPTION_ID"`
ClientID string `envconfig:"CLIENT_ID"`
ClientSecret string `envconfig:"CLIENT_SECRET"`
*ArcOnboardingConfig
}

Expand Down
41 changes: 29 additions & 12 deletions test/e2e/kubernetes/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ var _ = BeforeSuite(func() {
kubeConfig, getKubeConfigError = GetConfigWithRetry(3*time.Second, cfg.Timeout)
Expect(getKubeConfigError).NotTo(HaveOccurred())

// A control plane reboot run forces the "SSH port blocked" code path and sets
// the stability iteration count to zero.
// NOTE(review): presumably because nodes going down mid-run would make the
// SSH-based setup and the stability loops unreliable — confirm.
if cfg.RebootControlPlaneNodes {
cfg.BlockSSHPort = true
cfg.StabilityIterations = 0
}

if !cfg.BlockSSHPort {
var err error
masterName := masterNodes[0].Metadata.Name
Expand Down Expand Up @@ -682,7 +687,7 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu

It("should have node labels and annotations added by E2E test runner", func() {
if !eng.ExpandedDefinition.Properties.HasNonRegularPriorityScaleset() &&
cfg.AddNodePoolInput == "" {
cfg.AddNodePoolInput == "" && !cfg.RebootControlPlaneNodes {
totalNodeCount := eng.NodeCount()
nodes := totalNodeCount - len(masterNodes)
nodeList, err := node.GetByLabel("foo")
Expand Down Expand Up @@ -903,6 +908,16 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu
Expect(running).To(Equal(true))
})

It("should be able to run a node reboot daemonset", func() {
	// This spec is a no-op unless the run opted in to rebooting control plane nodes.
	if !cfg.RebootControlPlaneNodes {
		return
	}

	// Deploy the reboot daemonset from the workloads directory.
	manifest := filepath.Join(WorkloadDir, "reboot-control-plane-node.yaml")
	_, createErr := daemonset.CreateDaemonsetFromFileWithRetry(manifest, "reboot-test", "default", 5*time.Second, cfg.Timeout)
	Expect(createErr).NotTo(HaveOccurred())

	// At least one running pod labeled app=reboot-test must be found.
	rebootPods, lookupErr := pod.GetAllRunningByLabelWithRetry("app", "reboot-test", "default", 5*time.Second, cfg.Timeout)
	Expect(lookupErr).NotTo(HaveOccurred())
	Expect(rebootPods).NotTo(BeEmpty())
})

It("should be able to launch a long running HTTP listener and svc endpoint", func() {
By("Creating a php-apache deployment")
phpApacheDeploy, err := deployment.CreateLinuxDeployIfNotExist("deis/hpa-example", longRunningApacheDeploymentName, "default", "", "", 3*time.Second, cfg.Timeout)
Expand Down Expand Up @@ -2353,18 +2368,20 @@ var _ = Describe("Azure Container Cluster using the Kubernetes Orchestrator", fu

Describe("after the cluster has been up for a while", func() {
It("dns-liveness pod should not have any restarts", func() {
	// The restart check is skipped entirely for control plane reboot runs.
	// NOTE(review): presumably because rebooting nodes can legitimately
	// restart the pod — confirm.
	if cfg.RebootControlPlaneNodes {
		return
	}

	// Use a local name that does not shadow the pod package.
	dnsPod, err := pod.Get("dns-liveness", "default", podLookupRetries)
	Expect(err).NotTo(HaveOccurred())
	running, err := dnsPod.WaitOnReady(sleepBetweenRetriesWhenWaitingForPodReady, 3*time.Minute)
	Expect(err).NotTo(HaveOccurred())
	Expect(running).To(Equal(true))

	// Fail with a clear assertion instead of an index-out-of-range panic
	// when no container status has been recorded on the fetched pod.
	Expect(dnsPod.Status.ContainerStatuses).NotTo(BeEmpty())
	restarts := dnsPod.Status.ContainerStatuses[0].RestartCount

	if cfg.SoakClusterName == "" {
		// Non-soak runs delete the pod and require that it never restarted.
		err = dnsPod.Delete(util.DefaultDeleteRetries)
		Expect(err).NotTo(HaveOccurred())
		Expect(restarts).To(Equal(0))
	} else {
		// Soak clusters keep the pod and only report the restart count.
		log.Printf("%d DNS livenessProbe restarts since this cluster was created...\n", restarts)
	}
})

Expand Down
111 changes: 111 additions & 0 deletions test/e2e/kubernetes/workloads/reboot-control-plane-node.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# How this works:
# On every master, a pod runs that tries to acquire an annotation lock.
# Once it holds the lock for its node, it releases it, sleeps for a bit, and
# then loops until it acquires the lock again.
# After acquiring it the second time, it triggers a hard reboot of the node
# (hard because reboot is invoked with --force, or optionally --force --force).
# When the node (master) comes back up, it runs this pod again, which sees
# that it already holds the annotation and releases it, letting
# another node race to acquire it.
# Works with Ubuntu 16.04 and 18.04 on Kubernetes 1.15+ (may work on earlier versions).
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: reboot-test
  name: reboot-test
  namespace: default
spec:
  selector:
    matchLabels:
      app: reboot-test
  template:
    metadata:
      labels:
        app: reboot-test
    spec:
      # This restricts the pods to master nodes.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.azure.com/role
                operator: In
                values:
                - master
      containers:
      - name: reboot-test
        image: ubuntu:18.04
        imagePullPolicy: IfNotPresent
        # The script below:
        #   1. runs `kubectl annotate` to set ${LOCK_NAME}=${NODE_ID} on namespace
        #      ${LOCK_NS}, retrying every ${CHECK_FREQUENCY} until the command
        #      output contains "(${NODE_ID})", which it treats as holding the lock;
        #   2. removes the annotation, sleeps ${CHECK_DELAY}, and acquires the
        #      lock a second time in the same way;
        #   3. writes a cron entry into the host's /etc/cron.d (mounted below) that
        #      deletes itself and runs /sbin/reboot ${FORCE}, then waits for the
        #      reboot to happen.
        command:
        - bash
        - -c
        - >-
          while [[ $(kubectl --request-timeout 30s annotate namespace ${LOCK_NS} ${LOCK_NAME}=${NODE_ID} 2>&1) != *\(${NODE_ID}\)* ]];
          do sleep ${CHECK_FREQUENCY}; done;
          kubectl --request-timeout 30s annotate namespace ${LOCK_NS} ${LOCK_NAME}-;
          sleep ${CHECK_DELAY};
          while [[ $(kubectl --request-timeout 30s annotate namespace ${LOCK_NS} ${LOCK_NAME}=${NODE_ID} 2>&1) != *\(${NODE_ID}\)* ]];
          do sleep ${CHECK_FREQUENCY}; done;
          echo >/etc/cron.d/reboot-test "* * * * * root /bin/bash -c '/bin/rm -f /etc/cron.d/reboot-test; /sbin/reboot ${FORCE}' >/var/log/RebootTest.log 2>&1";
          echo "Waiting for the reboot to happen...";
          sleep 1000
        env:
        # kubectl (mounted from the host) reads the host kubelet's kubeconfig.
        - name: KUBECONFIG
          value: /.kubeconfig
        # The node name doubles as the lock owner ID.
        - name: NODE_ID
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        - name: LOCK_NS
          value: default
        - name: LOCK_NAME
          value: RebootTest
        # CHECK_DELAY should be larger than CHECK_FREQUENCY to randomize who reboots
        - name: CHECK_DELAY
          value: 60s
        - name: CHECK_FREQUENCY
          value: 30s
        - name: FORCE
          value: --force
          # For an even harder reboot that does not kill processes first, use double force:
          # value: --force --force
        volumeMounts:
        - name: node-crond
          mountPath: /etc/cron.d
        - name: kubectl
          mountPath: /usr/local/bin/kubectl
          readOnly: true
        - name: kubeconfig
          mountPath: /.kubeconfig
          readOnly: true
        - name: certs
          mountPath: /etc/kubernetes/certs
          readOnly: true
      dnsPolicy: ClusterFirst
      nodeSelector:
        # kubernetes.io/os replaces the deprecated beta.kubernetes.io/os label
        # (the kubelet sets it from Kubernetes v1.14 onwards).
        kubernetes.io/os: linux
      restartPolicy: Always
      terminationGracePeriodSeconds: 0
      # Tolerate every NoSchedule taint so the pods can land on tainted masters.
      tolerations:
      - effect: NoSchedule
        operator: Exists
      volumes:
      # Host paths: the cron drop-in directory, cluster certs, the kubelet's
      # kubeconfig, and the host's kubectl binary.
      - hostPath:
          path: /etc/cron.d
        name: node-crond
      - hostPath:
          path: /etc/kubernetes/certs
        name: certs
      - hostPath:
          path: /var/lib/kubelet/kubeconfig
        name: kubeconfig
      - hostPath:
          path: /usr/local/bin/kubectl
        name: kubectl
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
2 changes: 1 addition & 1 deletion test/e2e/runner/cli_provisioner.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ func (cli *CLIProvisioner) waitForNodes() error {
if !cli.IsPrivate() {
log.Println("Waiting on nodes to go into ready state...")
var expectedReadyNodes int
if !cli.Engine.ExpandedDefinition.Properties.HasNonRegularPriorityScaleset() {
if !cli.Engine.ExpandedDefinition.Properties.HasNonRegularPriorityScaleset() && !cli.Config.RebootControlPlaneNodes {
expectedReadyNodes = cli.Engine.NodeCount()
log.Printf("Checking for %d Ready nodes\n", expectedReadyNodes)
} else {
Expand Down

0 comments on commit b54db61

Please sign in to comment.