From 8595420e9f0b64cce2e95fdf9a1f2ee92954ce2c Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 20 Nov 2020 15:16:56 -0800 Subject: [PATCH] WIP debug migration --- hack/.gitignore | 1 + hack/run-e2e-test | 305 +++++++++++++++++++++++++-------------------- hack/utils/helm.sh | 22 ++-- hack/utils/kops.sh | 15 +++ 4 files changed, 198 insertions(+), 145 deletions(-) create mode 100644 hack/.gitignore create mode 100644 hack/utils/kops.sh diff --git a/hack/.gitignore b/hack/.gitignore new file mode 100644 index 0000000000..3cc5e8f053 --- /dev/null +++ b/hack/.gitignore @@ -0,0 +1 @@ +ebs-e2e-test/ diff --git a/hack/run-e2e-test b/hack/run-e2e-test index c82169fc47..a082eb6bbe 100755 --- a/hack/run-e2e-test +++ b/hack/run-e2e-test @@ -16,151 +16,192 @@ set -euo pipefail -OS_ARCH=$(go env GOOS)-amd64 -TEST_ID=$RANDOM -CLUSTER_NAME=test-cluster-$TEST_ID -TEST_DIR=/tmp/ebs-e2e-test -BASE_DIR=$(dirname $0) -REGION=${AWS_REGION-us-west-2} -ZONES=${AWS_AVAILABILITY_ZONES-us-west-2a,us-west-2b,us-west-2c} -FOCUS=${GINKGO_FOCUS-"[ebs-csi-e2e]"} -NODES=${GINKGO_NODES:-4} -K8S_VERSION=${K8S_VERSION-1.18.10} -INSTANCE_TYPE=${INSTANCE_TYPE-c4.large} - -source $(dirname "${BASH_SOURCE}")/utils/helm.sh - -echo "Testing in region: $REGION and zones: $ZONES" +TEST_ID=${TEST_ID:-$RANDOM} +CLUSTER_NAME=test-cluster-${TEST_ID} -KOPS_DOWNLOAD_URL=https://github.com/kubernetes/kops/releases/download/v1.18.2/kops-$OS_ARCH -KOPS_PATH=$TEST_DIR/kops -KOPS_STATE_FILE=s3://k8s-kops-csi-e2e +BASE=$(realpath "${BASH_SOURCE[0]}") +BASE_DIR=$(dirname "${BASE}") +TEST_DIR=${BASE_DIR}/ebs-e2e-test +BIN_DIR=${TEST_DIR}/bin +SSH_KEY_PATH=${TEST_DIR}/id_rsa -# Download kops if not yet -if [[ ! -e $KOPS_PATH ]]; then - mkdir -p $TEST_DIR - echo "Downloading KOPS from $KOPS_DOWNLOAD_URL to $KOPS_PATH" - curl -L -X GET $KOPS_DOWNLOAD_URL -o $KOPS_PATH +REGION=${AWS_REGION-us-west-2} +ZONES=${AWS_AVAILABILITY_ZONES:-us-west-2a,us-west-2b,us-west-2c} +INSTANCE_TYPE=${INSTANCE_TYPE:-c4.large} + +K8S_VERSION=${K8S_VERSION:-1.18.10} +KOPS_VERSION=${KOPS_VERSION:-1.18.2} +KOPS_STATE_FILE=${KOPS_STATE_FILE:-s3://k8s-kops-csi-e2e} + +KUBECONFIG=${KUBECONFIG:-"${HOME}/.kube/config"} +ARTIFACTS=${ARTIFACTS:-"${TEST_DIR}/artifacts"} +GINKGO_FOCUS=${GINKGO_FOCUS:-"\[ebs-csi-migration\]"} +GINKGO_SKIP=${GINKGO_SKIP:-"\[Disruptive\]"} +GINKGO_NODES=${GINKGO_NODES:-4} +CHECK_MIGRATION=${CHECK_MIGRATION:-"true"} + +CLEAN=${CLEAN:-"true"} + +loudecho() { + echo "###" + echo "## ${1}" + echo "#" +} + +loudecho "Testing in region ${REGION} and zones ${ZONES}" +mkdir -p "${BIN_DIR}" + +loudecho "Installing kops ${KOPS_VERSION} to ${BIN_DIR}" +source "${BASE_DIR}"/utils/kops.sh +kops::install "${BIN_DIR}" "${KOPS_VERSION}" +KOPS_BIN=${BIN_DIR}/kops + +loudecho "Installing helm to ${BIN_DIR}" +source "${BASE_DIR}"/utils/helm.sh +helm::install "${BIN_DIR}" +HELM_BIN=${BIN_DIR}/helm + +loudecho "Installing ginkgo to ${BIN_DIR}" +GINKGO_BIN=${BIN_DIR}/ginkgo +if [[ ! -e ${GINKGO_BIN} ]]; then + export GOPATH=${TEST_DIR} + export GOBIN=${BIN_DIR} + export GO111MODULE=on + pushd /tmp + go get github.com/onsi/ginkgo/ginkgo@v1.12.0 + popd fi -chmod +x $KOPS_PATH - -helm::install - -echo "Build and push test driver image" -eval $(aws ecr get-login --region $REGION --no-include-email) AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) -IMAGE_TAG=$TEST_ID -IMAGE_NAME=$AWS_ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/aws-ebs-csi-driver -docker build -t $IMAGE_NAME:$IMAGE_TAG . -docker push $IMAGE_NAME:$IMAGE_TAG - +IMAGE_NAME=${AWS_ACCOUNT_ID}.dkr.ecr.${REGION}.amazonaws.com/aws-ebs-csi-driver +IMAGE_TAG=${TEST_ID} set +e -echo "Creating cluster $CLUSTER_NAME" -CLUSTER_YAML_PATH=$TEST_DIR/$CLUSTER_NAME.yaml -SSH_KEY_PATH=$TEST_DIR/id_rsa -ssh-keygen -P csi-e2e -f $SSH_KEY_PATH +if docker images | grep "${IMAGE_NAME}" | grep "${IMAGE_TAG}"; then + set -e + loudecho "Assuming ${IMAGE_NAME}:${IMAGE_TAG} has been built and pushed" +else + set -e + loudecho "Building and pushing test driver image to ${IMAGE_NAME}:${IMAGE_TAG}" + eval "$(aws ecr get-login --region "${REGION}" --no-include-email)" + docker build -t "${IMAGE_NAME}":"${IMAGE_TAG}" . + docker push "${IMAGE_NAME}":"${IMAGE_TAG}" +fi -$KOPS_PATH create cluster --state $KOPS_STATE_FILE \ - --zones $ZONES \ +loudecho "Generating SSH key $SSH_KEY_PATH" +if [[ ! -e ${SSH_KEY_PATH} ]]; then + ssh-keygen -P csi-e2e -f "${SSH_KEY_PATH}" +fi + +set +e +if ${KOPS_BIN} get cluster --state "${KOPS_STATE_FILE}" "${CLUSTER_NAME}".k8s.local; then + set -e + loudecho "Updating cluster $CLUSTER_NAME" +else + set -e + loudecho "Creating cluster $CLUSTER_NAME" + ${KOPS_BIN} create cluster --state "${KOPS_STATE_FILE}" \ + --zones "${ZONES}" \ --node-count=3 \ - --node-size=$INSTANCE_TYPE \ - --kubernetes-version=$K8S_VERSION \ - --ssh-public-key=$SSH_KEY_PATH.pub \ - $CLUSTER_NAME.k8s.local -$KOPS_PATH get cluster --state $KOPS_STATE_FILE $CLUSTER_NAME.k8s.local -o yaml > $CLUSTER_YAML_PATH -cat $BASE_DIR/feature-gates.yaml >> $CLUSTER_YAML_PATH -cat $BASE_DIR/additional-policies.yaml >> $CLUSTER_YAML_PATH -$KOPS_PATH replace --state $KOPS_STATE_FILE -f $CLUSTER_YAML_PATH -$KOPS_PATH update cluster --state $KOPS_STATE_FILE $CLUSTER_NAME.k8s.local --yes - -# Wait for cluster creation -while [[ 1 ]]; do - $KOPS_PATH validate cluster --state $KOPS_STATE_FILE - ret=$? - if [[ $ret -eq 0 ]]; then - break - else - echo "Waiting cluster to be created" - sleep 30 - fi -done; - -echo "Deploying driver" -helm::init - -helm install --name aws-ebs-csi-driver \ - --set enableVolumeScheduling=true \ - --set enableVolumeResizing=true \ - --set enableVolumeSnapshot=true \ - --set image.repository=$IMAGE_NAME \ - --set image.tag=$IMAGE_TAG \ - ./aws-ebs-csi-driver - -# Run the test -if [[ "$GINKGO_FOCUS" == "\[ebs-csi-migration\]" ]]; then - # TODO known test failures to skip temporarily - # - should not allow expansion of pvcs without AllowVolumeExpansion property - # - Test passes but cleanup fails, need https://github.com/kubernetes/kubernetes/pull/81107 - # - (block volmode) Verify if offline PVC expansion works / should resize volume when PVC is edited while pod is using it - # - NodeExpand for BlockVolumes not well-defined, need more investigation and possibly https://github.com/container-storage-interface/spec/issues/380 - # - should provision storage with mount options - # - Known bug, need https://github.com/kubernetes/kubernetes/pull/80191 but not yet in a patch release - pushd ./tests/e2e-migration - go get github.com/onsi/ginkgo/ginkgo - SKIP="\[Disruptive\]\ -|should.provision.storage.with.mount.options\ -|should.not.mount./.map.unused.volumes.in.a.pod" - $GOBIN/ginkgo -p -nodes=$NODES -v --focus="$FOCUS" --skip="$SKIP" --noColor ./... -- -kubeconfig=$HOME/.kube/config -report-dir=$ARTIFACTS -gce-zone=${ZONES%,*} - TEST_PASS=$? - popd - - # There should have been no calls to the in-tree driver kubernetes.io/aws-ebs but many calls to ebs.csi.aws.com - # Find the controller-manager log and read its metrics to verify - NODE=$(kubectl get node -l kubernetes.io/role=master -o json | jq -r ".items[].metadata.name") - kubectl port-forward kube-controller-manager-$NODE 10252:10252 -n kube-system& - - # Ensure port forwarding is succeeded - while true - do - HEALTHZ=$(curl -s 127.0.0.1:10252/healthz) - if [[ $HEALTHZ == "ok" ]]; then - echo "Port forwarding is succeeded" - break - else - echo "Port forwarding is not yet ready" - fi - sleep 1 - done - - curl 127.0.0.1:10252/metrics -s | grep -a 'volume_operation_total_seconds_bucket{operation_name="provision",plugin_name="ebs.csi.aws.com"' - CSI_CALLED=$? - curl 127.0.0.1:10252/metrics -s | grep -a 'volume_operation_total_seconds_bucket{operation_name="provision",plugin_name="kubernetes.io/aws-ebs"' - INTREE_CALLED=$? - - echo "TEST_PASS: $TEST_PASS CSI_CALLED: $CSI_CALLED INTREE_CALLED: $INTREE_CALLED" - - # TEST_PASS if tests passed, CSI was called, and In-tree was not called - if [ "$TEST_PASS" == 0 ] && [ "$CSI_CALLED" == 0 ] && [ "$INTREE_CALLED" == 1 ]; then - TEST_PASS=0 + --node-size="${INSTANCE_TYPE}" \ + --kubernetes-version="${K8S_VERSION}" \ + --ssh-public-key="${SSH_KEY_PATH}".pub \ + "${CLUSTER_NAME}".k8s.local +fi + +CLUSTER_YAML_PATH=${TEST_DIR}/${CLUSTER_NAME}.yaml +${KOPS_BIN} get cluster --state "${KOPS_STATE_FILE}" "${CLUSTER_NAME}".k8s.local -o yaml > "${CLUSTER_YAML_PATH}" +cat "${BASE_DIR}"/feature-gates.yaml >> "${CLUSTER_YAML_PATH}" +cat "${BASE_DIR}"/additional-policies.yaml >> "${CLUSTER_YAML_PATH}" +${KOPS_BIN} replace --state "${KOPS_STATE_FILE}" -f "${CLUSTER_YAML_PATH}" +${KOPS_BIN} update cluster --state "${KOPS_STATE_FILE}" "${CLUSTER_NAME}".k8s.local --yes + +loudecho "Validating cluster $CLUSTER_NAME" +${KOPS_BIN} validate cluster --state "${KOPS_STATE_FILE}" --wait 10m +VALID=$? +if [[ $VALID -ne 0 ]]; then + exit 1 +fi + +loudecho "Deploying driver" +${HELM_BIN} upgrade --install aws-ebs-csi-driver \ + --namespace kube-system \ + --set enableVolumeScheduling=true \ + --set enableVolumeResizing=true \ + --set enableVolumeSnapshot=true \ + --set image.repository="${IMAGE_NAME}" \ + --set image.tag="${IMAGE_TAG}" \ + ./aws-ebs-csi-driver + +loudecho "Testing focus ${GINKGO_FOCUS}" +set -x +${GINKGO_BIN} -p -nodes="${GINKGO_NODES}" -v --focus="${GINKGO_FOCUS}" --skip="${GINKGO_SKIP}" ./tests/e2e-migration/... -- -kubeconfig="${KUBECONFIG}" -report-dir="${ARTIFACTS}" -gce-zone="${ZONES%,*}" +TEST_PASSED=$? +set +x + +if [[ "${CHECK_MIGRATION}" == true ]]; then + loudecho "Checking migration" + # There should have been no calls to the in-tree driver kubernetes.io/aws-ebs but many calls to ebs.csi.aws.com + # Find the controller-manager log and read its metrics to verify + NODE=$(kubectl get node -l kubernetes.io/role=master -o json | jq -r ".items[].metadata.name") + kubectl port-forward kube-controller-manager-"${NODE}" 10252:10252 -n kube-system & + + # Ensure port forwarding succeeded + while true; do + set +e + HEALTHZ=$(curl -s 127.0.0.1:10252/healthz) + set -e + if [[ ${HEALTHZ} == "ok" ]]; then + loudecho "Port forwarding succeeded" + break else - TEST_PASS=1 + loudecho "Port forwarding is not yet ready" fi -else - go get github.com/onsi/ginkgo/ginkgo - export KUBECONFIG=$HOME/.kube/config - $GOBIN/ginkgo -p -nodes=$NODES -v --focus="$FOCUS" tests/e2e -- -report-dir=$ARTIFACTS - TEST_PASS=$? + sleep 1 + done + + set +e + curl 127.0.0.1:10252/metrics -s | grep -a 'volume_operation_total_seconds_bucket{operation_name="provision",plugin_name="ebs.csi.aws.com"' + CSI_CALLED=${PIPESTATUS[1]} + set -e + + set +e + curl 127.0.0.1:10252/metrics -s | grep -a 'volume_operation_total_seconds_bucket{operation_name="provision",plugin_name="kubernetes.io/aws-ebs"' + INTREE_CALLED=${PIPESTATUS[1]} + set -e + + for PROC in $(jobs -p); do + kill "${PROC}" + done + + loudecho "CSI_CALLED: ${CSI_CALLED}" + loudecho "INTREE_CALLED: ${INTREE_CALLED}" + + # TEST_PASSED if tests passed, CSI was called, and In-tree was not called + if [ "${TEST_PASSED}" == 0 ] && [ "${CSI_CALLED}" == 0 ] && [ "${INTREE_CALLED}" == 1 ]; then + TEST_PASSED=0 + else + TEST_PASSED=1 + fi fi -echo "Removing driver" -helm del --purge aws-ebs-csi-driver +if [[ "${CLEAN}" == true ]]; then + loudecho "Cleaning" + + loudecho "Removing driver" + ${HELM_BIN} del aws-ebs-csi-driver -echo "Deleting cluster $CLUSTER_NAME" -$KOPS_PATH delete cluster --name $CLUSTER_NAME.k8s.local --state $KOPS_STATE_FILE --yes + loudecho "Deleting cluster ${CLUSTER_NAME}" + ${KOPS_BIN} delete cluster --name "${CLUSTER_NAME}".k8s.local --state "${KOPS_STATE_FILE}" --yes -rm -rf $TEST_DIR + rm -rf "${TEST_DIR}" +else + loudecho "Not cleaning" +fi -if [[ $TEST_PASS -ne 0 ]]; then - exit 1 +loudecho "TEST_PASSED: ${TEST_PASSED}" +if [[ $TEST_PASSED -ne 0 ]]; then + loudecho "FAIL!" + exit 1 +else + loudecho "SUCCESS!" fi diff --git a/hack/utils/helm.sh b/hack/utils/helm.sh index 125dd113bb..95ab5c388e 100755 --- a/hack/utils/helm.sh +++ b/hack/utils/helm.sh @@ -2,18 +2,14 @@ set -uo pipefail -OS_ARCH=$(go env GOOS)-amd64 - helm::install() { - declare -r helm_name=helm-v2.16.0-$OS_ARCH.tar.gz - wget https://get.helm.sh/$helm_name - tar xvzf $helm_name - mv $OS_ARCH/helm /usr/local/bin/helm -} - -helm::init() { - declare -r rbac_file_path=$(dirname "${BASH_SOURCE}")/tiller-rbac.yaml - kubectl apply -f $rbac_file_path - helm init --service-account tiller --history-max 200 --wait - kubectl get po -n kube-system + INSTALL_PATH=${1} + if [[ ! -e ${INSTALL_PATH}/helm ]]; then + curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 + chmod 700 get_helm.sh + export USE_SUDO=false + export HELM_INSTALL_DIR=${INSTALL_PATH} + ./get_helm.sh + rm get_helm.sh + fi } diff --git a/hack/utils/kops.sh b/hack/utils/kops.sh new file mode 100644 index 0000000000..0cddae6c4c --- /dev/null +++ b/hack/utils/kops.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -uo pipefail + +OS_ARCH=$(go env GOOS)-amd64 + +kops::install() { + INSTALL_PATH=${1} + KOPS_VERSION=${2} + if [[ ! -e ${INSTALL_PATH}/kops ]]; then + KOPS_DOWNLOAD_URL=https://github.com/kubernetes/kops/releases/download/v${KOPS_VERSION}/kops-${OS_ARCH} + curl -L -X GET "${KOPS_DOWNLOAD_URL}" -o "${INSTALL_PATH}"/kops + chmod +x "${INSTALL_PATH}"/kops + fi +}