Do not promote when not ready on skip analysis #695

Merged: 9 commits, Sep 29, 2020
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -94,7 +94,9 @@ jobs:
- run: test/container-build.sh
- run: test/e2e-kind.sh v1.18.2
- run: test/e2e-istio.sh
- run: test/e2e-istio-dependencies.sh
- run: test/e2e-istio-tests.sh
- run: test/e2e-istio-tests-skip-analysis.sh

e2e-gloo-testing:
machine: true
13 changes: 11 additions & 2 deletions pkg/controller/scheduler.go
@@ -236,7 +236,7 @@ func (c *Controller) advanceCanary(name string, namespace string) {
}

// check if analysis should be skipped
if skip := c.shouldSkipAnalysis(cd, canaryController, meshRouter); skip {
if skip := c.shouldSkipAnalysis(cd, canaryController, meshRouter, err, retriable); skip {
return
}

@@ -616,11 +616,20 @@ func (c *Controller) runAnalysis(canary *flaggerv1.Canary) bool {
return true
}

func (c *Controller) shouldSkipAnalysis(canary *flaggerv1.Canary, canaryController canary.Controller, meshRouter router.Interface) bool {
func (c *Controller) shouldSkipAnalysis(canary *flaggerv1.Canary, canaryController canary.Controller, meshRouter router.Interface, err error, retriable bool) bool {
if !canary.SkipAnalysis() {
return false
}

// regardless if analysis is being skipped, rollback if canary failed to progress
Contributor Author:

without this block, flagger immediately tries to promote the new version, which at this point has 0 healthy pods. (See the standalone sketch after this diff.)

if !retriable || canary.Status.FailedChecks >= canary.GetAnalysisThreshold() {
c.recordEventWarningf(canary, "Rolling back %s.%s progress deadline exceeded %v", canary.Name, canary.Namespace, err)
c.alert(canary, fmt.Sprintf("Progress deadline exceeded %v", err), false, flaggerv1.SeverityError)
c.rollback(canary, canaryController, meshRouter)

return true
}

// route all traffic to primary
primaryWeight := 100
canaryWeight := 0
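To make the new guard easier to read outside the diff, here is a minimal, self-contained sketch of the condition it adds. The standalone shape, the shouldRollback name, and its parameters are illustrative assumptions, not Flagger's actual API; in the PR the check lives inside shouldSkipAnalysis, records a warning event, fires an alert, and calls c.rollback before returning true to the caller.

package main

import "fmt"

// shouldRollback distills the guard added above: even when analysis is
// skipped, a canary that is no longer retriable (e.g. its progress deadline
// was exceeded) or that has exhausted its failed-check budget must be rolled
// back instead of promoted. Name and standalone form are illustrative only.
func shouldRollback(retriable bool, failedChecks, threshold int) bool {
	return !retriable || failedChecks >= threshold
}

func main() {
	// Progress deadline exceeded: not retriable, so roll back even with
	// zero recorded failed checks.
	fmt.Println(shouldRollback(false, 0, 15)) // true

	// Healthy, retriable canary under the threshold: analysis is skipped
	// and promotion proceeds as before.
	fmt.Println(shouldRollback(true, 0, 15)) // false
}

The threshold of 15 here simply mirrors the analysis.threshold value used in the new e2e test manifest further down.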
19 changes: 19 additions & 0 deletions test/e2e-istio-dependencies.sh
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

# This script sets up the scenarios for the Istio tests by creating a Kubernetes namespace, installing the load tester, and deploying a test workload (podinfo)
# Prerequisites: Kubernetes Kind and Istio

set -o errexit

REPO_ROOT=$(git rev-parse --show-toplevel)

echo '>>> Creating test namespace'
kubectl create namespace test
kubectl label namespace test istio-injection=enabled

echo '>>> Installing the load tester'
kubectl apply -k ${REPO_ROOT}/kustomize/tester
kubectl -n test rollout status deployment/flagger-loadtester

echo '>>> Deploy podinfo'
kubectl apply -f ${REPO_ROOT}/test/e2e-workload.yaml
138 changes: 138 additions & 0 deletions test/e2e-istio-tests-skip-analysis.sh
@@ -0,0 +1,138 @@
#!/usr/bin/env bash

# This script runs e2e tests for when the canary analysis is skipped
# Prerequisites: Kubernetes Kind and Istio

set -o errexit

echo '>>> Initialising canary'
cat <<EOF | kubectl apply -f -
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: podinfo
  namespace: test
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: podinfo
  progressDeadlineSeconds: 60
  service:
    port: 9898
    portDiscovery: true
  skipAnalysis: true
  analysis:
    interval: 15s
    threshold: 15
    maxWeight: 30
    stepWeight: 10
    webhooks:
      - name: load-test
        url: http://flagger-loadtester.test/
        timeout: 5s
        metadata:
          type: cmd
          cmd: "hey -z 10m -q 10 -c 2 http://podinfo.test:9898/"
          logCmdOutput: "true"
EOF

echo '>>> Waiting for primary to be ready'
retries=50
count=0
ok=false
until ${ok}; do
    kubectl -n test get canary/podinfo | grep 'Initialized' && ok=true || ok=false
    sleep 5
    count=$(($count + 1))
    if [[ ${count} -eq ${retries} ]]; then
        kubectl -n istio-system logs deployment/flagger
        echo "No more retries left"
        exit 1
    fi
done

echo '✔ Canary initialization test passed'

echo '>>> Triggering canary deployment'
kubectl -n test set image deployment/podinfo podinfod=stefanprodan/podinfo:3.1.1

echo '>>> Waiting for canary promotion'
retries=50
count=0
ok=false
until ${ok}; do
    kubectl -n test describe deployment/podinfo-primary | grep '3.1.1' && ok=true || ok=false
    sleep 10
    kubectl -n istio-system logs deployment/flagger --tail 1
    count=$(($count + 1))
    if [[ ${count} -eq ${retries} ]]; then
        kubectl -n test describe deployment/podinfo
        kubectl -n test describe deployment/podinfo-primary
        kubectl -n istio-system logs deployment/flagger
        echo "No more retries left"
        exit 1
    fi
done

echo '>>> Waiting for canary finalization'
retries=50
count=0
ok=false
until ${ok}; do
    kubectl -n test get canary/podinfo | grep 'Succeeded' && ok=true || ok=false
    sleep 5
    count=$(($count + 1))
    if [[ ${count} -eq ${retries} ]]; then
        kubectl -n istio-system logs deployment/flagger
        echo "No more retries left"
        exit 1
    fi
done

echo '✔ Canary promotion test passed'

if [[ "$1" = "canary" ]]; then
exit 0
fi

echo '>>> Triggering canary deployment with a bad release (non existent docker image)'
kubectl -n test set image deployment/podinfo podinfod=stefanprodan/potato:1.0.0

echo '>>> Waiting for canary to fail'
retries=50
count=0
ok=false
until ${ok}; do
    kubectl get canary/podinfo -n test -o=jsonpath='{.status.phase}' | grep 'Failed' && ok=true || ok=false
    sleep 10
    kubectl -n istio-system logs deployment/flagger --tail 1
    count=$(($count + 1))
    if [[ ${count} -eq ${retries} ]]; then
        kubectl -n test describe deployment/podinfo
        kubectl -n test describe deployment/podinfo-primary
        kubectl -n istio-system logs deployment/flagger
        echo "No more retries left"
        exit 1
    fi
done

echo '>>> Confirm primary pod is still running and with correct version'
retries=50
count=0
okRunning=false
okImage=false
until ${okImage} && ${okRunning}; do
    kubectl get deployment podinfo-primary -n test -o jsonpath='{.spec.replicas}' | grep 1 && okRunning=true || okRunning=false
    kubectl -n test describe deployment/podinfo-primary | grep '3.1.1' && okImage=true || okImage=false
    sleep 5
    count=$(($count + 1))
    if [[ ${count} -eq ${retries} ]]; then
        kubectl -n istio-system logs deployment/flagger
        echo "No more retries left"
        exit 1
    fi
done

kubectl -n istio-system logs deployment/flagger

echo '✔ All tests passed'
13 changes: 0 additions & 13 deletions test/e2e-istio-tests.sh
@@ -5,19 +5,6 @@

set -o errexit
Member: Can you please add back set -o errexit to all scripts?

Contributor Author: Of course.

PS: I don't know if this new structure of tests makes sense. I'm happy to change this if you want.


REPO_ROOT=$(git rev-parse --show-toplevel)

echo '>>> Creating test namespace'
kubectl create namespace test
kubectl label namespace test istio-injection=enabled

echo '>>> Installing the load tester'
kubectl apply -k ${REPO_ROOT}/kustomize/tester
kubectl -n test rollout status deployment/flagger-loadtester

echo '>>> Deploy podinfo'
kubectl apply -f ${REPO_ROOT}/test/e2e-workload.yaml

echo '>>> Create latency metric template'
cat <<EOF | kubectl apply -f -
apiVersion: flagger.app/v1beta1