Skip to content

Commit

Permalink
workflows: fix in-cluster job kubectl wait
Browse files Browse the repository at this point in the history
`kubectl wait --for=condition=complete --timeout=X` behaviour is a bit
counterintuitive: it waits until either the job succeeds or timeout is
hit. When the job fails, it does not stop waiting: it will continue
waiting until timeout is hit.

For watching for failures, `--for=condition=failed` should be used.
However, this will likewise wait until either the job fails or timeout
is hit, and will not stop waiting if the job succeeds.

`kubectl wait` unfortunately does not allow waiting for multiple
conditions. To work around this, we set up two concurrent background
waits for both conditions, and actively wait for the first one to end.

This will ensure we do not wait for the whole allocated timeout
every time there is an error during the in-cluster script execution.

Signed-off-by: Nicolas Busseneau <[email protected]>
  • Loading branch information
nbusseneau authored and michi-covalent committed Jul 22, 2021
1 parent 1bf9ca7 commit 68282c3
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 14 deletions.
30 changes: 27 additions & 3 deletions .github/workflows/aks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,20 @@ jobs:
--set azure.client_secret=${{ steps.az.outputs.client-secret }}
- name: Wait for install job
env:
timeout: 5m
run: |
kubectl -n kube-system wait job/cilium-cli-install --for=condition=complete --timeout=5m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli-install --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli-install --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Delete the first node pool
run: |
Expand All @@ -183,9 +195,21 @@ jobs:
helm install .github/cilium-cli-test-job-chart \
--generate-name
- name: Wait for job
- name: Wait for test job
env:
timeout: 15m
run: |
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=15m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Post-test information gathering
if: ${{ !success() }}
Expand Down
31 changes: 28 additions & 3 deletions .github/workflows/eks-tunnel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,21 @@ jobs:
--set tag=${{ steps.vars.outputs.sha }} \
--set cluster_name=${{ env.clusterName }}
- name: Wait for test job
- name: Wait for job
env:
timeout: 20m
run: |
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=20m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Post-test information gathering
if: ${{ !success() }}
Expand All @@ -141,6 +153,8 @@ jobs:

- name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
if: ${{ success() }}
env:
timeout: 2m
run: |
kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
helm install .github/cilium-cli-test-job-chart \
Expand All @@ -149,7 +163,18 @@ jobs:
--set cluster_name=${{ env.clusterName }} \
--set job_name=cilium-cli-uninstall \
--set test_script_cm=cilium-cli-test-script-uninstall
kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=2m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first. Record the status
# instead of exiting right away, so that the job logs below are still
# retrieved (an immediate `exit $?` would make the log retrieval unreachable).
wait_status=0
wait -n $complete_pid $failed_pid || wait_status=$?
echo "=== Retrieve in-cluster jobs logs ==="
kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
# Propagate the outcome of the wait as the step result
exit $wait_status
Expand Down
31 changes: 28 additions & 3 deletions .github/workflows/eks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,21 @@ jobs:
--set tag=${{ steps.vars.outputs.sha }} \
--set cluster_name=${{ env.clusterName }}
- name: Wait for test job
- name: Wait for job
env:
timeout: 20m
run: |
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=20m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Post-test information gathering
if: ${{ !success() }}
Expand All @@ -141,6 +153,8 @@ jobs:

- name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
if: ${{ success() }}
env:
timeout: 2m
run: |
kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
helm install .github/cilium-cli-test-job-chart \
Expand All @@ -149,7 +163,18 @@ jobs:
--set cluster_name=${{ env.clusterName }} \
--set job_name=cilium-cli-uninstall \
--set test_script_cm=cilium-cli-test-script-uninstall
kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=2m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first. Record the status
# instead of exiting right away, so that the job logs below are still
# retrieved (an immediate `exit $?` would make the log retrieval unreachable).
wait_status=0
wait -n $complete_pid $failed_pid || wait_status=$?
echo "=== Retrieve in-cluster jobs logs ==="
kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
# Propagate the outcome of the wait as the step result
exit $wait_status
Expand Down
28 changes: 26 additions & 2 deletions .github/workflows/externalworkloads.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,20 @@ jobs:
--set cluster_cidr=${{ steps.cluster.outputs.cluster_cidr }}
- name: Wait for install job
env:
timeout: 10m
run: |
kubectl -n kube-system wait job/cilium-cli-install --for=condition=complete --timeout=10m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli-install --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli-install --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Copy VM install script from cilium-cli-install pod
run: |
Expand Down Expand Up @@ -164,8 +176,20 @@ jobs:
--set test_script_cm=cilium-cli-test-script
- name: Wait for test job
env:
timeout: 10m
run: |
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=10m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Post-test information gathering
if: ${{ !success() }}
Expand Down
16 changes: 14 additions & 2 deletions .github/workflows/gke.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,21 @@ jobs:
--set cluster_name=${{ env.clusterName }} \
--set cluster_cidr=${{ steps.cluster.outputs.cluster_cidr }}
- name: Wait for job
- name: Wait for test job
env:
timeout: 15m
run: |
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=15m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Post-test information gathering
if: ${{ !success() }}
Expand Down
14 changes: 13 additions & 1 deletion .github/workflows/multicluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,20 @@ jobs:
--set cluster_name_2=${{ env.clusterName2 }} \
- name: Wait for test job
env:
timeout: 20m
run: |
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=20m
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
exit $?
- name: Post-test information gathering
if: ${{ !success() }}
Expand Down

0 comments on commit 68282c3

Please sign in to comment.