Skip to content

Commit

Permalink
eks: Run cilium-cli inside a container
Browse files Browse the repository at this point in the history
Update {eks,eks-tunnel}.yaml to run cilium-cli inside a container
instead of using cilium-cli-test-job-chart.

Ref: #2623
Ref: #2627
Ref: cilium/design-cfps#9

Signed-off-by: Michi Mutsuzaki <[email protected]>
  • Loading branch information
michi-covalent committed Jun 25, 2024
1 parent 9658f02 commit 9db68d5
Show file tree
Hide file tree
Showing 5 changed files with 86 additions and 225 deletions.
42 changes: 0 additions & 42 deletions .github/in-cluster-test-scripts/eks-tunnel.sh

This file was deleted.

10 changes: 0 additions & 10 deletions .github/in-cluster-test-scripts/eks-uninstall.sh

This file was deleted.

34 changes: 0 additions & 34 deletions .github/in-cluster-test-scripts/eks.sh

This file was deleted.

115 changes: 48 additions & 67 deletions .github/workflows/eks-tunnel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,6 @@ jobs:
sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
rm eksctl_$(uname -s)_amd64.tar.gz
- name: Install helm
uses: azure/setup-helm@5119fcb9089d432beecbf79bb2c7915207344b78 # v3.5
with:
# Due to the below issue, v3.8.2 is pinned currently to avoid
# exec plugin: invalid apiVersion "client.authentication.k8s.io/v1alpha1"
# https://github.com/helm/helm/issues/10975
version: v3.8.2

- name: Set up AWS CLI credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
Expand Down Expand Up @@ -126,14 +118,11 @@ jobs:
eksctl create cluster -f ./eks-config.yaml
- name: Create kubeconfig and load it in configmap
run: |
.github/get-kubeconfig.sh
kubectl create configmap cilium-cli-kubeconfig -n kube-system --from-file kubeconfig
- name: Load cilium cli script in configmap
run: |
kubectl create configmap cilium-cli-test-script -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-tunnel.sh
- name: Install Cilium CLI
uses: ./
with:
skip-build: 'true'
image-tag: ${{ steps.vars.outputs.sha }}

- name: Create cilium-cli test job
run: |
Expand All @@ -143,37 +132,50 @@ jobs:
--set cilium_version=${{ env.cilium_version }} \
--set cluster_name=${{ env.clusterName }}
- name: Wait for job
env:
timeout: 30m
- name: Install Cilium and run tests
timeout-minutes: 30
run: |
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
EXIT_CODE=$?
# Retrieve job logs
kubectl logs --timestamps -n kube-system job/cilium-cli
exit ${EXIT_CODE}
shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
# Install Cilium
cilium install \
--version "${{ env.cilium_version }}" \
--set cluster.name="${{ env.clusterName }}" \
--wait=false \
--set bpf.monitorAggregation=none \
--datapath-mode=tunnel \
--set loadBalancer.l7.backend=envoy \
--set tls.secretsBackend=k8s \
--set ipam.mode=cluster-pool
# Enable Relay
cilium hubble enable
# Wait for cilium and hubble relay to be ready
# NB: necessary to work against occasional flakes due to https://github.com/cilium/cilium-cli/issues/918
cilium status --wait
# Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
[[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
# Port forward Relay
cilium hubble port-forward&
sleep 10s
[[ $(pgrep -f "kubectl.*port-forward.*hubble-relay" | wc -l) == 1 ]]
# Run connectivity test
cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com. \
--test '!dns-only,!to-fqdns,!client-egress-l7,!health'
# workaround for nslookup issues in tunnel mode causing tests to fail reliably
# TODO: remove once:
# - https://github.com/cilium/cilium/issues/16975 is fixed
# - fix has been deployed to a stable branch
# - cilium-cli default cilium version has been updated to pick up the fix
# Run performance test
cilium connectivity perf --duration 1s
- name: Post-test information gathering
if: ${{ !success() }}
run: |
echo "=== Install latest stable CLI ==="
curl -sSL --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz{,.sha256sum}
sha256sum --check cilium-linux-amd64.tar.gz.sha256sum
sudo tar xzvfC cilium-linux-amd64.tar.gz /usr/bin
rm cilium-linux-amd64.tar.gz{,.sha256sum}
cilium version
echo "=== Retrieve cluster state ==="
kubectl get pods --all-namespaces -o wide
cilium status
Expand All @@ -182,33 +184,12 @@ jobs:

- name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
if: ${{ success() }}
env:
timeout: 5m
timeout-minutes: 5
run: |
kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
helm install .github/cilium-cli-test-job-chart \
--generate-name \
--set tag=${{ steps.vars.outputs.sha }} \
--set cluster_name=${{ env.clusterName }} \
--set job_name=cilium-cli-uninstall \
--set test_script_cm=cilium-cli-test-script-uninstall
cilium uninstall --wait
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
EXIT_CODE=$?
# Retrieve job logs
kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
exit ${EXIT_CODE}
shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
# Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
[[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
- name: Clean up EKS
if: ${{ always() }}
Expand Down
110 changes: 38 additions & 72 deletions .github/workflows/eks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,6 @@ jobs:
sudo tar xzvfC eksctl_$(uname -s)_amd64.tar.gz /usr/bin
rm eksctl_$(uname -s)_amd64.tar.gz
- name: Install helm
uses: azure/setup-helm@5119fcb9089d432beecbf79bb2c7915207344b78 # v3.5
with:
# Due to the below issue, v3.8.2 is pinned currently to avoid
# exec plugin: invalid apiVersion "client.authentication.k8s.io/v1alpha1"
# https://github.com/helm/helm/issues/10975
version: v3.8.2

- name: Set up AWS CLI credentials
uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
with:
Expand All @@ -80,6 +72,7 @@ jobs:
- name: Set up job variables
id: vars
run: |
env
if [ ${{ github.event.issue.pull_request || github.event.pull_request }} ]; then
PR_API_JSON=$(curl \
-H "Accept: application/vnd.github.v3+json" \
Expand Down Expand Up @@ -126,54 +119,48 @@ jobs:
eksctl create cluster -f ./eks-config.yaml
- name: Create kubeconfig and load it in configmap
run: |
.github/get-kubeconfig.sh
kubectl create configmap cilium-cli-kubeconfig -n kube-system --from-file kubeconfig
- name: Install Cilium CLI
uses: ./
with:
skip-build: 'true'
image-tag: ${{ steps.vars.outputs.sha }}

- name: Load cilium cli script in configmap
- name: Install Cilium and run tests
timeout-minutes: 30
run: |
kubectl create configmap cilium-cli-test-script -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks.sh
# Install Cilium
cilium install \
--version "${{ env.cilium_version }}" \
--set cluster.name="${{ env.clusterName }}" \
--wait=false \
--set loadBalancer.l7.backend=envoy \
--set tls.secretsBackend=k8s \
--set bpf.monitorAggregation=none
- name: Create cilium-cli test job
run: |
helm install .github/cilium-cli-test-job-chart \
--generate-name \
--set tag=${{ steps.vars.outputs.sha }} \
--set cilium_version=${{ env.cilium_version }} \
--set cluster_name=${{ env.clusterName }}
- name: Wait for job
env:
timeout: 30m
run: |
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Enable Relay
cilium hubble enable
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Wait for cilium and hubble relay to be ready
# NB: necessary to work against occasional flakes due to https://github.com/cilium/cilium-cli/issues/918
cilium status --wait
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
EXIT_CODE=$?
# Make sure the 'aws-node' DaemonSet exists but has no scheduled pods
[[ $(kubectl -n kube-system get ds/aws-node -o jsonpath='{.status.currentNumberScheduled}') == 0 ]]
# Retrieve job logs
kubectl logs --timestamps -n kube-system job/cilium-cli
exit ${EXIT_CODE}
shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
# Port forward Relay
cilium hubble port-forward&
sleep 10s
[[ $(pgrep -f "kubectl.*port-forward.*hubble-relay" | wc -l) == 1 ]]
# Run connectivity test
cilium connectivity test --debug --all-flows --collect-sysdump-on-failure --external-target amazon.com.
# Run performance test
cilium connectivity perf --duration 1s
- name: Post-test information gathering
if: ${{ !success() }}
run: |
echo "=== Install latest stable CLI ==="
curl -sSL --remote-name-all https://github.com/cilium/cilium-cli/releases/latest/download/cilium-linux-amd64.tar.gz{,.sha256sum}
sha256sum --check cilium-linux-amd64.tar.gz.sha256sum
sudo tar xzvfC cilium-linux-amd64.tar.gz /usr/bin
rm cilium-linux-amd64.tar.gz{,.sha256sum}
cilium version
echo "=== Retrieve cluster state ==="
kubectl get pods --all-namespaces -o wide
cilium status
Expand All @@ -182,33 +169,12 @@ jobs:

- name: Uninstall and make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
if: ${{ success() }}
env:
timeout: 5m
timeout-minutes: 5
run: |
kubectl create configmap cilium-cli-test-script-uninstall -n kube-system --from-file=in-cluster-test-script.sh=.github/in-cluster-test-scripts/eks-uninstall.sh
helm install .github/cilium-cli-test-job-chart \
--generate-name \
--set tag=${{ steps.vars.outputs.sha }} \
--set cluster_name=${{ env.clusterName }} \
--set job_name=cilium-cli-uninstall \
--set test_script_cm=cilium-cli-test-script-uninstall
# Background wait for job to complete or timeout
kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=complete --timeout=${{ env.timeout }} &
complete_pid=$!
# Background wait for job to fail
(kubectl -n kube-system wait job/cilium-cli-uninstall --for=condition=failed --timeout=${{ env.timeout }} && exit 1) &
failed_pid=$!
# Active wait for whichever background process ends first
wait -n $complete_pid $failed_pid
EXIT_CODE=$?
# Retrieve job logs
kubectl logs --timestamps -n kube-system job/cilium-cli-uninstall
exit ${EXIT_CODE}
shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently
cilium uninstall --wait
# Make sure the 'aws-node' DaemonSet blocking nodeSelector was removed
[[ ! $(kubectl -n kube-system get ds/aws-node -o jsonpath="{.spec.template.spec.nodeSelector['io\.cilium/aws-node-enabled']}") ]]
- name: Clean up EKS
if: ${{ always() }}
Expand Down

0 comments on commit 9db68d5

Please sign in to comment.