Add helm mode test coverage for AKS BYOCNI #1578

Merged: 2 commits, May 11, 2023

2 changes: 1 addition & 1 deletion .github/in-cluster-test-scripts/aks-byocni-install.sh
@@ -10,4 +10,4 @@ cilium install \
--wait=false \
--helm-set loadBalancer.l7.backend=envoy \
--helm-set tls.secretsBackend=k8s \
--config monitor-aggregation=none
--helm-set bpf.monitorAggregation=none
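
The install script change above swaps the classic-mode flag --config monitor-aggregation=none (a flat cilium-config entry) for the helm-mode equivalent --helm-set bpf.monitorAggregation=none, so the new helm matrix run drives the chart values instead of raw agent config. As a rough illustration of why the key changes shape, here is a minimal Go sketch using Helm's strvals parser; the assumption that cilium-cli handles --helm-set this way is ours and is not stated in the diff.

package main

import (
	"fmt"

	"helm.sh/helm/v3/pkg/strvals"
)

func main() {
	// "bpf.monitorAggregation=none" is a dotted Helm value path: it parses into
	// nested maps that mirror the chart's values.yaml layout, whereas the old
	// "monitor-aggregation=none" was a flat agent ConfigMap key.
	vals := map[string]interface{}{}
	if err := strvals.ParseInto("bpf.monitorAggregation=none", vals); err != nil {
		panic(err)
	}
	fmt.Printf("%v\n", vals) // map[bpf:map[monitorAggregation:none]]
}
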
23 changes: 14 additions & 9 deletions .github/workflows/aks-byocni.yaml
@@ -36,7 +36,6 @@ concurrency:
cancel-in-progress: true

env:
name: ${{ github.repository_owner }}-${{ github.event.repository.name }}-${{ github.run_id }}
location: westeurope
cost_reduction: --node-vm-size Standard_B2s --node-osdisk-size 30
cilium_version: v1.13.2
@@ -47,6 +46,9 @@ jobs:
if: ${{ github.repository == 'cilium/cilium-cli' }}
runs-on: ubuntu-22.04
timeout-minutes: 45
strategy:
matrix:
mode: ["classic", "helm"]
steps:
- name: Checkout
uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
@@ -89,19 +91,20 @@ jobs:

echo "sha=${SHA}" >> $GITHUB_OUTPUT
echo "owner=${OWNER}" >> $GITHUB_OUTPUT
echo "name=${{ github.event.repository.name }}-${{ github.run_id }}-${{ matrix.mode }}" >> $GITHUB_OUTPUT

- name: Create AKS cluster
run: |
# Create group
az group create \
--name ${{ env.name }} \
--name ${{ steps.vars.outputs.name }} \
--location ${{ env.location }} \
--tags usage=${{ github.repository_owner }}-${{ github.event.repository.name }} owner=${{ steps.vars.outputs.owner }}

# Create AKS cluster
az aks create \
--resource-group ${{ env.name }} \
--name ${{ env.name }} \
--resource-group ${{ steps.vars.outputs.name }} \
--name ${{ steps.vars.outputs.name }} \
--location ${{ env.location }} \
--network-plugin none \
--node-count 2 \
@@ -111,8 +114,8 @@
- name: Get cluster credentials
run: |
az aks get-credentials \
--resource-group ${{ env.name }} \
--name ${{ env.name }}
--resource-group ${{ steps.vars.outputs.name }} \
--name ${{ steps.vars.outputs.name }}

- name: Create kubeconfig and load it in configmap
run: |
@@ -130,7 +133,8 @@
--set job_name=cilium-cli-install \
--set test_script_cm=cilium-cli-test-script-install \
--set tag=${{ steps.vars.outputs.sha }} \
--set cilium_version=${{ env.cilium_version }}
--set cilium_version=${{ env.cilium_version }} \
--set cilium_cli_mode=${{ matrix.mode }}

- name: Wait for install job
env:
@@ -163,7 +167,8 @@
--generate-name \
--set job_name=cilium-cli \
--set test_script_cm=cilium-cli-test-script \
--set tag=${{ steps.vars.outputs.sha }}
--set tag=${{ steps.vars.outputs.sha }} \
--set cilium_cli_mode=${{ matrix.mode }}

- name: Wait for test job
env:
@@ -205,7 +210,7 @@
- name: Clean up AKS
if: ${{ always() }}
run: |
az group delete --name ${{ env.name }} --yes --no-wait
az group delete --name ${{ steps.vars.outputs.name }} --yes --no-wait
shell: bash {0} # Disable default fail-fast behaviour so that all commands run independently

- name: Upload artifacts
31 changes: 26 additions & 5 deletions connectivity/check/action.go
@@ -720,19 +720,40 @@ func (a *Action) GetIngressRequirements(p FlowParameters) []filters.FlowSetRequi
return []filters.FlowSetRequirement{ingress}
}

// waitForRelay polls the server status from Relay until either it's connected to all the Hubble
// instances (success) or the context is cancelled (failure).
func (a *Action) isRelayServerStatusOk(ctx context.Context, client observer.ObserverClient) bool {
// Sometimes ServerStatus request just gets stuck until the context gets cancelled. Call
// ServerStatus with a shorter timeout than ctx so that we get to try it multiple times.
timeoutContext, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
res, err := client.ServerStatus(timeoutContext, &observer.ServerStatusRequest{})
if err == nil {
a.Debugf("hubble relay server status %+v", res)
if (res.NumUnavailableNodes == nil || res.NumUnavailableNodes.Value == 0) && res.NumFlows > 0 {
// This means all the nodes are available, and Hubble Relay started receiving flows.
// Ideally we should check res.NumConnectedNodes matches the expected number of Cilium
// nodes instead of relying on res.NumUnavailableNodes, but I don't know if that information
// is available to us.
return true
}
} else {
a.Debugf("hubble relay server status failed: %v", err)
}
return false
}

// waitForRelay polls the server status from Relay until it indicates that there are no unavailable
// nodes and num_flows is greater than zero (success), or the context is cancelled (failure).
func (a *Action) waitForRelay(ctx context.Context, client observer.ObserverClient) error {
for {
res, err := client.ServerStatus(ctx, &observer.ServerStatusRequest{})
if err == nil && (res.NumUnavailableNodes == nil || res.NumUnavailableNodes.Value == 0) {
// This means all the nodes are available.
if a.isRelayServerStatusOk(ctx, client) {
a.Debug("hubble relay is ready")
return nil
}
select {
case <-ctx.Done():
return fmt.Errorf("hubble server status failure: %w", ctx.Err())
case <-time.After(time.Second):
a.Debug("retrying hubble relay server status request")
}
}
}
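
The refactor above pulls the relay status check into its own helper so that each ServerStatus RPC runs under a 3-second per-attempt deadline (a stuck request can no longer consume the whole wait), and readiness now also requires NumFlows to be greater than zero. Below is a minimal, self-contained sketch of that per-attempt-timeout polling shape assuming a generic check function; pollUntil and the exact intervals are illustrative and not part of this PR.

package main

import (
	"context"
	"fmt"
	"time"
)

// pollUntil retries check roughly once per second until it reports success or
// the outer context expires. Each attempt gets its own short deadline so a
// single hung call cannot consume the whole budget; this mirrors the shape of
// isRelayServerStatusOk/waitForRelay above.
func pollUntil(ctx context.Context, check func(context.Context) bool) error {
	for {
		attemptCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
		ok := check(attemptCtx)
		cancel()
		if ok {
			return nil
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("condition not met: %w", ctx.Err())
		case <-time.After(time.Second):
			// retry
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	start := time.Now()
	err := pollUntil(ctx, func(context.Context) bool { return time.Since(start) > 2*time.Second })
	fmt.Println("result:", err) // prints "result: <nil>" once the condition holds, after a few attempts
}

Cancelling the per-attempt context before sleeping releases anything tied to a hung call, while the outer context stays in charge of the overall time budget.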