Skip to content

Commit

Permalink
connectivity: Add upgrade tests
Browse files Browse the repository at this point in the history
The new test case checks whether there are no interruptions in
long-lived E/W LB flows.

The upgrade test consists of two steps:

* "cli connectivity test --include-upgrade-test --upgrade-test-setup":
  deploys migrate-svc (to be renamed) pods, and stores restart counters.
* Do Cilium upgrade
* "cli connectivity test --include-upgrade-test --test post-upgrade":
  checks restart counters of migrate-svc pods, and compares against the
  previously stored counters (counters mismatch means interruptions in
  flow)

Signed-off-by: Martynas Pumputis <[email protected]>
  • Loading branch information
brb committed Jun 15, 2023
1 parent ef99b9d commit 4c228f7
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 1 deletion.
5 changes: 4 additions & 1 deletion connectivity/check/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ type Parameters struct {
JunitFile string
JunitProperties map[string]string

FlushCT bool
IncludeUpgradeTest bool
UpgradeTestSetup bool
UpgradeTestResultPath string
FlushCT bool

K8sVersion string
HelmChartDirectory string
Expand Down
75 changes: 75 additions & 0 deletions connectivity/check/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ const (
hostNetNSDeploymentNameNonCilium = "host-netns-non-cilium" // runs on non-Cilium test nodes
kindHostNetNS = "host-netns"

migrateSvcClientDeploymentName = "migrate-svc-client"
migrateSvcServerDeploymentName = "migrate-svc-server"
migrateSvcServiceName = "migrate-svc"
KindMigrateSvc = "migrate-svc"

EchoServerHostPort = 40000

IngressServiceName = "ingress-service"
Expand Down Expand Up @@ -584,6 +589,67 @@ func (ct *ConnectivityTest) deploy(ctx context.Context) error {
return nil
}

// Deploy migrate-svc-client, migrate-svc-server and migrate-svc
if ct.params.UpgradeTestSetup {
_, err = ct.clients.src.GetDeployment(ctx, ct.params.TestNamespace, migrateSvcServerDeploymentName, metav1.GetOptions{})
if err != nil {
ct.Logf("✨ [%s] Deploying %s deployment...", ct.clients.src.ClusterName(), migrateSvcServerDeploymentName)
migrateSvcServerDeployment := newDeployment(deploymentParameters{
Name: migrateSvcServerDeploymentName,
Kind: KindMigrateSvc,
Image: "docker.io/cilium/migrate-svc-test:v0.0.2",
Replicas: 3,
Labels: map[string]string{"app": "migrate-svc-server"},
Command: []string{"/server", "8000"},
Port: 8000,
})
_, err = ct.clients.src.CreateServiceAccount(ctx, ct.params.TestNamespace, k8s.NewServiceAccount(migrateSvcServerDeploymentName), metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("unable to create service account %s: %w", migrateSvcServerDeploymentName, err)
}
_, err = ct.clients.src.CreateDeployment(ctx, ct.params.TestNamespace, migrateSvcServerDeployment, metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("unable to create deployment %s: %w", migrateSvcServerDeployment, err)
}
}

_, err = ct.clients.src.GetService(ctx, ct.params.TestNamespace, migrateSvcServiceName, metav1.GetOptions{})
if err != nil {
ct.Logf("✨ [%s] Deploying %s service...", ct.clients.src.ClusterName(), migrateSvcServiceName)
svc := newService(migrateSvcServiceName, map[string]string{"app": "migrate-svc-server"}, nil, "http", 8000)
_, err = ct.clients.src.CreateService(ctx, ct.params.TestNamespace, svc, metav1.CreateOptions{})
if err != nil {
return err
}
}

_, err = ct.clients.src.GetDeployment(ctx, ct.params.TestNamespace, migrateSvcClientDeploymentName, metav1.GetOptions{})
if err != nil {
ct.Logf("✨ [%s] Deploying %s deployment...", ct.clients.src.ClusterName(), migrateSvcClientDeploymentName)
migrateSvcClientDeployment := newDeployment(deploymentParameters{
Name: migrateSvcClientDeploymentName,
Kind: KindMigrateSvc,
Image: "docker.io/cilium/migrate-svc-test:v0.0.2",
Replicas: 5,
Labels: map[string]string{"app": "migrate-svc-client"},
Port: 8000,
Command: []string{
"/client",
fmt.Sprintf("migrate-svc.%s.svc.cluster.local.:8000", ct.params.TestNamespace),
},
})

_, err = ct.clients.src.CreateServiceAccount(ctx, ct.params.TestNamespace, k8s.NewServiceAccount(migrateSvcClientDeploymentName), metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("unable to create service account %s: %w", migrateSvcClientDeploymentName, err)
}
_, err = ct.clients.src.CreateDeployment(ctx, ct.params.TestNamespace, migrateSvcClientDeployment, metav1.CreateOptions{})
if err != nil {
return fmt.Errorf("unable to create deployment %s: %w", migrateSvcClientDeployment, err)
}
}
}

if ct.params.MultiCluster != "" {
if ct.params.ForceDeploy {
if err := ct.deleteDeployments(ctx, ct.clients.dst); err != nil {
Expand Down Expand Up @@ -950,6 +1016,11 @@ func (ct *ConnectivityTest) deploymentList() (srcList []string, dstList []string
}
}

if ct.params.IncludeUpgradeTest {
srcList = append(srcList, migrateSvcClientDeploymentName)
dstList = append(dstList, migrateSvcServerDeploymentName)
}

if (ct.params.MultiCluster != "" || !ct.params.SingleNode) && !ct.params.Perf {
dstList = append(dstList, echoOtherNodeDeploymentName)
}
Expand All @@ -967,10 +1038,14 @@ func (ct *ConnectivityTest) deleteDeployments(ctx context.Context, client *k8s.C
_ = client.DeleteDeployment(ctx, ct.params.TestNamespace, echoOtherNodeDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteDeployment(ctx, ct.params.TestNamespace, clientDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteDeployment(ctx, ct.params.TestNamespace, client2DeploymentName, metav1.DeleteOptions{})
_ = client.DeleteDeployment(ctx, ct.params.TestNamespace, migrateSvcClientDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteDeployment(ctx, ct.params.TestNamespace, migrateSvcServerDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteServiceAccount(ctx, ct.params.TestNamespace, echoSameNodeDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteServiceAccount(ctx, ct.params.TestNamespace, echoOtherNodeDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteServiceAccount(ctx, ct.params.TestNamespace, clientDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteServiceAccount(ctx, ct.params.TestNamespace, client2DeploymentName, metav1.DeleteOptions{})
_ = client.DeleteServiceAccount(ctx, ct.params.TestNamespace, migrateSvcClientDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteServiceAccount(ctx, ct.params.TestNamespace, migrateSvcServerDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteService(ctx, ct.params.TestNamespace, echoSameNodeDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteService(ctx, ct.params.TestNamespace, echoOtherNodeDeploymentName, metav1.DeleteOptions{})
_ = client.DeleteConfigMap(ctx, ct.params.TestNamespace, corednsConfigMapName, metav1.DeleteOptions{})
Expand Down
14 changes: 14 additions & 0 deletions connectivity/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,20 @@ func Run(ctx context.Context, ct *check.ConnectivityTest, addExtraTests func(*ch
return ct.Run(ctx)
}

// Upgrade Test
if ct.Params().IncludeUpgradeTest {
ct.NewTest("post-upgrade").WithScenarios(
tests.NoInterruptedConnections(),
)

if ct.Params().UpgradeTestSetup {
// Exit early, as --upgrade-test-setup is only needed to deploy pods which
// will be used by another invocation of "cli connectivity test" (with
// "--include-upgrade-test" included).
return ct.Run(ctx)
}
}

// Run all tests without any policies in place.
noPoliciesScenarios := []check.Scenario{
tests.PodToPod(),
Expand Down
103 changes: 103 additions & 0 deletions connectivity/tests/upgrade.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package tests

import (
"context"
gojson "encoding/json"
"os"
"strconv"

"github.com/cilium/cilium-cli/connectivity/check"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// NoInterruptedConnections checks whether there are no interruptions in
// long-lived E/W LB connections. The test case is used to validate Cilium
// upgrades.
//
// The test case consists of three steps:
//
//  1. Deploying pods and a service which establish the long-lived connections
//     (done by "--upgrade-test-setup"). The client pods ("migrate-svc-client")
//     establish connections via ClusterIP ("migrate-svc") to server pods
//     ("migrate-svc-server"). As the client pods can come up before the server
//     pods, the clients may crash, which increases their pod restart counters.
//     This step is responsible for storing the restart counters too.
//  2. Do Cilium upgrade.
//  3. Run the test ("--include-upgrade-test"). The test checks the restart
//     counters, and compares them against the previously stored ones. A
//     mismatch indicates that a connection was interrupted.
func NoInterruptedConnections() check.Scenario {
	return &noInterruptedConnections{}
}

// noInterruptedConnections implements the "no-interrupted-connections"
// upgrade scenario. It is stateless; all inputs come from the test context.
type noInterruptedConnections struct{}

// Name returns the scenario name reported in the connectivity test output.
func (n *noInterruptedConnections) Name() string {
	const scenarioName = "no-interrupted-connections"
	return scenarioName
}

// Run executes the scenario. It lists all migrate-svc pods and collects the
// restart counter of each pod's first container.
//
// When invoked with --upgrade-test-setup, it only stores the counters as JSON
// at ct.Params().UpgradeTestResultPath and returns. Otherwise it reads the
// previously stored counters back and fails if any pod is missing from the
// stored set or its counter changed — a restart of a migrate-svc pod means
// its long-lived flow was interrupted.
func (n *noInterruptedConnections) Run(ctx context.Context, t *check.Test) {
	ct := t.Context()

	client := ct.K8sClient()
	pods, err := client.ListPods(ctx, ct.Params().TestNamespace, metav1.ListOptions{LabelSelector: "kind=" + check.KindMigrateSvc})
	if err != nil {
		t.Fatalf("Unable to list migrate-svc pods: %s", err)
	}
	if len(pods.Items) == 0 {
		t.Fatal("No migrate-svc-{client,server} pods found")
	}

	restartCount := make(map[string]string, len(pods.Items))
	for _, pod := range pods.Items {
		// A pod which has not started yet has no container statuses;
		// indexing ContainerStatuses[0] unconditionally would panic.
		if len(pod.Status.ContainerStatuses) == 0 {
			t.Fatalf("Pod %s has no container status (not started?)", pod.GetObjectMeta().GetName())
		}
		restartCount[pod.GetObjectMeta().GetName()] = strconv.Itoa(int(pod.Status.ContainerStatuses[0].RestartCount))
	}

	// Setup invocation: only store the restart counters, to be compared
	// against by a later invocation of the same test case (without
	// --upgrade-test-setup).
	if ct.Params().UpgradeTestSetup {
		j, err := gojson.Marshal(restartCount)
		if err != nil {
			t.Fatalf("Failed to marshal JSON: %s", err)
		}
		// os.WriteFile reports create/write/close errors in one call,
		// unlike os.Create + deferred Close which drops the close error.
		// 0o666 matches the mode os.Create would have used.
		if err := os.WriteFile(ct.Params().UpgradeTestResultPath, j, 0o666); err != nil {
			t.Fatalf("Failed to write upgrade test temp results to %q: %s",
				ct.Params().UpgradeTestResultPath, err)
		}

		return
	}

	b, err := os.ReadFile(ct.Params().UpgradeTestResultPath)
	if err != nil {
		t.Fatalf("Failed to read upgrade test result files: %s", err)
	}
	prevRestartCount := make(map[string]string)
	if err := gojson.Unmarshal(b, &prevRestartCount); err != nil {
		t.Fatalf("Failed to unmarshal JSON test result file: %s", err)
	}

	for pod, count := range restartCount {
		prevCount, found := prevRestartCount[pod]
		if !found {
			t.Fatalf("Could not find restart count for pod %s", pod)
		}
		if prevCount != count {
			// A changed restart counter means the pod crashed between the
			// setup run and now, i.e. its flow was interrupted.
			t.Fatalf("Pod %s flow was interrupted (restart count does not match %s != %s)",
				pod, prevCount, count)
		}
	}
}
3 changes: 3 additions & 0 deletions internal/cli/cmd/connectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,9 @@ func newCmdConnectivityTest(hooks Hooks) *cobra.Command {

initSysdumpFlags(cmd, &params.SysdumpOptions, "sysdump-", hooks)

cmd.Flags().BoolVar(&params.IncludeUpgradeTest, "include-upgrade-test", false, "Include upgrade test")
cmd.Flags().BoolVar(&params.UpgradeTestSetup, "upgrade-test-setup", false, "Set up upgrade test dependencies")
cmd.Flags().StringVar(&params.UpgradeTestResultPath, "upgrade-test-result-path", "/tmp/cilium-upgrade-test-restart-counts", "Upgrade test temporary result file (used internally)")
cmd.Flags().BoolVar(&params.FlushCT, "flush-ct", false, "Flush conntrack of Cilium on each node")

hooks.AddConnectivityTestFlags(cmd.Flags())
Expand Down

0 comments on commit 4c228f7

Please sign in to comment.