Skip to content

Commit

Permalink
connectivity: add interruption test for IPSec key rotation
Browse files Browse the repository at this point in the history
This commit adds a new connectivity test flag,
"--interruption-test-for-ipsec-key-rotation", which dumps the xfrm error
counts for each node during the setup stage and compares the current counts
with the previously recorded ones during the check stage.

In detail, the test consists of 3 steps:

1. Run "cilium-cli connectivity test --interruption-test-setup
   --interruption-test-for-ipsec-key-rotation" to deploy
   connectivity-sensitive pods, then record the current restart counts and
   xfrm error counts.
2. Perform IPsec key rotation. This part can be done in the ci-e2e.
3. Run "cilium-cli connectivity test
   --interruption-test-for-ipsec-key-rotation" to check the restart counts
   and xfrm error counts again; the test fails if any difference is found.

Signed-off-by: Zhichuan Liang <[email protected]>
  • Loading branch information
jschwinger233 committed Jul 5, 2023
1 parent c08820e commit ade1392
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 22 deletions.
9 changes: 5 additions & 4 deletions connectivity/check/check.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,11 @@ type Parameters struct {
JunitFile string
JunitProperties map[string]string

IncludeInterruptionTest bool
InterruptionTestSetup bool
InterruptionTestResultPath string
FlushCT bool
InterruptionTestForUpgrade bool
InterruptionTestForIPsecKeyRotation bool
InterruptionTestSetup bool
InterruptionTestResultPath string
FlushCT bool

K8sVersion string
HelmChartDirectory string
Expand Down
2 changes: 1 addition & 1 deletion connectivity/check/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,7 @@ func (ct *ConnectivityTest) deploymentList() (srcList []string, dstList []string
}
}

if ct.params.IncludeInterruptionTest {
if ct.params.InterruptionTestForUpgrade {
srcList = append(srcList, testConnDisruptClientDeploymentName)
dstList = append(dstList, testConnDisruptServerDeploymentName)
}
Expand Down
5 changes: 3 additions & 2 deletions connectivity/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ func Run(ctx context.Context, ct *check.ConnectivityTest, addExtraTests func(*ch
}

// Interruption Test
if ct.Params().IncludeInterruptionTest {
if ct.Params().InterruptionTestForUpgrade || ct.Params().InterruptionTestForIPsecKeyRotation {
ct.NewTest("post-interruption").WithScenarios(
tests.NoInterruptedConnections(),
)
Expand All @@ -215,7 +215,8 @@ func Run(ctx context.Context, ct *check.ConnectivityTest, addExtraTests func(*ch
// Exit early, as --interruption-test-setup is only
// needed to deploy pods which will be used by another
// invocation of "cli connectivity test" (with include
// --include-interruption-test"
// "--interruption-test-for-upgrade" or
// "--interruption-test-for-ipsec-key-rotation").
return ct.Run(ctx)
}

Expand Down
79 changes: 66 additions & 13 deletions connectivity/tests/interruption.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,25 @@
package tests

import (
"bufio"
"context"
gojson "encoding/json"
"os"
"sort"
"strconv"
"strings"

"github.com/cilium/cilium-cli/connectivity/check"
"github.com/cilium/cilium-cli/defaults"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// interruptionCounts is the snapshot that the interruption test serializes to
// JSON during the setup run and reads back during the check run.
type interruptionCounts struct {
	// Restarts maps a test-conn-disrupt pod name to the restart count of its
	// first container, formatted as a decimal string.
	Restarts map[string]string `json:"restarts"`
	// XfrmErrors maps a node name to the sorted, newline-joined lines of the
	// "cilium encrypt status" output starting at its "Errors:" line. It is
	// only populated for the IPsec key rotation variant of the test, so it is
	// omitted from the JSON when empty.
	XfrmErrors map[string]string `json:"xfrm_errors,omitempty"`
}

// NoInterruptedConnections checks whether there are no interruptions in
// long-lived E/W LB connections. The test case is used to validate Cilium
// upgrades, ipsec key rotation, etc.
Expand All @@ -27,7 +36,8 @@ import (
// the former pods can crash which increases the pod restart counter. The step
// is responsible for storing the restart counter too.
// 2. Do Cilium upgrade, or ipsec key rotation, etc.
// 3. Run the test ("--include-interruption-test"). The test checks the restart
// 3. Run the test ("--interruption-test-for-upgrade" or
// "--interruption-test-for-ipsec-key-rotation"). The test checks the restart
// counters, and compares them against the previously stored ones. A mismatch
// indicates that a connection was interrupted.
func NoInterruptedConnections() check.Scenario {
Expand All @@ -52,9 +62,53 @@ func (n *noInterruptedConnections) Run(ctx context.Context, t *check.Test) {
t.Fatal("No test-conn-disrupt-{client,server} pods found")
}

restartCount := make(map[string]string)
counts := interruptionCounts{
Restarts: map[string]string{},
}
for _, pod := range pods.Items {
restartCount[pod.GetObjectMeta().GetName()] = strconv.Itoa(int(pod.Status.ContainerStatuses[0].RestartCount))
counts.Restarts[pod.GetObjectMeta().GetName()] = strconv.Itoa(int(pod.Status.ContainerStatuses[0].RestartCount))
}

if ct.Params().InterruptionTestForIPsecKeyRotation {
counts.XfrmErrors = map[string]string{}

nodes, err := client.ListNodes(ctx, metav1.ListOptions{})
if err != nil {
t.Fatalf("Unable to list nodes: %s", err)
}
if len(nodes.Items) == 0 {
t.Fatal("No nodes found")
}

for _, node := range nodes.Items {
ciliumPods, err := client.ListPods(ctx, "kube-system", metav1.ListOptions{LabelSelector: defaults.AgentPodSelector, FieldSelector: "spec.nodeName=" + node.GetName()})
if err != nil {
t.Fatalf("Unable to list cilium pods: %s", err)
}
if len(ciliumPods.Items) == 0 {
t.Fatalf("No cilium pods found")
}

encryptStatus, err := client.ExecInPod(ctx, "kube-system", ciliumPods.Items[0].GetName(), "", []string{"cilium", "encrypt", "status"})
if err != nil {
t.Fatalf("Unable to get encrypt status: %s", err)
}

xfrmErrors := []string{}
hasXfrmError := false
scanner := bufio.NewScanner(strings.NewReader(encryptStatus.String()))
for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, "Errors:") {
hasXfrmError = true
}
if hasXfrmError {
xfrmErrors = append(xfrmErrors, strings.TrimSpace(line))
}
}
sort.Strings(xfrmErrors)
counts.XfrmErrors[node.GetName()] = strings.Join(xfrmErrors, "\n")
}
}

// Only store restart counters which will be used later when running the same
Expand All @@ -67,10 +121,6 @@ func (n *noInterruptedConnections) Run(ctx context.Context, t *check.Test) {
}
defer file.Close()

counts := make(map[string]string)
for pod, count := range restartCount {
counts[pod] = count
}
j, err := gojson.Marshal(counts)
if err != nil {
t.Fatalf("Failed to marshal JSON: %s", err)
Expand All @@ -87,17 +137,20 @@ func (n *noInterruptedConnections) Run(ctx context.Context, t *check.Test) {
if err != nil {
t.Fatalf("Failed to read interruption test result files: %s", err)
}
prevRestartCount := make(map[string]string)
if err := gojson.Unmarshal(b, &prevRestartCount); err != nil {
prevCounts := interruptionCounts{}
if err := gojson.Unmarshal(b, &prevCounts); err != nil {
t.Fatalf("Failed to unmarshal JSON test result file: %s", err)
}

for pod, count := range restartCount {
if prevCount, found := prevRestartCount[pod]; !found {
for pod, restarts := range counts.Restarts {
if prevRestarts, found := prevCounts.Restarts[pod]; !found {
t.Fatalf("Could not found Pod %s restart count", pod)
} else if prevCount != count {
} else if prevRestarts != restarts {
t.Fatalf("Pod %s flow was interrupted (restart count does not match %s != %s)",
pod, prevCount, count)
pod, prevRestarts, restarts)
}
}

if ct.Params().InterruptionTestForIPsecKeyRotation {
}
}
5 changes: 3 additions & 2 deletions internal/cli/cmd/connectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,10 @@ func newCmdConnectivityTest(hooks Hooks) *cobra.Command {

initSysdumpFlags(cmd, &params.SysdumpOptions, "sysdump-", hooks)

cmd.Flags().BoolVar(&params.IncludeInterruptionTest, "include-interruption-test", false, "Include interruption test (upgrade test, ipsec key rotation test, etc.)")
cmd.Flags().BoolVar(&params.InterruptionTestForUpgrade, "interruption-test-for-upgrade", false, "Interruption test for upgrade")
cmd.Flags().BoolVar(&params.InterruptionTestForIPsecKeyRotation, "interruption-test-for-ipsec-key-rotation", false, "Interruption test for IPsec key rotation")
cmd.Flags().BoolVar(&params.InterruptionTestSetup, "interruption-test-setup", false, "Set up interruption test dependencies")
cmd.Flags().StringVar(&params.InterruptionTestResultPath, "interruption-test-result-path", "/tmp/cilium-interruption-test-restart-counts", "Interruption test temporary result file (used internally)")
cmd.Flags().StringVar(&params.InterruptionTestResultPath, "interruption-test-result-path", "/tmp/cilium-interruption-test-counts", "Interruption test temporary result file (used internally)")
cmd.Flags().BoolVar(&params.FlushCT, "flush-ct", false, "Flush conntrack of Cilium on each node")

hooks.AddConnectivityTestFlags(cmd.Flags())
Expand Down

0 comments on commit ade1392

Please sign in to comment.