diff --git a/connectivity/check/check.go b/connectivity/check/check.go index bfd2d62c58..4c9c1d06a5 100644 --- a/connectivity/check/check.go +++ b/connectivity/check/check.go @@ -64,10 +64,11 @@ type Parameters struct { JunitFile string JunitProperties map[string]string - IncludeInterruptionTest bool - InterruptionTestSetup bool - InterruptionTestResultPath string - FlushCT bool + InterruptionTestForUpgrade bool + InterruptionTestForIPsecKeyRotation bool + InterruptionTestSetup bool + InterruptionTestResultPath string + FlushCT bool K8sVersion string HelmChartDirectory string diff --git a/connectivity/check/deployment.go b/connectivity/check/deployment.go index 6320e19fb5..2594ab9968 100644 --- a/connectivity/check/deployment.go +++ b/connectivity/check/deployment.go @@ -1031,7 +1031,7 @@ func (ct *ConnectivityTest) deploymentList() (srcList []string, dstList []string } } - if ct.params.IncludeInterruptionTest { + if ct.params.InterruptionTestForUpgrade { srcList = append(srcList, testConnDisruptClientDeploymentName) dstList = append(dstList, testConnDisruptServerDeploymentName) } diff --git a/connectivity/suite.go b/connectivity/suite.go index 377c9475f6..3a09a44ac8 100644 --- a/connectivity/suite.go +++ b/connectivity/suite.go @@ -206,7 +206,7 @@ func Run(ctx context.Context, ct *check.ConnectivityTest, addExtraTests func(*ch } // Interruption Test - if ct.Params().IncludeInterruptionTest { + if ct.Params().InterruptionTestForUpgrade || ct.Params().InterruptionTestForIPsecKeyRotation { ct.NewTest("post-interruption").WithScenarios( tests.NoInterruptedConnections(), ) @@ -215,7 +215,8 @@ func Run(ctx context.Context, ct *check.ConnectivityTest, addExtraTests func(*ch // Exit early, as --interruption-test-setup is only // needed to deploy pods which will be used by another // invocation of "cli connectivity test" (with include - // --include-interruption-test" + // --interruption-test-for-upgrade" or + // --interruption-test-for-ipsec-key-rotation). 
return ct.Run(ctx) } diff --git a/connectivity/tests/interruption.go b/connectivity/tests/interruption.go index aba8da6684..82aac8e5a7 100644 --- a/connectivity/tests/interruption.go +++ b/connectivity/tests/interruption.go @@ -4,16 +4,25 @@ package tests import ( + "bufio" "context" gojson "encoding/json" "os" + "sort" "strconv" + "strings" "github.com/cilium/cilium-cli/connectivity/check" + "github.com/cilium/cilium-cli/defaults" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +type interruptionCounts struct { + Restarts map[string]string `json:"restarts"` + XfrmErrors map[string]string `json:"xfrm_errors,omitempty"` +} + // NoInterruptedConnections checks whether there are no interruptions in // long-lived E/W LB connections. The test case is used to validate Cilium // upgrades, ipsec key rotation, etc. @@ -27,7 +36,8 @@ import ( // the former pods can crash which increases the pod restart counter. The step // is responsible for storing the restart counter too. // 2. Do Cilium upgrade, or ipsec key rotation, etc. -// 3. Run the test ("--include-interruption-test"). The test checks the restart +// 3. Run the test ("--interruption-test-for-upgrade" or +// "--interruption-test-for-ipsec-key-rotation"). The test checks the restart // counters, and compares them against the previously stored ones. A mismatch // indicates that a connection was interrupted.
func NoInterruptedConnections() check.Scenario { @@ -52,9 +62,53 @@ func (n *noInterruptedConnections) Run(ctx context.Context, t *check.Test) { t.Fatal("No test-conn-disrupt-{client,server} pods found") } - restartCount := make(map[string]string) + counts := interruptionCounts{ + Restarts: map[string]string{}, + } for _, pod := range pods.Items { - restartCount[pod.GetObjectMeta().GetName()] = strconv.Itoa(int(pod.Status.ContainerStatuses[0].RestartCount)) + counts.Restarts[pod.GetObjectMeta().GetName()] = strconv.Itoa(int(pod.Status.ContainerStatuses[0].RestartCount)) + } + + if ct.Params().InterruptionTestForIPsecKeyRotation { + counts.XfrmErrors = map[string]string{} + + nodes, err := client.ListNodes(ctx, metav1.ListOptions{}) + if err != nil { + t.Fatalf("Unable to list nodes: %s", err) + } + if len(nodes.Items) == 0 { + t.Fatal("No nodes found") + } + + for _, node := range nodes.Items { + ciliumPods, err := client.ListPods(ctx, "kube-system", metav1.ListOptions{LabelSelector: defaults.AgentPodSelector, FieldSelector: "spec.nodeName=" + node.GetName()}) + if err != nil { + t.Fatalf("Unable to list cilium pods: %s", err) + } + if len(ciliumPods.Items) == 0 { + t.Fatalf("No cilium pods found") + } + + encryptStatus, err := client.ExecInPod(ctx, "kube-system", ciliumPods.Items[0].GetName(), "", []string{"cilium", "encrypt", "status"}) + if err != nil { + t.Fatalf("Unable to get encrypt status: %s", err) + } + + xfrmErrors := []string{} + hasXfrmError := false + scanner := bufio.NewScanner(strings.NewReader(encryptStatus.String())) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "Errors:") { + hasXfrmError = true + } + if hasXfrmError { + xfrmErrors = append(xfrmErrors, strings.TrimSpace(line)) + } + } + sort.Strings(xfrmErrors) + counts.XfrmErrors[node.GetName()] = strings.Join(xfrmErrors, "\n") + } } // Only store restart counters which will be used later when running the same @@ -67,10 +121,6 @@ func (n *noInterruptedConnections) 
Run(ctx context.Context, t *check.Test) { } defer file.Close() - counts := make(map[string]string) - for pod, count := range restartCount { - counts[pod] = count - } j, err := gojson.Marshal(counts) if err != nil { t.Fatalf("Failed to marshal JSON: %s", err) @@ -87,17 +137,20 @@ func (n *noInterruptedConnections) Run(ctx context.Context, t *check.Test) { if err != nil { t.Fatalf("Failed to read interruption test result files: %s", err) } - prevRestartCount := make(map[string]string) - if err := gojson.Unmarshal(b, &prevRestartCount); err != nil { + prevCounts := interruptionCounts{} + if err := gojson.Unmarshal(b, &prevCounts); err != nil { t.Fatalf("Failed to unmarshal JSON test result file: %s", err) } - for pod, count := range restartCount { - if prevCount, found := prevRestartCount[pod]; !found { + for pod, restarts := range counts.Restarts { + if prevRestarts, found := prevCounts.Restarts[pod]; !found { t.Fatalf("Could not found Pod %s restart count", pod) - } else if prevCount != count { + } else if prevRestarts != restarts { t.Fatalf("Pod %s flow was interrupted (restart count does not match %s != %s)", - pod, prevCount, count) + pod, prevRestarts, restarts) } } + + if ct.Params().InterruptionTestForIPsecKeyRotation { + } } diff --git a/internal/cli/cmd/connectivity.go b/internal/cli/cmd/connectivity.go index 2533f2266f..fa2898998e 100644 --- a/internal/cli/cmd/connectivity.go +++ b/internal/cli/cmd/connectivity.go @@ -172,9 +172,10 @@ func newCmdConnectivityTest(hooks Hooks) *cobra.Command { initSysdumpFlags(cmd, ¶ms.SysdumpOptions, "sysdump-", hooks) - cmd.Flags().BoolVar(¶ms.IncludeInterruptionTest, "include-interruption-test", false, "Include interruption test (upgrade test, ipsec key rotation test, etc.)") + cmd.Flags().BoolVar(¶ms.InterruptionTestForUpgrade, "interruption-test-for-upgrade", false, "Interruption test for upgrade") + cmd.Flags().BoolVar(¶ms.InterruptionTestForIPsecKeyRotation, "interruption-test-for-ipsec-key-rotation", false, 
"Interruption test for IPsec key rotation") cmd.Flags().BoolVar(¶ms.InterruptionTestSetup, "interruption-test-setup", false, "Set up interruption test dependencies") - cmd.Flags().StringVar(¶ms.InterruptionTestResultPath, "interruption-test-result-path", "/tmp/cilium-interruption-test-restart-counts", "Interruption test temporary result file (used internally)") + cmd.Flags().StringVar(¶ms.InterruptionTestResultPath, "interruption-test-result-path", "/tmp/cilium-interruption-test-counts", "Interruption test temporary result file (used internally)") cmd.Flags().BoolVar(¶ms.FlushCT, "flush-ct", false, "Flush conntrack of Cilium on each node") hooks.AddConnectivityTestFlags(cmd.Flags())