From 6a4bdd4b8f9255e0b5c82e97ad120b39b1b87e9a Mon Sep 17 00:00:00 2001
From: Paul Chaignon <paul@cilium.io>
Date: Thu, 27 Apr 2023 00:01:47 +0200
Subject: [PATCH] connectivity: Retry on inconclusive results

When running the connectivity tests in AKS, we sometimes get interrupted
commands that don't have any output [1]. Unfortunately, those commands
then exit without any error and are therefore considered successful. We
think this is caused by connectivity blips between Kubernetes
components.

This commit adds a check for those inconclusive results. If we see a
seemingly successful command with no output, we retry it until we get
something conclusive. This works because all our test commands (curl,
ping, nslookup) dump something to stdout.

1 - https://github.com/cilium/cilium/issues/22162
Signed-off-by: Paul Chaignon <paul@cilium.io>
---
 connectivity/check/action.go | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/connectivity/check/action.go b/connectivity/check/action.go
index 27bd7b1eec..a7135dbc47 100644
--- a/connectivity/check/action.go
+++ b/connectivity/check/action.go
@@ -4,6 +4,7 @@
 package check
 
 import (
+	"bytes"
 	"context"
 	"encoding/base64"
 	"errors"
@@ -28,6 +29,10 @@ import (
 	"github.com/cilium/cilium-cli/defaults"
 )
 
+const (
+	testCommandRetries = 3 // Maximum number of attempts (first run included) for a test command whose result is inconclusive.
+)
+
 // Action represents an individual action (e.g. a curl call) in a Scenario
 // between a source and a destination peer.
 type Action struct {
@@ -236,13 +241,33 @@ func (a *Action) ExecInPod(ctx context.Context, cmd []string) {
 	pod := a.src
 
 	a.Debug("Executing command", cmd)
-
-	output, err := pod.K8sClient.ExecInPod(ctx,
-		pod.Pod.Namespace, pod.Pod.Name, pod.Pod.Labels["name"], cmd)
-
 	cmdName := cmd[0]
 	cmdStr := strings.Join(cmd, " ")
-	a.cmdOutput = output.String()
+
+	var output bytes.Buffer
+	var err error
+	// We retry the command in case of inconclusive results. The result is
+	// deemed inconclusive when the command succeeded, but we don't have any
+	// output. We've seen this happen when there are connectivity blips on the
+	// k8s side.
+	// This check currently only works because all our test commands are
+	// expected to produce some output.
+	for i := 1; i <= testCommandRetries; i++ {
+		output, err = pod.K8sClient.ExecInPod(ctx,
+			pod.Pod.Namespace, pod.Pod.Name, pod.Pod.Labels["name"], cmd)
+		a.cmdOutput = output.String()
+		// Check for inconclusive results.
+		if err == nil && strings.TrimSpace(a.cmdOutput) == "" {
+			a.Debugf("retrying command %s due to inconclusive results", cmdStr)
+			continue
+		}
+		break
+	}
+	// Still inconclusive after the last attempt: report a failure.
+	if err == nil && strings.TrimSpace(a.cmdOutput) == "" {
+		a.Failf("inconclusive results: command %q was successful but without output", cmdStr)
+	}
+
 	showOutput := false
 	expectedExitCode := a.expectedExitCode()
 	if err != nil {