Test various upgrade scenarios #1580

Merged: 8 commits, May 3, 2023
34 changes: 34 additions & 0 deletions e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go
@@ -381,4 +381,38 @@ var _ = Describe("Operator HA Upgrades", Label("e2e"), func() {
EntryDescription("Upgrade from %[1]s to %[2]s with network link that drops some packets"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"Test ha cluster generation number during upgrade",
func(beforeVersion string, targetVersion string) {
clusterSetup(beforeVersion, false)

initialGeneration := 0
for _, singleCluster := range fdbCluster.GetAllClusters() {
status := singleCluster.GetStatus()
if status.Cluster.Generation > initialGeneration {
initialGeneration = status.Cluster.Generation
}
}

// Start the upgrade, but do not wait for reconciliation to complete.
Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())

Eventually(func() bool {
for _, singleCluster := range fdbCluster.GetAllClusters() {
if singleCluster.GetCluster().Status.RunningVersion != targetVersion {
return false
}
// Verify that the cluster generation number doesn't increase by more
// than 9 (the number of recoveries we expect during an HA
// cluster upgrade).
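// In FoundationDB every recovery bumps the cluster generation, so bounding
// the generation also bounds the number of recoveries the upgrade causes.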
status := singleCluster.GetStatus()
Expect(status.Cluster.Generation).To(BeNumerically("<=", initialGeneration+9))
}
return true
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue())
},
EntryDescription("Upgrade, with cluster generation test, from %s to %s"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)
})
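An aside on the new test above: the max-generation scan is a pattern other HA tests could reuse. Below is a minimal sketch of a helper it could be factored into, assuming the fixtures.FdbCluster type and the GetAllClusters/GetStatus accessors already used in the test; the helper name is hypothetical.

// maxGeneration returns the highest cluster generation across the clusters
// of an HA configuration. Hypothetical helper, not part of the fixtures
// package; it only relies on calls that appear in the test above.
func maxGeneration(clusters []*fixtures.FdbCluster) int {
	maxGen := 0
	for _, singleCluster := range clusters {
		status := singleCluster.GetStatus()
		if status.Cluster.Generation > maxGen {
			maxGen = status.Cluster.Generation
		}
	}
	return maxGen
}

// Usage in the test body:
// initialGeneration := maxGeneration(fdbCluster.GetAllClusters())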
137 changes: 135 additions & 2 deletions e2e/test_operator_upgrades/operator_upgrades_test.go
@@ -30,6 +30,7 @@ Since FoundationDB is version incompatible for major and minor versions and the
import (
"fmt"
"log"
"strings"
"time"

fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
@@ -352,7 +353,9 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
log.Println(
"Selected coordinator:",
selectedCoordinator.Name,
" to be restarted during the staging phase",
"(podIP:",
selectedCoordinator.Status.PodIP,
") to be restarted during the staging phase",
)

// Disable the feature that the operator restarts processes. This allows us to restart the coordinator
@@ -376,6 +379,17 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
)
Expect(err).NotTo(HaveOccurred())

// Wait for the server process to restart.
time.Sleep(140 * time.Second)
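// Note: the fixed sleep is an assumption about how quickly the killed
// process comes back; polling until the fdbserver process is running again
// would be less timing-sensitive.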

// Check if the restarted process is showing up in IncompatibleConnections list in status output.
status := fdbCluster.GetStatus()
log.Println("IncompatibleProcesses:", status.Cluster.IncompatibleConnections)
Expect(status.Cluster.IncompatibleConnections).To(HaveLen(1))
// Extract the IP of the incompatible process.
incompatibleProcess := strings.Split(status.Cluster.IncompatibleConnections[0], ":")[0]
Expect(incompatibleProcess).To(Equal(selectedCoordinator.Status.PodIP))
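// Note: splitting on ":" and taking the first element assumes an IPv4
// address; an IPv6 address would itself contain colons and would need
// net.SplitHostPort style parsing instead.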

// Allow the operator to restart processes and the upgrade should continue and finish.
fdbCluster.SetKillProcesses(true)
},
@@ -386,6 +400,72 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"upgrading a cluster where a storage and multiple stateless processes get restarted during the staging phase",
func(beforeVersion string, targetVersion string) {
// We set the before version here to overwrite the before version from the specific flag;
// the specific flag will be removed in the future.
isAtLeast := factory.OperatorIsAtLeast("v1.14.0")

if !isAtLeast {
Skip("operator doesn't support feature for test case")
}

if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) {
Skip("this test case only affects version incompatible upgrades")
}

clusterSetup(beforeVersion, true)

// Select half of the stateless processes.
statelessPods := fdbCluster.GetStatelessPods()
Expect(statelessPods.Items).NotTo(BeEmpty())
selectedPods := fixtures.RandomPickPod(
statelessPods.Items,
len(statelessPods.Items)/2,
)

// Select a random storage process and append it to "selectedPods".
storagePods := fdbCluster.GetStoragePods()
Expect(storagePods.Items).NotTo(BeEmpty())
selectedPods = append(selectedPods, fixtures.RandomPickOnePod(storagePods.Items))

// Disable the feature that the operator restarts processes. This allows us to restart the processes
// selected above once all new binaries are present.
fdbCluster.SetKillProcesses(false)

// Start the upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())

// Wait until all process groups are in the staging phase and the new binaries are available.
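// (A process group carries the IncorrectCommandLine condition while its
// running command line differs from the desired one, e.g. while it still
// runs the old version after the spec was updated.)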
Eventually(func() bool {
return fdbCluster.AllProcessGroupsHaveCondition(fdbv1beta2.IncorrectCommandLine)
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue())

// Restart the processes selected above to pickup the new configuration and run with the
// newer version.
for _, selectedPod := range selectedPods {
_, _, err := fdbCluster.ExecuteCmdOnPod(
selectedPod,
fdbv1beta2.MainContainerName,
"pkill fdbserver",
false,
)
Expect(err).NotTo(HaveOccurred())
}
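// Killing fdbserver relies on fdbmonitor, the supervisor process inside the
// main container, restarting it; the restarted fdbserver then picks up the
// staged command line and runs the new binary.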

// Allow the operator to restart processes and the upgrade should continue and finish.
fdbCluster.SetKillProcesses(true)
},

EntryDescription(
"Upgrade from %[1]s to %[2]s with a storage and multiple stateless processes restarted during the staging phase",
),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"upgrading a cluster with a crash looping sidecar process",
func(beforeVersion string, targetVersion string) {
@@ -471,7 +551,9 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
log.Println(
"Selected coordinator:",
selectedCoordinator.Name,
" to be skipped during the restart",
"(podIP:",
selectedCoordinator.Status.PodIP,
") to be skipped during the restart",
)
fdbCluster.SetIgnoreDuringRestart(
[]fdbv1beta2.ProcessGroupID{
@@ -481,6 +563,9 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {

// The cluster should still be able to upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, true)).NotTo(HaveOccurred())

status := fdbCluster.GetStatus()
Expect(status.Cluster.IncompatibleConnections).To(BeEmpty())
},
EntryDescription("Upgrade from %[1]s to %[2]s with one coordinator not being restarted"),
fixtures.GenerateUpgradeTableEntries(testOptions),
@@ -874,4 +959,52 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
EntryDescription("Upgrade from %[1]s to %[2]s and one process is missing the new binary"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"upgrading a cluster when no storage processes are restarted",
func(beforeVersion string, targetVersion string) {
// We set the before version here to overwrite the before version from the specific flag;
// the specific flag will be removed in the future.
isAtLeast := factory.OperatorIsAtLeast("v1.14.0")

if !isAtLeast {
Skip("operator doesn't support feature for test case")
}

clusterSetup(beforeVersion, true)

// Select storage processes and use the buggify option to skip those
// processes during the restart command.
storagePods := fdbCluster.GetStoragePods()
Expect(storagePods.Items).NotTo(BeEmpty())

ignoreDuringRestart := make(
[]fdbv1beta2.ProcessGroupID,
0,
len(storagePods.Items),
)

for _, pod := range storagePods.Items {
ignoreDuringRestart = append(
ignoreDuringRestart,
fdbv1beta2.ProcessGroupID(pod.Labels[fdbCluster.GetCachedCluster().GetProcessGroupIDLabel()]),
)
}

log.Println(
"Selected Pods:",
ignoreDuringRestart,
"to be skipped during the restart",
)
fdbCluster.SetIgnoreDuringRestart(ignoreDuringRestart)

// The cluster should still be able to upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, true)).NotTo(HaveOccurred())
},
EntryDescription("Upgrade from %[1]s to %[2]s when no storage processes are restarted"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

})
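A closing note on the table-driven pattern used throughout: every DescribeTable above is fed by fixtures.GenerateUpgradeTableEntries(testOptions), which yields one (beforeVersion, targetVersion) entry per upgrade path. Below is a minimal sketch of how such a generator can be written with Ginkgo v2; the version-pair source and the function body are assumptions, not the actual fixtures implementation.

import ginkgo "github.com/onsi/ginkgo/v2"

// generateUpgradeTableEntries is a hypothetical sketch; the real
// fixtures.GenerateUpgradeTableEntries may derive its version pairs
// differently (e.g. from flags carried in testOptions).
func generateUpgradeTableEntries(versionPairs [][2]string) []ginkgo.TableEntry {
	entries := make([]ginkgo.TableEntry, 0, len(versionPairs))
	for _, pair := range versionPairs {
		// A nil description makes Ginkgo fall back to the table's
		// EntryDescription format string, e.g. "Upgrade from %[1]s to %[2]s".
		entries = append(entries, ginkgo.Entry(nil, pair[0], pair[1]))
	}
	return entries
}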