Test various upgrade scenarios #1580

Merged: 8 commits, May 3, 2023
34 changes: 34 additions & 0 deletions e2e/test_operator_ha_upgrades/operator_ha_upgrade_test.go
@@ -381,4 +381,38 @@ var _ = Describe("Operator HA Upgrades", Label("e2e"), func() {
EntryDescription("Upgrade from %[1]s to %[2]s with network link that drops some packets"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"Test ha cluster generation number during upgrade",
func(beforeVersion string, targetVersion string) {
clusterSetup(beforeVersion, false)

initialGeneration := 0
for _, singleCluster := range fdbCluster.GetAllClusters() {
status := singleCluster.GetStatus()
if status.Cluster.Generation > initialGeneration {
initialGeneration = status.Cluster.Generation
}
}

// Start the upgrade, but do not wait for reconciliation to complete.
Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())

Eventually(func() bool {
for _, singleCluster := range fdbCluster.GetAllClusters() {
if singleCluster.GetCluster().Status.RunningVersion != targetVersion {
return false
}
// Verify that the cluster generation number doesn't increase by more
// than 9 (the number of recoveries we expect during an HA
// cluster upgrade).
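// In FoundationDB every recovery bumps the cluster generation, so bounding
// the generation also bounds the number of recoveries the upgrade causes.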
status := singleCluster.GetStatus()
Expect(status.Cluster.Generation).To(BeNumerically("<=", initialGeneration+9))
}
return true
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue())
},
EntryDescription("Upgrade, with cluster generation test, from %s to %s"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)
})
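An aside on the new test above: the max-generation scan is a pattern other HA tests could reuse. Below is a minimal sketch of a helper it could be factored into, assuming the fixtures.FdbCluster type and the GetAllClusters/GetStatus accessors already used in the test; the helper name is hypothetical.

// maxGeneration returns the highest cluster generation across the clusters
// of an HA configuration. Hypothetical helper, not part of the fixtures
// package; it only relies on calls that appear in the test above.
func maxGeneration(clusters []*fixtures.FdbCluster) int {
	maxGen := 0
	for _, singleCluster := range clusters {
		status := singleCluster.GetStatus()
		if status.Cluster.Generation > maxGen {
			maxGen = status.Cluster.Generation
		}
	}
	return maxGen
}

// Usage in the test body:
// initialGeneration := maxGeneration(fdbCluster.GetAllClusters())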
137 changes: 135 additions & 2 deletions e2e/test_operator_upgrades/operator_upgrades_test.go
@@ -30,6 +30,7 @@ Since FoundationDB is version incompatible for major and minor versions and the
import (
"fmt"
"log"
"strings"
"time"

fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/api/v1beta2"
@@ -352,7 +353,9 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
log.Println(
"Selected coordinator:",
selectedCoordinator.Name,
" to be restarted during the staging phase",
"(podIP:",
selectedCoordinator.Status.PodIP,
") to be restarted during the staging phase",
)

// Disable the feature that the operator restarts processes. This allows us to restart the coordinator
@@ -376,6 +379,17 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
)
Expect(err).NotTo(HaveOccurred())

// Wait for the server process to restart.
time.Sleep(140 * time.Second)
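// Note: the fixed sleep is an assumption about how quickly the killed
// process comes back; polling until the fdbserver process is running again
// would be less timing-sensitive.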

// Check if the restarted process is showing up in IncompatibleConnections list in status output.
status := fdbCluster.GetStatus()
log.Println("IncompatibleProcesses:", status.Cluster.IncompatibleConnections)
Expect(status.Cluster.IncompatibleConnections).To(HaveLen(1))
// Extract the IP of the incompatible process.
incompatibleProcess := strings.Split(status.Cluster.IncompatibleConnections[0], ":")[0]
Expect(incompatibleProcess).To(Equal(selectedCoordinator.Status.PodIP))
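// Note: splitting on ":" and taking the first element assumes an IPv4
// address; an IPv6 address would itself contain colons and would need
// net.SplitHostPort style parsing instead.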

// Allow the operator to restart processes and the upgrade should continue and finish.
fdbCluster.SetKillProcesses(true)
},
@@ -386,6 +400,72 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"upgrading a cluster where a storage and multiple stateless processes get restarted during the staging phase",
func(beforeVersion string, targetVersion string) {
// We set the before version here to overwrite the before version from the specific flag;
// the specific flag will be removed in the future.
isAtLeast := factory.OperatorIsAtLeast("v1.14.0")

if !isAtLeast {
Skip("operator doesn't support feature for test case")
}

if fixtures.VersionsAreProtocolCompatible(beforeVersion, targetVersion) {
Skip("this test case only affects version incompatible upgrades")
}

clusterSetup(beforeVersion, true)

// Select half of the stateless processes.
statelessPods := fdbCluster.GetStatelessPods()
Expect(statelessPods.Items).NotTo(BeEmpty())
selectedPods := fixtures.RandomPickPod(
statelessPods.Items,
len(statelessPods.Items)/2,
)

// Select a random storage process and append it to "selectedPods".
storagePods := fdbCluster.GetStoragePods()
Expect(storagePods.Items).NotTo(BeEmpty())
selectedPods = append(selectedPods, fixtures.RandomPickOnePod(storagePods.Items))

// Disable the feature that the operator restarts processes. This allows us to restart the processes
// selected above once all new binaries are present.
fdbCluster.SetKillProcesses(false)

// Start the upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, false)).NotTo(HaveOccurred())

// Wait until all process groups are in the staging phase and the new binaries are available.
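// (A process group carries the IncorrectCommandLine condition while its
// running command line differs from the desired one, e.g. while it still
// runs the old version after the spec was updated.)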
Eventually(func() bool {
return fdbCluster.AllProcessGroupsHaveCondition(fdbv1beta2.IncorrectCommandLine)
}).WithTimeout(10 * time.Minute).WithPolling(2 * time.Second).Should(BeTrue())

// Restart the processes selected above to pickup the new configuration and run with the
// newer version.
for _, selectedPod := range selectedPods {
_, _, err := fdbCluster.ExecuteCmdOnPod(
selectedPod,
fdbv1beta2.MainContainerName,
"pkill fdbserver",
false,
)
Expect(err).NotTo(HaveOccurred())
}
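// Killing fdbserver relies on fdbmonitor, the supervisor process inside the
// main container, restarting it; the restarted fdbserver then picks up the
// staged command line and runs the new binary.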

// Allow the operator to restart processes and the upgrade should continue and finish.
fdbCluster.SetKillProcesses(true)
},

EntryDescription(
"Upgrade from %[1]s to %[2]s with a storage and multiple stateless processes restarted during the staging phase",
),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"upgrading a cluster with a crash looping sidecar process",
func(beforeVersion string, targetVersion string) {
@@ -471,7 +551,9 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
log.Println(
"Selected coordinator:",
selectedCoordinator.Name,
" to be skipped during the restart",
"(podIP:",
selectedCoordinator.Status.PodIP,
") to be skipped during the restart",
)
fdbCluster.SetIgnoreDuringRestart(
[]fdbv1beta2.ProcessGroupID{
@@ -481,6 +563,9 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {

// The cluster should still be able to upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, true)).NotTo(HaveOccurred())

status := fdbCluster.GetStatus()
Expect(status.Cluster.IncompatibleConnections).To(BeEmpty())
},
EntryDescription("Upgrade from %[1]s to %[2]s with one coordinator not being restarted"),
fixtures.GenerateUpgradeTableEntries(testOptions),
@@ -874,4 +959,52 @@ var _ = Describe("Operator Upgrades", Label("e2e"), func() {
EntryDescription("Upgrade from %[1]s to %[2]s and one process is missing the new binary"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

DescribeTable(
"upgrading a cluster when no storage processes are restarted",
func(beforeVersion string, targetVersion string) {
// We set the before version here to overwrite the before version from the specific flag;
// the specific flag will be removed in the future.
isAtLeast := factory.OperatorIsAtLeast("v1.14.0")

if !isAtLeast {
Skip("operator doesn't support feature for test case")
}

clusterSetup(beforeVersion, true)

// Select storage processes and use the buggify option to skip those
// processes during the restart command.
storagePods := fdbCluster.GetStoragePods()
Expect(storagePods.Items).NotTo(BeEmpty())

ignoreDuringRestart := make(
[]fdbv1beta2.ProcessGroupID,
0,
len(storagePods.Items),
)

for _, pod := range storagePods.Items {
ignoreDuringRestart = append(
ignoreDuringRestart,
fdbv1beta2.ProcessGroupID(pod.Labels[fdbCluster.GetCachedCluster().GetProcessGroupIDLabel()]),
)
}

log.Println(
"Selected Pods:",
ignoreDuringRestart,
"to be skipped during the restart",
)
fdbCluster.SetIgnoreDuringRestart(ignoreDuringRestart)

// The cluster should still be able to upgrade.
Expect(fdbCluster.UpgradeCluster(targetVersion, true)).NotTo(HaveOccurred())
},
EntryDescription("Upgrade from %[1]s to %[2]s when no storage processes are restarted"),
fixtures.GenerateUpgradeTableEntries(testOptions),
)

})
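A closing note on the table-driven pattern used throughout: every DescribeTable above is fed by fixtures.GenerateUpgradeTableEntries(testOptions), which yields one (beforeVersion, targetVersion) entry per upgrade path. Below is a minimal sketch of how such a generator can be written with Ginkgo v2; the version-pair source and the function body are assumptions, not the actual fixtures implementation.

import ginkgo "github.com/onsi/ginkgo/v2"

// generateUpgradeTableEntries is a hypothetical sketch; the real
// fixtures.GenerateUpgradeTableEntries may derive its version pairs
// differently (e.g. from flags carried in testOptions).
func generateUpgradeTableEntries(versionPairs [][2]string) []ginkgo.TableEntry {
	entries := make([]ginkgo.TableEntry, 0, len(versionPairs))
	for _, pair := range versionPairs {
		// A nil description makes Ginkgo fall back to the table's
		// EntryDescription format string, e.g. "Upgrade from %[1]s to %[2]s".
		entries = append(entries, ginkgo.Entry(nil, pair[0], pair[1]))
	}
	return entries
}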