From 9a92392f63faac6d7b984758bb7180340bc35164 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Wed, 14 Feb 2024 10:53:29 +0100 Subject: [PATCH 1/2] Don't skip the exclusion for process groups without an address when using localities --- .../replace_failed_process_groups_test.go | 35 +++++++++++++++++-- .../replace_failed_process_groups.go | 5 ++- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/controllers/replace_failed_process_groups_test.go b/controllers/replace_failed_process_groups_test.go index 6f3167eb4..5d0ca9345 100644 --- a/controllers/replace_failed_process_groups_test.go +++ b/controllers/replace_failed_process_groups_test.go @@ -99,8 +99,7 @@ var _ = Describe("replace_failed_process_groups", func() { cluster.Spec.AutomationOptions.Replacements.FailureDetectionTimeSeconds = pointer.Int(5) cluster.Spec.AutomationOptions.Replacements.TaintReplacementTimeSeconds = pointer.Int(1) // Update cluster config so that generic reconciliation will work - err := k8sClient.Update(ctx.TODO(), cluster) - Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient.Update(ctx.TODO(), cluster)).NotTo(HaveOccurred()) adminClient, err = mock.NewMockAdminClientUncast(cluster, k8sClient) @@ -918,6 +917,38 @@ var _ = Describe("replace_failed_process_groups", func() { Expect(getRemovedProcessGroupIDs(cluster)).To(Equal([]fdbv1beta2.ProcessGroupID{})) }) }) + + When("the cluster uses localities for exclusions", func() { + BeforeEach(func() { + processGroup := fdbv1beta2.FindProcessGroupByID(cluster.Status.ProcessGroups, "storage-2") + processGroup.Addresses = nil + + cluster.Spec.Version = fdbv1beta2.Versions.SupportsLocalityBasedExclusions71.String() + cluster.Status.RunningVersion = fdbv1beta2.Versions.SupportsLocalityBasedExclusions71.String() + cluster.Spec.AutomationOptions.UseLocalitiesForExclusion = pointer.Bool(true) + Expect(k8sClient.Update(ctx.TODO(), cluster)).NotTo(HaveOccurred()) + }) + + It("should requeue", func() { + Expect(result).NotTo(BeNil()) + Expect(result.message).To(Equal("Removals have been updated in the cluster status")) + }) + + It("should mark the process group for removal", func() { + Expect(getRemovedProcessGroupIDs(cluster)).To(Equal([]fdbv1beta2.ProcessGroupID{"storage-2"})) + }) + + It("should not skip the exclusion", func() { + for _, pg := range cluster.Status.ProcessGroups { + if pg.ProcessGroupID != "storage-2" { + continue + } + + Expect(pg.ExclusionSkipped).To(BeFalse()) + } + }) + + }) }) Context("with maintenance mode enabled", func() { diff --git a/internal/replacements/replace_failed_process_groups.go b/internal/replacements/replace_failed_process_groups.go index 4d173ba50..4c2e41243 100644 --- a/internal/replacements/replace_failed_process_groups.go +++ b/internal/replacements/replace_failed_process_groups.go @@ -54,6 +54,7 @@ func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationD maxReplacements := getMaxReplacements(cluster, cluster.GetMaxConcurrentAutomaticReplacements()) hasReplacement := false crashLoopContainerProcessGroups := cluster.GetCrashLoopContainerProcessGroups() + localitiesUsedForExclusion := cluster.UseLocalitiesForExclusion() for _, processGroupStatus := range cluster.Status.ProcessGroups { // If a process group is already marked for removal we can skip it here. @@ -92,7 +93,9 @@ func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationD } skipExclusion := false - if len(processGroupStatus.Addresses) == 0 { + // Only if localities are not used for exclusions we want to skip the exclusion. Skipping the exclusion could + // lead to a race condition, see: https://github.com/FoundationDB/fdb-kubernetes-operator/issues/1890 + if len(processGroupStatus.Addresses) == 0 && !localitiesUsedForExclusion { if !hasDesiredFaultTolerance { log.Info( "Skip process group with missing address", From 1efcc61610ffe3926ff556efabb4a1d335bd9fe8 Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Wed, 14 Feb 2024 13:30:21 +0100 Subject: [PATCH 2/2] Update comment --- internal/replacements/replace_failed_process_groups.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/replacements/replace_failed_process_groups.go b/internal/replacements/replace_failed_process_groups.go index 4c2e41243..653dbed29 100644 --- a/internal/replacements/replace_failed_process_groups.go +++ b/internal/replacements/replace_failed_process_groups.go @@ -93,8 +93,10 @@ func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationD } skipExclusion := false - // Only if localities are not used for exclusions we want to skip the exclusion. Skipping the exclusion could - // lead to a race condition, see: https://github.com/FoundationDB/fdb-kubernetes-operator/issues/1890 + // Only if localities are not used for exclusions we should be skipping the exclusion. + // Skipping the exclusion could lead to a race condition, which can be prevented if + // we are able to exclude by locality. + // see: https://github.com/FoundationDB/fdb-kubernetes-operator/issues/1890 if len(processGroupStatus.Addresses) == 0 && !localitiesUsedForExclusion { if !hasDesiredFaultTolerance { log.Info(