From 65b1ce37e5b6d00a108e4fe4866614ea7150008a Mon Sep 17 00:00:00 2001 From: RainbowMango Date: Fri, 29 Nov 2024 18:45:54 +0800 Subject: [PATCH] Disable cluster failover by default which should be explicitly enabled by administrators after a fully evaluation. Signed-off-by: RainbowMango --- .../deploy/karmada-controller-manager.yaml | 2 +- .../app/controllermanager.go | 3 - .../cluster/cluster_controller_test.go | 473 ++++++++++++++++++ pkg/features/features.go | 23 +- 4 files changed, 493 insertions(+), 8 deletions(-) create mode 100644 pkg/controllers/cluster/cluster_controller_test.go diff --git a/artifacts/deploy/karmada-controller-manager.yaml b/artifacts/deploy/karmada-controller-manager.yaml index 5771b1d4ad3c..f07ee695c162 100644 --- a/artifacts/deploy/karmada-controller-manager.yaml +++ b/artifacts/deploy/karmada-controller-manager.yaml @@ -31,7 +31,7 @@ spec: - --secure-port=10357 - --failover-eviction-timeout=30s - --controllers=*,hpaScaleTargetMarker,deploymentReplicasSyncer - - --feature-gates=PropagationPolicyPreemption=true,MultiClusterService=true + - --feature-gates=Failover=true,PropagationPolicyPreemption=true,MultiClusterService=true - --v=4 livenessProbe: httpGet: diff --git a/cmd/controller-manager/app/controllermanager.go b/cmd/controller-manager/app/controllermanager.go index fb7ceab1ece8..c1a55a6193bd 100644 --- a/cmd/controller-manager/app/controllermanager.go +++ b/cmd/controller-manager/app/controllermanager.go @@ -590,9 +590,6 @@ func startGracefulEvictionController(ctx controllerscontext.Context) (enabled bo } func startApplicationFailoverController(ctx controllerscontext.Context) (enabled bool, err error) { - if !features.FeatureGate.Enabled(features.Failover) { - return false, nil - } rbApplicationFailoverController := applicationfailover.RBApplicationFailoverController{ Client: ctx.Mgr.GetClient(), EventRecorder: ctx.Mgr.GetEventRecorderFor(applicationfailover.RBApplicationFailoverControllerName), diff --git a/pkg/controllers/cluster/cluster_controller_test.go b/pkg/controllers/cluster/cluster_controller_test.go new file mode 100644 index 000000000000..e1a915dc9e77 --- /dev/null +++ b/pkg/controllers/cluster/cluster_controller_test.go @@ -0,0 +1,473 @@ +/* +Copyright 2024 The Karmada Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cluster + +import ( + "context" + "fmt" + "reflect" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1" + workv1alpha1 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha1" + workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2" + "github.com/karmada-io/karmada/pkg/features" + "github.com/karmada-io/karmada/pkg/util" + "github.com/karmada-io/karmada/pkg/util/gclient" + "github.com/karmada-io/karmada/pkg/util/names" +) + +func newClusterController() *Controller { + rbIndexerFunc := func(obj client.Object) []string { + rb, ok := obj.(*workv1alpha2.ResourceBinding) + if !ok { + return nil + } + return util.GetBindingClusterNames(&rb.Spec) + } + + crbIndexerFunc := func(obj client.Object) []string { + crb, ok := obj.(*workv1alpha2.ClusterResourceBinding) + if !ok { + return nil + } + return util.GetBindingClusterNames(&crb.Spec) + } + client := fake.NewClientBuilder().WithScheme(gclient.NewSchema()). + WithIndex(&workv1alpha2.ResourceBinding{}, rbClusterKeyIndex, rbIndexerFunc). + WithIndex(&workv1alpha2.ClusterResourceBinding{}, crbClusterKeyIndex, crbIndexerFunc). + WithStatusSubresource(&clusterv1alpha1.Cluster{}).Build() + return &Controller{ + Client: client, + EventRecorder: record.NewFakeRecorder(1024), + clusterHealthMap: newClusterHealthMap(), + EnableTaintManager: true, + ClusterMonitorGracePeriod: 40 * time.Second, + } +} + +func TestController_Reconcile(t *testing.T) { + req := controllerruntime.Request{NamespacedName: types.NamespacedName{Name: "test-cluster"}} + tests := []struct { + name string + cluster *clusterv1alpha1.Cluster + ns *corev1.Namespace + work *workv1alpha1.Work + del bool + wCluster *clusterv1alpha1.Cluster + want controllerruntime.Result + wantErr bool + }{ + { + name: "cluster without status", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + }, + wCluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + Taints: []corev1.Taint{{ + Key: clusterv1alpha1.TaintClusterNotReady, + Effect: corev1.TaintEffectNoSchedule, + }}, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{}, + }, + }, + want: controllerruntime.Result{}, + wantErr: false, + }, + { + name: "cluster with ready condition", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + Taints: []corev1.Taint{{ + Key: clusterv1alpha1.TaintClusterNotReady, + Effect: corev1.TaintEffectNoSchedule, + }}, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wCluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{Taints: []corev1.Taint{}}, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + want: controllerruntime.Result{}, + wantErr: false, + }, + { + name: "cluster with unknown condition", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + Taints: []corev1.Taint{{ + Key: clusterv1alpha1.TaintClusterNotReady, + Effect: corev1.TaintEffectNoSchedule, + }}, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionUnknown, + }, + }, + }, + }, + wCluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{Taints: []corev1.Taint{ + { + Key: clusterv1alpha1.TaintClusterUnreachable, + Effect: corev1.TaintEffectNoSchedule, + }, + }}, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionUnknown, + }, + }, + }, + }, + want: controllerruntime.Result{}, + wantErr: false, + }, + { + name: "cluster with false condition", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + Taints: []corev1.Taint{{ + Key: clusterv1alpha1.TaintClusterUnreachable, + Effect: corev1.TaintEffectNoSchedule, + }}, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionFalse, + }, + }, + }, + }, + wCluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{Taints: []corev1.Taint{ + { + Key: clusterv1alpha1.TaintClusterNotReady, + Effect: corev1.TaintEffectNoSchedule, + }, + }}, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionFalse, + }, + }, + }, + }, + want: controllerruntime.Result{}, + wantErr: false, + }, + { + name: "cluster not found", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster-noexist", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + }, + want: controllerruntime.Result{}, + wantErr: false, + }, + { + name: "remove cluster failed", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + SyncMode: clusterv1alpha1.Pull, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionFalse, + }, + }, + }, + }, + ns: &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: names.GenerateExecutionSpaceName("test-cluster"), + }, + }, + work: &workv1alpha1.Work{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-work", + Namespace: names.GenerateExecutionSpaceName("test-cluster"), + Finalizers: []string{util.ExecutionControllerFinalizer}, + }, + }, + del: true, + want: controllerruntime.Result{}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := newClusterController() + if tt.cluster != nil { + if err := c.Create(context.Background(), tt.cluster, &client.CreateOptions{}); err != nil { + t.Fatalf("failed to create cluster %v", err) + } + } + + if tt.ns != nil { + if err := c.Create(context.Background(), tt.ns, &client.CreateOptions{}); err != nil { + t.Fatalf("failed to create ns %v", err) + } + } + + if tt.work != nil { + if err := c.Create(context.Background(), tt.work, &client.CreateOptions{}); err != nil { + t.Fatalf("failed to create work %v", err) + } + } + + if tt.del { + if err := c.Delete(context.Background(), tt.cluster, &client.DeleteOptions{}); err != nil { + t.Fatalf("failed to delete cluster %v", err) + } + } + + got, err := c.Reconcile(context.Background(), req) + if (err != nil) != tt.wantErr { + t.Errorf("Controller.Reconcile() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("Controller.Reconcile() = %v, want %v", got, tt.want) + return + } + + if tt.wCluster != nil { + cluster := &clusterv1alpha1.Cluster{} + if err := c.Get(context.Background(), types.NamespacedName{Name: tt.cluster.Name}, cluster, &client.GetOptions{}); err != nil { + t.Errorf("failed to get cluster %v", err) + return + } + + cleanUpCluster(cluster) + if !reflect.DeepEqual(cluster, tt.wCluster) { + t.Errorf("Cluster resource reconcile get %v, want %v", *cluster, *tt.wCluster) + } + } + }) + } +} + +func TestController_monitorClusterHealth(t *testing.T) { + tests := []struct { + name string + cluster *clusterv1alpha1.Cluster + wCluster *clusterv1alpha1.Cluster + wantErr bool + }{ + { + name: "cluster without status", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + SyncMode: clusterv1alpha1.Pull, + }, + }, + wCluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + SyncMode: clusterv1alpha1.Pull, + Taints: []corev1.Taint{ + { + Key: clusterv1alpha1.TaintClusterUnreachable, + Effect: corev1.TaintEffectNoExecute, + }, + }, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{{ + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionUnknown, + Reason: "ClusterStatusNeverUpdated", + Message: "Cluster status controller never posted cluster status.", + }}, + }, + }, + wantErr: false, + }, + { + name: "cluster with ready condition", + cluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + SyncMode: clusterv1alpha1.Pull, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wCluster: &clusterv1alpha1.Cluster{ + ObjectMeta: controllerruntime.ObjectMeta{ + Name: "test-cluster", + Finalizers: []string{util.ClusterControllerFinalizer}, + }, + Spec: clusterv1alpha1.ClusterSpec{ + SyncMode: clusterv1alpha1.Pull, + Taints: []corev1.Taint{}, + }, + Status: clusterv1alpha1.ClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1alpha1.ClusterConditionReady, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := features.FeatureGate.Set(fmt.Sprintf("%s=%t", features.Failover, true)) + if err != nil { + t.Fatalf("Failed to enable failover feature gate: %v", err) + } + c := newClusterController() + if tt.cluster != nil { + if err = c.Create(context.Background(), tt.cluster, &client.CreateOptions{}); err != nil { + t.Fatalf("failed to create cluster: %v", err) + } + } + + if err = c.monitorClusterHealth(context.Background()); (err != nil) != tt.wantErr { + t.Errorf("Controller.monitorClusterHealth() error = %v, wantErr %v", err, tt.wantErr) + return + } + + cluster := &clusterv1alpha1.Cluster{} + if err = c.Get(context.Background(), types.NamespacedName{Name: "test-cluster"}, cluster, &client.GetOptions{}); err != nil { + t.Errorf("failed to get cluster: %v", err) + return + } + + cleanUpCluster(cluster) + if !reflect.DeepEqual(cluster, tt.wCluster) { + t.Errorf("Cluster resource get %+v, want %+v", *cluster, *tt.wCluster) + return + } + }) + } +} + +// cleanUpCluster removes unnecessary fields from Cluster resource for testing purposes. +func cleanUpCluster(c *clusterv1alpha1.Cluster) { + c.ObjectMeta.ResourceVersion = "" + + taints := []corev1.Taint{} + for _, taint := range c.Spec.Taints { + taint.TimeAdded = nil + taints = append(taints, taint) + } + c.Spec.Taints = taints + + cond := []metav1.Condition{} + for _, condition := range c.Status.Conditions { + condition.LastTransitionTime = metav1.Time{} + cond = append(cond, condition) + } + c.Status.Conditions = cond +} diff --git a/pkg/features/features.go b/pkg/features/features.go index 9c0938741797..9393741900cb 100644 --- a/pkg/features/features.go +++ b/pkg/features/features.go @@ -22,11 +22,22 @@ import ( ) const ( - // Failover indicates if scheduler should reschedule on cluster failure. + // Failover controls whether the scheduler should reschedule + // workloads on cluster failure. + // When enabled, Karmada will automatically migrate workloads + // from a failed cluster to other available clusters. + // + // Note: This feature does not control application failover, + // which is managed separately via the PropagationPolicy or + // ClusterPropagationPolicy. Failover featuregate.Feature = "Failover" - // GracefulEviction indicates if enable grace eviction. - // Takes effect only when the Failover feature is enabled. + // GracefulEviction controls whether to perform graceful evictions + // during both cluster failover and application failover. + // When used for cluster failover, it takes effect only when the + // Failover feature is enabled. + // Graceful eviction ensures that workloads are migrated in a + // controlled manner, minimizing disruption to applications. GracefulEviction featuregate.Feature = "GracefulEviction" // PropagateDeps indicates if relevant resources should be propagated automatically @@ -51,7 +62,11 @@ var ( // DefaultFeatureGates is the default feature gates of Karmada. DefaultFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{ - Failover: {Default: true, PreRelease: featuregate.Beta}, + // Failover(cluster failover) is disabled by default because it involves migrating + // all resources in the cluster, which can have significant impacts, it should be + // explicitly enabled by administrators after fully evaluation to avoid unexpected + // incidents. + Failover: {Default: false, PreRelease: featuregate.Beta}, GracefulEviction: {Default: true, PreRelease: featuregate.Beta}, PropagateDeps: {Default: true, PreRelease: featuregate.Beta}, CustomizedClusterResourceModeling: {Default: true, PreRelease: featuregate.Beta},