diff --git a/helm-chart/ray-cluster/values.yaml b/helm-chart/ray-cluster/values.yaml
index f8eab8dc707..d65e220e087 100644
--- a/helm-chart/ray-cluster/values.yaml
+++ b/helm-chart/ray-cluster/values.yaml
@@ -46,6 +46,8 @@ head:
   #   cpu: "500m"
   #   memory: "512Mi"
   labels: {}
+  # Note: From KubeRay v0.6.0, users need to create the ServiceAccount themselves if they specify `serviceAccountName`
+  # in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details.
   serviceAccountName: ""
   rayStartParams:
     dashboard-host: '0.0.0.0'
diff --git a/ray-operator/config/samples/ray-service.autoscaler.yaml b/ray-operator/config/samples/ray-service.autoscaler.yaml
index 0b9bc2987c0..fc0c41e577f 100644
--- a/ray-operator/config/samples/ray-service.autoscaler.yaml
+++ b/ray-operator/config/samples/ray-service.autoscaler.yaml
@@ -63,6 +63,9 @@ spec:
       #pod template
       template:
         spec:
+          # Note: From KubeRay v0.6.0, users need to create the ServiceAccount themselves if they specify `serviceAccountName`
+          # in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details.
+          # serviceAccountName: my-sa
           containers:
           - name: ray-head
             image: rayproject/ray:2.4.0
diff --git a/ray-operator/controllers/ray/raycluster_controller.go b/ray-operator/controllers/ray/raycluster_controller.go
index 53ddb6d551b..896c5763d6a 100644
--- a/ray-operator/controllers/ray/raycluster_controller.go
+++ b/ray-operator/controllers/ray/raycluster_controller.go
@@ -1026,6 +1026,18 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(instance *rayio
 		return err
 	}
 
+	// If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves.
+	// However, if KubeRay creates a ServiceAccount for users, the autoscaler may encounter permission issues during
+	// zero-downtime rolling updates for RayService. See https://github.com/ray-project/kuberay/issues/1123
+	// for more details.
+	if instance.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName == namespacedName.Name {
+		r.Log.Error(err, fmt.Sprintf(
+			"If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+
+				"However, the ServiceAccount %s is not found. Please create one. "+
+				"See the PR description of https://github.com/ray-project/kuberay/pull/1128 for more details.", namespacedName.Name), "ServiceAccount", namespacedName)
+		return err
+	}
+
 	// Create service account for autoscaler if there's no existing one in the cluster.
 	serviceAccount, err := common.BuildServiceAccount(instance)
 	if err != nil {
@@ -1034,6 +1046,7 @@ func (r *RayClusterReconciler) reconcileAutoscalerServiceAccount(instance *rayio
 	// making sure the name is valid
 	serviceAccount.Name = utils.CheckName(serviceAccount.Name)
 
+	// Set the controller reference so the ServiceAccount is garbage-collected with the RayCluster.
 	if err := controllerutil.SetControllerReference(instance, serviceAccount, r.Scheme); err != nil {
 		return err
 	}
diff --git a/ray-operator/controllers/ray/raycluster_controller_fake_test.go b/ray-operator/controllers/ray/raycluster_controller_fake_test.go
index f108e28ca22..cb757f4a677 100644
--- a/ray-operator/controllers/ray/raycluster_controller_fake_test.go
+++ b/ray-operator/controllers/ray/raycluster_controller_fake_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 
 	"github.com/ray-project/kuberay/ray-operator/controllers/ray/common"
+	"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
"github.com/onsi/ginkgo" rayiov1alpha1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" @@ -264,7 +265,6 @@ func setupTest(t *testing.T) { }, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - ServiceAccountName: headGroupServiceAccount, Containers: []corev1.Container{ { Name: "ray-head", @@ -878,7 +878,7 @@ func TestReconcile_AutoscalerServiceAccount(t *testing.T) { fakeClient := clientFake.NewClientBuilder().WithRuntimeObjects(testPods...).Build() saNamespacedName := types.NamespacedName{ - Name: headGroupServiceAccount, + Name: utils.GetHeadGroupServiceAccountName(testRayCluster), Namespace: namespaceStr, } sa := corev1.ServiceAccount{} @@ -901,6 +901,57 @@ func TestReconcile_AutoscalerServiceAccount(t *testing.T) { assert.Nil(t, err, "Fail to get head group ServiceAccount after reconciliation") } +func TestReconcile_Autoscaler_ServiceAccountName(t *testing.T) { + setupTest(t) + defer tearDown(t) + + // Specify a ServiceAccountName for the head Pod + myServiceAccount := corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-sa", + Namespace: namespaceStr, + }, + } + cluster := testRayCluster.DeepCopy() + cluster.Spec.HeadGroupSpec.Template.Spec.ServiceAccountName = myServiceAccount.Name + + // Case 1: There is no ServiceAccount "my-sa" in the Kubernetes cluster + runtimeObjects := []runtime.Object{} + fakeClient := clientFake.NewClientBuilder().WithRuntimeObjects(runtimeObjects...).Build() + + // Initialize the reconciler + testRayClusterReconciler := &RayClusterReconciler{ + Client: fakeClient, + Recorder: &record.FakeRecorder{}, + Scheme: scheme.Scheme, + Log: ctrl.Log.WithName("controllers").WithName("RayCluster"), + } + + // If users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. + // However, if KubeRay creates a ServiceAccount for users, the autoscaler may encounter permission issues during + // zero-downtime rolling updates when RayService is performed. See https://github.com/ray-project/kuberay/pull/1128 + // for more details. + err := testRayClusterReconciler.reconcileAutoscalerServiceAccount(cluster) + assert.NotNil(t, err, + "When users specify ServiceAccountName for the head Pod, they need to create a ServiceAccount themselves. "+ + "If the ServiceAccount does not exist, the reconciler should return an error. 
However, err is nil.") + + // Case 2: There is a ServiceAccount "my-sa" in the Kubernetes cluster + runtimeObjects = []runtime.Object{&myServiceAccount} + fakeClient = clientFake.NewClientBuilder().WithRuntimeObjects(runtimeObjects...).Build() + + // Initialize the reconciler + testRayClusterReconciler = &RayClusterReconciler{ + Client: fakeClient, + Recorder: &record.FakeRecorder{}, + Scheme: scheme.Scheme, + Log: ctrl.Log.WithName("controllers").WithName("RayCluster"), + } + + err = testRayClusterReconciler.reconcileAutoscalerServiceAccount(cluster) + assert.Nil(t, err) +} + func TestReconcile_AutoscalerRoleBinding(t *testing.T) { setupTest(t) defer tearDown(t) diff --git a/ray-operator/controllers/ray/raycluster_controller_test.go b/ray-operator/controllers/ray/raycluster_controller_test.go index dd17bfc4e99..63b8d4126b9 100644 --- a/ray-operator/controllers/ray/raycluster_controller_test.go +++ b/ray-operator/controllers/ray/raycluster_controller_test.go @@ -62,7 +62,6 @@ var _ = Context("Inside the default namespace", func() { }, Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ - ServiceAccountName: "head-service-account", Containers: []corev1.Container{ { Name: "ray-head", @@ -168,14 +167,6 @@ var _ = Context("Inside the default namespace", func() { Expect(pod.Status.Phase).Should(Or(Equal(corev1.PodPending), Equal(corev1.PodRunning))) }) - It("should create the head group's specified K8s ServiceAccount if it doesn't exist", func() { - saName := utils.GetHeadGroupServiceAccountName(myRayCluster) - sa := &corev1.ServiceAccount{} - Eventually( - getResourceFunc(ctx, client.ObjectKey{Name: saName, Namespace: "default"}, sa), - time.Second*15, time.Millisecond*500).Should(BeNil(), "My head group ServiceAccount = %v", saName) - }) - It("should create the autoscaler K8s RoleBinding if it doesn't exist", func() { rbName := myRayCluster.Name rb := &rbacv1.RoleBinding{} diff --git a/ray-operator/controllers/ray/rayservice_controller_test.go b/ray-operator/controllers/ray/rayservice_controller_test.go index 07b1bb4e850..2463601e9f4 100644 --- a/ray-operator/controllers/ray/rayservice_controller_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_test.go @@ -441,7 +441,7 @@ var _ = Context("Inside the default namespace", func() { // Hence, all the ServeStatuses[i].Status should be updated to UNHEALTHY. // // Note: LastUpdateTime/HealthLastUpdateTime will be overwritten via metav1.Now() in rayservice_controller.go. - // Hence, we cannot use `newTime`` to check whether the status is updated or not. + // Hence, we cannot use `newTime` to check whether the status is updated or not. Eventually( checkAllServeStatusesUnhealthy(ctx, myRayService), time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status)
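
With this change, a user who sets `serviceAccountName` in the headGroupSpec must create the ServiceAccount themselves; until it exists, reconcileAutoscalerServiceAccount returns an error on every reconcile (Case 1 in the new test above). A minimal sketch of such a manifest, assuming the `my-sa` name from the commented-out sample and the default namespace:

apiVersion: v1
kind: ServiceAccount
metadata:
  # Must match headGroupSpec.template.spec.serviceAccountName.
  name: my-sa
  # Must match the namespace of the RayCluster (or the RayService that owns it).
  namespace: default

Applying it with `kubectl apply -f my-sa.yaml` (or `kubectl create serviceaccount my-sa`) before creating the cluster avoids the reconciliation error.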