From 71eaa8661c2a29fea4f534b16e920f4c80fa3ca4 Mon Sep 17 00:00:00 2001 From: kaihsun Date: Fri, 5 May 2023 07:53:56 +0000 Subject: [PATCH 1/8] update --- .../controllers/ray/rayservice_controller.go | 67 ++++++++++++++++++- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 8b845a92759..f63d45d629b 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -107,6 +107,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque if rayServiceInstance, err = r.getRayServiceInstance(ctx, request); err != nil { return ctrl.Result{}, client.IgnoreNotFound(err) } + originalRayServiceInstance := rayServiceInstance.DeepCopy() r.cleanUpServeConfigCache(rayServiceInstance) // TODO (kevin85421): ObservedGeneration should be used to determine whether to update this CR or not. @@ -124,6 +125,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // Check if we need to create pending RayCluster. if rayServiceInstance.Status.PendingServiceStatus.RayClusterName != "" && pendingRayClusterInstance == nil { // Update RayService Status since reconcileRayCluster may mark RayCluster restart. + r.Log.Info("r.Status().Update() Update RayService Status since reconcileRayCluster may mark RayCluster restart.") if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { logger.Error(errStatus, "Fail to update status of RayService after RayCluster changes", "rayServiceInstance", rayServiceInstance) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, nil @@ -212,14 +214,70 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque } // Final status update for any CR modification. - if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { - logger.Error(errStatus, "Fail to update status of RayService", "rayServiceInstance", rayServiceInstance) - return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + if r.inconsistentRayServiceStatuses(originalRayServiceInstance.Status, rayServiceInstance.Status) { + r.Log.Info("r.Status().Update() Final status update for any CR modification.") + if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { + logger.Error(errStatus, "Fail to update status of RayService", "rayServiceInstance", rayServiceInstance) + return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err + } } return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, nil } +func (r *RayServiceReconciler) inconsistentRayServiceStatus(oldStatus rayv1alpha1.RayServiceStatus, newStatus rayv1alpha1.RayServiceStatus) bool { + if oldStatus.RayClusterName != newStatus.RayClusterName { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService RayClusterName changed from %s to %s", oldStatus.RayClusterName, newStatus.RayClusterName)) + return true + } + + if oldStatus.DashboardStatus.IsHealthy != newStatus.DashboardStatus.IsHealthy { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService DashboardStatus changed from %v to %v", oldStatus.DashboardStatus, newStatus.DashboardStatus)) + return true + } + + if oldStatus.ApplicationStatus.Status != newStatus.ApplicationStatus.Status || + oldStatus.ApplicationStatus.Message != newStatus.ApplicationStatus.Message { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ApplicationStatus changed from %v to %v", oldStatus.ApplicationStatus, newStatus.ApplicationStatus)) + return true + } + + if len(oldStatus.ServeStatuses) != len(newStatus.ServeStatuses) { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService number of ServeStatus changed from %v to %v", len(oldStatus.ServeStatuses), len(newStatus.ServeStatuses))) + return true + } + + for i := 0; i < len(oldStatus.ServeStatuses); i++ { + if oldStatus.ServeStatuses[i].Name != newStatus.ServeStatuses[i].Name || + oldStatus.ServeStatuses[i].Status != newStatus.ServeStatuses[i].Status || + oldStatus.ServeStatuses[i].Message != newStatus.ServeStatuses[i].Message { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ServeDeploymentStatus changed from %v to %v", oldStatus.ServeStatuses[i], newStatus.ServeStatuses[i])) + return true + } + } + + return false +} + +func (r *RayServiceReconciler) inconsistentRayServiceStatuses(oldStatus rayv1alpha1.RayServiceStatuses, newStatus rayv1alpha1.RayServiceStatuses) bool { + if oldStatus.ServiceStatus != newStatus.ServiceStatus { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ServiceStatus changed from %s to %s", oldStatus.ServiceStatus, newStatus.ServiceStatus)) + return true + } + + if r.inconsistentRayServiceStatus(oldStatus.ActiveServiceStatus, newStatus.ActiveServiceStatus) { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ActiveServiceStatus changed")) + return true + } + + if r.inconsistentRayServiceStatus(oldStatus.PendingServiceStatus, newStatus.PendingServiceStatus) { + r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService PendingServiceStatus changed")) + return true + } + + return false +} + // SetupWithManager sets up the controller with the Manager. func (r *RayServiceReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). @@ -249,6 +307,7 @@ func (r *RayServiceReconciler) getRayServiceInstance(ctx context.Context, reques func (r *RayServiceReconciler) updateState(ctx context.Context, rayServiceInstance *rayv1alpha1.RayService, status rayv1alpha1.ServiceStatus, err error) error { rayServiceInstance.Status.ServiceStatus = status + r.Log.Info("r.Status().Update() updateState", "error", err) if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { return fmtErrors.Errorf("combined error: %v %v", err, errStatus) } @@ -882,6 +941,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns r.Recorder.Event(rayServiceInstance, "Normal", "Running", "The Serve applicaton is now running and healthy.") } else if isHealthy && !isReady { rayServiceInstance.Status.ServiceStatus = rayv1alpha1.WaitForServeDeploymentReady + r.Log.Info("r.Status().Update() isHealthy && !isReady") if err := r.Status().Update(ctx, rayServiceInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, true, false, err } @@ -890,6 +950,7 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns // NOTE: When isHealthy is false, isReady is guaranteed to be false. r.markRestart(rayServiceInstance) rayServiceInstance.Status.ServiceStatus = rayv1alpha1.Restarting + r.Log.Info("r.Status().Update() !isHealthy") if err := r.Status().Update(ctx, rayServiceInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, false, err } From 191e934df1b602ae622eedf85fa491ff9751bead Mon Sep 17 00:00:00 2001 From: kaihsun Date: Fri, 5 May 2023 16:07:42 +0000 Subject: [PATCH 2/8] update --- ray-operator/controllers/ray/rayservice_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index f63d45d629b..8758acba221 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -266,12 +266,12 @@ func (r *RayServiceReconciler) inconsistentRayServiceStatuses(oldStatus rayv1alp } if r.inconsistentRayServiceStatus(oldStatus.ActiveServiceStatus, newStatus.ActiveServiceStatus) { - r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ActiveServiceStatus changed")) + r.Log.Info("inconsistentRayServiceStatus RayService ActiveServiceStatus changed") return true } if r.inconsistentRayServiceStatus(oldStatus.PendingServiceStatus, newStatus.PendingServiceStatus) { - r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService PendingServiceStatus changed")) + r.Log.Info("inconsistentRayServiceStatus RayService PendingServiceStatus changed") return true } From f5d4a36b2a6736d8d52b7eca2da9e99ff718a6bb Mon Sep 17 00:00:00 2001 From: kaihsun Date: Sat, 6 May 2023 00:32:30 +0000 Subject: [PATCH 3/8] add test --- .../ray/rayservice_controller_test.go | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/ray-operator/controllers/ray/rayservice_controller_test.go b/ray-operator/controllers/ray/rayservice_controller_test.go index 4448ddc34cc..a06c2509c5a 100644 --- a/ray-operator/controllers/ray/rayservice_controller_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_test.go @@ -382,6 +382,95 @@ var _ = Context("Inside the default namespace", func() { time.Second*15, time.Millisecond*500).Should(Equal(initialPendingClusterName), "New active RayCluster name = %v", myRayService.Status.ActiveServiceStatus.RayClusterName) }) + It("Status should be updated if the differences are not only LastUpdateTime and HealthLastUpdateTime fields.", func() { + // Make sure (1) Dashboard client is healthy (2) All the three Ray Serve deployments in the active RayCluster are HEALTHY. + initialClusterName, _ := getRayClusterNameFunc(ctx, myRayService)() + Eventually( + checkServiceHealth(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) + + // ServiceUnhealthySecondThreshold is a global variable in rayservice_controller.go. + // If the time elapsed since the last update of the service HEALTHY status exceeds ServiceUnhealthySecondThreshold seconds, + // the RayService controller will consider the active RayCluster as unhealthy and prepare a new RayCluster. + orignalServeDeploymentUnhealthySecondThreshold := ServiceUnhealthySecondThreshold + ServiceUnhealthySecondThreshold = 500 + + // Only update the LastUpdateTime and HealthLastUpdateTime fields in the active RayCluster. + oldTime := myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime.DeepCopy() + newTime := oldTime.Add(time.Duration(5) * time.Minute) // 300 seconds + fmt.Printf("!!!!!!oldTime = %v, newTime = %v\n", oldTime, newTime) + fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "UNHEALTHY")) + + // Confirm not switch to a new RayCluster because ServiceUnhealthySecondThreshold is 500 seconds. + Consistently( + getRayClusterNameFunc(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(Equal(initialClusterName), "Active RayCluster name = %v", myRayService.Status.ActiveServiceStatus.RayClusterName) + + // Check if all the ServeStatuses[i].Status are UNHEALTHY. + checkAllServeStatusesUnhealthy := func(ctx context.Context, rayService *rayiov1alpha1.RayService) bool { + if err := k8sClient.Get(ctx, client.ObjectKey{Name: rayService.Name, Namespace: rayService.Namespace}, rayService); err != nil { + return false + } + for _, serveStatus := range rayService.Status.ActiveServiceStatus.ServeStatuses { + if serveStatus.Status != "UNHEALTHY" { + return false + } + } + return true + } + + // The status update not only includes the LastUpdateTime and HealthLastUpdateTime fields, but also the ServeStatuses[i].Status field. + // Hence, all the ServeStatuses[i].Status should be updated to UNHEALTHY. + // + // Note: LastUpdateTime/HealthLastUpdateTime will be overwritten via metav1.Now() in rayservice_controller.go. + // Hence, we cannot use `newTime`` to check whether the status is updated or not. + Eventually( + checkAllServeStatusesUnhealthy(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) + + fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "HEALTHY")) + + // Confirm not switch to a new RayCluster because ServiceUnhealthySecondThreshold is 500 seconds. + Consistently( + getRayClusterNameFunc(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(Equal(initialClusterName), "Active RayCluster name = %v", myRayService.Status.ActiveServiceStatus.RayClusterName) + + // The status update not only includes the LastUpdateTime and HealthLastUpdateTime fields, but also the ServeStatuses[i].Status field. + // Hence, the status should be updated. + Eventually( + checkServiceHealth(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) + ServiceUnhealthySecondThreshold = orignalServeDeploymentUnhealthySecondThreshold + }) + + It("Status should not be updated if the only differences are the LastUpdateTime and HealthLastUpdateTime fields.", func() { + // Make sure (1) Dashboard client is healthy (2) All the three Ray Serve deployments in the active RayCluster are HEALTHY. + initialClusterName, _ := getRayClusterNameFunc(ctx, myRayService)() + Eventually( + checkServiceHealth(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) + + // Only update the LastUpdateTime and HealthLastUpdateTime fields in the active RayCluster. + oldTime := myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime.DeepCopy() + newTime := oldTime.Add(time.Duration(5) * time.Minute) // 300 seconds + fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "HEALTHY")) + + // Confirm not switch to a new RayCluster + Consistently( + getRayClusterNameFunc(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(Equal(initialClusterName), "Active RayCluster name = %v", myRayService.Status.ActiveServiceStatus.RayClusterName) + + // The status is still the same as before. + Eventually( + checkServiceHealth(ctx, myRayService), + time.Second*3, time.Millisecond*500).Should(BeTrue(), "myRayService status = %v", myRayService.Status) + + // Status should not be updated if the only differences are the LastUpdateTime and HealthLastUpdateTime fields. + // Unlike the test "Status should be updated if the differences are not only LastUpdateTime and HealthLastUpdateTime fields.", + // the status update will not be triggered, so we can check whether the LastUpdateTime/HealthLastUpdateTime fields are updated or not by `oldTime`. + Expect(myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime).Should(Equal(oldTime), "myRayService status = %v", myRayService.Status) + }) + It("Update workerGroup.replicas in RayService and should not switch to new Ray Cluster", func() { // Certain field updates should not trigger new RayCluster preparation, such as updates // to `Replicas` and `WorkersToDelete` triggered by the autoscaler during scaling up/down. From c701c84604cff9857c0f70fa9f184984e49524f9 Mon Sep 17 00:00:00 2001 From: kaihsun Date: Sat, 6 May 2023 00:33:37 +0000 Subject: [PATCH 4/8] fix --- ray-operator/controllers/ray/rayservice_controller_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayservice_controller_test.go b/ray-operator/controllers/ray/rayservice_controller_test.go index a06c2509c5a..3acb1cd05f1 100644 --- a/ray-operator/controllers/ray/rayservice_controller_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_test.go @@ -398,7 +398,6 @@ var _ = Context("Inside the default namespace", func() { // Only update the LastUpdateTime and HealthLastUpdateTime fields in the active RayCluster. oldTime := myRayService.Status.ActiveServiceStatus.ServeStatuses[0].HealthLastUpdateTime.DeepCopy() newTime := oldTime.Add(time.Duration(5) * time.Minute) // 300 seconds - fmt.Printf("!!!!!!oldTime = %v, newTime = %v\n", oldTime, newTime) fakeRayDashboardClient.SetServeStatus(generateServeStatus(metav1.NewTime(newTime), "UNHEALTHY")) // Confirm not switch to a new RayCluster because ServiceUnhealthySecondThreshold is 500 seconds. From 1a0d67157d772cb18e881d89c6eccdcd41e2dc81 Mon Sep 17 00:00:00 2001 From: kaihsun Date: Sat, 6 May 2023 06:23:06 +0000 Subject: [PATCH 5/8] add unit tests --- .../ray/rayservice_controller_unit_test.go | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/ray-operator/controllers/ray/rayservice_controller_unit_test.go b/ray-operator/controllers/ray/rayservice_controller_unit_test.go index 7493a62882d..1b05da48540 100644 --- a/ray-operator/controllers/ray/rayservice_controller_unit_test.go +++ b/ray-operator/controllers/ray/rayservice_controller_unit_test.go @@ -6,7 +6,9 @@ import ( "github.com/ray-project/kuberay/ray-operator/apis/ray/v1alpha1" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/pointer" + ctrl "sigs.k8s.io/controller-runtime" ) func TestGenerateRayClusterJsonHash(t *testing.T) { @@ -60,3 +62,115 @@ func TestCompareRayClusterJsonHash(t *testing.T) { assert.Nil(t, err) assert.True(t, equal) } + +func TestInconsistentRayServiceStatuses(t *testing.T) { + r := &RayServiceReconciler{ + Log: ctrl.Log.WithName("controllers").WithName("RayService"), + } + + timeNow := metav1.Now() + oldStatus := v1alpha1.RayServiceStatuses{ + ActiveServiceStatus: v1alpha1.RayServiceStatus{ + RayClusterName: "new-cluster", + DashboardStatus: v1alpha1.DashboardStatus{ + IsHealthy: true, + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + ApplicationStatus: v1alpha1.AppStatus{ + Status: "running", + Message: "OK", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + ServeStatuses: []v1alpha1.ServeDeploymentStatus{ + { + Name: "serve-1", + Status: "unhealthy", + Message: "error", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + }, + }, + PendingServiceStatus: v1alpha1.RayServiceStatus{ + RayClusterName: "old-cluster", + DashboardStatus: v1alpha1.DashboardStatus{ + IsHealthy: true, + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + ApplicationStatus: v1alpha1.AppStatus{ + Status: "stopped", + Message: "stopped", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + ServeStatuses: []v1alpha1.ServeDeploymentStatus{ + { + Name: "serve-1", + Status: "healthy", + Message: "Serve is healthy", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + }, + }, + ServiceStatus: v1alpha1.WaitForDashboard, + } + + // Test 1: Update ServiceStatus only. + newStatus := oldStatus.DeepCopy() + newStatus.ServiceStatus = v1alpha1.WaitForServeDeploymentReady + assert.True(t, r.inconsistentRayServiceStatuses(oldStatus, *newStatus)) + + // Test 2: Test RayServiceStatus + newStatus = oldStatus.DeepCopy() + newStatus.ActiveServiceStatus.DashboardStatus.LastUpdateTime = &metav1.Time{Time: timeNow.Add(1)} + assert.False(t, r.inconsistentRayServiceStatuses(oldStatus, *newStatus)) + + newStatus.ActiveServiceStatus.DashboardStatus.IsHealthy = !oldStatus.ActiveServiceStatus.DashboardStatus.IsHealthy + assert.True(t, r.inconsistentRayServiceStatuses(oldStatus, *newStatus)) +} + +func TestInconsistentRayServiceStatus(t *testing.T) { + timeNow := metav1.Now() + oldStatus := v1alpha1.RayServiceStatus{ + RayClusterName: "cluster-1", + DashboardStatus: v1alpha1.DashboardStatus{ + IsHealthy: true, + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + ApplicationStatus: v1alpha1.AppStatus{ + Status: "running", + Message: "Application is running", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + ServeStatuses: []v1alpha1.ServeDeploymentStatus{ + { + Name: "serve-1", + Status: "healthy", + Message: "Serve is healthy", + LastUpdateTime: &timeNow, + HealthLastUpdateTime: &timeNow, + }, + }, + } + + r := &RayServiceReconciler{ + Log: ctrl.Log.WithName("controllers").WithName("RayService"), + } + + // Test 1: Only LastUpdateTime and HealthLastUpdateTime are updated. + newStatus := oldStatus.DeepCopy() + newStatus.DashboardStatus.LastUpdateTime = &metav1.Time{Time: timeNow.Add(1)} + assert.False(t, r.inconsistentRayServiceStatus(oldStatus, *newStatus)) + + // Test 2: Not only LastUpdateTime and HealthLastUpdateTime are updated. + newStatus = oldStatus.DeepCopy() + newStatus.DashboardStatus.LastUpdateTime = &metav1.Time{Time: timeNow.Add(1)} + newStatus.DashboardStatus.IsHealthy = !oldStatus.DashboardStatus.IsHealthy + assert.True(t, r.inconsistentRayServiceStatus(oldStatus, *newStatus)) +} From def53f559eec38411a471ab56cf1616a18b0b663 Mon Sep 17 00:00:00 2001 From: kaihsun Date: Mon, 8 May 2023 05:59:36 +0000 Subject: [PATCH 6/8] remove log --- ray-operator/controllers/ray/rayservice_controller.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index 8758acba221..b2a69aeed48 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -125,7 +125,6 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // Check if we need to create pending RayCluster. if rayServiceInstance.Status.PendingServiceStatus.RayClusterName != "" && pendingRayClusterInstance == nil { // Update RayService Status since reconcileRayCluster may mark RayCluster restart. - r.Log.Info("r.Status().Update() Update RayService Status since reconcileRayCluster may mark RayCluster restart.") if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { logger.Error(errStatus, "Fail to update status of RayService after RayCluster changes", "rayServiceInstance", rayServiceInstance) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, nil @@ -215,7 +214,6 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // Final status update for any CR modification. if r.inconsistentRayServiceStatuses(originalRayServiceInstance.Status, rayServiceInstance.Status) { - r.Log.Info("r.Status().Update() Final status update for any CR modification.") if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { logger.Error(errStatus, "Fail to update status of RayService", "rayServiceInstance", rayServiceInstance) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err @@ -259,6 +257,7 @@ func (r *RayServiceReconciler) inconsistentRayServiceStatus(oldStatus rayv1alpha return false } +// Determine whether to update the status of the RayService instance. func (r *RayServiceReconciler) inconsistentRayServiceStatuses(oldStatus rayv1alpha1.RayServiceStatuses, newStatus rayv1alpha1.RayServiceStatuses) bool { if oldStatus.ServiceStatus != newStatus.ServiceStatus { r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService ServiceStatus changed from %s to %s", oldStatus.ServiceStatus, newStatus.ServiceStatus)) @@ -307,7 +306,6 @@ func (r *RayServiceReconciler) getRayServiceInstance(ctx context.Context, reques func (r *RayServiceReconciler) updateState(ctx context.Context, rayServiceInstance *rayv1alpha1.RayService, status rayv1alpha1.ServiceStatus, err error) error { rayServiceInstance.Status.ServiceStatus = status - r.Log.Info("r.Status().Update() updateState", "error", err) if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { return fmtErrors.Errorf("combined error: %v %v", err, errStatus) } @@ -941,7 +939,6 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns r.Recorder.Event(rayServiceInstance, "Normal", "Running", "The Serve applicaton is now running and healthy.") } else if isHealthy && !isReady { rayServiceInstance.Status.ServiceStatus = rayv1alpha1.WaitForServeDeploymentReady - r.Log.Info("r.Status().Update() isHealthy && !isReady") if err := r.Status().Update(ctx, rayServiceInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, true, false, err } @@ -950,7 +947,6 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns // NOTE: When isHealthy is false, isReady is guaranteed to be false. r.markRestart(rayServiceInstance) rayServiceInstance.Status.ServiceStatus = rayv1alpha1.Restarting - r.Log.Info("r.Status().Update() !isHealthy") if err := r.Status().Update(ctx, rayServiceInstance); err != nil { return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, false, false, err } From 48c3e828b7e4fa8ddf38b6f42866d10390e5f4b4 Mon Sep 17 00:00:00 2001 From: Kai-Hsun Chen Date: Mon, 8 May 2023 11:09:22 -0700 Subject: [PATCH 7/8] Update ray-operator/controllers/ray/rayservice_controller.go Co-authored-by: shrekris-anyscale <92341594+shrekris-anyscale@users.noreply.github.com> Signed-off-by: Kai-Hsun Chen --- ray-operator/controllers/ray/rayservice_controller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index b2a69aeed48..b86f4fdf28c 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -215,7 +215,7 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // Final status update for any CR modification. if r.inconsistentRayServiceStatuses(originalRayServiceInstance.Status, rayServiceInstance.Status) { if errStatus := r.Status().Update(ctx, rayServiceInstance); errStatus != nil { - logger.Error(errStatus, "Fail to update status of RayService", "rayServiceInstance", rayServiceInstance) + logger.Error(errStatus, "Failed to update RayService status", "rayServiceInstance", rayServiceInstance) return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err } } From 36f10509798798bd8c42cb3feb4a0c2c06709d96 Mon Sep 17 00:00:00 2001 From: kaihsun Date: Mon, 8 May 2023 22:46:19 +0000 Subject: [PATCH 8/8] add logs --- ray-operator/controllers/ray/rayservice_controller.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ray-operator/controllers/ray/rayservice_controller.go b/ray-operator/controllers/ray/rayservice_controller.go index b86f4fdf28c..5045f5ba76f 100644 --- a/ray-operator/controllers/ray/rayservice_controller.go +++ b/ray-operator/controllers/ray/rayservice_controller.go @@ -223,6 +223,10 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, nil } +// Checks whether the old and new RayServiceStatus are inconsistent by comparing different fields. +// If the only differences between the old and new status are the LastUpdateTime and HealthLastUpdateTime fields, +// the status update will not be triggered. +// The RayClusterStatus field is only for observability in RayService CR, and changes to it will not trigger the status update. func (r *RayServiceReconciler) inconsistentRayServiceStatus(oldStatus rayv1alpha1.RayServiceStatus, newStatus rayv1alpha1.RayServiceStatus) bool { if oldStatus.RayClusterName != newStatus.RayClusterName { r.Log.Info(fmt.Sprintf("inconsistentRayServiceStatus RayService RayClusterName changed from %s to %s", oldStatus.RayClusterName, newStatus.RayClusterName))