tune readiness probe timeouts
wilsonwang371 committed Jul 26, 2022
1 parent 8dcc8a7 commit 577850d
Showing 4 changed files with 22 additions and 20 deletions.
8 changes: 4 additions & 4 deletions ray-operator/controllers/ray/common/constant.go
@@ -73,15 +73,15 @@ const (
LOCAL_HOST = "127.0.0.1"
// Ray HA default readiness probe values
DefaultReadinessProbeInitialDelaySeconds = 10
- DefaultReadinessProbeTimeoutSeconds = 0
- DefaultReadinessProbePeriodSeconds = 0
+ DefaultReadinessProbeTimeoutSeconds = 1
+ DefaultReadinessProbePeriodSeconds = 3
DefaultReadinessProbeSuccessThreshold = 0
DefaultReadinessProbeFailureThreshold = 20

// Ray HA default liveness probe values
DefaultLivenessProbeInitialDelaySeconds = 10
- DefaultLivenessProbeTimeoutSeconds = 0
- DefaultLivenessProbePeriodSeconds = 0
+ DefaultLivenessProbeTimeoutSeconds = 1
+ DefaultLivenessProbePeriodSeconds = 3
DefaultLivenessProbeSuccessThreshold = 0
DefaultLivenessProbeFailureThreshold = 40

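For context, the constants above are the operator-side probe defaults. A minimal sketch of how they could be turned into a corev1.Probe is shown below; the helper name, literal port, and wiring are illustrative only and are not part of this commit:

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/intstr"
)

// headReadinessProbe mirrors the tuned defaults with literal values
// (comments name the corresponding constants); the TCP port is a placeholder.
func headReadinessProbe() *corev1.Probe {
    return &corev1.Probe{
        InitialDelaySeconds: 10, // DefaultReadinessProbeInitialDelaySeconds
        TimeoutSeconds:      1,  // DefaultReadinessProbeTimeoutSeconds (was 0, which Kubernetes defaults to 1)
        PeriodSeconds:       3,  // DefaultReadinessProbePeriodSeconds (was 0, which Kubernetes defaults to 10)
        SuccessThreshold:    0,  // DefaultReadinessProbeSuccessThreshold; 0 is defaulted to 1 by the API server
        FailureThreshold:    20, // DefaultReadinessProbeFailureThreshold
        ProbeHandler: corev1.ProbeHandler{ // named Handler in k8s.io/api releases before v0.22
            TCPSocket: &corev1.TCPSocketAction{Port: intstr.FromInt(6379)},
        },
    }
}

func main() {
    fmt.Printf("%+v\n", headReadinessProbe())
}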
14 changes: 4 additions & 10 deletions ray-operator/controllers/ray/raycluster_controller.go
@@ -149,17 +149,11 @@ func (r *RayClusterReconciler) eventReconcile(request ctrl.Request, event *v1.Ev
return ctrl.Result{}, nil
}

- needUpdate := true
- if !utils.IsRunningAndReady(unhealthyPod) && unhealthyPod.Annotations != nil {
- log.Info("mark pod unhealthy and need for a rebuild", "pod name", unhealthyPod.Name)
- for k, v := range unhealthyPod.GetAnnotations() {
- if k == common.RayNodeHealthStateAnnotationKey && v == common.PodUnhealthy {
- needUpdate = false
- }
- }
- if needUpdate {
+ if !utils.IsRunningAndReady(unhealthyPod) {
+ if v, ok := unhealthyPod.Annotations[common.RayNodeHealthStateAnnotationKey]; !ok || v != common.PodUnhealthy {
updatedPod := unhealthyPod.DeepCopy()
updatedPod.Annotations[common.RayNodeHealthStateAnnotationKey] = common.PodUnhealthy
+ log.Info("mark pod unhealthy and need for a rebuild", "pod", unhealthyPod)
if err := r.Update(context.TODO(), updatedPod); err != nil {
return ctrl.Result{}, err
}
@@ -444,10 +438,10 @@ func (r *RayClusterReconciler) reconcilePods(instance *rayiov1alpha1.RayCluster)
continue
}
if v, ok := workerPod.Annotations[common.RayNodeHealthStateAnnotationKey]; ok && v == common.PodUnhealthy {
log.Info(fmt.Sprintf("deleting unhealthy worker pod %s", workerPod.Name))
if err := r.Delete(context.TODO(), &workerPod); err != nil {
return err
}
log.Info(fmt.Sprintf("need to delete unhealthy worker pod %s", workerPod.Name))
// we are deleting one worker pod now, let's reconcile again later
return nil
}
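The refactor in the first hunk works because reading from a nil map is safe in Go: the explicit Annotations != nil guard and the range loop collapse into a single comma-ok lookup. A standalone sketch of the idiom (the key and value strings here are placeholders, not the repo's constants):

package main

import "fmt"

func main() {
    // A pod with no annotations has a nil Annotations map.
    var annotations map[string]string

    // Indexing a nil map does not panic; it returns the zero value with ok == false,
    // so the absent-annotation, nil-map, and "different value" cases all fall into one condition.
    if v, ok := annotations["example.com/health-state"]; !ok || v != "Unhealthy" {
        fmt.Println("pod is not yet marked unhealthy; annotate it and update")
    }
}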
4 changes: 2 additions & 2 deletions tests/compatibility-test.py
@@ -45,7 +45,7 @@ def shell_assert_failure(cmd):
def create_cluster():
shell_assert_success(
'kind create cluster --config {}'.format(kindcluster_config_file))
- time.sleep(30)
+ time.sleep(60)
rtn = shell_run('kubectl wait --for=condition=ready pod -n kube-system --all --timeout=900s')
if rtn != 0:
shell_run('kubectl get pods -A')
@@ -246,7 +246,7 @@ def test_kill_head(self):
shell_run('kubectl describe pod $(kubectl get pods | grep -e "-head" | awk "{print \$1}")')
shell_run('kubectl logs $(kubectl get pods | grep -e "-head" | awk "{print \$1}")')
shell_run('kubectl logs -n $(kubectl get pods -A | grep -e "-operator" | awk \'{print $1 " " $2}\')')
- assert rtn == 0
+ assert rtn == 0

def test_ray_serve(self):
client = docker.from_env()
16 changes: 12 additions & 4 deletions tests/config/ray-cluster.ray-ha.yaml.template
@@ -108,10 +108,14 @@ spec:
name: client
livenessProbe:
initialDelaySeconds: 30
- failureThreshold: 200
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 400
readinessProbe:
initialDelaySeconds: 30
- failureThreshold: 120
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 300
workerGroupSpecs:
# the pod replicas in this group typed worker
- replicas: 2
@@ -170,10 +174,14 @@ spec:
cpu: "200m"
livenessProbe:
initialDelaySeconds: 30
- failureThreshold: 200
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 400
readinessProbe:
initialDelaySeconds: 30
- failureThreshold: 120
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 300
volumes:
- name: log-volume
emptyDir: {}
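For a rough sense of what the tuned template values mean (standard Kubernetes probe semantics, not spelled out in this commit): a container is acted on only after failureThreshold consecutive probe failures, and each attempt is capped at timeoutSeconds: 1, so the worst-case windows are approximately:

readiness: periodSeconds 3 × failureThreshold 300 ≈ 900 s (15 min) before the container is marked unready
liveness: periodSeconds 3 × failureThreshold 400 ≈ 1200 s (20 min) before the container is restarted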
