tune readiness probe timeouts
wilsonwang371 committed Jul 26, 2022
1 parent 8dcc8a7 commit 577850d
Showing 4 changed files with 22 additions and 20 deletions.
8 changes: 4 additions & 4 deletions ray-operator/controllers/ray/common/constant.go
@@ -73,15 +73,15 @@ const (
LOCAL_HOST = "127.0.0.1"
// Ray HA default readiness probe values
DefaultReadinessProbeInitialDelaySeconds = 10
- DefaultReadinessProbeTimeoutSeconds = 0
- DefaultReadinessProbePeriodSeconds = 0
+ DefaultReadinessProbeTimeoutSeconds = 1
+ DefaultReadinessProbePeriodSeconds = 3
DefaultReadinessProbeSuccessThreshold = 0
DefaultReadinessProbeFailureThreshold = 20

// Ray HA default liveness probe values
DefaultLivenessProbeInitialDelaySeconds = 10
- DefaultLivenessProbeTimeoutSeconds = 0
- DefaultLivenessProbePeriodSeconds = 0
+ DefaultLivenessProbeTimeoutSeconds = 1
+ DefaultLivenessProbePeriodSeconds = 3
DefaultLivenessProbeSuccessThreshold = 0
DefaultLivenessProbeFailureThreshold = 40

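For context, the constants above are the operator-side probe defaults. A minimal sketch of how they could be turned into a corev1.Probe is shown below; the helper name, literal port, and wiring are illustrative only and are not part of this commit:

package main

import (
    "fmt"

    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/intstr"
)

// headReadinessProbe mirrors the tuned defaults with literal values
// (comments name the corresponding constants); the TCP port is a placeholder.
func headReadinessProbe() *corev1.Probe {
    return &corev1.Probe{
        InitialDelaySeconds: 10, // DefaultReadinessProbeInitialDelaySeconds
        TimeoutSeconds:      1,  // DefaultReadinessProbeTimeoutSeconds (was 0, which Kubernetes defaults to 1)
        PeriodSeconds:       3,  // DefaultReadinessProbePeriodSeconds (was 0, which Kubernetes defaults to 10)
        SuccessThreshold:    0,  // DefaultReadinessProbeSuccessThreshold; 0 is defaulted to 1 by the API server
        FailureThreshold:    20, // DefaultReadinessProbeFailureThreshold
        ProbeHandler: corev1.ProbeHandler{ // named Handler in k8s.io/api releases before v0.22
            TCPSocket: &corev1.TCPSocketAction{Port: intstr.FromInt(6379)},
        },
    }
}

func main() {
    fmt.Printf("%+v\n", headReadinessProbe())
}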
14 changes: 4 additions & 10 deletions ray-operator/controllers/ray/raycluster_controller.go
@@ -149,17 +149,11 @@ func (r *RayClusterReconciler) eventReconcile(request ctrl.Request, event *v1.Ev
return ctrl.Result{}, nil
}

- needUpdate := true
- if !utils.IsRunningAndReady(unhealthyPod) && unhealthyPod.Annotations != nil {
- log.Info("mark pod unhealthy and need for a rebuild", "pod name", unhealthyPod.Name)
- for k, v := range unhealthyPod.GetAnnotations() {
- if k == common.RayNodeHealthStateAnnotationKey && v == common.PodUnhealthy {
- needUpdate = false
- }
- }
- if needUpdate {
+ if !utils.IsRunningAndReady(unhealthyPod) {
+ if v, ok := unhealthyPod.Annotations[common.RayNodeHealthStateAnnotationKey]; !ok || v != common.PodUnhealthy {
updatedPod := unhealthyPod.DeepCopy()
updatedPod.Annotations[common.RayNodeHealthStateAnnotationKey] = common.PodUnhealthy
+ log.Info("mark pod unhealthy and need for a rebuild", "pod", unhealthyPod)
if err := r.Update(context.TODO(), updatedPod); err != nil {
return ctrl.Result{}, err
}
@@ -444,10 +438,10 @@ func (r *RayClusterReconciler) reconcilePods(instance *rayiov1alpha1.RayCluster)
continue
}
if v, ok := workerPod.Annotations[common.RayNodeHealthStateAnnotationKey]; ok && v == common.PodUnhealthy {
log.Info(fmt.Sprintf("deleting unhealthy worker pod %s", workerPod.Name))
if err := r.Delete(context.TODO(), &workerPod); err != nil {
return err
}
log.Info(fmt.Sprintf("need to delete unhealthy worker pod %s", workerPod.Name))
// we are deleting one worker pod now, let's reconcile again later
return nil
}
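The refactor in the first hunk works because reading from a nil map is safe in Go: the explicit Annotations != nil guard and the range loop collapse into a single comma-ok lookup. A standalone sketch of the idiom (the key and value strings here are placeholders, not the repo's constants):

package main

import "fmt"

func main() {
    // A pod with no annotations has a nil Annotations map.
    var annotations map[string]string

    // Indexing a nil map does not panic; it returns the zero value with ok == false,
    // so the absent-annotation, nil-map, and "different value" cases all fall into one condition.
    if v, ok := annotations["example.com/health-state"]; !ok || v != "Unhealthy" {
        fmt.Println("pod is not yet marked unhealthy; annotate it and update")
    }
}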
4 changes: 2 additions & 2 deletions tests/compatibility-test.py
@@ -45,7 +45,7 @@ def shell_assert_failure(cmd):
def create_cluster():
shell_assert_success(
'kind create cluster --config {}'.format(kindcluster_config_file))
- time.sleep(30)
+ time.sleep(60)
rtn = shell_run('kubectl wait --for=condition=ready pod -n kube-system --all --timeout=900s')
if rtn != 0:
shell_run('kubectl get pods -A')
@@ -246,7 +246,7 @@ def test_kill_head(self):
shell_run('kubectl describe pod $(kubectl get pods | grep -e "-head" | awk "{print \$1}")')
shell_run('kubectl logs $(kubectl get pods | grep -e "-head" | awk "{print \$1}")')
shell_run('kubectl logs -n $(kubectl get pods -A | grep -e "-operator" | awk \'{print $1 " " $2}\')')
- assert rtn == 0
+ assert rtn == 0

def test_ray_serve(self):
client = docker.from_env()
16 changes: 12 additions & 4 deletions tests/config/ray-cluster.ray-ha.yaml.template
@@ -108,10 +108,14 @@ spec:
name: client
livenessProbe:
initialDelaySeconds: 30
- failureThreshold: 200
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 400
readinessProbe:
initialDelaySeconds: 30
- failureThreshold: 120
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 300
workerGroupSpecs:
# the pod replicas in this group typed worker
- replicas: 2
@@ -170,10 +174,14 @@ spec:
cpu: "200m"
livenessProbe:
initialDelaySeconds: 30
- failureThreshold: 200
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 400
readinessProbe:
initialDelaySeconds: 30
- failureThreshold: 120
+ periodSeconds: 3
+ timeoutSeconds: 1
+ failureThreshold: 300
volumes:
- name: log-volume
emptyDir: {}
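For a rough sense of what the tuned template values mean (standard Kubernetes probe semantics, not spelled out in this commit): a container is acted on only after failureThreshold consecutive probe failures, and each attempt is capped at timeoutSeconds: 1, so the worst-case windows are approximately:

readiness: periodSeconds 3 × failureThreshold 300 ≈ 900 s (15 min) before the container is marked unready
liveness: periodSeconds 3 × failureThreshold 400 ≈ 1200 s (20 min) before the container is restarted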
