[Bug][RayJob] Fix FailedToGetJobStatus by allowing transition to Running #1583

Merged
merged 6 commits into from
Oct 31, 2023
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions ray-operator/controllers/ray/rayjob_controller.go
@@ -248,17 +248,23 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
// Check the current status of ray jobs
jobInfo, err := rayDashboardClient.GetJobInfo(ctx, rayJobInstance.Status.JobId)
if err != nil {
r.Log.Error(err, "failed to get job info", "jobId", rayJobInstance.Status.JobId)
err = r.updateState(ctx, rayJobInstance, jobInfo, rayJobInstance.Status.JobStatus, rayv1.JobDeploymentStatusFailedToGetJobStatus, err)
// Dashboard service in head pod takes time to start, it's possible we get connection refused error.
// Requeue after few seconds to avoid continuous connection errors.
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}

// Update RayJob.Status (Kubernetes CR) from Ray Job Status from Dashboard service
if jobInfo != nil && jobInfo.JobStatus != rayJobInstance.Status.JobStatus {
Contributor (author) commented:

In the bug reproduction (killing the head pod), the previous and current JobStatus were both RUNNING, so we never entered this branch.
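For illustration, a minimal, self-contained sketch of the old versus new guard; the types, constants, and helper functions below are simplified stand-ins for the rayv1 identifiers, not the controller's actual code:

```go
package main

import "fmt"

// Simplified stand-ins for the rayv1 status enums used in the controller.
type JobStatus string
type JobDeploymentStatus string

const (
	JobStatusRunning                        JobStatus           = "RUNNING"
	JobDeploymentStatusRunning              JobDeploymentStatus = "Running"
	JobDeploymentStatusFailedToGetJobStatus JobDeploymentStatus = "FailedToGetJobStatus"
)

// Old guard: only a change in JobStatus triggered a status update.
func oldShouldUpdate(prev, cur JobStatus) bool {
	return cur != prev
}

// New guard: also update when the last reconcile failed to reach the dashboard,
// so JobDeploymentStatus can transition back to Running.
func newShouldUpdate(prev, cur JobStatus, deploy JobDeploymentStatus) bool {
	return cur != prev || deploy == JobDeploymentStatusFailedToGetJobStatus
}

func main() {
	// Head pod killed and restarted: the job was RUNNING before and after,
	// but the deployment status is stuck at FailedToGetJobStatus.
	fmt.Println(oldShouldUpdate(JobStatusRunning, JobStatusRunning))                                           // false: stuck
	fmt.Println(newShouldUpdate(JobStatusRunning, JobStatusRunning, JobDeploymentStatusFailedToGetJobStatus)) // true: recovers
}
```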

r.Log.Info(fmt.Sprintf("Update status from %s to %s", rayJobInstance.Status.JobStatus, jobInfo.JobStatus), "rayjob", rayJobInstance.Status.JobId)
err = r.updateState(ctx, rayJobInstance, jobInfo, jobInfo.JobStatus, rayv1.JobDeploymentStatusRunning, nil)
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
if jobInfo != nil {
jobStatusChanged := (rayJobInstance.Status.JobStatus != jobInfo.JobStatus)
// If the status changed, or if we didn't have the status before and now we have it, update the status and deployment status.
if jobStatusChanged || rayJobInstance.Status.JobDeploymentStatus == rayv1.JobDeploymentStatusFailedToGetJobStatus {
r.Log.Info(fmt.Sprintf("Update jobStatus from %s to %s", rayJobInstance.Status.JobStatus, jobInfo.JobStatus), "rayjob", rayJobInstance.Status.JobId)
r.Log.Info(fmt.Sprintf("Update jobDeploymentStatus from %s to %s", rayJobInstance.Status.JobDeploymentStatus, rayv1.JobDeploymentStatusRunning), "rayjob", rayJobInstance.Status.JobId)
err = r.updateState(ctx, rayJobInstance, jobInfo, jobInfo.JobStatus, rayv1.JobDeploymentStatusRunning, nil)
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err
}
}

if rayJobInstance.Status.JobDeploymentStatus == rayv1.JobDeploymentStatusRunning {
@@ -305,8 +311,10 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
}
}

deployStatus := rayJobInstance.Status.JobDeploymentStatus
isDeployStatusRunningOrFailedToGetStatus := deployStatus == rayv1.JobDeploymentStatusRunning || deployStatus == rayv1.JobDeploymentStatusFailedToGetJobStatus
// Let's use rayJobInstance.Status.JobStatus to make sure we only delete cluster after the CR is updated.
if isJobSucceedOrFailed(rayJobInstance.Status.JobStatus) && rayJobInstance.Status.JobDeploymentStatus == rayv1.JobDeploymentStatusRunning {
if isJobSucceedOrFailed(rayJobInstance.Status.JobStatus) && isDeployStatusRunningOrFailedToGetStatus {
if rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 {
// the RayJob is submitted against the RayCluster created by THIS job, so we can tear that
// RayCluster down.
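For context, a minimal, self-contained sketch of the relaxed teardown condition from this second hunk; the types, constants, and canTearDown helper below are simplified stand-ins (the controller uses rayv1 enums and isJobSucceedOrFailed), not the actual implementation:

```go
package main

import "fmt"

// Simplified stand-ins for the rayv1 status enums.
type JobStatus string
type JobDeploymentStatus string

const (
	Succeeded JobStatus = "SUCCEEDED"
	Failed    JobStatus = "FAILED"
	Running   JobStatus = "RUNNING"

	DeployRunning              JobDeploymentStatus = "Running"
	DeployFailedToGetJobStatus JobDeploymentStatus = "FailedToGetJobStatus"
)

// canTearDown mirrors the new condition: a finished job may trigger cluster
// cleanup whether the deployment status is Running or FailedToGetJobStatus.
func canTearDown(job JobStatus, deploy JobDeploymentStatus) bool {
	finished := job == Succeeded || job == Failed
	return finished && (deploy == DeployRunning || deploy == DeployFailedToGetJobStatus)
}

func main() {
	fmt.Println(canTearDown(Succeeded, DeployFailedToGetJobStatus)) // true: new behavior
	fmt.Println(canTearDown(Succeeded, DeployRunning))              // true: unchanged
	fmt.Println(canTearDown(Running, DeployFailedToGetJobStatus))   // false: job not finished
}
```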