Skip to content

Commit

Permalink
Set a Running condition when the XGBoostJob is completed and doesn't …
Browse files Browse the repository at this point in the history
…have a Running condition

Signed-off-by: Yuki Iwai <[email protected]>
  • Loading branch information
tenzen-y committed Mar 28, 2023
1 parent b2ee1cb commit 659c736
Showing 1 changed file with 25 additions and 2 deletions.
27 changes: 25 additions & 2 deletions pkg/controller.v1/xgboost/xgboostjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,21 +397,28 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com
expected := *(spec.Replicas) - succeeded
running := status.Active
failed := status.Failed
xgboostJobRunningMessage := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name)

logrus.Infof("XGBoostJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d",
xgboostJob.Name, rtype, expected, running, succeeded, failed)

if rtype == commonv1.ReplicaType(kubeflowv1.XGBoostJobReplicaTypeMaster) {
if running > 0 {
msg := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name)
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg)
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, xgboostJobRunningMessage)
if err != nil {
logger.Infof("Append job condition error: %v", err)
return err
}
}
// when master is succeed, the job is finished.
if expected == 0 {
if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil {
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, xgboostJobRunningMessage)
if err != nil {
logger.Infof("Append job condition error: %v", err)
return err
}
}
msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name)
logrus.Info(msg)
r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, xgboostJobSucceededReason, msg)
Expand All @@ -429,6 +436,13 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com
}
}
if failed > 0 {
if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil {
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, xgboostJobRunningMessage)
if err != nil {
logger.Infof("Append job condition error: %v", err)
return err
}
}
if spec.RestartPolicy == commonv1.RestartPolicyExitCode {
msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype)
r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, xgboostJobRestartingReason, msg)
Expand Down Expand Up @@ -520,3 +534,12 @@ func (r *XGBoostJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool
return true
}
}

func findStatusCondition(conditions []commonv1.JobCondition, conditionType commonv1.JobConditionType) *commonv1.JobCondition {
for i := range conditions {
if conditions[i].Type == conditionType {
return &conditions[i]
}
}
return nil
}

0 comments on commit 659c736

Please sign in to comment.