diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index b78c91e9b1..664710283e 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -397,14 +397,14 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com expected := *(spec.Replicas) - succeeded running := status.Active failed := status.Failed + xgboostJobRunningMessage := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name) logrus.Infof("XGBoostJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d", xgboostJob.Name, rtype, expected, running, succeeded, failed) if rtype == commonv1.ReplicaType(kubeflowv1.XGBoostJobReplicaTypeMaster) { if running > 0 { - msg := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, xgboostJobRunningMessage) if err != nil { logger.Infof("Append job condition error: %v", err) return err @@ -412,6 +412,13 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com } // when master is succeed, the job is finished. if expected == 0 { + if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil { + err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, xgboostJobRunningMessage) + if err != nil { + logger.Infof("Append job condition error: %v", err) + return err + } + } msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name) logrus.Info(msg) r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, xgboostJobSucceededReason, msg) @@ -429,6 +436,13 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com } } if failed > 0 { + if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil { + err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, xgboostJobRunningMessage) + if err != nil { + logger.Infof("Append job condition error: %v", err) + return err + } + } if spec.RestartPolicy == commonv1.RestartPolicyExitCode { msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype) r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, xgboostJobRestartingReason, msg) @@ -520,3 +534,12 @@ func (r *XGBoostJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool return true } } + +func findStatusCondition(conditions []commonv1.JobCondition, conditionType commonv1.JobConditionType) *commonv1.JobCondition { + for i := range conditions { + if conditions[i].Type == conditionType { + return &conditions[i] + } + } + return nil +}