From 9efffd4b7ad18fb35d8502d44c5d40f8bc7145f7 Mon Sep 17 00:00:00 2001 From: Wang Zhang Date: Fri, 29 Oct 2021 15:22:51 +0800 Subject: [PATCH] fix tfjob status when enableDynamicWorker set true (#1455) --- pkg/controller.v1/tensorflow/tfjob_controller.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index fa3fce0c6e..a83d3600a6 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -513,6 +513,11 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 // we know it because we update the status condition when reconciling the replicas trainingoperatorcommon.RestartedJobsCounterInc(tfJob.Namespace, tensorflowv1.FrameworkName) } else { + if tfJob.Spec.EnableDynamicWorker && rtype == tensorflowv1.TFReplicaTypeWorker { + commonutil.LoggerForJob(tfJob).Infof("TFJob %s/%s continues regardless %d Worker replica(s) failed as enableDynamicWorker is set true.", + tfJob.Namespace, tfJob.Name, failed) + continue + } msg := fmt.Sprintf("TFJob %s/%s has failed because %d %s replica(s) failed.", tfJob.Namespace, tfJob.Name, failed, rtype) r.recorder.Event(tfJob, corev1.EventTypeNormal, tfJobFailedReason, msg)