diff --git a/dpdispatcher/submission.py b/dpdispatcher/submission.py index fc07b310..94449321 100644 --- a/dpdispatcher/submission.py +++ b/dpdispatcher/submission.py @@ -528,8 +528,8 @@ def handle_unexpected_job_state(self): self.fail_count += 1 dlog.info(f"job: {self.job_hash} {self.job_id} terminated;" f"fail_cout is {self.fail_count}; resubmitting job") - if self.fail_count > 3: - raise RuntimeError(f"job:{self.job_hash}failed 3 times.job_detail:{self}") + if ( self.fail_count ) > 0 and ( self.fail_count % 3 == 0 ) : + raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times.job_detail:{self}") self.submit_job() dlog.info("job:{job_hash} re-submit after terminated; new job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) self.get_job_state() @@ -540,9 +540,8 @@ def handle_unexpected_job_state(self): if job_state == JobStatus.unsubmitted: dlog.info(f"job: {self.job_hash} unsubmitted; submit it") - if self.fail_count > 3: - raise RuntimeError("job:job {job} failed 3 times".format(job=self)) - # self.fail_count += 1 + # if self.fail_count > 3: + # raise RuntimeError("job:job {job} failed 3 times".format(job=self)) self.submit_job() dlog.info("job: {job_hash} submit; job_id is {job_id}".format(job_hash=self.job_hash, job_id=self.job_id)) # self.get_job_state()