Skip to content

Commit

Permalink
Merge pull request #267 from galaxyproject/261-job-restarts
Browse files Browse the repository at this point in the history
Job restart
  • Loading branch information
ksuderman authored Dec 15, 2023
2 parents b064013 + d0c526b commit 81bd807
Showing 1 changed file with 24 additions and 2 deletions.
26 changes: 24 additions & 2 deletions abm/lib/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
# History related functions
#

# The number of times a failed job will be restarted.
RESTART_MAX = 3

def longest_name(histories: list):
longest = 0
Expand Down Expand Up @@ -397,10 +399,21 @@ def wait(context: Context, args: list):
wait_for(gi, history_id)


def kill_all_jobs(gi: GalaxyInstance, job_list:list):
cancel_states = ['new', 'running', 'paused']
for job in job_list:
if job['state'] in cancel_states:
print(f"Cancelling job {job['tool_id']}")
gi.jobs.cancel_job(job['id'])
else:
print(f"Job {job['id']} for tool {job['tool_id']} is in state {job['state']}")


def wait_for(gi: GalaxyInstance, history_id: str):
errored = []
waiting = True
job_states = JobStates()
restart_counts = dict()
while waiting:
restart = []
status_counts = dict()
Expand All @@ -421,9 +434,18 @@ def wait_for(gi: GalaxyInstance, history_id: str):
elif state == 'error':
terminal += 1
if id not in errored:
restart.append(id)
tool = job['tool_id']
if tool in restart_counts:
restart_counts[tool] += 1
else:
restart_counts[tool] = 1
if restart_counts[tool] < RESTART_MAX:
restart.append(id)
else:
kill_all_jobs(gi, job_list)
waiting = False
errored.append(id)
if len(restart) > 0:
if len(restart) > 0 and waiting:
for job in restart:
print(f"Restaring job {job}")
try:
Expand Down

0 comments on commit 81bd807

Please sign in to comment.