galaxyproject · ksuderman · Aug 14, 2023 · Aug 14, 2023 · Aug 14, 2023 · Aug 14, 2023
diff --git a/abm/lib/benchmark.py b/abm/lib/benchmark.py
@@ -7,6 +7,7 @@
 from lib import Keys, INVOCATIONS_DIR, METRICS_DIR
 from lib.common import connect, Context, _get_dataset_data, _make_dataset_element, print_json
 from bioblend.galaxy import GalaxyInstance, dataset_collections
+from lib.history import wait_for
 
 log = logging.getLogger('abm')
 
@@ -369,41 +370,63 @@ def wait_for_jobs(context, gi: GalaxyInstance, invocations: dict):
     conf = invocations['job_conf']
     inputs = invocations['inputs']
     output_dir = invocations['output_dir']
-    for step in invocations['steps']:
-        job_id = step['job_id']
-        if job_id is not None:
-            retries = 3
-            done = False
-            while not done and retries >= 0:
-                print(f"Waiting for job {job_id} on {context.GALAXY_SERVER}")
-                try:
-                    # TDOD Should retry if anything throws an exception.
-                    status = gi.jobs.wait_for_job(job_id, 86400, 10, False)
-                    data = gi.jobs.show_job(job_id, full_details=True)
-                    metrics = {
-                        'run': run,
-                        'cloud': cloud,
-                        'job_conf': conf,
-                        'workflow_id': wfid,
-                        'history_id': hid,
-                        'inputs': inputs,
-                        'metrics': data,
-                        'status': status,
-                        'server': context.GALAXY_SERVER,
-                        'ref_data_size': invocations['ref_data_size'],
-                        'input_data_size': invocations['input_data_size']
-                    }
-                    output_path = os.path.join(output_dir, f"{job_id}.json")
-                    with open(output_path, "w") as f:
-                        json.dump(metrics, f, indent=4)
-                        print(f"Wrote metrics to {output_path}")
-                    done = True
-                except ConnectionError as e:
-                    print(f"ERROR: connection dropped while waiting for {job_id}")
-                    retries -= 1
-                except Exception as e:
-                    print(f"ERROR: {e}")
-                    retries -= 1
+    wait_for(gi, hid)
+    jobs = gi.jobs.get_jobs(history_id=hid)
+    for job in jobs:
+        data = gi.jobs.show_job(job['id'], full_details=True)
+        metrics = {
+            'run': run,
+            'cloud': cloud,
+            'job_conf': conf,
+            'workflow_id': wfid,
+            'history_id': hid,
+            'inputs': inputs,
+            'metrics': data,
+            'status': job['state'],
+            'server': context.GALAXY_SERVER,
+            'ref_data_size': invocations['ref_data_size'],
+            'input_data_size': invocations['input_data_size']
+        }
+        output_path = os.path.join(output_dir, f"{job['id']}.json")
+        with open(output_path, "w") as f:
+            json.dump(metrics, f, indent=4)
+            print(f"Wrote metrics to {output_path}")
+
+    # for step in invocations['steps']:
+    #     job_id = step['job_id']
+    #     if job_id is not None:
+    #         retries = 3
+    #         done = False
+    #         while not done and retries >= 0:
+    #             print(f"Waiting for job {job_id} on {context.GALAXY_SERVER}")
+    #             try:
+    #                 # TODO Should retry if anything throws an exception.
+    #                 status = gi.jobs.wait_for_job(job_id, 86400, 10, False)
+    #                 data = gi.jobs.show_job(job_id, full_details=True)
+    #                 metrics = {
+    #                     'run': run,
+    #                     'cloud': cloud,
+    #                     'job_conf': conf,
+    #                     'workflow_id': wfid,
+    #                     'history_id': hid,
+    #                     'inputs': inputs,
+    #                     'metrics': data,
+    #                     'status': status,
+    #                     'server': context.GALAXY_SERVER,
+    #                     'ref_data_size': invocations['ref_data_size'],
+    #                     'input_data_size': invocations['input_data_size']
+    #                 }
+    #                 output_path = os.path.join(output_dir, f"{job_id}.json")
+    #                 with open(output_path, "w") as f:
+    #                     json.dump(metrics, f, indent=4)
+    #                     print(f"Wrote metrics to {output_path}")
+    #                 done = True
+    #             except ConnectionError as e:
+    #                 print(f"ERROR: connection dropped while waiting for {job_id}")
+    #                 retries -= 1
+    #             except Exception as e:
+    #                 print(f"ERROR: {e}")
+    #                 retries -= 1
 
 
 def parse_workflow(workflow_path: str):

diff --git a/abm/lib/history.py b/abm/lib/history.py
@@ -1,8 +1,11 @@
 import json
 import os
 import sys
+import time
+
 import yaml
 
+from bioblend.galaxy.objects import GalaxyInstance
 from lib.common import connect, parse_profile, Context, summarize_metrics, find_history, print_json
 from pprint import pprint
 from pathlib import Path
@@ -38,7 +41,7 @@ def print_histories(histories: list):
         print(f"{history['id']:<{id_width}} {history['name']:<{name_width}} {pad(history['deleted'])} {pad(history['published'])} {', '.join(history['tags'])}")
 
 
-def list(context: Context, args: list):
+def _list(context: Context, args: list):  # renamed from `list`, which shadowed the builtin
     gi = connect(context)
     print_histories(gi.histories.get_histories())
 
@@ -360,3 +363,80 @@ def summarize(context: Context, args: list):
         #         all_jobs.append(job)
     # summarize_metrics(gi, gi.jobs.get_jobs(history_id=args[0]))
     summarize_metrics(gi, all_jobs)
+
+def wait(context: Context, args: list):
+    """Look up the history named by args[0] and hand it to wait_for()."""
+    if len(args) == 0:
+        print("ERROR: No history ID provided")
+        return
+
+    gi = connect(context)
+    history_id = find_history(gi, args[0])
+    if history_id is None:
+        print("ERROR: No such history")
+        return
+    wait_for(gi, history_id)
+
+
+def wait_for(gi: GalaxyInstance, history_id: str):
+    """Poll a history every 30 seconds until all its jobs are 'ok' or 'error'.
+
+    Each job seen in the 'error' state is rerun once, first with remap=True
+    and, if that raises, with remap=False. Polling stops early when a job
+    cannot be rerun at all.
+
+    :param gi: connection used for the gi.jobs calls
+    :param history_id: ID of the history whose jobs are polled
+    """
+    errored = set()  # IDs of failed jobs that were already resubmitted
+    job_states = JobStates()
+    waiting = True
+    while waiting:
+        restart = []
+        terminal = 0
+        job_list = gi.jobs.get_jobs(history_id=history_id)
+        for job in job_list:
+            job_states.update(job)  # prints new jobs and state changes
+            state = job['state']
+            job_id = job['id']
+            # NOTE(review): any other state counts as still running, so a job
+            # that ends in neither 'ok' nor 'error' keeps this loop polling.
+            if state in ('ok', 'error'):
+                terminal += 1
+            if state == 'error' and job_id not in errored:
+                errored.add(job_id)
+                restart.append(job_id)
+        # Resubmit newly failed jobs before deciding whether to stop.
+        for job_id in restart:
+            print(f"Restarting job {job_id}")
+            try:
+                gi.jobs.rerun_job(job_id, remap=True)
+            except Exception:
+                # Remapping can be rejected (see bioblend rerun_job); retry without it.
+                try:
+                    gi.jobs.rerun_job(job_id, remap=False)
+                except Exception as e:
+                    print(f"Failed to restart job {job_id}: {e}")
+                    waiting = False
+        if not restart and len(job_list) == terminal:
+            print("All jobs are in a terminal state")
+            waiting = False
+        if waiting:
+            time.sleep(30)
+
+
+class JobStates:
+    """Remember the last state seen for each job and print every change."""
+    def __init__(self):
+        self._jobs = {}  # job id -> state recorded by the previous update()
+
+    def update(self, job):
+        """Print `job` if it is new or its state changed, then record its state."""
+        job_id, current, name = job['id'], job['state'], job['tool_id']
+        if '/' in name:  # path-like tool id: show only its second-to-last segment
+            name = name.split('/')[-2]
+        if job_id not in self._jobs:
+            print(f"Job {job_id} {name} state {current}")
+        elif self._jobs[job_id] != current:
+            print(f"Job {job_id} {name} {self._jobs[job_id]} -> {current}")
+        self._jobs[job_id] = current
diff --git a/abm/lib/menu.yml b/abm/lib/menu.yml
@@ -125,7 +125,7 @@
   help: manage histories
   menu:
     - name: ['list', 'ls']
-      handler: history.list
+      handler: history._list
       help: list histories on the server.
       params: "[-a|--all]"
     - name: ['import', 'imp', 'im']
@@ -182,6 +182,10 @@
       params: STR
       help: delete all histories that contain STR in the name. Use * to purge all histories.
       handler: history.purge
+    - name: [ wait ]
+      handler: history.wait
+      help: Wait for all jobs in the history to enter a terminal state (ok or error)
+      params: ID
 - name: [ jobs, job ]
   help: manage jobs on the server
   menu: