Skip to content

Commit

Permalink
Merge pull request #304 from galaxyproject/dev
Browse files Browse the repository at this point in the history
Preparing for v2.9.0
  • Loading branch information
ksuderman authored Jul 13, 2024
2 parents 0c9f5eb + 24b7b5b commit 2b1de88
Show file tree
Hide file tree
Showing 31 changed files with 908 additions and 499 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2021 Galaxy Project
Copyright (c) 2024 Galaxy Project

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
11 changes: 11 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
.PHONY: dist
help:
@echo
@echo "GOALS"
@echo " clean - deletes the dist directory and egg-info"
@echo " dist - creates the distribution package (wheel)"
@echo " format - runs Black and isort"
@echo " test-deploy - deploys to test.pypi.org"
@echo " deploy - deploys to pypi.org"
@echo " release - creates a GitHub release package"
@echo

dist:
python3 setup.py sdist bdist_wheel

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ The `kubectl` program is only required when bootstrapping a new Galaxy instance,

### Credentials

You will need an [API key](https://training.galaxyproject.org/training-material/faqs/galaxy/preferences_admin_api_key.html) for every Galaxy instance you would like to interact with. You will also need the *kubeconfig* file for each Kubernetes cluster. The `abm` script loads the Galaxy server URLs, API keys, and the location of the *kubeconfig* files from a Yaml configuration file that it expects to find in `$HOME/.abm/profile.yml` or `.abm-profile.yml` in the current directory. You can use the `profile-sample.yml` file as a starting point and it includes the URLs for all Galaxy instances we have used to date (December 22, 2021 as of this writing).
You will need an [API key](https://training.galaxyproject.org/training-material/faqs/galaxy/preferences_admin_api_key.html) for every Galaxy instance you would like to interact with. You will also need the *kubeconfig* file for each Kubernetes cluster. The `abm` script loads the Galaxy server URLs, API keys, and the location of the *kubeconfig* files from a Yaml configuration file that it expects to find in `$HOME/.abm/profile.yml` or `.abm-profile.yml` in the current directory. You can use the `samples/profile.yml` file as a starting point and it includes the URLs for all Galaxy instances we have used to date (December 22, 2021 as of this writing).

:bulb: It is now possible (>=2.0.0) to create Galaxy users and their API keys directly with `abm`.

Expand Down
4 changes: 2 additions & 2 deletions abm/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
The Automated Benchmarking Tool
Copyright 2023 The Galaxy Project. All rights reserved.
Copyright 2024 The Galaxy Project. All rights reserved.
"""

Expand Down Expand Up @@ -64,7 +64,7 @@ def command_list(commands: list):


def copyright():
print(f" Copyright 2023 The Galaxy Project. All Rights Reserved.\n")
print(f" Copyright 2024 The Galaxy Project. All Rights Reserved.\n")


def print_main_help(menu_data):
Expand Down
23 changes: 21 additions & 2 deletions abm/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@

sys.path.append(os.path.dirname(os.path.realpath(__file__)))

# from common import parse_profile

# Where the workflow invocation data returned by Galaxy will be saved.
INVOCATIONS_DIR = "invocations"
# Where workflow runtime metrics will be saved.
METRICS_DIR = "metrics"

# Global instance of a YAML parser so we can reuse it if needed.
parser = None


# Keys used in various dictionaries.
class Keys:
NAME = 'name'
RUNS = 'runs'
Expand All @@ -22,3 +24,20 @@ class Keys:
COLLECTION = 'collection'
HISTORY_BASE_NAME = 'output_history_base_name'
HISTORY_NAME = 'history_name'


# def get_master_api_key():
# '''
# Get the master API key from the environment or configuration file.
# '''
# if 'GALAXY_MASTER_API_KEY' in os.environ:
# return os.environ['GALAXY_MASTER_API_KEY']
# config_path = os.path.expanduser("~/.abm/config.yml")
# if not os.path.exists(config_path):
# raise RuntimeError(f"ERROR: Configuration file not found: {config_path}")
# with open(config_path, 'r') as f:
# config = yaml.safe_load(f)
# key = config.get('GALAXY_MASTER_API_KEY', None)
# if key == None:
# raise RuntimeError("ERROR: GALAXY_MASTER_API_KEY not found in config.yml")
# return key
129 changes: 101 additions & 28 deletions abm/lib/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,18 @@
from bioblend.galaxy import GalaxyInstance, dataset_collections
from lib import INVOCATIONS_DIR, METRICS_DIR, Keys
from lib.common import (Context, _get_dataset_data, _make_dataset_element,
connect, print_json)
connect, print_json, try_for)
from lib.history import wait_for

log = logging.getLogger('abm')


def run_cli(context: Context, args: list):
"""
Runs a single workflow defined by *args[0]*
Command line handler to run a single benchmark.
:param args: a list that contains:
args[0] - the path to the benchmark configuration file
args[1] - the prefix to use when creating the new history in Galaxy
args[2] - the name of the experiment, if part of one. This is used to
generate output folder names.
    :param context: a context object that defines how to connect to the Galaxy server.
:param args: parameters from the command line
    :return: True if the workflows completed successfully. False otherwise.
"""
Expand All @@ -43,11 +40,15 @@ def run_cli(context: Context, args: list):


def run(context: Context, workflow_path, history_prefix: str, experiment: str):
# if len(args) > 1:
# history_prefix = args[1]
# if len(args) > 2:
# experiment = args[2].replace(' ', '_').lower()
"""
Does the actual work of running a benchmark.
    :param context: a context object that defines how to connect to the Galaxy server.
:param workflow_path: path to the ABM workflow file. (benchmark really). NOTE this is NOT the Galaxy .ga file.
:param history_prefix: a prefix value used when generating new history names.
:param experiment: the name of the experiment (arbitrary string). Used to generate new history names.
:return: True if the workflow run completed successfully. False otherwise.
"""
if os.path.exists(INVOCATIONS_DIR):
if not os.path.isdir(INVOCATIONS_DIR):
print('ERROR: Can not save invocation status, directory name in use.')
Expand Down Expand Up @@ -76,7 +77,7 @@ def run(context: Context, workflow_path, history_prefix: str, experiment: str):
workflows = parse_workflow(workflow_path)
if not workflows:
print(f"Unable to load any workflow definitions from {workflow_path}")
return
return False

print(f"Found {len(workflows)} workflow definitions")
for workflow in workflows:
Expand Down Expand Up @@ -144,11 +145,13 @@ def run(context: Context, workflow_path, history_prefix: str, experiment: str):
dsid = find_collection_id(gi, dsname)
dsdata = _get_dataset_data(gi, dsid)
if dsdata is None:
raise Exception(
f"ERROR: unable to resolve {dsname} to a dataset."
)
dsid = dsdata['id']
dssize = dsdata['size']
# raise Exception(
# f"ERROR: unable to resolve {dsname} to a dataset."
# )
dssize = 0
else:
dsid = dsdata['id']
dssize = dsdata['size']
input_data_size.append(dssize)
print(f"Input collection ID: {dsname} [{dsid}] {dssize}")
inputs[input[0]] = {'id': dsid, 'src': 'hdca', 'size': dssize}
Expand All @@ -173,7 +176,7 @@ def run(context: Context, workflow_path, history_prefix: str, experiment: str):
histories = gi.histories.get_histories(name=spec['history'])
if len(histories) == 0:
print(f"ERROR: History {spec['history']} not foune")
return
return False
hid = histories[0]['id']
pairs = 0
paired_list = spec['paired']
Expand All @@ -183,7 +186,13 @@ def run(context: Context, workflow_path, history_prefix: str, experiment: str):
for key in item.keys():
# print(f"Getting dataset for {key} = {item[key]}")
value = _get_dataset_data(gi, item[key])
size += value['size']
if value is None:
print(
f"ERROR: Unable to find dataset {item[key]}"
)
return
if size in value:
size += value['size']
elements.append(
_make_dataset_element(key, value['id'])
)
Expand Down Expand Up @@ -224,16 +233,20 @@ def run(context: Context, workflow_path, history_prefix: str, experiment: str):
else:
raise Exception(f'Invalid input value')
print(f"Running workflow {wfid} in history {new_history_name}")
invocation = gi.workflows.invoke_workflow(
f = lambda: gi.workflows.invoke_workflow(
wfid, inputs=inputs, history_name=new_history_name
)
invocation = try_for(f, 3)
id = invocation['id']
# invocations = gi.invocations.wait_for_invocation(id, 86400, 10, False)
f = lambda: gi.invocations.wait_for_invocation(id, 86400, 10, False)
try:
invocations = gi.invocations.wait_for_invocation(id, 86400, 10, False)
except:
invocations = try_for(f, 2)
except Exception as e:
print(f"Exception waiting for invocations")
pprint(invocation)
sys.exc_info()
raise e
print("Waiting for jobs")
if history_prefix is not None:
parts = history_prefix.split()
Expand Down Expand Up @@ -265,6 +278,14 @@ def run(context: Context, workflow_path, history_prefix: str, experiment: str):


def translate(context: Context, args: list):
"""
    Translates the human readable names of datasets and workflows into the Galaxy
ID that is unique to each server.
    :param context: the context object used to connect to the Galaxy server
:param args: [0] the path to the benchmarking YAML file to translate
:return: Nothing. Prints the translated workflow file to stdout.
"""
if len(args) == 0:
print('ERROR: no workflow configuration specified')
return
Expand Down Expand Up @@ -307,6 +328,14 @@ def translate(context: Context, args: list):


def validate(context: Context, args: list):
"""
Checks to see if the workflow and all datasets defined in the benchmark can
be found on the server.
:param context: the context object used to connect to the Galaxy instance
:param args: [0] the benchmark YAML file to be validated.
:return:
"""
if len(args) == 0:
print('ERROR: no workflow configuration specified')
return
Expand Down Expand Up @@ -412,10 +441,10 @@ def validate(context: Context, args: list):


def wait_for_jobs(context, gi: GalaxyInstance, invocations: dict):
"""Blocks until all jobs defined in the *invocations* to complete.
"""Blocks until all jobs defined in *invocations* are complete (in a terminal state).
:param gi: The *GalaxyInstance** running the jobs
:param invocations:
:param invocations: a dictionary containing information about the jobs invoked
:return:
"""
wfid = invocations['workflow_id']
Expand All @@ -429,6 +458,7 @@ def wait_for_jobs(context, gi: GalaxyInstance, invocations: dict):
jobs = gi.jobs.get_jobs(history_id=hid)
for job in jobs:
data = gi.jobs.show_job(job['id'], full_details=True)
data['job_metrics'] = gi.jobs.get_job_metrics(job['id'])
metrics = {
'run': run,
'cloud': cloud,
Expand Down Expand Up @@ -485,6 +515,11 @@ def wait_for_jobs(context, gi: GalaxyInstance, invocations: dict):


def parse_workflow(workflow_path: str):
"""
Loads the benchmark YAML file.
:param workflow_path: the path to the file to be loaded.
:return: a dictionary containing the benchmark.
"""
if not os.path.exists(workflow_path):
print(f'ERROR: could not find workflow file {workflow_path}')
return None
Expand All @@ -503,6 +538,14 @@ def parse_workflow(workflow_path: str):


def find_workflow_id(gi, name_or_id):
"""
Resolves the human-readable name for a workflow into the unique ID on the
Galaxy instance.
:param gi: the connection object to the Galaxy instance
:param name_or_id: the name of the workflow
:return: The Galaxy workflow ID or None if the workflow could not be located
"""
try:
wf = gi.workflows.show_workflow(name_or_id)
return wf['id']
Expand All @@ -519,7 +562,14 @@ def find_workflow_id(gi, name_or_id):


def find_dataset_id(gi, name_or_id):
# print(f"Finding dataset {name_or_id}")
"""
    Resolves the human-readable name of a dataset into the unique ID on the
Galaxy instance
:param gi: the connection object to the Galaxy instance
:param name_or_id: the name of the dataset.
:return: the Galaxy dataset ID or None if the dataset could not be located.
"""
try:
ds = gi.datasets.show_dataset(name_or_id)
return ds['id']
Expand All @@ -544,6 +594,14 @@ def find_dataset_id(gi, name_or_id):


def find_collection_id(gi, name):
"""
Resolves a human-readable collection name into the unique Galaxy ID.
:param gi: the connection object to the Galaxy instance
:param name: the name of the collection to resolve
:return: The unique Galaxy ID of the collection or None if the collection
can not be located.
"""
kwargs = {'limit': 10000, 'offset': 0}
datasets = gi.datasets.get_datasets(**kwargs)
if len(datasets) == 0:
Expand All @@ -565,7 +623,22 @@ def find_collection_id(gi, name):


def test(context: Context, args: list):
id = 'c90fffcf98b31cd3'
"""
Allows running testing code from the command line.
:param context: a connection object to a Galaxy instance
:param args: varies
:return: varies, typically None.
"""
# id = 'c90fffcf98b31cd3'
# gi = connect(context)
# inputs = gi.workflows.get_workflow_inputs(id, 'PE fastq input')
# pprint(inputs)

gi = connect(context)
inputs = gi.workflows.get_workflow_inputs(id, 'PE fastq input')
pprint(inputs)
print("Calling find_collection_id")
dsid = find_collection_id(gi, args[0])
print(f"Collection ID: {dsid}")
print("Calling _get_dataset_data")
dsdata = _get_dataset_data(gi, dsid)
pprint(dsdata)
4 changes: 3 additions & 1 deletion abm/lib/cloudlaunch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from cloudlaunch_cli.main import create_api_client
from common import Context

# DEPRECATED - Cloudlaunch is no longer used to manage Galaxy clusters.

BOLD = '\033[1m'
CLEAR = '\033[0m'

Expand Down Expand Up @@ -40,7 +42,7 @@ def h1(text):
'''


def list(context: Context, args: list):
def do_list(context: Context, args: list):
archived = False
filter = None
status = lambda t: t.instance_status if t.instance_status else t.status
Expand Down
Loading

0 comments on commit 2b1de88

Please sign in to comment.