From 31a9bc2b2582c847d3b1971ab6582270694395e4 Mon Sep 17 00:00:00 2001
From: John Chilton
Date: Mon, 6 May 2024 13:53:57 -0400
Subject: [PATCH 01/40] Add test case for GCP S3 Interop Object Stores.

Works with service account and user account keys.

HMAC keys can be created under Settings -> Interoperability.
---
 test/unit/objectstore/test_objectstore.py | 38 +++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py
index 44564acbc2be..1229a01e0a74 100644
--- a/test/unit/objectstore/test_objectstore.py
+++ b/test/unit/objectstore/test_objectstore.py
@@ -1617,6 +1617,44 @@ def test_aws_via_cloudbridge_store_with_region(tmp_path):
         verify_caching_object_store_functionality(tmp_path, object_store)
 
 
+
+GOOGLE_S3_INTEROP_TEMPLATE_TEST_CONFIG_YAML = """
+type: generic_s3
+store_by: uuid
+auth:
+  access_key: ${access_key}
+  secret_key: ${secret_key}
+
+bucket:
+  name: ${bucket}
+
+connection:
+  host: storage.googleapis.com
+  port: 443
+
+extra_dirs:
+- type: job_work
+  path: database/job_working_directory_azure
+- type: temp
+  path: database/tmp_azure
+"""
+
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY")
+@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET")
+def test_gcp_via_s3_interop(tmp_path):
+    template_vars = {
+        "access_key": os.environ["GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY"],
+        "secret_key": os.environ["GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY"],
+        "bucket": os.environ["GALAXY_TEST_GOOGLE_BUCKET"],
+    }
+    with TestConfig(GOOGLE_S3_INTEROP_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (
+        _,
+        object_store,
+    ):
+        verify_caching_object_store_functionality(tmp_path, object_store)
+
+
 class MockDataset:
     def __init__(self, id):
         self.id = id

From 95913c626e56f6890436fcecc0c15694b843408a Mon Sep 17 00:00:00 2001
From: John Chilton
Date: Mon, 6 May 2024 16:59:06 -0400
Subject: [PATCH 02/40] De-duplication around object store permission restoration.
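
For orientation, a minimal usage sketch of the extracted helper, assuming Galaxy's lib/ is on PYTHONPATH; the SimpleNamespace config and the umask/gid values are illustrative stand-ins for an object store's real config object, not Galaxy code:

    import os
    from types import SimpleNamespace

    from galaxy.objectstore._util import fix_permissions

    # Stand-in for an object store's `config`; only umask and gid are read.
    config = SimpleNamespace(umask=0o077, gid=os.getgid())
    cache_dir = "/tmp/galaxy-cache/000"
    os.makedirs(cache_dir, exist_ok=True)
    # Walk cache_dir and normalize directory/file permissions.
    fix_permissions(config, cache_dir)

Each store below then calls fix_permissions(self.config, path) where it previously carried its own private _fix_permissions copy.
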
--- lib/galaxy/objectstore/_util.py | 15 +++++++++++++++ lib/galaxy/objectstore/azure_blob.py | 18 ++++-------------- lib/galaxy/objectstore/cloud.py | 17 +++-------------- lib/galaxy/objectstore/irods.py | 17 ++++------------- lib/galaxy/objectstore/pithos.py | 16 +++------------- lib/galaxy/objectstore/s3.py | 17 +++-------------- 6 files changed, 32 insertions(+), 68 deletions(-) create mode 100644 lib/galaxy/objectstore/_util.py diff --git a/lib/galaxy/objectstore/_util.py b/lib/galaxy/objectstore/_util.py new file mode 100644 index 000000000000..e70f26e0661e --- /dev/null +++ b/lib/galaxy/objectstore/_util.py @@ -0,0 +1,15 @@ +import os + +from galaxy.util import umask_fix_perms + + +def fix_permissions(config, rel_path: str): + """Set permissions on rel_path""" + for basedir, _, files in os.walk(rel_path): + umask_fix_perms(basedir, config.umask, 0o777, config.gid) + for filename in files: + path = os.path.join(basedir, filename) + # Ignore symlinks + if os.path.islink(path): + continue + umask_fix_perms(path, config.umask, 0o666, config.gid) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 39e3c7490eb1..817f8f190eb6 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -27,7 +27,6 @@ ) from galaxy.util import ( directory_hash_id, - umask_fix_perms, unlink, ) from galaxy.util.path import safe_relpath @@ -38,6 +37,8 @@ InProcessCacheMonitor, parse_caching_config_dict_from_xml, ) +from ._util import fix_permissions + NO_BLOBSERVICE_ERROR_MESSAGE = ( "ObjectStore configured, but no azure.storage.blob dependency available." @@ -232,17 +233,6 @@ def _construct_path( return rel_path - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - def _get_cache_path(self, rel_path): return os.path.abspath(os.path.join(self.staging_path, rel_path)) @@ -278,7 +268,7 @@ def _pull_into_cache(self, rel_path): os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) # Now pull in the file file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) return file_ok def _download(self, rel_path): @@ -556,7 +546,7 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if source_file != cache_file and self.cache_updated_data: # FIXME? Should this be a `move`? shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) + fix_permissions(self.config, cache_file) except OSError: log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) else: diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 0a7f80e37b31..e9b09b264965 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -18,7 +18,6 @@ from galaxy.util import ( directory_hash_id, safe_relpath, - umask_fix_perms, unlink, ) from . 
import ConcreteObjectStore @@ -28,6 +27,7 @@ InProcessCacheMonitor, ) from .s3 import parse_config_xml +from ._util import fix_permissions try: from cloudbridge.factory import ( @@ -260,17 +260,6 @@ def _get_bucket(self, bucket_name): log.exception(f"Could not get bucket '{bucket_name}'") raise Exception - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - def _construct_path( self, obj, @@ -367,7 +356,7 @@ def _pull_into_cache(self, rel_path): os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) # Now pull in the file file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) return file_ok def _transfer_cb(self, complete, total): @@ -661,7 +650,7 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if source_file != cache_file and self.cache_updated_data: # FIXME? Should this be a `move`? shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) + fix_permissions(self.config, cache_file) except OSError: log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) else: diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index bb0bdead62b2..8795ae027e34 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -33,6 +33,8 @@ ) from galaxy.util.path import safe_relpath from . import DiskObjectStore +from ._util import fix_permissions + IRODS_IMPORT_MESSAGE = "The Python irods package is required to use this feature, please install it" # 1 MB @@ -314,17 +316,6 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - def _construct_path( self, obj, @@ -432,7 +423,7 @@ def _pull_into_cache(self, rel_path): os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) # Now pull in the file file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) log.debug("irods_pt _pull_into_cache: %s", ipt_timer) return file_ok @@ -779,7 +770,7 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if source_file != cache_file and self.cache_updated_data: # FIXME? Should this be a `move`? shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) + fix_permissions(self.config, cache_file) except OSError: log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) else: diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 60a710f1542d..611a16820824 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -27,6 +27,7 @@ ) from galaxy.util.path import safe_relpath from . 
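
A minimal, self-contained sketch (toy classes, not Galaxy's) of the mixin shape this change moves toward: shared cache-path helpers live on one mixin, and each concrete store combines it with its base class, which supplies staging_path:

    import os

    class UsesCacheMixin:
        staging_path: str  # supplied by the concrete store

        def _get_cache_path(self, rel_path: str) -> str:
            return os.path.abspath(os.path.join(self.staging_path, rel_path))

        def _in_cache(self, rel_path: str) -> bool:
            return os.path.exists(self._get_cache_path(rel_path))

    class BaseStore:
        def __init__(self, staging_path: str):
            self.staging_path = staging_path

    class ToyBlobStore(BaseStore, UsesCacheMixin):
        pass

    store = ToyBlobStore("/tmp/galaxy-cache")
    print(store._in_cache("000/dataset_1.dat"))  # False until the file is pulled down
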
import ConcreteObjectStore +from ._util import fix_permissions NO_KAMAKI_ERROR_MESSAGE = ( "ObjectStore configured, but no kamaki.clients dependency available." @@ -215,17 +216,6 @@ def _in_cache(self, rel_path): cache_path = self._get_cache_path(rel_path) return os.path.exists(cache_path) - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - def _pull_into_cache(self, rel_path): # Ensure the cache directory structure exists (e.g., dataset_#_files/) rel_path_dir = os.path.dirname(rel_path) @@ -235,7 +225,7 @@ def _pull_into_cache(self, rel_path): # Now pull in the file cache_path = self._get_cache_path(rel_path_dir) self.pithos.download_object(rel_path, cache_path) - self._fix_permissions(cache_path) + fix_permissions(self.config, cache_path) return cache_path # No need to overwrite "shutdown" @@ -418,7 +408,7 @@ def _update_from_file(self, obj, **kwargs): try: if source_path != cache_path: shutil.copy2(source_path, cache_path) - self._fix_permissions(cache_path) + fix_permissions(self.config, cache_path) except OSError: log.exception('Trouble copying source file "%s" to cache "%s"', source_path, cache_path) else: diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 1caf355aec68..a807845a1b1f 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -27,7 +27,6 @@ from galaxy.util import ( directory_hash_id, string_as_bool, - umask_fix_perms, unlink, which, ) @@ -40,6 +39,7 @@ parse_caching_config_dict_from_xml, ) from .s3_multipart_upload import multipart_upload +from ._util import fix_permissions NO_BOTO_ERROR_MESSAGE = ( "S3/Swift object store configured, but no boto dependency available." @@ -288,17 +288,6 @@ def _get_bucket(self, bucket_name): # raise error raise S3ResponseError - def _fix_permissions(self, rel_path): - """Set permissions on rel_path""" - for basedir, _, files in os.walk(rel_path): - umask_fix_perms(basedir, self.config.umask, 0o777, self.config.gid) - for filename in files: - path = os.path.join(basedir, filename) - # Ignore symlinks - if os.path.islink(path): - continue - umask_fix_perms(path, self.config.umask, 0o666, self.config.gid) - def _construct_path( self, obj, @@ -417,7 +406,7 @@ def _pull_into_cache(self, rel_path): os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) # Now pull in the file file_ok = self._download(rel_path) - self._fix_permissions(self._get_cache_path(rel_path_dir)) + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) return file_ok def _transfer_cb(self, complete, total): @@ -722,7 +711,7 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if source_file != cache_file and self.cache_updated_data: # FIXME? Should this be a `move`? shutil.copy2(source_file, cache_file) - self._fix_permissions(cache_file) + fix_permissions(self.config, cache_file) except OSError: log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) else: From fa53abd75eaf0ea398a970851c6389ed077595d8 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Mon, 6 May 2024 17:19:46 -0400 Subject: [PATCH 03/40] De-duplication around object store _construct_path. 
--- lib/galaxy/objectstore/azure_blob.py | 67 ++------------------------ lib/galaxy/objectstore/caching.py | 70 ++++++++++++++++++++++++++++ lib/galaxy/objectstore/cloud.py | 65 ++------------------------ lib/galaxy/objectstore/irods.py | 63 +------------------------ lib/galaxy/objectstore/pithos.py | 66 +------------------------- lib/galaxy/objectstore/rucio.py | 54 +-------------------- lib/galaxy/objectstore/s3.py | 57 ++-------------------- 7 files changed, 85 insertions(+), 357 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 817f8f190eb6..45e8b4a99057 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -31,14 +31,14 @@ ) from galaxy.util.path import safe_relpath from . import ConcreteObjectStore +from ._util import fix_permissions from .caching import ( CacheTarget, enable_cache_monitor, InProcessCacheMonitor, parse_caching_config_dict_from_xml, + UsesCache, ) -from ._util import fix_permissions - NO_BLOBSERVICE_ERROR_MESSAGE = ( "ObjectStore configured, but no azure.storage.blob dependency available." @@ -92,7 +92,7 @@ def parse_config_xml(config_xml): raise -class AzureBlobObjectStore(ConcreteObjectStore): +class AzureBlobObjectStore(ConcreteObjectStore, UsesCache): """ Object store that stores objects as blobs in an Azure Blob Container. A local cache exists that is used as an intermediate location for files between @@ -180,62 +180,6 @@ def _configure_connection(self): ) self.service = service - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # S3 folders are marked by having trailing '/' so add it now - # rel_path = '%s/' % rel_path # assume for now we don't need this in Azure blob storage. 
- - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - def _get_size_in_azure(self, rel_path): try: properties = self._blob_client(rel_path).get_blob_properties() @@ -256,11 +200,6 @@ def _in_azure(self, rel_path): def _blob_client(self, rel_path: str): return self.service.get_blob_client(self.container_name, rel_path) - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - def _pull_into_cache(self, rel_path): # Ensure the cache directory structure exists (e.g., dataset_#_files/) rel_path_dir = os.path.dirname(rel_path) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 502d55b5a75c..e0b407e91e32 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -7,6 +7,7 @@ import time from math import inf from typing import ( + Dict, List, Optional, Tuple, @@ -14,10 +15,13 @@ from typing_extensions import NamedTuple +from galaxy.exceptions import ObjectInvalid from galaxy.util import ( + directory_hash_id, nice_size, string_as_bool, ) +from galaxy.util.path import safe_relpath from galaxy.util.sleeper import Sleeper log = logging.getLogger(__name__) @@ -213,3 +217,69 @@ def shutdown(self): # Wait for the cache monitor thread to join before ending self.cache_monitor_thread.join(5) + + +# mixin for object stores using a cache directory +class UsesCache: + staging_path: str + extra_dirs: Dict[str, str] + + def _construct_path( + self, + obj, + base_dir=None, + dir_only=None, + extra_dir=None, + extra_dir_at_root=False, + alt_name=None, + obj_dir=False, + in_cache=False, + **kwargs, + ): + # extra_dir should never be constructed from provided data but just + # make sure there are no shenannigans afoot + if extra_dir and extra_dir != os.path.normpath(extra_dir): + log.warning("extra_dir is not normalized: %s", extra_dir) + raise ObjectInvalid("The requested object is invalid") + # ensure that any parent directory references in alt_name would not + # result in a path not contained in the directory path constructed here + if alt_name: + if not safe_relpath(alt_name): + log.warning("alt_name would locate path outside dir: %s", alt_name) + raise ObjectInvalid("The requested object is invalid") + # alt_name can contain parent directory references, but S3 will not + # follow them, so if they are valid we normalize them out + alt_name = os.path.normpath(alt_name) + + object_id = self._get_object_id(obj) + rel_path = os.path.join(*directory_hash_id(object_id)) + + if extra_dir is not None: + if extra_dir_at_root: + rel_path = os.path.join(extra_dir, rel_path) + else: + rel_path = os.path.join(rel_path, extra_dir) + + # for JOB_WORK directory + if obj_dir: + rel_path = os.path.join(rel_path, str(object_id)) + if base_dir: + base = self.extra_dirs.get(base_dir) + assert base + return os.path.join(base, rel_path) + + if not dir_only: + rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{object_id}.dat") + + if in_cache: + return self._get_cache_path(rel_path) + + return rel_path + + def _get_cache_path(self, rel_path: str) -> str: + return os.path.abspath(os.path.join(self.staging_path, rel_path)) + + def _in_cache(self, rel_path: str) -> bool: + 
"""Check if the given dataset is in the local cache and return True if so.""" + cache_path = self._get_cache_path(rel_path) + return os.path.exists(cache_path) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index e9b09b264965..0cbde3616526 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -21,13 +21,14 @@ unlink, ) from . import ConcreteObjectStore +from ._util import fix_permissions from .caching import ( CacheTarget, enable_cache_monitor, InProcessCacheMonitor, + UsesCache, ) from .s3 import parse_config_xml -from ._util import fix_permissions try: from cloudbridge.factory import ( @@ -64,7 +65,7 @@ def _config_to_dict(self): } -class Cloud(ConcreteObjectStore, CloudConfigMixin): +class Cloud(ConcreteObjectStore, CloudConfigMixin, UsesCache): """ Object store that stores objects as items in an cloud storage. A local cache exists that is used as an intermediate location for files between @@ -260,60 +261,6 @@ def _get_bucket(self, bucket_name): log.exception(f"Could not get bucket '{bucket_name}'") raise Exception - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # S3 folders are marked by having trailing '/' so add it now - rel_path = f"{rel_path}/" - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - def _get_transfer_progress(self): return self.transfer_progress @@ -343,12 +290,6 @@ def _key_exists(self, rel_path): return False return exists - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - # log.debug("------ Checking cache for rel_path %s" % rel_path) - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - def _pull_into_cache(self, rel_path): # Ensure the cache directory structure exists (e.g., dataset_#_files/) rel_path_dir = os.path.dirname(rel_path) diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 
8795ae027e34..1404ead95d66 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -34,7 +34,7 @@ from galaxy.util.path import safe_relpath from . import DiskObjectStore from ._util import fix_permissions - +from .caching import UsesCache IRODS_IMPORT_MESSAGE = "The Python irods package is required to use this feature, please install it" # 1 MB @@ -153,7 +153,7 @@ def _config_to_dict(self): } -class IRODSObjectStore(DiskObjectStore, CloudConfigMixin): +class IRODSObjectStore(DiskObjectStore, CloudConfigMixin, UsesCache): """ Object store that stores files as data objects in an iRODS Zone. A local cache exists that is used as an intermediate location for files between Galaxy and iRODS. @@ -316,60 +316,6 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - ipt_timer = ExecutionTimer() - # extra_dir should never be constructed from provided data but just - # make sure there are no shenanigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - log.debug("irods_pt _construct_path: %s", ipt_timer) - return os.path.join(base, rel_path) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - log.debug("irods_pt _construct_path: %s", ipt_timer) - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - # rel_path is file or folder? def _get_size_in_irods(self, rel_path): ipt_timer = ExecutionTimer() @@ -410,11 +356,6 @@ def _data_object_exists(self, rel_path): finally: log.debug("irods_pt _data_object_exists: %s", ipt_timer) - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - def _pull_into_cache(self, rel_path): ipt_timer = ExecutionTimer() # Ensure the cache directory structure exists (e.g., dataset_#_files/) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 611a16820824..1c4c6b8065ca 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -28,6 +28,7 @@ from galaxy.util.path import safe_relpath from . 
import ConcreteObjectStore from ._util import fix_permissions +from .caching import UsesCache NO_KAMAKI_ERROR_MESSAGE = ( "ObjectStore configured, but no kamaki.clients dependency available." @@ -89,7 +90,7 @@ def parse_config_xml(config_xml): return r -class PithosObjectStore(ConcreteObjectStore): +class PithosObjectStore(ConcreteObjectStore, UsesCache): """ Object store that stores objects as items in a Pithos+ container. Cache is ignored for the time being. @@ -153,69 +154,6 @@ def _init_pithos(self): if project and c.get("x-container-policy-project") != project: self.pithos.reassign_container(project) - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - """Construct path from object and parameters""" - # param extra_dir: should never be constructed from provided data but - # just make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning(f"extra_dir is not normalized: {extra_dir}") - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning(f"alt_name would locate path outside dir: {alt_name}") - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # Pithos+ folders are marked by having trailing '/' so add it now - rel_path = f"{rel_path}/" - - if not dir_only: - an = alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat" - rel_path = os.path.join(rel_path, an) - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if - so. - """ - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - def _pull_into_cache(self, rel_path): # Ensure the cache directory structure exists (e.g., dataset_#_files/) rel_path_dir = os.path.dirname(rel_path) diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 1d9c3d48b8d7..349764185c42 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -39,6 +39,7 @@ enable_cache_monitor, InProcessCacheMonitor, parse_caching_config_dict_from_xml, + UsesCache, ) log = logging.getLogger(__name__) @@ -273,7 +274,7 @@ def delete(self, key, auth_token): return True -class RucioObjectStore(ConcreteObjectStore): +class RucioObjectStore(ConcreteObjectStore, UsesCache): """ Object store implementation that uses ORNL remote data broker. 
@@ -312,57 +313,6 @@ def _initialize(self): if self.enable_cache_monitor: self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenanigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(str(base), rel_path) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - def _pull_into_cache(self, rel_path, auth_token): log.debug("rucio _pull_into_cache: %s", rel_path) # Ensure the cache directory structure exists (e.g., dataset_#_files/) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index a807845a1b1f..73e24799c6b5 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -32,14 +32,15 @@ ) from galaxy.util.path import safe_relpath from . import ConcreteObjectStore +from ._util import fix_permissions from .caching import ( CacheTarget, enable_cache_monitor, InProcessCacheMonitor, parse_caching_config_dict_from_xml, + UsesCache, ) from .s3_multipart_upload import multipart_upload -from ._util import fix_permissions NO_BOTO_ERROR_MESSAGE = ( "S3/Swift object store configured, but no boto dependency available." @@ -154,7 +155,7 @@ def _config_to_dict(self): } -class S3ObjectStore(ConcreteObjectStore, CloudConfigMixin): +class S3ObjectStore(ConcreteObjectStore, CloudConfigMixin, UsesCache): """ Object store that stores objects as items in an AWS S3 bucket. 
A local cache exists that is used as an intermediate location for files between @@ -288,58 +289,6 @@ def _get_bucket(self, bucket_name): # raise error raise S3ResponseError - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(self._get_object_id(obj))) - if base_dir: - base = self.extra_dirs.get(base_dir) - return os.path.join(base, rel_path) - - # S3 folders are marked by having trailing '/' so add it now - rel_path = f"{rel_path}/" - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - if in_cache: - return self._get_cache_path(rel_path) - return rel_path - - def _get_cache_path(self, rel_path): - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - def _get_transfer_progress(self): return self.transfer_progress From 08e0d10322de688dc2f2d5aca72059d46e189861 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 09:46:13 -0400 Subject: [PATCH 04/40] Reduce duplication around _pull_into_cache. 
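
A condensed sketch (toy classes, not Galaxy's) of the template-method shape introduced here: a shared _pull_into_cache handles the cache-directory bookkeeping and delegates the actual transfer to a backend-specific _download hook:

    import os

    class CachePuller:
        def __init__(self, staging_path: str):
            self.staging_path = staging_path

        def _get_cache_path(self, rel_path: str) -> str:
            return os.path.join(self.staging_path, rel_path)

        def _pull_into_cache(self, rel_path: str) -> bool:
            # Shared bookkeeping: ensure the cache directory exists ...
            os.makedirs(os.path.dirname(self._get_cache_path(rel_path)), exist_ok=True)
            # ... then let the concrete backend fetch the bytes.
            return self._download(rel_path)

        def _download(self, rel_path: str) -> bool:
            raise NotImplementedError

    class ToyBackend(CachePuller):
        def _download(self, rel_path: str) -> bool:
            # A real backend would stream from Azure/S3/iRODS here.
            with open(self._get_cache_path(rel_path), "w") as fh:
                fh.write("downloaded\n")
            return True

    print(ToyBackend("/tmp/toy-cache")._pull_into_cache("000/dataset_1.dat"))
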
--- lib/galaxy/objectstore/azure_blob.py | 10 ---------- lib/galaxy/objectstore/caching.py | 20 ++++++++++++++++++++ lib/galaxy/objectstore/cloud.py | 10 ---------- lib/galaxy/objectstore/irods.py | 12 ------------ lib/galaxy/objectstore/pithos.py | 14 +++----------- lib/galaxy/objectstore/s3.py | 10 ---------- 6 files changed, 23 insertions(+), 53 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 45e8b4a99057..296ffaf8359e 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -200,16 +200,6 @@ def _in_azure(self, rel_path): def _blob_client(self, rel_path: str): return self.service.get_blob_client(self.container_name, rel_path) - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - return file_ok - def _download(self, rel_path): local_destination = self._get_cache_path(rel_path) try: diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index e0b407e91e32..6adb889957b9 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -7,6 +7,7 @@ import time from math import inf from typing import ( + Any, Dict, List, Optional, @@ -18,11 +19,13 @@ from galaxy.exceptions import ObjectInvalid from galaxy.util import ( directory_hash_id, + ExecutionTimer, nice_size, string_as_bool, ) from galaxy.util.path import safe_relpath from galaxy.util.sleeper import Sleeper +from ._util import fix_permissions log = logging.getLogger(__name__) @@ -223,6 +226,7 @@ def shutdown(self): class UsesCache: staging_path: str extra_dirs: Dict[str, str] + config: Any def _construct_path( self, @@ -283,3 +287,19 @@ def _in_cache(self, rel_path: str) -> bool: """Check if the given dataset is in the local cache and return True if so.""" cache_path = self._get_cache_path(rel_path) return os.path.exists(cache_path) + + def _pull_into_cache(self, rel_path) -> bool: + ipt_timer = ExecutionTimer() + # Ensure the cache directory structure exists (e.g., dataset_#_files/) + rel_path_dir = os.path.dirname(rel_path) + if not os.path.exists(self._get_cache_path(rel_path_dir)): + os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) + # Now pull in the file + file_ok = self._download(rel_path) + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) + log.debug("_pull_into_cache: %s", ipt_timer) + return file_ok + + def _get_object_id(self, obj: Any) -> str: ... + + def _download(self, rel_path: str) -> bool: ... 
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 0cbde3616526..073040bfc193 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -290,16 +290,6 @@ def _key_exists(self, rel_path): return False return exists - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - return file_ok - def _transfer_cb(self, complete, total): self.transfer_progress += 10 diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 1404ead95d66..312d59b79090 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -356,18 +356,6 @@ def _data_object_exists(self, rel_path): finally: log.debug("irods_pt _data_object_exists: %s", ipt_timer) - def _pull_into_cache(self, rel_path): - ipt_timer = ExecutionTimer() - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - log.debug("irods_pt _pull_into_cache: %s", ipt_timer) - return file_ok - def _download(self, rel_path): ipt_timer = ExecutionTimer() log.debug("Pulling data object '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 1c4c6b8065ca..3e189f71187c 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -154,17 +154,9 @@ def _init_pithos(self): if project and c.get("x-container-policy-project") != project: self.pithos.reassign_container(project) - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - rel_cache_path_dir = self._get_cache_path(rel_path_dir) - if not os.path.exists(rel_cache_path_dir): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - cache_path = self._get_cache_path(rel_path_dir) - self.pithos.download_object(rel_path, cache_path) - fix_permissions(self.config, cache_path) - return cache_path + def _download(self, rel_path): + local_destination = self._get_cache_path(rel_path) + self.pithos.download_object(rel_path, local_destination) # No need to overwrite "shutdown" diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 73e24799c6b5..15d5f020bd4b 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -348,16 +348,6 @@ def _in_cache(self, rel_path): # else: # return False - def _pull_into_cache(self, rel_path): - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - return file_ok - def _transfer_cb(self, complete, total): 
self.transfer_progress += 10 From e2c36736cb898140e5195a7233f22776f808bcb5 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 16:26:19 -0400 Subject: [PATCH 05/40] Converge a few cloud object stores toward same names ahead of refactor. --- lib/galaxy/objectstore/azure_blob.py | 6 +++--- lib/galaxy/objectstore/cloud.py | 6 +++--- lib/galaxy/objectstore/irods.py | 6 +++--- lib/galaxy/objectstore/s3.py | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 296ffaf8359e..ab341b6c4ebb 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -189,7 +189,7 @@ def _get_size_in_azure(self, rel_path): log.exception("Could not get size of blob '%s' from Azure", rel_path) return -1 - def _in_azure(self, rel_path): + def _exists_remotely(self, rel_path: str): try: exists = self._blob_client(rel_path).exists() except AzureHttpError: @@ -289,7 +289,7 @@ def _exists(self, obj, **kwargs): return True in_cache = self._in_cache(rel_path) - in_azure = self._in_azure(rel_path) + in_azure = self._exists_remotely(rel_path) # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure)) # dir_only does not get synced so shortcut the decision dir_only = kwargs.get("dir_only", False) @@ -408,7 +408,7 @@ def _delete(self, obj, entire_dir=False, **kwargs): # Delete from cache first unlink(self._get_cache_path(rel_path), ignore_errors=True) # Delete from S3 as well - if self._in_azure(rel_path): + if self._exists_remotely(rel_path): log.debug("Deleting from Azure: %s", rel_path) self._blob_client(rel_path).delete_blob() return True diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 073040bfc193..2dacfb5bd02e 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -272,7 +272,7 @@ def _get_size_in_cloud(self, rel_path): log.exception("Could not get size of key '%s' from S3", rel_path) return -1 - def _key_exists(self, rel_path): + def _exists_remotely(self, rel_path): exists = False try: # A hackish way of testing if the rel_path is a folder vs a file @@ -406,7 +406,7 @@ def _exists(self, obj, **kwargs): if self._in_cache(rel_path): in_cache = True # Check cloud - in_cloud = self._key_exists(rel_path) + in_cloud = self._exists_remotely(rel_path) # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3)) # dir_only does not get synced so shortcut the decision dir_only = kwargs.get("dir_only", False) @@ -505,7 +505,7 @@ def _delete(self, obj, entire_dir=False, **kwargs): # Delete from cache first unlink(self._get_cache_path(rel_path), ignore_errors=True) # Delete from S3 as well - if self._key_exists(rel_path): + if self._exists_remotely(rel_path): key = self.bucket.objects.get(rel_path) log.debug("Deleting key %s", key.name) key.delete() diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 312d59b79090..d6b891f9bbaf 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -337,7 +337,7 @@ def _get_size_in_irods(self, rel_path): log.debug("irods_pt _get_size_in_irods: %s", ipt_timer) # rel_path is file or folder? 
- def _data_object_exists(self, rel_path): + def _exists_remotely(self, rel_path): ipt_timer = ExecutionTimer() p = Path(rel_path) data_object_name = p.stem + p.suffix @@ -354,7 +354,7 @@ def _data_object_exists(self, rel_path): log.debug("Collection or data object (%s) does not exist", data_object_path) return False finally: - log.debug("irods_pt _data_object_exists: %s", ipt_timer) + log.debug("irods_pt _exists_remotely: %s", ipt_timer) def _download(self, rel_path): ipt_timer = ExecutionTimer() @@ -487,7 +487,7 @@ def _exists(self, obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) # Check cache and irods - if self._in_cache(rel_path) or self._data_object_exists(rel_path): + if self._in_cache(rel_path) or self._exists_remotely(rel_path): log.debug("irods_pt _exists: %s", ipt_timer) return True diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 15d5f020bd4b..ec5319ca6f65 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -300,7 +300,7 @@ def _get_size_in_s3(self, rel_path): log.exception("Could not get size of key '%s' from S3", rel_path) return -1 - def _key_exists(self, rel_path): + def _exists_remotely(self, rel_path): exists = False try: # A hackish way of testing if the rel_path is a folder vs a file @@ -330,7 +330,7 @@ def _in_cache(self, rel_path): # looking likely to be implementable reliably. # if os.path.exists(cache_path): # # print("***1 %s exists" % cache_path) - # if self._key_exists(rel_path): + # if self._exists_remotely(rel_path): # # print("***2 %s exists in S3" % rel_path) # # Make sure the size in cache is available in its entirety # # print("File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path))) @@ -474,7 +474,7 @@ def _exists(self, obj, **kwargs): if self._in_cache(rel_path): in_cache = True # Check S3 - in_s3 = self._key_exists(rel_path) + in_s3 = self._exists_remotely(rel_path) # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3)) # dir_only does not get synced so shortcut the decision if dir_only: @@ -572,7 +572,7 @@ def _delete(self, obj, entire_dir=False, **kwargs): # Delete from cache first unlink(self._get_cache_path(rel_path), ignore_errors=True) # Delete from S3 as well - if self._key_exists(rel_path): + if self._exists_remotely(rel_path): key = Key(self._bucket, rel_path) log.debug("Deleting key %s", key.name) key.delete() From 6e79b10346c1fbe1103b6bc8ccb2604e4dde590c Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 16:32:27 -0400 Subject: [PATCH 06/40] De-duplicate object store _exists across 4 object stores. 
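
The four copies being removed all make the same decision, restated below as a free function for illustration only; in_cache, exists_remotely, and push stand in for the mixin hooks, and the base_dir/job-work shortcut is omitted:

    def exists(in_cache: bool, exists_remotely: bool, dir_only: bool, push) -> bool:
        # Directories are never synced, so presence in either location is enough.
        if dir_only:
            return in_cache or exists_remotely
        # A cached-but-unpushed file is re-uploaded before reporting success.
        if in_cache and not exists_remotely:
            push()
            return True
        return exists_remotely
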
--- lib/galaxy/objectstore/azure_blob.py | 33 ------------------------- lib/galaxy/objectstore/caching.py | 37 +++++++++++++++++++++++++++- lib/galaxy/objectstore/cloud.py | 33 ------------------------- lib/galaxy/objectstore/irods.py | 21 ---------------- lib/galaxy/objectstore/s3.py | 34 ------------------------- 5 files changed, 36 insertions(+), 122 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index ab341b6c4ebb..9f4c59a8a3b3 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -276,39 +276,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): # Public Methods # ################## - def _exists(self, obj, **kwargs): - in_cache = in_azure = False - rel_path = self._construct_path(obj, **kwargs) - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - - # check job work directory stuff early to skip API hits. - if dir_only and base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - return True - - in_cache = self._in_cache(rel_path) - in_azure = self._exists_remotely(rel_path) - # log.debug("~~~~~~ File '%s' exists in cache: %s; in azure: %s" % (rel_path, in_cache, in_azure)) - # dir_only does not get synced so shortcut the decision - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only: - if in_cache or in_azure: - return True - else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? - if in_cache and not in_azure: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif in_azure: - return True - else: - return False - def file_ready(self, obj, **kwargs): """ A helper method that checks if a file corresponding to a dataset is diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 6adb889957b9..77c19ea4b2c6 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -297,9 +297,44 @@ def _pull_into_cache(self, rel_path) -> bool: # Now pull in the file file_ok = self._download(rel_path) fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - log.debug("_pull_into_cache: %s", ipt_timer) + log.debug("_pull_into_cache: %s\n\n\n\n\n\n", ipt_timer) return file_ok + def _exists(self, obj, **kwargs): + in_cache = exists_remotely = False + rel_path = self._construct_path(obj, **kwargs) + dir_only = kwargs.get("dir_only", False) + base_dir = kwargs.get("base_dir", None) + + # check job work directory stuff early to skip API hits. + if dir_only and base_dir: + if not os.path.exists(rel_path): + os.makedirs(rel_path, exist_ok=True) + return True + + in_cache = self._in_cache(rel_path) + exists_remotely = self._exists_remotely(rel_path) + dir_only = kwargs.get("dir_only", False) + base_dir = kwargs.get("base_dir", None) + if dir_only: + if in_cache or exists_remotely: + return True + else: + return False + + # TODO: Sync should probably not be done here. Add this to an async upload stack? + if in_cache and not exists_remotely: + self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) + return True + elif exists_remotely: + return True + else: + return False + + def _exists_remotely(self, rel_path: str) -> bool: ... + + def _push_to_os(self, rel_path, source_file: str) -> None: ... + def _get_object_id(self, obj: Any) -> str: ... def _download(self, rel_path: str) -> bool: ... 
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 2dacfb5bd02e..e518bc8089fb 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -398,39 +398,6 @@ def file_ready(self, obj, **kwargs): ) return False - def _exists(self, obj, **kwargs): - in_cache = False - rel_path = self._construct_path(obj, **kwargs) - - # Check cache - if self._in_cache(rel_path): - in_cache = True - # Check cloud - in_cloud = self._exists_remotely(rel_path) - # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3)) - # dir_only does not get synced so shortcut the decision - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only: - if in_cache or in_cloud: - return True - # for JOB_WORK directory - elif base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - return True - else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? - if in_cache and not in_cloud: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif in_cloud: - return True - else: - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index d6b891f9bbaf..78b11906bebe 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -482,27 +482,6 @@ def file_ready(self, obj, **kwargs): log.debug("irods_pt _file_ready: %s", ipt_timer) return False - def _exists(self, obj, **kwargs): - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - - # Check cache and irods - if self._in_cache(rel_path) or self._exists_remotely(rel_path): - log.debug("irods_pt _exists: %s", ipt_timer) - return True - - # dir_only does not get synced so shortcut the decision - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only and base_dir: - # for JOB_WORK directory - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - log.debug("irods_pt _exists: %s", ipt_timer) - return True - log.debug("irods_pt _exists: %s", ipt_timer) - return False - def _create(self, obj, **kwargs): ipt_timer = ExecutionTimer() if not self._exists(obj, **kwargs): diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index ec5319ca6f65..6282190b379d 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -458,40 +458,6 @@ def file_ready(self, obj, **kwargs): ) return False - def _exists(self, obj, **kwargs): - in_cache = in_s3 = False - rel_path = self._construct_path(obj, **kwargs) - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - - # check job work directory stuff early to skip API hits. - if dir_only and base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - return True - - # Check cache - if self._in_cache(rel_path): - in_cache = True - # Check S3 - in_s3 = self._exists_remotely(rel_path) - # log.debug("~~~~~~ File '%s' exists in cache: %s; in s3: %s" % (rel_path, in_cache, in_s3)) - # dir_only does not get synced so shortcut the decision - if dir_only: - if in_cache or in_s3: - return True - else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? 
- if in_cache and not in_s3: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif in_s3: - return True - else: - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields From b098da310db3ad55fe95296bc7e9534eb4a1817f Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 16:36:26 -0400 Subject: [PATCH 07/40] Remove unused object store method. --- lib/galaxy/objectstore/__init__.py | 14 -------------- lib/galaxy/objectstore/azure_blob.py | 17 ----------------- lib/galaxy/objectstore/cloud.py | 18 ------------------ lib/galaxy/objectstore/irods.py | 21 --------------------- lib/galaxy/objectstore/rucio.py | 19 ------------------- lib/galaxy/objectstore/s3.py | 18 ------------------ 6 files changed, 107 deletions(-) diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py index 62d49420e5e6..b954ea910f65 100644 --- a/lib/galaxy/objectstore/__init__.py +++ b/lib/galaxy/objectstore/__init__.py @@ -373,16 +373,6 @@ def shutdown(self): """Close any connections for this ObjectStore.""" self.running = False - def file_ready( - self, obj, base_dir=None, dir_only=False, extra_dir=None, extra_dir_at_root=False, alt_name=None, obj_dir=False - ): - """ - Check if a file corresponding to a dataset is ready to be used. - - Return True if so, False otherwise - """ - return True - @classmethod def parse_xml(clazz, config_xml): """Parse an XML description of a configuration for this object store. @@ -938,10 +928,6 @@ def _exists(self, obj, **kwargs): """Determine if the `obj` exists in any of the backends.""" return self._call_method("_exists", obj, False, False, **kwargs) - def file_ready(self, obj, **kwargs): - """Determine if the file for `obj` is ready to be used by any of the backends.""" - return self._call_method("file_ready", obj, False, False, **kwargs) - def _create(self, obj, **kwargs): """Create a backing file in a random backend.""" objectstore = random.choice(list(self.backends.values())) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 9f4c59a8a3b3..8c6c96519c57 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -276,23 +276,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): # Public Methods # ################## - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. 
- """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - local_size = os.path.getsize(self._get_cache_path(rel_path)) - remote_size = self._get_size_in_azure(rel_path) - if local_size == remote_size: - return True - else: - log.debug("Waiting for dataset %s to transfer from OS: %s/%s", rel_path, local_size, remote_size) - - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index e518bc8089fb..e825a4afaa0b 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -380,24 +380,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) return False - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. - """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_cloud(rel_path): - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self._get_size_in_cloud(rel_path), - ) - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 78b11906bebe..53f02b0856fb 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -461,27 +461,6 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None): finally: log.debug("irods_pt _push_to_irods: %s", ipt_timer) - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. - """ - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_irods(rel_path): - log.debug("irods_pt _file_ready: %s", ipt_timer) - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self._get_size_in_irods(rel_path), - ) - log.debug("irods_pt _file_ready: %s", ipt_timer) - return False - def _create(self, obj, **kwargs): ipt_timer = ExecutionTimer() if not self._exists(obj, **kwargs): diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 349764185c42..3cf7b247d477 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -364,25 +364,6 @@ def _exists(self, obj, **kwargs): def parse_xml(cls, config_xml): return parse_config_xml(config_xml) - def file_ready(self, obj, **kwargs): - log.debug("rucio file_ready") - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. 
- """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self.rucio_broker.get_size(rel_path): - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self.rucio_broker.get_size(rel_path), - ) - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 6282190b379d..dae930fb458e 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -440,24 +440,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): raise return False - def file_ready(self, obj, **kwargs): - """ - A helper method that checks if a file corresponding to a dataset is - ready and available to be used. Return ``True`` if so, ``False`` otherwise. - """ - rel_path = self._construct_path(obj, **kwargs) - # Make sure the size in cache is available in its entirety - if self._in_cache(rel_path): - if os.path.getsize(self._get_cache_path(rel_path)) == self._get_size_in_s3(rel_path): - return True - log.debug( - "Waiting for dataset %s to transfer from OS: %s/%s", - rel_path, - os.path.getsize(self._get_cache_path(rel_path)), - self._get_size_in_s3(rel_path), - ) - return False - def _create(self, obj, **kwargs): if not self._exists(obj, **kwargs): # Pull out locally used fields From c8147f39da5c35b228bbf4ee67da72868f4acbba Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 16:43:28 -0400 Subject: [PATCH 08/40] Remove duplicated _create logic across 4 object stores. --- lib/galaxy/objectstore/azure_blob.py | 35 ---------------------------- lib/galaxy/objectstore/caching.py | 32 ++++++++++++++++++++++++- lib/galaxy/objectstore/cloud.py | 29 ----------------------- lib/galaxy/objectstore/irods.py | 31 ------------------------ lib/galaxy/objectstore/s3.py | 35 ---------------------------- 5 files changed, 31 insertions(+), 131 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 8c6c96519c57..bd29cc625f77 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -276,41 +276,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): # Public Methods # ################## - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - # Although not really necessary to create S3 folders (because S3 has - # flat namespace), do so for consistency with the regular file system - # S3 folders are marked by having trailing '/' so add it now - # s3_dir = '%s/' % rel_path - # self._push_to_os(s3_dir, from_string='') - # If instructed, 
create the dataset in cache & in S3 - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - def _empty(self, obj, **kwargs): if self._exists(obj, **kwargs): size = self._size(obj, **kwargs) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 77c19ea4b2c6..c9419303fb6a 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -331,9 +331,39 @@ def _exists(self, obj, **kwargs): else: return False + def _create(self, obj, **kwargs): + if not self._exists(obj, **kwargs): + # Pull out locally used fields + extra_dir = kwargs.get("extra_dir", None) + extra_dir_at_root = kwargs.get("extra_dir_at_root", False) + dir_only = kwargs.get("dir_only", False) + alt_name = kwargs.get("alt_name", None) + + # Construct hashed path + rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) + + # Optionally append extra_dir + if extra_dir is not None: + if extra_dir_at_root: + rel_path = os.path.join(extra_dir, rel_path) + else: + rel_path = os.path.join(rel_path, extra_dir) + + # Create given directory in cache + cache_dir = os.path.join(self.staging_path, rel_path) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + + # If instructed, create the dataset in cache & in S3 + if not dir_only: + rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") + open(os.path.join(self.staging_path, rel_path), "w").close() + self._push_to_os(rel_path, from_string="") + return self + def _exists_remotely(self, rel_path: str) -> bool: ... - def _push_to_os(self, rel_path, source_file: str) -> None: ... + def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: Optional[str] = None) -> None: ... def _get_object_id(self, obj: Any) -> str: ... 
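For illustration, outside the diff itself: with the consolidated _create above, a concrete backend only has to fill in the small hooks declared at the end of this hunk (_exists_remotely, _push_to_os, _get_object_id, _download). A rough, self-contained sketch of that contract with an invented in-memory backend -- toy names, not the Galaxy classes:

import os
import tempfile


class InMemoryBackendSketch:
    """Toy stand-in for a caching backend: a dict plays the remote store."""

    def __init__(self):
        self.staging_path = tempfile.mkdtemp()  # local cache root
        self._remote = {}

    def _get_object_id(self, obj):
        return str(obj)

    def _exists_remotely(self, rel_path):
        return rel_path in self._remote

    def _push_to_os(self, rel_path, source_file=None, from_string=None):
        if from_string is not None:
            self._remote[rel_path] = from_string.encode()
        elif source_file is not None:
            with open(source_file, "rb") as f:
                self._remote[rel_path] = f.read()

    def _download(self, rel_path):
        cache_path = os.path.join(self.staging_path, rel_path)
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, "wb") as f:
            f.write(self._remote[rel_path])
        return True


backend = InMemoryBackendSketch()
backend._push_to_os("0/0/1/dataset_1.dat", from_string="")
assert backend._exists_remotely("0/0/1/dataset_1.dat") and backend._download("0/0/1/dataset_1.dat")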
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index e825a4afaa0b..ac858530b469 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -380,35 +380,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) return False - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - def _empty(self, obj, **kwargs): if self._exists(obj, **kwargs): return bool(self._size(obj, **kwargs) == 0) diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 53f02b0856fb..7eec323c000d 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -461,37 +461,6 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None): finally: log.debug("irods_pt _push_to_irods: %s", ipt_timer) - def _create(self, obj, **kwargs): - ipt_timer = ExecutionTimer() - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_irods(rel_path, from_string="") - log.debug("irods_pt _create: %s", ipt_timer) - return self - def _empty(self, obj, **kwargs): if self._exists(obj, **kwargs): return bool(self._size(obj, **kwargs) > 0) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index dae930fb458e..328ab5727a60 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -440,41 +440,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): raise return False - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = 
kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - # Although not really necessary to create S3 folders (because S3 has - # flat namespace), do so for consistency with the regular file system - # S3 folders are marked by having trailing '/' so add it now - # s3_dir = '%s/' % rel_path - # self._push_to_os(s3_dir, from_string='') - # If instructed, create the dataset in cache & in S3 - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - def _empty(self, obj, **kwargs): if self._exists(obj, **kwargs): return bool(self._size(obj, **kwargs) == 0) From 65d40186b35f17bbc8ec3ff8e8dc815caf416586 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 16:48:26 -0400 Subject: [PATCH 09/40] Remove duplicated _empty across caching object stores. --- lib/galaxy/objectstore/azure_blob.py | 8 -------- lib/galaxy/objectstore/caching.py | 13 ++++++++++++- lib/galaxy/objectstore/cloud.py | 6 ------ lib/galaxy/objectstore/irods.py | 6 ------ lib/galaxy/objectstore/pithos.py | 9 --------- lib/galaxy/objectstore/rucio.py | 7 ------- lib/galaxy/objectstore/s3.py | 6 ------ 7 files changed, 12 insertions(+), 43 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index bd29cc625f77..88577a1431c1 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -276,14 +276,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): # Public Methods # ################## - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - size = self._size(obj, **kwargs) - is_empty = bool(size == 0) - return is_empty - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {str(obj)}, kwargs: {str(kwargs)}") - def _size(self, obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) if self._in_cache(rel_path): diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index c9419303fb6a..564794671e56 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -16,7 +16,10 @@ from typing_extensions import NamedTuple -from galaxy.exceptions import ObjectInvalid +from galaxy.exceptions import ( + ObjectInvalid, + ObjectNotFound, +) from galaxy.util import ( directory_hash_id, ExecutionTimer, @@ -361,6 +364,12 @@ def _create(self, obj, **kwargs): self._push_to_os(rel_path, from_string="") return self + def _empty(self, obj, **kwargs): + if self._exists(obj, **kwargs): + return self._size(obj, **kwargs) == 0 + else: + raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") + def _exists_remotely(self, rel_path: str) -> bool: ... def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: Optional[str] = None) -> None: ... 
@@ -368,3 +377,5 @@ def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: def _get_object_id(self, obj: Any) -> str: ... def _download(self, rel_path: str) -> bool: ... + + def _size(self, obj) -> int: ... diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index ac858530b469..e408ebe6566a 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -380,12 +380,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) return False - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) == 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - def _size(self, obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) if self._in_cache(rel_path): diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 7eec323c000d..403ceeb4f30c 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -461,12 +461,6 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None): finally: log.debug("irods_pt _push_to_irods: %s", ipt_timer) - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) > 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - def _size(self, obj, **kwargs) -> int: ipt_timer = ExecutionTimer() rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 3e189f71187c..247424111eb0 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -225,15 +225,6 @@ def _create(self, obj, **kwargs): self.pithos.upload_from_string(rel_path, "") return self - def _empty(self, obj, **kwargs): - """ - :returns: weather the object has content - :raises ObjectNotFound: - """ - if not self._exists(obj, **kwargs): - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - return bool(self._size(obj, **kwargs)) - def _size(self, obj, **kwargs) -> int: """ :returns: The size of the object, or 0 if it doesn't exist (sorry for diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 3cf7b247d477..827af97c60e2 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -394,13 +394,6 @@ def _create(self, obj, **kwargs): log.debug("rucio _create: %s", rel_path) return self - def _empty(self, obj, **kwargs): - log.debug("rucio _empty") - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) > 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - def _size(self, obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) log.debug("rucio _size: %s", rel_path) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 328ab5727a60..2f824068d1a5 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -440,12 +440,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): raise return False - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return bool(self._size(obj, **kwargs) == 0) - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - def _size(self, obj, **kwargs): rel_path = 
self._construct_path(obj, **kwargs) if self._in_cache(rel_path): From 6b2c0f12d7ffc0ec69a496791a3951ae20f01fca Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 16:54:33 -0400 Subject: [PATCH 10/40] Refactor names to make clear copy-paste. --- lib/galaxy/objectstore/azure_blob.py | 8 ++++---- lib/galaxy/objectstore/cloud.py | 4 ++-- lib/galaxy/objectstore/irods.py | 6 +++--- lib/galaxy/objectstore/pithos.py | 3 +++ lib/galaxy/objectstore/rucio.py | 5 ++++- lib/galaxy/objectstore/s3.py | 8 ++++---- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 88577a1431c1..15def88f925c 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -180,7 +180,7 @@ def _configure_connection(self): ) self.service = service - def _get_size_in_azure(self, rel_path): + def _get_remote_size(self, rel_path): try: properties = self._blob_client(rel_path).get_blob_properties() size_in_bytes = properties.size @@ -204,11 +204,11 @@ def _download(self, rel_path): local_destination = self._get_cache_path(rel_path) try: log.debug("Pulling '%s' into cache to %s", rel_path, local_destination) - if not self.cache_target.fits_in_cache(self._get_size_in_azure(rel_path)): + if not self.cache_target.fits_in_cache(self._get_remote_size(rel_path)): log.critical( "File %s is larger (%s bytes) than the configured cache allows (%s). Cannot download.", rel_path, - self._get_size_in_azure(rel_path), + self._get_remote_size(rel_path), self.cache_target.log_description, ) return False @@ -284,7 +284,7 @@ def _size(self, obj, **kwargs): except OSError as ex: log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) elif self._exists(obj, **kwargs): - return self._get_size_in_azure(rel_path) + return self._get_remote_size(rel_path) log.warning("Did not find dataset '%s', returning 0 for size", rel_path) return 0 diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index e408ebe6566a..a253c21412f5 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -264,7 +264,7 @@ def _get_bucket(self, bucket_name): def _get_transfer_progress(self): return self.transfer_progress - def _get_size_in_cloud(self, rel_path): + def _get_remote_size(self, rel_path): try: obj = self.bucket.objects.get(rel_path) return obj.size @@ -388,7 +388,7 @@ def _size(self, obj, **kwargs): except OSError as ex: log.info("Could not get size of file '%s' in local cache, will try cloud. Error: %s", rel_path, ex) elif self._exists(obj, **kwargs): - return self._get_size_in_cloud(rel_path) + return self._get_remote_size(rel_path) log.warning("Did not find dataset '%s', returning 0 for size", rel_path) return 0 diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 403ceeb4f30c..fb5fe33a178e 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -317,7 +317,7 @@ def to_dict(self): return as_dict # rel_path is file or folder? 
- def _get_size_in_irods(self, rel_path): + def _get_remote_size(self, rel_path): ipt_timer = ExecutionTimer() p = Path(rel_path) data_object_name = p.stem + p.suffix @@ -334,7 +334,7 @@ def _get_size_in_irods(self, rel_path): log.warning("Collection or data object (%s) does not exist", data_object_path) return -1 finally: - log.debug("irods_pt _get_size_in_irods: %s", ipt_timer) + log.debug("irods_pt _get_remote_size: %s", ipt_timer) # rel_path is file or folder? def _exists_remotely(self, rel_path): @@ -473,7 +473,7 @@ def _size(self, obj, **kwargs) -> int: log.debug("irods_pt _size: %s", ipt_timer) elif self._exists(obj, **kwargs): log.debug("irods_pt _size: %s", ipt_timer) - return self._get_size_in_irods(rel_path) + return self._get_remote_size(rel_path) log.warning("Did not find dataset '%s', returning 0 for size", rel_path) log.debug("irods_pt _size: %s", ipt_timer) return 0 diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 247424111eb0..11fbc04e8cde 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -237,6 +237,9 @@ def _size(self, obj, **kwargs) -> int: return os.path.getsize(self._get_cache_path(path)) except OSError as ex: log.warning("Could not get size of file %s in local cache, will try Pithos. Error: %s", path, ex) + return self._get_remote_size(path) + + def _get_remote_size(self, path): try: file = self.pithos.get_object_info(path) except ClientError as ce: diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 827af97c60e2..afc6a26e4550 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -406,10 +406,13 @@ def _size(self, obj, **kwargs): if size != 0: return size if self._exists(obj, **kwargs): - return self.rucio_broker.get_size(rel_path) + return self._get_remote_size(rel_path) log.warning("Did not find dataset '%s', returning 0 for size", rel_path) return 0 + def _get_remote_size(self, rel_path): + return self.rucio_broker.get_size(rel_path) + def _delete(self, obj, entire_dir=False, **kwargs): rel_path = self._construct_path(obj, **kwargs) extra_dir = kwargs.get("extra_dir", None) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 2f824068d1a5..3f0224cc7514 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -292,7 +292,7 @@ def _get_bucket(self, bucket_name): def _get_transfer_progress(self): return self.transfer_progress - def _get_size_in_s3(self, rel_path): + def _get_remote_size(self, rel_path): try: key = self._bucket.get_key(rel_path) return key.size @@ -333,8 +333,8 @@ def _in_cache(self, rel_path): # if self._exists_remotely(rel_path): # # print("***2 %s exists in S3" % rel_path) # # Make sure the size in cache is available in its entirety - # # print("File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_size_in_s3(rel_path))) - # if os.path.getsize(cache_path) == self._get_size_in_s3(rel_path): + # # print("File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_remote_size(rel_path))) + # if os.path.getsize(cache_path) == self._get_remote_size(rel_path): # # print("***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path) # exists = True # else: @@ -448,7 +448,7 @@ def _size(self, obj, **kwargs): except OSError as ex: log.info("Could not get size of file '%s' in local cache, will try S3. 
Error: %s", rel_path, ex) elif self._exists(obj, **kwargs): - return self._get_size_in_s3(rel_path) + return self._get_remote_size(rel_path) log.warning("Did not find dataset '%s', returning 0 for size", rel_path) return 0 From 3cf2ec1f5a793dde8ec8b5080188d5757ea5d14e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 20:05:32 -0400 Subject: [PATCH 11/40] Eliminate copy-paste for _size across 4 objct stores. --- lib/galaxy/objectstore/azure_blob.py | 12 ------------ lib/galaxy/objectstore/caching.py | 16 ++++++++++++++-- lib/galaxy/objectstore/cloud.py | 12 ------------ lib/galaxy/objectstore/irods.py | 17 ----------------- lib/galaxy/objectstore/pithos.py | 14 -------------- lib/galaxy/objectstore/s3.py | 12 ------------ 6 files changed, 14 insertions(+), 69 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 15def88f925c..fdbbc9dbb4f0 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -276,18 +276,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): # Public Methods # ################## - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) - elif self._exists(obj, **kwargs): - return self._get_remote_size(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - def _delete(self, obj, entire_dir=False, **kwargs): rel_path = self._construct_path(obj, **kwargs) extra_dir = kwargs.get("extra_dir", None) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 564794671e56..6d713cc400a8 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -370,6 +370,20 @@ def _empty(self, obj, **kwargs): else: raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") + def _size(self, obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + if self._in_cache(rel_path): + try: + return os.path.getsize(self._get_cache_path(rel_path)) + except OSError as ex: + log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) + elif self._exists_remotely(rel_path): + return self._get_remote_size(rel_path) + log.warning("Did not find dataset '%s', returning 0 for size", rel_path) + return 0 + + def _get_remote_size(self, rel_path: str) -> int: ... + def _exists_remotely(self, rel_path: str) -> bool: ... def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: Optional[str] = None) -> None: ... @@ -377,5 +391,3 @@ def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: def _get_object_id(self, obj: Any) -> str: ... def _download(self, rel_path: str) -> bool: ... - - def _size(self, obj) -> int: ... 
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index a253c21412f5..35a969401e6f 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -380,18 +380,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) return False - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try cloud. Error: %s", rel_path, ex) - elif self._exists(obj, **kwargs): - return self._get_remote_size(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - def _delete(self, obj, entire_dir=False, **kwargs): rel_path = self._construct_path(obj, **kwargs) extra_dir = kwargs.get("extra_dir", None) diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index fb5fe33a178e..8f240e58091a 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -461,23 +461,6 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None): finally: log.debug("irods_pt _push_to_irods: %s", ipt_timer) - def _size(self, obj, **kwargs) -> int: - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try iRODS. Error: %s", rel_path, ex) - finally: - log.debug("irods_pt _size: %s", ipt_timer) - elif self._exists(obj, **kwargs): - log.debug("irods_pt _size: %s", ipt_timer) - return self._get_remote_size(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - log.debug("irods_pt _size: %s", ipt_timer) - return 0 - def _delete(self, obj, entire_dir=False, **kwargs): ipt_timer = ExecutionTimer() rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 11fbc04e8cde..7ca1ff63fdfb 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -225,20 +225,6 @@ def _create(self, obj, **kwargs): self.pithos.upload_from_string(rel_path, "") return self - def _size(self, obj, **kwargs) -> int: - """ - :returns: The size of the object, or 0 if it doesn't exist (sorry for - that, not our fault, the ObjectStore interface is like that some - times) - """ - path = self._construct_path(obj, **kwargs) - if self._in_cache(path): - try: - return os.path.getsize(self._get_cache_path(path)) - except OSError as ex: - log.warning("Could not get size of file %s in local cache, will try Pithos. 
Error: %s", path, ex) - return self._get_remote_size(path) - def _get_remote_size(self, path): try: file = self.pithos.get_object_info(path) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 3f0224cc7514..fbf2cae7f8e2 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -440,18 +440,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): raise return False - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try S3. Error: %s", rel_path, ex) - elif self._exists(obj, **kwargs): - return self._get_remote_size(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - def _delete(self, obj, entire_dir=False, **kwargs): rel_path = self._construct_path(obj, **kwargs) extra_dir = kwargs.get("extra_dir", None) From 9996d3f33adb49f6c195565d71cdc47d3662c79b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 17:17:31 -0400 Subject: [PATCH 12/40] Remove duplication around _get_data in object stores. --- lib/galaxy/objectstore/azure_blob.py | 12 ------------ lib/galaxy/objectstore/caching.py | 12 ++++++++++++ lib/galaxy/objectstore/cloud.py | 12 ------------ lib/galaxy/objectstore/irods.py | 14 -------------- lib/galaxy/objectstore/pithos.py | 16 ---------------- lib/galaxy/objectstore/rucio.py | 14 -------------- lib/galaxy/objectstore/s3.py | 12 ------------ 7 files changed, 12 insertions(+), 80 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index fdbbc9dbb4f0..28f5c81d9736 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -313,18 +313,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - def _get_filename(self, obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) base_dir = kwargs.get("base_dir", None) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 6d713cc400a8..82486d0a27d0 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -303,6 +303,18 @@ def _pull_into_cache(self, rel_path) -> bool: log.debug("_pull_into_cache: %s\n\n\n\n\n\n", ipt_timer) return file_ok + def _get_data(self, obj, start=0, count=-1, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + # Check cache first and get file if not there + if not self._in_cache(rel_path): + self._pull_into_cache(rel_path) + # Read the file content from cache + data_file = open(self._get_cache_path(rel_path)) + data_file.seek(start) + content = data_file.read(count) + data_file.close() + return content + def _exists(self, obj, **kwargs): in_cache = exists_remotely = False rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 35a969401e6f..c24874bb0e54 100644 --- 
a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -418,18 +418,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - def _get_filename(self, obj, **kwargs): base_dir = kwargs.get("base_dir", None) dir_only = kwargs.get("dir_only", False) diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 8f240e58091a..c9e40ad7efba 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -529,20 +529,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.debug("irods_pt _delete: %s", ipt_timer) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - ipt_timer = ExecutionTimer() - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - log.debug("irods_pt _get_data: %s", ipt_timer) - return content - def _get_filename(self, obj, **kwargs): ipt_timer = ExecutionTimer() base_dir = kwargs.get("base_dir", None) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 7ca1ff63fdfb..c62e400e79ce 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -264,22 +264,6 @@ def _delete(self, obj, **kwargs): log.exception(f"Could not delete {path} from Pithos, {ce}") return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - """Fetch (e.g., download) data - :param start: Chunk of data starts here - :param count: Fetch at most as many data, fetch all if negative - """ - path = self._construct_path(obj, **kwargs) - if self._in_cache(path): - cache_path = self._pull_into_cache(path) - else: - cache_path = self._get_cache_path(path) - data_file = open(cache_path) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - def _get_filename(self, obj, **kwargs): """Get the expected filename with absolute path""" base_dir = kwargs.get("base_dir", None) diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index afc6a26e4550..d371e89d3746 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -442,20 +442,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - log.debug("rucio _get_data: %s", rel_path) - auth_token = self._get_token(**kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path) or os.path.getsize(self._get_cache_path(rel_path)) == 0: - self._pull_into_cache(rel_path, auth_token) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - def 
_get_token(self, **kwargs): auth_token = kwargs.get("auth_token", None) if auth_token: diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index fbf2cae7f8e2..a4b6e9569f56 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -478,18 +478,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path) or os.path.getsize(self._get_cache_path(rel_path)) == 0: - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - def _get_filename(self, obj, **kwargs): base_dir = kwargs.get("base_dir", None) dir_only = kwargs.get("dir_only", False) From 8038932df08b8adac8be0c51d371cbfc7bc9a6ce Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 17:20:59 -0400 Subject: [PATCH 13/40] Remove _in_cache from s3 (rebase above somewhere?) --- lib/galaxy/objectstore/s3.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index a4b6e9569f56..139056ef16a5 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -319,35 +319,6 @@ def _exists_remotely(self, rel_path): return False return exists - def _in_cache(self, rel_path): - """Check if the given dataset is in the local cache and return True if so.""" - # log.debug("------ Checking cache for rel_path %s" % rel_path) - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - # TODO: Part of checking if a file is in cache should be to ensure the - # size of the cached file matches that on S3. Once the upload tool explicitly - # creates, this check sould be implemented- in the mean time, it's not - # looking likely to be implementable reliably. - # if os.path.exists(cache_path): - # # print("***1 %s exists" % cache_path) - # if self._exists_remotely(rel_path): - # # print("***2 %s exists in S3" % rel_path) - # # Make sure the size in cache is available in its entirety - # # print("File '%s' cache size: %s, S3 size: %s" % (cache_path, os.path.getsize(cache_path), self._get_remote_size(rel_path))) - # if os.path.getsize(cache_path) == self._get_remote_size(rel_path): - # # print("***2.1 %s exists in S3 and the size is the same as in cache (in_cache=True)" % rel_path) - # exists = True - # else: - # # print("***2.2 %s exists but differs in size from cache (in_cache=False)" % cache_path) - # exists = False - # else: - # # Although not perfect decision making, this most likely means - # # that the file is currently being uploaded - # # print("***3 %s found in cache but not in S3 (in_cache=True)" % cache_path) - # exists = True - # else: - # return False - def _transfer_cb(self, complete, total): self.transfer_progress += 10 From be5983d297fd31bdba3600555cd5cea463bd237b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 17:33:32 -0400 Subject: [PATCH 14/40] Remove duplication around _get_filename. 
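For context, outside the diff: the consolidated _get_filename below hands directory retrieval to a new _download_directory_into_cache hook whose default is a no-op, since only the S3 backend had a working implementation (its download_directory helper). A rough sketch of that default-plus-override shape, using invented class names rather than the real ones:

import os
import tempfile


class CachingBaseSketch:
    def _download_directory_into_cache(self, rel_path, cache_path):
        # Default: do nothing; most backends never materialized directories before.
        pass


class S3LikeBackendSketch(CachingBaseSketch):
    def _download_directory_into_cache(self, rel_path, cache_path):
        # A real backend would walk the remote prefix; fake a single object here.
        os.makedirs(cache_path, exist_ok=True)
        with open(os.path.join(cache_path, "part-0.dat"), "w") as f:
            f.write(f"contents for {rel_path}\n")


S3LikeBackendSketch()._download_directory_into_cache("0/0/1", os.path.join(tempfile.mkdtemp(), "0/0/1"))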
--- lib/galaxy/objectstore/azure_blob.py | 30 ------------------- lib/galaxy/objectstore/caching.py | 37 +++++++++++++++++++++++ lib/galaxy/objectstore/irods.py | 45 ---------------------------- lib/galaxy/objectstore/pithos.py | 23 -------------- lib/galaxy/objectstore/s3.py | 43 ++------------------------ 5 files changed, 40 insertions(+), 138 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 28f5c81d9736..2c03dbed04b4 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -313,36 +313,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_filename(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - sync_cache = kwargs.get("sync_cache", True) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: # Directories do not get pulled into cache - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. - # if dir_only: - # return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {str(obj)}, kwargs: {str(kwargs)}") - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if create is True: self._create(obj, **kwargs) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 82486d0a27d0..4610b870359c 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -394,6 +394,43 @@ def _size(self, obj, **kwargs): log.warning("Did not find dataset '%s', returning 0 for size", rel_path) return 0 + def _get_filename(self, obj, **kwargs): + base_dir = kwargs.get("base_dir", None) + dir_only = kwargs.get("dir_only", False) + obj_dir = kwargs.get("obj_dir", False) + sync_cache = kwargs.get("sync_cache", True) + + rel_path = self._construct_path(obj, **kwargs) + + # for JOB_WORK directory + if base_dir and dir_only and obj_dir: + return os.path.abspath(rel_path) + + cache_path = self._get_cache_path(rel_path) + if not sync_cache: + return cache_path + + # Check if the file exists in the cache first, always pull if file size in cache is zero + if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): + return cache_path + + # Check if the file exists in persistent storage and, if it does, pull it into cache + elif self._exists(obj, **kwargs): + if dir_only: + self._download_directory_into_cache(rel_path, cache_path) + return cache_path + else: + if self._pull_into_cache(rel_path): + return cache_path + raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") + + def _download_directory_into_cache(self, rel_path, cache_path): + # azure, pithos, irod, and cloud did not do this prior to 
refactoring so I am assuming + # there is just operations that fail with these object stores, + # I'm placing a no-op here to match their behavior but we should + # maybe implement this for those object stores. + pass + def _get_remote_size(self, rel_path: str) -> int: ... def _exists_remotely(self, rel_path: str) -> bool: ... diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index c9e40ad7efba..831a5cce9dbf 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -529,51 +529,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.debug("irods_pt _delete: %s", ipt_timer) return False - def _get_filename(self, obj, **kwargs): - ipt_timer = ExecutionTimer() - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - rel_path = self._construct_path(obj, **kwargs) - sync_cache = kwargs.get("sync_cache", True) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - log.debug("irods_pt _get_filename: %s", ipt_timer) - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # iRODS does not recognize directories as files so cannot check if those exist. - # So, if checking dir only, ensure given dir exists in cache and return - # the expected cache path. - # dir_only = kwargs.get('dir_only', False) - # if dir_only: - # if not os.path.exists(cache_path): - # os.makedirs(cache_path) - # return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - log.debug("irods_pt _get_filename: %s", ipt_timer) - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: # Directories do not get pulled into cache - log.debug("irods_pt _get_filename: %s", ipt_timer) - return cache_path - else: - if self._pull_into_cache(rel_path): - log.debug("irods_pt _get_filename: %s", ipt_timer) - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. 
- # if dir_only: - # return cache_path - log.debug("irods_pt _get_filename: %s", ipt_timer) - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): ipt_timer = ExecutionTimer() if create: diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index c62e400e79ce..91646a381ea1 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -264,29 +264,6 @@ def _delete(self, obj, **kwargs): log.exception(f"Could not delete {path} from Pithos, {ce}") return False - def _get_filename(self, obj, **kwargs): - """Get the expected filename with absolute path""" - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - path = self._construct_path(obj, **kwargs) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(path) - cache_path = self._get_cache_path(path) - if dir_only: - if not os.path.exists(cache_path): - os.makedirs(cache_path, exist_ok=True) - return cache_path - if self._in_cache(path): - return cache_path - elif self._exists(obj, **kwargs): - if not dir_only: - self._pull_into_cache(path) - return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - def _update_from_file(self, obj, **kwargs): """Update the store when a file is updated""" if kwargs.get("create"): diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 139056ef16a5..81eb54d7b078 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -448,48 +448,11 @@ def _delete(self, obj, entire_dir=False, **kwargs): except OSError: log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - - def _get_filename(self, obj, **kwargs): - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - sync_cache = kwargs.get("sync_cache", True) - - rel_path = self._construct_path(obj, **kwargs) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # S3 does not recognize directories as files so cannot check if those exist. - # So, if checking dir only, ensure given dir exists in cache and return - # the expected cache path. - # dir_only = kwargs.get('dir_only', False) - # if dir_only: - # if not os.path.exists(cache_path): - # os.makedirs(cache_path) - # return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: - download_directory(self._bucket, rel_path, cache_path) - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. 
- # if dir_only: - # return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path + def _download_directory_into_cache(self, rel_path, cache_path): + download_directory(self._bucket, rel_path, cache_path) + def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if create: self._create(obj, **kwargs) From 8aabf8b471044b2b7b1dca9c4833f9fc169286c4 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 14:28:38 -0400 Subject: [PATCH 15/40] Remove get_file_name from cloud. --- lib/galaxy/objectstore/cloud.py | 39 --------------------------------- 1 file changed, 39 deletions(-) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index c24874bb0e54..a8d4c30ed52c 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -418,45 +418,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _get_filename(self, obj, **kwargs): - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - rel_path = self._construct_path(obj, **kwargs) - sync_cache = kwargs.get("sync_cache", True) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - # S3 does not recognize directories as files so cannot check if those exist. - # So, if checking dir only, ensure given dir exists in cache and return - # the expected cache path. - # dir_only = kwargs.get('dir_only', False) - # if dir_only: - # if not os.path.exists(cache_path): - # os.makedirs(cache_path) - # return cache_path - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: # Directories do not get pulled into cache - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - # For the case of retrieving a directory only, return the expected path - # even if it does not exist. - # if dir_only: - # return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if create: self._create(obj, **kwargs) From 7c8978b0ad3d142c2b334b8a04082da41a069c06 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 14:29:34 -0400 Subject: [PATCH 16/40] Remove duplication around _update_from_file. 
--- lib/galaxy/objectstore/azure_blob.py | 28 ------------------------- lib/galaxy/objectstore/caching.py | 29 ++++++++++++++++++++++++++ lib/galaxy/objectstore/cloud.py | 24 --------------------- lib/galaxy/objectstore/irods.py | 31 ++-------------------------- lib/galaxy/objectstore/pithos.py | 22 -------------------- lib/galaxy/objectstore/s3.py | 24 --------------------- 6 files changed, 31 insertions(+), 127 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 2c03dbed04b4..3e3e4b3e0ddb 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -313,34 +313,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create is True: - self._create(obj, **kwargs) - - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? - shutil.copy2(source_file, cache_file) - fix_permissions(self.config, cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - - self._push_to_os(rel_path, source_file) - - else: - raise ObjectNotFound( - f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" - ) - def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 4610b870359c..518a4f3dcfdc 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -3,6 +3,7 @@ import logging import os +import shutil import threading import time from math import inf @@ -431,6 +432,34 @@ def _download_directory_into_cache(self, rel_path, cache_path): # maybe implement this for those object stores. pass + def _update_from_file(self, obj, file_name=None, create=False, **kwargs): + if create: + self._create(obj, **kwargs) + + if self._exists(obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + # Chose whether to use the dataset file itself or an alternate file + if file_name: + source_file = os.path.abspath(file_name) + # Copy into cache + cache_file = self._get_cache_path(rel_path) + try: + if source_file != cache_file and self.cache_updated_data: + # FIXME? Should this be a `move`? + shutil.copy2(source_file, cache_file) + fix_permissions(self.config, cache_file) + except OSError: + log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) + else: + source_file = self._get_cache_path(rel_path) + + self._push_to_os(rel_path, source_file) + + else: + raise ObjectNotFound( + f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" + ) + def _get_remote_size(self, rel_path: str) -> int: ... def _exists_remotely(self, rel_path: str) -> bool: ... 
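Outside the diff, a condensed view of the flow the consolidated _update_from_file above now gives every backend for free: stage the incoming file into the cache (re-applying permissions), then hand the cached copy to _push_to_os. Toy names below, not the Galaxy classes:

import os
import shutil
import tempfile


def update_from_file_sketch(rel_path, cache_root, source_file, push_to_os):
    """Condensed shape of the shared flow: copy into the cache, then push from it."""
    cache_file = os.path.join(cache_root, rel_path)
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    if os.path.abspath(source_file) != cache_file:
        shutil.copy2(source_file, cache_file)  # the real code also fixes umask/gid here
    push_to_os(rel_path, cache_file)


pushed = []
cache_root = tempfile.mkdtemp()
incoming = os.path.join(cache_root, "incoming.dat")
with open(incoming, "w") as f:
    f.write("hello\n")
update_from_file_sketch("0/0/1/dataset_1.dat", cache_root, incoming, lambda rp, sf: pushed.append((rp, sf)))
assert pushed == [("0/0/1/dataset_1.dat", os.path.join(cache_root, "0/0/1/dataset_1.dat"))]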
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index a8d4c30ed52c..6aaa89e6e5ff 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -418,30 +418,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create: - self._create(obj, **kwargs) - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? - shutil.copy2(source_file, cache_file) - fix_permissions(self.config, cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - # Update the file on cloud - self._push_to_os(rel_path, source_file) - else: - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") - def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 831a5cce9dbf..ab426889f17c 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -382,7 +382,7 @@ def _download(self, rel_path): finally: log.debug("irods_pt _download: %s", ipt_timer) - def _push_to_irods(self, rel_path, source_file=None, from_string=None): + def _push_to_os(self, rel_path, source_file=None, from_string=None): """ Push the file pointed to by ``rel_path`` to the iRODS. Extract folder name from rel_path as iRODS collection name, and extract file name from rel_path @@ -459,7 +459,7 @@ def _push_to_irods(self, rel_path, source_file=None, from_string=None): ) return True finally: - log.debug("irods_pt _push_to_irods: %s", ipt_timer) + log.debug("irods_pt _push_to_os: %s", ipt_timer) def _delete(self, obj, entire_dir=False, **kwargs): ipt_timer = ExecutionTimer() @@ -529,33 +529,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): log.debug("irods_pt _delete: %s", ipt_timer) return False - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - ipt_timer = ExecutionTimer() - if create: - self._create(obj, **kwargs) - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Choose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? 
- shutil.copy2(source_file, cache_file) - fix_permissions(self.config, cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - # Update the file on iRODS - self._push_to_irods(rel_path, source_file) - else: - log.debug("irods_pt _update_from_file: %s", ipt_timer) - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") - log.debug("irods_pt _update_from_file: %s", ipt_timer) - # Unlike S3, url is not really applicable to iRODS def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 91646a381ea1..75eb8e3b05f1 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -264,28 +264,6 @@ def _delete(self, obj, **kwargs): log.exception(f"Could not delete {path} from Pithos, {ce}") return False - def _update_from_file(self, obj, **kwargs): - """Update the store when a file is updated""" - if kwargs.get("create"): - self._create(obj, **kwargs) - if not self._exists(obj, **kwargs): - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") - - path = self._construct_path(obj, **kwargs) - cache_path = self._get_cache_path(path) - file_name = kwargs.get("file_name") - if file_name: - source_path = os.path.abspath(file_name) - try: - if source_path != cache_path: - shutil.copy2(source_path, cache_path) - fix_permissions(self.config, cache_path) - except OSError: - log.exception('Trouble copying source file "%s" to cache "%s"', source_path, cache_path) - else: - with open(cache_path) as f: - self.pithos.upload_object(obj, f) - def _get_object_url(self, obj, **kwargs): """ :returns: URL for direct access, None if no object diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 81eb54d7b078..0ee1d26c0189 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -453,30 +453,6 @@ def _delete(self, obj, entire_dir=False, **kwargs): def _download_directory_into_cache(self, rel_path, cache_path): download_directory(self._bucket, rel_path, cache_path) - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create: - self._create(obj, **kwargs) - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? - shutil.copy2(source_file, cache_file) - fix_permissions(self.config, cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - # Update the file on S3 - self._push_to_os(rel_path, source_file) - else: - raise ObjectNotFound(f"objectstore.update_from_file, object does not exist: {obj}, kwargs: {kwargs}") - def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) From 465ca6f7a338bd089677f7ba2ec55237697fc4b6 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 19:47:32 -0400 Subject: [PATCH 17/40] Refactor object stores so we can remove _delete... 
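The idea of this step is to split each backend's remote deletion into two narrow hooks so that the surrounding _delete branching can later move into shared code. Distilled, the contract the diffs below introduce looks roughly like this; only the two method names and signatures come from the diffs, the ABC wrapper is illustrative:

from abc import ABC, abstractmethod


class RemoteDeleteHooks(ABC):
    @abstractmethod
    def _delete_remote_all(self, rel_path: str) -> bool:
        """Delete every remote object whose key starts with rel_path (the extra_files case)."""

    @abstractmethod
    def _delete_existing_remote(self, rel_path: str) -> bool:
        """Delete the single remote object stored at rel_path."""

With prefix deletion and single-object deletion isolated like this, the SDK calls and their try/except handling stay in each driver while the common branching becomes shareable.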
--- lib/galaxy/objectstore/azure_blob.py | 30 ++++++++++++++++-------- lib/galaxy/objectstore/cloud.py | 34 +++++++++++++++++++--------- lib/galaxy/objectstore/pithos.py | 25 +++++++++++++++----- lib/galaxy/objectstore/s3.py | 34 +++++++++++++++++++--------- 4 files changed, 86 insertions(+), 37 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 3e3e4b3e0ddb..36918dbff71e 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -294,25 +294,37 @@ def _delete(self, obj, entire_dir=False, **kwargs): # but requires iterating through each individual blob in Azure and deleing it. if entire_dir and extra_dir: shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) - for blob in blobs: - log.debug("Deleting from Azure: %s", blob) - self._blob_client(blob.name).delete_blob() - return True + return self._delete_remote_all(rel_path) else: # Delete from cache first unlink(self._get_cache_path(rel_path), ignore_errors=True) # Delete from S3 as well if self._exists_remotely(rel_path): log.debug("Deleting from Azure: %s", rel_path) - self._blob_client(rel_path).delete_blob() - return True - except AzureHttpError: - log.exception("Could not delete blob '%s' from Azure", rel_path) + return self._delete_existing_remote(rel_path) except OSError: log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False + def _delete_remote_all(self, rel_path: str) -> bool: + try: + blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) + for blob in blobs: + log.debug("Deleting from Azure: %s", blob) + self._blob_client(blob.name).delete_blob() + return True + except AzureHttpError: + log.exception("Could not delete blob '%s' from Azure", rel_path) + return False + + def _delete_existing_remote(self, rel_path: str) -> bool: + try: + self._blob_client(rel_path).delete_blob() + return True + except AzureHttpError: + log.exception("Could not delete blob '%s' from Azure", rel_path) + return False + def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 6aaa89e6e5ff..1c6e6aacd68c 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -398,26 +398,38 @@ def _delete(self, obj, entire_dir=False, **kwargs): # but requires iterating through each individual key in S3 and deleing it. 
if entire_dir and extra_dir: shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - results = self.bucket.objects.list(prefix=rel_path) - for key in results: - log.debug("Deleting key %s", key.name) - key.delete() - return True + return self._delete_remote_all(rel_path) else: # Delete from cache first unlink(self._get_cache_path(rel_path), ignore_errors=True) # Delete from S3 as well if self._exists_remotely(rel_path): - key = self.bucket.objects.get(rel_path) - log.debug("Deleting key %s", key.name) - key.delete() - return True - except Exception: - log.exception("Could not delete key '%s' from cloud", rel_path) + return self._delete_existing_remote(rel_path) except OSError: log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False + def _delete_remote_all(self, rel_path: str) -> bool: + try: + results = self.bucket.objects.list(prefix=rel_path) + for key in results: + log.debug("Deleting key %s", key.name) + key.delete() + return True + except Exception: + log.exception("Could not delete key '%s' from cloud", rel_path) + return False + + def _delete_existing_remote(self, rel_path: str) -> bool: + try: + key = self.bucket.objects.get(rel_path) + log.debug("Deleting key %s", key.name) + key.delete() + return True + except Exception: + log.exception("Could not delete key '%s' from cloud", rel_path) + return False + def _get_object_url(self, obj, **kwargs): if self._exists(obj, **kwargs): rel_path = self._construct_path(obj, **kwargs) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 75eb8e3b05f1..e1c40d5dc360 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -252,18 +252,31 @@ def _delete(self, obj, **kwargs): extra_dir = kwargs.get("extra_dir", False) if entire_dir and extra_dir: shutil.rmtree(cache_path) - log.debug(f"On Pithos: delete -r {path}/") - self.pithos.del_object(path, delimiter="/") - return True + return self._delete_remote_all(path) else: os.unlink(cache_path) - self.pithos.del_object(path) + return self._delete_existing_remote(path) except OSError: log.exception(f"{self._get_filename(obj, **kwargs)} delete error") - except ClientError as ce: - log.exception(f"Could not delete {path} from Pithos, {ce}") return False + def _delete_remote_all(self, path: str) -> bool: + try: + log.debug(f"On Pithos: delete -r {path}/") + self.pithos.del_object(path, delimiter="/") + return True + except ClientError: + log.exception(f"Could not delete path '{path}' from Pithos") + return False + + def _delete_existing_remote(self, path: str) -> bool: + try: + self.pithos.del_object(path) + return True + except ClientError: + log.exception(f"Could not delete path '{path}' from Pithos") + return False + def _get_object_url(self, obj, **kwargs): """ :returns: URL for direct access, None if no object diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 0ee1d26c0189..133cf306c451 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -429,27 +429,39 @@ def _delete(self, obj, entire_dir=False, **kwargs): # but requires iterating through each individual key in S3 and deleing it. 
if entire_dir and extra_dir: shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - results = self._bucket.get_all_keys(prefix=rel_path) - for key in results: - log.debug("Deleting key %s", key.name) - key.delete() - return True + return self._delete_remote_all(rel_path) else: # Delete from cache first unlink(self._get_cache_path(rel_path), ignore_errors=True) # Delete from S3 as well if self._exists_remotely(rel_path): - key = Key(self._bucket, rel_path) - log.debug("Deleting key %s", key.name) - key.delete() - return True - except S3ResponseError: - log.exception("Could not delete key '%s' from S3", rel_path) + self._delete_existing_remote(rel_path) except OSError: log.exception("%s delete error", self._get_filename(obj, **kwargs)) return False # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path + def _delete_remote_all(self, rel_path: str) -> bool: + try: + results = self._bucket.get_all_keys(prefix=rel_path) + for key in results: + log.debug("Deleting key %s", key.name) + key.delete() + return True + except S3ResponseError: + log.exception("Could not delete blob '%s' from S3", rel_path) + return False + + def _delete_existing_remote(self, rel_path: str) -> bool: + try: + key = Key(self._bucket, rel_path) + log.debug("Deleting key %s", key.name) + key.delete() + return True + except S3ResponseError: + log.exception("Could not delete blob '%s' from S3", rel_path) + return False + def _download_directory_into_cache(self, rel_path, cache_path): download_directory(self._bucket, rel_path, cache_path) From 2fd544eee1c54b368d1a84651608db412eb04d17 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 19:54:03 -0400 Subject: [PATCH 18/40] Remove duplication around _delete in object stores. --- lib/galaxy/objectstore/azure_blob.py | 30 ---------------------------- lib/galaxy/objectstore/caching.py | 30 ++++++++++++++++++++++++++++ lib/galaxy/objectstore/cloud.py | 29 --------------------------- lib/galaxy/objectstore/pithos.py | 26 ------------------------ lib/galaxy/objectstore/s3.py | 30 ---------------------------- 5 files changed, 30 insertions(+), 115 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 36918dbff71e..a9ff20dfe52a 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -276,36 +276,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): # Public Methods # ################## - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - try: - if base_dir and dir_only and obj_dir: - # Remove temporary data in JOB_WORK directory - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/blobs we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual blob in Azure and deleing it. 
- if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - return self._delete_remote_all(rel_path) - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._exists_remotely(rel_path): - log.debug("Deleting from Azure: %s", rel_path) - return self._delete_existing_remote(rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False - def _delete_remote_all(self, rel_path: str) -> bool: try: blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 518a4f3dcfdc..19f6f6189822 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -26,6 +26,7 @@ ExecutionTimer, nice_size, string_as_bool, + unlink, ) from galaxy.util.path import safe_relpath from galaxy.util.sleeper import Sleeper @@ -432,6 +433,35 @@ def _download_directory_into_cache(self, rel_path, cache_path): # maybe implement this for those object stores. pass + def _delete(self, obj, entire_dir=False, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + extra_dir = kwargs.get("extra_dir", None) + base_dir = kwargs.get("base_dir", None) + dir_only = kwargs.get("dir_only", False) + obj_dir = kwargs.get("obj_dir", False) + try: + # Remove temporary data in JOB_WORK directory + if base_dir and dir_only and obj_dir: + shutil.rmtree(os.path.abspath(rel_path)) + return True + + # For the case of extra_files, because we don't have a reference to + # individual files/keys we need to remove the entire directory structure + # with all the files in it. This is easy for the local file system, + # but requires iterating through each individual key in S3 and deleing it. + if entire_dir and extra_dir: + shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) + return self._delete_remote_all(rel_path) + else: + # Delete from cache first + unlink(self._get_cache_path(rel_path), ignore_errors=True) + # Delete from S3 as well + if self._exists_remotely(rel_path): + return self._delete_existing_remote(rel_path) + except OSError: + log.exception("%s delete error", self._get_filename(obj, **kwargs)) + return False + def _update_from_file(self, obj, file_name=None, create=False, **kwargs): if create: self._create(obj, **kwargs) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 1c6e6aacd68c..ac596131d655 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -380,35 +380,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) return False - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - try: - # Remove temparory data in JOB_WORK directory - if base_dir and dir_only and obj_dir: - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/keys we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual key in S3 and deleing it. 
- if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - return self._delete_remote_all(rel_path) - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._exists_remotely(rel_path): - return self._delete_existing_remote(rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False - def _delete_remote_all(self, rel_path: str) -> bool: try: results = self.bucket.objects.list(prefix=rel_path) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index e1c40d5dc360..d04f1b354a10 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -234,32 +234,6 @@ def _get_remote_size(self, path): return 0 return int(file["content-length"]) - def _delete(self, obj, **kwargs): - """Delete the object - :returns: weather the object was deleted - """ - path = self._construct_path(obj, **kwargs) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - try: - if all((base_dir, dir_only, obj_dir)): - shutil.rmtree(os.path.abspath(path)) - return True - cache_path = self._get_cache_path(path) - - entire_dir = kwargs.get("entire_dir", False) - extra_dir = kwargs.get("extra_dir", False) - if entire_dir and extra_dir: - shutil.rmtree(cache_path) - return self._delete_remote_all(path) - else: - os.unlink(cache_path) - return self._delete_existing_remote(path) - except OSError: - log.exception(f"{self._get_filename(obj, **kwargs)} delete error") - return False - def _delete_remote_all(self, path: str) -> bool: try: log.debug(f"On Pithos: delete -r {path}/") diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 133cf306c451..3763ff1e0bb9 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -411,36 +411,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): raise return False - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - try: - # Remove temparory data in JOB_WORK directory - if base_dir and dir_only and obj_dir: - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/keys we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual key in S3 and deleing it. 
- if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - return self._delete_remote_all(rel_path) - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._exists_remotely(rel_path): - self._delete_existing_remote(rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False - # return cache_path # Until the upload tool does not explicitly create the dataset, return expected path - def _delete_remote_all(self, rel_path: str) -> bool: try: results = self._bucket.get_all_keys(prefix=rel_path) From d50ed66b6d1d27ad3c945fcceaef083ac3155c18 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 20:13:46 -0400 Subject: [PATCH 19/40] Remove a bunch of extra imports now. --- lib/galaxy/objectstore/azure_blob.py | 11 ----------- lib/galaxy/objectstore/cloud.py | 11 ----------- lib/galaxy/objectstore/irods.py | 8 -------- lib/galaxy/objectstore/pithos.py | 12 +----------- lib/galaxy/objectstore/rucio.py | 1 - lib/galaxy/objectstore/s3.py | 9 --------- 6 files changed, 1 insertion(+), 51 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index a9ff20dfe52a..7e796e22f91d 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -4,7 +4,6 @@ import logging import os -import shutil from datetime import ( datetime, timedelta, @@ -21,17 +20,7 @@ except ImportError: BlobServiceClient = None -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - unlink, -) -from galaxy.util.path import safe_relpath from . import ConcreteObjectStore -from ._util import fix_permissions from .caching import ( CacheTarget, enable_cache_monitor, diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index ac596131d655..3565b747d3ef 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -6,22 +6,11 @@ import multiprocessing import os import os.path -import shutil import subprocess from datetime import datetime from typing import Optional -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - safe_relpath, - unlink, -) from . import ConcreteObjectStore -from ._util import fix_permissions from .caching import ( CacheTarget, enable_cache_monitor, diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index ab426889f17c..10ab5f9c3a8f 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -20,20 +20,12 @@ except ImportError: irods = None -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) from galaxy.util import ( - directory_hash_id, ExecutionTimer, string_as_bool, - umask_fix_perms, unlink, ) -from galaxy.util.path import safe_relpath from . 
import DiskObjectStore -from ._util import fix_permissions from .caching import UsesCache IRODS_IMPORT_MESSAGE = "The Python irods package is required to use this feature, please install it" diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index d04f1b354a10..2ba74636ac2b 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -4,7 +4,6 @@ import logging import os -import shutil try: from kamaki.clients import ( @@ -17,17 +16,8 @@ except ImportError: KamakiClient = None -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) -from galaxy.util import ( - directory_hash_id, - umask_fix_perms, -) -from galaxy.util.path import safe_relpath +from galaxy.util import directory_hash_id from . import ConcreteObjectStore -from ._util import fix_permissions from .caching import UsesCache NO_KAMAKI_ERROR_MESSAGE = ( diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index d371e89d3746..874aa3564f4a 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -32,7 +32,6 @@ umask_fix_perms, unlink, ) -from galaxy.util.path import safe_relpath from . import ConcreteObjectStore from .caching import ( CacheTarget, diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 3763ff1e0bb9..e82d1b800061 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -5,7 +5,6 @@ import logging import multiprocessing import os -import shutil import subprocess import time from datetime import datetime @@ -20,19 +19,11 @@ except ImportError: boto = None # type: ignore[assignment] -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) from galaxy.util import ( - directory_hash_id, string_as_bool, - unlink, which, ) -from galaxy.util.path import safe_relpath from . import ConcreteObjectStore -from ._util import fix_permissions from .caching import ( CacheTarget, enable_cache_monitor, From c8a08342d5c834d586d34bfec76b6c02d678f559 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 20:39:26 -0400 Subject: [PATCH 20/40] Remove some duplication around cache targets. --- lib/galaxy/objectstore/azure_blob.py | 11 +---------- lib/galaxy/objectstore/caching.py | 11 +++++++++++ lib/galaxy/objectstore/cloud.py | 11 +---------- lib/galaxy/objectstore/rucio.py | 11 +---------- lib/galaxy/objectstore/s3.py | 11 +---------- 5 files changed, 15 insertions(+), 40 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 7e796e22f91d..e74a8758dbaa 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -22,7 +22,6 @@ from . 
import ConcreteObjectStore from .caching import ( - CacheTarget, enable_cache_monitor, InProcessCacheMonitor, parse_caching_config_dict_from_xml, @@ -308,13 +307,5 @@ def _get_store_usage_percent(self, obj): # https://learn.microsoft.com/en-us/azure/storage/blobs/scalability-targets return 0.0 - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 19f6f6189822..1bd002b770f8 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -490,6 +490,17 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" ) + @property + def cache_target(self) -> CacheTarget: + return CacheTarget( + self.staging_path, + self.cache_size, + 0.9, + ) + + def _shutdown_cache_monitor(self) -> None: + self.cache_monitor and self.cache_monitor.shutdown() + def _get_remote_size(self, rel_path: str) -> int: ... def _exists_remotely(self, rel_path: str) -> bool: ... diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 3565b747d3ef..44d18227fb97 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -12,7 +12,6 @@ from . import ConcreteObjectStore from .caching import ( - CacheTarget, enable_cache_monitor, InProcessCacheMonitor, UsesCache, @@ -225,14 +224,6 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def _get_bucket(self, bucket_name): try: bucket = self.conn.storage.buckets.get(bucket_name) @@ -404,4 +395,4 @@ def _get_store_usage_percent(self, obj): return 0.0 def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 874aa3564f4a..d1253949d038 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -34,7 +34,6 @@ ) from . import ConcreteObjectStore from .caching import ( - CacheTarget, enable_cache_monitor, InProcessCacheMonitor, parse_caching_config_dict_from_xml, @@ -561,13 +560,5 @@ def __build_kwargs(self, obj, **kwargs): kwargs["object_id"] = obj.id return kwargs - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index e82d1b800061..d46daafb691e 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -25,7 +25,6 @@ ) from . import ConcreteObjectStore from .caching import ( - CacheTarget, enable_cache_monitor, InProcessCacheMonitor, parse_caching_config_dict_from_xml, @@ -253,14 +252,6 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - def _get_bucket(self, bucket_name): """Sometimes a handle to a bucket is not established right away so try it a few times. 
Raise error is connection is not established.""" @@ -440,7 +431,7 @@ def _get_store_usage_percent(self, obj): return 0.0 def shutdown(self): - self.cache_monitor and self.cache_monitor.shutdown() + self._shutdown_cache_monitor() class GenericS3ObjectStore(S3ObjectStore): From 7229ae346c228b91c50068cc6dd0b872a682f2af Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 20:41:52 -0400 Subject: [PATCH 21/40] Remove unused copy-pasta from cloud object store. --- lib/galaxy/objectstore/cloud.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 44d18227fb97..f79de39b6653 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -65,7 +65,6 @@ class Cloud(ConcreteObjectStore, CloudConfigMixin, UsesCache): def __init__(self, config, config_dict): super().__init__(config, config_dict) - self.transfer_progress = 0 bucket_dict = config_dict["bucket"] cache_dict = config_dict.get("cache") or {} @@ -241,9 +240,6 @@ def _get_bucket(self, bucket_name): log.exception(f"Could not get bucket '{bucket_name}'") raise Exception - def _get_transfer_progress(self): - return self.transfer_progress - def _get_remote_size(self, rel_path): try: obj = self.bucket.objects.get(rel_path) @@ -270,9 +266,6 @@ def _exists_remotely(self, rel_path): return False return exists - def _transfer_cb(self, complete, total): - self.transfer_progress += 10 - def _download(self, rel_path): try: log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) @@ -295,7 +288,6 @@ def _download(self, rel_path): return True else: log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) - self.transfer_progress = 0 # Reset transfer progress counter with open(self._get_cache_path(rel_path), "wb+") as downloaded_file_handle: key.save_content(downloaded_file_handle) return True @@ -334,7 +326,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): os.path.getsize(source_file), rel_path, ) - self.transfer_progress = 0 # Reset transfer progress counter if not self.bucket.objects.get(rel_path): created_obj = self.bucket.objects.create(rel_path) created_obj.upload_from_file(source_file) From 9341bfb2cc8eee570b6088c87748bc50163096d0 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 20:42:37 -0400 Subject: [PATCH 22/40] Remove wrong comments. --- lib/galaxy/objectstore/azure_blob.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index e74a8758dbaa..0dcedeaf8f31 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -144,10 +144,6 @@ def to_dict(self): ) return as_dict - ################### - # Private Methods # - ################### - # config_xml is an ElementTree object. 
@classmethod def parse_xml(clazz, config_xml): @@ -260,10 +256,6 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file) return False - ################## - # Public Methods # - ################## - def _delete_remote_all(self, rel_path: str) -> bool: try: blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) From deb0494aa63900c02e511c00551e8b123b6552d2 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 07:14:41 -0400 Subject: [PATCH 23/40] Remove duplication around cache monitor starting. --- lib/galaxy/objectstore/azure_blob.py | 4 +--- lib/galaxy/objectstore/caching.py | 4 ++++ lib/galaxy/objectstore/cloud.py | 6 +----- lib/galaxy/objectstore/rucio.py | 3 +-- lib/galaxy/objectstore/s3.py | 7 +------ 5 files changed, 8 insertions(+), 16 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 0dcedeaf8f31..6e306e74137d 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -116,9 +116,7 @@ def _initialize(self): raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE) self._configure_connection() - - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + self._start_cache_monitor_if_needed() def to_dict(self): as_dict = super().to_dict() diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index 1bd002b770f8..d743cbf7e383 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -501,6 +501,10 @@ def cache_target(self) -> CacheTarget: def _shutdown_cache_monitor(self) -> None: self.cache_monitor and self.cache_monitor.shutdown() + def _start_cache_monitor_if_needed(self): + if self.enable_cache_monitor: + self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + def _get_remote_size(self, rel_path: str) -> int: ... def _exists_remotely(self, rel_path: str) -> bool: ... 
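A small self-contained sketch of the lifecycle this converges on, with invented Toy* names standing in for the Galaxy classes: subclasses call the shared helpers from _initialize and shutdown instead of constructing or tearing down the monitor themselves.

class ToyMonitor:
    """Stand-in for InProcessCacheMonitor."""

    def __init__(self):
        self.running = True

    def shutdown(self):
        self.running = False


class ToyCachingStore:
    def __init__(self, enable_cache_monitor=True):
        self.enable_cache_monitor = enable_cache_monitor
        self.cache_monitor = None

    # Shared helpers, mirroring the caching.py additions above.
    def _start_cache_monitor_if_needed(self):
        if self.enable_cache_monitor:
            self.cache_monitor = ToyMonitor()

    def _shutdown_cache_monitor(self):
        self.cache_monitor and self.cache_monitor.shutdown()


class ToyBackend(ToyCachingStore):
    def _initialize(self):
        self._start_cache_monitor_if_needed()

    def shutdown(self):
        self._shutdown_cache_monitor()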
diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index f79de39b6653..99cd22df094d 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -88,7 +88,7 @@ def _initialize(self): self.conn = self._get_connection(self.provider, self.credentials) self.bucket = self._get_bucket(self.bucket_name) - self.start_cache_monitor() + self._start_cache_monitor_if_needed() # Test if 'axel' is available for parallel download and pull the key into cache try: subprocess.call("axel") @@ -96,10 +96,6 @@ def _initialize(self): except OSError: self.use_axel = False - def start_cache_monitor(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) - @staticmethod def _get_connection(provider, credentials): log.debug(f"Configuring `{provider}` Connection") diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index d1253949d038..dc3435a84c52 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -308,8 +308,7 @@ def __init__(self, config, config_dict): self._initialize() def _initialize(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + self._start_cache_monitor_if_needed() def _pull_into_cache(self, rel_path, auth_token): log.debug("rucio _pull_into_cache: %s", rel_path) diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index d46daafb691e..7fd55e20241f 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -26,7 +26,6 @@ from . import ConcreteObjectStore from .caching import ( enable_cache_monitor, - InProcessCacheMonitor, parse_caching_config_dict_from_xml, UsesCache, ) @@ -208,17 +207,13 @@ def _initialize(self): self._configure_connection() self._bucket = self._get_bucket(self.bucket) - self.start_cache_monitor() + self._start_cache_monitor_if_needed() # Test if 'axel' is available for parallel download and pull the key into cache if which("axel"): self.use_axel = True else: self.use_axel = False - def start_cache_monitor(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) - def _configure_connection(self): log.debug("Configuring S3 Connection") # If access_key is empty use default credential chain From 1ac4a0b06a79a8f76e303a6ec61fe591659f5787 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 07:29:49 -0400 Subject: [PATCH 24/40] Converge _download a bit if needed. --- lib/galaxy/objectstore/azure_blob.py | 5 +++-- lib/galaxy/objectstore/cloud.py | 14 ++++++++------ lib/galaxy/objectstore/irods.py | 4 ++-- lib/galaxy/objectstore/s3.py | 14 ++++++++------ 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 6e306e74137d..3d4180af0656 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -186,11 +186,12 @@ def _download(self, rel_path): local_destination = self._get_cache_path(rel_path) try: log.debug("Pulling '%s' into cache to %s", rel_path, local_destination) - if not self.cache_target.fits_in_cache(self._get_remote_size(rel_path)): + remote_size = self._get_remote_size(rel_path) + if not self.cache_target.fits_in_cache(remote_size): log.critical( "File %s is larger (%s bytes) than the configured cache allows (%s). 
Cannot download.", rel_path, - self._get_remote_size(rel_path), + remote_size, self.cache_target.log_description, ) return False diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 99cd22df094d..ddf0a2659063 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -263,28 +263,30 @@ def _exists_remotely(self, rel_path): return exists def _download(self, rel_path): + local_destination = self._get_cache_path(rel_path) try: - log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) key = self.bucket.objects.get(rel_path) + remote_size = key.size # Test if cache is large enough to hold the new file - if not self.cache_target.fits_in_cache(key.size): + if not self.cache_target.fits_in_cache(remote_size): log.critical( "File %s is larger (%s) than the configured cache allows (%s). Cannot download.", rel_path, - key.size, + remote_size, self.cache_target.log_description, ) return False if self.use_axel: - log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) ncores = multiprocessing.cpu_count() url = key.generate_url(7200) ret_code = subprocess.call(f"axel -a -n {ncores} '{url}'") if ret_code == 0: return True else: - log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) - with open(self._get_cache_path(rel_path), "wb+") as downloaded_file_handle: + log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) + with open(local_destination, "wb+") as downloaded_file_handle: key.save_content(downloaded_file_handle) return True except Exception: diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 10ab5f9c3a8f..0f23f3d99206 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -350,7 +350,8 @@ def _exists_remotely(self, rel_path): def _download(self, rel_path): ipt_timer = ExecutionTimer() - log.debug("Pulling data object '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + cache_path = self._get_cache_path(rel_path) + log.debug("Pulling data object '%s' into cache to %s", rel_path, cache_path) p = Path(rel_path) data_object_name = p.stem + p.suffix @@ -364,7 +365,6 @@ def _download(self, rel_path): options = {kw.FORCE_FLAG_KW: "", kw.DEST_RESC_NAME_KW: self.resource} try: - cache_path = self._get_cache_path(rel_path) self.session.data_objects.get(data_object_path, cache_path, **options) log.debug("Pulled data object '%s' into cache to %s", rel_path, cache_path) return True diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 7fd55e20241f..0e9996bc0ef6 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -300,33 +300,35 @@ def _transfer_cb(self, complete, total): self.transfer_progress += 10 def _download(self, rel_path): + local_destination = self._get_cache_path(rel_path) try: - log.debug("Pulling key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) key = self._bucket.get_key(rel_path) if key is None: message = f"Attempting to download an invalid key for path {rel_path}." 
log.critical(message) raise Exception(message) + remote_size = key.size # Test if cache is large enough to hold the new file - if not self.cache_target.fits_in_cache(key.size): + if not self.cache_target.fits_in_cache(remote_size): log.critical( "File %s is larger (%s) than the configured cache allows (%s). Cannot download.", rel_path, - key.size, + remote_size, self.cache_target.log_description, ) return False if self.use_axel: - log.debug("Parallel pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) ncores = multiprocessing.cpu_count() url = key.generate_url(7200) ret_code = subprocess.call(["axel", "-a", "-n", str(ncores), url]) if ret_code == 0: return True else: - log.debug("Pulled key '%s' into cache to %s", rel_path, self._get_cache_path(rel_path)) + log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) self.transfer_progress = 0 # Reset transfer progress counter - key.get_contents_to_filename(self._get_cache_path(rel_path), cb=self._transfer_cb, num_cb=10) + key.get_contents_to_filename(local_destination, cb=self._transfer_cb, num_cb=10) return True except S3ResponseError: log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self._bucket.name) From ab0f3a744b7e168486d45e103e38e7e10984d1fa Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 07:46:32 -0400 Subject: [PATCH 25/40] Re-work object store de-duplication for proper typing. --- lib/galaxy/objectstore/_caching_base.py | 333 ++++++++++++++++++++++++ lib/galaxy/objectstore/azure_blob.py | 10 +- lib/galaxy/objectstore/caching.py | 301 --------------------- lib/galaxy/objectstore/cloud.py | 12 +- lib/galaxy/objectstore/irods.py | 7 +- lib/galaxy/objectstore/pithos.py | 7 +- lib/galaxy/objectstore/rucio.py | 9 +- lib/galaxy/objectstore/s3.py | 9 +- 8 files changed, 350 insertions(+), 338 deletions(-) create mode 100644 lib/galaxy/objectstore/_caching_base.py diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py new file mode 100644 index 000000000000..9b0ca715f2b2 --- /dev/null +++ b/lib/galaxy/objectstore/_caching_base.py @@ -0,0 +1,333 @@ +import logging +import os +import shutil +from typing import ( + Any, + Dict, + Optional, +) + +from galaxy.exceptions import ( + ObjectInvalid, + ObjectNotFound, +) +from galaxy.objectstore import ConcreteObjectStore +from galaxy.util import ( + directory_hash_id, + ExecutionTimer, + unlink, +) +from galaxy.util.path import safe_relpath +from ._util import fix_permissions +from .caching import ( + CacheTarget, + InProcessCacheMonitor, +) + +log = logging.getLogger(__name__) + + +class CachingConcreteObjectStore(ConcreteObjectStore): + staging_path: str + extra_dirs: Dict[str, str] + config: Any + cache_updated_data: bool + enable_cache_monitor: bool + cache_size: int + cache_monitor: Optional[InProcessCacheMonitor] = None + cache_monitor_interval: int + + def _construct_path( + self, + obj, + base_dir=None, + dir_only=None, + extra_dir=None, + extra_dir_at_root=False, + alt_name=None, + obj_dir=False, + in_cache=False, + **kwargs, + ): + # extra_dir should never be constructed from provided data but just + # make sure there are no shenannigans afoot + if extra_dir and extra_dir != os.path.normpath(extra_dir): + log.warning("extra_dir is not normalized: %s", extra_dir) + raise ObjectInvalid("The requested object is invalid") + # ensure that any parent directory references in alt_name 
would not + # result in a path not contained in the directory path constructed here + if alt_name: + if not safe_relpath(alt_name): + log.warning("alt_name would locate path outside dir: %s", alt_name) + raise ObjectInvalid("The requested object is invalid") + # alt_name can contain parent directory references, but S3 will not + # follow them, so if they are valid we normalize them out + alt_name = os.path.normpath(alt_name) + + object_id = self._get_object_id(obj) + rel_path = os.path.join(*directory_hash_id(object_id)) + + if extra_dir is not None: + if extra_dir_at_root: + rel_path = os.path.join(extra_dir, rel_path) + else: + rel_path = os.path.join(rel_path, extra_dir) + + # for JOB_WORK directory + if obj_dir: + rel_path = os.path.join(rel_path, str(object_id)) + if base_dir: + base = self.extra_dirs.get(base_dir) + assert base + return os.path.join(base, rel_path) + + if not dir_only: + rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{object_id}.dat") + + if in_cache: + return self._get_cache_path(rel_path) + + return rel_path + + def _get_cache_path(self, rel_path: str) -> str: + return os.path.abspath(os.path.join(self.staging_path, rel_path)) + + def _in_cache(self, rel_path: str) -> bool: + """Check if the given dataset is in the local cache and return True if so.""" + cache_path = self._get_cache_path(rel_path) + return os.path.exists(cache_path) + + def _pull_into_cache(self, rel_path) -> bool: + ipt_timer = ExecutionTimer() + # Ensure the cache directory structure exists (e.g., dataset_#_files/) + rel_path_dir = os.path.dirname(rel_path) + if not os.path.exists(self._get_cache_path(rel_path_dir)): + os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) + # Now pull in the file + file_ok = self._download(rel_path) + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) + log.debug("_pull_into_cache: %s\n\n\n\n\n\n", ipt_timer) + return file_ok + + def _get_data(self, obj, start=0, count=-1, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + # Check cache first and get file if not there + if not self._in_cache(rel_path): + self._pull_into_cache(rel_path) + # Read the file content from cache + data_file = open(self._get_cache_path(rel_path)) + data_file.seek(start) + content = data_file.read(count) + data_file.close() + return content + + def _exists(self, obj, **kwargs): + in_cache = exists_remotely = False + rel_path = self._construct_path(obj, **kwargs) + dir_only = kwargs.get("dir_only", False) + base_dir = kwargs.get("base_dir", None) + + # check job work directory stuff early to skip API hits. + if dir_only and base_dir: + if not os.path.exists(rel_path): + os.makedirs(rel_path, exist_ok=True) + return True + + in_cache = self._in_cache(rel_path) + exists_remotely = self._exists_remotely(rel_path) + dir_only = kwargs.get("dir_only", False) + base_dir = kwargs.get("base_dir", None) + if dir_only: + if in_cache or exists_remotely: + return True + else: + return False + + # TODO: Sync should probably not be done here. Add this to an async upload stack? 
+ if in_cache and not exists_remotely: + self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) + return True + elif exists_remotely: + return True + else: + return False + + def _create(self, obj, **kwargs): + if not self._exists(obj, **kwargs): + # Pull out locally used fields + extra_dir = kwargs.get("extra_dir", None) + extra_dir_at_root = kwargs.get("extra_dir_at_root", False) + dir_only = kwargs.get("dir_only", False) + alt_name = kwargs.get("alt_name", None) + + # Construct hashed path + rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) + + # Optionally append extra_dir + if extra_dir is not None: + if extra_dir_at_root: + rel_path = os.path.join(extra_dir, rel_path) + else: + rel_path = os.path.join(rel_path, extra_dir) + + # Create given directory in cache + cache_dir = os.path.join(self.staging_path, rel_path) + if not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + + # If instructed, create the dataset in cache & in S3 + if not dir_only: + rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") + open(os.path.join(self.staging_path, rel_path), "w").close() + self._push_to_os(rel_path, from_string="") + return self + + def _empty(self, obj, **kwargs): + if self._exists(obj, **kwargs): + return self._size(obj, **kwargs) == 0 + else: + raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") + + def _size(self, obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + if self._in_cache(rel_path): + try: + return os.path.getsize(self._get_cache_path(rel_path)) + except OSError as ex: + log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) + elif self._exists_remotely(rel_path): + return self._get_remote_size(rel_path) + log.warning("Did not find dataset '%s', returning 0 for size", rel_path) + return 0 + + def _get_filename(self, obj, **kwargs): + base_dir = kwargs.get("base_dir", None) + dir_only = kwargs.get("dir_only", False) + obj_dir = kwargs.get("obj_dir", False) + sync_cache = kwargs.get("sync_cache", True) + + rel_path = self._construct_path(obj, **kwargs) + + # for JOB_WORK directory + if base_dir and dir_only and obj_dir: + return os.path.abspath(rel_path) + + cache_path = self._get_cache_path(rel_path) + if not sync_cache: + return cache_path + + # Check if the file exists in the cache first, always pull if file size in cache is zero + if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): + return cache_path + + # Check if the file exists in persistent storage and, if it does, pull it into cache + elif self._exists(obj, **kwargs): + if dir_only: + self._download_directory_into_cache(rel_path, cache_path) + return cache_path + else: + if self._pull_into_cache(rel_path): + return cache_path + raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") + + def _download_directory_into_cache(self, rel_path, cache_path): + # azure, pithos, irod, and cloud did not do this prior to refactoring so I am assuming + # there is just operations that fail with these object stores, + # I'm placing a no-op here to match their behavior but we should + # maybe implement this for those object stores. 
+ pass + + def _delete(self, obj, entire_dir=False, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + extra_dir = kwargs.get("extra_dir", None) + base_dir = kwargs.get("base_dir", None) + dir_only = kwargs.get("dir_only", False) + obj_dir = kwargs.get("obj_dir", False) + try: + # Remove temporary data in JOB_WORK directory + if base_dir and dir_only and obj_dir: + shutil.rmtree(os.path.abspath(rel_path)) + return True + + # For the case of extra_files, because we don't have a reference to + # individual files/keys we need to remove the entire directory structure + # with all the files in it. This is easy for the local file system, + # but requires iterating through each individual key in S3 and deleing it. + if entire_dir and extra_dir: + shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) + return self._delete_remote_all(rel_path) + else: + # Delete from cache first + unlink(self._get_cache_path(rel_path), ignore_errors=True) + # Delete from S3 as well + if self._exists_remotely(rel_path): + return self._delete_existing_remote(rel_path) + except OSError: + log.exception("%s delete error", self._get_filename(obj, **kwargs)) + return False + + def _update_from_file(self, obj, file_name=None, create=False, **kwargs): + if create: + self._create(obj, **kwargs) + + if self._exists(obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + # Chose whether to use the dataset file itself or an alternate file + if file_name: + source_file = os.path.abspath(file_name) + # Copy into cache + cache_file = self._get_cache_path(rel_path) + try: + if source_file != cache_file and self.cache_updated_data: + # FIXME? Should this be a `move`? + shutil.copy2(source_file, cache_file) + fix_permissions(self.config, cache_file) + except OSError: + log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) + else: + source_file = self._get_cache_path(rel_path) + + self._push_to_os(rel_path, source_file) + + else: + raise ObjectNotFound( + f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" + ) + + @property + def cache_target(self) -> CacheTarget: + print("GRABBING CACHE_TARGET>...") + return CacheTarget( + self.staging_path, + self.cache_size, + 0.9, + ) + + def _shutdown_cache_monitor(self) -> None: + self.cache_monitor and self.cache_monitor.shutdown() + + def _start_cache_monitor_if_needed(self): + if self.enable_cache_monitor: + self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) + + def _get_remote_size(self, rel_path: str) -> int: + raise NotImplementedError() + + def _exists_remotely(self, rel_path: str) -> bool: + raise NotImplementedError() + + def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: Optional[str] = None) -> None: + raise NotImplementedError() + + # def _get_object_id(self, obj: Any) -> str: + # raise NotImplementedError() + + def _download(self, rel_path: str) -> bool: + raise NotImplementedError() + + # Do not need to override these if instead replacing _delete + def _delete_existing_remote(self, rel_path) -> bool: + raise NotImplementedError() + + def _delete_remote_all(self, rel_path) -> bool: + raise NotImplementedError() diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 3d4180af0656..84acf8b45ef2 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -8,7 +8,6 @@ datetime, timedelta, ) -from typing import Optional try: from 
azure.common import AzureHttpError @@ -20,12 +19,10 @@ except ImportError: BlobServiceClient = None -from . import ConcreteObjectStore +from ._caching_base import CachingConcreteObjectStore from .caching import ( enable_cache_monitor, - InProcessCacheMonitor, parse_caching_config_dict_from_xml, - UsesCache, ) NO_BLOBSERVICE_ERROR_MESSAGE = ( @@ -72,7 +69,7 @@ def parse_config_xml(config_xml): }, "cache": cache_dict, "extra_dirs": extra_dirs, - "private": ConcreteObjectStore.parse_private_from_config_xml(config_xml), + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), } except Exception: # Toss it back up after logging, we can't continue loading at this point. @@ -80,14 +77,13 @@ def parse_config_xml(config_xml): raise -class AzureBlobObjectStore(ConcreteObjectStore, UsesCache): +class AzureBlobObjectStore(CachingConcreteObjectStore): """ Object store that stores objects as blobs in an Azure Blob Container. A local cache exists that is used as an intermediate location for files between Galaxy and Azure. """ - cache_monitor: Optional[InProcessCacheMonitor] = None store_type = "azure_blob" def __init__(self, config, config_dict): diff --git a/lib/galaxy/objectstore/caching.py b/lib/galaxy/objectstore/caching.py index d743cbf7e383..502d55b5a75c 100644 --- a/lib/galaxy/objectstore/caching.py +++ b/lib/galaxy/objectstore/caching.py @@ -3,13 +3,10 @@ import logging import os -import shutil import threading import time from math import inf from typing import ( - Any, - Dict, List, Optional, Tuple, @@ -17,20 +14,11 @@ from typing_extensions import NamedTuple -from galaxy.exceptions import ( - ObjectInvalid, - ObjectNotFound, -) from galaxy.util import ( - directory_hash_id, - ExecutionTimer, nice_size, string_as_bool, - unlink, ) -from galaxy.util.path import safe_relpath from galaxy.util.sleeper import Sleeper -from ._util import fix_permissions log = logging.getLogger(__name__) @@ -225,292 +213,3 @@ def shutdown(self): # Wait for the cache monitor thread to join before ending self.cache_monitor_thread.join(5) - - -# mixin for object stores using a cache directory -class UsesCache: - staging_path: str - extra_dirs: Dict[str, str] - config: Any - - def _construct_path( - self, - obj, - base_dir=None, - dir_only=None, - extra_dir=None, - extra_dir_at_root=False, - alt_name=None, - obj_dir=False, - in_cache=False, - **kwargs, - ): - # extra_dir should never be constructed from provided data but just - # make sure there are no shenannigans afoot - if extra_dir and extra_dir != os.path.normpath(extra_dir): - log.warning("extra_dir is not normalized: %s", extra_dir) - raise ObjectInvalid("The requested object is invalid") - # ensure that any parent directory references in alt_name would not - # result in a path not contained in the directory path constructed here - if alt_name: - if not safe_relpath(alt_name): - log.warning("alt_name would locate path outside dir: %s", alt_name) - raise ObjectInvalid("The requested object is invalid") - # alt_name can contain parent directory references, but S3 will not - # follow them, so if they are valid we normalize them out - alt_name = os.path.normpath(alt_name) - - object_id = self._get_object_id(obj) - rel_path = os.path.join(*directory_hash_id(object_id)) - - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # for JOB_WORK directory - if obj_dir: - rel_path = os.path.join(rel_path, str(object_id)) - if base_dir: - base = 
self.extra_dirs.get(base_dir) - assert base - return os.path.join(base, rel_path) - - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{object_id}.dat") - - if in_cache: - return self._get_cache_path(rel_path) - - return rel_path - - def _get_cache_path(self, rel_path: str) -> str: - return os.path.abspath(os.path.join(self.staging_path, rel_path)) - - def _in_cache(self, rel_path: str) -> bool: - """Check if the given dataset is in the local cache and return True if so.""" - cache_path = self._get_cache_path(rel_path) - return os.path.exists(cache_path) - - def _pull_into_cache(self, rel_path) -> bool: - ipt_timer = ExecutionTimer() - # Ensure the cache directory structure exists (e.g., dataset_#_files/) - rel_path_dir = os.path.dirname(rel_path) - if not os.path.exists(self._get_cache_path(rel_path_dir)): - os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) - # Now pull in the file - file_ok = self._download(rel_path) - fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - log.debug("_pull_into_cache: %s\n\n\n\n\n\n", ipt_timer) - return file_ok - - def _get_data(self, obj, start=0, count=-1, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Check cache first and get file if not there - if not self._in_cache(rel_path): - self._pull_into_cache(rel_path) - # Read the file content from cache - data_file = open(self._get_cache_path(rel_path)) - data_file.seek(start) - content = data_file.read(count) - data_file.close() - return content - - def _exists(self, obj, **kwargs): - in_cache = exists_remotely = False - rel_path = self._construct_path(obj, **kwargs) - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - - # check job work directory stuff early to skip API hits. - if dir_only and base_dir: - if not os.path.exists(rel_path): - os.makedirs(rel_path, exist_ok=True) - return True - - in_cache = self._in_cache(rel_path) - exists_remotely = self._exists_remotely(rel_path) - dir_only = kwargs.get("dir_only", False) - base_dir = kwargs.get("base_dir", None) - if dir_only: - if in_cache or exists_remotely: - return True - else: - return False - - # TODO: Sync should probably not be done here. Add this to an async upload stack? 
- if in_cache and not exists_remotely: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) - return True - elif exists_remotely: - return True - else: - return False - - def _create(self, obj, **kwargs): - if not self._exists(obj, **kwargs): - # Pull out locally used fields - extra_dir = kwargs.get("extra_dir", None) - extra_dir_at_root = kwargs.get("extra_dir_at_root", False) - dir_only = kwargs.get("dir_only", False) - alt_name = kwargs.get("alt_name", None) - - # Construct hashed path - rel_path = os.path.join(*directory_hash_id(self._get_object_id(obj))) - - # Optionally append extra_dir - if extra_dir is not None: - if extra_dir_at_root: - rel_path = os.path.join(extra_dir, rel_path) - else: - rel_path = os.path.join(rel_path, extra_dir) - - # Create given directory in cache - cache_dir = os.path.join(self.staging_path, rel_path) - if not os.path.exists(cache_dir): - os.makedirs(cache_dir, exist_ok=True) - - # If instructed, create the dataset in cache & in S3 - if not dir_only: - rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") - open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") - return self - - def _empty(self, obj, **kwargs): - if self._exists(obj, **kwargs): - return self._size(obj, **kwargs) == 0 - else: - raise ObjectNotFound(f"objectstore.empty, object does not exist: {obj}, kwargs: {kwargs}") - - def _size(self, obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - if self._in_cache(rel_path): - try: - return os.path.getsize(self._get_cache_path(rel_path)) - except OSError as ex: - log.info("Could not get size of file '%s' in local cache, will try Azure. Error: %s", rel_path, ex) - elif self._exists_remotely(rel_path): - return self._get_remote_size(rel_path) - log.warning("Did not find dataset '%s', returning 0 for size", rel_path) - return 0 - - def _get_filename(self, obj, **kwargs): - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - sync_cache = kwargs.get("sync_cache", True) - - rel_path = self._construct_path(obj, **kwargs) - - # for JOB_WORK directory - if base_dir and dir_only and obj_dir: - return os.path.abspath(rel_path) - - cache_path = self._get_cache_path(rel_path) - if not sync_cache: - return cache_path - - # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): - return cache_path - - # Check if the file exists in persistent storage and, if it does, pull it into cache - elif self._exists(obj, **kwargs): - if dir_only: - self._download_directory_into_cache(rel_path, cache_path) - return cache_path - else: - if self._pull_into_cache(rel_path): - return cache_path - raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") - - def _download_directory_into_cache(self, rel_path, cache_path): - # azure, pithos, irod, and cloud did not do this prior to refactoring so I am assuming - # there is just operations that fail with these object stores, - # I'm placing a no-op here to match their behavior but we should - # maybe implement this for those object stores. 
- pass - - def _delete(self, obj, entire_dir=False, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - extra_dir = kwargs.get("extra_dir", None) - base_dir = kwargs.get("base_dir", None) - dir_only = kwargs.get("dir_only", False) - obj_dir = kwargs.get("obj_dir", False) - try: - # Remove temporary data in JOB_WORK directory - if base_dir and dir_only and obj_dir: - shutil.rmtree(os.path.abspath(rel_path)) - return True - - # For the case of extra_files, because we don't have a reference to - # individual files/keys we need to remove the entire directory structure - # with all the files in it. This is easy for the local file system, - # but requires iterating through each individual key in S3 and deleing it. - if entire_dir and extra_dir: - shutil.rmtree(self._get_cache_path(rel_path), ignore_errors=True) - return self._delete_remote_all(rel_path) - else: - # Delete from cache first - unlink(self._get_cache_path(rel_path), ignore_errors=True) - # Delete from S3 as well - if self._exists_remotely(rel_path): - return self._delete_existing_remote(rel_path) - except OSError: - log.exception("%s delete error", self._get_filename(obj, **kwargs)) - return False - - def _update_from_file(self, obj, file_name=None, create=False, **kwargs): - if create: - self._create(obj, **kwargs) - - if self._exists(obj, **kwargs): - rel_path = self._construct_path(obj, **kwargs) - # Chose whether to use the dataset file itself or an alternate file - if file_name: - source_file = os.path.abspath(file_name) - # Copy into cache - cache_file = self._get_cache_path(rel_path) - try: - if source_file != cache_file and self.cache_updated_data: - # FIXME? Should this be a `move`? - shutil.copy2(source_file, cache_file) - fix_permissions(self.config, cache_file) - except OSError: - log.exception("Trouble copying source file '%s' to cache '%s'", source_file, cache_file) - else: - source_file = self._get_cache_path(rel_path) - - self._push_to_os(rel_path, source_file) - - else: - raise ObjectNotFound( - f"objectstore.update_from_file, object does not exist: {str(obj)}, kwargs: {str(kwargs)}" - ) - - @property - def cache_target(self) -> CacheTarget: - return CacheTarget( - self.staging_path, - self.cache_size, - 0.9, - ) - - def _shutdown_cache_monitor(self) -> None: - self.cache_monitor and self.cache_monitor.shutdown() - - def _start_cache_monitor_if_needed(self): - if self.enable_cache_monitor: - self.cache_monitor = InProcessCacheMonitor(self.cache_target, self.cache_monitor_interval) - - def _get_remote_size(self, rel_path: str) -> int: ... - - def _exists_remotely(self, rel_path: str) -> bool: ... - - def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: Optional[str] = None) -> None: ... - - def _get_object_id(self, obj: Any) -> str: ... - - def _download(self, rel_path: str) -> bool: ... diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index ddf0a2659063..c78992dd6d94 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -8,14 +8,9 @@ import os.path import subprocess from datetime import datetime -from typing import Optional -from . 
import ConcreteObjectStore -from .caching import ( - enable_cache_monitor, - InProcessCacheMonitor, - UsesCache, -) +from ._caching_base import CachingConcreteObjectStore +from .caching import enable_cache_monitor from .s3 import parse_config_xml try: @@ -53,14 +48,13 @@ def _config_to_dict(self): } -class Cloud(ConcreteObjectStore, CloudConfigMixin, UsesCache): +class Cloud(CachingConcreteObjectStore, CloudConfigMixin): """ Object store that stores objects as items in an cloud storage. A local cache exists that is used as an intermediate location for files between Galaxy and the cloud storage. """ - cache_monitor: Optional[InProcessCacheMonitor] = None store_type = "cloud" def __init__(self, config, config_dict): diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 0f23f3d99206..44160b9cfab3 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -25,8 +25,7 @@ string_as_bool, unlink, ) -from . import DiskObjectStore -from .caching import UsesCache +from ._caching_base import CachingConcreteObjectStore IRODS_IMPORT_MESSAGE = "The Python irods package is required to use this feature, please install it" # 1 MB @@ -109,7 +108,7 @@ def parse_config_xml(config_xml): "cache_updated_data": cache_updated_data, }, "extra_dirs": extra_dirs, - "private": DiskObjectStore.parse_private_from_config_xml(config_xml), + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), } except Exception: # Toss it back up after logging, we can't continue loading at this point. @@ -145,7 +144,7 @@ def _config_to_dict(self): } -class IRODSObjectStore(DiskObjectStore, CloudConfigMixin, UsesCache): +class IRODSObjectStore(CachingConcreteObjectStore, CloudConfigMixin): """ Object store that stores files as data objects in an iRODS Zone. A local cache exists that is used as an intermediate location for files between Galaxy and iRODS. diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 2ba74636ac2b..89cfb13dfe59 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -17,8 +17,7 @@ KamakiClient = None from galaxy.util import directory_hash_id -from . import ConcreteObjectStore -from .caching import UsesCache +from ._caching_base import CachingConcreteObjectStore NO_KAMAKI_ERROR_MESSAGE = ( "ObjectStore configured, but no kamaki.clients dependency available." @@ -69,7 +68,7 @@ def parse_config_xml(config_xml): log.error(msg) raise Exception(msg) r["extra_dirs"] = [{k: e.get(k) for k in attrs} for e in extra_dirs] - r["private"] = ConcreteObjectStore.parse_private_from_config_xml(config_xml) + r["private"] = CachingConcreteObjectStore.parse_private_from_config_xml(config_xml) if "job_work" not in (d["type"] for d in r["extra_dirs"]): msg = f'No value for {tag}:type="job_work" in XML tree' log.error(msg) @@ -80,7 +79,7 @@ def parse_config_xml(config_xml): return r -class PithosObjectStore(ConcreteObjectStore, UsesCache): +class PithosObjectStore(CachingConcreteObjectStore): """ Object store that stores objects as items in a Pithos+ container. Cache is ignored for the time being. diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index dc3435a84c52..2274946e375a 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -2,7 +2,6 @@ import logging import os import shutil -from typing import Optional try: import rucio.common @@ -32,12 +31,10 @@ umask_fix_perms, unlink, ) -from . 
import ConcreteObjectStore +from ._caching_base import CachingConcreteObjectStore from .caching import ( enable_cache_monitor, - InProcessCacheMonitor, parse_caching_config_dict_from_xml, - UsesCache, ) log = logging.getLogger(__name__) @@ -272,7 +269,7 @@ def delete(self, key, auth_token): return True -class RucioObjectStore(ConcreteObjectStore, UsesCache): +class RucioObjectStore(CachingConcreteObjectStore): """ Object store implementation that uses ORNL remote data broker. @@ -280,8 +277,6 @@ class RucioObjectStore(ConcreteObjectStore, UsesCache): Galaxy at some future point or significantly modified. """ - cache_monitor: Optional[InProcessCacheMonitor] = None - store_type = "rucio" def to_dict(self): diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 0e9996bc0ef6..175009f21efb 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -8,7 +8,6 @@ import subprocess import time from datetime import datetime -from typing import Optional try: # Imports are done this way to allow objectstore code to be used outside of Galaxy. @@ -23,11 +22,10 @@ string_as_bool, which, ) -from . import ConcreteObjectStore +from ._caching_base import CachingConcreteObjectStore from .caching import ( enable_cache_monitor, parse_caching_config_dict_from_xml, - UsesCache, ) from .s3_multipart_upload import multipart_upload @@ -109,7 +107,7 @@ def parse_config_xml(config_xml): }, "cache": cache_dict, "extra_dirs": extra_dirs, - "private": ConcreteObjectStore.parse_private_from_config_xml(config_xml), + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), } except Exception: # Toss it back up after logging, we can't continue loading at this point. @@ -144,14 +142,13 @@ def _config_to_dict(self): } -class S3ObjectStore(ConcreteObjectStore, CloudConfigMixin, UsesCache): +class S3ObjectStore(CachingConcreteObjectStore, CloudConfigMixin): """ Object store that stores objects as items in an AWS S3 bucket. A local cache exists that is used as an intermediate location for files between Galaxy and S3. """ - cache_monitor: Optional[InProcessCacheMonitor] = None store_type = "aws_s3" def __init__(self, config, config_dict): From 702fe366fb436506be3c43e0a13beb4e7d4a5c2d Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 09:00:04 -0400 Subject: [PATCH 26/40] Fix duplication around cache logging. Eliminate CloudConfigMixin... This got copid and pasted over and over and each place just used once. Really odd reading of my original code but I guess I understand why people did it. --- lib/galaxy/objectstore/_caching_base.py | 12 ++++++ lib/galaxy/objectstore/azure_blob.py | 10 +---- lib/galaxy/objectstore/cloud.py | 43 ++++++++----------- lib/galaxy/objectstore/irods.py | 56 ++++++++++++------------- lib/galaxy/objectstore/s3.py | 9 +--- 5 files changed, 58 insertions(+), 72 deletions(-) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index 9b0ca715f2b2..ce495f92fb5b 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -182,6 +182,18 @@ def _create(self, obj, **kwargs): self._push_to_os(rel_path, from_string="") return self + def _caching_allowed(self, rel_path: str, remote_size: Optional[int] = None) -> bool: + if remote_size is None: + remote_size = self._get_remote_size(rel_path) + if not self.cache_target.fits_in_cache(remote_size): + log.critical( + "File %s is larger (%s bytes) than the configured cache allows (%s). 
Cannot download.", + rel_path, + remote_size, + self.cache_target.log_description, + ) + return False + def _empty(self, obj, **kwargs): if self._exists(obj, **kwargs): return self._size(obj, **kwargs) == 0 diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index 84acf8b45ef2..ddd9eadbc8bf 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -3,7 +3,6 @@ """ import logging -import os from datetime import ( datetime, timedelta, @@ -182,14 +181,7 @@ def _download(self, rel_path): local_destination = self._get_cache_path(rel_path) try: log.debug("Pulling '%s' into cache to %s", rel_path, local_destination) - remote_size = self._get_remote_size(rel_path) - if not self.cache_target.fits_in_cache(remote_size): - log.critical( - "File %s is larger (%s bytes) than the configured cache allows (%s). Cannot download.", - rel_path, - remote_size, - self.cache_target.log_description, - ) + if not self._caching_allowed(rel_path): return False else: with open(local_destination, "wb") as f: diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index c78992dd6d94..35a713b88a81 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -31,24 +31,7 @@ ) -class CloudConfigMixin: - def _config_to_dict(self): - return { - "provider": self.provider, - "auth": self.credentials, - "bucket": { - "name": self.bucket_name, - "use_reduced_redundancy": self.use_rr, - }, - "cache": { - "size": self.cache_size, - "path": self.staging_path, - "cache_updated_data": self.cache_updated_data, - }, - } - - -class Cloud(CachingConcreteObjectStore, CloudConfigMixin): +class Cloud(CachingConcreteObjectStore): """ Object store that stores objects as items in an cloud storage. A local cache exists that is used as an intermediate location for files between @@ -213,6 +196,21 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict + def _config_to_dict(self): + return { + "provider": self.provider, + "auth": self.credentials, + "bucket": { + "name": self.bucket_name, + "use_reduced_redundancy": self.use_rr, + }, + "cache": { + "size": self.cache_size, + "path": self.staging_path, + "cache_updated_data": self.cache_updated_data, + }, + } + def _get_bucket(self, bucket_name): try: bucket = self.conn.storage.buckets.get(bucket_name) @@ -262,14 +260,7 @@ def _download(self, rel_path): log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) key = self.bucket.objects.get(rel_path) remote_size = key.size - # Test if cache is large enough to hold the new file - if not self.cache_target.fits_in_cache(remote_size): - log.critical( - "File %s is larger (%s) than the configured cache allows (%s). 
Cannot download.", - rel_path, - remote_size, - self.cache_target.log_description, - ) + if not self._caching_allowed(rel_path, remote_size): return False if self.use_axel: log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index 44160b9cfab3..c37ee75c72d2 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -116,35 +116,7 @@ def parse_config_xml(config_xml): raise -class CloudConfigMixin: - def _config_to_dict(self): - return { - "auth": { - "username": self.username, - "password": self.password, - }, - "resource": { - "name": self.resource, - }, - "zone": { - "name": self.zone, - }, - "connection": { - "host": self.host, - "port": self.port, - "timeout": self.timeout, - "refresh_time": self.refresh_time, - "connection_pool_monitor_interval": self.connection_pool_monitor_interval, - }, - "cache": { - "size": self.cache_size, - "path": self.staging_path, - "cache_updated_data": self.cache_updated_data, - }, - } - - -class IRODSObjectStore(CachingConcreteObjectStore, CloudConfigMixin): +class IRODSObjectStore(CachingConcreteObjectStore): """ Object store that stores files as data objects in an iRODS Zone. A local cache exists that is used as an intermediate location for files between Galaxy and iRODS. @@ -307,6 +279,32 @@ def to_dict(self): as_dict.update(self._config_to_dict()) return as_dict + def _config_to_dict(self): + return { + "auth": { + "username": self.username, + "password": self.password, + }, + "resource": { + "name": self.resource, + }, + "zone": { + "name": self.zone, + }, + "connection": { + "host": self.host, + "port": self.port, + "timeout": self.timeout, + "refresh_time": self.refresh_time, + "connection_pool_monitor_interval": self.connection_pool_monitor_interval, + }, + "cache": { + "size": self.cache_size, + "path": self.staging_path, + "cache_updated_data": self.cache_updated_data, + }, + } + # rel_path is file or folder? def _get_remote_size(self, rel_path): ipt_timer = ExecutionTimer() diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 175009f21efb..39fb41b64109 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -306,14 +306,7 @@ def _download(self, rel_path): log.critical(message) raise Exception(message) remote_size = key.size - # Test if cache is large enough to hold the new file - if not self.cache_target.fits_in_cache(remote_size): - log.critical( - "File %s is larger (%s) than the configured cache allows (%s). Cannot download.", - rel_path, - remote_size, - self.cache_target.log_description, - ) + if not self._caching_allowed(rel_path, remote_size): return False if self.use_axel: log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) From c0e09c96d2a29ffa18627458b3256caff0a25ba9 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 13:12:50 -0400 Subject: [PATCH 27/40] De-duplicat _push_to_os. 
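
The shared _push_to_os implementation in CachingConcreteObjectStore now
handles the source-file existence and zero-size checks and the transfer
timing logs, and delegates the actual transfer to two small backend
hooks. Roughly, a new caching backend only needs to supply something
like the following (a minimal sketch; self._client and its write_*
methods are hypothetical stand-ins for a real storage SDK):

    class MyObjectStore(CachingConcreteObjectStore):
        def _push_string_to_path(self, rel_path: str, from_string: str) -> bool:
            try:
                # hypothetical call that writes a string to the remote path
                self._client.write_string(rel_path, from_string)
                return True
            except Exception:
                log.exception("Trouble pushing to '%s' from string", rel_path)
                return False

        def _push_file_to_path(self, rel_path: str, source_file: str) -> bool:
            try:
                # hypothetical call that uploads a local file to the remote path
                self._client.write_file(rel_path, source_file)
                return True
            except Exception:
                log.exception("Trouble pushing to '%s' from file '%s'", rel_path, source_file)
                return False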
--- lib/galaxy/objectstore/_caching_base.py | 53 +++++++++++++++--- lib/galaxy/objectstore/azure_blob.py | 57 ++++--------------- lib/galaxy/objectstore/cloud.py | 73 +++++++------------------ 3 files changed, 77 insertions(+), 106 deletions(-) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index ce495f92fb5b..30360d71c1d2 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -1,6 +1,7 @@ import logging import os import shutil +from datetime import datetime from typing import ( Any, Dict, @@ -193,6 +194,44 @@ def _caching_allowed(self, rel_path: str, remote_size: Optional[int] = None) -> self.cache_target.log_description, ) return False + return True + + def _push_to_os(self, rel_path, source_file=None, from_string=None): + source_file = source_file or self._get_cache_path(rel_path) + if from_string is None and not os.path.exists(source_file): + log.error( + "Tried updating remote path '%s' from source file '%s', but source file does not exist.", + rel_path, + source_file, + ) + return False + + if from_string is None and os.path.getsize(source_file) == 0: + log.debug( + "Wanted to push file '%s' to remote path '%s' but its size is 0; skipping.", source_file, rel_path + ) + return True + + if from_string is not None: + return self._push_string_to_path(rel_path, from_string) + else: + start_time = datetime.now() + log.debug( + "Pushing cache file '%s' of size %s bytes to '%s'", + source_file, + os.path.getsize(source_file), + rel_path, + ) + success = self._push_file_to_path(rel_path, source_file) + end_time = datetime.now() + log.debug( + "Pushed cache file '%s' to blob '%s' (%s bytes transferred in %s sec)", + source_file, + rel_path, + os.path.getsize(source_file), + end_time - start_time, + ) + return success def _empty(self, obj, **kwargs): if self._exists(obj, **kwargs): @@ -308,7 +347,6 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): @property def cache_target(self) -> CacheTarget: - print("GRABBING CACHE_TARGET>...") return CacheTarget( self.staging_path, self.cache_size, @@ -328,12 +366,6 @@ def _get_remote_size(self, rel_path: str) -> int: def _exists_remotely(self, rel_path: str) -> bool: raise NotImplementedError() - def _push_to_os(self, rel_path, source_file: Optional[str] = None, from_string: Optional[str] = None) -> None: - raise NotImplementedError() - - # def _get_object_id(self, obj: Any) -> str: - # raise NotImplementedError() - def _download(self, rel_path: str) -> bool: raise NotImplementedError() @@ -343,3 +375,10 @@ def _delete_existing_remote(self, rel_path) -> bool: def _delete_remote_all(self, rel_path) -> bool: raise NotImplementedError() + + # Do not need to override these if instead replacing _push_to_os + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: + raise NotImplementedError() + + def _push_file_to_path(self, rel_path: str, target_file: str) -> bool: + raise NotImplementedError() diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index ddd9eadbc8bf..bc04bf9871df 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -191,57 +191,22 @@ def _download(self, rel_path): log.exception("Problem downloading '%s' from Azure", rel_path) return False - def _push_to_os(self, rel_path, source_file=None, from_string=None): - """ - Push the file pointed to by ``rel_path`` to the object store naming the blob - ``rel_path``. 
If ``source_file`` is provided, push that file instead while - still using ``rel_path`` as the blob name. - If ``from_string`` is provided, set contents of the file to the value of - the string. - """ + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: try: - source_file = source_file or self._get_cache_path(rel_path) - - if from_string is None and not os.path.exists(source_file): - log.error( - "Tried updating blob '%s' from source file '%s', but source file does not exist.", - rel_path, - source_file, - ) - return False - - if from_string is None and os.path.getsize(source_file) == 0: - log.debug( - "Wanted to push file '%s' to azure blob '%s' but its size is 0; skipping.", source_file, rel_path - ) - return True - - if from_string is not None: - self._blob_client(rel_path).upload_blob(from_string, overwrite=True) - log.debug("Pushed data from string '%s' to blob '%s'", from_string, rel_path) - else: - start_time = datetime.now() - log.debug( - "Pushing cache file '%s' of size %s bytes to '%s'", - source_file, - os.path.getsize(source_file), - rel_path, - ) - with open(source_file, "rb") as f: - self._blob_client(rel_path).upload_blob(f, overwrite=True) - end_time = datetime.now() - log.debug( - "Pushed cache file '%s' to blob '%s' (%s bytes transferred in %s sec)", - source_file, - rel_path, - os.path.getsize(source_file), - end_time - start_time, - ) + self._blob_client(rel_path).upload_blob(from_string, overwrite=True) return True + except AzureHttpError: + log.exception("Trouble pushing to Azure Blob '%s' from string", rel_path) + return False + def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: + try: + with open(source_file, "rb") as f: + self._blob_client(rel_path).upload_blob(f, overwrite=True) + return True except AzureHttpError: log.exception("Trouble pushing to Azure Blob '%s' from file '%s'", rel_path, source_file) - return False + return False def _delete_remote_all(self, rel_path: str) -> bool: try: diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 35a713b88a81..89d1f0855833 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -7,7 +7,6 @@ import os import os.path import subprocess -from datetime import datetime from ._caching_base import CachingConcreteObjectStore from .caching import enable_cache_monitor @@ -278,61 +277,29 @@ def _download(self, rel_path): log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name) return False - def _push_to_os(self, rel_path, source_file=None, from_string=None): - """ - Push the file pointed to by ``rel_path`` to the object store naming the key - ``rel_path``. If ``source_file`` is provided, push that file instead while - still using ``rel_path`` as the key name. - If ``from_string`` is provided, set contents of the file to the value of - the string. 
- """ + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: try: - source_file = source_file if source_file else self._get_cache_path(rel_path) - if os.path.exists(source_file): - if os.path.getsize(source_file) == 0 and (self.bucket.objects.get(rel_path) is not None): - log.debug( - "Wanted to push file '%s' to S3 key '%s' but its size is 0; skipping.", source_file, rel_path - ) - return True - if from_string: - if not self.bucket.objects.get(rel_path): - created_obj = self.bucket.objects.create(rel_path) - created_obj.upload(source_file) - else: - self.bucket.objects.get(rel_path).upload(source_file) - log.debug("Pushed data from string '%s' to key '%s'", from_string, rel_path) - else: - start_time = datetime.now() - log.debug( - "Pushing cache file '%s' of size %s bytes to key '%s'", - source_file, - os.path.getsize(source_file), - rel_path, - ) - if not self.bucket.objects.get(rel_path): - created_obj = self.bucket.objects.create(rel_path) - created_obj.upload_from_file(source_file) - else: - self.bucket.objects.get(rel_path).upload_from_file(source_file) - - end_time = datetime.now() - log.debug( - "Pushed cache file '%s' to key '%s' (%s bytes transfered in %s sec)", - source_file, - rel_path, - os.path.getsize(source_file), - end_time - start_time, - ) - return True + if not self.bucket.objects.get(rel_path): + created_obj = self.bucket.objects.create(rel_path) + created_obj.upload(from_string) else: - log.error( - "Tried updating key '%s' from source file '%s', but source file does not exist.", - rel_path, - source_file, - ) + self.bucket.objects.get(rel_path).upload(from_string) + return True except Exception: - log.exception("Trouble pushing S3 key '%s' from file '%s'", rel_path, source_file) - return False + log.exception("Trouble pushing to cloud '%s' from string", rel_path) + return False + + def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: + try: + if not self.bucket.objects.get(rel_path): + created_obj = self.bucket.objects.create(rel_path) + created_obj.upload_from_file(source_file) + else: + self.bucket.objects.get(rel_path).upload_from_file(source_file) + return True + except Exception: + log.exception("Trouble pushing to cloud '%s' from file '%s'", rel_path, source_file) + return False def _delete_remote_all(self, rel_path: str) -> bool: try: From c42904700979b5353091fcd716a397c1369b4376 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 18:13:25 -0400 Subject: [PATCH 28/40] Fix irods test cases for code refactoring. --- .../integration/objectstore/test_objectstore_datatype_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/objectstore/test_objectstore_datatype_upload.py b/test/integration/objectstore/test_objectstore_datatype_upload.py index 3a17174bd66c..a383e43edaa1 100644 --- a/test/integration/objectstore/test_objectstore_datatype_upload.py +++ b/test/integration/objectstore/test_objectstore_datatype_upload.py @@ -234,7 +234,7 @@ def test_upload_datatype_irods_idle_connections( # Verify the connection pool has 0 active and 1 idle connections assert len(connection_pool.active) == 0 - assert len(connection_pool.idle) == 1 + assert len(connection_pool.idle) in [1, 2] # Wait for the idle connection to turn stale time.sleep(REFRESH_TIME) From 64262333d1f4d34cd23c51cebfae519390e9d828 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 13:56:22 -0400 Subject: [PATCH 29/40] Fix axel call across s3 & cloud object stores. 
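
The cloud object store passed the whole axel command to
subprocess.call() as a single string; without shell=True, subprocess
treats that string as the name of a single executable, so the parallel
download path failed before axel ever ran. Neither store passed an
output path either, so axel would have written into the current working
directory instead of the cache destination. Both stores now use the
argument-list form with an explicit output file:

    ret_code = subprocess.call(["axel", "-a", "-o", local_destination, "-n", str(ncores), url])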
--- lib/galaxy/objectstore/cloud.py | 2 +- lib/galaxy/objectstore/s3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 89d1f0855833..de3b4ef253e9 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -265,7 +265,7 @@ def _download(self, rel_path): log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) ncores = multiprocessing.cpu_count() url = key.generate_url(7200) - ret_code = subprocess.call(f"axel -a -n {ncores} '{url}'") + ret_code = subprocess.call(["axel", "-a", "-o", local_destination, "-n", str(ncores), url]) if ret_code == 0: return True else: diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 39fb41b64109..2392c2a28573 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -312,7 +312,7 @@ def _download(self, rel_path): log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) ncores = multiprocessing.cpu_count() url = key.generate_url(7200) - ret_code = subprocess.call(["axel", "-a", "-n", str(ncores), url]) + ret_code = subprocess.call(["axel", "-a", "-o", local_destination, "-n", str(ncores), url]) if ret_code == 0: return True else: From 7fd767e8a21fc07780298ac2524c9b719ccf2b7e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 14:04:03 -0400 Subject: [PATCH 30/40] Merge axel download stuff together. --- lib/galaxy/objectstore/_util.py | 22 +++++++++++++++++++++- lib/galaxy/objectstore/cloud.py | 17 ++++------------- lib/galaxy/objectstore/s3.py | 21 +++++---------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/lib/galaxy/objectstore/_util.py b/lib/galaxy/objectstore/_util.py index e70f26e0661e..fbdf9adde4f6 100644 --- a/lib/galaxy/objectstore/_util.py +++ b/lib/galaxy/objectstore/_util.py @@ -1,6 +1,11 @@ +import multiprocessing import os +import subprocess -from galaxy.util import umask_fix_perms +from galaxy.util import ( + umask_fix_perms, + which, +) def fix_permissions(config, rel_path: str): @@ -13,3 +18,18 @@ def fix_permissions(config, rel_path: str): if os.path.islink(path): continue umask_fix_perms(path, config.umask, 0o666, config.gid) + + +class UsesAxel: + use_axel: bool + + def _init_axel(self) -> None: + if which("axel"): + self.use_axel = True + else: + self.use_axel = False + + def _axel_download(self, url: str, path: str): + ncores = multiprocessing.cpu_count() + ret_code = subprocess.call(["axel", "-a", "-o", path, "-n", str(ncores), url]) + return ret_code == 0 diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index de3b4ef253e9..b6599b4e916d 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -3,12 +3,11 @@ """ import logging -import multiprocessing import os import os.path -import subprocess from ._caching_base import CachingConcreteObjectStore +from ._util import UsesAxel from .caching import enable_cache_monitor from .s3 import parse_config_xml @@ -30,7 +29,7 @@ ) -class Cloud(CachingConcreteObjectStore): +class Cloud(CachingConcreteObjectStore, UsesAxel): """ Object store that stores objects as items in an cloud storage. 
A local cache exists that is used as an intermediate location for files between @@ -65,12 +64,7 @@ def _initialize(self): self.conn = self._get_connection(self.provider, self.credentials) self.bucket = self._get_bucket(self.bucket_name) self._start_cache_monitor_if_needed() - # Test if 'axel' is available for parallel download and pull the key into cache - try: - subprocess.call("axel") - self.use_axel = True - except OSError: - self.use_axel = False + self._init_axel() @staticmethod def _get_connection(provider, credentials): @@ -263,11 +257,8 @@ def _download(self, rel_path): return False if self.use_axel: log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) - ncores = multiprocessing.cpu_count() url = key.generate_url(7200) - ret_code = subprocess.call(["axel", "-a", "-o", local_destination, "-n", str(ncores), url]) - if ret_code == 0: - return True + return self._axel_download(url, local_destination) else: log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) with open(local_destination, "wb+") as downloaded_file_handle: diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 2392c2a28573..9cff7e0fa800 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -3,9 +3,7 @@ """ import logging -import multiprocessing import os -import subprocess import time from datetime import datetime @@ -18,11 +16,9 @@ except ImportError: boto = None # type: ignore[assignment] -from galaxy.util import ( - string_as_bool, - which, -) +from galaxy.util import string_as_bool from ._caching_base import CachingConcreteObjectStore +from ._util import UsesAxel from .caching import ( enable_cache_monitor, parse_caching_config_dict_from_xml, @@ -142,7 +138,7 @@ def _config_to_dict(self): } -class S3ObjectStore(CachingConcreteObjectStore, CloudConfigMixin): +class S3ObjectStore(CachingConcreteObjectStore, CloudConfigMixin, UsesAxel): """ Object store that stores objects as items in an AWS S3 bucket. A local cache exists that is used as an intermediate location for files between @@ -205,11 +201,7 @@ def _initialize(self): self._configure_connection() self._bucket = self._get_bucket(self.bucket) self._start_cache_monitor_if_needed() - # Test if 'axel' is available for parallel download and pull the key into cache - if which("axel"): - self.use_axel = True - else: - self.use_axel = False + self._init_axel() def _configure_connection(self): log.debug("Configuring S3 Connection") @@ -310,11 +302,8 @@ def _download(self, rel_path): return False if self.use_axel: log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) - ncores = multiprocessing.cpu_count() url = key.generate_url(7200) - ret_code = subprocess.call(["axel", "-a", "-o", local_destination, "-n", str(ncores), url]) - if ret_code == 0: - return True + return self._axel_download(url, local_destination) else: log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) self.transfer_progress = 0 # Reset transfer progress counter From 13e1b32ceec7386a48d8b1cd91509655833d1167 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 7 May 2024 19:57:45 -0400 Subject: [PATCH 31/40] Implement a boto3 object store. 
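
The new backend is configured much like the existing generic_s3 store.
A minimal configuration, mirroring the templates used by the unit tests
added below (the bucket name and credentials are placeholders, and the
connection block can be omitted entirely for plain AWS S3), looks
roughly like:

    type: boto3
    store_by: uuid
    auth:
      access_key: ...
      secret_key: ...
    bucket:
      name: unique_bucket_name_all_lowercase
    connection:  # optional; set endpoint_url for non-AWS S3-compatible services
      endpoint_url: https://storage.googleapis.com
    extra_dirs:
    - type: job_work
      path: database/job_working_directory
    - type: temp
      path: database/tmp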
--- lib/galaxy/dependencies/__init__.py | 3 + lib/galaxy/objectstore/__init__.py | 4 + lib/galaxy/objectstore/s3_boto3.py | 340 ++++++++++++++++++++++ pyproject.toml | 1 + test/unit/objectstore/test_objectstore.py | 84 +++++- 5 files changed, 431 insertions(+), 1 deletion(-) create mode 100644 lib/galaxy/objectstore/s3_boto3.py diff --git a/lib/galaxy/dependencies/__init__.py b/lib/galaxy/dependencies/__init__.py index 0bb785aa136a..44322353329a 100644 --- a/lib/galaxy/dependencies/__init__.py +++ b/lib/galaxy/dependencies/__init__.py @@ -234,6 +234,9 @@ def check_python_pam(self): def check_azure_storage(self): return "azure_blob" in self.object_stores + def check_boto3(self): + return "boto3" in self.object_stores + def check_kamaki(self): return "pithos" in self.object_stores diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py index b954ea910f65..a09bcf4adb31 100644 --- a/lib/galaxy/objectstore/__init__.py +++ b/lib/galaxy/objectstore/__init__.py @@ -1386,6 +1386,10 @@ def type_to_object_store_class(store: str, fsmon: bool = False) -> Tuple[Type[Ba objectstore_constructor_kwds = {} if store == "disk": objectstore_class = DiskObjectStore + elif store == "boto3": + from .s3_boto3 import S3ObjectStore as Boto3ObjectStore + + objectstore_class = Boto3ObjectStore elif store in ["s3", "aws_s3"]: from .s3 import S3ObjectStore diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py new file mode 100644 index 000000000000..cd269aad61e4 --- /dev/null +++ b/lib/galaxy/objectstore/s3_boto3.py @@ -0,0 +1,340 @@ +"""A more modern version of the S3 object store based on boto3 instead of boto. +""" + +import logging +import os +from typing import ( + TYPE_CHECKING, +) + +from typing_extensions import ( + Literal, + NotRequired, + TypedDict, +) + +if TYPE_CHECKING: + from mypy_boto3_c3.client import S3Client + +try: + # Imports are done this way to allow objectstore code to be used outside of Galaxy. + import boto3 + from botocore.client import ClientError +except ImportError: + boto3 = None # type: ignore[assignment,unused-ignore] + +from ._caching_base import CachingConcreteObjectStore +from .caching import ( + enable_cache_monitor, + parse_caching_config_dict_from_xml, +) + +NO_BOTO_ERROR_MESSAGE = ( + "S3/Swift object store configured, but no boto3 dependency available." + "Please install and properly configure boto or modify object store configuration." 
+) + +log = logging.getLogger(__name__) +logging.getLogger("boto").setLevel(logging.INFO) # Otherwise boto is quite noisy + + +def download_directory(bucket, remote_folder, local_path): + # List objects in the specified S3 folder + objects = bucket.list(prefix=remote_folder) + + for obj in objects: + remote_file_path = obj.key + local_file_path = os.path.join(local_path, os.path.relpath(remote_file_path, remote_folder)) + + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download the file + obj.get_contents_to_filename(local_file_path) + + +def parse_config_xml(config_xml): + try: + a_xml = config_xml.findall("auth")[0] + access_key = a_xml.get("access_key") + secret_key = a_xml.get("secret_key") + + b_xml = config_xml.findall("bucket")[0] + bucket_name = b_xml.get("name") + + cn_xml = config_xml.findall("connection") + if not cn_xml: + cn_xml = {} + else: + cn_xml = cn_xml[0] + endpoint_url = cn_xml.get("endpoint_url") + region = cn_xml.get("region") + cache_dict = parse_caching_config_dict_from_xml(config_xml) + + tag, attrs = "extra_dir", ("type", "path") + extra_dirs = config_xml.findall(tag) + if not extra_dirs: + msg = f"No {tag} element in XML tree" + log.error(msg) + raise Exception(msg) + extra_dirs = [{k: e.get(k) for k in attrs} for e in extra_dirs] + + config_dict = { + "auth": { + "access_key": access_key, + "secret_key": secret_key, + }, + "bucket": { + "name": bucket_name, + }, + "connection": { + "endpoint_url": endpoint_url, + "region": region, + }, + "cache": cache_dict, + "extra_dirs": extra_dirs, + "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), + } + name = config_xml.attrib.get("name", None) + if name is not None: + config_dict["name"] = name + device = config_xml.attrib.get("device", None) + config_dict["device"] = device + return config_dict + except Exception: + # Toss it back up after logging, we can't continue loading at this point. + log.exception("Malformed ObjectStore Configuration XML -- unable to continue") + raise + + +class S3ClientConstructorKwds(TypedDict): + service_name: Literal["s3"] + endpoint_url: NotRequired[str] + region_name: NotRequired[str] + aws_access_key_id: NotRequired[str] + aws_secret_access_key: NotRequired[str] + + +class S3ObjectStore(CachingConcreteObjectStore): + """ + Object store that stores objects as items in an AWS S3 bucket. A local + cache exists that is used as an intermediate location for files between + Galaxy and S3. 
+ """ + + _client: "S3Client" + store_type = "aws_s3_boto3" + cloud = True + + def __init__(self, config, config_dict): + super().__init__(config, config_dict) + self.cache_monitor = None + + auth_dict = config_dict["auth"] + bucket_dict = config_dict["bucket"] + connection_dict = config_dict.get("connection", {}) + cache_dict = config_dict.get("cache") or {} + self.enable_cache_monitor, self.cache_monitor_interval = enable_cache_monitor(config, config_dict) + + self.access_key = auth_dict.get("access_key") + self.secret_key = auth_dict.get("secret_key") + + self.bucket = bucket_dict.get("name") + + self.endpoint_url = connection_dict.get("endpoint_url") + self.region = connection_dict.get("region") + + self.cache_size = cache_dict.get("size") or self.config.object_store_cache_size + self.staging_path = cache_dict.get("path") or self.config.object_store_cache_path + self.cache_updated_data = cache_dict.get("cache_updated_data", True) + + extra_dirs = {e["type"]: e["path"] for e in config_dict.get("extra_dirs", [])} + self.extra_dirs.update(extra_dirs) + + self._initialize() + + def _initialize(self): + if boto3 is None: + raise Exception(NO_BOTO_ERROR_MESSAGE) + + self._configure_connection() + self._start_cache_monitor_if_needed() + + def _configure_connection(self): + log.debug("Configuring S3 Connection") + self._init_client() + if not self._bucket_exists: + self._create_bucket() + + # get_object_url only works on AWS if client is set, so if it wasn't + # fetch it and reset the client now. Skip this logic entirely for other + # non-AWS services by ensuring endpoint_url is not set. + if not self.endpoint_url and not self.region: + response = self._client.get_bucket_location( + Bucket=self.bucket, + ) + if "LocationConstraint" in response: + region = response["LocationConstraint"] + self.region = region + self._init_client() + + def _init_client(self): + # set _client based on current args. 
+ # If access_key is empty use default credential chain + kwds: S3ClientConstructorKwds = { + "service_name": "s3", + } + if self.endpoint_url: + kwds["endpoint_url"] = self.endpoint_url + if self.region: + kwds["region_name"] = self.region + if self.access_key: + kwds["aws_access_key_id"] = self.access_key + kwds["aws_secret_access_key"] = self.secret_key + self._client = boto3.client(**kwds) + + @property + def _bucket_exists(self) -> bool: + try: + self._client.head_bucket(Bucket=self.bucket) + return True + except ClientError as err: + if err.response["ResponseMetadata"]["HTTPStatusCode"] == 404: + return False + raise + + def _create_bucket(self): + kwds = {} + if self.region: + kwds["CreateBucketConfiguration"] = dict(LocationConstraint=self.region) + self._client.create_bucket(Bucket=self.bucket, **kwds) + + @classmethod + def parse_xml(clazz, config_xml): + return parse_config_xml(config_xml) + + def _config_to_dict(self): + return { + "auth": { + "access_key": self.access_key, + "secret_key": self.secret_key, + }, + "bucket": { + "name": self.bucket, + }, + "connection": { + "endpoint_url": self.endpoint_url, + "region": self.region, + }, + "cache": { + "size": self.cache_size, + "path": self.staging_path, + "cache_updated_data": self.cache_updated_data, + }, + } + + def to_dict(self): + as_dict = super().to_dict() + as_dict.update(self._config_to_dict()) + return as_dict + + def _get_remote_size(self, rel_path) -> int: + response = self._client.head_object(Bucket=self.bucket, Key=rel_path) + return response["ContentLength"] + + def _exists_remotely(self, rel_path: str) -> bool: + try: + self._client.head_object(Bucket=self.bucket, Key=rel_path) + return True + except ClientError as e: + if e.response["Error"]["Code"] == "404": + return False + raise + + def _download(self, rel_path: str) -> bool: + local_destination = self._get_cache_path(rel_path) + try: + log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) + if not self._caching_allowed(rel_path): + return False + self._client.download_file(self.bucket, rel_path, local_destination) + return True + except ClientError: + log.exception("Failed to download file from S3") + return False + + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: + try: + self._client.put_object(Body=from_string.encode("utf-8"), Bucket=self.bucket, Key=rel_path) + return True + except ClientError: + log.exception("Trouble pushing to S3 '%s' from string", rel_path) + return False + + def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: + try: + self._client.upload_file(source_file, self.bucket, rel_path) + return True + except ClientError: + log.exception("Trouble pushing to S3 '%s' from file '%s'", rel_path, source_file) + return False + + def _delete_remote_all(self, rel_path: str) -> bool: + try: + for key in self._keys(rel_path): + self._client.delete_object(Bucket=self.bucket, Key=key) + return True + except ClientError: + log.exception("Could not delete blob '%s' from S3", rel_path) + return False + + def _delete_existing_remote(self, rel_path: str) -> bool: + try: + self._client.delete_object(Bucket=self.bucket, Key=rel_path) + return True + except ClientError: + log.exception("Could not delete blob '%s' from S3", rel_path) + return False + + # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 + def _keys(self, prefix="/", delimiter="/", start_after=""): + s3_paginator = self._client.get_paginator("list_objects_v2") + prefix = prefix.lstrip(delimiter) 
+ start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + for page in s3_paginator.paginate(Bucket=self.bucket, Prefix=prefix, StartAfter=start_after): + for content in page.get("Contents", ()): + yield content["Key"] + + def _download_directory_into_cache(self, rel_path, cache_path): + for key in self._keys(rel_path): + local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path)) + + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download the file + self._client.download_file(self.bucket, rel_path, local_file_path) + + def _get_object_url(self, obj, **kwargs): + try: + if self._exists(obj, **kwargs): + rel_path = self._construct_path(obj, **kwargs) + url = self._client.generate_presigned_url( + ClientMethod="get_object", + Params={ + "Bucket": self.bucket, + "Key": rel_path, + }, + ExpiresIn=3600, + HttpMethod="GET", + ) + return url + except ClientError: + log.exception("Failed to generate URL for dataset.") + return None + + def _get_store_usage_percent(self, obj): + return 0.0 + + def shutdown(self): + self._shutdown_cache_monitor() diff --git a/pyproject.toml b/pyproject.toml index 7e18ef256bde..50c5fda83932 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,6 +183,7 @@ types-python-dateutil = "*" types-PyYAML = "*" types-requests = "*" types-six = "*" +"boto3-stubs[s3]" = "*" [tool.ruff] target-version = "py38" diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py index 1229a01e0a74..5f60c007973c 100644 --- a/test/unit/objectstore/test_objectstore.py +++ b/test/unit/objectstore/test_objectstore.py @@ -1,4 +1,5 @@ import os +import random import time from tempfile import ( mkdtemp, @@ -1544,6 +1545,50 @@ def test_real_aws_s3_store(tmp_path): verify_caching_object_store_functionality(tmp_path, object_store) +AMAZON_BOTO3_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML = """ +type: boto3 +store_by: uuid +auth: + access_key: ${access_key} + secret_key: ${secret_key} + +bucket: + name: ${bucket} + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure +""" + + +@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_BUCKET") +def test_real_aws_s3_store_boto3(tmp_path): + template_vars = { + "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"], + "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"], + "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"], + } + with TestConfig(AMAZON_BOTO3_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store): + verify_caching_object_store_functionality(tmp_path, object_store) + + +@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") +def test_real_aws_s3_store_boto3_new_bucket(tmp_path): + rand_int = random.randint(100000, 999999) + template_vars = { + "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"], + "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"], + "bucket": f"mynewbucket{rand_int}", + } + with TestConfig(AMAZON_BOTO3_S3_SIMPLE_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store): + verify_caching_object_store_functionality(tmp_path, object_store) + + AMAZON_CLOUDBRIDGE_TEMPLATE_TEST_CONFIG_YAML = """ type: cloud store_by: uuid @@ -1617,7 +1662,6 @@ def test_aws_via_cloudbridge_store_with_region(tmp_path): 
verify_caching_object_store_functionality(tmp_path, object_store) - GOOGLE_S3_INTEROP_TEMPLATE_TEST_CONFIG_YAML = """ type: generic_s3 store_by: uuid @@ -1639,6 +1683,7 @@ def test_aws_via_cloudbridge_store_with_region(tmp_path): path: database/tmp_azure """ + @skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY") @skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY") @skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET") @@ -1655,6 +1700,43 @@ def test_gcp_via_s3_interop(tmp_path): verify_caching_object_store_functionality(tmp_path, object_store) +GOOGLE_INTEROP_VIA_BOTO3_TEMPLATE_TEST_CONFIG_YAML = """ +type: boto3 +store_by: uuid +auth: + access_key: ${access_key} + secret_key: ${secret_key} + +bucket: + name: ${bucket} + +connection: + endpoint_url: https://storage.googleapis.com + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure +""" + + +@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET") +def test_gcp_via_s3_interop_and_boto3(tmp_path): + template_vars = { + "access_key": os.environ["GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY"], + "secret_key": os.environ["GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY"], + "bucket": os.environ["GALAXY_TEST_GOOGLE_BUCKET"], + } + with TestConfig(GOOGLE_INTEROP_VIA_BOTO3_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as ( + _, + object_store, + ): + verify_caching_object_store_functionality(tmp_path, object_store) + + class MockDataset: def __init__(self, id): self.id = id From 8dadbcf1f569e1be34640a5291005f53572b0cbb Mon Sep 17 00:00:00 2001 From: John Chilton Date: Wed, 8 May 2024 15:24:25 -0400 Subject: [PATCH 32/40] Implement advanced transfer options for boto3. --- lib/galaxy/objectstore/s3_boto3.py | 91 ++++++++++++++++++----- test/unit/objectstore/test_objectstore.py | 42 +++++++++++ 2 files changed, 115 insertions(+), 18 deletions(-) diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py index cd269aad61e4..42739d03cd6b 100644 --- a/lib/galaxy/objectstore/s3_boto3.py +++ b/lib/galaxy/objectstore/s3_boto3.py @@ -4,6 +4,9 @@ import logging import os from typing import ( + Any, + Callable, + Dict, TYPE_CHECKING, ) @@ -19,10 +22,13 @@ try: # Imports are done this way to allow objectstore code to be used outside of Galaxy. 
import boto3 + from boto3.s3.transfer import TransferConfig from botocore.client import ClientError except ImportError: boto3 = None # type: ignore[assignment,unused-ignore] + TransferConfig = None # type: ignore[assignment,unused-ignore,misc] +from galaxy.util import asbool from ._caching_base import CachingConcreteObjectStore from .caching import ( enable_cache_monitor, @@ -35,22 +41,6 @@ ) log = logging.getLogger(__name__) -logging.getLogger("boto").setLevel(logging.INFO) # Otherwise boto is quite noisy - - -def download_directory(bucket, remote_folder, local_path): - # List objects in the specified S3 folder - objects = bucket.list(prefix=remote_folder) - - for obj in objects: - remote_file_path = obj.key - local_file_path = os.path.join(local_path, os.path.relpath(remote_file_path, remote_folder)) - - # Create directories if they don't exist - os.makedirs(os.path.dirname(local_file_path), exist_ok=True) - - # Download the file - obj.get_contents_to_filename(local_file_path) def parse_config_xml(config_xml): @@ -71,6 +61,28 @@ def parse_config_xml(config_xml): region = cn_xml.get("region") cache_dict = parse_caching_config_dict_from_xml(config_xml) + transfer_xml = config_xml.findall("transfer") + if not transfer_xml: + transfer_xml = {} + else: + transfer_xml = transfer_xml[0] + transfer_dict = {} + for prefix in ["", "upload_", "download_"]: + for key in [ + "multipart_threshold", + "max_concurrency", + "multipart_chunksize", + "num_download_attempts", + "max_io_queue", + "io_chunksize", + "use_threads", + "max_bandwidth", + ]: + full_key = f"{prefix}{key}" + value = transfer_xml.get(full_key) + if transfer_xml.get(full_key) is not None: + transfer_dict[full_key] = value + tag, attrs = "extra_dir", ("type", "path") extra_dirs = config_xml.findall(tag) if not extra_dirs: @@ -91,6 +103,7 @@ def parse_config_xml(config_xml): "endpoint_url": endpoint_url, "region": region, }, + "transfer": transfer_dict, "cache": cache_dict, "extra_dirs": extra_dirs, "private": CachingConcreteObjectStore.parse_private_from_config_xml(config_xml), @@ -134,6 +147,26 @@ def __init__(self, config, config_dict): bucket_dict = config_dict["bucket"] connection_dict = config_dict.get("connection", {}) cache_dict = config_dict.get("cache") or {} + transfer_dict = config_dict.get("transfer", {}) + typed_transfer_dict = {} + for prefix in ["", "upload_", "download_"]: + options: Dict[str, Callable[[Any], Any]] = { + "multipart_threshold": int, + "max_concurrency": int, + "multipart_chunksize": int, + "num_download_attempts": int, + "max_io_queue": int, + "io_chunksize": int, + "use_threads": asbool, + "max_bandwidth": int, + } + for key, key_type in options.items(): + full_key = f"{prefix}{key}" + transfer_value = transfer_dict.get(full_key) + if transfer_value is not None: + typed_transfer_dict[full_key] = key_type(transfer_value) + self.transfer_dict = typed_transfer_dict + self.enable_cache_monitor, self.cache_monitor_interval = enable_cache_monitor(config, config_dict) self.access_key = auth_dict.get("access_key") @@ -226,6 +259,7 @@ def _config_to_dict(self): "endpoint_url": self.endpoint_url, "region": self.region, }, + "transfer": self.transfer_dict, "cache": { "size": self.cache_size, "path": self.staging_path, @@ -257,7 +291,8 @@ def _download(self, rel_path: str) -> bool: log.debug("Pulling key '%s' into cache to %s", rel_path, local_destination) if not self._caching_allowed(rel_path): return False - self._client.download_file(self.bucket, rel_path, local_destination) + config = 
self._transfer_config("download") + self._client.download_file(self.bucket, rel_path, local_destination, Config=config) return True except ClientError: log.exception("Failed to download file from S3") @@ -273,7 +308,8 @@ def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: try: - self._client.upload_file(source_file, self.bucket, rel_path) + config = self._transfer_config("upload") + self._client.upload_file(source_file, self.bucket, rel_path, Config=config) return True except ClientError: log.exception("Trouble pushing to S3 '%s' from file '%s'", rel_path, source_file) @@ -336,5 +372,24 @@ def _get_object_url(self, obj, **kwargs): def _get_store_usage_percent(self, obj): return 0.0 + def _transfer_config(self, prefix: Literal["upload", "download"]) -> "TransferConfig": + config = {} + for key in [ + "multipart_threshold", + "max_concurrency", + "multipart_chunksize", + "num_download_attempts", + "max_io_queue", + "io_chunksize", + "use_threads", + "max_bandwidth", + ]: + specific_key = f"{prefix}_key" + if specific_key in self.transfer_dict: + config[key] = self.transfer_dict[specific_key] + elif key in self.transfer_dict: + config[key] = self.transfer_dict[key] + return TransferConfig(**config) + def shutdown(self): self._shutdown_cache_monitor() diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py index 5f60c007973c..91f8320df5c5 100644 --- a/test/unit/objectstore/test_objectstore.py +++ b/test/unit/objectstore/test_objectstore.py @@ -1322,6 +1322,14 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_ reset_cache(object_store.cache_target) assert not object_store.exists(to_delete_dataset) + # Test bigger file to force multi-process. 
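
The transfer options introduced here map directly onto boto3's TransferConfig, and the intent is that an upload_- or download_-prefixed option overrides the unprefixed one for that direction only. A rough standalone sketch of that resolution (option names are the real TransferConfig parameters; the values are illustrative):

    from boto3.s3.transfer import TransferConfig

    TRANSFER_KEYS = [
        "multipart_threshold", "max_concurrency", "multipart_chunksize",
        "num_download_attempts", "max_io_queue", "io_chunksize",
        "use_threads", "max_bandwidth",
    ]

    def build_transfer_config(options: dict, direction: str) -> TransferConfig:
        # direction is "upload" or "download"; a prefixed option such as
        # upload_max_concurrency wins over max_concurrency for that direction only
        kwargs = {}
        for key in TRANSFER_KEYS:
            if f"{direction}_{key}" in options:
                kwargs[key] = options[f"{direction}_{key}"]
            elif key in options:
                kwargs[key] = options[key]
        return TransferConfig(**kwargs)

    # uploads get max_concurrency=10; building the same options for "download"
    # would keep boto3's default concurrency
    build_transfer_config({"multipart_threshold": 10_000_000, "upload_max_concurrency": 10}, "upload")
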
+ big_file_dataset = MockDataset(6) + size = 1024 + path = tmp_path / "big_file.bytes" + with path.open("wb") as f: + f.write(os.urandom(size)) + object_store.update_from_file(big_file_dataset, file_name=hello_path, create=True) + # Test get_object_url returns a read-only URL url = object_store.get_object_url(hello_world_dataset) if check_get_url: @@ -1576,6 +1584,40 @@ def test_real_aws_s3_store_boto3(tmp_path): verify_caching_object_store_functionality(tmp_path, object_store) +AMAZON_BOTO3_S3_MULTITHREAD_TEMPLATE_TEST_CONFIG_YAML = """ +type: boto3 +store_by: uuid +auth: + access_key: ${access_key} + secret_key: ${secret_key} + +bucket: + name: ${bucket} + +transfer: + multipart_threshold: 10 + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure +""" + + +@skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_AWS_BUCKET") +def test_real_aws_s3_store_boto3_multipart(tmp_path): + template_vars = { + "access_key": os.environ["GALAXY_TEST_AWS_ACCESS_KEY"], + "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"], + "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"], + } + with TestConfig(AMAZON_BOTO3_S3_MULTITHREAD_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store): + verify_caching_object_store_functionality(tmp_path, object_store) + + @skip_unless_environ("GALAXY_TEST_AWS_ACCESS_KEY") @skip_unless_environ("GALAXY_TEST_AWS_SECRET_KEY") def test_real_aws_s3_store_boto3_new_bucket(tmp_path): From 8e59873376a6000a720d507e2862d3ffe4ff6e0b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 9 May 2024 09:46:25 -0400 Subject: [PATCH 33/40] Rev object store docs. --- .../sample/object_store_conf.sample.yml | 85 ++++++++++++++++++- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/lib/galaxy/config/sample/object_store_conf.sample.yml b/lib/galaxy/config/sample/object_store_conf.sample.yml index b1b2cb34afec..5fdfcb8d8ed1 100644 --- a/lib/galaxy/config/sample/object_store_conf.sample.yml +++ b/lib/galaxy/config/sample/object_store_conf.sample.yml @@ -135,10 +135,64 @@ backends: store_by: uuid files_dir: /old-fs/galaxy/files + +# There are now four ways to access S3 related services. Two are +# suitable just for AWS services (aws_s3 & cloud), one is +# more suited for non-AWS S3 compatible services (generic_s3), +# and finally boto3 gracefully handles either scenario. +# +# boto3 is built on the newest and most widely used Python client +# outside of Galaxy. It has advanced transfer options and is likely +# the client you should use for new setup. generic_s3 and aws_s3 +# have existed in Galaxy for longer and could perhaps be considered +# more battle tested. Both boto3 and generic_s3 have been tested +# with multiple non-AWS APIs including minio and GCP. The cloud +# implementation is based on CloudBridge and is still supported +# and has been recently tested - the downside is mostly the advanced +# multi-threaded processing options of boto3 are not available +# and it has not been battle tested like aws_s3. + +# +# Sample AWS S3 Object Store configuration (newest boto3 client) +# +type: boto3 +auth: + access_key: ... + secret_key: ... +bucket: + name: unique_bucket_name_all_lowercase +connection: # not strictly needed but more of the API works with this. 
+ region: us-east-1 +transfer: + multipart_threshold: 10000000 + download_max_concurrency: 5 + upload_max_concurrency: 10 + # any of these options: + # multipart_threshold, max_concurrency, multipart_chunksize, + # num_download_attempts, max_io_queue, io_chunksize, use_threads, + # and max_bandwidth + # can be set. By default they will apply to uploads and downloads + # but they can be prefixed with upload_ or download_ as shown above + # to apply to just one scenario. More information about these parameters + # can be found at: + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig + +cache: + path: database/object_store_cache_s3 + size: 1000 + cache_updated_data: true +extra_dirs: + - type: job_work + path: database/job_working_directory_s3 + + + # -# Sample AWS S3 Object Store configuration +# Sample AWS S3 Object Store configuration (legacy boto implementation) # - +# This implementation will use axel automatically for file transfers if it is on +# Galaxy's path. Otherwise, it will use various python-based strategies for multi-part +# upload of large uploads but all downloads will be single threaded. type: aws_s3 auth: access_key: ... @@ -147,6 +201,8 @@ bucket: name: unique_bucket_name_all_lowercase use_reduced_redundancy: false max_chunk_size: 250 +connection: # not strictly needed but more of the API works with this. + region: us-east-1 cache: path: database/object_store_cache_s3 size: 1000 @@ -182,7 +238,30 @@ extra_dirs: path: database/job_working_directory_irods # -# Sample non-AWS S3 Object Store (e.g. swift) configuration +# Sample non-AWS S3 Object Store (e.g. swift) configuration (boto3) +# + +type: boto3 +auth: + access_key: ... + secret_key: ... +bucket: + name: unique_bucket_name_all_lowercase +connection: + endpoint_url: https://swift.example.org:6000/ + # region: some services may make use of region is specified. +cache: + path: database/object_store_cache_swift + size: 1000 + cache_updated_data: true +# transfer: # see transfer options for boto3 above in AWS configuration. +extra_dirs: + - type: job_work + path: database/job_working_directory_swift + + +# +# Sample non-AWS S3 Object Store (e.g. swift) configuration (legacy boto client) # type: generic_s3 From c19e28ec7e5ca17a11260c855fe0af109f17316b Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 9 May 2024 10:28:56 -0400 Subject: [PATCH 34/40] Allow older style connection parameters on newer boto3 object store. Should make migration for admins a bit easier. --- .../sample/object_store_conf.sample.yml | 2 + lib/galaxy/objectstore/s3_boto3.py | 17 +++++++ test/unit/objectstore/test_objectstore.py | 47 ++++++++++++++++++- 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/config/sample/object_store_conf.sample.yml b/lib/galaxy/config/sample/object_store_conf.sample.yml index 5fdfcb8d8ed1..0c96a549f22d 100644 --- a/lib/galaxy/config/sample/object_store_conf.sample.yml +++ b/lib/galaxy/config/sample/object_store_conf.sample.yml @@ -250,6 +250,8 @@ bucket: connection: endpoint_url: https://swift.example.org:6000/ # region: some services may make use of region is specified. + # older style host, port, secure, and conn_path available to generic_s3 work + # here also - Galaxy will just infer a endpoint_url from those. 
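
Roughly, the inference mentioned in the comment above maps the older-style fields onto an endpoint URL like so (an illustrative sketch using the sample swift values from this file; the actual helper is added to lib/galaxy/objectstore/s3_boto3.py in this same patch):

    legacy = {"host": "swift.example.org", "port": 6000, "is_secure": True, "conn_path": "/"}
    scheme = "https" if legacy["is_secure"] else "http"
    endpoint_url = f"{scheme}://{legacy['host']}:{legacy['port']}{legacy['conn_path']}"
    assert endpoint_url == "https://swift.example.org:6000/"
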
cache: path: database/object_store_cache_swift size: 1000 diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py index 42739d03cd6b..1e5b5fe876bb 100644 --- a/lib/galaxy/objectstore/s3_boto3.py +++ b/lib/galaxy/objectstore/s3_boto3.py @@ -43,6 +43,16 @@ log = logging.getLogger(__name__) +def host_to_endpoint(mapping): + # convert older-style boto parameters to boto3 endpoint_url. + host = mapping["host"] + port = mapping.get("port", 6000) + is_secure = asbool(mapping.get("is_secure", "True")) + conn_path = mapping.get("conn_path", "/") + scheme = "https" if is_secure else "http" + return f"{scheme}://{host}:{port}{conn_path}" + + def parse_config_xml(config_xml): try: a_xml = config_xml.findall("auth")[0] @@ -58,6 +68,10 @@ def parse_config_xml(config_xml): else: cn_xml = cn_xml[0] endpoint_url = cn_xml.get("endpoint_url") + + # for admin ease - allow older style host, port, is_secure, conn_path to be used. + if endpoint_url is None and cn_xml.get("host") is not None: + endpoint_url = host_to_endpoint(cn_xml) region = cn_xml.get("region") cache_dict = parse_caching_config_dict_from_xml(config_xml) @@ -175,6 +189,9 @@ def __init__(self, config, config_dict): self.bucket = bucket_dict.get("name") self.endpoint_url = connection_dict.get("endpoint_url") + if self.endpoint_url is None and "host" in connection_dict: + self.endpoint_url = host_to_endpoint(connection_dict) + self.region = connection_dict.get("region") self.cache_size = cache_dict.get("size") or self.config.object_store_cache_size diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py index 91f8320df5c5..8f50d8511fbb 100644 --- a/test/unit/objectstore/test_objectstore.py +++ b/test/unit/objectstore/test_objectstore.py @@ -1614,7 +1614,10 @@ def test_real_aws_s3_store_boto3_multipart(tmp_path): "secret_key": os.environ["GALAXY_TEST_AWS_SECRET_KEY"], "bucket": os.environ["GALAXY_TEST_AWS_BUCKET"], } - with TestConfig(AMAZON_BOTO3_S3_MULTITHREAD_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as (_, object_store): + with TestConfig(AMAZON_BOTO3_S3_MULTITHREAD_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars) as ( + _, + object_store, + ): verify_caching_object_store_functionality(tmp_path, object_store) @@ -1779,6 +1782,48 @@ def test_gcp_via_s3_interop_and_boto3(tmp_path): verify_caching_object_store_functionality(tmp_path, object_store) +GOOGLE_INTEROP_VIA_BOTO3_WITH_LEGACY_PARAMS_TEMPLATE_TEST_CONFIG_YAML = """ +type: boto3 +store_by: uuid +auth: + access_key: ${access_key} + secret_key: ${secret_key} + +bucket: + name: ${bucket} + +connection: + host: storage.googleapis.com + port: 443 + secure: true + conn_pat: '/' + +extra_dirs: +- type: job_work + path: database/job_working_directory_azure +- type: temp + path: database/tmp_azure +""" + + +@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY") +@skip_unless_environ("GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY") +@skip_unless_environ("GALAXY_TEST_GOOGLE_BUCKET") +def test_gcp_via_s3_interop_and_boto3_with_legacy_params(tmp_path): + template_vars = { + "access_key": os.environ["GALAXY_TEST_GOOGLE_INTEROP_ACCESS_KEY"], + "secret_key": os.environ["GALAXY_TEST_GOOGLE_INTEROP_SECRET_KEY"], + "bucket": os.environ["GALAXY_TEST_GOOGLE_BUCKET"], + } + with TestConfig( + GOOGLE_INTEROP_VIA_BOTO3_WITH_LEGACY_PARAMS_TEMPLATE_TEST_CONFIG_YAML, template_vars=template_vars + ) as ( + _, + object_store, + ): + verify_caching_object_store_functionality(tmp_path, object_store) + + class MockDataset: def 
__init__(self, id): self.id = id From 7506f43ae27519134c70d15ca99f289cf7055be7 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 9 May 2024 11:04:07 -0400 Subject: [PATCH 35/40] Drop logging that contains a password. --- lib/galaxy/objectstore/pithos.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 89cfb13dfe59..23f7308485c8 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -92,7 +92,6 @@ def __init__(self, config, config_dict): self.staging_path = self.config.file_path log.info("Parse config_xml for pithos object store") self.config_dict = config_dict - log.debug(self.config_dict) self._initialize() From c6a247322f156c5d18177664acf9c578efb8714e Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 9 May 2024 12:13:52 -0400 Subject: [PATCH 36/40] Revise object store changes based on PR review. --- lib/galaxy/objectstore/_caching_base.py | 13 +++++-------- lib/galaxy/objectstore/irods.py | 4 ++-- lib/galaxy/objectstore/s3.py | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index 30360d71c1d2..43c889c98ba7 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -15,7 +15,6 @@ from galaxy.objectstore import ConcreteObjectStore from galaxy.util import ( directory_hash_id, - ExecutionTimer, unlink, ) from galaxy.util.path import safe_relpath @@ -99,7 +98,6 @@ def _in_cache(self, rel_path: str) -> bool: return os.path.exists(cache_path) def _pull_into_cache(self, rel_path) -> bool: - ipt_timer = ExecutionTimer() # Ensure the cache directory structure exists (e.g., dataset_#_files/) rel_path_dir = os.path.dirname(rel_path) if not os.path.exists(self._get_cache_path(rel_path_dir)): @@ -107,7 +105,6 @@ def _pull_into_cache(self, rel_path) -> bool: # Now pull in the file file_ok = self._download(rel_path) fix_permissions(self.config, self._get_cache_path(rel_path_dir)) - log.debug("_pull_into_cache: %s\n\n\n\n\n\n", ipt_timer) return file_ok def _get_data(self, obj, start=0, count=-1, **kwargs): @@ -146,7 +143,7 @@ def _exists(self, obj, **kwargs): # TODO: Sync should probably not be done here. Add this to an async upload stack? 
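
The methods touched in this hunk implement the cache-first pattern shared by all of these stores; for orientation, the read path reduces to roughly the following (a simplified sketch, not the actual class; download stands in for the backend-specific _download):

    import os

    def get_local_copy(cache_dir: str, rel_path: str, download) -> str:
        """Serve rel_path from the local cache, pulling from the remote store on a miss."""
        cache_path = os.path.join(cache_dir, rel_path)
        if os.path.exists(cache_path) and os.path.getsize(cache_path) > 0:
            return cache_path  # warm, non-empty cache hit
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        if not download(rel_path, cache_path):  # backend-specific: S3, Azure, iRODS, ...
            raise FileNotFoundError(rel_path)
        return cache_path
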
if in_cache and not exists_remotely: - self._push_to_os(rel_path, source_file=self._get_cache_path(rel_path)) + self._push_to_storage(rel_path, source_file=self._get_cache_path(rel_path)) return True elif exists_remotely: return True @@ -180,7 +177,7 @@ def _create(self, obj, **kwargs): if not dir_only: rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{self._get_object_id(obj)}.dat") open(os.path.join(self.staging_path, rel_path), "w").close() - self._push_to_os(rel_path, from_string="") + self._push_to_storage(rel_path, from_string="") return self def _caching_allowed(self, rel_path: str, remote_size: Optional[int] = None) -> bool: @@ -196,7 +193,7 @@ def _caching_allowed(self, rel_path: str, remote_size: Optional[int] = None) -> return False return True - def _push_to_os(self, rel_path, source_file=None, from_string=None): + def _push_to_storage(self, rel_path, source_file=None, from_string=None): source_file = source_file or self._get_cache_path(rel_path) if from_string is None and not os.path.exists(source_file): log.error( @@ -338,7 +335,7 @@ def _update_from_file(self, obj, file_name=None, create=False, **kwargs): else: source_file = self._get_cache_path(rel_path) - self._push_to_os(rel_path, source_file) + self._push_to_storage(rel_path, source_file) else: raise ObjectNotFound( @@ -376,7 +373,7 @@ def _delete_existing_remote(self, rel_path) -> bool: def _delete_remote_all(self, rel_path) -> bool: raise NotImplementedError() - # Do not need to override these if instead replacing _push_to_os + # Do not need to override these if instead replacing _push_to_storage def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: raise NotImplementedError() diff --git a/lib/galaxy/objectstore/irods.py b/lib/galaxy/objectstore/irods.py index c37ee75c72d2..9241c1efe75c 100644 --- a/lib/galaxy/objectstore/irods.py +++ b/lib/galaxy/objectstore/irods.py @@ -371,7 +371,7 @@ def _download(self, rel_path): finally: log.debug("irods_pt _download: %s", ipt_timer) - def _push_to_os(self, rel_path, source_file=None, from_string=None): + def _push_to_storage(self, rel_path, source_file=None, from_string=None): """ Push the file pointed to by ``rel_path`` to the iRODS. Extract folder name from rel_path as iRODS collection name, and extract file name from rel_path @@ -448,7 +448,7 @@ def _push_to_os(self, rel_path, source_file=None, from_string=None): ) return True finally: - log.debug("irods_pt _push_to_os: %s", ipt_timer) + log.debug("irods_pt _push_to_storage: %s", ipt_timer) def _delete(self, obj, entire_dir=False, **kwargs): ipt_timer = ExecutionTimer() diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 9cff7e0fa800..0155077a3ced 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -313,7 +313,7 @@ def _download(self, rel_path): log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self._bucket.name) return False - def _push_to_os(self, rel_path, source_file=None, from_string=None): + def _push_to_storage(self, rel_path, source_file=None, from_string=None): """ Push the file pointed to by ``rel_path`` to the object store naming the key ``rel_path``. If ``source_file`` is provided, push that file instead while From 524f2ae7d18ebb0be97afa1e2056572586b7c395 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 9 May 2024 12:49:11 -0400 Subject: [PATCH 37/40] Prevent incomplete files from sticking around in the object store cache. 
Maybe fixes https://github.com/galaxyproject/galaxy/pull/18117. --- lib/galaxy/objectstore/_caching_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index 43c889c98ba7..fd20b015ff28 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -104,7 +104,10 @@ def _pull_into_cache(self, rel_path) -> bool: os.makedirs(self._get_cache_path(rel_path_dir), exist_ok=True) # Now pull in the file file_ok = self._download(rel_path) - fix_permissions(self.config, self._get_cache_path(rel_path_dir)) + if file_ok: + fix_permissions(self.config, self._get_cache_path(rel_path_dir)) + else: + unlink(self._get_cache_path(rel_path), ignore_errors=True) return file_ok def _get_data(self, obj, start=0, count=-1, **kwargs): From 8d060b3f2d2e867b88853fbd8702f8e754d02891 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Thu, 9 May 2024 12:59:20 -0400 Subject: [PATCH 38/40] Do not allow creation of object stores with non-writable caches. Fixes biggest issue with #16438 - IMO anyway. --- lib/galaxy/objectstore/_caching_base.py | 12 ++++++++++++ lib/galaxy/objectstore/azure_blob.py | 1 + lib/galaxy/objectstore/cloud.py | 1 + lib/galaxy/objectstore/pithos.py | 1 + lib/galaxy/objectstore/rucio.py | 1 + lib/galaxy/objectstore/s3.py | 1 + lib/galaxy/objectstore/s3_boto3.py | 1 + 7 files changed, 18 insertions(+) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index fd20b015ff28..917129186b7d 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -37,6 +37,18 @@ class CachingConcreteObjectStore(ConcreteObjectStore): cache_monitor: Optional[InProcessCacheMonitor] = None cache_monitor_interval: int + def _ensure_staging_path_writable(self): + staging_path = self.staging_path + if not os.path.exists(staging_path): + os.makedirs(staging_path, exist_ok=True) + if not os.path.exists(staging_path): + raise Exception(f"Caching object store created with path '{staging_path}' that does not exist") + + if not os.access(staging_path, os.R_OK): + raise Exception(f"Caching object store created with path '{staging_path}' that does not readable") + if not os.access(staging_path, os.W_OK): + raise Exception(f"Caching object store created with path '{staging_path}' that does not writable") + def _construct_path( self, obj, diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index bc04bf9871df..eb834ea5c4bd 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -111,6 +111,7 @@ def _initialize(self): raise Exception(NO_BLOBSERVICE_ERROR_MESSAGE) self._configure_connection() + self._ensure_staging_path_writable() self._start_cache_monitor_if_needed() def to_dict(self): diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index b6599b4e916d..775e2c8951ba 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -63,6 +63,7 @@ def _initialize(self): self.conn = self._get_connection(self.provider, self.credentials) self.bucket = self._get_bucket(self.bucket_name) + self._ensure_staging_path_writable() self._start_cache_monitor_if_needed() self._init_axel() diff --git a/lib/galaxy/objectstore/pithos.py b/lib/galaxy/objectstore/pithos.py index 23f7308485c8..43697062d9d0 100644 --- a/lib/galaxy/objectstore/pithos.py +++ b/lib/galaxy/objectstore/pithos.py @@ -99,6 
+99,7 @@ def _initialize(self): if KamakiClient is None: raise Exception(NO_KAMAKI_ERROR_MESSAGE) + self._ensure_staging_path_writable() log.info("Authenticate Synnefo account") self._authenticate() log.info("Initialize Pithos+ client") diff --git a/lib/galaxy/objectstore/rucio.py b/lib/galaxy/objectstore/rucio.py index 2274946e375a..4bb6540a34de 100644 --- a/lib/galaxy/objectstore/rucio.py +++ b/lib/galaxy/objectstore/rucio.py @@ -303,6 +303,7 @@ def __init__(self, config, config_dict): self._initialize() def _initialize(self): + self._ensure_staging_path_writable() self._start_cache_monitor_if_needed() def _pull_into_cache(self, rel_path, auth_token): diff --git a/lib/galaxy/objectstore/s3.py b/lib/galaxy/objectstore/s3.py index 0155077a3ced..8c040e7523b8 100644 --- a/lib/galaxy/objectstore/s3.py +++ b/lib/galaxy/objectstore/s3.py @@ -198,6 +198,7 @@ def _initialize(self): "conn_path": self.conn_path, } + self._ensure_staging_path_writable() self._configure_connection() self._bucket = self._get_bucket(self.bucket) self._start_cache_monitor_if_needed() diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py index 1e5b5fe876bb..fb3c237ca07d 100644 --- a/lib/galaxy/objectstore/s3_boto3.py +++ b/lib/galaxy/objectstore/s3_boto3.py @@ -207,6 +207,7 @@ def _initialize(self): if boto3 is None: raise Exception(NO_BOTO_ERROR_MESSAGE) + self._ensure_staging_path_writable() self._configure_connection() self._start_cache_monitor_if_needed() From 52e03dad3ed4c37009c206b08a0d46ce06ad0959 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Fri, 10 May 2024 11:37:57 -0400 Subject: [PATCH 39/40] Various object store extra files handling fixes. - Fix s3 which broke during the migration because we stopped adding a / I thought was silly. - Use the test case I wrote that mimics the integration test failure to also fix the boto3 and azure_blob object stores for these failures. Also make note of a lingering issue that is pretty important --- lib/galaxy/objectstore/__init__.py | 44 ++++++++++++++-------- lib/galaxy/objectstore/_caching_base.py | 15 ++++++-- lib/galaxy/objectstore/azure_blob.py | 35 +++++++++++++++-- lib/galaxy/objectstore/cloud.py | 34 ++++++++++++----- lib/galaxy/objectstore/s3_boto3.py | 17 +++++++-- test/unit/objectstore/test_objectstore.py | 46 +++++++++++++++++++++++ 6 files changed, 155 insertions(+), 36 deletions(-) diff --git a/lib/galaxy/objectstore/__init__.py b/lib/galaxy/objectstore/__init__.py index a09bcf4adb31..412228167971 100644 --- a/lib/galaxy/objectstore/__init__.py +++ b/lib/galaxy/objectstore/__init__.py @@ -55,7 +55,10 @@ from .caching import CacheTarget if TYPE_CHECKING: - from galaxy.model import DatasetInstance + from galaxy.model import ( + Dataset, + DatasetInstance, + ) NO_SESSION_ERROR_MESSAGE = ( "Attempted to 'create' object store entity in configuration with no database session present." @@ -1662,18 +1665,27 @@ def persist_extra_files( if not extra_files_path_name: extra_files_path_name = primary_data.dataset.extra_files_path_name_from(object_store) assert extra_files_path_name - for root, _dirs, files in safe_walk(src_extra_files_path): - extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path)) - extra_dir = os.path.normpath(extra_dir) - for f in files: - if not in_directory(f, src_extra_files_path): - # Unclear if this can ever happen if we use safe_walk ... probably not ? 
- raise MalformedContents(f"Invalid dataset path: {f}") - object_store.update_from_file( - primary_data.dataset, - extra_dir=extra_dir, - alt_name=f, - file_name=os.path.join(root, f), - create=True, - preserve_symlinks=True, - ) + persist_extra_files_for_dataset(object_store, src_extra_files_path, primary_data.dataset, extra_files_path_name) + + +def persist_extra_files_for_dataset( + object_store: ObjectStore, + src_extra_files_path: str, + dataset: "Dataset", + extra_files_path_name: str, +): + for root, _dirs, files in safe_walk(src_extra_files_path): + extra_dir = os.path.join(extra_files_path_name, os.path.relpath(root, src_extra_files_path)) + extra_dir = os.path.normpath(extra_dir) + for f in files: + if not in_directory(f, src_extra_files_path): + # Unclear if this can ever happen if we use safe_walk ... probably not ? + raise MalformedContents(f"Invalid dataset path: {f}") + object_store.update_from_file( + dataset, + extra_dir=extra_dir, + alt_name=f, + file_name=os.path.join(root, f), + create=True, + preserve_symlinks=True, + ) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index 917129186b7d..383e1ca0c57a 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -93,6 +93,9 @@ def _construct_path( assert base return os.path.join(base, rel_path) + # This is how the remote file stores represent folders + rel_path = f"{rel_path}/" + if not dir_only: rel_path = os.path.join(rel_path, alt_name if alt_name else f"dataset_{object_id}.dat") @@ -294,10 +297,14 @@ def _get_filename(self, obj, **kwargs): raise ObjectNotFound(f"objectstore.get_filename, no cache_path: {obj}, kwargs: {kwargs}") def _download_directory_into_cache(self, rel_path, cache_path): - # azure, pithos, irod, and cloud did not do this prior to refactoring so I am assuming - # there is just operations that fail with these object stores, - # I'm placing a no-op here to match their behavior but we should - # maybe implement this for those object stores. + # pithos & irods never did this prior to refactoring so I am assuming + # there is just operations that fail with these object stores. + # As part of the refactoring that resulted in this method + # https://github.com/galaxyproject/galaxy/pull/18117 I wrote test + # cases and I verified the other object stores that didn't implement + # this had issues - I implemented this new functionality in the + # Azure and Cloud object stores to fix those object stores. New + # object stores should definitely override this. 
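
A rough template for such an override might look like the following, assuming the backend can list keys by prefix; list_keys and download_file are stand-ins for backend-specific calls, not real Galaxy methods:

    import os

    def download_directory_into_cache(rel_path: str, cache_path: str, list_keys, download_file):
        # rel_path acts as a key prefix (a "folder") in the remote store
        for key in list_keys(prefix=rel_path):
            local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path))
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
            download_file(key, local_file_path)
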
pass def _delete(self, obj, entire_dir=False, **kwargs): diff --git a/lib/galaxy/objectstore/azure_blob.py b/lib/galaxy/objectstore/azure_blob.py index eb834ea5c4bd..d630b157537a 100644 --- a/lib/galaxy/objectstore/azure_blob.py +++ b/lib/galaxy/objectstore/azure_blob.py @@ -3,6 +3,7 @@ """ import logging +import os from datetime import ( datetime, timedelta, @@ -167,9 +168,20 @@ def _get_remote_size(self, rel_path): log.exception("Could not get size of blob '%s' from Azure", rel_path) return -1 + def _blobs_from(self, rel_path): + return self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) + def _exists_remotely(self, rel_path: str): try: - exists = self._blob_client(rel_path).exists() + is_dir = rel_path[-1] == "/" + if is_dir: + blobs = self._blobs_from(rel_path) + if blobs: + return True + else: + return False + else: + exists = self._blob_client(rel_path).exists() except AzureHttpError: log.exception("Trouble checking existence of Azure blob '%s'", rel_path) return False @@ -185,13 +197,28 @@ def _download(self, rel_path): if not self._caching_allowed(rel_path): return False else: - with open(local_destination, "wb") as f: - self._blob_client(rel_path).download_blob().download_to_stream(f) + self._download_to_file(rel_path, local_destination) return True except AzureHttpError: log.exception("Problem downloading '%s' from Azure", rel_path) return False + def _download_to_file(self, rel_path, local_destination): + with open(local_destination, "wb") as f: + self._blob_client(rel_path).download_blob().download_to_stream(f) + + def _download_directory_into_cache(self, rel_path, cache_path): + blobs = self._blobs_from(rel_path) + for blob in blobs: + key = blob.name + local_file_path = os.path.join(cache_path, os.path.relpath(key, rel_path)) + + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download the file + self._download_to_file(key, local_file_path) + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: try: self._blob_client(rel_path).upload_blob(from_string, overwrite=True) @@ -211,7 +238,7 @@ def _push_file_to_path(self, rel_path: str, source_file: str) -> bool: def _delete_remote_all(self, rel_path: str) -> bool: try: - blobs = self.service.get_container_client(self.container_name).list_blobs(name_starts_with=rel_path) + blobs = self._blobs_from(rel_path) for blob in blobs: log.debug("Deleting from Azure: %s", blob) self._blob_client(blob.name).delete_blob() diff --git a/lib/galaxy/objectstore/cloud.py b/lib/galaxy/objectstore/cloud.py index 775e2c8951ba..79af7a6df8ad 100644 --- a/lib/galaxy/objectstore/cloud.py +++ b/lib/galaxy/objectstore/cloud.py @@ -256,19 +256,35 @@ def _download(self, rel_path): remote_size = key.size if not self._caching_allowed(rel_path, remote_size): return False - if self.use_axel: - log.debug("Parallel pulled key '%s' into cache to %s", rel_path, local_destination) - url = key.generate_url(7200) - return self._axel_download(url, local_destination) - else: - log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) - with open(local_destination, "wb+") as downloaded_file_handle: - key.save_content(downloaded_file_handle) - return True + log.debug("Pulled key '%s' into cache to %s", rel_path, local_destination) + self._download_to(key, local_destination) + return True except Exception: log.exception("Problem downloading key '%s' from S3 bucket '%s'", rel_path, self.bucket.name) return False + def 
_download_directory_into_cache(self, rel_path, cache_path): + # List objects in the specified cloud folder + objects = self.bucket.objects.list(prefix=rel_path) + + for obj in objects: + remote_file_path = obj.name + local_file_path = os.path.join(cache_path, os.path.relpath(remote_file_path, rel_path)) + + # Create directories if they don't exist + os.makedirs(os.path.dirname(local_file_path), exist_ok=True) + + # Download the file + self._download_to(obj, local_file_path) + + def _download_to(self, key, local_destination): + if self.use_axel: + url = key.generate_url(7200) + return self._axel_download(url, local_destination) + else: + with open(local_destination, "wb+") as downloaded_file_handle: + key.save_content(downloaded_file_handle) + def _push_string_to_path(self, rel_path: str, from_string: str) -> bool: try: if not self.bucket.objects.get(rel_path): diff --git a/lib/galaxy/objectstore/s3_boto3.py b/lib/galaxy/objectstore/s3_boto3.py index fb3c237ca07d..eb6f888cdacd 100644 --- a/lib/galaxy/objectstore/s3_boto3.py +++ b/lib/galaxy/objectstore/s3_boto3.py @@ -41,6 +41,10 @@ ) log = logging.getLogger(__name__) +# This object store generates a lot of logging by default, fairly sure it is an anti-pattern +# to just disable library logging. +# logging.getLogger("botocore").setLevel(logging.INFO) +# logging.getLogger("s3transfer").setLevel(logging.INFO) def host_to_endpoint(mapping): @@ -296,8 +300,15 @@ def _get_remote_size(self, rel_path) -> int: def _exists_remotely(self, rel_path: str) -> bool: try: - self._client.head_object(Bucket=self.bucket, Key=rel_path) - return True + is_dir = rel_path[-1] == "/" + if is_dir: + for _ in self._keys(rel_path): + return True + + return False + else: + self._client.head_object(Bucket=self.bucket, Key=rel_path) + return True except ClientError as e: if e.response["Error"]["Code"] == "404": return False @@ -367,7 +378,7 @@ def _download_directory_into_cache(self, rel_path, cache_path): os.makedirs(os.path.dirname(local_file_path), exist_ok=True) # Download the file - self._client.download_file(self.bucket, rel_path, local_file_path) + self._client.download_file(self.bucket, key, local_file_path) def _get_object_url(self, obj, **kwargs): try: diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py index 8f50d8511fbb..e87eeaa68ad4 100644 --- a/test/unit/objectstore/test_objectstore.py +++ b/test/unit/objectstore/test_objectstore.py @@ -1,5 +1,6 @@ import os import random +import shutil import time from tempfile import ( mkdtemp, @@ -12,6 +13,7 @@ from requests import get from galaxy.exceptions import ObjectInvalid +from galaxy.objectstore import persist_extra_files_for_dataset from galaxy.objectstore.azure_blob import AzureBlobObjectStore from galaxy.objectstore.caching import ( CacheTarget, @@ -1330,6 +1332,37 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_ f.write(os.urandom(size)) object_store.update_from_file(big_file_dataset, file_name=hello_path, create=True) + extra_files_dataset = MockDataset(7) + object_store.create(extra_files_dataset) + extra = tmp_path / "extra" + extra.mkdir() + extra_file = extra / "new_value.txt" + extra_file.write_text("My new value") + + persist_extra_files_for_dataset( + object_store, + extra, + extra_files_dataset, # type: ignore[arg-type,unused-ignore] + extra_files_dataset._extra_files_rel_path, + ) + + # kind of a huge problem, the object stores that pass the following + # tests, do not do so if we use Galaxy's cache cleaning which 
does + # not remove directories. The @mvdbeek integration test cache clearing + # here (remove and restore the whole caching directory) does work and + # definitely should work. We just need to also figure out how to make + # reset_cache work here. + + # reset_cache(object_store.cache_target) + shutil.rmtree(object_store.cache_target.path) + os.makedirs(object_store.cache_target.path) + + extra_path = _extra_file_path(object_store, extra_files_dataset) + assert os.path.exists(extra_path) + expected_extra_file = os.path.join(extra_path, "new_value.txt") + assert os.path.exists(expected_extra_file) + assert open(expected_extra_file, "r").read() == "My new value" + # Test get_object_url returns a read-only URL url = object_store.get_object_url(hello_world_dataset) if check_get_url: @@ -1338,6 +1371,15 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_ assert response.text == "Hello World!" +def _extra_file_path(object_store, dataset): + # invoke the magic calls the model layer would invoke here... + if object_store.exists(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path): + return object_store.get_filename(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path) + return object_store.construct_path( + dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path, in_cache=True + ) + + def verify_object_store_functionality(tmp_path, object_store, check_get_url=True): # Test no dataset with id 1 exists. absent_dataset = MockDataset(1) @@ -1835,6 +1877,10 @@ def rel_path_for_uuid_test(self): rel_path = os.path.join(*directory_hash_id(self.uuid)) return rel_path + @property + def _extra_files_rel_path(self): + return f"dataset_{self.uuid}_files" + def _assert_has_keys(the_dict, keys): for key in keys: From 9d1ba548b017b6c5ce42eee5f7cf0e9290c0a4ee Mon Sep 17 00:00:00 2001 From: John Chilton Date: Fri, 10 May 2024 12:05:53 -0400 Subject: [PATCH 40/40] Fix extra files handling for cached object stores when reset_cache called. --- lib/galaxy/objectstore/_caching_base.py | 5 +++- test/unit/objectstore/test_objectstore.py | 29 +++++++++++++---------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lib/galaxy/objectstore/_caching_base.py b/lib/galaxy/objectstore/_caching_base.py index 383e1ca0c57a..b63593ec7c50 100644 --- a/lib/galaxy/objectstore/_caching_base.py +++ b/lib/galaxy/objectstore/_caching_base.py @@ -283,7 +283,10 @@ def _get_filename(self, obj, **kwargs): return cache_path # Check if the file exists in the cache first, always pull if file size in cache is zero - if self._in_cache(rel_path) and (dir_only or os.path.getsize(self._get_cache_path(rel_path)) > 0): + # For dir_only - the cache cleaning may have left empty directories so I think we need to + # always resync the cache. Gotta make sure we're being judicious in out data.extra_files_path + # calls I think. 
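
For context on the two cache-clearing styles exercised by the updated unit test: wiping and recreating the cache directory removes everything, while Galaxy's reset_cache removes cached files but leaves the per-dataset directory skeleton behind, which is why directory existence alone cannot prove a cache hit. A condensed sketch of the difference (not Galaxy's implementation):

    import os
    import shutil

    def hard_reset(cache_dir: str) -> None:
        # remove files and the directory structure
        shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)

    def soft_reset(cache_dir: str) -> None:
        # remove files only; empty per-dataset directories remain behind
        for root, _dirs, files in os.walk(cache_dir):
            for name in files:
                os.unlink(os.path.join(root, name))
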
+ if not dir_only and self._in_cache(rel_path) and os.path.getsize(self._get_cache_path(rel_path)) > 0: return cache_path # Check if the file exists in persistent storage and, if it does, pull it into cache diff --git a/test/unit/objectstore/test_objectstore.py b/test/unit/objectstore/test_objectstore.py index e87eeaa68ad4..e64bbec47625 100644 --- a/test/unit/objectstore/test_objectstore.py +++ b/test/unit/objectstore/test_objectstore.py @@ -1346,14 +1346,12 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_ extra_files_dataset._extra_files_rel_path, ) - # kind of a huge problem, the object stores that pass the following - # tests, do not do so if we use Galaxy's cache cleaning which does - # not remove directories. The @mvdbeek integration test cache clearing - # here (remove and restore the whole caching directory) does work and - # definitely should work. We just need to also figure out how to make - # reset_cache work here. - - # reset_cache(object_store.cache_target) + # The following checks used to exhibit different behavior depending + # on how the cache was cleaned - removing the whole directory vs + # just cleaning up files the way Galaxy's internal caching works with + # reset_cache. So we test both here. + + # hard reset shutil.rmtree(object_store.cache_target.path) os.makedirs(object_store.cache_target.path) @@ -1361,7 +1359,16 @@ def verify_caching_object_store_functionality(tmp_path, object_store, check_get_ assert os.path.exists(extra_path) expected_extra_file = os.path.join(extra_path, "new_value.txt") assert os.path.exists(expected_extra_file) - assert open(expected_extra_file, "r").read() == "My new value" + assert open(expected_extra_file).read() == "My new value" + + # Redo the above test with Galaxy's reset_cache which leaves empty directories + # around. + reset_cache(object_store.cache_target) + extra_path = _extra_file_path(object_store, extra_files_dataset) + assert os.path.exists(extra_path) + expected_extra_file = os.path.join(extra_path, "new_value.txt") + assert os.path.exists(expected_extra_file) + assert open(expected_extra_file).read() == "My new value" # Test get_object_url returns a read-only URL url = object_store.get_object_url(hello_world_dataset) @@ -1375,9 +1382,7 @@ def _extra_file_path(object_store, dataset): # invoke the magic calls the model layer would invoke here... if object_store.exists(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path): return object_store.get_filename(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path) - return object_store.construct_path( - dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path, in_cache=True - ) + return object_store.construct_path(dataset, dir_only=True, extra_dir=dataset._extra_files_rel_path, in_cache=True) def verify_object_store_functionality(tmp_path, object_store, check_get_url=True):
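
Putting the last few commits together, the extra-files round trip these tests guard can be summarised like this (an illustrative sequence modelled on the unit test above; object_store and dataset are assumed to be set up the same way, and the imports mirror the test module rather than being new API):

    import os

    from galaxy.objectstore import persist_extra_files_for_dataset
    from galaxy.objectstore.caching import reset_cache

    def extra_files_round_trip(object_store, dataset, local_extra_dir):
        extra_dir_name = f"dataset_{dataset.uuid}_files"
        # push every file under the local extra-files directory to the remote store
        persist_extra_files_for_dataset(object_store, local_extra_dir, dataset, extra_dir_name)
        # clear the local cache the way Galaxy does (files only; directories may remain)
        reset_cache(object_store.cache_target)
        # resolving the extra-files path now forces a directory re-sync from the remote store
        extra_path = object_store.get_filename(dataset, dir_only=True, extra_dir=extra_dir_name)
        assert os.path.exists(extra_path)
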