From 049402f4d04732e9602240de2dbff9a7aa475be1 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Fri, 8 Nov 2024 12:13:15 -0800 Subject: [PATCH] Support overwriting an existing file at the same location (#229) Co-authored-by: Ryan Ly --- CHANGELOG.md | 1 + src/hdmf_zarr/backend.py | 46 ++++++++++++++++++++++++++------- tests/unit/base_tests_zarrio.py | 9 ++++--- tests/unit/test_zarrio.py | 23 +++++++++++++++++ 4 files changed, 67 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c34edeb..dc12b091 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements * Added support for Pathlib paths. @mavaylon1 [#212](https://github.com/hdmf-dev/hdmf-zarr/pull/212) * Updated packages used for testing and readthedocs configuration. @mavaylon1, @rly [#214](https://github.com/hdmf-dev/hdmf-zarr/pull/214) +* Add `force_overwite` parameter for `ZarrIO.__init__` to allow overwriting an existing file or directory. @oruebel [#229](https://github.com/hdmf-dev/hdmf-zarr/pull/229) ## 0.9.0 (September 16, 2024) ### Enhancements diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index cb0fea66..f320683c 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -1,6 +1,7 @@ """Module with the Zarr-based I/O-backend for HDMF""" # Python imports import os +import shutil import warnings import numpy as np import tempfile @@ -91,7 +92,7 @@ def can_read(path): 'doc': 'the path to the Zarr file or a supported Zarr store'}, {'name': 'manager', 'type': BuildManager, 'doc': 'the BuildManager to use for I/O', 'default': None}, {'name': 'mode', 'type': str, - 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "w-") ' + 'doc': 'the mode to open the Zarr file with, one of ("w", "r", "r+", "a", "r-"). ' 'the mode r- is used to force open without consolidated metadata in read only mode.'}, {'name': 'synchronizer', 'type': (zarr.ProcessSynchronizer, zarr.ThreadSynchronizer, bool), 'doc': 'Zarr synchronizer to use for parallel I/O. If set to True a ProcessSynchronizer is used.', @@ -102,11 +103,18 @@ def can_read(path): 'default': None}, {'name': 'storage_options', 'type': dict, 'doc': 'Zarr storage options to read remote folders', - 'default': None}) + 'default': None}, + {'name': 'force_overwrite', + 'type': bool, + 'doc': "force overwriting existing object when in 'w' mode. The existing file or directory" + " will be deleted when before opening (even if the object is not Zarr, e.g,. an HDF5 file)", + 'default': False} + ) def __init__(self, **kwargs): self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) - path, manager, mode, synchronizer, object_codec_class, storage_options = popargs( - 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', 'storage_options', kwargs) + path, manager, mode, synchronizer, object_codec_class, storage_options, force_overwrite = popargs( + 'path', 'manager', 'mode', 'synchronizer', 'object_codec_class', + 'storage_options', 'force_overwrite', kwargs) if manager is None: manager = BuildManager(TypeMap(NamespaceCatalog())) if isinstance(synchronizer, bool): @@ -118,6 +126,7 @@ def __init__(self, **kwargs): else: self.__synchronizer = synchronizer self.__mode = mode + self.__force_overwrite = force_overwrite if isinstance(path, Path): path = str(path) self.__path = path @@ -160,25 +169,44 @@ def synchronizer(self): def object_codec_class(self): return self.__codec_cls + @property + def mode(self): + """ + The mode specified by the user when creating the ZarrIO instance. + + NOTE: The Zarr library may not honor the mode. E.g., DirectoryStore in Zarr uses + append mode and does not allow setting a file to read-only mode. + """ + return self.__mode + def open(self): """Open the Zarr file""" if self.__file is None: + # Allow overwriting an existing file (e.g., an HDF5 file). Zarr will normally fail if the + # existing object at the path is a file. So if we are in `w` mode we need to delete the file first + if self.mode == 'w' and self.__force_overwrite: + if isinstance(self.path, (str, Path)) and os.path.exists(self.path): + if os.path.isdir(self.path): # directory + shutil.rmtree(self.path) + else: # File + os.remove(self.path) + # Within zarr, open_consolidated only allows the mode to be 'r' or 'r+'. # As a result, when in other modes, the file will not use consolidated metadata. - if self.__mode != 'r': + if self.mode != 'r': # When we consolidate metadata, we use ConsolidatedMetadataStore. # This interface does not allow for setting items. # In the doc string, it says it is "read only". As a result, we cannot use r+ with consolidate_metadata. # r- is only an internal mode in ZarrIO to force the use of regular open. For Zarr we need to # use the regular mode r when r- is specified - mode_to_use = self.__mode if self.__mode != 'r-' else 'r' + mode_to_use = self.mode if self.mode != 'r-' else 'r' self.__file = zarr.open(store=self.path, mode=mode_to_use, synchronizer=self.__synchronizer, storage_options=self.__storage_options) else: self.__file = self.__open_file_consolidated(store=self.path, - mode=self.__mode, + mode=self.mode, synchronizer=self.__synchronizer, storage_options=self.__storage_options) @@ -343,9 +371,9 @@ def export(self, **kwargs): """Export data read from a file from any backend to Zarr. See :py:meth:`hdmf.backends.io.HDMFIO.export` for more details. """ - if self.__mode != 'w': + if self.mode != 'w': raise UnsupportedOperation("Cannot export to file %s in mode '%s'. Please use mode 'w'." - % (self.source, self.__mode)) + % (self.source, self.mode)) src_io = getargs('src_io', kwargs) write_args, cache_spec = popargs('write_args', 'cache_spec', kwargs) diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index b63bb1cf..c0324642 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -97,7 +97,10 @@ def setUp(self): def tearDown(self): if os.path.exists(self.store): - shutil.rmtree(self.store) + if os.path.isdir(self.store): + shutil.rmtree(self.store) + else: # in case a test created a file instead of a directory + os.remove(self.store) def createReferenceBuilder(self): data_1 = np.arange(100, 200, 10).reshape(2, 5) @@ -117,9 +120,9 @@ def createReferenceBuilder(self): 'ref_dataset': dataset_ref}) return builder - def create_zarr(self, consolidate_metadata=True): + def create_zarr(self, consolidate_metadata=True, force_overwrite=False, mode='a'): builder = self.createReferenceBuilder() - writer = ZarrIO(self.store, mode='a') + writer = ZarrIO(self.store, mode=mode, force_overwrite=force_overwrite) writer.write_builder(builder, consolidate_metadata) writer.close() diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index 824e5ae4..260bf9a4 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -198,6 +198,29 @@ def test_force_open_without_consolidated_fails(self): except ValueError as e: self.fail("ZarrIO.__open_file_consolidated raised an unexpected ValueError: {}".format(e)) +class TestOverwriteExistingFile(ZarrStoreTestCase): + def test_force_overwrite_when_file_exists(self): + """ + Test that we can overwrite a file when opening with `w` mode even if there is + an existing file. Zarr can write into a directory but not a file. + """ + # create a dummy text file + with open(self.store, "w") as file: + file.write("Just a test file used in TestOverwriteExistingFile") + # try to create a Zarr file at the same location (i.e., self.store) as the + # test text file to force overwriting the existing file. + self.create_zarr(force_overwrite=True, mode='w') + + def test_force_overwrite_when_dir_exists(self): + """ + Test that we can overwrite a directory when opening with `w` mode even if there is + an existing directory. + """ + # create a Zarr file + self.create_zarr() + # try to overwrite the existing Zarr file + self.create_zarr(force_overwrite=True, mode='w') + class TestDimensionLabels(BuildDatasetShapeMixin): """