Skip to content

Commit

Permalink
Debug the use of relative links (#46)
Browse files Browse the repository at this point in the history
* Change zarr extension for NWB conversion
* Add external link support for sphinx
* Add tutorial to show how to create an NWB file with NWBZarrIO directly
* Make source path for links relative to the current Zarr file and add function to resolve relative paths on read
* Fix requirement definition in setup.py
* Update requirements-dev.txt
* Update run_tests.yml
* Update test.py
* Update tox.ini
* Updated ToDo and Limitations list in docs
Co-authored-by: mavaylon1 <[email protected]>
  • Loading branch information
oruebel authored Dec 21, 2022
1 parent 439df57 commit 5d2811d
Show file tree
Hide file tree
Showing 11 changed files with 247 additions and 55 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# HDMF-ZARR Changelog

## 0.1.7 (Upcoming)

### Bugs
* Use path relative to the current Zarr file in the definition of links and references to avoid breaking
links when moving Zarr files @oruebel ([#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46))
* Fix bugs in requirements defined in setup.py @oruebel ([#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46))

### Docs
* Add tutorial illustrating how to create a new NWB file with NWBZarrIO @oruebel ([#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46))

## 0.1.0

### New features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
# ---------------------------------------------
#

zarr_filename = "test_zarr_" + filename
zarr_filename = "test_zarr_" + filename + ".zarr"
hdf_filename = "test_hdf5_" + filename

# Delete our converted HDF5 file from previous runs of this notebook
Expand Down
158 changes: 158 additions & 0 deletions docs/gallery/plot_nwb_zarrio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""
.. _nwbzarrio_tutorial:
Creating NWB files using ``NWBZarrIO``
======================================
Similar to :py:class:`pynwb.NWBHDF5IO`, the :py:class:`~hdmf_zarr.nwb.NWBZarrIO` extends the basic
:py:class:`~hdmf_zarr.backend.ZarrIO` to perform default setup for BuildManager, loading or namespaces etc.,
in the context of the NWB format, to simplify using hdmf-zarr with the NWB data standard. With this we
can use :py:class:`~hdmf_zarr.nwb.NWBZarrIO` directly with the PyNWB API to read/write NWB files to/from Zarr.
I.e., we can follow the standard PyNWB tutorials for using NWB files, and only need to replace
:py:class:`pynwb.NWBHDF5IO` with :py:class:`~hdmf_zarr.nwb.NWBZarrIO` for read/write (and replace the
use of py:class:`H5DataIO` with :py:class:`~hdmf_zarr.utils.ZarrDataIO`).
Creating and NWB extracellular electrophysiology file
-----------------------------------------------------
As an example, we here create an extracellular electrophysiology NWB file.
This example is derived from :pynwb-docs:`Extracellular Electrophysiology <tutorials/domain/ecephys.html>`
tutorial.
"""
# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png'
# Ignore warnings about the development of the ZarrIO backend
import warnings
warnings.filterwarnings('ignore', '.*The ZarrIO backend is experimental*', )

from datetime import datetime
from dateutil.tz import tzlocal

import numpy as np
from pynwb import NWBFile
from pynwb.ecephys import ElectricalSeries, LFP

# Create the NWBFile
nwbfile = NWBFile(
session_description="my first synthetic recording",
identifier="EXAMPLE_ID",
session_start_time=datetime.now(tzlocal()),
experimenter="Dr. Bilbo Baggins",
lab="Bag End Laboratory",
institution="University of Middle Earth at the Shire",
experiment_description="I went on an adventure with thirteen dwarves "
"to reclaim vast treasures.",
session_id="LONELYMTN",
)

# Create a device
device = nwbfile.create_device(
name="array", description="the best array", manufacturer="Probe Company 9000"
)

# Add electrodes and electrode groups to the NWB file
nwbfile.add_electrode_column(name="label", description="label of electrode")

nshanks = 4
nchannels_per_shank = 3
electrode_counter = 0

for ishank in range(nshanks):
# create an electrode group for this shank
electrode_group = nwbfile.create_electrode_group(
name="shank{}".format(ishank),
description="electrode group for shank {}".format(ishank),
device=device,
location="brain area",
)
# add electrodes to the electrode table
for ielec in range(nchannels_per_shank):
nwbfile.add_electrode(
x=5.3, y=1.5, z=8.5, imp=np.nan,
filtering='unknown',
group=electrode_group,
label="shank{}elec{}".format(ishank, ielec),
location="brain area",
)
electrode_counter += 1

# Create a table region to select the electrodes to record from
all_table_region = nwbfile.create_electrode_table_region(
region=list(range(electrode_counter)), # reference row indices 0 to N-1
description="all electrodes",
)

# Add a mock electrical recording acquisition to the NWBFile
raw_data = np.random.randn(50, 4)
raw_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=raw_data,
electrodes=all_table_region,
starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time
rate=20000.0, # in Hz
)
nwbfile.add_acquisition(raw_electrical_series)

# Add a mock LFP processing result to the NWBFile
lfp_data = np.random.randn(50, 4)
lfp_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=lfp_data,
electrodes=all_table_region,
starting_time=0.0,
rate=200.0,
)
lfp = LFP(electrical_series=lfp_electrical_series)
ecephys_module = nwbfile.create_processing_module(
name="ecephys", description="processed extracellular electrophysiology data"
)
ecephys_module.add(lfp)

# Add mock spike times and units to the NWBFile
nwbfile.add_unit_column(name="quality", description="sorting quality")

poisson_lambda = 20
firing_rate = 20
n_units = 10
for n_units_per_shank in range(n_units):
n_spikes = np.random.poisson(lam=poisson_lambda)
spike_times = np.round(
np.cumsum(np.random.exponential(1 / firing_rate, n_spikes)), 5
)
nwbfile.add_unit(
spike_times=spike_times, quality="good", waveform_mean=[1.0, 2.0, 3.0, 4.0, 5.0]
)

###############################################################################
# Writing the NWB file to Zarr
# -----------------------------
from hdmf_zarr.nwb import NWBZarrIO
import os

path = "ecephys_tutorial.nwb.zarr"
absolute_path = os.path.abspath(path)
with NWBZarrIO(path=path, mode="w") as io:
io.write(nwbfile)

###############################################################################
# Test opening the file
# ---------------------
with NWBZarrIO(path=path, mode="r") as io:
infile = io.read()

###############################################################################
# Test opening with the absolute path instead
# -------------------------------------------
with NWBZarrIO(path=absolute_path, mode="r") as io:
infile = io.read()

###############################################################################
# Test changing the current directory
# ------------------------------------

import os
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "../")))

with NWBZarrIO(path=absolute_path, mode="r") as io:
infile = io.read()
7 changes: 7 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.extlinks',
'sphinx.ext.napoleon',
'sphinx_gallery.gen_gallery',
]
Expand Down Expand Up @@ -85,6 +86,12 @@
'zarr': ('https://zarr.readthedocs.io/en/stable/', None)
}

# Use this for mapping to external links
extlinks = {
'pynwb-docs': ('https://pynwb.readthedocs.io/en/stable/', '%s'),
'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/', ''),
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/source/figures/gallery_thumbnails.pptx
Binary file not shown.
19 changes: 5 additions & 14 deletions docs/source/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,17 @@ Supported features
- Write/Read of basic data types, strings and compound data types
- Chunking
- Compression and I/O filters
- Links
- Links.
- Object references
- Writing/loading namespaces/specifications
- Iterative data write using :py:class:`~hdmf.data_utils.AbstractDataChunkIterator`

Limitations
^^^^^^^^^^^

- Support for region references is not yet implemented (see :py:class:`hdmf_zarr.backend.ZarrIO.__get_ref`)
- The Zarr backend is currently experimental and may still change.
- Links and reference are not natively supported by Zarr. Links and references are implemented in :py:class:`~hdmf_zarr.backend.ZarrIO` in an OS independent fashion. The backend reserves attributes (see :py:attr:`~hdmf_zarr.backend.ZarrIO.__reserve_attribute`) to store the paths of the target objects (see also :py:meth:`~hdmf_zarr.backend.ZarrIO.__write_link__`, :py:meth:`~hdmf_zarr.backend.ZarrIO.__add_link__`, :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_links`)
- Support for region references is not yet implemented (see `hdmf-zarr issue #47 <https://github.com/hdmf-dev/hdmf-zarr/issues/47>`_).
- Attributes are stored as JSON documents in Zarr (using the DirectoryStore). As such, all attributes must be JSON serializable. The :py:class:`~hdmf_zarr.backend.ZarrIO` backend attempts to cast types to JSON serializable types as much as possible.
- Currently the :py:class:`~hdmf_zarr.backend.ZarrIO` backend uses Zarr's :py:class:`~zarr.storage.DirectoryStore` only. Other `Zarr stores <https://zarr.readthedocs.io/en/stable/api/storage.html>`_ could be added but will require proper treatment of links and references for those backends as links are not supported in Zarr (see `https://github.com/zarr-developers/zarr-python/issues/389 <https://github.com/zarr-developers/zarr-python/issues/389>`_).
- Exporting of HDF5 files with external links is not yet fully implemented/tested

TODO
^^^^

- Resolve reference stored in datasets to the containers
- Add support for RegionReferences
- :py:class:`~hdmf.backends.hdf5.h5tools.HDF5IO` uses the ``export_source`` argument on export. Need to check with Ryan Ly if we need it here as well.
- Handling of external links on export is not yet fully implemented and is missing a few corner cases
- Here we update the PyNWB test harness to add ZarrIO to the rountrip tests, which in turn runs all HDF5 roundtrip tests also for Zarr. This requires changing the test harness in PyNWB, instead it would be useful to be able to "inject" new I/O backends in the test harness so that we can specify those tests here, rather than implementing this in PyNWB and making PyNWB dependent on hdmf-zarr. See the files ``tests/integration/ui_write/base.py`` and ``tests/integration/hdf5/test_modular_storage.py`` as part of `PyNWB #1018 <https://github.com/NeurodataWithoutBorders/pynwb/pull/1018/files>`_ for details (the other parts of the this PR have already been ported to *hdmf-zarr*).
- Currently the :py:class:`~hdmf_zarr.backend.ZarrIO` backend uses Zarr's :py:class:`~zarr.storage.DirectoryStore` only. Other `Zarr stores <https://zarr.readthedocs.io/en/stable/api/storage.html>`_ could be added but will require proper treatment of links and references for those backends as links are not supported in Zarr (see `zarr-python issue #389 <https://github.com/zarr-developers/zarr-python/issues/389>`_).
- Exporting of HDF5 files with external links is not yet fully implemented/tested (see `hdmf-zarr issue #49 <https://github.com/hdmf-dev/hdmf-zarr/issues/49>`_).
- Object references are currently always resolved on read (as are links) rather than being loaded lazily (see `hdmf-zarr issue #50 <https://github.com/hdmf-dev/hdmf-zarr/issues/50>`_).
66 changes: 43 additions & 23 deletions src/hdmf_zarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,20 @@ def __init__(self, **kwargs):
# Codec class to be used. Alternates, e.g., =numcodecs.JSON
self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class
super().__init__(manager, source=path)
warn_msg = ('\033[91m' + 'The ZarrIO backend is experimental. It is under active ' +
'development. The ZarrIO backend may change any time ' +
'and backward compatibility is not guaranteed.' + '\033[0m')
warn_msg = ("The ZarrIO backend is experimental. It is under active development. "
"The ZarrIO backend may change any time and backward compatibility is not guaranteed.")
warnings.warn(warn_msg)

@property
def path(self):
    """The path to the Zarr file as set by the user"""
    return self.__path

@property
def abspath(self):
    """The absolute path to the Zarr file, derived from :py:attr:`path`"""
    return os.path.abspath(self.path)

@property
def synchronizer(self):
return self.__synchronizer
Expand Down Expand Up @@ -445,6 +454,27 @@ def __is_ref(self, dtype):
else:
return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE

def __resolve_ref(self, zarr_ref):
    """
    Get the full path to the object linked to by the zarr reference.

    The function only constructs the path to the target object, but it
    does not check whether the object actually exists.

    :param zarr_ref: Dict with `source` and `path` keys or a `ZarrReference` object
    :return: Absolute path to the linked object
    """
    # Extract the path as defined in the zarr_ref object
    if zarr_ref.get('source', None) is None:
        ref_path = str(zarr_ref['path'])
    elif zarr_ref.get('path', None) is None:
        ref_path = str(zarr_ref['source'])
    else:
        ref_path = os.path.join(zarr_ref['source'], zarr_ref['path'].lstrip("/"))
    # Resolve the (possibly relative) path against the current Zarr file to
    # obtain an absolute path; references are stored relative to this file so
    # that moving the file together with its targets does not break links
    ref_path = os.path.abspath(os.path.join(self.path, ref_path))
    # Return the constructed absolute path
    return ref_path

def __get_ref(self, ref_object):
"""
Create a ZarrReference object that points to the given container
Expand Down Expand Up @@ -476,6 +506,9 @@ def __get_ref(self, ref_object):
source = (builder.source
if (builder.source is not None and os.path.isdir(builder.source))
else self.__path)
# Make the source relative to the current file
source = os.path.relpath(os.path.abspath(source), start=self.abspath)
# Return the ZarrReference object
return ZarrReference(source, path)

def __add_link__(self, parent, target_source, target_path, link_name):
Expand Down Expand Up @@ -513,9 +546,9 @@ def write_link(self, **kwargs):
# When exporting from one source to another, the LinkBuilders.source are not updated, i.e,. the
# builder.source and target_builder.source are not being updated and point to the old file, but
# for internal links (a.k.a, SoftLinks) they will be the same and our target will be part of
# our new file, so we can savely replace the source
# our new file, so we can safely replace the source
if builder.source == target_builder.source:
zarr_ref.source = self.__path
zarr_ref.source = "." # Link should be relative to self
# EXPORT WITH LINKS: Make sure target is written. If is not then if the target points to a
# non-Zarr source, then we need to copy the data instead of writing a
# link to the data
Expand Down Expand Up @@ -998,15 +1031,10 @@ def __read_links(self, zarr_obj, parent):
links = zarr_obj.attrs['zarr_link']
for link in links:
link_name = link['name']
if link['source'] is None:
l_path = str(link['path'])
elif link['path'] is None:
l_path = str(link['source'])
else:
l_path = os.path.join(link['source'], link['path'].lstrip("/"))
l_path = self.__resolve_ref(link)
if not os.path.exists(l_path):
raise ValueError("Found bad link %s in %s to %s" % (link_name, self.__path, l_path))

raise ValueError("Found bad link %s in %s in file %s to %s" %
(link_name, self.__get_path(parent), self.__path, l_path))
target_name = str(os.path.basename(l_path))
target_zarr_obj = zarr.open(l_path, mode='r')
# NOTE: __read_group and __read_dataset return the cached builders if the target has already been built
Expand Down Expand Up @@ -1108,11 +1136,7 @@ def __parse_ref(self, shape, obj_refs, reg_refs, data):
o = data
for i in p:
o = o[i]
source = o['source']
path = o['path']
if source is not None and source != "":
path = os.path.join(source, path.lstrip("/"))

path = self.__resolve_ref(o)
if not os.path.exists(path):
raise ValueError("Found bad link in dataset to %s" % (path))

Expand All @@ -1135,11 +1159,7 @@ def __read_attrs(self, zarr_obj):
if isinstance(v, dict) and 'zarr_dtype' in v:
# TODO Is this the correct way to resolve references?
if v['zarr_dtype'] == 'object':
source = v['value']['source']
path = v['value']['path']
if source is not None and source != "":
path = os.path.join(source, path.lstrip("/"))

path = self.__resolve_ref(v['value'])
if not os.path.exists(path):
raise ValueError("Found bad link in attribute to %s" % (path))

Expand Down
8 changes: 7 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,19 @@ def run_example_tests():


def main():
warnings.warn(
"python test.py is deprecated. Please use pytest to run unit tests and run python test_gallery.py to "
"test Sphinx Gallery files.",
DeprecationWarning
)

# setup and parse arguments
parser = argparse.ArgumentParser('python test.py [options]')
parser.set_defaults(verbosity=1, suites=[])
parser.add_argument('-v', '--verbose', const=2, dest='verbosity', action='store_const', help='run in verbose mode')
parser.add_argument('-q', '--quiet', const=0, dest='verbosity', action='store_const', help='run disabling output')
parser.add_argument('-u', '--unit', action='append_const', const=flags['hdmf_zarr'], dest='suites',
help='run unit tests for hdmf package')
help='run unit tests for hdmf_zarr package')
parser.add_argument('-e', '--example', action='append_const', const=flags['example'], dest='suites',
help='run example tests')
args = parser.parse_args()
Expand Down
Loading

0 comments on commit 5d2811d

Please sign in to comment.