Skip to content

Commit

Permalink
Debug the use of relative links (#46)
Browse files Browse the repository at this point in the history
* Change zarr extension for NWB conversion
* Add external link support for sphinx
* Add tutorial to show how to create an NWB file with NWBZarrIO directly
* Make source path for links relative to the current Zarr file and add function to resolve relative paths on read
* Fix requirement definition in setup.py
* Update requirements-dev.txt
* Update run_tests.yml
* Update test.py
* Update tox.ini
* Updated ToDo and Limitations list in docs
Co-authored-by: mavaylon1 <[email protected]>
  • Loading branch information
oruebel authored Dec 21, 2022
1 parent 439df57 commit 5d2811d
Show file tree
Hide file tree
Showing 11 changed files with 247 additions and 55 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# HDMF-ZARR Changelog

## 0.1.7 (Upcoming)

### Bugs
* Use path relative to the current Zarr file in the definition of links and references to avoid breaking
links when moving Zarr files @oruebel ([#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46))
* Fix bugs in requirements defined in setup.py @oruebel ([#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46))

### Docs
* Add tutorial illustrating how to create a new NWB file with NWBZarrIO @oruebel ([#46](https://github.com/hdmf-dev/hdmf-zarr/pull/46))

## 0.1.0

### New features
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
# ---------------------------------------------
#

zarr_filename = "test_zarr_" + filename
zarr_filename = "test_zarr_" + filename + ".zarr"
hdf_filename = "test_hdf5_" + filename

# Delete our converted HDF5 file from previous runs of this notebook
Expand Down
158 changes: 158 additions & 0 deletions docs/gallery/plot_nwb_zarrio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""
.. _nwbzarrio_tutorial:
Creating NWB files using ``NWBZarrIO``
======================================
Similar to :py:class:`pynwb.NWBHDF5IO`, the :py:class:`~hdmf_zarr.nwb.NWBZarrIO` extends the basic
:py:class:`~hdmf_zarr.backend.ZarrIO` to perform default setup for BuildManager, loading or namespaces etc.,
in the context of the NWB format, to simplify using hdmf-zarr with the NWB data standard. With this we
can use :py:class:`~hdmf_zarr.nwb.NWBZarrIO` directly with the PyNWB API to read/write NWB files to/from Zarr.
I.e., we can follow the standard PyNWB tutorials for using NWB files, and only need to replace
:py:class:`pynwb.NWBHDF5IO` with :py:class:`~hdmf_zarr.nwb.NWBZarrIO` for read/write (and replace the
use of py:class:`H5DataIO` with :py:class:`~hdmf_zarr.utils.ZarrDataIO`).
Creating and NWB extracellular electrophysiology file
-----------------------------------------------------
As an example, we here create an extracellular electrophysiology NWB file.
This example is derived from :pynwb-docs:`Extracellular Electrophysiology <tutorials/domain/ecephys.html>`
tutorial.
"""
# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png'
# Ignore warnings about the development of the ZarrIO backend
import warnings
warnings.filterwarnings('ignore', '.*The ZarrIO backend is experimental*', )

from datetime import datetime
from dateutil.tz import tzlocal

import numpy as np
from pynwb import NWBFile
from pynwb.ecephys import ElectricalSeries, LFP

# Create the NWBFile
nwbfile = NWBFile(
session_description="my first synthetic recording",
identifier="EXAMPLE_ID",
session_start_time=datetime.now(tzlocal()),
experimenter="Dr. Bilbo Baggins",
lab="Bag End Laboratory",
institution="University of Middle Earth at the Shire",
experiment_description="I went on an adventure with thirteen dwarves "
"to reclaim vast treasures.",
session_id="LONELYMTN",
)

# Create a device
device = nwbfile.create_device(
name="array", description="the best array", manufacturer="Probe Company 9000"
)

# Add electrodes and electrode groups to the NWB file
nwbfile.add_electrode_column(name="label", description="label of electrode")

nshanks = 4
nchannels_per_shank = 3
electrode_counter = 0

for ishank in range(nshanks):
# create an electrode group for this shank
electrode_group = nwbfile.create_electrode_group(
name="shank{}".format(ishank),
description="electrode group for shank {}".format(ishank),
device=device,
location="brain area",
)
# add electrodes to the electrode table
for ielec in range(nchannels_per_shank):
nwbfile.add_electrode(
x=5.3, y=1.5, z=8.5, imp=np.nan,
filtering='unknown',
group=electrode_group,
label="shank{}elec{}".format(ishank, ielec),
location="brain area",
)
electrode_counter += 1

# Create a table region to select the electrodes to record from
all_table_region = nwbfile.create_electrode_table_region(
region=list(range(electrode_counter)), # reference row indices 0 to N-1
description="all electrodes",
)

# Add a mock electrical recording acquisition to the NWBFile
raw_data = np.random.randn(50, 4)
raw_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=raw_data,
electrodes=all_table_region,
starting_time=0.0, # timestamp of the first sample in seconds relative to the session start time
rate=20000.0, # in Hz
)
nwbfile.add_acquisition(raw_electrical_series)

# Add a mock LFP processing result to the NWBFile
lfp_data = np.random.randn(50, 4)
lfp_electrical_series = ElectricalSeries(
name="ElectricalSeries",
data=lfp_data,
electrodes=all_table_region,
starting_time=0.0,
rate=200.0,
)
lfp = LFP(electrical_series=lfp_electrical_series)
ecephys_module = nwbfile.create_processing_module(
name="ecephys", description="processed extracellular electrophysiology data"
)
ecephys_module.add(lfp)

# Add mock spike times and units to the NWBFile
nwbfile.add_unit_column(name="quality", description="sorting quality")

poisson_lambda = 20
firing_rate = 20
n_units = 10
for n_units_per_shank in range(n_units):
n_spikes = np.random.poisson(lam=poisson_lambda)
spike_times = np.round(
np.cumsum(np.random.exponential(1 / firing_rate, n_spikes)), 5
)
nwbfile.add_unit(
spike_times=spike_times, quality="good", waveform_mean=[1.0, 2.0, 3.0, 4.0, 5.0]
)

###############################################################################
# Writing the NWB file to Zarr
# -----------------------------
from hdmf_zarr.nwb import NWBZarrIO
import os

path = "ecephys_tutorial.nwb.zarr"
absolute_path = os.path.abspath(path)
with NWBZarrIO(path=path, mode="w") as io:
io.write(nwbfile)

###############################################################################
# Test opening the file
# ---------------------
with NWBZarrIO(path=path, mode="r") as io:
infile = io.read()

###############################################################################
# Test opening with the absolute path instead
# -------------------------------------------
with NWBZarrIO(path=absolute_path, mode="r") as io:
infile = io.read()

###############################################################################
# Test changing the current directory
# ------------------------------------

import os
os.chdir(os.path.abspath(os.path.join(os.getcwd(), "../")))

with NWBZarrIO(path=absolute_path, mode="r") as io:
infile = io.read()
7 changes: 7 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.intersphinx',
'sphinx.ext.extlinks',
'sphinx.ext.napoleon',
'sphinx_gallery.gen_gallery',
]
Expand Down Expand Up @@ -85,6 +86,12 @@
'zarr': ('https://zarr.readthedocs.io/en/stable/', None)
}

# Use this for mapping to external links
extlinks = {
'pynwb-docs': ('https://pynwb.readthedocs.io/en/stable/', '%s'),
'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/', ''),
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/source/figures/gallery_thumbnails.pptx
Binary file not shown.
19 changes: 5 additions & 14 deletions docs/source/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,17 @@ Supported features
- Write/Read of basic data types, strings and compound data types
- Chunking
- Compression and I/O filters
- Links
- Links.
- Object references
- Writing/loading namespaces/specifications
- Iterative data write using :py:class:`~hdmf.data_utils.AbstractDataChunkIterator`

Limitations
^^^^^^^^^^^

- Support for region references is not yet implemented (see :py:class:`hdmf_zarr.backend.ZarrIO.__get_ref`)
- The Zarr backend is currently experimental and may still change.
- Links and reference are not natively supported by Zarr. Links and references are implemented in :py:class:`~hdmf_zarr.backend.ZarrIO` in an OS independent fashion. The backend reserves attributes (see :py:attr:`~hdmf_zarr.backend.ZarrIO.__reserve_attribute`) to store the paths of the target objects (see also :py:meth:`~hdmf_zarr.backend.ZarrIO.__write_link__`, :py:meth:`~hdmf_zarr.backend.ZarrIO.__add_link__`, :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_links`)
- Support for region references is not yet implemented (see `hdmf-zarr issue #47 <https://github.com/hdmf-dev/hdmf-zarr/issues/47>`_).
- Attributes are stored as JSON documents in Zarr (using the DirectoryStore). As such, all attributes must be JSON serializable. The :py:class:`~hdmf_zarr.backend.ZarrIO` backend attempts to cast types to JSON serializable types as much as possible.
- Currently the :py:class:`~hdmf_zarr.backend.ZarrIO` backend uses Zarr's :py:class:`~zarr.storage.DirectoryStore` only. Other `Zarr stores <https://zarr.readthedocs.io/en/stable/api/storage.html>`_ could be added but will require proper treatment of links and references for those backends as links are not supported in Zarr (see `https://github.com/zarr-developers/zarr-python/issues/389 <https://github.com/zarr-developers/zarr-python/issues/389>`_).
- Exporting of HDF5 files with external links is not yet fully implemented/tested

TODO
^^^^

- Resolve reference stored in datasets to the containers
- Add support for RegionReferences
- :py:class:`~hdmf.backends.hdf5.h5tools.HDF5IO` uses the ``export_source`` argument on export. Need to check with Ryan Ly if we need it here as well.
- Handling of external links on export is not yet fully implemented and is missing a few corner cases
- Here we update the PyNWB test harness to add ZarrIO to the rountrip tests, which in turn runs all HDF5 roundtrip tests also for Zarr. This requires changing the test harness in PyNWB, instead it would be useful to be able to "inject" new I/O backends in the test harness so that we can specify those tests here, rather than implementing this in PyNWB and making PyNWB dependent on hdmf-zarr. See the files ``tests/integration/ui_write/base.py`` and ``tests/integration/hdf5/test_modular_storage.py`` as part of `PyNWB #1018 <https://github.com/NeurodataWithoutBorders/pynwb/pull/1018/files>`_ for details (the other parts of the this PR have already been ported to *hdmf-zarr*).
- Currently the :py:class:`~hdmf_zarr.backend.ZarrIO` backend uses Zarr's :py:class:`~zarr.storage.DirectoryStore` only. Other `Zarr stores <https://zarr.readthedocs.io/en/stable/api/storage.html>`_ could be added but will require proper treatment of links and references for those backends as links are not supported in Zarr (see `zarr-python issue #389 <https://github.com/zarr-developers/zarr-python/issues/389>`_).
- Exporting of HDF5 files with external links is not yet fully implemented/tested (see `hdmf-zarr issue #49 <https://github.com/hdmf-dev/hdmf-zarr/issues/49>`_).
- Object references are currently always resolved on read (as are links) rather than being loaded lazily (see `hdmf-zarr issue #50 <https://github.com/hdmf-dev/hdmf-zarr/issues/50>`_).
66 changes: 43 additions & 23 deletions src/hdmf_zarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,20 @@ def __init__(self, **kwargs):
# Codec class to be used. Alternates, e.g., =numcodecs.JSON
self.__codec_cls = numcodecs.pickles.Pickle if object_codec_class is None else object_codec_class
super().__init__(manager, source=path)
warn_msg = ('\033[91m' + 'The ZarrIO backend is experimental. It is under active ' +
'development. The ZarrIO backend may change any time ' +
'and backward compatibility is not guaranteed.' + '\033[0m')
warn_msg = ("The ZarrIO backend is experimental. It is under active development. "
"The ZarrIO backend may change any time and backward compatibility is not guaranteed.")
warnings.warn(warn_msg)

@property
def path(self):
    """The path to the Zarr file as set by the user"""
    return self.__path

@property
def abspath(self):
    """The absolute path to the Zarr file, derived from :py:attr:`path`"""
    return os.path.abspath(self.path)

@property
def synchronizer(self):
return self.__synchronizer
Expand Down Expand Up @@ -445,6 +454,27 @@ def __is_ref(self, dtype):
else:
return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE

def __resolve_ref(self, zarr_ref):
    """
    Get the full path to the object linked to by the zarr reference.

    The function only constructs the path to the target object, but it
    does not check whether the object actually exists.

    :param zarr_ref: Dict with `source` and `path` keys or a `ZarrReference` object
    :return: Absolute path to the linked object
    """
    # Extract the path as defined in the zarr_ref object
    if zarr_ref.get('source', None) is None:
        ref_path = str(zarr_ref['path'])
    elif zarr_ref.get('path', None) is None:
        ref_path = str(zarr_ref['source'])
    else:
        ref_path = os.path.join(zarr_ref['source'], zarr_ref['path'].lstrip("/"))
    # Resolve the (possibly relative) path against the current Zarr file to
    # obtain an absolute path; references are stored relative to this file so
    # that moving the file together with its targets does not break links
    ref_path = os.path.abspath(os.path.join(self.path, ref_path))
    # Return the constructed absolute path
    return ref_path

def __get_ref(self, ref_object):
"""
Create a ZarrReference object that points to the given container
Expand Down Expand Up @@ -476,6 +506,9 @@ def __get_ref(self, ref_object):
source = (builder.source
if (builder.source is not None and os.path.isdir(builder.source))
else self.__path)
# Make the source relative to the current file
source = os.path.relpath(os.path.abspath(source), start=self.abspath)
# Return the ZarrReference object
return ZarrReference(source, path)

def __add_link__(self, parent, target_source, target_path, link_name):
Expand Down Expand Up @@ -513,9 +546,9 @@ def write_link(self, **kwargs):
# When exporting from one source to another, the LinkBuilders.source are not updated, i.e,. the
# builder.source and target_builder.source are not being updated and point to the old file, but
# for internal links (a.k.a, SoftLinks) they will be the same and our target will be part of
# our new file, so we can savely replace the source
# our new file, so we can safely replace the source
if builder.source == target_builder.source:
zarr_ref.source = self.__path
zarr_ref.source = "." # Link should be relative to self
# EXPORT WITH LINKS: Make sure target is written. If is not then if the target points to a
# non-Zarr source, then we need to copy the data instead of writing a
# link to the data
Expand Down Expand Up @@ -998,15 +1031,10 @@ def __read_links(self, zarr_obj, parent):
links = zarr_obj.attrs['zarr_link']
for link in links:
link_name = link['name']
if link['source'] is None:
l_path = str(link['path'])
elif link['path'] is None:
l_path = str(link['source'])
else:
l_path = os.path.join(link['source'], link['path'].lstrip("/"))
l_path = self.__resolve_ref(link)
if not os.path.exists(l_path):
raise ValueError("Found bad link %s in %s to %s" % (link_name, self.__path, l_path))

raise ValueError("Found bad link %s in %s in file %s to %s" %
(link_name, self.__get_path(parent), self.__path, l_path))
target_name = str(os.path.basename(l_path))
target_zarr_obj = zarr.open(l_path, mode='r')
# NOTE: __read_group and __read_dataset return the cached builders if the target has already been built
Expand Down Expand Up @@ -1108,11 +1136,7 @@ def __parse_ref(self, shape, obj_refs, reg_refs, data):
o = data
for i in p:
o = o[i]
source = o['source']
path = o['path']
if source is not None and source != "":
path = os.path.join(source, path.lstrip("/"))

path = self.__resolve_ref(o)
if not os.path.exists(path):
raise ValueError("Found bad link in dataset to %s" % (path))

Expand All @@ -1135,11 +1159,7 @@ def __read_attrs(self, zarr_obj):
if isinstance(v, dict) and 'zarr_dtype' in v:
# TODO Is this the correct way to resolve references?
if v['zarr_dtype'] == 'object':
source = v['value']['source']
path = v['value']['path']
if source is not None and source != "":
path = os.path.join(source, path.lstrip("/"))

path = self.__resolve_ref(v['value'])
if not os.path.exists(path):
raise ValueError("Found bad link in attribute to %s" % (path))

Expand Down
8 changes: 7 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,19 @@ def run_example_tests():


def main():
warnings.warn(
"python test.py is deprecated. Please use pytest to run unit tests and run python test_gallery.py to "
"test Sphinx Gallery files.",
DeprecationWarning
)

# setup and parse arguments
parser = argparse.ArgumentParser('python test.py [options]')
parser.set_defaults(verbosity=1, suites=[])
parser.add_argument('-v', '--verbose', const=2, dest='verbosity', action='store_const', help='run in verbose mode')
parser.add_argument('-q', '--quiet', const=0, dest='verbosity', action='store_const', help='run disabling output')
parser.add_argument('-u', '--unit', action='append_const', const=flags['hdmf_zarr'], dest='suites',
help='run unit tests for hdmf package')
help='run unit tests for hdmf_zarr package')
parser.add_argument('-e', '--example', action='append_const', const=flags['example'], dest='suites',
help='run example tests')
args = parser.parse_args()
Expand Down
Loading

0 comments on commit 5d2811d

Please sign in to comment.