diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ddcdda7b..c36666c7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@
 ### Documentation and tutorial enhancements
 - Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761)
 - Fix typos and improve clarify throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825)
+- Add Zarr IO tutorial. @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834)
 
 ## PyNWB 2.5.0 (August 18, 2023)
diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py
new file mode 100644
index 000000000..c6524b64d
--- /dev/null
+++ b/docs/gallery/advanced_io/zarr_io.py
@@ -0,0 +1,98 @@
+"""
+Zarr IO
+=======
+
+Zarr is an alternative backend option for NWB files. It is a Python package that
+provides an implementation of chunked, compressed, N-dimensional arrays. Like HDF5,
+Zarr is a good option for large datasets because it is designed to store data on disk
+and load it into memory only when needed. Zarr is also a good option for parallel
+computing because it supports concurrent reads and writes.
+
+Note that Zarr's native storage format is optimized for cloud object stores
+(e.g., S3). For very large files, Zarr creates many small files, which can cause
+problems on traditional file systems (local disk, GDrive, Dropbox, etc.) due to
+limits on the number of files per directory.
+
+Zarr read and write support is provided by the :hdmf-zarr:`hdmf-zarr package<>`.
+First, create an NWBFile using PyNWB.
+"""
+
+# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png'
+
+
+from datetime import datetime
+from dateutil.tz import tzlocal
+
+import numpy as np
+from pynwb import NWBFile, TimeSeries
+
+# Create the NWBFile. Substitute your NWBFile generation here.
+nwbfile = NWBFile(
+    session_description="my first synthetic recording",
+    identifier="EXAMPLE_ID",
+    session_start_time=datetime.now(tzlocal()),
+    session_id="LONELYMTN",
+)
+
+#######################################################################################
+# Dataset Configuration
+# ---------------------
+# Like HDF5, Zarr provides options to chunk and compress datasets. To leverage these
+# features, replace all :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` with the
+# analogous :py:class:`~hdmf_zarr.utils.ZarrDataIO`, which takes compressors specified
+# by the ``numcodecs`` library. For example, to create a :py:class:`.TimeSeries` with
+# a Zarr backend, use the following:
+
+from numcodecs import Blosc
+from hdmf_zarr import ZarrDataIO
+
+data_with_zarr_data_io = ZarrDataIO(
+    data=np.random.randn(100, 100),
+    chunks=(10, 10),
+    fillvalue=0,
+    compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.SHUFFLE)
+)
+
+#######################################################################################
+# Now add it to the :py:class:`~pynwb.file.NWBFile`.
+
+nwbfile.add_acquisition(
+    TimeSeries(
+        name="synthetic_timeseries",
+        data=data_with_zarr_data_io,
+        unit="m",
+        rate=10e3,
+    )
+)
+
+#######################################################################################
+# Writing to Zarr
+# ---------------
+# To write NWB files to Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the
+# analogous :py:class:`~hdmf_zarr.nwb.NWBZarrIO`, which handles both reading and
+# writing.
+
+from hdmf_zarr.nwb import NWBZarrIO
+import os
+
+path = "zarr_tutorial.nwb.zarr"
+absolute_path = os.path.abspath(path)
+with NWBZarrIO(path=path, mode="w") as io:
+    io.write(nwbfile)
+
+#######################################################################################
+# The absolute path is used here only for testing purposes, to ensure that links and
+# references resolve as expected; using the relative path instead is also fine.
+#
+# Reading from Zarr
+# -----------------
+# To read NWB files from Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the
+# analogous :py:class:`~hdmf_zarr.nwb.NWBZarrIO`.
+
+with NWBZarrIO(path=absolute_path, mode="r") as io:
+    read_nwbfile = io.read()
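+
+#######################################################################################
+# As a quick check (a minimal sketch reusing the ``read_nwbfile`` and the
+# :py:class:`.TimeSeries` created above), slice the data we just wrote back. Zarr
+# loads only the chunks that overlap the requested region, so the full dataset is
+# never read into memory:
+
+read_data = read_nwbfile.acquisition["synthetic_timeseries"].data
+print(read_data[:5, :5])  # touches only the first 10 x 10 chunk on disk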
+
+#######################################################################################
+# .. note::
+#    For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 143d9d2c6..faf7d4a9b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -148,6 +148,7 @@ def __call__(self, filename):
     'fsspec': ("https://filesystem-spec.readthedocs.io/en/latest/", None),
     'nwbwidgets': ("https://nwb-widgets.readthedocs.io/en/latest/", None),
     'nwb-overview': ("https://nwb-overview.readthedocs.io/en/latest/", None),
+    'hdmf-zarr': ("https://hdmf-zarr.readthedocs.io/en/latest/", None),
 }
 
 extlinks = {
@@ -159,6 +160,7 @@ def __call__(self, filename):
     'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', '%s'),
     'dandi': ('https://www.dandiarchive.org/%s', '%s'),
     "nwbinspector": ("https://nwbinspector.readthedocs.io/en/dev/%s", "%s"),
+    'hdmf-zarr': ('https://hdmf-zarr.readthedocs.io/en/latest/%s', '%s'),
 }
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/requirements-doc.txt b/requirements-doc.txt
index 2050f4439..c37aee646 100644
--- a/requirements-doc.txt
+++ b/requirements-doc.txt
@@ -12,3 +12,4 @@ dataframe_image  # used to render large dataframe as image in the sphinx galler
 lxml  # used by dataframe_image when using the matplotlib backend
 hdf5plugin
 dandi>=0.46.6
+hdmf-zarr