Add Zarr IO tutorial (#1834)

* Add Zarr IO tutorial * Update CHANGELOG.md * Update CHANGELOG.md * add info, add thumbnail * Update docs/gallery/advanced_io/zarr_io.py * Update docs/gallery/advanced_io/zarr_io.py * Update requirements-dev.txt * Discard changes to requirements-dev.txt * Update requirements-doc.txt * Update zarr_io.py * Update docs/gallery/advanced_io/zarr_io.py --------- Co-authored-by: Ryan Ly <[email protected]>
NeurodataWithoutBorders · Jan 29, 2024 · 9c87ffd · 9c87ffd
1 parent 7c6868b
commit 9c87ffd
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 0 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@
 ### Documentation and tutorial enhancements
 - Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761)
 - Fix typos and improve clarify throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825)
+- Add Zarr IO tutorial. @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834)
 
 ## PyNWB 2.5.0 (August 18, 2023)
 

diff --git a/docs/gallery/advanced_io/zarr_io.py b/docs/gallery/advanced_io/zarr_io.py
@@ -0,0 +1,98 @@
+"""
+Zarr IO
+=======
+
+Zarr is an alternative backend option for NWB files. It is a Python package that
+provides an implementation of chunked, compressed, N-dimensional arrays. Zarr is a good
+option for large datasets because, like HDF5, it is designed to store data on disk and
+only load the data into memory when needed. Zarr is also a good option for parallel
+computing because it supports concurrent reads and writes.
+
+Note that the Zarr native storage formats are optimized for cloud object stores
+(e.g., S3). For very large files, Zarr will create many files, which can lead to
+issues for traditional file systems (that are not cloud object stores) due to
+limitations on the number of files per directory (this affects local disk,
+GDrive, Dropbox, etc.).
+
+Zarr read and write support is provided by the :hdmf-zarr:`hdmf-zarr package<>`. First,
+create an NWBFile using PyNWB.
+"""
+
+# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png'
+
+
+from datetime import datetime
+from dateutil.tz import tzlocal
+
+import numpy as np
+from pynwb import NWBFile, TimeSeries
+
+# Create the NWBFile. Substitute your NWBFile generation here.
+nwbfile = NWBFile(
+    session_description="my first synthetic recording",
+    identifier="EXAMPLE_ID",
+    session_start_time=datetime.now(tzlocal()),
+    session_id="LONELYMTN",
+)
+
+#######################################################################################
+# Dataset Configuration
+# ---------------------
+# Like HDF5, Zarr provides options to chunk and compress datasets. To leverage these
+# features, replace all :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` with the analogous
+# :py:class:`~hdmf_zarr.utils.ZarrDataIO`, which takes compressors specified by the
+# `numcodecs` library. For example, to create a :py:class:`.TimeSeries`
+# with a Zarr backend, use the following:
+
+from numcodecs import Blosc
+from hdmf_zarr import ZarrDataIO
+
+data_with_zarr_data_io = ZarrDataIO(
+    data=np.random.randn(100, 100),
+    chunks=(10, 10),
+    fillvalue=0,
+    compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.SHUFFLE)
+)
+
+#######################################################################################
+# Now add it to the `NWBFile`.
+
+nwbfile.add_acquisition(
+    TimeSeries(
+        name="synthetic_timeseries",
+        data=data_with_zarr_data_io,
+        unit="m",
+        rate=10e3,
+    )
+)
+
+#######################################################################################
+# Writing to Zarr
+# ---------------
+# To write NWB files to Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the analogous
+# :py:class:`hdmf_zarr.nwb.NWBZarrIO`.
+
+from hdmf_zarr.nwb import NWBZarrIO
+import os
+
+path = "zarr_tutorial.nwb.zarr"
+absolute_path = os.path.abspath(path)
+with NWBZarrIO(path=path, mode="w") as io:
+    io.write(nwbfile)
+
+#######################################################################################
+# The file was written above using the relative path. The main reason for using
+# ``absolute_path`` when reading below is for testing purposes, to ensure links and
+# references work as expected. Otherwise, using the relative path for reading is fine.
+#
+# Reading from Zarr
+# -----------------
+# To read NWB files from Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the analogous
+# :py:class:`hdmf_zarr.nwb.NWBZarrIO`.
+
+with NWBZarrIO(path=absolute_path, mode="r") as io:
+    read_nwbfile = io.read()
+
+#######################################################################################
+# .. note::
+#    For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -148,6 +148,7 @@ def __call__(self, filename):
     'fsspec': ("https://filesystem-spec.readthedocs.io/en/latest/", None),
     'nwbwidgets': ("https://nwb-widgets.readthedocs.io/en/latest/", None),
     'nwb-overview': ("https://nwb-overview.readthedocs.io/en/latest/", None),
+    'hdmf-zarr': ("https://hdmf-zarr.readthedocs.io/en/latest/", None),
 }
 
 extlinks = {
@@ -159,6 +160,7 @@ def __call__(self, filename):
     'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', '%s'),
     'dandi': ('https://www.dandiarchive.org/%s', '%s'),
     "nwbinspector": ("https://nwbinspector.readthedocs.io/en/dev/%s", "%s"),
+    'hdmf-zarr': ('https://hdmf-zarr.readthedocs.io/en/latest/%s', '%s'),
 }
 
 # Add any paths that contain templates here, relative to this directory.

diff --git a/requirements-doc.txt b/requirements-doc.txt
@@ -12,3 +12,4 @@ dataframe_image   # used to render large dataframe as image in the sphinx galler
 lxml  # used by dataframe_image when using the matplotlib backend
 hdf5plugin
 dandi>=0.46.6
+hdmf-zarr