From 1e76df2dcb25b2952e65a65cdcb536f39fd1b36e Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 15 Dec 2022 01:02:58 -0800 Subject: [PATCH 1/6] Add documention on the Zarr storage definition --- docs/source/conf.py | 8 + docs/source/index.rst | 1 + docs/source/overview.rst | 12 +- docs/source/storage.rst | 352 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 365 insertions(+), 8 deletions(-) create mode 100644 docs/source/storage.rst diff --git a/docs/source/conf.py b/docs/source/conf.py index 78482e37..2dfedaef 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -58,6 +58,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.intersphinx', + 'sphinx.ext.extlinks', 'sphinx.ext.napoleon', 'sphinx_gallery.gen_gallery', ] @@ -85,6 +86,13 @@ 'zarr': ('https://zarr.readthedocs.io/en/stable/', None) } +# Use this for mapping to external links +extlinks = { + 'pynwb-docs': ('https://pynwb.readthedocs.io/en/stable/', '%s'), + 'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/', '%s'), + 'zarr-docs': ('https://zarr.readthedocs.io/en/stable/', '%s') +} + # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] diff --git a/docs/source/index.rst b/docs/source/index.rst index 42fdcab7..f08738db 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -30,6 +30,7 @@ Citing hdmf-zarr :maxdepth: 2 :caption: For Developers: + storage hdmf_zarr Indices and tables diff --git a/docs/source/overview.rst b/docs/source/overview.rst index f014ba14..42c34e10 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -10,11 +10,8 @@ Zarr Backend and Utilities * :py:class:`~hdmf_zarr.nwb.NWBZarrIO` uses :py:class:`~hdmf_zarr.backend.ZarrIO` to define a Zarr backend store for integration with PyNWB to simplify the use of `hdmf_zarr` with NWB (similar to :py:class:`~pynwb.NWBHDF5IO` in PyNWB) * :py:mod:`~hdmf_zarr.utils` implements utility classes for the :py:class:`~hdmf_zarr.backend.ZarrIO` backend. For end-users the :py:class:`~hdmf_zarr.utils.ZarrDataIO` class is relevant for defining advanced I/O options for datasets. -Features and Known Limitations ------------------------------- - Supported features -^^^^^^^^^^^^^^^^^^^ +------------------ - Write/Read of basic data types, strings and compound data types - Chunking @@ -24,12 +21,11 @@ Supported features - Writing/loading namespaces/specifications - Iterative data write using :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` -Limitations -^^^^^^^^^^^ +Known Limitations +----------------- -- Support for region references is not yet implemented (see :py:class:`hdmf_zarr.backend.ZarrIO.__get_ref`) +- Support for region references is not yet implemented. See also :ref:`sec-zarr-storage-references-region` for details. - The Zarr backend is currently experimental and may still change. -- Links and reference are not natively supported by Zarr. Links and references are implemented in :py:class:`~hdmf_zarr.backend.ZarrIO` in an OS independent fashion. 
The backend reserves attributes (see :py:attr:`~hdmf_zarr.backend.ZarrIO.__reserve_attribute`) to store the paths of the target objects (see also :py:meth:`~hdmf_zarr.backend.ZarrIO.__write_link__`, :py:meth:`~hdmf_zarr.backend.ZarrIO.__add_link__`, :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_links`) - Attributes are stored as JSON documents in Zarr (using the DirectoryStore). As such, all attributes must be JSON serializable. The :py:class:`~hdmf_zarr.backend.ZarrIO` backend attempts to cast types to JSON serializable types as much as possible. - Currently the :py:class:`~hdmf_zarr.backend.ZarrIO` backend uses Zarr's :py:class:`~zarr.storage.DirectoryStore` only. Other `Zarr stores <https://zarr.readthedocs.io/en/stable/api/storage.html>`_ could be added but will require proper treatment of links and references for those backends as links are not supported in Zarr (see `https://github.com/zarr-developers/zarr-python/issues/389 <https://github.com/zarr-developers/zarr-python/issues/389>`_. - Exporting of HDF5 files with external links is not yet fully implemented/tested diff --git a/docs/source/storage.rst b/docs/source/storage.rst new file mode 100644 index 00000000..dfb00cd3 --- /dev/null +++ b/docs/source/storage.rst @@ -0,0 +1,352 @@ +.. _sec-zarr-storage: + +======== +Storage +======== + +hdmf-zarr currently uses the Zarr :zarr-docs:`DirectoryStore <api/storage.html#zarr.storage.DirectoryStore>`, +which uses directories and files on a standard file system to serialize data. Below we describe how HDMF data is mapped to Zarr for storage. + +Format Mapping +============== + +Here we describe the mapping of HDMF primitives (e.g., Groups, Datasets, Attributes, Links, etc.) used by +the HDMF schema language to Zarr storage primitives. HDMF data modeling primitives were originally designed +with HDF5 in mind. However, Zarr uses very similar primitives, and as such the high-level mapping between +HDMF schema and Zarr storage is overall fairly simple. The main complication is that Zarr does not support +links and references (see `Zarr issue #389 <https://github.com/zarr-developers/zarr-python/issues/389>`_) +and as such they have to be implemented by hdmf-zarr. + +.. tabularcolumns:: |p{4cm}|p{11cm}| + +.. 
table:: Mapping of groups + :class: longtable + + ============= =============================================== + NWB Primitive Zarr Primitive + ============= =============================================== + Group Group + Dataset Dataset + Attribute Attribute + Link Stored as JSON formatted Attributes + ============= =============================================== + +Mapping of HDMF specification language keys +=========================================== + +Here we describe the mapping of keys from the HDMF specification language to Zarr storage objects: + +.. _sec-zarr-storage-groups: + +Groups +------ + +.. tabularcolumns:: |p{4cm}|p{11cm}| + +.. table:: Mapping of groups + :class: longtable + + ============================ ====================================================================================== + NWB Key Zarr + ============================ ====================================================================================== + name Name of the Group in Zarr + doc Zarr attribute ``doc`` on the Zarr group + groups Zarr groups within the Zarr group + datasets Zarr datasets within the Zarr group + attributes Zarr attributes on the Zarr group + links Stored as JSON formatted attributes on the Zarr Group + linkable Not mapped; Stored in schema only + quantity Not mapped; Number of appearances of the group + neurodata_type Attribute ``neurodata_type`` on the Zarr Group + namespace ID Attribute ``namespace`` on the Zarr Group + object ID Attribute ``object_id`` on the Zarr Group + ============================ ====================================================================================== + +.. _sec-zarr-storage-datasets: + +Datasets +-------- + +.. tabularcolumns:: |p{4cm}|p{11cm}| + +.. 
table:: Mapping of datasets + :class: longtable + + ============================ ========================================================================================================== + HDMF Specification Key Zarr + ============================ ========================================================================================================== + name Name of the dataset in Zarr + doc Zarr attribute ``doc`` on the Zarr dataset + dtype Data type of the Zarr dataset (see `dtype mappings`_ table) and stored in the ``zarr_dtype`` reserved attribute + shape Shape of the Zarr dataset if the shape is fixed, otherwise shape defines the maxshape + dims Not mapped + attributes Zarr attributes on the Zarr dataset + linkable Not mapped; Stored in schema only + quantity Not mapped; Number of appearances of the dataset + neurodata_type Attribute ``neurodata_type`` on the Zarr dataset + namespace ID Attribute ``namespace`` on the Zarr dataset + object ID Attribute ``object_id`` on the Zarr dataset + ============================ =========================================================================================================== + +.. note:: + + * TODO Update mapping of dims + +.. _sec-zarr-storage-attributes: + +Attributes +---------- + +.. tabularcolumns:: |p{4cm}|p{11cm}| + +.. 
table:: Mapping of attributes + :class: longtable + + ============================ ====================================================================================== + HDMF Specification Key Zarr + ============================ ====================================================================================== + name Name of the attribute in Zarr + doc Not mapped; Stored in schema only + dtype Data type of the Zarr attribute + shape Shape of the Zarr attribute if the shape is fixed, otherwise shape defines the maxshape + dims Not mapped; Reflected by the shape of the attribute data + required Not mapped; Stored in schema only + value Data value of the attribute + ============================ ====================================================================================== + +.. note:: + + Attributes are stored as JSON documents in Zarr (using the DirectoryStore). As such, all attributes + must be JSON serializable. The :py:class:`~hdmf_zarr.backend.ZarrIO` backend attempts to cast types + (e.g., numpy arrays) to JSON serializable types as much as possible, but not all possible types may + be supported. + +.. _sec-zarr-storage-attributes-reserved: + +Reserved attributes +^^^^^^^^^^^^^^^^^^^ + +The :py:class:`~hdmf_zarr.backend.ZarrIO` backend defines a set of reserved attribute names defined in +py:attr:`~hdmf_zarr.backend.ZarrIO.__reserve_attribute`. These reserved attributes are used to implement +functionality (e.g., links and object references) that are not natively supported by Zarr. + + ============================ ====================================================================================== + Reserved Attribute Name Usage + ============================ ====================================================================================== + zarr_link Attribute used to store links. See :ref:`sec-zarr-storage-links` for details. + zarr_dtype Attribute used to specify the data type of a dataset. 
This is used to implement the + storage of object references as part of datasets. + See :ref:`sec-zarr-storage-references` + ============================ ====================================================================================== + +.. _sec-zarr-storage-links: + +Links +----- + +Similar to soft links in a file system, a link is an object in a Group that links to another Group or Dataset, +either within the same Zarr file or another external Zarr file. Links and reference are not natively supported by +Zarr but are implemented in :py:class:`~hdmf_zarr.backend.ZarrIO` in an OS independent fashion using the ``zarr_link`` +reserved attribute (see :py:attr:`~hdmf_zarr.backend.ZarrIO.__reserve_attribute`) to store a list of dicts serialized +as JSON. Each dict (i.e., element) in the list defines a link, with each dict containing the following keys: + +* ``name`` : Name of the link +* ``source`` : Relative path to the root of the Zarr file containing the linked object. For links + pointing to an object within the same Zarr file, the value of source will be ``"."``. For external + links that point ot object in another Zarr file, the value of source will be the path to + the other Zarr file relative to the root path of the Zarr file containing the link. +* ``path`` : Path to the linked object within the Zarr file idenfied by the ``source`` key + +For example: + +.. code-block:: json + + "zarr_link": [ + { + "name": "device", + "path": "/general/devices/array", + "source": "." + } + ] + +.. tabularcolumns:: |p{4cm}|p{11cm}| + +.. table:: Mapping of links + :class: longtable + + ============================ ====================================================================================== + HDMF Specification Key Zarr + ============================ ====================================================================================== + name Name of the link + doc Not mapped; Stored in schema only + target_type Not mapped. 
The target type is determined by the type of the target of the link + ============================ ====================================================================================== + + +.. hint:: + + In Zarr, attributes are stored in JSON as part of the hidden ``.zattrs`` file in the folder defining + the Group or Dataset. + +.. hint:: + + In :py:class:`~hdmf_zarr.backend.ZarrIO`, links are written by the + :py:meth:`~hdmf_zarr.backend.ZarrIO.__write_link__` function, which also uses the helper functions + i) :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` to construct :py:class:`~hdmf_zarr.utils.ZarrReference` + and ii) :py:meth:`~hdmf_zarr.backend.ZarrIO.__add_link__` to add a link to the Zarr file. + :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_links` then parses links and also uses the + :py:meth:`~hdmf_zarr.backend.ZarrIO.__resolve_ref` helper function to resolve the paths stored in links. + + +.. _sec-zarr-storage-references: + +Object References +----------------- + +Object references behave much the same way as Links, with the key difference that they are stored as part +of datasets or attributes. This approach allows for storage of large collections of references as values +of multi-dimensional arrays (i.e., the data type of the array is a reference type). + +Storing object references in Datasets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To identify that a dataset contains object reference, the reserved attribute ``zarr_dtype`` is set to +``'object'`` (see also :ref:`sec-zarr-storage-attributes-reserved`). In this way, we can unambiguously +determine if a dataset stores references that need to be resolved. + +Similar to Links, object references are defined via dicts, which are stored as elements of +the Dataset. In contrast to links, individual object reference do not have a ``name`` but are identified +by their location (i.e., index) in the dataset. 
As such, object references only have the ``source`` with +the relative path to the target Zarr file, and the ``path`` identifying the object within the source +Zarr file.The individual object references are defined in the +:py:class:`~hdmf_zarr.backend.ZarrIO` as py:class:`~hdmf_zarr.utils.ZarrReference` object created via +the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function. + +By default, :py:class:`~hdmf_zarr.backend.ZarrIO` uses the ``numcodecs.pickles.Pickle`` codec to +encode object references defined as py:class:`~hdmf_zarr.utils.ZarrReference` dicts in datasets. +Users may set the codec used to encode objects in Zarr datasets via the ``object_codec_class`` +parameter of the :py:func:`~hdmf_zarr.backend.ZarrIO.__init__` constructor of +:py:class:`~hdmf_zarr.backend.ZarrIO`. E.g., we could use +``ZarrIO( ... , object_codec_class=numcodecs.JSON)`` to serialize objects using JSON. + +Storing object references in Attributes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +{'zarr_dtype': type_str, 'value': refs} + + + +.. _sec-zarr-storage-references-region: + +Region references +----------------- + +Region references, are similar to object references, but instead of references other Datasets or Groups, +region references link to subsets of another Dataset. To identify region references, the reserved attribute +``zarr_dtype`` is set to ``'region'`` (see also :ref:`sec-zarr-storage-attributes-reserved`). In addition +to the ``source`` and ``path``, the py:class:`~hdmf_zarr.utils.ZarrReference` object will also need to +store the definition of the ``region`` that is being referenced, e.g., a slice or list on point indices. + +.. warning:: + + Region references are not yet fully implemented in :py:class:`~hdmf_zarr.backend.ZarrIO`. 
+ To implement region references will require updating: + 1) py:class:`~hdmf_zarr.utils.ZarrReference` to add a ``region`` key to support storing + the selection for the region, + 2) :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` to support passing in the region definition to + be added to the py:class:`~hdmf_zarr.utils.ZarrReference`, + 3) :py:meth:`~hdmf_zarr.backend.ZarrIO.write_dataset` already partially implements the required + logic for creating region references by checking for :py:class:`hdmf.build.RegionBuilder` inputs + but will likely need updates as well + 4) :py:meth:`~hdmf_zarr.backend.ZarrIO.~__read_dataset` to support reading region references, + which may also require updates to :py:meth:`~hdmf_zarr.backend.ZarrIO.~__parse_ref` and + :py:meth:`~hdmf_zarr.backend.ZarrIO.~__resolve_ref`, and 5) and possibly other parts of + :py:class:`~hdmf_zarr.backend.ZarrIO` + + + + + +* In datasets +* In attributes {'zarr_dtype': type_str, 'value': refs} + +.. _sec-zarr-storage-dtypes: + +dtype mappings +-------------- + +The mappings of data types is as follows + + +--------------------------+----------------------------------+----------------+ + | ``dtype`` **spec value** | **storage type** | **size** | + +--------------------------+----------------------------------+----------------+ + | * "float" | single precision floating point | 32 bit | + | * "float32" | | | + +--------------------------+----------------------------------+----------------+ + | * "double" | double precision floating point | 64 bit | + | * "float64" | | | + +--------------------------+----------------------------------+----------------+ + | * "long" | signed 64 bit integer | 64 bit | + | * "int64" | | | + +--------------------------+----------------------------------+----------------+ + | * "int" | signed 32 bit integer | 32 bit | + | * "int32" | | | + +--------------------------+----------------------------------+----------------+ + | * "int16" | signed 16 bit integer | 16 bit | + 
+--------------------------+----------------------------------+----------------+ + | * "int8" | signed 8 bit integer | 8 bit | + +--------------------------+----------------------------------+----------------+ + | * "uint32" | unsigned 32 bit integer | 32 bit | + +--------------------------+----------------------------------+----------------+ + | * "uint16" | unsigned 16 bit integer | 16 bit | + +--------------------------+----------------------------------+----------------+ + | * "uint8" | unsigned 8 bit integer | 8 bit | + +--------------------------+----------------------------------+----------------+ + | * "bool" | boolean | 8 bit | + +--------------------------+----------------------------------+----------------+ + | * "text" | unicode | variable | + | * "utf" | | | + | * "utf8" | | | + | * "utf-8" | | | + +--------------------------+----------------------------------+----------------+ + | * "ascii" | ascii | variable | + | * "str" | | | + +--------------------------+----------------------------------+----------------+ + | * "ref" | Reference to another group or | | + | * "reference" | dataset | | + | * "object" | | | + +--------------------------+----------------------------------+----------------+ + | * region | Reference to a region | | + | | of another dataset | | + +--------------------------+----------------------------------+----------------+ + | * compound dtype | HDF5 compound data type | | + +--------------------------+----------------------------------+----------------+ + | * "isodatetime" | ASCII ISO8061 datetime string. | variable | + | | For example | | + | | ``2018-09-28T14:43:54.123+02:00``| | + +--------------------------+----------------------------------+----------------+ + + +Caching format specifications +============================= + +In practice it is useful to cache the specification a file was created with (including extensions) +directly in the HDF5 file. 
Caching the specification in the file ensures that users can access +the specification directly if necessary without requiring external resources. However, the mechanisms for +caching format specifications is likely different for different storage backends and is not +part of the NWB format specification itself. For the HDF5 backend, caching of the schema is implemented as follows. + +The HDF5 backend adds the reserved top-level group ``/specifications`` in which all format specifications (including +extensions) are cached. The ``/specifications`` group contains for each specification namespace a subgroup +``/specifications/<namespace-name>/<version>`` in which the specification for a particular version of a namespace +are stored (e.g., ``/specifications/core/2.0.1`` in the case of the NWB core namespace at version 2.0.1). +The actual specification data is then stored as a JSON string in scalar datasets with a binary, variable-length string +data type (e.g., ``dtype=special_dtype(vlen=binary_type)`` in Python). The specification of the namespace is stored in +``/specifications/<namespace-name>/<version>/namespace`` while additional source files are stored in +``/specifications/<namespace-name>/<version>/<source-filename>``. Here ``<source-filename>`` refers to the main name +of the source-file without file extension (e.g., the core namespace defines ``nwb.ephys.yaml`` as source which would +be stored in ``/specifications/core/2.0.1/nwb.ecephys``). + + + + From 2658062a05ce96f45e8366deb0265dffce305e5e Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 15 Dec 2022 01:27:30 -0800 Subject: [PATCH 2/6] Updated storage docs --- docs/source/storage.rst | 150 +++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 70 deletions(-) diff --git a/docs/source/storage.rst b/docs/source/storage.rst index dfb00cd3..23b714a2 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -72,9 +72,9 @@ Datasets .. 
table:: Mapping of datasets :class: longtable - ============================ ========================================================================================================== + ============================ ====================================================================================================================== HDMF Specification Key Zarr - ============================ ========================================================================================================== + ============================ ====================================================================================================================== name Name of the dataset in Zarr doc Zarr attribute ``doc`` on the Zarr dataset dtype Data type of the Zarr dataset (see `dtype mappings`_ table) and stored in the ``zarr_dtype`` reserved attribute @@ -86,7 +86,7 @@ Datasets neurodata_type Attribute ``neurodata_type`` on the Zarr dataset namespace ID Attribute ``namespace`` on the Zarr dataset object ID Attribute ``object_id`` on the Zarr dataset - ============================ =========================================================================================================== + ============================ ====================================================================================================================== .. note:: @@ -218,7 +218,7 @@ Similar to Links, object references are defined via dicts, which are stored as e the Dataset. In contrast to links, individual object reference do not have a ``name`` but are identified by their location (i.e., index) in the dataset. As such, object references only have the ``source`` with the relative path to the target Zarr file, and the ``path`` identifying the object within the source -Zarr file.The individual object references are defined in the +Zarr file. 
The individual object references are defined in the :py:class:`~hdmf_zarr.backend.ZarrIO` as py:class:`~hdmf_zarr.utils.ZarrReference` object created via the :py:meth:`~hdmf_zarr.backend.ZarrIO.__get_ref` helper function. @@ -232,9 +232,26 @@ parameter of the :py:func:`~hdmf_zarr.backend.ZarrIO.__init__` constructor of Storing object references in Attributes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -{'zarr_dtype': type_str, 'value': refs} +Object references are stored in attributes as dicts with the following keys: +* ``zarr_dtype`` : Indicating the data type for the attribute. For object references + ``zarr_dtype`` is set to ``"object"`` (or ``"region"`` for :ref:`sec-zarr-storage-references-region`) +* ``value``: The value of the object references, i.e., here the :py:class:`~hdmf_zarr.utils.ZarrReference` + dictionary with the ``source`` and ``path`` keys defining the object reference (again, ``source`` is + here the relative path to the target Zarr file, and ``path`` identifies the object within the source + Zarr file). +For example in NWB, the attribute ``ElectricalSeries.electrodes.table`` would be defined as follows: + +.. code-block:: json + + "table": { + "value": { + "path": "/general/extracellular_ephys/electrodes", + "source": "." + }, + "zarr_dtype": "object" + } .. 
_sec-zarr-storage-references-region: @@ -258,17 +275,10 @@ store the definition of the ``region`` that is being referenced, e.g., a slice o 3) :py:meth:`~hdmf_zarr.backend.ZarrIO.write_dataset` already partially implements the required logic for creating region references by checking for :py:class:`hdmf.build.RegionBuilder` inputs but will likely need updates as well - 4) :py:meth:`~hdmf_zarr.backend.ZarrIO.~__read_dataset` to support reading region references, - which may also require updates to :py:meth:`~hdmf_zarr.backend.ZarrIO.~__parse_ref` and - :py:meth:`~hdmf_zarr.backend.ZarrIO.~__resolve_ref`, and 5) and possibly other parts of - :py:class:`~hdmf_zarr.backend.ZarrIO` - - - - - -* In datasets -* In attributes {'zarr_dtype': type_str, 'value': refs} + 4) :py:meth:`~hdmf_zarr.backend.ZarrIO.__read_dataset` to support reading region references, + which may also require updates to :py:meth:`~hdmf_zarr.backend.ZarrIO.__parse_ref` and + :py:meth:`~hdmf_zarr.backend.ZarrIO.__resolve_ref`, and + 5) possibly other parts of :py:class:`~hdmf_zarr.backend.ZarrIO` .. 
_sec-zarr-storage-dtypes: @@ -277,71 +287,71 @@ dtype mappings The mappings of data types is as follows - +--------------------------+----------------------------------+----------------+ - | ``dtype`` **spec value** | **storage type** | **size** | - +--------------------------+----------------------------------+----------------+ - | * "float" | single precision floating point | 32 bit | - | * "float32" | | | - +--------------------------+----------------------------------+----------------+ - | * "double" | double precision floating point | 64 bit | - | * "float64" | | | - +--------------------------+----------------------------------+----------------+ - | * "long" | signed 64 bit integer | 64 bit | - | * "int64" | | | - +--------------------------+----------------------------------+----------------+ - | * "int" | signed 32 bit integer | 32 bit | - | * "int32" | | | - +--------------------------+----------------------------------+----------------+ - | * "int16" | signed 16 bit integer | 16 bit | - +--------------------------+----------------------------------+----------------+ - | * "int8" | signed 8 bit integer | 8 bit | - +--------------------------+----------------------------------+----------------+ - | * "uint32" | unsigned 32 bit integer | 32 bit | - +--------------------------+----------------------------------+----------------+ - | * "uint16" | unsigned 16 bit integer | 16 bit | - +--------------------------+----------------------------------+----------------+ - | * "uint8" | unsigned 8 bit integer | 8 bit | - +--------------------------+----------------------------------+----------------+ - | * "bool" | boolean | 8 bit | - +--------------------------+----------------------------------+----------------+ - | * "text" | unicode | variable | - | * "utf" | | | - | * "utf8" | | | - | * "utf-8" | | | - +--------------------------+----------------------------------+----------------+ - | * "ascii" | ascii | variable | - | * "str" | | | - 
+--------------------------+----------------------------------+----------------+ - | * "ref" | Reference to another group or | | - | * "reference" | dataset | | - | * "object" | | | - +--------------------------+----------------------------------+----------------+ - | * region | Reference to a region | | - | | of another dataset | | - +--------------------------+----------------------------------+----------------+ - | * compound dtype | HDF5 compound data type | | - +--------------------------+----------------------------------+----------------+ - | * "isodatetime" | ASCII ISO8061 datetime string. | variable | - | | For example | | - | | ``2018-09-28T14:43:54.123+02:00``| | - +--------------------------+----------------------------------+----------------+ + +--------------------------+------------------------------------+----------------+ + | ``dtype`` **spec value** | **storage type** | **size** | + +--------------------------+------------------------------------+----------------+ + | * "float" | single precision floating point | 32 bit | + | * "float32" | | | + +--------------------------+------------------------------------+----------------+ + | * "double" | double precision floating point | 64 bit | + | * "float64" | | | + +--------------------------+------------------------------------+----------------+ + | * "long" | signed 64 bit integer | 64 bit | + | * "int64" | | | + +--------------------------+------------------------------------+----------------+ + | * "int" | signed 32 bit integer | 32 bit | + | * "int32" | | | + +--------------------------+------------------------------------+----------------+ + | * "int16" | signed 16 bit integer | 16 bit | + +--------------------------+------------------------------------+----------------+ + | * "int8" | signed 8 bit integer | 8 bit | + +--------------------------+------------------------------------+----------------+ + | * "uint32" | unsigned 32 bit integer | 32 bit | + 
+--------------------------+------------------------------------+----------------+ + | * "uint16" | unsigned 16 bit integer | 16 bit | + +--------------------------+------------------------------------+----------------+ + | * "uint8" | unsigned 8 bit integer | 8 bit | + +--------------------------+------------------------------------+----------------+ + | * "bool" | boolean | 8 bit | + +--------------------------+------------------------------------+----------------+ + | * "text" | unicode | variable | + | * "utf" | | | + | * "utf8" | | | + | * "utf-8" | | | + +--------------------------+------------------------------------+----------------+ + | * "ascii" | ascii | variable | + | * "str" | | | + +--------------------------+------------------------------------+----------------+ + | * "ref" | Reference to another group or | | + | * "reference" | dataset. See | | + | * "object" | :ref:`sec-zarr-storage-references` | | + +--------------------------+------------------------------------+----------------+ + | * region | Reference to a region | | + | | of another dataset. See | | + | | :ref:`sec-zarr-storage-references` | | + +--------------------------+------------------------------------+----------------+ + | * compound dtype | Compound data type | | + +--------------------------+------------------------------------+----------------+ + | * "isodatetime" | ASCII ISO8601 datetime string. | variable | + | | For example | | + | | ``2018-09-28T14:43:54.123+02:00`` | | + +--------------------------+------------------------------------+----------------+ Caching format specifications ============================= In practice it is useful to cache the specification a file was created with (including extensions) -directly in the HDF5 file. Caching the specification in the file ensures that users can access -the specification directly if necessary without requiring external resources. 
However, the mechanisms for -caching format specifications is likely different for different storage backends and is not -part of the NWB format specification itself. For the HDF5 backend, caching of the schema is implemented as follows. +directly in the Zarr file. Caching the specification in the file ensures that users can access +the specification directly if necessary without requiring external resources. +For the Zarr backend, caching of the schema is implemented as follows. -The HDF5 backend adds the reserved top-level group ``/specifications`` in which all format specifications (including +The Zarr backend adds the reserved top-level group ``/specifications`` in which all format specifications (including extensions) are cached. The ``/specifications`` group contains for each specification namespace a subgroup ``/specifications/<namespace-name>/<version>`` in which the specification for a particular version of a namespace are stored (e.g., ``/specifications/core/2.0.1`` in the case of the NWB core namespace at version 2.0.1). The actual specification data is then stored as a JSON string in scalar datasets with a binary, variable-length string -data type (e.g., ``dtype=special_dtype(vlen=binary_type)`` in Python). The specification of the namespace is stored in +data type. The specification of the namespace is stored in ``/specifications/<namespace-name>/<version>/namespace`` while additional source files are stored in ``/specifications/<namespace-name>/<version>/<source-filename>``. 
Here ``<source-filename>`` refers to the main name of the source-file without file extension (e.g., the core namespace defines ``nwb.ephys.yaml`` as source which would From e7d62f3b5208e8d3220c303555174511589874d1 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Thu, 15 Dec 2022 01:28:36 -0800 Subject: [PATCH 3/6] Updated changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c677f2ce..9370d082 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # HDMF-ZARR Changelog +## 0.1.x (Upcoming) +- Add docs for describing the mapping of HDMF schema to Zarr storage + ## 0.1.0 ### New features From 818664c2a448b20659bd02a747e90b4c05018f8c Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Fri, 16 Dec 2022 14:01:13 -0800 Subject: [PATCH 4/6] Update docs/source/storage.rst Co-authored-by: Ben Dichter --- docs/source/storage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/storage.rst b/docs/source/storage.rst index 23b714a2..bf70d120 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -153,7 +153,7 @@ as JSON. Each dict (i.e., element) in the list defines a link, with each dict co * ``name`` : Name of the link * ``source`` : Relative path to the root of the Zarr file containing the linked object. For links pointing to an object within the same Zarr file, the value of source will be ``"."``. For external - links that point ot object in another Zarr file, the value of source will be the path to + links that point to an object in another Zarr file, the value of source will be the path to the other Zarr file relative to the root path of the Zarr file containing the link.
* ``path`` : Path to the linked object within the Zarr file idenfied by the ``source`` key From e7fddb411caca69a1075d6df35624ea0fad5f419 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Fri, 16 Dec 2022 14:01:25 -0800 Subject: [PATCH 5/6] Update docs/source/storage.rst Co-authored-by: Ben Dichter --- docs/source/storage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/storage.rst b/docs/source/storage.rst index bf70d120..511f4f24 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -258,7 +258,7 @@ For example in NWB, the attribute ``ElectricalSeries.electrodes.table`` would be Region references ----------------- -Region references, are similar to object references, but instead of references other Datasets or Groups, +Region references are similar to object references, but instead of referencing other Datasets or Groups, region references link to subsets of another Dataset. To identify region references, the reserved attribute ``zarr_dtype`` is set to ``'region'`` (see also :ref:`sec-zarr-storage-attributes-reserved`). In addition to the ``source`` and ``path``, the py:class:`~hdmf_zarr.utils.ZarrReference` object will also need to From 01f4555658229293f434c6623d6237bcf377902e Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Thu, 22 Dec 2022 10:43:16 -0800 Subject: [PATCH 6/6] Update storage.rst --- docs/source/storage.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/storage.rst b/docs/source/storage.rst index 511f4f24..f27e9380 100644 --- a/docs/source/storage.rst +++ b/docs/source/storage.rst @@ -124,7 +124,7 @@ Attributes .. _sec-zarr-storage-attributes-reserved: Reserved attributes -^^^^^^^^^^^^^^^^^^^ +------------------- The :py:class:`~hdmf_zarr.backend.ZarrIO` backend defines a set of reserved attribute names defined in py:attr:`~hdmf_zarr.backend.ZarrIO.__reserve_attribute`. These reserved attributes are used to implement @@ -208,7 +208,7 @@ of datasets or attributes.
This approach allows for storage of large collections of multi-dimensional arrays (i.e., the data type of the array is a reference type). Storing object references in Datasets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------- To identify that a dataset contains object reference, the reserved attribute ``zarr_dtype`` is set to ``'object'`` (see also :ref:`sec-zarr-storage-attributes-reserved`). In this way, we can unambiguously @@ -230,7 +230,7 @@ parameter of the :py:func:`~hdmf_zarr.backend.ZarrIO.__init__` constructor of ``ZarrIO( ... , object_codec_class=numcodecs.JSON)`` to serialize objects using JSON. Storing object references in Attributes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------------- Object references are stored in a attributes as dicts with the following keys: