Skip to content

Commit

Permalink
VectorData Refactor Expandable (#1158)
Browse files Browse the repository at this point in the history
* prior

* chckpoint

* possible poc

* fix

* finished tests

* dtype

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix again

* fix again

* clean up

* coverage

* Update CHANGELOG.md

* Update CHANGELOG.md

* Update h5tools.py

* Update h5tools.py

* clean up for review

* clean up for review

* clean up

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
mavaylon1 and pre-commit-ci[bot] authored Aug 27, 2024
1 parent 49a60df commit ddd433a
Show file tree
Hide file tree
Showing 6 changed files with 221 additions and 47 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Enhancements
- Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157)
- Added support for datasets to be expandable by default for the HDF5 backend. @mavaylon1 [#1158](https://github.com/hdmf-dev/hdmf/pull/1158)

## HDMF 3.14.3 (July 29, 2024)

Expand Down
31 changes: 24 additions & 7 deletions src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,9 @@ def copy_file(self, **kwargs):
'default': True},
{'name': 'herd', 'type': 'hdmf.common.resources.HERD',
'doc': 'A HERD object to populate with references.',
'default': None})
'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')})
def write(self, **kwargs):
"""Write the container to an HDF5 file."""
if self.__mode == 'r':
Expand Down Expand Up @@ -826,10 +828,15 @@ def close_linked_files(self):
'doc': 'exhaust DataChunkIterators one at a time. If False, exhaust them concurrently',
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None})
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')})
def write_builder(self, **kwargs):
f_builder = popargs('builder', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data',
'exhaust_dci',
'export_source',
kwargs)
self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s"
% (f_builder.name, self.source, kwargs))
for name, gbldr in f_builder.groups.items():
Expand Down Expand Up @@ -1000,6 +1007,8 @@ def _filler():
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')},
returns='the Group that was created', rtype=Group)
def write_group(self, **kwargs):
parent, builder = popargs('parent', 'builder', kwargs)
Expand Down Expand Up @@ -1100,21 +1109,24 @@ def write_link(self, **kwargs):
'default': True},
{'name': 'export_source', 'type': str,
'doc': 'The source of the builders when exporting', 'default': None},
{'name': 'expandable', 'type': bool, 'default': True,
'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')},
returns='the Dataset that was created', rtype=Dataset)
def write_dataset(self, **kwargs): # noqa: C901
""" Write a dataset to HDF5
The function uses other dataset-dependent write functions, e.g,
``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data.
"""
parent, builder = popargs('parent', 'builder', kwargs)
parent, builder, expandable = popargs('parent', 'builder', 'expandable', kwargs)
link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs)
self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name))
if self.get_written(builder):
self.logger.debug(" DatasetBuilder '%s' is already written" % builder.name)
return None
name = builder.name
data = builder.data
matched_spec_shape = builder.spec_shapes
dataio = None
options = dict() # dict with additional
if isinstance(data, H5DataIO):
Expand Down Expand Up @@ -1228,7 +1240,7 @@ def _filler():
return
# If the compound data type contains only regular data (i.e., no references) then we can write it as usual
else:
dset = self.__list_fill__(parent, name, data, options)
dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
# Write a dataset containing references, i.e., a region or object reference.
# NOTE: we can ignore options['io_settings'] for scalar data
elif self.__is_ref(options['dtype']):
Expand Down Expand Up @@ -1323,7 +1335,7 @@ def _filler():
self.__dci_queue.append(dataset=dset, data=data)
# Write a regular in memory array (e.g., numpy array, list etc.)
elif hasattr(data, '__len__'):
dset = self.__list_fill__(parent, name, data, options)
dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options)
# Write a regular scalar dataset
else:
dset = self.__scalar_fill__(parent, name, data, options)
Expand Down Expand Up @@ -1451,7 +1463,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None):
return dset

@classmethod
def __list_fill__(cls, parent, name, data, options=None):
def __list_fill__(cls, parent, name, data, matched_spec_shape, expandable, options=None):
# define the io settings and data type if necessary
io_settings = {}
dtype = None
Expand All @@ -1473,6 +1485,11 @@ def __list_fill__(cls, parent, name, data, options=None):
data_shape = (len(data),)
else:
data_shape = get_data_shape(data)
if expandable:
# Don't override existing settings
if 'maxshape' not in io_settings:
if matched_spec_shape is not None:
io_settings['maxshape'] = matched_spec_shape

# Create the dataset
try:
Expand Down
16 changes: 12 additions & 4 deletions src/hdmf/build/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,9 @@ class DatasetBuilder(BaseBuilder):
'doc': 'The datatype of this dataset.', 'default': None},
{'name': 'attributes', 'type': dict,
'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()},
{'name': 'spec_shapes', 'type': tuple,
'doc': ('The shape(s) defined in the spec.'),
'default': None},
{'name': 'dimension_labels', 'type': tuple,
'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is '
'supplied only on build.'),
Expand All @@ -341,22 +344,27 @@ class DatasetBuilder(BaseBuilder):
{'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None})
def __init__(self, **kwargs):
""" Create a Builder object for a dataset """
name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source',
kwargs
)
name, data, dtype, attributes, spec_shapes, dimension_labels, maxshape, chunks, parent, source = getargs(
'name', 'data', 'dtype', 'attributes', 'spec_shapes', 'dimension_labels', 'maxshape', 'chunks',
'parent', 'source', kwargs)
super().__init__(name, attributes, parent, source)
self['data'] = data
self['attributes'] = _copy.copy(attributes)
self.__dimension_labels = dimension_labels
self.__chunks = chunks
self.__spec_shapes = spec_shapes
self.__maxshape = maxshape
if isinstance(data, BaseBuilder):
if dtype is None:
dtype = self.OBJECT_REF_TYPE
self.__dtype = dtype
self.__name = name

@property
def spec_shapes(self):
"""The shapes defined in the spec."""
return self.__spec_shapes

@property
def data(self):
"""The data stored in the dataset represented by this builder."""
Expand Down
88 changes: 57 additions & 31 deletions src/hdmf/build/objectmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ def get_attribute(self, **kwargs):
def get_attr_value(self, **kwargs):
''' Get the value of the attribute corresponding to this spec from the given container '''
spec, container, manager = getargs('spec', 'container', 'manager', kwargs)

attr_name = self.get_attribute(spec)
if attr_name is None:
return None
Expand Down Expand Up @@ -723,7 +724,10 @@ def build(self, **kwargs):
msg = "'container' must be of type Data with DatasetSpec"
raise ValueError(msg)
spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext)
dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims)
dimension_labels, matched_shape = self.__get_spec_info(container.data,
spec_shape,
spec_dims,
spec_dtype)
if isinstance(spec_dtype, RefSpec):
self.logger.debug("Building %s '%s' as a dataset of references (source: %s)"
% (container.__class__.__name__, container.name, repr(source)))
Expand All @@ -734,6 +738,7 @@ def build(self, **kwargs):
parent=parent,
source=source,
dtype=spec_dtype.reftype,
spec_shapes=matched_shape,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager))
Expand All @@ -748,6 +753,7 @@ def build(self, **kwargs):
parent=parent,
source=source,
dtype=spec_dtype,
spec_shapes=matched_shape,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container,
Expand All @@ -766,6 +772,7 @@ def build(self, **kwargs):
parent=parent,
source=source,
dtype="object",
spec_shapes=matched_shape,
dimension_labels=dimension_labels,
)
manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager))
Expand All @@ -789,6 +796,7 @@ def build(self, **kwargs):
parent=parent,
source=source,
dtype=dtype,
spec_shapes=matched_shape,
dimension_labels=dimension_labels,
)

Expand Down Expand Up @@ -820,10 +828,7 @@ def __check_dset_spec(self, orig, ext):
spec = ext
return dtype, shape, dims, spec

def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple:
if spec_shape is None or spec_dims is None:
return None
data_shape = get_data_shape(data)
def __get_matched_dimension(self, data_shape, spec_shape, spec_dtype=None):
# if shape is a list of allowed shapes, find the index of the shape that matches the data
if isinstance(spec_shape[0], list):
match_shape_inds = list()
Expand All @@ -839,37 +844,58 @@ def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple
break
if match:
match_shape_inds.append(i)
# use the most specific match -- the one with the fewest Nones
if match_shape_inds:
if len(match_shape_inds) == 1:
return tuple(spec_dims[match_shape_inds[0]])
return match_shape_inds

def __get_spec_info(self, data, spec_shape, spec_dims, spec_dtype=None):
"""This will return the dimension labels and shape by matching the data shape to a permissible spec shape."""
if spec_shape is None and spec_dims is None:
return None, None
else:
if spec_dtype is not None and isinstance(spec_dtype, list):
data_shape = (len(data),)
else:
data_shape = get_data_shape(data)

if isinstance(spec_shape[0], list):
match_shape_inds = self.__get_matched_dimension(data_shape, spec_shape, spec_dtype)
if len(match_shape_inds) == 0:
# no matches found
msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None, None
elif len(match_shape_inds) == 1:
if spec_dims is not None:
return tuple(spec_dims[match_shape_inds[0]]), tuple(spec_shape[match_shape_inds[0]])
else:
return spec_dims, tuple(spec_shape[match_shape_inds[0]])
else:
count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds]
index_min_count = count_nones.index(min(count_nones))
best_match_ind = match_shape_inds[index_min_count]
return tuple(spec_dims[best_match_ind])
if spec_dims is not None:
return tuple(spec_dims[best_match_ind]), tuple(spec_shape[best_match_ind])
else:
return spec_dims, tuple(spec_shape[best_match_ind])
else:
# no matches found
msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
else:
if len(data_shape) != len(spec_shape):
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
# check each dimension. None means any length is allowed
match = True
for j, d in enumerate(data_shape):
if spec_shape[j] is not None and spec_shape[j] != d:
match = False
break
if not match:
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None
# shape is a single list of allowed dimension lengths
return tuple(spec_dims)
if len(data_shape) != len(spec_shape):
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None, None
# check each dimension. None means any length is allowed
match = True
for j, d in enumerate(data_shape):
if spec_shape[j] is not None and spec_shape[j] != d:
match = False
break
if not match:
msg = "Shape of data does not match shape in spec '%s'" % self.spec.path
warnings.warn(msg, IncorrectDatasetShapeBuildWarning)
return None, None
# shape is a single list of allowed dimension lengths
if spec_dims is not None:
return tuple(spec_dims), tuple(spec_shape)
else:
return None, tuple(spec_shape)

def __is_reftype(self, data):
if (isinstance(data, AbstractDataChunkIterator) or
Expand Down
64 changes: 64 additions & 0 deletions tests/unit/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,70 @@ def __init__(self, spec):
manager = BuildManager(type_map)
return manager

############################################
# Qux: A test class with variable data shapes
############################################
class QuxData(Data):
pass


class QuxBucket(Container):
"PseudoFile"
@docval(
{"name": "name", "type": str, "doc": "the name of this bucket"},
{"name": "qux_data", "type": QuxData, "doc": "Data with user defined shape."},
)
def __init__(self, **kwargs):
name, qux_data = getargs("name", "qux_data", kwargs)
super().__init__(name=name)
self.__qux_data = qux_data
self.__qux_data.parent = self

@property
def qux_data(self):
return self.__qux_data


def get_qux_buildmanager(shape):
qux_data_spec = DatasetSpec(
doc="A test dataset of references specification with a data type",
name="qux_data",
data_type_def="QuxData",
shape=shape,
dtype='int'
)

qux_bucket_spec = GroupSpec(
doc="A test group specification for a data type containing data type",
data_type_def="QuxBucket",
datasets=[
DatasetSpec(doc="doc", data_type_inc="QuxData"),
],
)

spec_catalog = SpecCatalog()
spec_catalog.register_spec(qux_data_spec, "test.yaml")
spec_catalog.register_spec(qux_bucket_spec, "test.yaml")

namespace = SpecNamespace(
"a test namespace",
CORE_NAMESPACE,
[{"source": "test.yaml"}],
version="0.1.0",
catalog=spec_catalog,
)

namespace_catalog = NamespaceCatalog()
namespace_catalog.add_namespace(CORE_NAMESPACE, namespace)

type_map = TypeMap(namespace_catalog)
type_map.register_container_type(CORE_NAMESPACE, "QuxData", QuxData)
type_map.register_container_type(CORE_NAMESPACE, "QuxBucket", QuxBucket)

manager = BuildManager(type_map)
return manager



############################################
# Baz example data containers and specs
Expand Down
Loading

0 comments on commit ddd433a

Please sign in to comment.