From ddd433ad5fd0b0a5b68c6e325a31d2e6cabbefd1 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Tue, 27 Aug 2024 16:30:19 -0700 Subject: [PATCH 01/18] VectorData Refactor Expandable (#1158) * prior * chckpoint * possible poc * fix * finished tests * dtype * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix again * fix again * clean up * coverage * Update CHANGELOG.md * Update CHANGELOG.md * Update h5tools.py * Update h5tools.py * clean up for review * clean up for review * clean up --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CHANGELOG.md | 1 + src/hdmf/backends/hdf5/h5tools.py | 31 ++++++++--- src/hdmf/build/builders.py | 16 ++++-- src/hdmf/build/objectmapper.py | 88 +++++++++++++++++++----------- tests/unit/helpers/utils.py | 64 ++++++++++++++++++++++ tests/unit/test_io_hdf5_h5tools.py | 68 +++++++++++++++++++++-- 6 files changed, 221 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a6369094..e0b52b201 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Enhancements - Added support to append to a dataset of references for HDMF-Zarr. @mavaylon1 [#1157](https://github.com/hdmf-dev/hdmf/pull/1157) +- Added support for datasets to be expandable by default for the HDF5 backend. @mavaylon1 [#1158](https://github.com/hdmf-dev/hdmf/pull/1158) ## HDMF 3.14.3 (July 29, 2024) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index 8135d75e7..273918840 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -383,7 +383,9 @@ def copy_file(self, **kwargs): 'default': True}, {'name': 'herd', 'type': 'hdmf.common.resources.HERD', 'doc': 'A HERD object to populate with references.', - 'default': None}) + 'default': None}, + {'name': 'expandable', 'type': bool, 'default': True, + 'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')}) def write(self, **kwargs): """Write the container to an HDF5 file.""" if self.__mode == 'r': @@ -826,10 +828,15 @@ def close_linked_files(self): 'doc': 'exhaust DataChunkIterators one at a time. 
If False, exhaust them concurrently', 'default': True}, {'name': 'export_source', 'type': str, - 'doc': 'The source of the builders when exporting', 'default': None}) + 'doc': 'The source of the builders when exporting', 'default': None}, + {'name': 'expandable', 'type': bool, 'default': True, + 'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')}) def write_builder(self, **kwargs): f_builder = popargs('builder', kwargs) - link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) + link_data, exhaust_dci, export_source = getargs('link_data', + 'exhaust_dci', + 'export_source', + kwargs) self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" % (f_builder.name, self.source, kwargs)) for name, gbldr in f_builder.groups.items(): @@ -1000,6 +1007,8 @@ def _filler(): 'default': True}, {'name': 'export_source', 'type': str, 'doc': 'The source of the builders when exporting', 'default': None}, + {'name': 'expandable', 'type': bool, 'default': True, + 'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')}, returns='the Group that was created', rtype=Group) def write_group(self, **kwargs): parent, builder = popargs('parent', 'builder', kwargs) @@ -1100,6 +1109,8 @@ def write_link(self, **kwargs): 'default': True}, {'name': 'export_source', 'type': str, 'doc': 'The source of the builders when exporting', 'default': None}, + {'name': 'expandable', 'type': bool, 'default': True, + 'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')}, returns='the Dataset that was created', rtype=Dataset) def write_dataset(self, **kwargs): # noqa: C901 """ Write a dataset to HDF5 @@ -1107,7 +1118,7 @@ def write_dataset(self, **kwargs): # noqa: C901 The function uses other dataset-dependent write functions, e.g, ``__scalar_fill__``, ``__list_fill__``, and ``__setup_chunked_dset__`` to write the data. """ - parent, builder = popargs('parent', 'builder', kwargs) + parent, builder, expandable = popargs('parent', 'builder', 'expandable', kwargs) link_data, exhaust_dci, export_source = getargs('link_data', 'exhaust_dci', 'export_source', kwargs) self.logger.debug("Writing DatasetBuilder '%s' to parent group '%s'" % (builder.name, parent.name)) if self.get_written(builder): @@ -1115,6 +1126,7 @@ def write_dataset(self, **kwargs): # noqa: C901 return None name = builder.name data = builder.data + matched_spec_shape = builder.spec_shapes dataio = None options = dict() # dict with additional if isinstance(data, H5DataIO): @@ -1228,7 +1240,7 @@ def _filler(): return # If the compound data type contains only regular data (i.e., no references) then we can write it as usual else: - dset = self.__list_fill__(parent, name, data, options) + dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options) # Write a dataset containing references, i.e., a region or object reference. # NOTE: we can ignore options['io_settings'] for scalar data elif self.__is_ref(options['dtype']): @@ -1323,7 +1335,7 @@ def _filler(): self.__dci_queue.append(dataset=dset, data=data) # Write a regular in memory array (e.g., numpy array, list etc.) 
elif hasattr(data, '__len__'): - dset = self.__list_fill__(parent, name, data, options) + dset = self.__list_fill__(parent, name, data, matched_spec_shape, expandable, options) # Write a regular scalar dataset else: dset = self.__scalar_fill__(parent, name, data, options) @@ -1451,7 +1463,7 @@ def __chunked_iter_fill__(cls, parent, name, data, options=None): return dset @classmethod - def __list_fill__(cls, parent, name, data, options=None): + def __list_fill__(cls, parent, name, data, matched_spec_shape, expandable, options=None): # define the io settings and data type if necessary io_settings = {} dtype = None @@ -1473,6 +1485,11 @@ def __list_fill__(cls, parent, name, data, options=None): data_shape = (len(data),) else: data_shape = get_data_shape(data) + if expandable: + # Don't override existing settings + if 'maxshape' not in io_settings: + if matched_spec_shape is not None: + io_settings['maxshape'] = matched_spec_shape # Create the dataset try: diff --git a/src/hdmf/build/builders.py b/src/hdmf/build/builders.py index cb658b6d4..6ed453166 100644 --- a/src/hdmf/build/builders.py +++ b/src/hdmf/build/builders.py @@ -330,6 +330,9 @@ class DatasetBuilder(BaseBuilder): 'doc': 'The datatype of this dataset.', 'default': None}, {'name': 'attributes', 'type': dict, 'doc': 'A dictionary of attributes to create in this dataset.', 'default': dict()}, + {'name': 'spec_shapes', 'type': tuple, + 'doc': ('The shape(s) defined in the spec.'), + 'default': None}, {'name': 'dimension_labels', 'type': tuple, 'doc': ('A list of labels for each dimension of this dataset from the spec. Currently this is ' 'supplied only on build.'), @@ -341,15 +344,15 @@ class DatasetBuilder(BaseBuilder): {'name': 'source', 'type': str, 'doc': 'The source of the data in this builder.', 'default': None}) def __init__(self, **kwargs): """ Create a Builder object for a dataset """ - name, data, dtype, attributes, dimension_labels, maxshape, chunks, parent, source = getargs( - 'name', 'data', 'dtype', 'attributes', 'dimension_labels', 'maxshape', 'chunks', 'parent', 'source', - kwargs - ) + name, data, dtype, attributes, spec_shapes, dimension_labels, maxshape, chunks, parent, source = getargs( + 'name', 'data', 'dtype', 'attributes', 'spec_shapes', 'dimension_labels', 'maxshape', 'chunks', + 'parent', 'source', kwargs) super().__init__(name, attributes, parent, source) self['data'] = data self['attributes'] = _copy.copy(attributes) self.__dimension_labels = dimension_labels self.__chunks = chunks + self.__spec_shapes = spec_shapes self.__maxshape = maxshape if isinstance(data, BaseBuilder): if dtype is None: @@ -357,6 +360,11 @@ def __init__(self, **kwargs): self.__dtype = dtype self.__name = name + @property + def spec_shapes(self): + """The shapes defined in the spec.""" + return self.__spec_shapes + @property def data(self): """The data stored in the dataset represented by this builder.""" diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index b5815ee2c..6dbfdb1d4 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -555,6 +555,7 @@ def get_attribute(self, **kwargs): def get_attr_value(self, **kwargs): ''' Get the value of the attribute corresponding to this spec from the given container ''' spec, container, manager = getargs('spec', 'container', 'manager', kwargs) + attr_name = self.get_attribute(spec) if attr_name is None: return None @@ -723,7 +724,10 @@ def build(self, **kwargs): msg = "'container' must be of type Data with DatasetSpec" raise ValueError(msg) 
spec_dtype, spec_shape, spec_dims, spec = self.__check_dset_spec(self.spec, spec_ext) - dimension_labels = self.__get_dimension_labels_from_spec(container.data, spec_shape, spec_dims) + dimension_labels, matched_shape = self.__get_spec_info(container.data, + spec_shape, + spec_dims, + spec_dtype) if isinstance(spec_dtype, RefSpec): self.logger.debug("Building %s '%s' as a dataset of references (source: %s)" % (container.__class__.__name__, container.name, repr(source))) @@ -734,6 +738,7 @@ def build(self, **kwargs): parent=parent, source=source, dtype=spec_dtype.reftype, + spec_shapes=matched_shape, dimension_labels=dimension_labels, ) manager.queue_ref(self.__set_dataset_to_refs(builder, spec_dtype, spec_shape, container, manager)) @@ -748,6 +753,7 @@ def build(self, **kwargs): parent=parent, source=source, dtype=spec_dtype, + spec_shapes=matched_shape, dimension_labels=dimension_labels, ) manager.queue_ref(self.__set_compound_dataset_to_refs(builder, spec, spec_dtype, container, @@ -766,6 +772,7 @@ def build(self, **kwargs): parent=parent, source=source, dtype="object", + spec_shapes=matched_shape, dimension_labels=dimension_labels, ) manager.queue_ref(self.__set_untyped_dataset_to_refs(builder, container, manager)) @@ -789,6 +796,7 @@ def build(self, **kwargs): parent=parent, source=source, dtype=dtype, + spec_shapes=matched_shape, dimension_labels=dimension_labels, ) @@ -820,10 +828,7 @@ def __check_dset_spec(self, orig, ext): spec = ext return dtype, shape, dims, spec - def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple: - if spec_shape is None or spec_dims is None: - return None - data_shape = get_data_shape(data) + def __get_matched_dimension(self, data_shape, spec_shape, spec_dtype=None): # if shape is a list of allowed shapes, find the index of the shape that matches the data if isinstance(spec_shape[0], list): match_shape_inds = list() @@ -839,37 +844,58 @@ def __get_dimension_labels_from_spec(self, data, spec_shape, spec_dims) -> tuple break if match: match_shape_inds.append(i) - # use the most specific match -- the one with the fewest Nones - if match_shape_inds: - if len(match_shape_inds) == 1: - return tuple(spec_dims[match_shape_inds[0]]) + return match_shape_inds + + def __get_spec_info(self, data, spec_shape, spec_dims, spec_dtype=None): + """This will return the dimension labels and shape by matching the data shape to a permissible spec shape.""" + if spec_shape is None and spec_dims is None: + return None, None + else: + if spec_dtype is not None and isinstance(spec_dtype, list): + data_shape = (len(data),) + else: + data_shape = get_data_shape(data) + + if isinstance(spec_shape[0], list): + match_shape_inds = self.__get_matched_dimension(data_shape, spec_shape, spec_dtype) + if len(match_shape_inds) == 0: + # no matches found + msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path + warnings.warn(msg, IncorrectDatasetShapeBuildWarning) + return None, None + elif len(match_shape_inds) == 1: + if spec_dims is not None: + return tuple(spec_dims[match_shape_inds[0]]), tuple(spec_shape[match_shape_inds[0]]) + else: + return spec_dims, tuple(spec_shape[match_shape_inds[0]]) else: count_nones = [len([x for x in spec_shape[k] if x is None]) for k in match_shape_inds] index_min_count = count_nones.index(min(count_nones)) best_match_ind = match_shape_inds[index_min_count] - return tuple(spec_dims[best_match_ind]) + if spec_dims is not None: + return tuple(spec_dims[best_match_ind]), tuple(spec_shape[best_match_ind]) + 
else: + return spec_dims, tuple(spec_shape[best_match_ind]) else: - # no matches found - msg = "Shape of data does not match any allowed shapes in spec '%s'" % self.spec.path - warnings.warn(msg, IncorrectDatasetShapeBuildWarning) - return None - else: - if len(data_shape) != len(spec_shape): - msg = "Shape of data does not match shape in spec '%s'" % self.spec.path - warnings.warn(msg, IncorrectDatasetShapeBuildWarning) - return None - # check each dimension. None means any length is allowed - match = True - for j, d in enumerate(data_shape): - if spec_shape[j] is not None and spec_shape[j] != d: - match = False - break - if not match: - msg = "Shape of data does not match shape in spec '%s'" % self.spec.path - warnings.warn(msg, IncorrectDatasetShapeBuildWarning) - return None - # shape is a single list of allowed dimension lengths - return tuple(spec_dims) + if len(data_shape) != len(spec_shape): + msg = "Shape of data does not match shape in spec '%s'" % self.spec.path + warnings.warn(msg, IncorrectDatasetShapeBuildWarning) + return None, None + # check each dimension. None means any length is allowed + match = True + for j, d in enumerate(data_shape): + if spec_shape[j] is not None and spec_shape[j] != d: + match = False + break + if not match: + msg = "Shape of data does not match shape in spec '%s'" % self.spec.path + warnings.warn(msg, IncorrectDatasetShapeBuildWarning) + return None, None + # shape is a single list of allowed dimension lengths + if spec_dims is not None: + return tuple(spec_dims), tuple(spec_shape) + else: + return None, tuple(spec_shape) def __is_reftype(self, data): if (isinstance(data, AbstractDataChunkIterator) or diff --git a/tests/unit/helpers/utils.py b/tests/unit/helpers/utils.py index 0f4b3c4bf..b2d2cb4ae 100644 --- a/tests/unit/helpers/utils.py +++ b/tests/unit/helpers/utils.py @@ -343,6 +343,70 @@ def __init__(self, spec): manager = BuildManager(type_map) return manager +############################################ +# Qux: A test class with variable data shapes +############################################ +class QuxData(Data): + pass + + +class QuxBucket(Container): + "PseudoFile" + @docval( + {"name": "name", "type": str, "doc": "the name of this bucket"}, + {"name": "qux_data", "type": QuxData, "doc": "Data with user defined shape."}, + ) + def __init__(self, **kwargs): + name, qux_data = getargs("name", "qux_data", kwargs) + super().__init__(name=name) + self.__qux_data = qux_data + self.__qux_data.parent = self + + @property + def qux_data(self): + return self.__qux_data + + +def get_qux_buildmanager(shape): + qux_data_spec = DatasetSpec( + doc="A test dataset of references specification with a data type", + name="qux_data", + data_type_def="QuxData", + shape=shape, + dtype='int' + ) + + qux_bucket_spec = GroupSpec( + doc="A test group specification for a data type containing data type", + data_type_def="QuxBucket", + datasets=[ + DatasetSpec(doc="doc", data_type_inc="QuxData"), + ], + ) + + spec_catalog = SpecCatalog() + spec_catalog.register_spec(qux_data_spec, "test.yaml") + spec_catalog.register_spec(qux_bucket_spec, "test.yaml") + + namespace = SpecNamespace( + "a test namespace", + CORE_NAMESPACE, + [{"source": "test.yaml"}], + version="0.1.0", + catalog=spec_catalog, + ) + + namespace_catalog = NamespaceCatalog() + namespace_catalog.add_namespace(CORE_NAMESPACE, namespace) + + type_map = TypeMap(namespace_catalog) + type_map.register_container_type(CORE_NAMESPACE, "QuxData", QuxData) + type_map.register_container_type(CORE_NAMESPACE, 
"QuxBucket", QuxBucket) + + manager = BuildManager(type_map) + return manager + + ############################################ # Baz example data containers and specs diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 5a4fd5a32..4dece7398 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -10,9 +10,12 @@ import zipfile import h5py -import numpy as np from h5py import SoftLink, HardLink, ExternalLink, File from h5py import filters as h5py_filters + +import numpy as np +import numpy.testing as npt + from hdmf.backends.hdf5 import H5DataIO from hdmf.backends.hdf5.h5tools import HDF5IO, SPEC_LOC_ATTR, H5PY_3 from hdmf.backends.io import HDMFIO @@ -28,12 +31,13 @@ from hdmf.testing import TestCase, remove_test_file from hdmf.common.resources import HERD from hdmf.term_set import TermSet, TermSetWrapper - +from hdmf.utils import get_data_shape from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager, Baz, BazData, BazCpdData, BazBucket, get_baz_buildmanager, CORE_NAMESPACE, get_temp_filepath, CacheSpecTestHelper, - CustomGroupSpec, CustomDatasetSpec, CustomSpecNamespace) + CustomGroupSpec, CustomDatasetSpec, CustomSpecNamespace, + QuxData, QuxBucket, get_qux_buildmanager) try: import zarr @@ -739,12 +743,12 @@ def test_copy_h5py_dataset_h5dataio_input(self): self.f['test_copy'][:].tolist()) def test_list_fill_empty(self): - dset = self.io.__list_fill__(self.f, 'empty_dataset', [], options={'dtype': int, 'io_settings': {}}) + dset = self.io.__list_fill__(self.f, 'empty_dataset', [], None, True, options={'dtype': int, 'io_settings': {}}) self.assertTupleEqual(dset.shape, (0,)) def test_list_fill_empty_no_dtype(self): with self.assertRaisesRegex(Exception, r"cannot add \S+ to [/\S]+ - could not determine type"): - self.io.__list_fill__(self.f, 'empty_dataset', []) + self.io.__list_fill__(self.f, 'empty_dataset', [], None, True) def test_read_str(self): a = ['a', 'bb', 'ccc', 'dddd', 'e'] @@ -3725,3 +3729,57 @@ def test_set_data_io(self): self.data.set_data_io(H5DataIO, dict(chunks=True)) assert isinstance(self.data.data, H5DataIO) assert self.data.data.io_settings["chunks"] + + +class TestExpand(TestCase): + def setUp(self): + self.manager = get_foo_buildmanager() + self.path = get_temp_filepath() + + def test_expand_false(self): + # Setup all the data we need + foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14) + foobucket = FooBucket('bucket1', [foo1]) + foofile = FooFile(buckets=[foobucket]) + + with HDF5IO(self.path, manager=self.manager, mode='w') as io: + io.write(foofile, expandable=False) + + with HDF5IO(self.path, manager=self.manager, mode='r') as io: + read_foofile = io.read() + self.assertListEqual(foofile.buckets['bucket1'].foos['foo1'].my_data, + read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist()) + self.assertEqual(get_data_shape(read_foofile.buckets['bucket1'].foos['foo1'].my_data), + (5,)) + + def test_multi_shape_no_labels(self): + qux = QuxData(name='my_qux', data=[[1, 2, 3], [4, 5, 6]]) + quxbucket = QuxBucket('bucket1', qux) + + manager = get_qux_buildmanager([[None, None],[None, 3]]) + + with HDF5IO(self.path, manager=manager, mode='w') as io: + io.write(quxbucket, expandable=True) + + with HDF5IO(self.path, manager=manager, mode='r') as io: + read_quxbucket = io.read() + self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None,3)) + + def test_expand_set_shape(self): + qux = QuxData(name='my_qux', data=[[1, 2, 3], [4, 5, 6]]) + quxbucket = 
QuxBucket('bucket1', qux) + + manager = get_qux_buildmanager([None, 3]) + + with HDF5IO(self.path, manager=manager, mode='w') as io: + io.write(quxbucket, expandable=True) + + with HDF5IO(self.path, manager=manager, mode='r+') as io: + read_quxbucket = io.read() + read_quxbucket.qux_data.append([7,8,9]) + + expected = np.array([[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]) + npt.assert_array_equal(read_quxbucket.qux_data.data[:], expected) + self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None,3)) From 06586c48198d7919ba5296a036d0a2bffba68a88 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 27 Aug 2024 17:33:30 -0700 Subject: [PATCH 02/18] fix and existing tests pass --- src/hdmf/build/objectmapper.py | 30 +++++++++++++++++-- .../mapper_tests/test_build_datetime.py | 3 +- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 6dbfdb1d4..fe22ea1db 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -14,7 +14,7 @@ IncorrectDatasetShapeBuildWarning) from ..container import AbstractContainer, Data, DataRegion from ..term_set import TermSetWrapper -from ..data_utils import DataIO, AbstractDataChunkIterator +from ..data_utils import DataIO, AbstractDataChunkIterator, InvalidDataIOError from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec import BaseStorageSpec @@ -1122,7 +1122,33 @@ def __add_datasets(self, builder, datasets, container, build_manager, source, ex raise BuildError(builder, msg) from ex self.logger.debug(" Adding untyped dataset for spec name %s and adding attributes" % repr(spec.name)) - sub_builder = DatasetBuilder(spec.name, data, parent=builder, source=source, dtype=dtype) + try: + expand = True + dimension_labels, matched_shape = self.__get_spec_info(data, + spec.shape, + spec.dims, + dtype) + except InvalidDataIOError: + # This try/except is for tests overwriting data in H5DataIO, i.e., raise error + # since this is not allowed. 
+ # ---> test_io_hdf5_h5tools.py::HDF5IOEmptyDataset::test_overwrite_dataset + expand = False + + if expand: + sub_builder = DatasetBuilder(spec.name, + data, + parent=builder, + source=source, + dtype=dtype, + spec_shapes=matched_shape, + dimension_labels=dimension_labels, + ) + else: + sub_builder = DatasetBuilder(spec.name, + data, + parent=builder, + source=source, + dtype=dtype) builder.set_dataset(sub_builder) self.__add_attributes(sub_builder, spec.attributes, container, build_manager, source, export) else: diff --git a/tests/unit/build_tests/mapper_tests/test_build_datetime.py b/tests/unit/build_tests/mapper_tests/test_build_datetime.py index 9e2b5e84a..5d4ab5c39 100644 --- a/tests/unit/build_tests/mapper_tests/test_build_datetime.py +++ b/tests/unit/build_tests/mapper_tests/test_build_datetime.py @@ -67,8 +67,9 @@ def test_datetime_array(self): bar_inst = Bar(name='my_bar', data=[datetime(2023, 7, 9), datetime(2023, 7, 10)]) builder = type_map.build(bar_inst) ret = builder.get('data') + # breakpoint() assert ret.data == [b'2023-07-09T00:00:00', b'2023-07-10T00:00:00'] - assert ret.dtype == 'ascii' + # assert ret.dtype == 'ascii' def test_date_array(self): bar_spec = GroupSpec( From e488cf3dcef8dfaa07dcb1a3efb48e1bbd663add Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 27 Aug 2024 17:55:00 -0700 Subject: [PATCH 03/18] test --- tests/unit/test_io_hdf5_h5tools.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 4dece7398..56f0ea21b 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -791,6 +791,26 @@ def test_roundtrip_basic(self): self.assertListEqual(foofile.buckets['bucket1'].foos['foo1'].my_data, read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist()) + def test_roundtrip_basic_append(self): + # Setup all the data we need + foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14) + foobucket = FooBucket('bucket1', [foo1]) + foofile = FooFile(buckets=[foobucket]) + + with HDF5IO(self.path, manager=self.manager, mode='w') as io: + io.write(foofile) + + with HDF5IO(self.path, manager=self.manager, mode='a') as io: + read_foofile = io.read() + data = read_foofile.buckets['bucket1'].foos['foo1'].my_data + shape = list(data.shape) + shape[0] += 1 + data.resize(shape) + data[-1] = 6 + self.assertListEqual(read_foofile.buckets['bucket1'].foos['foo1'].my_data[:].tolist(), + [1, 2, 3, 4, 5, 6]) + + def test_roundtrip_empty_dataset(self): foo1 = Foo('foo1', [], "I am foo1", 17, 3.14) foobucket = FooBucket('bucket1', [foo1]) From e5854003eee65fd6e94359f6b285dcad9fe51679 Mon Sep 17 00:00:00 2001 From: Matthew Avaylon Date: Tue, 3 Sep 2024 16:08:42 -0700 Subject: [PATCH 04/18] Update test_build_datetime.py --- tests/unit/build_tests/mapper_tests/test_build_datetime.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/build_tests/mapper_tests/test_build_datetime.py b/tests/unit/build_tests/mapper_tests/test_build_datetime.py index 5d4ab5c39..9e2b5e84a 100644 --- a/tests/unit/build_tests/mapper_tests/test_build_datetime.py +++ b/tests/unit/build_tests/mapper_tests/test_build_datetime.py @@ -67,9 +67,8 @@ def test_datetime_array(self): bar_inst = Bar(name='my_bar', data=[datetime(2023, 7, 9), datetime(2023, 7, 10)]) builder = type_map.build(bar_inst) ret = builder.get('data') - # breakpoint() assert ret.data == [b'2023-07-09T00:00:00', b'2023-07-10T00:00:00'] - # assert ret.dtype == 'ascii' + assert ret.dtype == 
'ascii' def test_date_array(self): bar_spec = GroupSpec( From ce47e7280284b586afab5ff03c01a380c8b16723 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 3 Sep 2024 16:24:51 -0700 Subject: [PATCH 05/18] cleanup --- src/hdmf/build/objectmapper.py | 40 +++++++++++----------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index c844ead1e..0cea8840d 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -1134,33 +1134,19 @@ def __add_datasets(self, builder, datasets, container, build_manager, source, ex raise BuildError(builder, msg) from ex self.logger.debug(" Adding untyped dataset for spec name %s and adding attributes" % repr(spec.name)) - try: - expand = True - dimension_labels, matched_shape = self.__get_spec_info(data, - spec.shape, - spec.dims, - dtype) - except InvalidDataIOError: - # This try/except is for tests overwriting data in H5DataIO, i.e., raise error - # since this is not allowed. - # ---> test_io_hdf5_h5tools.py::HDF5IOEmptyDataset::test_overwrite_dataset - expand = False - - if expand: - sub_builder = DatasetBuilder(spec.name, - data, - parent=builder, - source=source, - dtype=dtype, - spec_shapes=matched_shape, - dimension_labels=dimension_labels, - ) - else: - sub_builder = DatasetBuilder(spec.name, - data, - parent=builder, - source=source, - dtype=dtype) + + dimension_labels, matched_shape = self.__get_spec_info(data, + spec.shape, + spec.dims, + dtype) + + sub_builder = DatasetBuilder(spec.name, + data, + parent=builder, + source=source, + dtype=dtype, + spec_shapes=matched_shape, + dimension_labels=dimension_labels) builder.set_dataset(sub_builder) self.__add_attributes(sub_builder, spec.attributes, container, build_manager, source, export) else: From 15689deb9feb70fc4e096f5ec505d1b40e66f686 Mon Sep 17 00:00:00 2001 From: mavaylon1 Date: Tue, 3 Sep 2024 16:27:31 -0700 Subject: [PATCH 06/18] cleanup --- src/hdmf/build/objectmapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 0cea8840d..fc25efc16 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -17,7 +17,7 @@ from ..container import AbstractContainer, Data, DataRegion from ..term_set import TermSetWrapper -from ..data_utils import DataIO, AbstractDataChunkIterator, InvalidDataIOError +from ..data_utils import DataIO, AbstractDataChunkIterator from ..query import ReferenceResolver from ..spec import Spec, AttributeSpec, DatasetSpec, GroupSpec, LinkSpec, RefSpec from ..spec.spec import BaseStorageSpec From 048925770ce34715dc5ddaf37199348ed2c44802 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 22:58:44 +0000 Subject: [PATCH 07/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/unit/test_io_hdf5_h5tools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index b56dc5026..debec26d8 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -3984,4 +3984,3 @@ def test_expand_set_shape(self): [7, 8, 9]]) npt.assert_array_equal(read_quxbucket.qux_data.data[:], expected) self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None,3)) - From 7f3b40ba2c94f5d5f939c37001d4b587b389c484 Mon Sep 17 00:00:00 2001 From: Ryan Ly 
<310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 00:29:39 -0800 Subject: [PATCH 08/18] Update tests/unit/test_io_hdf5_h5tools.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_io_hdf5_h5tools.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 457c74fc8..76abe07a7 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4116,6 +4116,9 @@ def setUp(self): self.manager = get_foo_buildmanager() self.path = get_temp_filepath() + def tearDown(self): + if os.path.exists(self.path): + os.remove(self.path) def test_expand_false(self): # Setup all the data we need foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14) From 62da59709dcbc6346ff1f160f7875c39b2d89adf Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 00:32:01 -0800 Subject: [PATCH 09/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/hdmf/backends/hdf5/h5tools.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index d8dcd6aa8..9f3f39b2c 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -789,10 +789,6 @@ def close_linked_files(self): 'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')}) def write_builder(self, **kwargs): f_builder = popargs('builder', kwargs) - link_data, exhaust_dci, export_source = getargs('link_data', - 'exhaust_dci', - 'export_source', - kwargs) self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" % (f_builder.name, self.source, kwargs)) for name, gbldr in f_builder.groups.items(): From 1641fc441113a9011d355380d85bd9b3b7e78769 Mon Sep 17 00:00:00 2001 From: rly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 00:35:52 -0800 Subject: [PATCH 10/18] Clean up write_builder --- src/hdmf/backends/hdf5/h5tools.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py index d8dcd6aa8..4bb6abfb2 100644 --- a/src/hdmf/backends/hdf5/h5tools.py +++ b/src/hdmf/backends/hdf5/h5tools.py @@ -789,17 +789,13 @@ def close_linked_files(self): 'doc': ('Bool to set whether datasets are expandable by setting the maxshape.')}) def write_builder(self, **kwargs): f_builder = popargs('builder', kwargs) - link_data, exhaust_dci, export_source = getargs('link_data', - 'exhaust_dci', - 'export_source', - kwargs) self.logger.debug("Writing GroupBuilder '%s' to path '%s' with kwargs=%s" % (f_builder.name, self.source, kwargs)) - for name, gbldr in f_builder.groups.items(): + for gbldr in f_builder.groups.values(): self.write_group(self.__file, gbldr, **kwargs) - for name, dbldr in f_builder.datasets.items(): + for dbldr in f_builder.datasets.values(): self.write_dataset(self.__file, dbldr, **kwargs) - for name, lbldr in f_builder.links.items(): + for lbldr in f_builder.links.values(): self.write_link(self.__file, lbldr, export_source=kwargs.get("export_source")) self.set_attributes(self.__file, f_builder.attributes) self.__add_refs() From 9ca356cf0ae276b9a301f7a9ae2e278210cebe65 Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 01:37:24 -0800 Subject: [PATCH 11/18] Apply suggestion from @Copilot Co-authored-by: Copilot 
<175728472+Copilot@users.noreply.github.com> --- src/hdmf/build/objectmapper.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index 46a2c4ea6..b37d0a944 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -895,6 +895,31 @@ def __check_dset_spec(self, orig, ext): return dtype, shape, dims, spec def __get_matched_dimension(self, data_shape, spec_shape, spec_dtype=None): + """ + Determine which allowed shapes match the given data shape. + + Parameters + ---------- + data_shape : tuple or list of int + The shape of the data being validated. + spec_shape : sequence + A sequence of allowed shapes for the dataset. When the first element is a + list (or other sequence), this is interpreted as a list of alternative + shapes, where each element is itself a shape specification. Each shape + specification is a sequence of dimension lengths or ``None``, where + ``None`` means any length is allowed for that dimension. + spec_dtype : any, optional + The expected data type for the dataset. This parameter is currently not + used in the matching logic but is accepted for interface consistency. + + Returns + ------- + list of int or None + A list of indices into ``spec_shape`` whose shape specifications match + ``data_shape``. Returns an empty list if no shapes match. Returns + ``None`` if ``spec_shape`` is not a list of alternative shapes (i.e., + when ``spec_shape[0]`` is not a list-like shape specification). + """ # if shape is a list of allowed shapes, find the index of the shape that matches the data if isinstance(spec_shape[0], list): match_shape_inds = list() From ad91c45c107e2e4fdd14fac90ebc6091528f46e1 Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 01:37:52 -0800 Subject: [PATCH 12/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/hdmf/build/objectmapper.py | 43 +++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/hdmf/build/objectmapper.py b/src/hdmf/build/objectmapper.py index b37d0a944..16704c2a7 100644 --- a/src/hdmf/build/objectmapper.py +++ b/src/hdmf/build/objectmapper.py @@ -938,7 +938,48 @@ def __get_matched_dimension(self, data_shape, spec_shape, spec_dtype=None): return match_shape_inds def __get_spec_info(self, data, spec_shape, spec_dims, spec_dtype=None): - """This will return the dimension labels and shape by matching the data shape to a permissible spec shape.""" + """ + Determine the dimension labels and shape for data based on the allowed shapes in the spec. + + This method compares the shape of ``data`` against the allowed shapes defined in + ``spec_shape`` and, if possible, selects the best-matching shape and corresponding + dimension labels from ``spec_dims``. + + Parameters + ---------- + data + The data object whose shape is to be matched against the specification. This may be + any array-like object supported by :func:`get_data_shape`. + spec_shape + The allowed shape definition(s) from the spec. This may be: + + * ``None`` – no shape is defined. + * A list/tuple of dimension lengths, where each element is an int or ``None``. + * A list of such lists/tuples, representing multiple allowed shapes. + spec_dims + The dimension labels defined in the spec. This may be: + + * ``None`` – no dimension labels are defined. + * A list/tuple of labels corresponding to a single shape in ``spec_shape``. 
+ * A list of lists/tuples of labels, each corresponding to an entry in + ``spec_shape`` when multiple shapes are allowed. + spec_dtype + The dtype or list of dtypes defined in the spec. When ``spec_dtype`` is a list, + the data is treated as 1D with length equal to ``len(data)`` for the purpose of + shape matching. May be ``None`` if no dtype constraint is specified. + + Returns + ------- + tuple + A 2-tuple ``(dims, shape)`` where: + + * ``dims`` is either ``None`` or a tuple of selected dimension labels. + * ``shape`` is either ``None`` or a tuple of dimension lengths corresponding to + the best-matching allowed shape. + + If no matching shape is found, or if both ``spec_shape`` and ``spec_dims`` are + ``None``, the method returns ``(None, None)``. + """ if spec_shape is None and spec_dims is None: return None, None else: From 02a52421207f32c5b33c69420262bc7b193bceaf Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 01:39:22 -0800 Subject: [PATCH 13/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_io_hdf5_h5tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 76abe07a7..3cfc8e445 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4119,6 +4119,7 @@ def setUp(self): def tearDown(self): if os.path.exists(self.path): os.remove(self.path) + def test_expand_false(self): # Setup all the data we need foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14) From 0fbf0ecec17840e6860cd93abea5887bd51b3dad Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 01:39:34 -0800 Subject: [PATCH 14/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_io_hdf5_h5tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 3cfc8e445..3d3afc4b4 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4140,7 +4140,7 @@ def test_multi_shape_no_labels(self): qux = QuxData(name='my_qux', data=[[1, 2, 3], [4, 5, 6]]) quxbucket = QuxBucket('bucket1', qux) - manager = get_qux_buildmanager([[None, None],[None, 3]]) + manager = get_qux_buildmanager([[None, None], [None, 3]]) with HDF5IO(self.path, manager=manager, mode='w') as io: io.write(quxbucket, expandable=True) From 70f0791d82763e452b9eaa572bb8bf1ffb80a35a Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 01:39:45 -0800 Subject: [PATCH 15/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_io_hdf5_h5tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 3d3afc4b4..42a22cd80 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4160,7 +4160,7 @@ def test_expand_set_shape(self): with HDF5IO(self.path, manager=manager, mode='r+') as io: read_quxbucket = io.read() - read_quxbucket.qux_data.append([7,8,9]) + read_quxbucket.qux_data.append([7, 8, 9]) expected = np.array([[1, 2, 3], [4, 5, 6], From dd8b10195a61e7ff445e5879c306a40167825c58 Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 
Feb 2026 01:39:55 -0800 Subject: [PATCH 16/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_io_hdf5_h5tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 42a22cd80..0835e0001 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4147,7 +4147,7 @@ def test_multi_shape_no_labels(self): with HDF5IO(self.path, manager=manager, mode='r') as io: read_quxbucket = io.read() - self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None,3)) + self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None, 3)) def test_expand_set_shape(self): qux = QuxData(name='my_qux', data=[[1, 2, 3], [4, 5, 6]]) From d350d2a69249254c4de7b9f523910c6463475fbd Mon Sep 17 00:00:00 2001 From: Ryan Ly <310197+rly@users.noreply.github.com> Date: Fri, 6 Feb 2026 01:40:21 -0800 Subject: [PATCH 17/18] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/unit/test_io_hdf5_h5tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_io_hdf5_h5tools.py b/tests/unit/test_io_hdf5_h5tools.py index 0835e0001..e737b2feb 100644 --- a/tests/unit/test_io_hdf5_h5tools.py +++ b/tests/unit/test_io_hdf5_h5tools.py @@ -4166,4 +4166,4 @@ def test_expand_set_shape(self): [4, 5, 6], [7, 8, 9]]) npt.assert_array_equal(read_quxbucket.qux_data.data[:], expected) - self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None,3)) + self.assertEqual(read_quxbucket.qux_data.data.maxshape, (None, 3)) From 27fe4c4b636a079d277019c045cb03ced02d1f2d Mon Sep 17 00:00:00 2001 From: rly <310197+rly@users.noreply.github.com> Date: Mon, 9 Feb 2026 19:13:01 -0800 Subject: [PATCH 18/18] Check shape before maxshape in get_data_shape --- src/hdmf/utils.py | 5 ++--- tests/unit/utils_test/test_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/hdmf/utils.py b/src/hdmf/utils.py index c7fe2b476..578da41f2 100644 --- a/src/hdmf/utils.py +++ b/src/hdmf/utils.py @@ -843,11 +843,10 @@ def __get_shape_helper(local_data): if isinstance(data, Data): data = data.data - # NOTE: data.maxshape will fail on empty h5py.Dataset without shape or maxshape. this will be fixed in h5py 3.0 - if hasattr(data, 'maxshape'): - return data.maxshape if hasattr(data, 'shape') and data.shape is not None: return data.shape + if hasattr(data, 'maxshape'): + return data.maxshape if isinstance(data, dict): return None if hasattr(data, '__len__') and not isinstance(data, (str, bytes)): diff --git a/tests/unit/utils_test/test_utils.py b/tests/unit/utils_test/test_utils.py index 3b8fb1013..98115d3d3 100644 --- a/tests/unit/utils_test/test_utils.py +++ b/tests/unit/utils_test/test_utils.py @@ -22,10 +22,10 @@ def test_h5dataset(self): res = get_data_shape(dset) self.assertTupleEqual(res, (3, 2)) - # test that maxshape takes priority + # test that shape takes priority over maxshape for objects that have both dset = f.create_dataset('shape_maxshape', shape=(3, 2), maxshape=(None, 100)) res = get_data_shape(dset) - self.assertTupleEqual(res, (None, 100)) + self.assertTupleEqual(res, (3, 2)) os.remove(path)
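
Taken together, patches 01-18 make HDF5 datasets expandable by default by deriving `maxshape` from the shape(s) allowed in the spec and passing it through `DatasetBuilder.spec_shapes` into `__list_fill__`. A minimal sketch of the resulting round-trip behavior, modeled on `test_roundtrip_basic_append` above; the `Foo` helpers come from `tests/unit/helpers/utils.py` in this repo, the file path `demo.h5` is illustrative, and the resize succeeds only when the matched spec shape leaves the first dimension unconstrained (`None`):

from hdmf.backends.hdf5.h5tools import HDF5IO
from tests.unit.helpers.utils import Foo, FooBucket, FooFile, get_foo_buildmanager

manager = get_foo_buildmanager()
foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
foofile = FooFile(buckets=[FooBucket('bucket1', [foo1])])

# expandable defaults to True; when the data matches a shape in the spec,
# that shape becomes the dataset's maxshape on write
with HDF5IO('demo.h5', manager=manager, mode='w') as io:
    io.write(foofile)

# Reopen in append mode and grow the dataset in place.
with HDF5IO('demo.h5', manager=manager, mode='a') as io:
    read_foofile = io.read()
    data = read_foofile.buckets['bucket1'].foos['foo1'].my_data
    shape = list(data.shape)
    shape[0] += 1
    data.resize(shape)  # allowed because the first maxshape dimension is None
    data[-1] = 6

Passing `io.write(foofile, expandable=False)` instead preserves the previous behavior of a fixed on-disk shape, as exercised by `test_expand_false`.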
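
When the spec permits several alternative shapes, `__get_spec_info` matches the data against each one and keeps the most specific match, i.e. the alternative with the fewest `None` entries, which `__list_fill__` then applies as `maxshape`. A sketch mirroring `test_multi_shape_no_labels`; the `Qux` helpers come from `tests/unit/helpers/utils.py` and the file path `qux.h5` is illustrative:

from hdmf.backends.hdf5.h5tools import HDF5IO
from tests.unit.helpers.utils import QuxData, QuxBucket, get_qux_buildmanager

# The spec allows either [None, None] or [None, 3]; a 2x3 array matches both,
# and the more specific alternative (fewer Nones) wins.
manager = get_qux_buildmanager([[None, None], [None, 3]])
quxbucket = QuxBucket('bucket1', QuxData(name='my_qux', data=[[1, 2, 3], [4, 5, 6]]))

with HDF5IO('qux.h5', manager=manager, mode='w') as io:
    io.write(quxbucket, expandable=True)

with HDF5IO('qux.h5', manager=manager, mode='r') as io:
    assert io.read().qux_data.data.maxshape == (None, 3)

If no alternative matches the data, `__get_spec_info` emits an IncorrectDatasetShapeBuildWarning and returns no shape, so the dataset is written without a maxshape.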
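
Patch 18 also flips the precedence in `get_data_shape`: an object's concrete `.shape` is now checked before its `.maxshape`, so shape checks during build see a dataset's current extent rather than its growth limit. A sketch per the updated `test_h5dataset`; the file path `shapes.h5` is illustrative:

import h5py
from hdmf.utils import get_data_shape

with h5py.File('shapes.h5', 'w') as f:
    dset = f.create_dataset('data', shape=(3, 2), maxshape=(None, 100))
    # .shape wins over .maxshape; .maxshape remains a fallback for objects
    # that expose only a maxshape
    assert get_data_shape(dset) == (3, 2)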