From 135420feef75b52127c7261c7b621e1c89b96390 Mon Sep 17 00:00:00 2001
From: Jonathan Helmus
Date: Wed, 17 Feb 2021 18:45:30 -0600
Subject: [PATCH 1/8] add test for new style groups

This test currently fails.
---
 tests/make_new_style_groups_file.py |  9 +++++++++
 tests/new_style_groups.hdf5         | Bin 0 -> 8733 bytes
 tests/test_new_style_groups.py      | 23 +++++++++++++++++++++++
 3 files changed, 32 insertions(+)
 create mode 100644 tests/make_new_style_groups_file.py
 create mode 100644 tests/new_style_groups.hdf5
 create mode 100644 tests/test_new_style_groups.py

diff --git a/tests/make_new_style_groups_file.py b/tests/make_new_style_groups_file.py
new file mode 100644
index 0000000..9e55cb4
--- /dev/null
+++ b/tests/make_new_style_groups_file.py
@@ -0,0 +1,9 @@
+#! /usr/bin/env python
+""" Create a HDF5 file with new-style groups. """
+import h5py
+import numpy as np
+
+
+f = h5py.File('new_style_groups.hdf5', 'w', track_order=True)
+for i in range(9):
+    f.create_group('group' + str(i))
diff --git a/tests/new_style_groups.hdf5 b/tests/new_style_groups.hdf5
new file mode 100644
index 0000000000000000000000000000000000000000..555a1c79dc604deb9f34f2a3c1765968864b5663
GIT binary patch
literal 8733
zcmeHM&1+LZ5T8w6(llyqMbRXtq
xyv8)gFSpcFrZh7{BMm$_z+&ly9*;8rXL^8
U#@`0Mj3G8Z-O6Q$f;fYtx(+WJ20^3Z=
wlM=v{H+jw*t%W(P3*>qlYNMqzgz*0jWXcG-y-pceOk?5Gn-ApM{1-9zkr$@Y3cFY
D~`Aobb1TCxCgzb_9E(Cu$63fu8q?!m?!>lW?H=qc-`jEX%~#`!*!?kBkEnyT%P}T
Z>L?bK>XqR-(I_5oLbzo)Wz4kfa|ESQ3jlL!6Na8eOkQ>%#4)=yZh%_f6B$@E%FIT
4Sr;jRyY`Gx~c6$SN{arCa-(q(h=FC4@cNC_-XWPJZB}rW82xyoU2E}2T|J%9Hsc~
*`l)Y+E(_fhqUV7Z1v&I&j)5!t_6G-oI6JJ0C~WdhMN$PQ`D@`Qz)mNas0_bES|e$
g6CYccM!_bAplFq3o)>N@&fSw#Xtd0DDn2a=tH=##GWhVGF(vN%<#*%kXPc|=l*55
rNm>U{1;eOV#cb~U{#3)&y5i&=<#o5K-K?ummUjWClnUHPI0Vy{Y)w^ZzUkvMgyzw
;VH7EzBICn`tm^h;BzkGNWA)^%I+H5k^$37#(|IK?TA*_x4WWVQ}4Cw)3Y-tCx{Wk
zU}~V^RER8v!6ZS&@Cp@P7)%#boPI>#dcxo!sEAHb&rldlAyk|RQ8PvuOeR!}4AA?j
cFql@T7(GId?80Dzp&~ZK2lZ{q?RE(A3+KQ(rvLx|

literal 0
HcmV?d00001

diff --git a/tests/test_new_style_groups.py b/tests/test_new_style_groups.py
new file mode 100644
index 0000000..71b9ee3
--- /dev/null
+++ b/tests/test_new_style_groups.py
@@ -0,0 +1,23 @@
+""" Test new style groups in pyfive. """
+import os
+
+import pyfive
+
+DIRNAME = os.path.dirname(__file__)
+NEW_STYLE_GROUPS_HDF5_FILE = os.path.join(DIRNAME, 'new_style_groups.hdf5')
+
+
+def test_groups():
+
+    with pyfive.File(NEW_STYLE_GROUPS_HDF5_FILE) as hfile:
+
+        assert len(hfile) == 9
+        grp0 = hfile['group0']
+        grp1 = hfile['group1']
+        grp2 = hfile['group2']
+        grp3 = hfile['group3']
+        grp4 = hfile['group4']
+        grp5 = hfile['group5']
+        grp6 = hfile['group6']
+        grp7 = hfile['group7']
+        grp8 = hfile['group8']
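As background for this patch: `track_order=True` asks HDF5 to track and index
link creation order, which makes libhdf5 write "new style" group storage (a
fractal heap indexed by version 2 B-trees) instead of the classic local heap
plus version 1 B-tree layout that pyfive already understands. A minimal sketch
of what the new test expects, read back with h5py (which wraps libhdf5 and so
already handles such files); it assumes new_style_groups.hdf5 was produced by
make_new_style_groups_file.py above:

    # Read the new-style-group file with h5py; pyfive should eventually agree.
    import h5py

    with h5py.File('new_style_groups.hdf5', 'r') as f:
        names = list(f)
        assert names == ['group%d' % i for i in range(9)]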
""" self.fh = fh + self.offset = offset + self.depth = None + self.all_nodes = {} + + self._read_root_node() + self._read_children() + + def _read_children(self): + # Leaf nodes: level 0 + # Root node: level "depth" + node_level = self.depth + for node_level in range(self.depth, 0, -1): + for parent_node in self.all_nodes[node_level]: + for child_addr in parent_node['addresses']: + child_node = self._read_node(child_addr, node_level-1) + self._add_node(child_node) + + def _read_root_node(self): + root_node = self._read_node(self.offset, None) + self._add_node(root_node) + self.depth = root_node['node_level'] + + def _add_node(self, node): + node_level = node['node_level'] + if node_level in self.all_nodes: + self.all_nodes[node_level].append(node) + else: + self.all_nodes[node_level] = [node] - # read in the root node - root_node = self._read_node(offset) - self.root_node = root_node - - # read in all nodes - all_nodes = {} - node_level = root_node['node_level'] - all_nodes[node_level] = [root_node] - while node_level != 0: - new_nodes = [] - for parent_node in all_nodes[node_level]: - for addr in parent_node['addresses']: - new_nodes.append(self._read_node(addr)) - new_node_level = new_nodes[0]['node_level'] - all_nodes[new_node_level] = new_nodes - node_level = new_node_level - self.all_nodes = all_nodes - - def _read_node(self, offset): + def _read_node(self, offset, node_level): """ Return a single node in the B-Tree located at a given offset. """ + node = self._read_node_header(offset, node_level) + node['keys'] = [] + node['addresses'] = [] + return node + + def _read_node_header(self, offset): + """ Return a single node header in the b-tree located at a give offset. """ + raise NotImplementedError + + +class BTreeV1(AbstractBTree): + """ + HDF5 version 1 B-Tree. + """ + + # III.A.1. Disk Format: Level 1A1 - Version 1 B-trees + B_LINK_NODE = OrderedDict(( + ('signature', '4s'), + + ('node_type', 'B'), + ('node_level', 'B'), + ('entries_used', 'H'), + + ('left_sibling', 'Q'), # 8 byte addressing + ('right_sibling', 'Q'), # 8 byte addressing + )) + + def _read_node_header(self, offset, node_level): + """ Return a single node header in the b-tree located at a give offset. """ self.fh.seek(offset) - node = _unpack_struct_from_file(B_LINK_NODE_V1, self.fh) + node = _unpack_struct_from_file(self.B_LINK_NODE, self.fh) assert node['signature'] == b'TREE' + assert node['node_type'] == self.NODE_TYPE + if node_level is not None: + assert node["node_level"] == node_level + return node + +class BTreeV1Groups(BTreeV1): + """ + HDF5 version 1 B-Tree storing group nodes (type 0). + """ + NODE_TYPE = 0 + + def _read_node(self, offset, node_level): + """ Return a single node in the B-Tree located at a given offset. """ + node = self._read_node_header(offset, node_level) keys = [] addresses = [] for _ in range(node['entries_used']): @@ -65,42 +117,20 @@ def symbol_table_addresses(self): return all_address -class BTreeRawDataChunks(object): +class BTreeV1RawDataChunks(BTreeV1): """ HDF5 version 1 B-Tree storing raw data chunk nodes (type 1). """ + NODE_TYPE = 1 def __init__(self, fh, offset, dims): """ initalize. 
""" - self.fh = fh self.dims = dims + super().__init__(fh, offset) - # read in the root node - root_node = self._read_node(offset) - self.root_node = root_node - - # read in all other nodes - all_nodes = {} - node_level = root_node['node_level'] - all_nodes[node_level] = [root_node] - while node_level != 0: - new_nodes = [] - for parent_node in all_nodes[node_level]: - for addr in parent_node['addresses']: - new_nodes.append(self._read_node(addr)) - new_node_level = new_nodes[0]['node_level'] - all_nodes[new_node_level] = new_nodes - node_level = new_node_level - - self.all_nodes = all_nodes - - def _read_node(self, offset): + def _read_node(self, offset, node_level): """ Return a single node in the b-tree located at a give offset. """ - self.fh.seek(offset) - node = _unpack_struct_from_file(B_LINK_NODE_V1, self.fh) - assert node['signature'] == b'TREE' - assert node['node_type'] == 1 - + node = self._read_node_header(offset, node_level) keys = [] addresses = [] for _ in range(node['entries_used']): @@ -170,8 +200,8 @@ def construct_data_from_chunks( non_padded_region = tuple([slice(i) for i in data_shape]) return data[non_padded_region] - @staticmethod - def _filter_chunk(chunk_buffer, filter_mask, filter_pipeline, itemsize): + @classmethod + def _filter_chunk(cls, chunk_buffer, filter_mask, filter_pipeline, itemsize): """ Apply decompression filters to a chunk of data. """ num_filters = len(filter_pipeline) for i, pipeline_entry in enumerate(filter_pipeline[::-1]): @@ -185,7 +215,6 @@ def _filter_chunk(chunk_buffer, filter_mask, filter_pipeline, itemsize): filter_id = pipeline_entry['filter_id'] if filter_id == GZIP_DEFLATE_FILTER: chunk_buffer = zlib.decompress(chunk_buffer) - elif filter_id == SHUFFLE_FILTER: buffer_size = len(chunk_buffer) unshuffled_buffer = bytearray(buffer_size) @@ -196,7 +225,7 @@ def _filter_chunk(chunk_buffer, filter_mask, filter_pipeline, itemsize): unshuffled_buffer[j::itemsize] = chunk_buffer[start:end] chunk_buffer = unshuffled_buffer elif filter_id == FLETCH32_FILTER: - _verify_fletcher32(chunk_buffer) + cls._verify_fletcher32(chunk_buffer) # strip off 4-byte checksum from end of buffer chunk_buffer = chunk_buffer[:-4] else: @@ -204,40 +233,29 @@ def _filter_chunk(chunk_buffer, filter_mask, filter_pipeline, itemsize): "Filter with id: %i import supported" % (filter_id)) return chunk_buffer + @staticmethod + def _verify_fletcher32(chunk_buffer): + """ Verify a chunk with a fletcher32 checksum. """ + # calculate checksums + if len(chunk_buffer) % 2: + arr = np.frombuffer(chunk_buffer[:-4]+b'\x00', 'u2') + ref_sum1 = ref_sum1 % 65535 + ref_sum2 = ref_sum2 % 65535 + + # compare + if sum1 != ref_sum1 or sum2 != ref_sum2: + raise ValueError("fletcher32 checksum invalid") + return True -def _verify_fletcher32(chunk_buffer): - """ Verify a chunk with a fletcher32 checksum. 
""" - # calculate checksums - if len(chunk_buffer) % 2: - arr = np.frombuffer(chunk_buffer[:-4]+b'\x00', 'u2') - ref_sum1 = ref_sum1 % 65535 - ref_sum2 = ref_sum2 % 65535 - - # compare - if sum1 != ref_sum1 or sum2 != ref_sum2: - raise ValueError("fletcher32 checksum invalid") - return True - - -B_LINK_NODE_V1 = OrderedDict(( - ('signature', '4s'), - - ('node_type', 'B'), - ('node_level', 'B'), - ('entries_used', 'H'), - - ('left_sibling', 'Q'), # 8 byte addressing - ('right_sibling', 'Q'), # 8 byte addressing -)) # IV.A.2.l The Data Storage - Filter Pipeline message RESERVED_FILTER = 0 diff --git a/tests/test_fletcher32.py b/tests/test_fletcher32.py index e7858ea..40b41d1 100644 --- a/tests/test_fletcher32.py +++ b/tests/test_fletcher32.py @@ -6,7 +6,7 @@ from numpy.testing import assert_array_equal import pyfive -from pyfive.btree import _verify_fletcher32 +from pyfive.btree import BTreeV1RawDataChunks DIRNAME = os.path.dirname(__file__) DATASET_FLETCHER_HDF5_FILE = os.path.join(DIRNAME, 'fletcher32.hdf5') @@ -35,4 +35,4 @@ class TestChunkFletcher32(unittest.TestCase): def test_fletcher32_invalid(self): bad_chunk = b'\x00\x00\x00\x01' with self.assertRaises(ValueError) as context: - _verify_fletcher32(bad_chunk) + BTreeV1RawDataChunks._verify_fletcher32(bad_chunk) From 1e9f3e5da0a3a0515b3c949cd6c50ba183f2d5d2 Mon Sep 17 00:00:00 2001 From: woutdenolf Date: Mon, 12 Jul 2021 20:43:25 +0200 Subject: [PATCH 3/8] Add partial support for B-tree v2 --- pyfive/btree.py | 184 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/pyfive/btree.py b/pyfive/btree.py index c8a40c8..38518fc 100644 --- a/pyfive/btree.py +++ b/pyfive/btree.py @@ -257,6 +257,190 @@ def _verify_fletcher32(chunk_buffer): return True +class BTreeV2(AbstractBTree): + """ + HDF5 version 2 B-Tree. + """ + + # III.A.2. 
From: woutdenolf
Date: Mon, 12 Jul 2021 20:43:53 +0200
Subject: [PATCH 4/8] Add partial support for fractal heaps

---
 pyfive/core.py           |   7 ++
 pyfive/misc_low_level.py | 214 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 219 insertions(+), 2 deletions(-)

diff --git a/pyfive/core.py b/pyfive/core.py
index 3afd7f1..298a916 100644
--- a/pyfive/core.py
+++ b/pyfive/core.py
@@ -50,3 +50,10 @@ def _unpack_struct_from(structure, buf, offset=0):
     fmt = '<' + ''.join(structure.values())
     values = struct.unpack_from(fmt, buf, offset=offset)
     return OrderedDict(zip(structure.keys(), values))
+
+
+def _unpack_integer(nbytes, buf, offset=0):
+    """ Read an integer with an uncommon number of bytes. """
+    fmt = "{}s".format(nbytes)
+    values = struct.unpack_from(fmt, buf, offset=offset)
+    return int.from_bytes(values[0], byteorder="little", signed=False)
diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py
index 9f45875..e7f0907 100644
--- a/pyfive/misc_low_level.py
+++ b/pyfive/misc_low_level.py
@@ -1,10 +1,14 @@
 """ Misc low-level representation of HDF5 objects. """
 
 import struct
+from math import log2
 from collections import OrderedDict
 
-from .core import _padded_size, _structure_size
-from .core import _unpack_struct_from, _unpack_struct_from_file
+from .core import _padded_size
+from .core import _structure_size
+from .core import _unpack_struct_from
+from .core import _unpack_struct_from_file
+from .core import _unpack_integer
 from .core import InvalidHDF5File
 
 
@@ -157,6 +161,178 @@ def objects(self):
         return self._objects
 
 
+class FractalHeap(object):
+    """
+    HDF5 Fractal Heap.
+    """
+
+    def __init__(self, fh, offset):
+
+        fh.seek(offset)
+        header = _unpack_struct_from_file(FRACTAL_HEAP_HEADER, fh)
+        assert header['signature'] == b'FRHP'
+        assert header['version'] == 0
+
+        if header['filter_info_size']:
+            raise NotImplementedError
+
+        if header["btree_address_huge_objects"] == 0xffffffffffffffff:
+            header["btree_address_huge_objects"] = None
+        else:
+            raise NotImplementedError
+
+        if header["root_block_address"] == 0xffffffffffffffff:
+            header["root_block_address"] = None
+
+        nbits = header["log2_maximum_heap_size"]
+        block_offset_size = self._min_size_nbits(nbits)
+        h = OrderedDict((
+            ('signature', '4s'),
+            ('version', 'B'),
+            ('heap_header_address', 'Q'),
+            ('block_offset', '{}s'.format(block_offset_size))
+        ))
+        self.indirect_block_header = h.copy()
+        self.indirect_block_header_size = _structure_size(h)
+        if (header["flags"] & 2) == 2:
+            h['checksum'] = 'I'
+        self.direct_block_header = h
+        self.direct_block_header_size = _structure_size(h)
+
+        maximum_dblock_size = header['maximum_direct_block_size']
+        nbits = header['log2_maximum_heap_size']
+        self._managed_object_offset_size = self._min_size_nbits(nbits)
+        value = min(maximum_dblock_size, header['max_managed_object_size'])
+        self._managed_object_length_size = self._min_size_integer(value)
+
+        start_block_size = header['starting_block_size']
+        table_width = header['table_width']
+        if not start_block_size:
+            raise NotImplementedError
+
+        log2_maximum_dblock_size = int(log2(maximum_dblock_size))
+        assert 2**log2_maximum_dblock_size == maximum_dblock_size
+        log2_start_block_size = int(log2(start_block_size))
+        assert 2**log2_start_block_size == start_block_size
+        self._max_direct_nrows = log2_maximum_dblock_size - log2_start_block_size + 2
+
+        log2_table_width = int(log2(table_width))
+        assert 2**log2_table_width == table_width
+        self._indirect_nrows_sub = log2_table_width + log2_start_block_size - 1
+
+        self.header = header
+        self.nobjects = header["managed_object_count"] + header["huge_object_count"] + header["tiny_object_count"]
+
+        managed = []
+        root_address = header["root_block_address"]
+        if root_address:
+            nrows = header["indirect_current_rows_count"]
+            if nrows:
+                for data in self._iter_indirect_block(fh, root_address, nrows):
+                    managed.append(data)
+            else:
+                data = self._read_direct_block(fh, root_address, start_block_size)
+                managed.append(data)
+        self.managed = b"".join(managed)
+
+    def _read_direct_block(self, fh, offset, block_size):
+        fh.seek(offset)
+        data = fh.read(block_size)
+        header = _unpack_struct_from(self.direct_block_header, data)
+        assert header["signature"] == b"FHDB"
+        return data
+
+    def get_data(self, heapid):
+        firstbyte = heapid[0]
+        reserved = firstbyte & 15       # bit 0-3
+        idtype = (firstbyte >> 4) & 3   # bit 4-5
+        version = firstbyte >> 6        # bit 6-7
+        data_offset = 1
+        if idtype == 0:  # managed
+            assert version == 0
+            nbytes = self._managed_object_offset_size
+            offset = _unpack_integer(nbytes, heapid, data_offset)
+            data_offset += nbytes
+
+            nbytes = self._managed_object_length_size
+            size = _unpack_integer(nbytes, heapid, data_offset)
+
+            return self.managed[offset:offset+size]
+        elif idtype == 1:  # tiny
+            raise NotImplementedError
+        elif idtype == 2:  # huge
+            raise NotImplementedError
+        else:
+            raise NotImplementedError
+
+    def _min_size_integer(self, integer):
+        """ Calculate the minimal required bytes to contain an integer. """
+        return self._min_size_nbits(integer.bit_length())
+
+    @staticmethod
+    def _min_size_nbits(nbits):
+        """ Calculate the minimal required bytes to contain a number of bits. """
+        return nbits // 8 + min(nbits % 8, 1)
+
+    def _read_integral(self, fh, nbytes):
+        num = fh.read(nbytes)
+        return int.from_bytes(num, byteorder="little", signed=False)
+
+    def _iter_indirect_block(self, fh, offset, nrows):
+        fh.seek(offset)
+        header = _unpack_struct_from_file(self.indirect_block_header, fh)
+        assert header["signature"] == b"FHIB"
+        header["block_offset"] = int.from_bytes(
+            header["block_offset"], byteorder="little", signed=False)
+        ndirect, nindirect = self._indirect_info(nrows)
+
+        direct_blocks = list()
+        for i in range(ndirect):
+            address = struct.unpack('<Q', fh.read(8))[0]
+            if address == 0xffffffffffffffff:
+                break
+            block_size = self._calc_block_size(i)
+            direct_blocks.append((address, block_size))
+
+        indirect_blocks = list()
+        for i in range(ndirect, ndirect + nindirect):
+            address = struct.unpack('<Q', fh.read(8))[0]
+            if address == 0xffffffffffffffff:
+                break
+            nrows = self._iblock_nrows_from_block_size(self._calc_block_size(i))
+            indirect_blocks.append((address, nrows))
+
+        for address, block_size in direct_blocks:
+            yield self._read_direct_block(fh, address, block_size)
+
+        for address, nrows in indirect_blocks:
+            yield from self._iter_indirect_block(fh, address, nrows)
+
+    def _indirect_info(self, nrows):
+        table_width = self.header['table_width']
+        nobjects = nrows * table_width
+        ndirect_max = self._max_direct_nrows * table_width
+        if nrows <= self._max_direct_nrows:
+            ndirect = nobjects
+            nindirect = 0
+        else:
+            ndirect = ndirect_max
+            nindirect = nobjects - ndirect_max
+        return ndirect, nindirect
+
+    def _calc_block_size(self, iblock):
+        row = iblock // self.header['table_width']
+        return 2**max(row - 1, 0) * self.header['starting_block_size']
+
+    def _iblock_nrows_from_block_size(self, block_size):
+        log2_block_size = int(log2(block_size))
+        assert 2**log2_block_size == block_size
+        return log2_block_size - self._indirect_nrows_sub
+
+
+FRACTAL_HEAP_HEADER = OrderedDict((
+    ('signature', '4s'),
+    ('version', 'B'),
+
+    ('object_index_size', 'H'),
+    ('filter_info_size', 'H'),
+    ('flags', 'B'),
+
+    ('max_managed_object_size', 'I'),
+    ('next_huge_object_index', 'Q'),
+    ('btree_address_huge_objects', 'Q'),
+
+    ('managed_freespace_size', 'Q'),
+    ('freespace_manager_address', 'Q'),
+    ('managed_space_size', 'Q'),
+    ('managed_alloc_size', 'Q'),
+    ('next_directblock_iterator_address', 'Q'),
+    ('managed_object_count', 'Q'),
+
+    ('huge_objects_total_size', 'Q'),
+    ('huge_object_count', 'Q'),
+    ('tiny_objects_total_size', 'Q'),
+    ('tiny_object_count', 'Q'),
+
+    ('table_width', 'H'),
+    ('starting_block_size', 'Q'),
+    ('maximum_direct_block_size', 'Q'),
+    ('log2_maximum_heap_size', 'H'),
+    ('indirect_starting_rows_count', 'H'),
+    ('root_block_address', 'Q'),
+    ('indirect_current_rows_count', 'H'),
+))
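A fractal heap's "doubling table" arranges blocks in rows of `table_width`
entries: the first two rows use the starting block size and every later row
doubles it, which is exactly what `_calc_block_size` above encodes. A quick
standalone table (table_width=4 and starting_block_size=512 are assumed
values, chosen because they are common defaults, not read from the patch):

    def calc_block_size(iblock, table_width=4, starting_block_size=512):
        row = iblock // table_width
        return 2**max(row - 1, 0) * starting_block_size

    sizes = [calc_block_size(i) for i in range(12)]
    # rows 0 and 1: 512-byte blocks, row 2: 1024-byte blocks, ...
    assert sizes == [512]*8 + [1024]*4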
""" + for nodelist in self.all_nodes.values(): + for node in nodelist: + yield from node["keys"] + + def _parse_record(self, record): + raise NotImplementedError + + +class BTreeV2GroupNames(BTreeV2): + """ + HDF5 version 2 B-Tree storing group names (type 5). + """ + NODE_TYPE = 5 + + def _parse_record(self, record): + namehash = struct.unpack_from(" Date: Mon, 12 Jul 2021 20:43:53 +0200 Subject: [PATCH 4/8] Add partial support for fractal heaps --- pyfive/core.py | 7 ++ pyfive/misc_low_level.py | 214 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 219 insertions(+), 2 deletions(-) diff --git a/pyfive/core.py b/pyfive/core.py index 3afd7f1..298a916 100644 --- a/pyfive/core.py +++ b/pyfive/core.py @@ -50,3 +50,10 @@ def _unpack_struct_from(structure, buf, offset=0): fmt = '<' + ''.join(structure.values()) values = struct.unpack_from(fmt, buf, offset=offset) return OrderedDict(zip(structure.keys(), values)) + + +def _unpack_integer(nbytes, buf, offset=0): + """ Read an integer with an uncommon number of bytes. """ + fmt = "{}s".format(nbytes) + values = struct.unpack_from(fmt, buf, offset=offset) + return int.from_bytes(values[0], byteorder="little", signed=False) diff --git a/pyfive/misc_low_level.py b/pyfive/misc_low_level.py index 9f45875..e7f0907 100644 --- a/pyfive/misc_low_level.py +++ b/pyfive/misc_low_level.py @@ -1,10 +1,14 @@ """ Misc low-level representation of HDF5 objects. """ import struct +from math import log2 from collections import OrderedDict -from .core import _padded_size, _structure_size -from .core import _unpack_struct_from, _unpack_struct_from_file +from .core import _padded_size +from .core import _structure_size +from .core import _unpack_struct_from +from .core import _unpack_struct_from_file +from .core import _unpack_integer from .core import InvalidHDF5File @@ -157,6 +161,178 @@ def objects(self): return self._objects +class FractalHeap(object): + """ + HDF5 Fractal Heap. 
+ """ + + def __init__(self, fh, offset): + + fh.seek(offset) + header = _unpack_struct_from_file(FRACTAL_HEAP_HEADER, fh) + assert header['signature'] == b'FRHP' + assert header['version'] == 0 + + if header['filter_info_size']: + raise NotImplementedError + + if header["btree_address_huge_objects"] == 0xffffffffffffffff: + header["btree_address_huge_objects"] = None + else: + raise NotImplementedError + + if header["root_block_address"] == 0xffffffffffffffff: + header["root_block_address"] = None + + nbits = header["log2_maximum_heap_size"] + block_offset_size = self._min_size_nbits(nbits) + h = OrderedDict(( + ('signature', '4s'), + ('version', 'B'), + ('heap_header_adddress', 'Q'), + ('block_offset', '{}s'.format(block_offset_size)) + )) + self.indirect_block_header = h.copy() + self.indirect_block_header_size = _structure_size(h) + if (header["flags"] & 2) == 2: + h['checksum'] = 'I' + self.direct_block_header = h + self.direct_block_header_size = _structure_size(h) + + maximum_dblock_size = header['maximum_direct_block_size'] + nbits = header['log2_maximum_heap_size'] + self._managed_object_offset_size = self._min_size_nbits(nbits) + value = min(maximum_dblock_size, header['max_managed_object_size']) + self._managed_object_length_size = self._min_size_integer(value) + + start_block_size = header['starting_block_size'] + table_width = header['table_width'] + if not start_block_size: + assert NotImplementedError + + log2_maximum_dblock_size = int(log2(maximum_dblock_size)) + assert 2**log2_maximum_dblock_size == maximum_dblock_size + log2_start_block_size = int(log2(start_block_size)) + assert 2**log2_start_block_size == start_block_size + self._max_direct_nrows = log2_maximum_dblock_size - log2_start_block_size + 2 + + log2_table_width = int(log2(table_width)) + assert 2**log2_table_width == table_width + self._indirect_nrows_sub = log2_table_width + log2_start_block_size - 1 + + self.header = header + self.nobjects = header["managed_object_count"] + header["huge_object_count"] + header["tiny_object_count"] + + managed = [] + root_address = header["root_block_address"] + if root_address: + nrows = header["indirect_current_rows_count"] + if nrows: + for data in self._iter_indirect_block(fh, root_address, nrows): + managed.append(data) + else: + data = self._read_direct_block(fh, root_address, start_block_size) + managed.append(data) + self.managed = b"".join(managed) + + def _read_direct_block(self, fh, offset, block_size): + fh.seek(offset) + data = fh.read(block_size) + header = _unpack_struct_from(self.direct_block_header, data) + header["signature"] == b"FHDB" + return data + + def get_data(self, heapid): + firstbyte = heapid[0] + reserved = firstbyte & 15 # bit 0-3 + idtype = (firstbyte >> 4) & 3 # bit 4-5 + version = firstbyte >> 6 # bit 6-7 + data_offset = 1 + if idtype == 0: # managed + assert version == 0 + nbytes = self._managed_object_offset_size + offset = _unpack_integer(nbytes, heapid, data_offset) + data_offset += nbytes + + nbytes = self._managed_object_length_size + size = _unpack_integer(nbytes, heapid, data_offset) + + return self.managed[offset:offset+size] + elif idtype == 1: # tiny + raise NotImplementedError + elif idtype == 2: # huge + raise NotImplementedError + else: + raise NotImplementedError + + def _min_size_integer(self, integer): + """ Calculate the minimal required bytes to contain an integer. 
""" + return self._min_size_nbits(integer.bit_length()) + + @staticmethod + def _min_size_nbits(nbits): + """ Calculate the minimal required bytes to contain a number of bits. """ + return nbits // 8 + min(nbits % 8, 1) + + def _read_integral(self, fh, nbytes): + num = fh.read(nbytes) + num = struct.unpack("{}s".format(nbytes))[0] + return int.from_bytes(num, byteorder="little", signed=False) + + def _iter_indirect_block(self, fh, offset, nrows): + fh.seek(offset) + header = _unpack_struct_from_file(self.indirect_block_header, fh) + header["signature"] == b"FHIB" + header["block_offset"] = int.from_bytes(header["block_offset"], byteorder="little", signed=False) + ndirect, nindirect = self._indirect_info(nrows) + + direct_blocks = list() + for i in range(ndirect): + address = struct.unpack(' Date: Mon, 12 Jul 2021 20:46:31 +0200 Subject: [PATCH 5/8] Add b-tree v2 to get_links --- pyfive/dataobjects.py | 234 ++++++++++++++++++++++++++++-------------- 1 file changed, 155 insertions(+), 79 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 8af28db..277b82e 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -13,9 +13,10 @@ from .core import _unpack_struct_from, _unpack_struct_from_file from .core import InvalidHDF5File from .core import Reference -from .btree import BTree, BTreeRawDataChunks +from .btree import BTreeV1Groups, BTreeV1RawDataChunks +from .btree import BTreeV2GroupNames, BTreeV2GroupOrders from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER -from .misc_low_level import Heap, SymbolTable, GlobalHeap +from .misc_low_level import Heap, SymbolTable, GlobalHeap, FractalHeap class DataObjects(object): @@ -384,10 +385,35 @@ def get_data(self): version, dims, layout_class, property_offset = ( self._get_data_message_properties(msg_offset)) + if layout_class == 0: # compact storage + raise NotImplementedError("Compact storage") + elif layout_class == 1: # contiguous storage + return self._get_contiguous_data(property_offset) if layout_class == 2: # chunked storage return self._get_chunked_data(msg_offset) - assert layout_class == 1 + def _get_data_message_properties(self, msg_offset): + """ Return the message properties of the DataObject. """ + dims, layout_class, property_offset = None, None, None + version, arg1, arg2 = struct.unpack_from( + '= 1) and (version <= 4) + return version, dims, layout_class, property_offset + + def _get_contiguous_data(self, property_offset): data_offset, = struct.unpack_from('= 1) and (version <= 4) - return version, dims, layout_class, property_offset - def _get_chunked_data(self, offset): """ Return data which is chunked. 
""" self._get_chunk_params() - chunk_btree = BTreeRawDataChunks( + chunk_btree = BTreeV1RawDataChunks( self.fh, self._chunk_address, self._chunk_dims) return chunk_btree.construct_data_from_chunks( self.chunks, self.shape, self.dtype, self.filter_pipeline) @@ -482,74 +487,131 @@ def find_msg_type(self, msg_type): def get_links(self): """ Return a dictionary of link_name: offset """ - sym_tbl_msgs = self.find_msg_type(SYMBOL_TABLE_MSG_TYPE) - if sym_tbl_msgs: - return self._get_links_from_symbol_tables(sym_tbl_msgs) - return self._get_links_from_link_msgs() - - def _get_links_from_symbol_tables(self, sym_tbl_msgs): + return dict(self.iter_links()) + + def iter_links(self): + for msg in self.msgs: + if msg['type'] == SYMBOL_TABLE_MSG_TYPE: + yield from self._iter_links_from_symbol_tables(msg) + elif msg['type'] == LINK_MSG_TYPE: + yield self._get_link_from_link_msg(msg) + elif msg['type'] == LINK_INFO_MSG_TYPE: + yield from self._iter_link_from_link_info_msg(msg) + + def _iter_links_from_symbol_tables(self, sym_tbl_msg): """ Return a dict of link_name: offset from a symbol table. """ - assert len(sym_tbl_msgs) == 1 - assert sym_tbl_msgs[0]['size'] == 16 - symbol_table_message = _unpack_struct_from( + assert sym_tbl_msg['size'] == 16 + data = _unpack_struct_from( SYMBOL_TABLE_MSG, self.msg_data, - sym_tbl_msgs[0]['offset_to_message']) + sym_tbl_msg['offset_to_message']) + yield from self._iter_links_btree_v1(data['btree_address'], data['heap_address']) - btree = BTree(self.fh, symbol_table_message['btree_address']) - heap = Heap(self.fh, symbol_table_message['heap_address']) - links = {} + def _iter_links_btree_v1(self, btree_address, heap_address): + """ Retrieve links from symbol table message. """ + btree = BTreeV1Groups(self.fh, btree_address) + heap = Heap(self.fh, heap_address) for symbol_table_address in btree.symbol_table_addresses(): table = SymbolTable(self.fh, symbol_table_address) table.assign_name(heap) - links.update(table.get_links(heap)) - return links - - def _get_links_from_link_msgs(self): - """ Retrieve links from link messages. """ - links = {} - link_msgs = self.find_msg_type(LINK_MSG_TYPE) - for link_msg in link_msgs: - offset = link_msg['offset_to_message'] - version, flags = struct.unpack_from(' Date: Fri, 16 Jul 2021 10:10:19 +0200 Subject: [PATCH 6/8] no optional dereferencing for now --- pyfive/dataobjects.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 277b82e..523c29f 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -521,7 +521,7 @@ def _get_link_from_link_msg(self, link_msg): return self._decode_link_msg(self.msg_data, offset)[1] @staticmethod - def _decode_link_msg(data, offset, dereference=True): + def _decode_link_msg(data, offset): version, flags = struct.unpack_from(' Date: Fri, 16 Jul 2021 10:14:00 +0200 Subject: [PATCH 7/8] refactor test for new style groups --- tests/test_new_style_groups.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/tests/test_new_style_groups.py b/tests/test_new_style_groups.py index 71b9ee3..db6d534 100644 --- a/tests/test_new_style_groups.py +++ b/tests/test_new_style_groups.py @@ -1,23 +1,15 @@ """ Test new style groups in pyfive. 
""" import os - import pyfive DIRNAME = os.path.dirname(__file__) -NEW_STYLE_GROUPS_HDF5_FILE = os.path.join(DIRNAME, 'new_style_groups.hdf5') +NEW_STYLE_GROUPS_HDF5_FILE = os.path.join(DIRNAME, "new_style_groups.hdf5") def test_groups(): - with pyfive.File(NEW_STYLE_GROUPS_HDF5_FILE) as hfile: - assert len(hfile) == 9 - grp0 = hfile['group0'] - grp1 = hfile['group1'] - grp2 = hfile['group2'] - grp3 = hfile['group3'] - grp4 = hfile['group4'] - grp5 = hfile['group5'] - grp6 = hfile['group6'] - grp7 = hfile['group7'] - grp8 = hfile['group8'] + # test that the objects are stored in the correct order + # (file was created with track_order=True) + for i, grp in enumerate(hfile): + assert grp == "group{:d}".format(i) From 6e10696ba278673e607847b869b4c7bfa77f2fd0 Mon Sep 17 00:00:00 2001 From: woutdenolf Date: Fri, 16 Jul 2021 10:20:06 +0200 Subject: [PATCH 8/8] use global UNDEFINED_ADDRESS --- pyfive/core.py | 3 +++ pyfive/dataobjects.py | 5 ++--- pyfive/misc_low_level.py | 9 +++++---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pyfive/core.py b/pyfive/core.py index 298a916..b0d2718 100644 --- a/pyfive/core.py +++ b/pyfive/core.py @@ -7,6 +7,9 @@ import struct +UNDEFINED_ADDRESS = 0xffffffffffffffff + + class InvalidHDF5File(Exception): """ Exception raised when an invalid HDF5 file is detected. """ pass diff --git a/pyfive/dataobjects.py b/pyfive/dataobjects.py index 523c29f..38cb9a9 100644 --- a/pyfive/dataobjects.py +++ b/pyfive/dataobjects.py @@ -13,6 +13,7 @@ from .core import _unpack_struct_from, _unpack_struct_from_file from .core import InvalidHDF5File from .core import Reference +from .core import UNDEFINED_ADDRESS from .btree import BTreeV1Groups, BTreeV1RawDataChunks from .btree import BTreeV2GroupNames, BTreeV2GroupOrders from .btree import GZIP_DEFLATE_FILTER, SHUFFLE_FILTER, FLETCH32_FILTER @@ -610,7 +611,7 @@ def _decode_link_info_msg(data, offset): else: fmt = LINK_INFO_MSG1 data = _unpack_struct_from(fmt, data, offset) - return {k: None if v == 0xffffffffffffffff else v for k, v in data.items()} + return {k: None if v == UNDEFINED_ADDRESS else v for k, v in data.items()} @property def is_dataset(self): @@ -644,8 +645,6 @@ def determine_data_shape(buf, offset): # integers, unless otherwise noted in the description of a field. Additionally, # all metadata fields are stored in little-endian byte order. -UNDEFINED_ADDRESS = struct.unpack('