From fd9dcd287a4c2e94d10ad03689f0484b877b4991 Mon Sep 17 00:00:00 2001
From: Tomas Tunys
Date: Thu, 3 Aug 2017 13:01:06 +0200
Subject: [PATCH 1/3] Add _load_attributes_from_hdf5_group() and
 _save_attributes_to_hdf5_group() functions that handle saving models with a
 lot of layers (or with nested deep models) to HDF5 files

---
 keras/engine/topology.py | 64 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 7 deletions(-)

diff --git a/keras/engine/topology.py b/keras/engine/topology.py
index 8cfae4e214a..bdfc57088e9 100644
--- a/keras/engine/topology.py
+++ b/keras/engine/topology.py
@@ -22,6 +22,7 @@
 try:
     import h5py
+    HDF5_OBJECT_HEADER_LIMIT = 65536
 except ImportError:
     h5py = None
 
@@ -2825,10 +2826,59 @@ def _collect_input_shape(input_tensors):
     return shapes
 
 
+def _save_attributes_to_hdf5_group(group, name, data):
+    """Saves attributes (data) of the specified name into the HDF5 group.
+    This function works around an inherent limitation of HDF5: a single
+    attribute cannot hold data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+    # Arguments
+        group: A pointer to a HDF5 group.
+        name: The name of the attributes to save.
+        data: Attributes data to store.
+    """
+    data_npy = np.asarray(data)
+
+    n_chunks = 1
+    chunked_data = np.split(data_npy, n_chunks)
+
+    while any(map(lambda x: x.nbytes > HDF5_OBJECT_HEADER_LIMIT, chunked_data)):
+        n_chunks += 1
+        chunked_data = np.split(data_npy, n_chunks)
+
+    if n_chunks > 1:
+        for chunk_id, chunk_data in enumerate(chunked_data):
+            group.attrs['%s%d' % (name, chunk_id)] = chunk_data
+    else:
+        group.attrs[name] = data
+
+
+def _load_attributes_from_hdf5_group(group, name):
+    """Loads attributes of the specified name from the HDF5 group. This
+    function works around an inherent limitation of HDF5: a single attribute
+    cannot hold data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
+
+    # Arguments
+        group: A pointer to a HDF5 group.
+        name: The name of the attributes to load.
+
+    # Returns
+        data: Attributes data.
+    """
+    if name in group.attrs:
+        data = [n.decode('utf8') for n in group.attrs[name]]
+    else:
+        data = []
+        chunk_id = 0
+        while ('%s%d' % (name, chunk_id)) in group.attrs:
+            data.extend([n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]])
+            chunk_id += 1
+    return data
+
+
 def save_weights_to_hdf5_group(f, layers):
     from .. import __version__ as keras_version
 
-    f.attrs['layer_names'] = [layer.name.encode('utf8') for layer in layers]
+    _save_attributes_to_hdf5_group(f, 'layer_names', [layer.name.encode('utf8') for layer in layers])
     f.attrs['backend'] = K.backend().encode('utf8')
     f.attrs['keras_version'] = str(keras_version).encode('utf8')
 
@@ -2843,7 +2893,7 @@ def save_weights_to_hdf5_group(f, layers):
             else:
                 name = 'param_' + str(i)
             weight_names.append(name.encode('utf8'))
-        g.attrs['weight_names'] = weight_names
+        _save_attributes_to_hdf5_group(g, 'weight_names', weight_names)
         for name, val in zip(weight_names, weight_values):
             param_dset = g.create_dataset(name, val.shape, dtype=val.dtype)
@@ -3042,11 +3092,11 @@ def load_weights_from_hdf5_group(f, layers):
         if weights:
             filtered_layers.append(layer)
 
-    layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
+    layer_names = _load_attributes_from_hdf5_group(f, 'layer_names')
     filtered_layer_names = []
     for name in layer_names:
         g = f[name]
-        weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
+        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
         if weight_names:
             filtered_layer_names.append(name)
     layer_names = filtered_layer_names
@@ -3061,7 +3111,7 @@ def load_weights_from_hdf5_group(f, layers):
     weight_value_tuples = []
     for k, name in enumerate(layer_names):
         g = f[name]
-        weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
+        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        weight_values = [g[weight_name] for weight_name in weight_names]
         layer = filtered_layers[k]
         symbolic_weights = layer.weights
@@ -3109,7 +3159,7 @@ def load_weights_from_hdf5_group_by_name(f, layers):
         original_backend = None
 
     # New file format.
-    layer_names = [n.decode('utf8') for n in f.attrs['layer_names']]
+    layer_names = _load_attributes_from_hdf5_group(f, 'layer_names')
 
     # Reverse index of layer name to list of layers with name.
     index = {}
@@ -3122,7 +3172,7 @@ def load_weights_from_hdf5_group_by_name(f, layers):
     weight_value_tuples = []
     for k, name in enumerate(layer_names):
         g = f[name]
-        weight_names = [n.decode('utf8') for n in g.attrs['weight_names']]
+        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
         weight_values = [g[weight_name] for weight_name in weight_names]
 
         for layer in index.get(name, []):

From 7cf5682e95154e2215c60401326c839bbbf4c57d Mon Sep 17 00:00:00 2001
From: Tomas Tunys
Date: Fri, 18 Aug 2017 17:43:48 +0200
Subject: [PATCH 2/3] Use np.array_split instead of np.split, making
 _save_attributes_to_hdf5_group() robust to unrealistically long layer names

---
 keras/engine/topology.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/keras/engine/topology.py b/keras/engine/topology.py
index bdfc57088e9..a59a0aea8f4 100644
--- a/keras/engine/topology.py
+++ b/keras/engine/topology.py
@@ -22,7 +22,7 @@
 try:
     import h5py
-    HDF5_OBJECT_HEADER_LIMIT = 65536
+    HDF5_OBJECT_HEADER_LIMIT = 64512
 except ImportError:
     h5py = None
 
@@ -2836,14 +2836,27 @@ def _save_attributes_to_hdf5_group(group, name, data):
         name: The name of the attributes to save.
         data: Attributes data to store.
     """
+    # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
+    # because in that case even chunking the array would not make the saving
+    # possible.
+    bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
+
+    # This is never expected to happen, but raise a clear error if it does.
+    if len(bad_attributes) > 0:
+        raise RuntimeError("The following attributes cannot be saved to HDF5 "
+                           "file because they are larger than %d bytes: '%s'"
+                           % (HDF5_OBJECT_HEADER_LIMIT,
+                              "', '".join([x for x in bad_attributes])))
+
     data_npy = np.asarray(data)
 
     n_chunks = 1
-    chunked_data = np.split(data_npy, n_chunks)
+    chunked_data = np.array_split(data_npy, n_chunks)
 
+    # This will never loop forever thanks to the check above.
     while any(map(lambda x: x.nbytes > HDF5_OBJECT_HEADER_LIMIT, chunked_data)):
         n_chunks += 1
-        chunked_data = np.split(data_npy, n_chunks)
+        chunked_data = np.array_split(data_npy, n_chunks)
 
     if n_chunks > 1:
         for chunk_id, chunk_data in enumerate(chunked_data):

From fd711430fb8e81dfdaf2360c055ce153b1ef4445 Mon Sep 17 00:00:00 2001
From: Tomas Tunys
Date: Fri, 18 Aug 2017 17:45:51 +0200
Subject: [PATCH 3/3] Add unit tests checking that the `layer_names` and
 `weight_names` HDF5 file attributes are split into chunks correctly

---
 tests/test_model_saving.py | 89 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

diff --git a/tests/test_model_saving.py b/tests/test_model_saving.py
index dd67fb15e86..694cbe20775 100644
--- a/tests/test_model_saving.py
+++ b/tests/test_model_saving.py
@@ -1,5 +1,6 @@
 import pytest
 import os
+import h5py
 import tempfile
 import numpy as np
 from numpy.testing import assert_allclose
@@ -337,5 +338,93 @@ def test_saving_custom_activation_function():
     assert_allclose(out, out2, atol=1e-05)
 
 
+@keras_test
+def test_saving_model_with_long_layer_names():
+    # This layer name will make the `layer_names` HDF5 attribute blow
+    # out of proportion. Note that it fits into the internal HDF5
+    # attribute memory limit on its own, but because h5py converts
+    # the list of layer names into a numpy array, which uses the same
+    # amount of memory for every item, it increases the memory
+    # requirements substantially.
+    x = Input(shape=(2,), name='input_' + ('x' * (2**15)))
+    f = x
+    for i in range(4):
+        f = Dense(2, name='dense_%d' % (i,))(f)
+
+    model = Model(inputs=[x], outputs=[f])
+
+    model.compile(loss='mse', optimizer='adam', metrics=['acc'])
+
+    x = np.random.random((1, 2))
+    y = np.random.random((1, 2))
+    model.train_on_batch(x, y)
+
+    out = model.predict(x)
+
+    _, fname = tempfile.mkstemp('.h5')
+    save_model(model, fname)
+
+    model = load_model(fname)
+
+    # Check that the HDF5 file contains a chunked array
+    # of layer names.
+    with h5py.File(fname, 'r') as h5file:
+        n_layer_names_arrays = len([attr for attr in h5file['model_weights'].attrs
+                                    if attr.startswith('layer_names')])
+
+    os.remove(fname)
+
+    # The chunking of the layer names array should have happened.
+    assert n_layer_names_arrays > 0
+
+    out2 = model.predict(x)
+    assert_allclose(out, out2, atol=1e-05)
+
+
+@keras_test
+def test_saving_model_with_long_weights_names():
+    x = Input(shape=(2,), name='nested_model_input')
+    f = x
+    for i in range(4):
+        f = Dense(2, name='nested_model_dense_%d' % (i,))(f)
+    # This layer name will make the `weight_names`
+    # HDF5 attribute blow out of proportion.
+    f = Dense(2, name='nested_model_output' + ('x' * (2**15)))(f)
+    nested_model = Model(inputs=[x], outputs=[f], name='nested_model')
+
+    x = Input(shape=(2,), name='outer_model_input')
+    f = nested_model(x)
+    f = Dense(2, name='outer_model_output')(f)
+
+    model = Model(inputs=[x], outputs=[f])
+
+    model.compile(loss='mse', optimizer='adam', metrics=['acc'])
+
+    x = np.random.random((1, 2))
+    y = np.random.random((1, 2))
+    model.train_on_batch(x, y)
+
+    out = model.predict(x)
+
+    _, fname = tempfile.mkstemp('.h5')
+    save_model(model, fname)
+
+    model = load_model(fname)
+
+    # Check that the HDF5 file contains a chunked array
+    # of weight names.
+    with h5py.File(fname, 'r') as h5file:
+        n_weight_names_arrays = len([attr for attr in h5file['model_weights']['nested_model'].attrs
+                                     if attr.startswith('weight_names')])
+
+    os.remove(fname)
+
+    # The chunking of the weight names array should have happened.
+    assert n_weight_names_arrays > 0
+
+    out2 = model.predict(x)
+    assert_allclose(out, out2, atol=1e-05)
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
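For readers who want to try the chunking scheme outside of Keras, below is a minimal standalone sketch of the same idea using plain h5py and numpy. The function names save_attributes/load_attributes, the demo_attrs.h5 file name, and the demo layer names are illustrative assumptions and are not part of the patch; the patch itself only touches the private helpers in keras/engine/topology.py.

import h5py
import numpy as np

# Same idea as the patch: keep every attribute chunk safely below HDF5's
# ~64 KiB object header limit.
HDF5_OBJECT_HEADER_LIMIT = 64512


def save_attributes(group, name, data):
    """Store `data` on `group`, splitting it into `name0`, `name1`, ...
    whenever a single attribute would exceed the header limit."""
    bad = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]
    if bad:
        raise RuntimeError('attributes too large to store: %r' % bad)

    data_npy = np.asarray(data)
    n_chunks = 1
    chunks = np.array_split(data_npy, n_chunks)
    # Increase the number of chunks until every chunk fits.
    while any(chunk.nbytes > HDF5_OBJECT_HEADER_LIMIT for chunk in chunks):
        n_chunks += 1
        chunks = np.array_split(data_npy, n_chunks)

    if n_chunks > 1:
        for chunk_id, chunk in enumerate(chunks):
            group.attrs['%s%d' % (name, chunk_id)] = chunk
    else:
        group.attrs[name] = data


def load_attributes(group, name):
    """Read back either the single attribute or its numbered chunks."""
    if name in group.attrs:
        return [n.decode('utf8') for n in group.attrs[name]]
    data = []
    chunk_id = 0
    while ('%s%d' % (name, chunk_id)) in group.attrs:
        data.extend(n.decode('utf8')
                    for n in group.attrs['%s%d' % (name, chunk_id)])
        chunk_id += 1
    return data


if __name__ == '__main__':
    # Five ~32 KiB names: as a fixed-width numpy bytes array this is
    # roughly 160 KiB, well over the limit, so the save path must chunk it.
    names = [('layer_%d_' % i + 'x' * (2 ** 15)).encode('utf8')
             for i in range(5)]
    with h5py.File('demo_attrs.h5', 'w') as f:
        g = f.create_group('model_weights')
        save_attributes(g, 'layer_names', names)
        print(sorted(g.attrs))  # e.g. ['layer_names0', ..., 'layer_names4']
    with h5py.File('demo_attrs.h5', 'r') as f:
        restored = load_attributes(f['model_weights'], 'layer_names')
    assert restored == [n.decode('utf8') for n in names]

The design point mirrored here is that the loader first probes for the plain attribute name and only then falls back to the numbered chunks, so files written before the chunking change keep loading unchanged.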