forked from rwth-i6/returnn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNormalizationData.py
399 lines (353 loc) · 13.5 KB
/
NormalizationData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
import os
import six
import h5py
import numpy as np
from BundleFile import BundleFile
class NormalizationData(object):
    """Holds normalization data (mean and variance) for inputs and outputs.

    Besides reading an existing normalization HDF file (see ``__init__``),
    the class provides static helpers which create such a file from the
    HDF dataset files listed in a bundle file
    (see ``createNormalizationFile``).
    """

    # Names of the HDF groups, both in the dataset files and in the
    # normalization file.
    GROUP_INPUTS = 'inputs'
    GROUP_OUTPUTS = 'outputs'

    # Names of the HDF datasets written into each group of the
    # normalization file.
    DATASET_MEAN = 'mean'
    DATASET_MEAN_OF_SQUARES = 'meanOfSquares'
    DATASET_VARIANCE = 'variance'
    DATASET_TOTAL_FRAMES = 'totalNumberOfFrames'

    # Axis layout of every dataset: (time frames, features).
    DATASET_TIME_DIMENSION_INDEX = 0
    DATASET_FEATURE_DIMENSION_INDEX = 1

    # Maximum tolerated absolute rounding error (summed over all features)
    # when two partial sums are added in ``_updateTotalSum``.
    SUMMATION_PRECISION = 1e-5

    @staticmethod
    def createNormalizationFile(bundleFilePath, outputFilePath,
                                dtype=np.float64, flag_includeOutputs=True):
        """Calculates means over inputs and outputs of datasets in the HDF
        files described by the given bundle file.

        :see: BundleFile.BundleFile

        Each HDF dataset file is expected to have the following groups:

        * NormalizationData.GROUP_INPUTS (the group for the input data)
        * NormalizationData.GROUP_OUTPUTS (the group for the output data)

        Each group may have datasets. Each dataset is expected to have
        shape (time frames, features).
        E.g. (267, 513) -- 267 time frames each containing a feature vector
        of dimensionality 513.

        The method writes results into the given output file.
        Availability of means and variances depends on whether the
        corresponding groups are available in the input dataset HDF files.

        !!! IMPORTANT !!!
        General rule of thumb: if one dataset file has both input and output
        groups then you should make sure that all the dataset files have
        them. Otherwise means and variance will not be correct.
        It is OK if *all* the datasets have only the input group.
        In this case means and variance only for inputs will be calculated.

        :type bundleFilePath: str
        :param bundleFilePath: path to the bundle file.
          :see: BundleFile.BundleFile
        :type outputFilePath: str
        :param outputFilePath: path to the output HDF normalization file.
        :type dtype: numpy.dtype
        :param dtype: type of data to use during calculations.
        :type flag_includeOutputs: bool
        :param flag_includeOutputs: if True then normalization data will be
          calculated for outputs (targets) as well.
        """
        groupNames = [NormalizationData.GROUP_INPUTS]
        if flag_includeOutputs:
            groupNames.append(NormalizationData.GROUP_OUTPUTS)
        # Inputs are always processed first, outputs only on request.
        for groupName in groupNames:
            NormalizationData._calculateNormalizationData(
                bundleFilePath,
                outputFilePath,
                groupName,
                dtype=dtype
            )

    @staticmethod
    def _calculateNormalizationData(bundleFilePath, outputFilePath, groupName,
                                    dtype=np.float64):
        """Helper method.
        Calculates and writes into the output HDF file mean, mean of squares,
        variance and total number of frames for the datasets in the given HDF
        group.

        :type bundleFilePath: str
        :param bundleFilePath: path to the bundle file.
          :see: BundleFile.BundleFile
        :type outputFilePath: str
        :param outputFilePath: path to the output HDF normalization file.
          If file already exists it will not be truncated (it is opened in
          append mode); only the group ``groupName`` is replaced.
        :type groupName: str
        :param groupName: name of the HDF group for which normalization data
          should be calculated. Also, a group with this name will be created
          in the output HDF file to store the calculated normalization data.
        :type dtype: numpy.dtype
        :param dtype: type of data to use during calculations.
        :raises FloatingPointError: if adding the per-file sums loses more
          precision than NormalizationData.SUMMATION_PRECISION.
        """
        accumulatedSum = None
        accumulatedSumOfSqr = None
        # A plain int is sufficient on both Python 2 and 3: it turns into
        # numpy.int64 as soon as the first per-file frame count is added.
        totalFrames = 0

        bundle = BundleFile(bundleFilePath)
        for filePath in bundle.datasetFilePaths:
            # Only one dataset file is kept open at a time.
            with h5py.File(filePath, mode='r') as datasetFile:
                intermSum, intermSumOfSqr, intermTotalFrames = \
                    NormalizationData._accumulateSums(
                        datasetFile,
                        groupName,
                        dtype=dtype
                    )
            accumulatedSum = NormalizationData._updateTotalSum(
                accumulatedSum,
                intermSum
            )
            accumulatedSumOfSqr = NormalizationData._updateTotalSum(
                accumulatedSumOfSqr,
                intermSumOfSqr
            )
            totalFrames += intermTotalFrames

        mean, meanOfSquares, variance = NormalizationData._calculateMeans(
            accumulatedSum,
            accumulatedSumOfSqr,
            totalFrames
        )

        with h5py.File(outputFilePath, mode='a') as out:
            NormalizationData._writeData(
                out, groupName,
                mean, meanOfSquares, variance, totalFrames,
                dtype=dtype
            )

    @staticmethod
    def _accumulateSums(f, groupName, dtype=np.float64):
        """Helper method.
        Accumulate sums and sums of squares over feature vectors for a given
        group.

        :type f: h5py.File
        :param f: handle to an opened HDF file with datasets
        :type groupName: str
        :param groupName: HDF group containing datasets
        :type dtype: numpy.dtype
        :param dtype: type of data to use during calculations. Every dataset
          is converted to this type *before* it is squared and summed.
        :rtype: tuple (numpy.ndarray | None, numpy.ndarray | None, numpy.int64)
        :return: tuple (sum, sum of squares, total number of time frames).
          Both sums are None if the group is missing or has no datasets.
        """
        timeAxis = NormalizationData.DATASET_TIME_DIMENSION_INDEX
        featureAxis = NormalizationData.DATASET_FEATURE_DIMENSION_INDEX

        featureSum = None
        featureSumOfSqr = None
        totalFrames = np.int64(0)

        if groupName not in f:
            return featureSum, featureSumOfSqr, totalFrames

        group = f[groupName]
        # keys() is a non-indexable view on Python 3, hence the list().
        datasetNames = list(group.keys())
        if not datasetNames:
            return featureSum, featureSumOfSqr, totalFrames

        # The first dataset defines the feature dimensionality of the group.
        featDims = group[datasetNames[0]].shape[featureAxis]
        featureSum = np.zeros(featDims, dtype=dtype)
        featureSumOfSqr = np.zeros(featDims, dtype=dtype)

        for dsName in datasetNames:
            # Convert up front: squaring/summing in the stored type could
            # overflow (integer data) or lose precision (e.g. float32).
            dataset = np.asarray(group[dsName][...], dtype=dtype)
            featureSum += np.sum(dataset, axis=timeAxis)
            featureSumOfSqr += np.sum(np.square(dataset), axis=timeAxis)
            totalFrames += dataset.shape[timeAxis]

        return featureSum, featureSumOfSqr, totalFrames

    @staticmethod
    def _updateTotalSum(totalSum, intermediateSum):
        """Helper method.
        Updates total sum with intermediate sum if the latter is available.

        :type totalSum: numpy.ndarray | None
        :param totalSum: total sum
        :type intermediateSum: numpy.ndarray | None
        :param intermediateSum: intermediate sum
        :rtype: numpy.ndarray | None
        :return: updated total sum if available. If only one of the
          arguments is available it is returned as is (not copied).
        :raises FloatingPointError: if the rounding error of the addition,
          summed over all elements, exceeds
          NormalizationData.SUMMATION_PRECISION.
        """
        if intermediateSum is None:
            # Covers "both are None" as well: totalSum is None then.
            return totalSum
        if totalSum is None:
            return intermediateSum

        # Floating point summation check: in exact arithmetic
        # (a + b) - a - b == 0, so any remainder is lost precision.
        newSum = np.add(totalSum, intermediateSum)
        sumErr = np.sum(np.abs(newSum - totalSum - intermediateSum))
        if sumErr > NormalizationData.SUMMATION_PRECISION:
            raise FloatingPointError(
                'sums have very different orders of magnitude.'
                ' summation error = {}'.format(sumErr)
            )
        return newSum

    @staticmethod
    def _calculateMeans(totalSum, totalSumOfSqr, totalFrames):
        """Helper method.
        Calculate mean, mean of squares and variance if they are available.

        :type totalSum: numpy.ndarray | None
        :param totalSum: total sum of features
        :type totalSumOfSqr: numpy.ndarray | None
        :param totalSumOfSqr: total sum of squares of features
        :type totalFrames: long
        :param totalFrames: total number of timeframes
        :rtype: tuple (numpy.ndarray | None, numpy.ndarray | None,
          numpy.ndarray | None)
        :return: tuple (mean, mean of squares, variance) if they are
          available. Mean of squares and variance are only calculated when
          both sums are given. Variance is never negative.
        """
        mean = None
        meanOfSquares = None
        variance = None

        if totalSum is None:
            # Without the plain sum nothing can be calculated.
            return mean, meanOfSquares, variance

        assert totalFrames > 0, 'sums are available but there are no frames'
        mean = totalSum / totalFrames

        if totalSumOfSqr is not None:
            meanOfSquares = totalSumOfSqr / totalFrames
            # Var[X] = E[X ^ 2] - (E[X]) ^ 2
            # Rounding may push the difference slightly below zero for
            # (almost) constant features, so it is clamped at 0.
            variance = np.maximum(meanOfSquares - np.square(mean), 0)

        return mean, meanOfSquares, variance

    @staticmethod
    def _writeData(f, groupName, mean, meanOfSqr, variance, totalFrames,
                   dtype=np.float64):
        """Helper method.
        Writes means and variance for a given group.
        An already existing group with the same name is deleted first.

        :type f: h5py.File
        :param f: handle to an opened HDF file to which data should be
          written.
        :type groupName: str
        :param groupName: HDF group name
        :type mean: numpy.ndarray | None
        :param mean: mean
        :type meanOfSqr: numpy.ndarray | None
        :param meanOfSqr: mean of squares
        :type variance: numpy.ndarray | None
        :param variance: variance
        :type totalFrames: long
        :param totalFrames: total number of time frames. Only written if it
          is greater than zero.
        :type dtype: numpy.dtype
        :param dtype: type of data to use for writing the data
        """
        if groupName in f:
            del f[groupName]
        group = f.create_group(groupName)

        namedDatasets = (
            (NormalizationData.DATASET_MEAN, mean),
            (NormalizationData.DATASET_MEAN_OF_SQUARES, meanOfSqr),
            (NormalizationData.DATASET_VARIANCE, variance),
        )
        for name, ds in namedDatasets:
            NormalizationData._writeDataset(group, name, ds, dtype)

        if totalFrames > 0:
            group.create_dataset(
                NormalizationData.DATASET_TOTAL_FRAMES,
                data=totalFrames
            )

    @staticmethod
    def _writeDataset(group, datasetName, dataset, dtype=np.float64):
        """Helper Method.
        Writes dataset into an HDF group if the dataset is available.

        :type group: h5py.Group
        :param group: HDF group handle
        :type datasetName: str
        :param datasetName: name of the dataset
        :type dataset: numpy.ndarray | None
        :param dataset: actual data of the dataset. Nothing is written if
          it is None.
        :type dtype: numpy.dtype
        :param dtype: type of data to use for writing the data.
        """
        if dataset is not None:
            group.create_dataset(datasetName, data=dataset, dtype=dtype)

    def __init__(self, normalizationFilePath):
        """Reads normalization data from the given HDF file and saves it
        into the member variables.

        :type normalizationFilePath: str
        :param normalizationFilePath: path to the HDF file with normalization
          data.
        :raises IOError: if the file does not exist.
        """
        self._normalizationFilePath = normalizationFilePath
        self._inputMean = None
        self._inputVariance = None
        self._outputMean = None
        self._outputVariance = None
        self._readNormalizationData()

    def _readNormalizationData(self):
        """Reads normalization data from the given HDF file.

        The file is expected to have the following structure.
        It may have two groups:

        * NormalizationData.GROUP_INPUTS (the group for the input data)
        * NormalizationData.GROUP_OUTPUTS (the group for the output data)

        Each group may have two datasets:

        * NormalizationData.DATASET_MEAN (the dataset for mean)
        * NormalizationData.DATASET_VARIANCE (the dataset for variance)

        Everything is optional e.g. when only the group for the input data
        is present and it contains only the dataset for mean then only this
        data will be read. No exception will be thrown.
        The groups may also contain additional optional information such as
        e.g. total number of time frames, mean of squares etc.
        However, this information is not read here.

        :raises IOError: if the normalization file does not exist.
        """
        if not os.path.isfile(self._normalizationFilePath):
            raise IOError(self._normalizationFilePath + ' does not exist')
        with h5py.File(self._normalizationFilePath, mode='r') as f:
            self._inputMean, self._inputVariance = \
                self._getMeanAndVarianceFromGroup(f, self.GROUP_INPUTS)
            self._outputMean, self._outputVariance = \
                self._getMeanAndVarianceFromGroup(f, self.GROUP_OUTPUTS)

    @staticmethod
    def _getMeanAndVarianceFromGroup(f, groupName):
        """Reads mean and variance from the given group if they are available.
        Both mean and variance are optional i.e. they may be absent in the
        given HDF group.

        :type f: h5py.File
        :param f: handle to an opened HDF file with normalization data.
        :type groupName: str
        :param groupName: name of the HDF group from which mean and variance
          should be read.
        :rtype: tuple (numpy.ndarray | None, numpy.ndarray | None)
        :return: a tuple (mean, variance) each of which may be None
          if the data is not available.
        """
        mean = None
        variance = None
        if groupName not in f:
            return mean, variance

        group = f[groupName]
        # [...] reads the complete dataset into memory.
        if NormalizationData.DATASET_MEAN in group:
            mean = group[NormalizationData.DATASET_MEAN][...]
        if NormalizationData.DATASET_VARIANCE in group:
            variance = group[NormalizationData.DATASET_VARIANCE][...]
        return mean, variance

    @property
    def inputMean(self):
        """Mean of the input data.

        :rtype: numpy.ndarray | None
        :return: Mean of the input data if it is available or None otherwise.
        """
        return self._inputMean

    @property
    def inputVariance(self):
        """Variance of the input data.

        :rtype: numpy.ndarray | None
        :return: Variance of the input data if it is available or None
          otherwise.
        """
        return self._inputVariance

    @property
    def outputMean(self):
        """Mean of the output data.

        :rtype: numpy.ndarray | None
        :return: Mean of the output data if it is available or None otherwise.
        """
        return self._outputMean

    @property
    def outputVariance(self):
        """Variance of the output data.

        :rtype: numpy.ndarray | None
        :return: Variance of the output data if it is available or None
          otherwise.
        """
        return self._outputVariance