-
Notifications
You must be signed in to change notification settings - Fork 51
/
reader.py
524 lines (430 loc) · 21.2 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
import bz2
import io
import os
import re
import shlex
import warnings
import zlib
from collections import OrderedDict
from typing import IO, Any, AnyStr, Iterable, Tuple
import nrrd
from nrrd.parsers import *
from nrrd.types import IndexOrder, NRRDFieldMap, NRRDFieldType, NRRDHeader
# Older versions of Python had issues when uncompressed data was larger than 4GB (2^32). This should be fixed in latest
# version of Python 2.7 and all versions of Python 3. The fix for this issue is to read the data in smaller chunks.
# Chunk size is set to be large at 1GB to improve performance. If issues arise decompressing larger files, try to reduce
# this value
_READ_CHUNKSIZE: int = 2 ** 32
_NRRD_REQUIRED_FIELDS = ['dimension', 'type', 'encoding', 'sizes']
ALLOW_DUPLICATE_FIELD: bool = False
"""Allow duplicate header fields when reading NRRD files
When there are duplicated fields in a NRRD file header, pynrrd throws an error by default. Setting this field as
:obj:`True` will instead show a warning.
Example:
Reading a NRRD file with duplicated header field 'space' with field set to :obj:`False`.
>>> filedata, fileheader = nrrd.read('filename_duplicatedheader.nrrd')
nrrd.errors.NRRDError: Duplicate header field: 'space'
Set the field as :obj:`True` to receive a warning instead.
>>> nrrd.reader.ALLOW_DUPLICATE_FIELD = True
>>> filedata, fileheader = nrrd.read('filename_duplicatedheader.nrrd')
UserWarning: Duplicate header field: 'space' warnings.warn(dup_message)
Note:
Duplicated fields are prohibited by the NRRD file specification.
"""
_TYPEMAP_NRRD2NUMPY = {
'signed char': 'i1',
'int8': 'i1',
'int8_t': 'i1',
'uchar': 'u1',
'unsigned char': 'u1',
'uint8': 'u1',
'uint8_t': 'u1',
'short': 'i2',
'short int': 'i2',
'signed short': 'i2',
'signed short int': 'i2',
'int16': 'i2',
'int16_t': 'i2',
'ushort': 'u2',
'unsigned short': 'u2',
'unsigned short int': 'u2',
'uint16': 'u2',
'uint16_t': 'u2',
'int': 'i4',
'signed int': 'i4',
'int32': 'i4',
'int32_t': 'i4',
'uint': 'u4',
'unsigned int': 'u4',
'uint32': 'u4',
'uint32_t': 'u4',
'longlong': 'i8',
'long long': 'i8',
'long long int': 'i8',
'signed long long': 'i8',
'signed long long int': 'i8',
'int64': 'i8',
'int64_t': 'i8',
'ulonglong': 'u8',
'unsigned long long': 'u8',
'unsigned long long int': 'u8',
'uint64': 'u8',
'uint64_t': 'u8',
'float': 'f4',
'double': 'f8',
'block': 'V'
}
def _get_field_type(field: str, custom_field_map: Optional[NRRDFieldMap]) -> NRRDFieldType:
if field in ['dimension', 'lineskip', 'line skip', 'byteskip', 'byte skip', 'space dimension']:
return 'int'
elif field in ['min', 'max', 'oldmin', 'old min', 'oldmax', 'old max']:
return 'double'
elif field in ['endian', 'encoding', 'content', 'sample units', 'datafile', 'data file', 'space', 'type']:
return 'string'
elif field in ['sizes']:
return 'int list'
elif field in ['spacings', 'thicknesses', 'axismins', 'axis mins', 'axismaxs', 'axis maxs']:
return 'double list'
elif field in ['kinds', 'centerings']:
return 'string list'
elif field in ['labels', 'units', 'space units']:
return 'quoted string list'
# No int vector fields yet
# elif field in []:
# return 'int vector'
elif field in ['space origin']:
return 'double vector'
elif field in ['measurement frame']:
return 'double matrix'
elif field in ['space directions']:
return nrrd.SPACE_DIRECTIONS_TYPE
else:
if custom_field_map and field in custom_field_map:
return custom_field_map[field]
# Default the type to string if unknown type
return 'string'
def _parse_field_value(value: str, field_type: NRRDFieldType) -> Any:
if field_type == 'int':
return int(value)
elif field_type == 'double':
return float(value)
elif field_type == 'string':
return str(value)
elif field_type == 'int list':
return parse_number_list(value, dtype=int)
elif field_type == 'double list':
return parse_number_list(value, dtype=float)
elif field_type == 'string list':
return [str(x) for x in value.split()]
elif field_type == 'quoted string list':
return shlex.split(value)
elif field_type == 'int vector':
return parse_vector(value, dtype=int)
elif field_type == 'double vector':
return parse_vector(value, dtype=float)
elif field_type == 'int matrix':
return parse_matrix(value, dtype=int)
elif field_type == 'double matrix':
# For matrices of double type, parse as an optional matrix to allow for rows of the matrix to be none
# This is only valid for double matrices because the matrix is represented with NaN in the entire row
# for none rows. NaN is only valid for floating point numbers
return parse_optional_matrix(value)
elif field_type == 'int vector list':
return parse_optional_vector_list(value, dtype=int)
elif field_type == 'double vector list':
return parse_optional_vector_list(value, dtype=float)
else:
raise NRRDError(f'Invalid field type given: {field_type}')
def _determine_datatype(header: NRRDHeader) -> np.dtype:
"""Determine the numpy dtype of the data."""
# Convert the NRRD type string identifier into a NumPy string identifier using a map
np_typestring = _TYPEMAP_NRRD2NUMPY[header['type']]
# This is only added if the datatype has more than one byte and is not using ASCII encoding
# Note: Endian is not required for ASCII encoding
if np.dtype(np_typestring).itemsize > 1 and header['encoding'] not in ['ASCII', 'ascii', 'text', 'txt']:
if 'endian' not in header:
raise NRRDError('Header is missing required field: endian')
elif header['endian'] == 'big':
np_typestring = '>' + np_typestring
elif header['endian'] == 'little':
np_typestring = '<' + np_typestring
else:
raise NRRDError(f'Invalid endian value in header: {header["endian"]}')
return np.dtype(np_typestring)
def _validate_magic_line(line: str) -> int:
"""For NRRD files, the first four characters are always "NRRD", and
remaining characters give information about the file format version
>>> _validate_magic_line('NRRD0005')
8
>>> _validate_magic_line('NRRD0006')
Traceback (most recent call last):
...
NrrdError: NRRD file version too new for this library.
>>> _validate_magic_line('NRRD')
Traceback (most recent call last):
...
NrrdError: Invalid NRRD magic line: NRRD
"""
if not line.startswith('NRRD'):
raise NRRDError('Invalid NRRD magic line. Is this an NRRD file?')
try:
version = int(line[4:])
if version > 5:
raise NRRDError(f'Unsupported NRRD file version (version: {version}). This library only supports v5 '
'and below.')
except ValueError:
raise NRRDError(f'Invalid NRRD magic line: {line}')
return len(line)
def read_header(file: Union[str, Iterable[AnyStr]], custom_field_map: Optional[NRRDFieldMap] = None) -> NRRDHeader:
"""Read contents of header and parse values from :obj:`file`
:obj:`file` can be a filename indicating where the NRRD header is located or a string iterator object. If a
filename is specified, then the file will be opened and closed after the header is read from it. If not specifying
a filename, the :obj:`file` parameter can be any sort of iterator that returns a string each time :meth:`next` is
called. The two common objects that meet these requirements are file objects and a list of strings. When
:obj:`file` is a file object, it must be opened with the binary flag ('b') on platforms where that makes a
difference, such as Windows.
See :ref:`background/how-to-use:reading nrrd files` for more information on reading NRRD files.
Parameters
----------
file : :class:`str` or string iterator
Filename, file object or string iterator object to read NRRD header from
custom_field_map : :class:`dict` (:class:`str`, :class:`str`), optional
Dictionary used for parsing custom field types where the key is the custom field name and the value is a
string identifying datatype for the custom field.
Returns
-------
header : :class:`dict` (:class:`str`, :obj:`Object`)
Dictionary containing the header fields and their corresponding parsed value
See Also
--------
:meth:`read`, :meth:`read_data`
"""
# If the file is a filename rather than the file handle, then open the file and call this function again with the
# file handle. Since read function uses a filename, it is easy to think read_header is the same syntax.
if isinstance(file, str) and file.count('\n') == 0:
with open(file, 'rb') as fh:
return read_header(fh, custom_field_map)
# Collect number of bytes in the file header (for seeking below)
header_size = 0
# Get iterator for the file and extract the first line, the magic line
it = iter(file)
magic_line = next(it)
# Depending on what type file is, decoding may or may not be necessary. Decode if necessary, otherwise skip.
need_decode = False
if hasattr(magic_line, 'decode'):
need_decode = True
magic_line = magic_line.decode('ascii', 'ignore')
# Validate the magic line and increment header size by size of the line
header_size += _validate_magic_line(magic_line)
# Create empty header
# This is an OrderedDict rather than an ordinary dict because an OrderedDict will keep it's order that key/values
# are added for when looping back through it. The added benefit of this is that saving the header will save the
# fields in the same order.
header = OrderedDict()
# Loop through each line
for line in it:
header_size += len(line)
if need_decode:
line = line.decode('ascii', 'ignore')
# Trailing whitespace ignored per the NRRD spec
line = line.rstrip()
# Skip comments starting with # (no leading whitespace is allowed)
# Or, stop reading the header once a blank line is encountered. This separates header from data.
if line.startswith('#'):
continue
elif line == '':
break
# Read the field and value from the line, split using regex to search for := or : delimiter
field, value = re.split(r':=?', line, 1)
# Remove whitespace before and after the field and value
field, value = field.strip(), value.strip()
# Check if the field has been added already
if field in header.keys():
if not ALLOW_DUPLICATE_FIELD:
raise NRRDError(f'Duplicate header field: {field}')
else:
warnings.warn(f'Duplicate header field: {field}')
# Get the datatype of the field based on its field name and custom field map
field_type = _get_field_type(field, custom_field_map)
# Parse the field value using the datatype retrieved
# Place it in the header dictionary
header[field] = _parse_field_value(value, field_type)
# Reading the file line by line is buffered and so the header is not in the correct position for reading data if
# the file contains the data in it as well. The solution is to set the file pointer to just behind the header.
if hasattr(file, 'seek'):
file.seek(header_size)
return header
def read_data(header: NRRDHeader, fh: Optional[IO] = None, filename: Optional[str] = None,
index_order: IndexOrder = 'F') -> npt.NDArray:
"""Read data from file into :class:`numpy.ndarray`
The two parameters :obj:`fh` and :obj:`filename` are optional depending on the parameters but it never hurts to
specify both. The file handle (:obj:`fh`) is necessary if the header is attached with the NRRD data. However, if
the NRRD data is detached from the header, then the :obj:`filename` parameter is required to obtain the absolute
path to the data file.
See :ref:`background/how-to-use:reading nrrd files` for more information on reading NRRD files.
Parameters
----------
header : :class:`dict` (:class:`str`, :obj:`Object`)
Parsed fields/values obtained from :meth:`read_header` function
fh : file-object, optional
File object pointing to first byte of data. Only necessary if data is attached to header.
filename : :class:`str`, optional
Filename of the header file. Only necessary if data is detached from the header. This is used to get the
absolute data path.
index_order : {'C', 'F'}, optional
Specifies the index order of the resulting data array. Either 'C' (C-order) where the dimensions are ordered
from slowest-varying to fastest-varying (e.g. (z, y, x)), or 'F' (Fortran-order) where the dimensions are
ordered from fastest-varying to slowest-varying (e.g. (x, y, z)).
Returns
-------
data : :class:`numpy.ndarray`
Data read from NRRD file
See Also
--------
:meth:`read`, :meth:`read_header`
"""
if index_order not in ['F', 'C']:
raise NRRDError('Invalid index order')
# Check that the required fields are in the header
for field in _NRRD_REQUIRED_FIELDS:
if field not in header:
raise NRRDError(f'Header is missing required field: {field}')
if header['dimension'] != len(header['sizes']):
raise NRRDError(f'Number of elements in sizes does not match dimension. Dimension: {header["dimension"]}, '
f'len(sizes): {len(header["sizes"])}')
# Determine the data type from the header
dtype = _determine_datatype(header)
# Determine the byte skip, line skip and the data file
# These all can be written with or without the space according to the NRRD spec, so we check them both
line_skip = header.get('lineskip', header.get('line skip', 0))
byte_skip = header.get('byteskip', header.get('byte skip', 0))
data_filename = header.get('datafile', header.get('data file', None))
# If the data file is separate from the header file, then open the data file to read from that instead
if data_filename is not None:
# If the pathname is relative, then append the current directory from the filename
if not os.path.isabs(data_filename):
if filename is None:
raise NRRDError('Filename parameter must be specified when a relative data file path is given')
data_filename = os.path.join(os.path.dirname(filename), data_filename)
# Override the fh parameter with the data filename
# Note that this is opened without a "with" block, thus it must be closed manually in all circumstances
fh = open(data_filename, 'rb')
# Get the total number of data points by multiplying the size of each dimension together
total_data_points = header['sizes'].prod(dtype=np.int64)
# Skip the number of lines requested when line_skip >= 0
# Irrespective of the NRRD file having attached/detached header
# Lines are skipped before getting to the beginning of the data
if line_skip >= 0:
for _ in range(line_skip):
fh.readline()
else:
# Must close the file because if the file was opened above from detached filename, there is no "with" block to
# close it for us
fh.close()
raise NRRDError('Invalid lineskip, allowed values are greater than or equal to 0')
# Skip the requested number of bytes or seek backward, and then parse the data using NumPy
if byte_skip < -1:
# Must close the file because if the file was opened above from detached filename, there is no "with" block to
# close it for us
fh.close()
raise NRRDError('Invalid byteskip, allowed values are greater than or equal to -1')
elif byte_skip >= 0:
fh.seek(byte_skip, os.SEEK_CUR)
elif byte_skip == -1 and header['encoding'] not in ['gzip', 'gz', 'bzip2', 'bz2']:
fh.seek(-dtype.itemsize * total_data_points, os.SEEK_END)
else:
# The only case left should be: byte_skip == -1 and header['encoding'] == 'gzip'
byte_skip = -dtype.itemsize * total_data_points
# If a compression encoding is used, then byte skip AFTER decompressing
if header['encoding'] == 'raw':
if isinstance(fh, io.BytesIO):
raw_data = bytearray(fh.read(total_data_points * dtype.itemsize))
data = np.frombuffer(raw_data, dtype)
else:
data = np.fromfile(fh, dtype)
elif header['encoding'] in ['ASCII', 'ascii', 'text', 'txt']:
if isinstance(fh, io.BytesIO):
data = np.fromstring(fh.read(), dtype, sep=' ')
else:
data = np.fromfile(fh, dtype, sep=' ')
else:
# Handle compressed data now
# Construct the decompression object based on encoding
if header['encoding'] in ['gzip', 'gz']:
decompobj = zlib.decompressobj(zlib.MAX_WBITS | 16)
elif header['encoding'] in ['bzip2', 'bz2']:
decompobj = bz2.BZ2Decompressor()
else:
# Must close the file because if the file was opened above from detached filename, there is no "with" block
# to close it for us
fh.close()
raise NRRDError(f'Unsupported encoding: {header["encoding"]}')
# Loop through the file and read a chunk at a time (see _READ_CHUNKSIZE why it is read in chunks)
decompressed_data = bytearray()
# Read all the remaining data from the file
# Obtain the length of the compressed data since we will be using it repeatedly, more efficient
compressed_data = fh.read()
compressed_data_len = len(compressed_data)
start_index = 0
# Loop through data and decompress it chunk by chunk
while start_index < compressed_data_len:
# Calculate the end index = start index plus chunk size
# Set to the string length to read the remaining chunk at the end
end_index = min(start_index + _READ_CHUNKSIZE, compressed_data_len)
# Decompress and append data
decompressed_data += decompobj.decompress(compressed_data[start_index:end_index])
# Update start index
start_index = end_index
# Delete the compressed data since we do not need it anymore
# This could potentially be using a lot of memory
del compressed_data
# Byte skip is applied AFTER the decompression. Skip first x bytes of the decompressed data and parse it using
# NumPy
data = np.frombuffer(decompressed_data[byte_skip:], dtype)
# Close the file, even if opened using "with" block, closing it manually does not hurt
fh.close()
if total_data_points != data.size:
raise NRRDError(f'Size of the data does not equal the product of all the dimensions: '
f'{total_data_points}-{data.size}={total_data_points - data.size}')
# In the NRRD header, the fields are specified in Fortran order, i.e, the first index is the one that changes
# fastest and last index changes slowest. This needs to be taken into consideration since numpy uses C-order
# indexing.
# The array shape from NRRD (x,y,z) needs to be reversed as numpy expects (z,y,x).
data = np.reshape(data, tuple(header['sizes'][::-1]))
# Transpose data to enable Fortran indexing if requested.
if index_order == 'F':
data = data.T
return data
def read(filename: str, custom_field_map: Optional[NRRDFieldMap] = None, index_order: IndexOrder = 'F') \
-> Tuple[npt.NDArray, NRRDHeader]:
"""Read a NRRD file and return the header and data
See :ref:`background/how-to-use:reading nrrd files` for more information on reading NRRD files.
.. note::
Users should be aware that the `index_order` argument needs to be consistent between `nrrd.read` and
`nrrd.write`. I.e., reading an array with `index_order='F'` will result in a transposed version of the
original data and hence the writer needs to be aware of this.
Parameters
----------
filename : :class:`str`
Filename of the NRRD file
custom_field_map : :class:`dict` (:class:`str`, :class:`str`), optional
Dictionary used for parsing custom field types where the key is the custom field name and the value is a
string identifying datatype for the custom field.
index_order : {'C', 'F'}, optional
Specifies the index order of the resulting data array. Either 'C' (C-order) where the dimensions are ordered
from slowest-varying to fastest-varying (e.g. (z, y, x)), or 'F' (Fortran-order) where the dimensions are
ordered from fastest-varying to slowest-varying (e.g. (x, y, z)).
Returns
-------
data : :class:`numpy.ndarray`
Data read from NRRD file
header : :class:`dict` (:class:`str`, :obj:`Object`)
Dictionary containing the header fields and their corresponding parsed value
See Also
--------
:meth:`write`, :meth:`read_header`, :meth:`read_data`
"""
with open(filename, 'rb') as fh:
header = read_header(fh, custom_field_map)
data = read_data(header, fh, filename, index_order)
return data, header