From 1d078743efcb85ddace880d930a4ed95772aaf9b Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 5 Oct 2023 15:54:42 +0100 Subject: [PATCH 1/7] Expose .units and .units_name on KeyData objects --- extra_data/keydata.py | 29 +++++++++++++++++++++++++++-- extra_data/tests/mockdata/xgm.py | 14 ++++++++++++++ extra_data/tests/test_keydata.py | 11 +++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/extra_data/keydata.py b/extra_data/keydata.py index b121c113..b029c1bd 100644 --- a/extra_data/keydata.py +++ b/extra_data/keydata.py @@ -106,6 +106,28 @@ def size_gb(self): """The size of the data in memory in gigabytes.""" return self.nbytes / 1e9 + @property + def units(self): + dset = self.files[0].file[self.hdf5_data_path] + base_unit = dset.attrs.get('unitSymbol', None) + if base_unit is None: + return None + + prefix = dset.attrs.get('metricPrefixSymbol', '') + if prefix == 'u': + prefix = 'μ' # We are not afraid of unicode + return prefix + base_unit + + @property + def units_name(self): + dset = self.files[0].file[self.hdf5_data_path] + base_unit = dset.attrs.get('unitName', None) + if base_unit is None: + return None + + prefix = dset.attrs.get('metricPrefixName', '') + return prefix + base_unit + @property def source_file_paths(self): paths = [] @@ -144,8 +166,11 @@ def __getitem__(self, item): def _only_tids(self, tids): tids_arr = np.array(tids) - files = [f for f in self.files - if f.has_train_ids(tids_arr, self.inc_suspect_trains)] + # Keep 1 file, even if 0 trains selected. + files = [ + f for f in self.files + if f.has_train_ids(tids_arr, self.inc_suspect_trains) + ] or [self.files[0]] return KeyData( self.source, diff --git a/extra_data/tests/mockdata/xgm.py b/extra_data/tests/mockdata/xgm.py index 5f247474..8c972240 100644 --- a/extra_data/tests/mockdata/xgm.py +++ b/extra_data/tests/mockdata/xgm.py @@ -1,3 +1,5 @@ +import numpy as np + from .base import DeviceBase class XGM(DeviceBase): @@ -53,3 +55,15 @@ class XGM(DeviceBase): ('xTD', 'f4', (1000,)), ('yTD', 'f4', (1000,)), ] + + def write_instrument(self, f): + super().write_instrument(f) + + # Annotate intensityTD with some units to test retrieving them + ds = f[f'INSTRUMENT/{self.device_id}:output/data/intensityTD'] + ds.attrs['metricPrefixEnum']= np.array([14], dtype=np.int32) + ds.attrs['metricPrefixName'] = 'micro' + ds.attrs['metricPrefixSymbol'] = b'u' + ds.attrs['unitEnum'] = [15] + ds.attrs['unitName'] = b'joule' + ds.attrs['unitSymbol'] = b'J' diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py index d2a9147a..f1ec4808 100644 --- a/extra_data/tests/test_keydata.py +++ b/extra_data/tests/test_keydata.py @@ -339,3 +339,14 @@ def test_file_no_trains(run_with_file_no_trains): run = RunDirectory(run_with_file_no_trains) xpos = run['SPB_XTD9_XGM/DOOCS/MAIN', 'beamPosition.ixPos'].ndarray() assert xpos.shape == (64,) + + +def test_units(mock_sa3_control_data): + run = H5File(mock_sa3_control_data) + xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD'] + + assert xgm_intensity.units == 'μJ' + assert xgm_intensity.units_name == 'microjoule' + + # Check that it still works after selecting 0 trains + assert xgm_intensity.select_trains(np.s_[:0]).units == 'μJ' From 4b1e302da0a0e28f9eda8cb7dace6a97063c22d3 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 5 Oct 2023 16:00:12 +0100 Subject: [PATCH 2/7] Write attributes more carefully for tests --- extra_data/tests/mockdata/xgm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/extra_data/tests/mockdata/xgm.py b/extra_data/tests/mockdata/xgm.py index 8c972240..d25f7372 100644 --- a/extra_data/tests/mockdata/xgm.py +++ b/extra_data/tests/mockdata/xgm.py @@ -60,10 +60,11 @@ def write_instrument(self, f): super().write_instrument(f) # Annotate intensityTD with some units to test retrieving them + # Karabo stores ASCII strings, assigning bytes is a shortcut to mimic that ds = f[f'INSTRUMENT/{self.device_id}:output/data/intensityTD'] ds.attrs['metricPrefixEnum']= np.array([14], dtype=np.int32) - ds.attrs['metricPrefixName'] = 'micro' + ds.attrs['metricPrefixName'] = b'micro' ds.attrs['metricPrefixSymbol'] = b'u' - ds.attrs['unitEnum'] = [15] + ds.attrs['unitEnum'] = np.array([15], dtype=np.int32) ds.attrs['unitName'] = b'joule' ds.attrs['unitSymbol'] = b'J' From 6a1c963186255d16a437554a0cf7fab4bc6716bc Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Thu, 5 Oct 2023 16:16:37 +0100 Subject: [PATCH 3/7] Update test for selecting 0 trains --- extra_data/tests/test_keydata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py index f1ec4808..d69b81e0 100644 --- a/extra_data/tests/test_keydata.py +++ b/extra_data/tests/test_keydata.py @@ -49,7 +49,7 @@ def test_select_trains(mock_spb_raw_run): # Empty selection sel2 = xgm_beam_x[80:] assert sel2.shape == (0,) - assert len(sel2.files) == 0 + assert len(sel2.files) == 1 assert sel2.xarray().shape == (0,) # Single train From d3fa46cfd7f1c151dc768209593723cf9f35edd0 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Mon, 9 Oct 2023 14:44:01 +0100 Subject: [PATCH 4/7] Look in source file for dataset attributes if required --- extra_data/keydata.py | 25 +++++++++++++++++++------ extra_data/tests/test_keydata.py | 10 ++++++++++ extra_data/tests/test_voview.py | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/extra_data/keydata.py b/extra_data/keydata.py index b029c1bd..0c6fb381 100644 --- a/extra_data/keydata.py +++ b/extra_data/keydata.py @@ -1,5 +1,6 @@ from typing import List, Optional, Tuple +import h5py import numpy as np from .exceptions import TrainIDError, NoDataError @@ -108,24 +109,24 @@ def size_gb(self): @property def units(self): - dset = self.files[0].file[self.hdf5_data_path] - base_unit = dset.attrs.get('unitSymbol', None) + attrs = self.attributes() + base_unit = attrs.get('unitSymbol', None) if base_unit is None: return None - prefix = dset.attrs.get('metricPrefixSymbol', '') + prefix = attrs.get('metricPrefixSymbol', '') if prefix == 'u': prefix = 'μ' # We are not afraid of unicode return prefix + base_unit @property def units_name(self): - dset = self.files[0].file[self.hdf5_data_path] - base_unit = dset.attrs.get('unitName', None) + attrs = self.attributes() + base_unit = attrs.get('unitName', None) if base_unit is None: return None - prefix = dset.attrs.get('metricPrefixName', '') + prefix = attrs.get('metricPrefixName', '') return prefix + base_unit @property @@ -151,6 +152,18 @@ def source_file_paths(self): from pathlib import Path return [Path(p) for p in paths] + def attributes(self): + dset = self.files[0].file[self.hdf5_data_path] + if len(dset.attrs) == 0 and dset.is_virtual: + # Virtual datasets were initially created without these attributes. + # Find a source file. Not using source_file_paths as it can give []. + _, filename, _, _ = dset.virtual_sources()[0] + # Not using FileAccess: no need for train or source lists. + f = h5py.File(filename, 'r') + dset = f[self.hdf5_data_path] + + return dict(dset.attrs) + def select_trains(self, trains): """Select a subset of trains in this data as a new :class:`KeyData` object. diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py index d69b81e0..c9f00f8a 100644 --- a/extra_data/tests/test_keydata.py +++ b/extra_data/tests/test_keydata.py @@ -341,6 +341,16 @@ def test_file_no_trains(run_with_file_no_trains): assert xpos.shape == (64,) +def test_attributes(mock_sa3_control_data): + run = H5File(mock_sa3_control_data) + xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD'] + attrs = xgm_intensity.attributes() + + assert isinstance(attrs, dict) + assert attrs['metricPrefixName'] == 'micro' + assert attrs['unitSymbol'] == 'J' + + def test_units(mock_sa3_control_data): run = H5File(mock_sa3_control_data) xgm_intensity = run['SA3_XTD10_XGM/XGM/DOOCS:output', 'data.intensityTD'] diff --git a/extra_data/tests/test_voview.py b/extra_data/tests/test_voview.py index cbdfbe1d..60df0d33 100644 --- a/extra_data/tests/test_voview.py +++ b/extra_data/tests/test_voview.py @@ -63,6 +63,8 @@ def test_use_voview(mock_spb_raw_run, tmp_path): assert {p.name for p in xgm_intens[:30].source_file_paths} == { 'RAW-R0238-DA01-S00000.h5' } + assert xgm_intens.units == 'μJ' + assert xgm_intens.units_name == 'microjoule' From a8d7e70a87c9babde64ae336ab5e33be9cc0983a Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Mon, 9 Oct 2023 15:53:46 +0100 Subject: [PATCH 5/7] Use 'with h5py.File()' to ensure file is closed afterwards --- extra_data/keydata.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/extra_data/keydata.py b/extra_data/keydata.py index 0c6fb381..5b0d855a 100644 --- a/extra_data/keydata.py +++ b/extra_data/keydata.py @@ -153,14 +153,20 @@ def source_file_paths(self): return [Path(p) for p in paths] def attributes(self): + """Get a dict of all attributes stored with this data + + This may be awkward to use. See .units and .units_name for more + convenient forms. + """ dset = self.files[0].file[self.hdf5_data_path] - if len(dset.attrs) == 0 and dset.is_virtual: + attrs = dict(dset.attrs) + if (not attrs) and dset.is_virtual: # Virtual datasets were initially created without these attributes. # Find a source file. Not using source_file_paths as it can give []. _, filename, _, _ = dset.virtual_sources()[0] # Not using FileAccess: no need for train or source lists. - f = h5py.File(filename, 'r') - dset = f[self.hdf5_data_path] + with h5py.File(filename, 'r') as f: + attrs = dict(f[self.hdf5_data_path].attrs) return dict(dset.attrs) From 5d0d95de4f447313d2bbb82b0a53a5bf4d3be313 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Mon, 9 Oct 2023 15:53:55 +0100 Subject: [PATCH 6/7] Document units & units_name --- docs/reading_files.rst | 4 ++++ extra_data/keydata.py | 2 ++ 2 files changed, 6 insertions(+) diff --git a/docs/reading_files.rst b/docs/reading_files.rst index 1939c6f4..31b9f639 100644 --- a/docs/reading_files.rst +++ b/docs/reading_files.rst @@ -221,6 +221,10 @@ below, e.g.:: .. versionadded:: 1.9 + .. autoattribute:: units + + .. autoattribute:: units_name + The run or file object (a :class:`DataCollection`) also has methods to load data by sources and keys. :meth:`get_array`, :meth:`get_dask_array` and diff --git a/extra_data/keydata.py b/extra_data/keydata.py index 5b0d855a..2931785d 100644 --- a/extra_data/keydata.py +++ b/extra_data/keydata.py @@ -109,6 +109,7 @@ def size_gb(self): @property def units(self): + """The units symbol for this data, e.g. 'μJ', or None if not found""" attrs = self.attributes() base_unit = attrs.get('unitSymbol', None) if base_unit is None: @@ -121,6 +122,7 @@ def units(self): @property def units_name(self): + """The units name for this data, e.g. 'microjoule', or None if not found""" attrs = self.attributes() base_unit = attrs.get('unitName', None) if base_unit is None: From 5926e94203ede93696ce893f31a44a06e9e295e5 Mon Sep 17 00:00:00 2001 From: Thomas Kluyver Date: Mon, 9 Oct 2023 16:53:38 +0100 Subject: [PATCH 7/7] Fix attributes() method with virtual datasets --- extra_data/keydata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra_data/keydata.py b/extra_data/keydata.py index 2931785d..69ac3223 100644 --- a/extra_data/keydata.py +++ b/extra_data/keydata.py @@ -170,7 +170,7 @@ def attributes(self): with h5py.File(filename, 'r') as f: attrs = dict(f[self.hdf5_data_path].attrs) - return dict(dset.attrs) + return attrs def select_trains(self, trains): """Select a subset of trains in this data as a new :class:`KeyData` object.